需要安装的包:
install.packages(c("mvtnorm","ICSNP","biotools"))
#dir<-"D:\\mywork\\study_R\\multivariate ana"
#setwd(dir)
rm(list = ls(all = TRUE))
dirdata<-"https://statstudy.github.io/data/"
设\(X_1,X_2,\cdots,X_n\)是来自某p维正态总体\((\mu,\Sigma)\)的随机样本,均值向量为\(\mu\),样本方差阵为\(S\)。
设\(x_1,x_2,\cdots,x_n\)是来自p维正态总体\(N(\mu,\Sigma)\)的一个随机样本
\[H_0:\mu= \mu_0 \quad vs \quad H_1: \mu\ne \mu_0 \]
当 \(p=1\) 时: \[t=\frac{\bar{x}-\mu_0}{s/ \sqrt{n}}\] 其中 \[s^2= \frac{1}{n-1}\sum_{i=1}^n (x_i-\bar{x})^2\] 考虑 \(t^2\): \[t^2=(\bar{x}-\mu_0)(\frac{s^2}{n})^{-1}(\bar{x}-\mu_0)\] 当 \(t^2\ge t_{n-1}^2 (\alpha/2)\) 时,拒绝原假设 \(H_0\)。
当 \(p>1\) 时:
\[ T^2=(\bar{\rm{x}}-\rm{\mu_0})^{\prime}(\frac{\rm{S}^2}{n})^{-1}(\bar{\rm x}-\rm{\mu_0})\] 其中: \[S=\frac{1}{n-1}\sum_{i=1}^n(x_i-\overline{x})(x_i-\overline{x})^\prime\]
上述统计量被称为 Hotelling \(T^2\) 统计量。当原假设 \(H_0\) 成立时,它服从\(\frac{(n-1)p}{n-p}F_{p,n-p}\) 。
下面两个函数一个是用来看数据是否服从正态分布的,一个是用来做Hotelling \(T^2\) 检验
"qqchi2" <- function(da){
# The data matrix is "da".
if(!is.matrix(da))da=as.matrix(da)
nr = dim(da)[1]
nc = dim(da)[2]
dev=scale(da,center=T,scale=F)
dev=as.matrix(dev)
s=cov(da)
si=solve(s)
d2=sort(diag(dev%*%si%*%t(dev)))
prob=(c(1:nr)-0.5)/nr
q1=qchisq(prob,nc)
plot(q1,d2,xlab='Quantile of chi-square',ylab='d^2')
fit = lsfit(q1,d2)
fitted = d2-fit$residuals
lines(q1,fitted)
rq=cor(q1,d2)
cat("correlation coefficient:",rq,"\n")
}
"Hotelling" <- function(da,mu){
# The data matrix is "da".
# The mean vector is mu. (a list of numbers).
#
if(!is.matrix(da))da=as.matrix(da)
Hotelling=NULL
nr = dim(da)[1]
nc = dim(da)[2]
cm=matrix(colMeans(da),nc,1)
S = cov(da)
si=solve(S)
mu0=matrix(mu,nc,1)
dev=cm-mu0
T2 = nr*(t(dev)%*%si%*%dev)
d2=nr-nc
tt=T2*d2/(nc*(nr-1))
pvalue=1-pf(tt,nc,d2)
Hotelling=cbind(Hotelling,c(T2,pvalue,nc,d2))
row.names(Hotelling)=c("Hoteliing-T2","p.value","1df","2df")
Hotelling
}
数据是某月四只股票的价格,现在检验四只股票价格的波动是否为0
x=read.table(file=paste(dirdata,"norm1.txt",sep=""))
dim(x)
## [1] 120 4
head(x)
## V1 V2 V3 V4
## 1 -4.7401928 -2.718622 8.1791025 4.0690786
## 2 18.1062431 13.282227 5.4949253 -6.8393408
## 3 -1.7376094 -3.995883 0.6660768 9.0107579
## 4 -0.5550375 -4.037210 -2.5381400 -8.5890188
## 5 7.1587370 -4.436577 1.4425451 -5.0585124
## 6 -7.3038544 -6.908645 10.0123124 -0.6009018
qqchi2(x)
## correlation coefficient: 0.9608707
xave=colMeans(x);xave
## V1 V2 V3 V4
## -0.26294605 0.61075048 0.67154781 0.02679363
S=var(x);S
## V1 V2 V3 V4
## V1 127.011719 28.776911 8.046536 50.919252
## V2 28.776911 80.328617 8.385634 13.216236
## V3 8.046536 8.385634 40.750353 -8.710395
## V4 50.919252 13.216236 -8.710395 122.712745
Si=solve(S)
T2=nrow(x)*t(xave)%*%Si%*%xave;T2
## [,1]
## [1,] 2.107425
Hotelling(x,c(0,0,0,0))
## [,1]
## Hoteliing-T2 2.1074254
## p.value 0.7258692
## 1df 4.0000000
## 2df 116.0000000
t.test(x[,1])
##
## One Sample t-test
##
## data: x[, 1]
## t = -0.25559, df = 119, p-value = 0.7987
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -2.300074 1.774182
## sample estimates:
## mean of x
## -0.2629461
t.test(x[,2])
##
## One Sample t-test
##
## data: x[, 2]
## t = 0.74648, df = 119, p-value = 0.4568
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -1.009311 2.230812
## sample estimates:
## mean of x
## 0.6107505
t.test(x[,3])
##
## One Sample t-test
##
## data: x[, 3]
## t = 1.1524, df = 119, p-value = 0.2515
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4823362 1.8254318
## sample estimates:
## mean of x
## 0.6715478
library(ICSNP)
## Loading required package: mvtnorm
## Loading required package: ICS
HotellingsT2(x, mu=c(0,0,0,0))
##
## Hotelling's one sample T2-test
##
## data: x
## T.2 = 0.51357, df1 = 4, df2 = 116, p-value = 0.7259
## alternative hypothesis: true location is not equal to c(0,0,0,0)
下面是20个健康女人纳的摄入: sweat rate(X1), Sodium(X2), and Potassium(X3). 现在要检验这三个指标是否等于(4,50,10)
检验:
z=read.table(file=paste(dirdata,"norm2.DAT",sep=""))
head(z)
## V1 V2 V3
## 1 3.7 48.5 9.3
## 2 5.7 65.1 8.0
## 3 3.8 47.2 10.9
## 4 3.2 53.2 12.0
## 5 3.1 55.5 9.7
## 6 4.6 36.1 7.9
dim(z)
## [1] 20 3
colMeans(z)
## V1 V2 V3
## 4.640 45.400 9.965
colnames(z) <- c("rate","Sodium","Potassium")
qqchi2(z)
## correlation coefficient: 0.975713
#source("Hotelling.R")
Hotelling(z,c(4,50,10))
## [,1]
## Hoteliing-T2 9.73877286
## p.value 0.06492834
## 1df 3.00000000
## 2df 17.00000000
#HotellingsT2(z,c(4,50,10))
x=read.table(file=paste(dirdata,"m-ba4c9807.txt",sep=""))
# source("confreg.R")
# confreg(x)
美国要求所有企业的的污水排除都必须安装自动检测装置,下面同一种检测装置分别放置在两个不同的实验室且对污水中两种检测指标的11条记录,请分析这两个实验室的这两种指标是否有所不同。
x=read.table(file=paste(dirdata,"T6-1.DAT",sep=""))
d1=x[,1]-x[,3]
d2=x[,2]-x[,4]
d=cbind(d1,d2)
#source('Hotelling.R')
Hotelling(d,rep(0,2))
## [,1]
## Hoteliing-T2 13.63931214
## p.value 0.02082779
## 1df 2.00000000
## 2df 9.00000000
#source("confreg.R")
#confreg(d)
例: 检验一下下面随机生成的两个样本的均值向量是否相同。
library(mvtnorm)
X1<- rmvnorm(20, c(0, 0, 0, 0, 0))
X2<- rmvnorm(20, c(0,0, 4, 0, 0))
调用Hotellings检验
library(ICSNP)
HotellingsT2(X1, X2)
##
## Hotelling's two sample T2-test
##
## data: X1 and X2
## T.2 = 36.484, df1 = 5, df2 = 34, p-value = 9.97e-13
## alternative hypothesis: true location difference is not equal to c(0,0,0,0,0)
例4:教材P48 3-4题。
data3.3=read.csv("http://statstudy.github.io/data/data3_3.csv",head=T)#读入数据#
head(data3.3)
## X1 X2 X3 X4
## 1 5 6 9 8
## 2 8 5 3 6
## 3 9 6 7 9
## 4 9 2 2 8
## 5 9 4 3 7
## 6 9 5 3 7
HotellingsT2(data3.3, mu=c(7,5,4,8))#Hotellings检验#
##
## Hotelling's one sample T2-test
##
## data: data3.3
## T.2 = 7.5968, df1 = 4, df2 = 16, p-value = 0.00125
## alternative hypothesis: true location is not equal to c(7,5,4,8)
例:检验一下三个随机得到样本均值向量是否相同。
X1<- rmvnorm(20, c(0, 0, 0, 0, 0), diag(5))
X2<- rmvnorm(20, c(0,0, 4, 0, 0), diag(5))
X3<- rmvnorm(20, c(0,0, 0, 0, 0.05), diag(5))
使用多元方差分析对多个总体均值向量做比较
X=rbind(cbind(1,X1),cbind(2,X2),cbind(3,X3));head(X)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 -0.97091994 -1.3843606 -0.41735522 1.67182920 -1.0065116
## [2,] 1 -0.76289718 -1.0450679 0.08306575 -0.03353745 0.6535281
## [3,] 1 -0.09799913 -1.5249261 0.96779784 -0.40120501 -0.5496460
## [4,] 1 -0.59975486 0.3356431 0.30287704 1.79828952 0.5414554
## [5,] 1 2.42954683 -0.6774344 -0.66182811 -1.47226602 1.3035056
## [6,] 1 -1.02574410 1.5284090 -0.10640259 0.16847874 -0.7724149
xx=X[,2:6]
head(xx)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.97091994 -1.3843606 -0.41735522 1.67182920 -1.0065116
## [2,] -0.76289718 -1.0450679 0.08306575 -0.03353745 0.6535281
## [3,] -0.09799913 -1.5249261 0.96779784 -0.40120501 -0.5496460
## [4,] -0.59975486 0.3356431 0.30287704 1.79828952 0.5414554
## [5,] 2.42954683 -0.6774344 -0.66182811 -1.47226602 1.3035056
## [6,] -1.02574410 1.5284090 -0.10640259 0.16847874 -0.7724149
fac=factor(X[,1]);fac
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
re=manova(xx~factor(X[,1]))
summary(re)
## Df Pillai approx F num Df den Df Pr(>F)
## factor(X[, 1]) 2 0.86067 8.1584 10 108 1.021e-09 ***
## Residuals 57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
例: 检验一下上个例子中各个样本的协方差阵是否一致。
library(biotools)
## ---
## biotools version 3.0
boxM(X[,2:6],factor(X[,1]))
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: X[, 2:6]
## Chi-Sq (approx.) = 24.852, df = 30, p-value = 0.7322
返回课程主页。