# ----------------------------------------------------------------
# One-sample inference about a scalar mean
# ----------------------------------------------------------------

# Univariate normal sample
set.seed(123)
x <- rnorm(100)
n <- length(x)
x.mean <- mean(x)
x.var <- var(x)
mu0 <- 0

# Test whether the mean is zero (two-sided one-sample t-test)
t.stat <- sqrt(n) * (x.mean - mu0) / sqrt(x.var)
pvalue <- 2 * (1 - pt(abs(t.stat), df = n - 1))
pvalue

# 95% confidence interval
print(c(x.mean - sqrt(x.var / n) * qt(0.975, df = n - 1),
        x.mean + sqrt(x.var / n) * qt(0.975, df = n - 1)))

# An easier way.
# BUG FIX: the null value must be passed as 'mu'; the original
# 't.test(x, mean = 0, ...)' sent 'mean = 0' into '...' where it was
# silently ignored (the result only matched because mu defaults to 0).
t.test(x, mu = 0, alternative = "two.sided")

# ----------------------------------------------------------------
# One-sample inference about a mean vector
# ----------------------------------------------------------------

# Perspiration from 20 healthy females in terms of
# (a) sweat rate, (b) sodium and (c) potassium content.
sweat <- read.table(file = "http://www.public.iastate.edu/~maitra/stat501/datasets/sweat.dat",
                    header = FALSE,
                    col.names = c("subject", "x1", "x2", "x3"))
sweat

# Sample mean vector and sample covariance matrix
xbar <- colMeans(sweat[, 2:4])
xbar
xvar <- var(sweat[, 2:4])
xvar

# Hotelling T-squared statistic
p <- nrow(xvar)           # number of variables (xvar is p x p)
n <- nrow(sweat)
nullmean <- c(4, 50, 10)
d <- xbar - nullmean
t2 <- n * t(d) %*% solve(xvar) %*% d
t2mod <- (n - p) * t2 / (p * (n - 1))   # F-scaled version of T^2
pval <- 1 - pf(t2mod, p, n - p)
cat("Hotelling T-squared statistic", fill = TRUE)
t2
cat("p-value", fill = TRUE)
pval

# Alternative using the function in the ICSNP package
library(ICSNP)
HotellingsT2(X = sweat[, -1], mu = nullmean)

# ----------------------------------------------------------------
# Confidence Region (Example 5.3)
# ----------------------------------------------------------------
x <- read.table("datafiles/T4-1.DAT")
y <- read.table("datafiles/T4-5.DAT")
x <- x^(1 / 4)   # fourth-root transform toward normality
y <- y^(1 / 4)
datatemp <- cbind(x, y)
colnames(datatemp) <- c("x", "y")

# Draw the T^2 confidence ellipse for the mean of a bivariate normal
# sample, with dashed lines marking the center and dotted lines the
# principal axes.
#   xdata : n x 2 numeric data matrix / data frame
#   alpha : confidence level, e.g. 0.95
conf.reg <- function(xdata, alpha) {
  if (ncol(xdata) != 2) stop("Only for bivariate normal")
  n <- nrow(xdata)
  xbar <- colMeans(xdata)
  S <- cov(xdata)
  es <- eigen(S)
  # Use full component names instead of partial matching ($vec / $val)
  e1 <- es$vectors %*% diag(sqrt(es$values))
  # Radius: sqrt(p(n-1)/(n(n-p)) * F_alpha(p, n-p)) with p = 2
  r1 <- sqrt(qf(alpha, 2, n - 2)) * sqrt(2 * (n - 1) / (n * (n - 2)))
  theta <- seq(0, 2 * pi, len = 250)
  v1 <- cbind(r1 * cos(theta), r1 * sin(theta))
  # The minus sign only reflects the ellipse, which is centrally symmetric
  pts <- t(xbar - (e1 %*% t(v1)))
  plot(pts, type = "l",
       main = "Confidence Region for Bivariate Normal",
       xlab = colnames(xdata)[1], ylab = colnames(xdata)[2], asp = 1)
  # Highlight the center
  segments(0, xbar[2], xbar[1], xbar[2], lty = 2)
  segments(xbar[1], 0, xbar[1], xbar[2], lty = 2)
  # Add the axes of the ellipse
  th2 <- c(0, pi / 2, pi, 3 * pi / 2, 2 * pi)
  v2 <- cbind(r1 * cos(th2), r1 * sin(th2))
  pts2 <- t(xbar - (e1 %*% t(v2)))
  segments(pts2[3, 1], pts2[3, 2], pts2[1, 1], pts2[1, 2], lty = 3)
  segments(pts2[2, 1], pts2[2, 2], pts2[4, 1], pts2[4, 2], lty = 3)
}
conf.reg(datatemp, alpha = 0.95)

# Compute 95% simultaneous (T^2) confidence intervals for the two means
p <- ncol(datatemp)
n <- nrow(datatemp)
S <- cov(datatemp)
xbar <- colMeans(datatemp)
# Hoist the common critical multiplier out of the four interval bounds
crit.t2 <- sqrt(((n - 1) * p / (n - p)) * qf(0.95, p, n - p))
mu1.L <- xbar[1] - crit.t2 * sqrt(S[1, 1] / n)
mu1.U <- xbar[1] + crit.t2 * sqrt(S[1, 1] / n)
mu2.L <- xbar[2] - crit.t2 * sqrt(S[2, 2] / n)
mu2.U <- xbar[2] + crit.t2 * sqrt(S[2, 2] / n)
c(mu1.L, mu1.U)
c(mu2.L, mu2.U)
# Overlay the T^2 intervals on the confidence ellipse
lines(c(mu1.L, mu1.L), c(0.53, mu2.U), lty = 2, col = 2, lwd = 2)
lines(c(mu1.U, mu1.U), c(0.53, mu2.U), lty = 2, col = 2, lwd = 2)
lines(c(0.49, mu1.U), c(mu2.L, mu2.L), lty = 2, col = 2, lwd = 2)
lines(c(0.49, mu1.U), c(mu2.U, mu2.U), lty = 2, col = 2, lwd = 2)

# Compute 95% Bonferroni confidence intervals for the two means
crit.bonf <- qt(0.05 / (2 * p), n - 1, lower.tail = FALSE)
mu1.LB <- xbar[1] - crit.bonf * sqrt(S[1, 1] / n)
mu1.UB <- xbar[1] + crit.bonf * sqrt(S[1, 1] / n)
mu2.LB <- xbar[2] - crit.bonf * sqrt(S[2, 2] / n)
mu2.UB <- xbar[2] + crit.bonf * sqrt(S[2, 2] / n)
c(mu1.LB, mu1.UB)
c(mu2.LB, mu2.UB)
# Plot the Bonferroni intervals together with the confidence ellipse
lines(c(mu1.LB, mu1.LB), c(0.53, mu2.UB), lty = 3, col = 3, lwd = 2)
lines(c(mu1.UB, mu1.UB), c(0.53, mu2.UB), lty = 3, col = 3, lwd = 2)
lines(c(0.49, mu1.UB), c(mu2.LB, mu2.LB), lty = 3, col = 3, lwd = 2)
lines(c(0.49, mu1.UB), c(mu2.UB, mu2.UB), lty = 3, col = 3, lwd = 2)

# Illustration of the critical distance multipliers for "one-at-a-time"
# and T2-intervals (cf. Table 5.3)
# ----------------------------------------------------------------
# Critical distance multipliers (cf. Table 5.3):
# compare the T^2 simultaneous multiplier with the one-at-a-time
# t multiplier (alpha = 0.05) for p = 2, 5, 10 over a range of n.
# The triplicated p = 2 / 5 / 10 code is replaced by one loop; the
# first panel is drawn with plot(), the rest overlaid with lines(),
# exactly as in the original.
# ----------------------------------------------------------------
for (i in seq_along(c(2, 5, 10))) {
  p <- c(2, 5, 10)[i]
  n <- (p + 1):100
  onetime <- qt(0.975, n - 1)
  simultan <- sqrt(((n - 1) * p / (n - p)) * qf(0.95, p, n - p))
  if (i == 1) {
    plot(n, simultan / onetime, xlim = c(0, 100), ylim = c(0, 6), type = "l",
         main = "Illustration of the critical distance multipliers")
  } else {
    lines(n, simultan / onetime, lty = i)
  }
}
legend(40, 6, legend = c("p=2", "p=5", "p=10"), lty = 1:3, lwd = 2)

# Illustration of length of Bonferroni intervals relative to length of
# T2-intervals (cf. Table 5.4), alpha = 0.05, m = p
for (i in seq_along(c(2, 5, 10))) {
  p <- c(2, 5, 10)[i]
  n <- (p + 1):100
  bonf <- qt(0.05 / (2 * p), n - 1, lower.tail = FALSE)
  simultan <- sqrt(((n - 1) * p / (n - p)) * qf(0.95, p, n - p))
  if (i == 1) {
    plot(n, bonf / simultan, xlim = c(0, 100), ylim = c(0, 1), type = "l",
         lwd = 2, main = "Illustration of Bonferroni intervals")
  } else {
    lines(n, bonf / simultan, lty = i, lwd = 2)
  }
}
legend(40, 0.4, legend = c("p=2", "p=5", "p=10"), lty = 1:3, lwd = 2)

### ----------------------------------------------------------------
### Maximum likelihood estimation of mean and covariance
### for the multivariate normal distribution by the EM algorithm,
### for demonstration purposes only.
###   xdata       : n x p numeric matrix; NA marks a missing entry
###   mu0, Sigma0 : starting values for the mean / covariance
### Returns list(mu, Sigma) with the ML (divide-by-n) estimates.
### ----------------------------------------------------------------
em <- function(xdata, mu0, Sigma0) {
  n <- nrow(xdata)
  p <- ncol(xdata)
  # Euclidean distance between successive (mu, Sigma) iterates,
  # used as the convergence criterion
  err <- function(mu0, Sigma0, mu1, Sigma1) {
    th0 <- c(mu0, as.vector(Sigma0))
    th1 <- c(mu1, as.vector(Sigma1))
    sqrt(sum((th0 - th1) * (th0 - th1)))
  }
  # Offset the "previous" iterate so the loop is entered at least once
  mu1 <- mu0 + 1
  Sigma1 <- Sigma0 + 1
  while (err(mu0, Sigma0, mu1, Sigma1) > 1e-6) {
    mu1 <- mu0
    Sigma1 <- Sigma0
    zdata <- xdata          # completed data: NAs imputed below
    Ai <- matrix(0, p, p)   # accumulated conditional-covariance correction
    for (i in seq_len(n)) {
      if (any(is.na(xdata[i, ]))) {
        zi <- xdata[i, ]
        na.idx <- which(is.na(zi))      # missing coordinates
        cs.idx <- seq_len(p)[-na.idx]   # observed coordinates
        Sigma012 <- Sigma0[na.idx, cs.idx, drop = FALSE]
        Sigma022.iv <- solve(Sigma0[cs.idx, cs.idx])
        # E-step: conditional mean of the missing part given the observed
        zdata[i, na.idx] <- mu0[na.idx] +
          (Sigma012 %*% Sigma022.iv) %*% (zi[cs.idx] - mu0[cs.idx])
        # Conditional covariance of the missing part (added to Ai)
        Ai[na.idx, na.idx] <- Ai[na.idx, na.idx] +
          Sigma0[na.idx, na.idx] - Sigma012 %*% Sigma022.iv %*% t(Sigma012)
      }
    }
    # M-step: ML updates (cov() divides by n-1, rescale to divide by n)
    mu0 <- colMeans(zdata)
    Sigma0 <- (n - 1) * cov(zdata) / n + Ai / n
  }
  return(list(mu = mu0, Sigma = Sigma0))
}

## A simulation example
library(MASS)
set.seed(1200)
mu0 <- c(1, 0, -1)
p <- 3
n <- 1000
Sig0 <- matrix(c(1, .7, .6,
                 .7, 1, .4,
                 .6, .4, 1), nrow = 3)
triv <- triv0 <- mvrnorm(n, mu0, Sig0)

### Missing completely at random (MCAR)
misp <- 0.2   # MCAR probability
misidx <- matrix(rbinom(3 * n, 1, misp) == 1, nrow = n)
triv[misidx] <- NA
# Exclude the cases whose entire rows were missed
er <- which(apply(apply(triv, 1, is.na), 2, sum) == 3)
if (length(er) >= 1) triv <- triv[-er, ]

## Summary statistics
triv.na <- apply(triv, 2, is.na)
apply(triv.na, 2, sum) / nrow(triv)   # proportion missing per variable
triv.cases <- apply(triv.na, 1, sum)
table(triv.cases)                     # missingness per case
com.cases <- triv[complete.cases(triv), ]

# The estimates based on the complete cases
mu <- colMeans(com.cases)
mu
mu0    # true values
Sig <- cov(com.cases)
Sig
Sig0   # true values
# Under MCAR both complete-case estimates seem to be consistent

# EM estimates
# Initial values
mu.ini <- rep(0, p)
Sigma.ini <- diag(p)
system.time(rlt <- em(triv, mu.ini, Sigma.ini))
# Better initial values by using the complete-case mu and Sig
system.time(rlt <- em(triv, mu, Sig))

### Missing at random (MAR)
# Generate MAR data: the missingness of x3 depends on x2
# triv = [x1, x2, x3]
triv <- triv0
x2 <- triv[, 2]
alpha <- 1 / (1 + exp(2 * x2))   # P(x3 observed) decreases with x2
# BUG FIX: sample size was hard-coded as 1000; use n so the scheme
# stays consistent if the simulation size is changed above.
x3.mar <- rbinom(n, 1, alpha)
table(x3.mar)
x3 <- triv[, 3]
x3[!x3.mar] <- NA
triv.mar <- cbind(triv[, 1:2], x3)

# Check the missingness scheme visually
mis.triv <- is.na(triv.mar)
a <- mis.triv[, 3] + 1   # colors: 1 = x3 observed, 2 = x3 missing
par(mfrow = c(1, 2))
# BUG FIX: the original plotted mis.triv[, 1] (x1 is never missing, so
# a constant 0) against the x3 missingness indicator while labelling
# the x-axis "x2". Plot the observed x2 values instead, which actually
# shows the MAR mechanism: x3 tends to be missing for larger x2.
plot(jitter(triv.mar[, 2]), jitter(as.numeric(mis.triv[, 3])),
     xlab = "x2", ylab = "x3 missing", col = a, pch = 19)
plot(triv[, 1:2], xlab = "x1", ylab = "x2", col = a, pch = 19)
# Complete-case estimates under the MAR mechanism
com.cases2 <- triv.mar[complete.cases(triv.mar), ]
mu <- colMeans(com.cases2)
print(mu)
print(mu0)    # true values
Sig <- cov(com.cases2)
print(Sig)
print(Sig0)   # true values
# Under MAR, both complete-case estimates are biased.

# EM estimates, starting from neutral initial values
mu.ini <- rep(0, p)
Sigma.ini <- diag(p)
system.time(rlt <- em(triv.mar, mu.ini, Sigma.ini))
print(rlt)

# Better initial values: reuse the complete-case mu and Sig
system.time(rlt <- em(triv.mar, mu, Sig))
print(rlt)