###------------------------------------------------------------ ### MLEs for Multivariate Normal distribution mu<-c(0,1) Sigma<-matrix(c(1,0.5,0.5,1),2,2) n<-1000 library(MASS) biv<-mvrnorm(n,mu,Sigma) Sbiv <- cov(biv) #sample covariance mu<-colMeans(biv) #mle for mean Sigma<-(n-1)*Sbiv/n #mle for covariance mu Sigma vars<-diag(sqrt(1/diag(Sigma))) Rhat<-vars%*%Sigma%*%vars #mle for correlation Rhat cor(biv) # for comparison purpose #hypothesis testing H0: rho12=0 #mle rho.hat<-Rhat[1,2] #test statisitc t.stat<-rho.hat*sqrt((n-2)/(1-rho.hat^2)) #p vlaue 2-2*pt(abs(t.stat),df=n-2) #confidence interval zl<-rho.hat-qnorm(0.975)/sqrt(n-3) zu<-rho.hat+qnorm(0.975)/sqrt(n-3) lower<-(exp(2*zl)-1)/(exp(2*zl)+1) upper<-(exp(2*zu)-1)/(exp(2*zu)+1) print(c(lower,upper)) ###Wishart distribution S <- toeplitz((10:1)/10) set.seed(11) R <- rWishart(1000, df=20, Sigma=S) dim(R) # 10 10 1000 mR <- apply(R, 1:2, mean) # ~= E[ Wish(S, 20) ] = 20 * S ###--------------------------------------------------------------- ### Wishart distribution #A test variance covariance matrix Sig <- matrix(c(1, .7, .6, .7, 1, .4, .6, .4, 1), nrow = 3) p<-dim(Sig)[1] df<-4 Sig.vec<-c(Sig) #empirical (Using simulation) set.seed(9524) reps<-100000 #number of obs in our sampling dist W.empr<-rWishart(reps,df=df,Sigma=Sig) W.empr.vec<-aperm(W.empr,c(3,2,1)) #convert tha array to matrix dim(W.empr.vec)<-c(reps,p*p) round(apply(W.empr.vec,2,mean),2) #sample mean df*Sig.vec #theoretical mean #comparet the variance Eij<-function(n,i,j){ Eij<- matrix(0,n,n) Eij[i,j]<-1 Eij } K<-matrix(0,p*p,p*p) for(i in 1:p) for(j in 1:p) { EIJ<-Eij(p,i,j) K<-K+kronecker(EIJ,t(EIJ)) } W.theor<-df*(diag(p*p)+K)%*%kronecker(Sig,Sig) round(cov(W.empr.vec),2) #sample covriance of Wishart samples W.theor ###----------------------------------------------------------------------------------- ### Test of Normality # Test of Normality for univariate # QQplot par(mfrow=c(1,3));set.seed(100); x<-rnorm(50);qqnorm(x);qqline(x);legend("topleft",legend=c("n=50")); x<-rnorm(100);qqnorm(x);qqline(x);legend("topleft",legend=c("n=100")); x<-rnorm(1000);qqnorm(x);qqline(x);legend("topleft",legend=c("n=1000")); ##T4-1 data mdat <- read.table(file ="datafiles/T4-1.dat",header=F) # Assign labels to the variables (columns # of the data frame) mdat<-cbind(1:nrow(mdat),mdat) names(mdat) <- c('Oven','Radiation') head(mdat) ## Compute the sample mean and the ## sample cvariance matrix and print ## the results mmean<-mean(mdat$Radiation) mmean mvar<-var(mdat$Radiation) mvar ## Create a normal probability plot. qqnorm(mdat$Radiation, pch = 20, main="Normal Probability Plot") qqline(mdat$Radiation) ## use qqtest package par(mfrow=c(1,2)); library(qqtest); qqtest(mdat$Radiation,main="Radiation"); install.packages("nortest") library(nortest) library(help='nortest') # Compute the Shapiro-Wilks' test statistic shapiro.test(mdat$Radiation) # Compute natural logarithm of radiation values mdat$logRad <- log(mdat$Radiation) hist(mdat$logRad) # Compute Q-Q plot and value of Shapiro-Wilk test # for the natural logaritm of the radiation values qqnorm(mdat$logRad, pch = 19, main="Normal Probability Plot") qqline(mdat$logRad) qqtest(mdat$logRad,main="Log Radiation"); shapiro.test(mdat$logRad) lillie.test(mdat$logRad) #Test normality for multivariates ##Chi-squre Q-Q plot mu<-c(0,1) Sigma<-matrix(c(1,0.5,0.5,1),2,2) n<-1000 library(MASS) biv<-mvrnorm(n,mu,Sigma) Sbiv <- cov(biv) D2 <- mahalanobis(biv, colMeans(biv), Sbiv) qqplot(qchisq(ppoints(n), df = 2), D2, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2])) abline(c(0,1)) # chiqqplot<-function(x){ if(!is.matrix(x)) x<-as.matrix(x) n<-nrow(x) p<-ncol(x) D2<-mahalanobis(x,colMeans(x),cov(x)) qqplot(qchisq(ppoints(n), df = p), D2,xlab="Theoretical Q of Chisquare", ylab="Mah distance", main = expression("Q-Q plot for" ~~ {chi^2}[p]),pch=19) abline(c(0,1)) } chiqqplot(mdat$logRad) ##Multiple hypothesis testing testnormality <- function(X, numproj = 1000) { p <- ncol(X) n <- nrow(X) x <- matrix(rnorm(numproj * p), nrow = p) # generate 1,000 standard p-variate normal random variables. y <- matrix(sqrt(apply(x^2, 2, sum)), nrow = p, ncol = numproj, by = T) z <- x / y tempdat <- as.matrix(X) %*% z # this gives rise to a p x numproj matrix # perform Shapiro-Wilks' test and calculate individual p-values on each # of the numproj observation sets. pvals<-as.numeric(matrix(unlist(apply(tempdat,2,shapiro.test)),ncol=4, by =T)[,2]) # multiple hypthesis testing return(min(p.adjust(pvals,method="BH"))) } # system.time(pvals <- testnormality(mdat, numproj =10000)) pvals #simulated bivarite normal data system.time(pvals <- testnormality(biv, numproj =10000)) pvals ##Energy Statistics ##download energy binary for windows/mac at http://cran.us.r-project.org/web/packages/energy/index.html ##install the downloaded binary package locally library(energy) mvnorm.etest(mdat) ## compute normality test statistics for iris Setosa data data(iris) mvnorm.e(iris[1:50, 1:4]) normal.e(iris[1:50, 1]) ## test if the iris Setosa data has multivariate normal distribution mvnorm.etest(iris[1:50,1:4], R = 199) ## test a univariate sample for normality x <- runif(50, 0, 10) mvnorm.etest(x, R = 199) ##Example 5.2, test of Normality x<-read.table("T5-1.dat") qqnorm(x[,1]) qqnorm(x[,2]) qqnorm(x[,3]) system.time(pvals <- testnormality(mdat)) pvals mvnorm.etest(x, R = 199) ###------------------------------------------------------------------------------------------ ### outlier detection with multivariate normality assumed install.packages("mvoutlier") library(mvoutlier) library(help='mvoutlier') load(file = "3dExample.rda") pairs(dat) ## introduce outlier outFactor <- 1.5 dat1 <- rbind(dat, outFactor*c(-1,-1.2,0.7)) pairs(dat) pairs(dat1, col = c(rep(1,300), 2), pch = c(rep(1,300), 3), cex = c(rep(1,300), 2)) library(mvoutlier) chisq.plot(dat) abline(c(0,1)) library(rgl) plot3d(dat1, col = c(rep(1,300), 2)) ## automatic detection aq.plot(dat) ###----------------------------------------------------------------------------------------- ###Transformations for normality #box-cox tranformtaion install.packages("car") library(car) m1 <- read.table(file ="datafiles/T4-1.dat",header=F)#microwave.door.close m2<-read.table("datafiles/T4-5.dat",header=F) #microwave.door.open mdat1<-as.matrix(cbind(m1,m2)) colnames(mdat1)<-c("close","open") bc<-powerTransform(mdat1~1) #find the optimal box-cox parameter vector lambda summary(bc) bc.mdat<-bcPower(mdat1,bc$lambda) #save the transformed values #check the normality plot(bc.mdat) chiqqplot(bc.mdat) mvnorm.etest(bc.mdat,R=999) #nonparametric density estimation f1<-kde2d(bc.mdat[,1],bc.mdat[,2], h = c(width.SJ(bc.mdat[,1]), width.SJ(bc.mdat[,2]))) persp(f1, phi = 30, theta = 20, d = 5) #comparing bc.close<-powerTransform(mdat1[,1]) bc.nclose<-bcPower(mdat1[,1],bc.close$lambda) bc.open<-powerTransform(mdat1[,2]) bc.nopen<-bcPower(mdat1[,2],bc.open$lambda) bc.ndat<-cbind(bc.nclose,bc.nopen) #compare the two vecters of lambda bc$lambda c(bc.close$lambda,bc.open$lambda) #check the normality plot(bc.ndat) chiqqplot(bc.ndat) mvnorm.etest(bc.ndat,R=999) #nonparametric density estimation f2<-kde2d(bc.ndat[,1],bc.ndat[,2], h = c(width.SJ(bc.ndat[,1]), width.SJ(bc.ndat[,2]))) persp(f2, phi = 30, theta = 20, d = 5)