###------------------------------------------------------------
### MLEs for Multivariate Normal distribution

mu<-c(0,1)
Sigma<-matrix(c(1,0.5,0.5,1),2,2)
n<-1000

library(MASS)
biv<-mvrnorm(n,mu,Sigma)
Sbiv <- cov(biv)        #sample covariance
mu<-colMeans(biv)     #mle for mean
Sigma<-(n-1)*Sbiv/n   #mle for covariance
mu
Sigma
vars<-diag(sqrt(1/diag(Sigma)))
Rhat<-vars%*%Sigma%*%vars  #mle for correlation
Rhat
cor(biv)   # for comparison purpose


#hypothesis testing H0: rho12=0

#mle
rho.hat<-Rhat[1,2]
#test statisitc
t.stat<-rho.hat*sqrt((n-2)/(1-rho.hat^2))
#p vlaue
2-2*pt(abs(t.stat),df=n-2) 


#confidence interval

zl<-rho.hat-qnorm(0.975)/sqrt(n-3)
zu<-rho.hat+qnorm(0.975)/sqrt(n-3)
lower<-(exp(2*zl)-1)/(exp(2*zl)+1)
upper<-(exp(2*zu)-1)/(exp(2*zu)+1)
print(c(lower,upper))





###Wishart distribution

S <- toeplitz((10:1)/10)
set.seed(11)
R <- rWishart(1000, df=20, Sigma=S)
dim(R)  #  10 10  1000
mR <- apply(R, 1:2, mean)  # ~= E[ Wish(S, 20) ] = 20 * S

###---------------------------------------------------------------
### Wishart distribution

#A test variance covariance matrix
Sig <- matrix(c(1, .7, .6, .7, 1, .4, .6, .4, 1), nrow = 3)
p<-dim(Sig)[1]
df<-4
Sig.vec<-c(Sig)

#empirical (Using simulation)
set.seed(9524)
reps<-100000 #number of obs in our sampling dist
W.empr<-rWishart(reps,df=df,Sigma=Sig)
W.empr.vec<-aperm(W.empr,c(3,2,1))  #convert tha array to matrix
dim(W.empr.vec)<-c(reps,p*p)

round(apply(W.empr.vec,2,mean),2)  #sample mean
df*Sig.vec     #theoretical mean

#comparet the variance
Eij<-function(n,i,j){
	Eij<- matrix(0,n,n)
	Eij[i,j]<-1
	Eij
}
K<-matrix(0,p*p,p*p)
for(i in 1:p)
 for(j in 1:p) {
  EIJ<-Eij(p,i,j)
K<-K+kronecker(EIJ,t(EIJ))
 }

W.theor<-df*(diag(p*p)+K)%*%kronecker(Sig,Sig)
 
round(cov(W.empr.vec),2)  #sample covriance of Wishart samples
W.theor 


###-----------------------------------------------------------------------------------
### Test of Normality

# Test of Normality for univariate 

# QQplot

par(mfrow=c(1,3));set.seed(100);
x<-rnorm(50);qqnorm(x);qqline(x);legend("topleft",legend=c("n=50"));
x<-rnorm(100);qqnorm(x);qqline(x);legend("topleft",legend=c("n=100"));
x<-rnorm(1000);qqnorm(x);qqline(x);legend("topleft",legend=c("n=1000"));


##T4-1 data
mdat <- read.table(file ="datafiles/T4-1.dat",header=F)
 
#  Assign labels to the variables (columns 
#  of the data frame)
mdat<-cbind(1:nrow(mdat),mdat)
names(mdat) <- c('Oven','Radiation')

head(mdat)
 
##  Compute the sample mean and the
## sample cvariance matrix and print 
## the results

mmean<-mean(mdat$Radiation)
mmean
mvar<-var(mdat$Radiation)
mvar


## Create a normal probability plot. 

qqnorm(mdat$Radiation, pch = 20, main="Normal Probability Plot") 
qqline(mdat$Radiation)

## use qqtest package 
par(mfrow=c(1,2)); library(qqtest);
qqtest(mdat$Radiation,main="Radiation");
 


install.packages("nortest")
library(nortest)
library(help='nortest')
#  Compute the Shapiro-Wilks' test statistic
shapiro.test(mdat$Radiation)


# Compute natural logarithm of radiation values
mdat$logRad <- log(mdat$Radiation)	

hist(mdat$logRad)
  
# Compute Q-Q plot and value of Shapiro-Wilk test
# for the natural logaritm of the radiation values
		
qqnorm(mdat$logRad, pch = 19, main="Normal Probability Plot") 
qqline(mdat$logRad)

qqtest(mdat$logRad,main="Log Radiation");


shapiro.test(mdat$logRad)
lillie.test(mdat$logRad)
 


#Test normality for multivariates
##Chi-squre Q-Q plot
mu<-c(0,1)
Sigma<-matrix(c(1,0.5,0.5,1),2,2)
n<-1000

library(MASS)
biv<-mvrnorm(n,mu,Sigma)
Sbiv <- cov(biv)
D2 <- mahalanobis(biv, colMeans(biv), Sbiv)

qqplot(qchisq(ppoints(n), df = 2), D2,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]))
abline(c(0,1))

#
chiqqplot<-function(x){
	if(!is.matrix(x))
	x<-as.matrix(x)
	n<-nrow(x)
	p<-ncol(x)
	D2<-mahalanobis(x,colMeans(x),cov(x))
	qqplot(qchisq(ppoints(n), df = p), D2,xlab="Theoretical Q of Chisquare", ylab="Mah distance",
       main = expression("Q-Q plot for" ~~ {chi^2}[p]),pch=19)
abline(c(0,1))
}

chiqqplot(mdat$logRad)


##Multiple hypothesis testing
testnormality <- function(X, numproj = 1000)
  {
    p <- ncol(X)
    n <- nrow(X)
    x <- matrix(rnorm(numproj * p), nrow = p)
     # generate 1,000 standard p-variate normal random variables.
    y <- matrix(sqrt(apply(x^2, 2, sum)), nrow = p, ncol = numproj, by = T)
    z <- x / y
    tempdat <- as.matrix(X) %*% z
    # this gives rise to a p x numproj matrix
    # perform Shapiro-Wilks' test and calculate individual p-values on each
    # of the numproj observation sets.
    pvals<-as.numeric(matrix(unlist(apply(tempdat,2,shapiro.test)),ncol=4,
            by =T)[,2])
    # multiple hypthesis testing
     return(min(p.adjust(pvals,method="BH")))
  }
#
system.time(pvals <- testnormality(mdat, numproj =10000))
pvals


#simulated bivarite normal data
system.time(pvals <- testnormality(biv, numproj =10000))
pvals



##Energy Statistics
##download energy binary for windows/mac at http://cran.us.r-project.org/web/packages/energy/index.html
##install the downloaded binary package locally

library(energy)
mvnorm.etest(mdat)


## compute normality test statistics for iris Setosa data
data(iris)
mvnorm.e(iris[1:50, 1:4])
normal.e(iris[1:50, 1])
## test if the iris Setosa data has multivariate normal distribution
mvnorm.etest(iris[1:50,1:4], R = 199)
## test a univariate sample for normality
x <- runif(50, 0, 10)
mvnorm.etest(x, R = 199)




##Example 5.2, test of Normality

x<-read.table("T5-1.dat")
qqnorm(x[,1])
qqnorm(x[,2])
qqnorm(x[,3])

system.time(pvals <- testnormality(mdat))
pvals

mvnorm.etest(x, R = 199)

###------------------------------------------------------------------------------------------
### outlier detection with multivariate normality assumed

install.packages("mvoutlier")
library(mvoutlier)
library(help='mvoutlier')
 
 
 
load(file = "3dExample.rda")
pairs(dat)
## introduce outlier
outFactor <- 1.5
dat1 <- rbind(dat, outFactor*c(-1,-1.2,0.7))
pairs(dat)
pairs(dat1, col = c(rep(1,300), 2), pch = c(rep(1,300), 3), cex = c(rep(1,300), 2))

library(mvoutlier)
chisq.plot(dat)
abline(c(0,1))

library(rgl)
plot3d(dat1, col = c(rep(1,300), 2))
 
## automatic detection
aq.plot(dat)



###-----------------------------------------------------------------------------------------
###Transformations for normality

#box-cox tranformtaion

install.packages("car")
library(car)
m1 <- read.table(file ="datafiles/T4-1.dat",header=F)#microwave.door.close
m2<-read.table("datafiles/T4-5.dat",header=F) #microwave.door.open
mdat1<-as.matrix(cbind(m1,m2))
colnames(mdat1)<-c("close","open") 
bc<-powerTransform(mdat1~1)  #find the optimal box-cox parameter vector lambda 
summary(bc)
bc.mdat<-bcPower(mdat1,bc$lambda)  #save the transformed values

#check the normality
plot(bc.mdat)
chiqqplot(bc.mdat)

mvnorm.etest(bc.mdat,R=999)
#nonparametric density estimation
f1<-kde2d(bc.mdat[,1],bc.mdat[,2], h = c(width.SJ(bc.mdat[,1]), width.SJ(bc.mdat[,2])))
persp(f1, phi = 30, theta = 20, d = 5)


#comparing 
bc.close<-powerTransform(mdat1[,1])
bc.nclose<-bcPower(mdat1[,1],bc.close$lambda)
bc.open<-powerTransform(mdat1[,2])
bc.nopen<-bcPower(mdat1[,2],bc.open$lambda)
bc.ndat<-cbind(bc.nclose,bc.nopen)

#compare the two vecters of lambda
bc$lambda
c(bc.close$lambda,bc.open$lambda)

#check the normality
plot(bc.ndat)
chiqqplot(bc.ndat)

mvnorm.etest(bc.ndat,R=999)
#nonparametric density estimation
f2<-kde2d(bc.ndat[,1],bc.ndat[,2], h = c(width.SJ(bc.ndat[,1]), width.SJ(bc.ndat[,2])))
persp(f2, phi = 30, theta = 20, d = 5)