################################################################################ # # The code implementing the SVD method of Kong et al. is also provided # Caution: Both the simulated data and the diabetes data use N1=N2 # This code is not copyrighted in solidarity with open source hackers everywhere. # MASS is needed for mvrnorm # clusterGeneration is needed for rcorrmatrix # Generate data under the assumption of a correlation matrix library(MASS) library(clusterGeneration) # Function to Winsorize the data matrices. Y = data matrix; Z = MAD multiple scrubber <- function(y,z){ for (i in 1:dim(y)[2]){ lowerco <- median(y[,i]) - z*mad(y[,i]) upperco <- median(y[,i]) + z*mad(y[,i]) y[y[,i]<= lowerco,i] <- lowerco y[y[,i]>= upperco,i] <- upperco } return(y) } # Production Version of 1/p eigenvalue to determine the regularization alpha weight alpha_weight <- function(y){ alpha_test <- 1; alpha_hat <- 1; step.size <- 0.01 identity <- diag(1,nrow = ncol(y)) p.var.ts <- 1/(1*ncol(y)) small_ev <- 1e-8 while ((alpha_test>=0) & (small_ev <= p.var.ts)){ newsigma <- alpha_test*y + (1-alpha_test)*identity groupeigen <- eigen(newsigma,symmetric=TRUE,only.values=TRUE) small_ev <- min(groupeigen$values[groupeigen$values>0]) if (small_ev >= p.var.ts) {alpha_hat <- alpha_test} alpha_test <- alpha_test - step.size } return(alpha_hat) } # Make data routine; used to simulate X ~ MVN(mu1,sigma) makedata <- function(a,b,c0,d1,d2){ # a = w/i group sample size; b = No. of variables; c0 = eigen.vec constant; d1/2 = eigen.vec corr.mat <- rcorrmatrix(b) g1_mu <- rep(0,b) g1_stdev <- diag(sqrt(rep(1,b)),nrow=b) g1_varcov <- g1_stdev%*%corr.mat%*%g1_stdev #g1_varcov <- diag(1,nrow = b) g2_stdev <- g1_stdev #g2_stdev <- diag(runif(b, min = 1, max = 3),nrow= b) g2_varcov <- g2_stdev%*%corr.mat%*%g2_stdev corrm.eig <- eigen(corr.mat) # Group 2 mean separation g2_mu <- c0*(d1*corrm.eig$values[1]*corrm.eig$vectors[,1] + d2*corrm.eig$values[floor(b/3)]*corrm.eig$vectors[,floor(b/3)]) g1 <- mvrnorm(n=a,g1_mu,g1_varcov) g2 <- mvrnorm(n=a,g2_mu,g2_varcov) combopack <- rbind(g1,g2,g1_mu,g2_mu) return(combopack) } # Regularized Hotelling's T^2 reghotT <- function(x,y){ g1_m <- apply(x,2,mean) g2_m <- apply(y,2,mean) n1 <- dim(x)[1] n2 <- dim(y)[1] g1g2delta <- g1_m - g2_m #groupmix <- (1/n1)*cov(x) + (1/n2)*cov(y) groupmix <-((1/n1) + (1/n2))*((n1-1)*cov(x) + (n2-1)*cov(y))/(n1+n2-2) groupalpha <- alpha_weight(groupmix) newsigma <- groupalpha*groupmix + (1-groupalpha)*diag(1,nrow=dim(x)[2]) ht_hat <- t(g1g2delta)%*%try(solve(newsigma),silent=TRUE)%*%g1g2delta return(c(ht_hat,groupalpha)) } # Kong, Pu, Park Projection (KPP) # from: "A multivariate approach for integrating genome-wide expression data and biological knowledge", Sek Kong, # William Pu, and Peter Park, Bioinformatics, Vol. 22, No. 19, 2006, pp. 2373-2380. sph.hotelling <- function(x, group = group, ...){ x <- as.matrix(x) tol <- 10^-4 n <- nrow(x) p <- ncol(x) g <- as.factor(group) counts <- as.vector(table(g)) proportions <- counts/n ng <- length(proportions) group.means <- tapply(x, list(rep(g, p), col(x)), mean) f1 <- sqrt(diag(var(x - group.means[g, ]))) scaling <- diag(1/f1,,p) fac <- 1/(n-ng) X <- sqrt(fac) * (x - group.means[g, ]) %*% scaling X.s <- svd(X, nu = 0) rank <- sum(X.s$d > tol) scaling2 <- scaling %*% X.s$v[, 1:rank] %*% diag(1/X.s$d[1:rank],,rank) x.sphered <- x %*% scaling2 x <- x.sphered M.1 <- x[group == 0,] M.2 <- x[group == 1,] xbar1 <- apply(M.1, 2, mean) N1 <- dim(M.1)[1] xbar2 <- apply(M.2, 2, mean) N2 <- dim(M.2)[1] dbar <- xbar2 - xbar1 vt <- ((N1-1)*var(M.1)+(N2-1)*var(M.2))/(N1+N2-2) t2 <- N1*N2*dbar%*%diag(1/diag(vt))%*%dbar/(N1+N2) return(t2) } # Regularization Permutation Test permHotT <- function(x,y,HotT,n){ blendg1g2 <- rbind(x,y) no.row.x <- dim(x)[1] getsample <- seq(1:dim(blendg1g2)[1]) permutedstat <- numeric() for (k in 1:n){ getg1 <- sample(getsample,no.row.x,replace=FALSE) g1prm <- blendg1g2[getg1,] g2prm <- blendg1g2[-getg1,] answer <- reghotT(g1prm,g2prm) permutedstat <- c(permutedstat,answer[1]) } perm.p.value <- sum(permutedstat>as.numeric(HotT))/length(permutedstat) return(perm.p.value) } # KPP Permutation Test # This was written to deal with the need to pass a binary phenotype indicator permKPP <- function(x,y,HotT,n){ blendg1g2 <- rbind(x,y) no.row.x <- dim(x)[1] no.row.y <- dim(y)[1] group <- c(rep(0,no.row.x),rep(1,no.row.y)) getsample <- seq(1:dim(blendg1g2)[1]) permutedstat <- numeric() for (k in 1:n){ getg1 <- sample(getsample,no.row.x,replace=FALSE) g1prm <- blendg1g2[getg1,] g2prm <- blendg1g2[-getg1,] groupperm <- rbind(g1prm,g2prm) answer <- sph.hotelling(groupperm,group) permutedstat <- c(permutedstat,answer) } perm.p.value <- sum(permutedstat>as.numeric(HotT))/length(permutedstat) return(perm.p.value) } ################################################################################ # # Simulation parameters to toggle # # Winsorize multiple (ex. 3xMAD) # scrubmult <- 3 # Number of variables/subgroup size. # no.var <- 10; subgroup.n <- 20 # Number of permutations; Number of simulations # perm.no <- 10000; no.simul <- 100 # Make data parameters to streamline simulations # c0 <- 0; d1 <- 1; d2 <- 0 # ################################################################################ ################################################################################ ################################################################################ # Simulation to explore Type I error and power of Regularization and KPP # Assumes N1=N2 for each group. power.sim <- function(no.var,subgroup.n,perm.no,no.simul,c0,d1,d2){ simulation.store <- matrix(nrow=no.simul,ncol=2) for (kk in 1:no.simul){ group1and2 <- makedata(subgroup.n,no.var,c0=c0,d1=d1,d2=d2) group1 <- group1and2[1:subgroup.n,] group2 <- group1and2[(subgroup.n+1):(2*subgroup.n),] g1_mu <- group1and2[2*subgroup.n+1,] g2_mu <- group1and2[2*subgroup.n+2,] group <- c(rep(0,subgroup.n),rep(1,subgroup.n)) KPPdata <- rbind(group1,group2) KPPstat <- sph.hotelling(KPPdata,group) Hstat_alpha <- reghotT(group1,group2) simulation.store[kk,1] <- permHotT(group1,group2,Hstat_alpha[1],perm.no) simulation.store[kk,2] <- permKPP(group1,group2,KPPstat,perm.no) } return(simulation.store) } ################################################################################ ################################################################################ # Subset of the simulated conditions performed power10_10_0_F <- power.sim(no.var=10,subgroup.n=10,perm.no=10000,no.simul=100,c0=0,d1=1,d2=0) power10_10_25_F <- power.sim(no.var=10,subgroup.n=10,perm.no=10000,no.simul=100,c0=0.25,d1=1,d2=0) power10_10_25_S <- power.sim(no.var=10,subgroup.n=10,perm.no=10000,no.simul=100,c0=0.25,d1=0,d2=1) power10_10_50_F <- power.sim(no.var=10,subgroup.n=10,perm.no=10000,no.simul=100,c0=0.5,d1=1,d2=0) power10_10_50_S <- power.sim(no.var=10,subgroup.n=10,perm.no=10000,no.simul=100,c0=0.5,d1=0,d2=1) power10_20_0_F <- power.sim(no.var=10,subgroup.n=20,perm.no=10000,no.simul=100,c0=0,d1=1,d2=0) power10_20_25_F <- power.sim(no.var=10,subgroup.n=20,perm.no=10000,no.simul=100,c0=0.25,d1=1,d2=0) power10_20_25_S <- power.sim(no.var=10,subgroup.n=20,perm.no=10000,no.simul=100,c0=0.25,d1=0,d2=1) power10_20_50_F <- power.sim(no.var=10,subgroup.n=20,perm.no=10000,no.simul=100,c0=0.5,d1=1,d2=0) power10_20_50_S <- power.sim(no.var=10,subgroup.n=20,perm.no=10000,no.simul=100,c0=0.5,d1=0,d2=1) power10_50_0_F <- power.sim(no.var=10,subgroup.n=50,perm.no=10000,no.simul=100,c0=0,d1=1,d2=0) power10_50_25_F <- power.sim(no.var=10,subgroup.n=50,perm.no=10000,no.simul=100,c0=0.25,d1=1,d2=0) power10_50_25_S <- power.sim(no.var=10,subgroup.n=50,perm.no=10000,no.simul=100,c0=0.25,d1=0,d2=1) power10_50_50_F <- power.sim(no.var=10,subgroup.n=50,perm.no=10000,no.simul=100,c0=0.5,d1=1,d2=0) power10_50_50_S <- power.sim(no.var=10,subgroup.n=50,perm.no=10000,no.simul=100,c0=0.5,d1=0,d2=1)