Last updated: 2018-08-01

workflowr checks: (Click a bullet for more information)
Expand here to see past versions:


Install packages

#devtools
library(devtools)
#iasva
devtools::install_github("UcarLab/IA-SVA")
#iasvaExamples  
devtools::install_github("dleelab/iasvaExamples")

Load packages

rm(list=ls())
library(irlba) # partial SVD, the augmented implicitly restarted Lanczos bidiagonalization algorithm
library(iasva)
library(iasvaExamples)
library(sva)
library(polyester)
library(corrplot)
library(DescTools) #pcc i.e., Pearson's contingency coefficient
library(RColorBrewer)
library(reshape2)
library(ggplot2)
library(SummarizedExperiment)

color.vec <- brewer.pal(9, "Set1")

iasva.tmp <- function(Y, X, intercept=TRUE, num.sv=NULL, permute=TRUE, num.p=100, sig.cutoff= 0.05, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){
  cat("IA-SVA running...")
  sv <- NULL
  pc.stat.obs <- NULL
  pval <- NULL
  wgt.mat <- NULL
  rsq <- NULL
  isv <- 0
  while(TRUE){
    if(!is.null(num.sv)){
      if(isv==num.sv){ 
        break
      }
    }
    iasva.res <- iasva.unit.tmp(Y, X, intercept, permute, num.p, threads, num.sv.permtest, tol, verbose)
    if(iasva.res$pval < sig.cutoff){
      sv <- cbind(sv, iasva.res$sv)
      pc.stat.obs <- cbind(pc.stat.obs, iasva.res$pc.stat.obs)
      pval <- c(pval, iasva.res$pval)
      wgt.mat <- cbind(wgt.mat, iasva.res$wgt)
      X <- cbind(X,iasva.res$sv)
    } else { break }
    isv <- isv+1
    cat(paste0("\nSV",isv, " Detected!"))
  }
  if (isv > 0) {
    colnames(sv) <- paste0("SV", 1:ncol(sv))
    cat(paste0("\n# of significant surrogate variables: ",length(pval)))
    return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, n.sv=length(pval), wgt.mat=wgt.mat))
  } else {
    cat ("\nNo significant surrogate variables")
  }
}

iasva.unit.tmp <- function(Y, X, intercept=TRUE, permute=TRUE, num.p=100, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){
  if(min(Y)<0){ Y <- Y + abs(min(Y)) }
  lY <- log(Y+1)
  if(intercept){
    fit <- .lm.fit(cbind(1,X), lY)
  } else {
    fit <- .lm.fit(X, lY)
  }
  resid <- resid(fit)
  tresid = t(resid)
  
  if(verbose) {cat("\n Perform SVD on residuals")}
  svd_pca <- irlba::irlba(tresid-rowMeans(tresid), 1, tol=tol)
  
  if(verbose) {cat("\n Regress residuals on PC1")}
  fit <- .lm.fit(cbind(1,svd_pca$v[,1]), resid)
  
  if(verbose) {cat("\n Get Rsq")}
  rsq.vec <- calc.rsq(resid, fit)
  
  if(verbose) {cat("\n Rsq 0-1 Normalization")}
  rsq.vec[is.na(rsq.vec)] <- min(rsq.vec, na.rm=TRUE)
  wgt <- (rsq.vec-min(rsq.vec))/(max(rsq.vec)-min(rsq.vec)) #0-1 normalization
  
  if(verbose) {cat("\n Obtain weighted log-transformed read counts")}
  tlY = t(lY)*wgt # weigh each row (gene) with respect to its Rsq value.
  
  if(verbose) {cat("\n Perform SVD on weighted log-transformed read counts")}
  sv <- irlba::irlba(tlY-rowMeans(tlY), 1, tol=tol)$v[,1]
  
  if(permute==TRUE){
    if(verbose) {cat("\n Assess the significance of the contribution of SV")}
    if(is.null(num.sv.permtest)){
      svd.res.obs <- svd(tresid - rowMeans(tresid))
    } else {
      svd.res.obs <- irlba::irlba(tresid-rowMeans(tresid), num.sv.permtest, tol=tol)
    }

    pc.stat.obs <- svd.res.obs$d[1]^2/sum(svd.res.obs$d^2)
    if(verbose) {cat("\n PC test statistic value:", pc.stat.obs)}
    
    # Generate an empirical null distribution of the PC test statistic.
    pc.stat.null.vec <- rep(0, num.p)
    permute.svd <- permute.svd.factory(lY, X, num.sv.permtest, tol, verbose)
    if (threads > 1) {
      threads <- min(threads, parallel::detectCores()-1)
      cl <- parallel::makeCluster(threads)
      pc.stat.null.vec <- tryCatch(parallel::parSapply(cl, 1:num.p, permute.svd), error=function(err){parallel::stopCluster(cl); stop(err)})
      parallel::stopCluster(cl)
    } else {
      pc.stat.null.vec <- sapply(1:num.p, permute.svd)
    }
    if(verbose) {cat("\n Empirical null distribution of the PC statistic:", sort(pc.stat.null.vec))}
    pval <- sum(pc.stat.obs <= pc.stat.null.vec)/(num.p+1)
    if(verbose) {cat("\n Permutation p-value:", pval)}
  } else {
    pc.stat.obs <- -1
    pval <- -1
  }
  return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, wgt=wgt))
}

calc.rsq <- function(resid, fit) {
  RSS <- colSums(resid(fit)^2)
  TSS <- colSums(t(t(resid) - colSums(resid)/ncol(resid)) ^ 2)  # vectorized
  # TSS <- colSums(sweep(resid, 2, mean, "-") ^ 2)  # alt-2
  return(1-(RSS/(nrow(resid)-2))/(TSS/(nrow(resid)-1)))
}

permute.svd.factory <- function(lY, X, num.sv.permtest, tol, verbose) {
  
  permute.svd <- function(i) {
    permuted.lY <- apply(t(lY), 1, sample, replace=FALSE)
    tresid.null <- t(resid(.lm.fit(cbind(1,X), permuted.lY)))
    if(is.null(num.sv.permtest)) {
      svd.res.null <- svd(tresid.null)
    } else {
      svd.res.null <- irlba::irlba(tresid.null, num.sv.permtest, tol=tol)
    }
    return(svd.res.null$d[1]^2/sum(svd.res.null$d^2))
  }
  
  return(permute.svd)
}

Load the islet single cell RNA-Seq data

data("Lawlor_Islet_scRNAseq_Read_Counts")
data("Lawlor_Islet_scRNAseq_Annotations")
ls()
[1] "calc.rsq"                          "color.vec"                        
[3] "iasva.tmp"                         "iasva.unit.tmp"                   
[5] "Lawlor_Islet_scRNAseq_Annotations" "Lawlor_Islet_scRNAseq_Read_Counts"
[7] "permute.svd.factory"              
counts <- Lawlor_Islet_scRNAseq_Read_Counts
anns <- Lawlor_Islet_scRNAseq_Annotations
dim(anns)
[1] 638  26
dim(counts)
[1] 26542   638
summary(anns)
     run             cell.type             COL1A1          INS       
 Length:638         Length:638         Min.   :1.00   Min.   :1.000  
 Class :character   Class :character   1st Qu.:1.00   1st Qu.:1.000  
 Mode  :character   Mode  :character   Median :1.00   Median :1.000  
                                       Mean   :1.03   Mean   :1.414  
                                       3rd Qu.:1.00   3rd Qu.:2.000  
                                       Max.   :2.00   Max.   :2.000  
                                                                     
     PRSS1            SST             GCG            KRT19      
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.038   Mean   :1.039   Mean   :1.375   Mean   :1.044  
 3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.000  
 Max.   :2.000   Max.   :2.000   Max.   :2.000   Max.   :2.000  
                                                                
      PPY          num.genes             Cell_ID        UNOS_ID   
 Min.   :1.000   Min.   :3529   10th_C1_S59  :  1   ACCG268 :136  
 1st Qu.:1.000   1st Qu.:5270   10th_C10_S104:  1   ACJV399 :108  
 Median :1.000   Median :6009   10th_C11_S96 :  1   ACEL337 :103  
 Mean   :1.028   Mean   :5981   10th_C13_S61 :  1   ACIW009 : 93  
 3rd Qu.:1.000   3rd Qu.:6676   10th_C14_S53 :  1   ACCR015A: 57  
 Max.   :2.000   Max.   :8451   10th_C16_S105:  1   ACIB065 : 57  
                                (Other)      :632   (Other) : 84  
      Age        Biomaterial_Provider    Gender              Phenotype  
 Min.   :22.00   IIDP      : 45       Female:303   Non-Diabetic   :380  
 1st Qu.:29.00   Prodo Labs:593       Male  :335   Type 2 Diabetic:258  
 Median :42.00                                                          
 Mean   :39.63                                                          
 3rd Qu.:53.00                                                          
 Max.   :56.00                                                          
                                                                        
               Race          BMI          Cell_Type     Patient_ID 
 African American:175   Min.   :22.00   INS    :264   P1     :136  
 Hispanic        :165   1st Qu.:26.60   GCG    :239   P8     :108  
 White           :298   Median :32.95   KRT19  : 28   P3     :103  
                        Mean   :32.85   SST    : 25   P7     : 93  
                        3rd Qu.:35.80   PRSS1  : 24   P5     : 57  
                        Max.   :55.00   none   : 21   P6     : 57  
                                        (Other): 37   (Other): 84  
 Sequencing_Run Batch       Coverage       Percent_Aligned 
 12t    : 57    B1:193   Min.   :1206135   Min.   :0.8416  
 4th    : 57    B2:148   1st Qu.:2431604   1st Qu.:0.8769  
 9th    : 57    B3:297   Median :3042800   Median :0.8898  
 10t    : 56             Mean   :3160508   Mean   :0.8933  
 7th    : 55             3rd Qu.:3871697   3rd Qu.:0.9067  
 3rd    : 53             Max.   :5931638   Max.   :0.9604  
 (Other):303                                               
 Mitochondrial_Fraction Num_Expressed_Genes
 Min.   :0.003873       Min.   :3529       
 1st Qu.:0.050238       1st Qu.:5270       
 Median :0.091907       Median :6009       
 Mean   :0.108467       Mean   :5981       
 3rd Qu.:0.142791       3rd Qu.:6676       
 Max.   :0.722345       Max.   :8451       
                                           
ContCoef(table(anns$Gender, anns$Cell_Type))
[1] 0.225969
ContCoef(table(anns$Phenotype, anns$Cell_Type))
[1] 0.1145096
ContCoef(table(anns$Race, anns$Cell_Type))
[1] 0.3084146
ContCoef(table(anns$Patient_ID, anns$Cell_Type))
[1] 0.5232058
ContCoef(table(anns$Batch, anns$Cell_Type))
[1] 0.3295619

Look at mean variance relationship

plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=color.vec[2])

Estimate zero inflated negative binomial parameters from the islet data

## Estimate the zero inflated negative binomial parameters
params = get_params(counts)

Generate a data set affected by two factors (known and unknown).

Here we compare unsupervised SVA, supervised SVA and IA-SVA

set.seed(1234) #5000
sample.size <- 50
num.genes <- 10000
prop.kfactor.genes <- 0.2    #known factor
prop.hfactor1.genes <- 0.1   #hidden factor1

num.kfactor.genes <- num.genes*prop.kfactor.genes
num.hfactor1.genes <- num.genes*prop.hfactor1.genes

factor.prop <- 0.5
kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=0.8)
hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip)

cor(cbind(kfactor,hfactor1))
           kfactor  hfactor1
kfactor  1.0000000 0.6821865
hfactor1 0.6821865 1.0000000
hfactor.mat <- cbind(hfactor1)
kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes))
nullindex= (num.kfactor.genes+1):num.genes

overlap.prop <- 0.99

#hfcoeffs1 = c(rep(0,num.kfactor.genes*(1-overlap.prop)),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-num.kfactor.genes*(1-overlap.prop)-num.hfactor1.genes))
hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop)-num.hfactor1.genes))

par(mfrow=c(2,1))
plot(kfcoeffs)
plot(hfcoeffs1)

par(mfrow=c(1,1))

coeffs = cbind(hfcoeffs1,kfcoeffs)
controls = (hfcoeffs1!=0)&(kfcoeffs==0)
mod = model.matrix(~-1 + hfactor1 + kfactor)

dat0 = create_read_numbers(params$mu,params$fit,
                                     params$p0,beta=coeffs,mod=mod)
sum(dat0==0)/length(dat0)
[1] 0.680858
filter = apply(dat0, 1, function(x) length(x[x>5])>=2)
dat0 = dat0[filter,]
sum(dat0==0)/length(dat0)
[1] 0.5555512
controls <- controls[filter]

dim(dat0)
[1] 7067   50
dim(mod)
[1] 50  2

Estimate hidden factor using IA-SVA

We also plot effect sizes of known and hidden factor and estimated weights (R-squared)

## Set null and alternative models
mod1 = model.matrix(~kfactor)
mod0 = cbind(mod1[,1])

### iasva 
hfactors_iasva.res <- iasva.tmp(t(dat0), kfactor, num.sv=1, permute=F)
IA-SVA running...
SV1 Detected!
# of significant surrogate variables: 1
hfactors_iasva <- hfactors_iasva.res$sv
cor(hfactor1,hfactors_iasva)
            SV1
[1,] -0.9871754
filter2 <- hfcoeffs1[filter]!=0

par(mfrow=c(3,1))
plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size", cex=1.5)
plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size")
plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared")

par(mfrow=c(1,1))


pdf("output/GeneOverlap.Fig1.pdf", width = 5, height=8)
par(mfrow=c(3,1))
plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size",cex.lab=1.5,cex.axis=1.5)
plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size",cex.lab=1.5,cex.axis=1.5)
plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared",cex.lab=1.5,cex.axis=1.5)
dev.off()
quartz_off_screen 
                2 
par(mfrow=c(1,1))

R-squared (Weights) as a function of hidden and known factors’ effect size

overlap.hfcoeffs1 <- hfcoeffs1[filter][filter2]
overlap.kfcoeffs <- kfcoeffs[filter][filter2]
overlap.R2 <- hfactors_iasva.res$wgt.mat[filter2]

summary(lm(overlap.R2~overlap.hfcoeffs1+overlap.kfcoeffs))

Call:
lm(formula = overlap.R2 ~ overlap.hfcoeffs1 + overlap.kfcoeffs)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.14720 -0.10585 -0.07321  0.02319  0.91561 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       0.117215   0.006611  17.730   <2e-16 ***
overlap.hfcoeffs1 0.012627   0.006434   1.963   0.0501 .  
overlap.kfcoeffs  0.001709   0.006849   0.250   0.8030    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1768 on 712 degrees of freedom
Multiple R-squared:  0.005446,  Adjusted R-squared:  0.002652 
F-statistic: 1.949 on 2 and 712 DF,  p-value: 0.1431
par(mfrow=c(2,1))
plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared")
plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared")

par(mfrow=c(1,1))

pdf("output/GeneOverlap.Fig2.pdf", width = 5, height=8)
par(mfrow=c(2,1))
plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared")
plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared")
dev.off()
quartz_off_screen 
                2 
par(mfrow=c(1,1))

Hidden factor estimation using existing methods

## Estimate batch with pca
ldat0 = log(dat0 + 1)
hfactors_pca = svd(ldat0 - rowMeans(ldat0))$v[,1]
cor(hfactor1,hfactors_pca)
[1] -0.8373417
## Estimate batch with svaseq (unsupervised)
hfactors_usva = svaseq(dat0,mod1,mod0, n.sv=1)$sv
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
cor(hfactor1,hfactors_usva)
           [,1]
[1,] -0.9720361
## Estimate batch with svaseq (supervised)
hfactors_ssva = svaseq(dat0,mod1,mod0,controls=controls, n.sv=1)$sv
sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
cor(hfactor1,hfactors_ssva)
           [,1]
[1,] 0.02989441

Plot hidden factor estimates of IA-SVA and existing methods

## Plot the results
par(mfrow=c(5,1))
plot(hfactor1,col=color.vec[1],pch=19,main="Hidden Factor1")
plot(hfactors_iasva,pch=19,col=color.vec[2],main="IA-SVA")
plot(hfactors_pca,pch=19, col=color.vec[3],main="PCA")
plot(hfactors_usva,pch=19,col=color.vec[4],main="USVA")
plot(hfactors_ssva,pch=19,col=color.vec[5],main="SSVA")

par(mfrow=c(1,1))


Multiple simulations where hidden and known factors affect overlapping sets of genes

set.seed(4000)
sample.size <- 50
num.genes <- 10000
prop.kfactor.genes <- 0.2    #known factor
prop.hfactor1.genes <- 0.1   #hidden factor1

num.kfactor.genes <- num.genes*prop.kfactor.genes
num.hfactor1.genes <- num.genes*prop.hfactor1.genes

factor.prop <- 0.5
kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=0.9)
hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip)

cor(cbind(kfactor,hfactor1))
         kfactor hfactor1
kfactor     1.00     0.76
hfactor1    0.76     1.00
overlap.prop.vec <- c(0.99,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0)

data.list <- list()
controls.list <- list()

hfactor.mat <- cbind(hfactor1)
kfbetas <- rnorm(num.kfactor.genes)
hfbetas <- rnorm(num.hfactor1.genes)  

kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes))
nullindex= (num.kfactor.genes+1):num.genes

pdf("output/gene_overlap.pdf", width = 10, height=5)
for(i in 1:length(overlap.prop.vec)){

  hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i]),hfbetas,rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i])-num.hfactor1.genes))
  
  coeffs = cbind(hfcoeffs1,kfcoeffs)
  controls = (hfcoeffs1!=0)&(kfcoeffs==0)
  mod = model.matrix(~-1 + hfactor1 + kfactor)

  dat0 = create_read_numbers(params$mu,params$fit,
                                       params$p0,beta=coeffs,mod=mod)
  sum(dat0==0)/length(dat0)
  filter = apply(dat0, 1, function(x) length(x[x>5])>=2)
  dat0 = dat0[filter,]
  sum(dat0==0)/length(dat0)
  controls.list[[i]] <- controls[filter]

  #dim(dat0)
  #dim(mod)
  data.list[[i]] <- dat0
  par(mfrow=c(2,1))
  print(plot(kfcoeffs))
  print(plot(hfcoeffs1))
}
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
dev.off()
quartz_off_screen 
                2 

Estimation of hidden factors using IA-SVA and existing methods

## Set null and alternative models
mod1 = model.matrix(~kfactor)
mod0 = cbind(mod1[,1])
hfactors_iasva_list <- list()
hfactors_pca_list <- list()
hfactors_usva_list <- list()
hfactors_ssva_list <- list()
for(i in 1:length(overlap.prop.vec)){
  dat0 <- data.list[[i]]
  ### iasva 
  summ_exp <- SummarizedExperiment(assays = dat0)
  hfactors_iasva_list[[i]] <- iasva(summ_exp, as.matrix(kfactor), num.sv=1, permute = F)$sv
  ## Estimate batch with pca
  ldat0 = log(dat0 + 1)
  hfactors_pca_list[[i]] = svd(ldat0 - rowMeans(ldat0))$v[,1]
  ## Estimate batch with svaseq (unsupervised)
  hfactors_usva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1)$sv[,1]
  ## Estimate batch with svaseq (supervised)
  hfactors_ssva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1,controls=controls.list[[i]])$sv[,1]
  cat(i,"\n")
}
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
1 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
2 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
3 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
4 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
5 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
6 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
7 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
8 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
9 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
10 
IA-SVA running...

SV 1 Detected!

# of significant surrogate variables: 1
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  sva warning: controls provided so supervised sva is being performed.
Number of significant surrogate variables is:  1 
11 

Compare the accuracy

iasva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_iasva_list))))
pca.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_pca_list))))
usva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_usva_list))))
ssva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_ssva_list))))

cor.df <- data.frame(overlap.pct = 100*overlap.prop.vec,
                     IASVA = iasva.cor.vec,
                     PCA = pca.cor.vec,
                     USVA = usva.cor.vec,
                     SSVA = ssva.cor.vec)

melt.cor.df <- melt(cor.df, id.var=c("overlap.pct"))

p <- ggplot(melt.cor.df, aes(x=overlap.pct, y=value, group=variable))
p <- p + geom_line(aes(col=variable), size=1)
p <- p + geom_point(aes(col=variable), size=3)
p <- p + xlab("Gene Set Overlap Percentage")
p <- p + ylab("Absolute Correlation Coefficient")
#p <- p + scale_color_manual(values=color.vec)
p <- p + scale_color_brewer(palette = "Spectral", name="Method")
p <- p + theme_bw()
p <- p + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
               #panel.grid.minor = element_blank(), 
               axis.line = element_line(colour = "black"))
p

ggsave("output/absCor_gene_overlap_pct.pdf", width=8, height=5)

Session information

sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] SummarizedExperiment_1.10.1 DelayedArray_0.6.1         
 [3] matrixStats_0.53.1          Biobase_2.40.0             
 [5] GenomicRanges_1.32.3        GenomeInfoDb_1.16.0        
 [7] IRanges_2.14.10             S4Vectors_0.18.3           
 [9] BiocGenerics_0.26.0         ggplot2_2.2.1.9000         
[11] reshape2_1.4.3              RColorBrewer_1.1-2         
[13] DescTools_0.99.24           corrplot_0.84              
[15] polyester_1.16.0            sva_3.28.0                 
[17] BiocParallel_1.14.2         genefilter_1.62.0          
[19] mgcv_1.8-23                 nlme_3.1-137               
[21] iasvaExamples_1.0.0         iasva_0.99.3               
[23] irlba_2.3.2                 Matrix_1.2-14              

loaded via a namespace (and not attached):
 [1] bitops_1.0-6           bit64_0.9-7            rprojroot_1.3-2       
 [4] tools_3.5.0            backports_1.1.2        R6_2.2.2              
 [7] DBI_1.0.0              lazyeval_0.2.1         colorspace_1.3-2      
[10] withr_2.1.2            tidyselect_0.2.4       bit_1.1-14            
[13] compiler_3.5.0         git2r_0.21.0           expm_0.999-2          
[16] labeling_0.3           scales_0.5.0           mvtnorm_1.0-8         
[19] stringr_1.3.1          digest_0.6.15          foreign_0.8-70        
[22] rmarkdown_1.9          R.utils_2.6.0          XVector_0.20.0        
[25] pkgconfig_2.0.1        htmltools_0.3.6        manipulate_1.0.1      
[28] limma_3.36.2           rlang_0.2.1            RSQLite_2.1.1         
[31] bindr_0.1.1            dplyr_0.7.5            R.oo_1.22.0           
[34] RCurl_1.95-4.10        magrittr_1.5           GenomeInfoDbData_1.1.0
[37] Rcpp_0.12.17           munsell_0.4.3          R.methodsS3_1.7.1     
[40] stringi_1.2.2          whisker_0.3-2          logspline_2.1.11      
[43] yaml_2.1.19            MASS_7.3-50            zlibbioc_1.26.0       
[46] plyr_1.8.4             grid_3.5.0             blob_1.1.1            
[49] lattice_0.20-35        Biostrings_2.48.0      splines_3.5.0         
[52] annotate_1.58.0        knitr_1.20             pillar_1.2.3          
[55] boot_1.3-20            XML_3.98-1.11          glue_1.2.0            
[58] evaluate_0.10.1        gtable_0.2.0           purrr_0.2.5           
[61] assertthat_0.2.0       xtable_1.8-2           survival_2.42-3       
[64] tibble_1.4.2           AnnotationDbi_1.42.1   memoise_1.1.0         
[67] workflowr_1.0.1        bindrcpp_0.2.2         cluster_2.0.7-1       

This reproducible R Markdown analysis was created with workflowr 1.0.1