Modern high-throughput datasets. Here we’ll analyze a 10x Chromium dataset using pagoda2.

library(pagoda2)

10x count matrix is normally split into three files. Let’s write a utility function to load it:

# set names euqal to the values
sn <- function(x) { names(x) <- x; return(x); }
# load 10x matrices from a named list of result folders
t.load.10x.data <- function(matrixPaths,n.cores=1) {
  require(parallel)
  require(Matrix)
  mclapply(sn(names(matrixPaths)),function(nam) {
    matrixPath <- matrixPaths[nam];
    # read all count files (*_unique.counts) under a given path
    #cat("loading data from ",matrixPath, " ");
    x <- as(readMM(paste(matrixPath,'matrix.mtx',sep='/')),'dgCMatrix'); # convert to the required sparse matrix representation
    cat(".")
    gs <- read.delim(paste(matrixPath,'genes.tsv',sep='/'),header=F)
    rownames(x) <- gs[,2]
    cat(".")
    gs <- read.delim(paste(matrixPath,'barcodes.tsv',sep='/'),header=F)
    colnames(x) <- gs[,1]
    cat(".")
    colnames(x) <- paste(nam,colnames(x),sep='_');
    x
  },mc.cores=n.cores)
}
n.cores <- 30;

Load (10x PBMC data)[https://support.10xgenomics.com/single-cell-gene-expression/datasets/pbmc8k]:

data.path <- "~/workshop_materials/transcriptomics/10x_pbmc8k"
data.path <- "/d0-mendel/home/pkharchenko/p2/walkthrough/pbmc8k/raw_gene_bc_matrices/GRCh38"
cd <- t.load.10x.data(list(PBMC8K=data.path))
Loading required package: parallel
Loading required package: Matrix
...
str(cd)
List of 1
 $ PBMC8K:Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  .. ..@ i       : int [1:17171166] 1614 21625 110 153 231 484 548 550 633 673 ...
  .. ..@ p       : int [1:737281] 0 2 2 2 116 120 120 125 126 126 ...
  .. ..@ Dim     : int [1:2] 33694 737280
  .. ..@ Dimnames:List of 2
  .. .. ..$ : chr [1:33694] "RP11-34P13.3" "FAM138A" "OR4F5" "RP11-34P13.7" ...
  .. .. ..$ : chr [1:737280] "PBMC8K_AAACCTGAGAAACCAT-1" "PBMC8K_AAACCTGAGAAACCGC-1" "PBMC8K_AAACCTGAGAAACCTA-1" "PBMC8K_AAACCTGAGAAACGAG-1" ...
  .. ..@ x       : num [1:17171166] 1 1 1 1 1 2 1 1 1 1 ...
  .. ..@ factors : list()

Look at the summary counts

cd <- cd[[1]]
par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0)
hist(log10(colSums(cd)+1),main='reads per cell',col='wheat')
hist(log10(rowSums(cd)+1),main='reads per gene',col='wheat')

Define a quick cell filtering function based on cell depth vs. number of genes relationship:

# filter cells based on the gene/molecule dependency
t.filter.for.valid.cells <- function(countMatrix,min.cell.size=500, max.cell.size=5e4,p.level=min(1e-3,1/ncol(countMatrix)),alpha=0.1,do.par=T) {
  if(do.par) { par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);}
  hist(log10(colSums(countMatrix)),col='wheat',xlab='log10[ molecules ]',main='') 
  # some of the cells are very large .. those can skew the analysis of more subtle populations (too much bias) .. letting them in here though
  
  abline(v=log10(c(min.cell.size,max.cell.size)),lty=2,col=2)
  # look at the number of genes vs. molecule size depenency
  df <- data.frame(molecules=colSums(countMatrix),genes=colSums(countMatrix>0)); 
  df <- df[df$molecules>=min.cell.size,];
  df <- log10(df);
  df <- df[order(df$molecules,decreasing=F),]
  plot(df,col=adjustcolor(1,alpha=alpha),cex=0.5,ylab='log10[ gene counts]',xlab='log10[ molecule counts]')
  abline(v=log10(c(min.cell.size,max.cell.size)),lty=2,col=2)
  #abline(lm(genes ~ molecules, data=df),col=4)
  require(MASS)  
  m <- rlm(genes~molecules,data=df)
  suppressWarnings(pb <- data.frame(predict(m,interval='prediction',level = 1-p.level,type="response")))
  polygon(c(df$molecules,rev(df$molecules)),c(pb$lwr,rev(pb$upr)),col=adjustcolor(2,alpha=0.1),border = NA)
  outliers <- rownames(df)[df$genes > pb$upr | df$genes < pb$lwr];
  points(df[outliers,],col=2,cex=0.6)
  # set of filtered cells to move forward with  
  valid.cells <- colSums(countMatrix)>min.cell.size & colSums(countMatrix)<max.cell.size & !(colnames(countMatrix) %in% outliers)
  countMatrix[,valid.cells,drop=F]
}

Run the filtering procedure:

counts <- t.filter.for.valid.cells(cd,min.cell.size=500)

str(counts)
Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int [1:11965872] 33 45 59 153 335 375 408 440 484 495 ...
  ..@ p       : int [1:8787] 0 871 1677 2993 3891 5417 6912 8165 9598 11230 ...
  ..@ Dim     : int [1:2] 33694 8786
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:33694] "RP11-34P13.3" "FAM138A" "OR4F5" "RP11-34P13.7" ...
  .. ..$ : chr [1:8786] "PBMC8K_AAACCTGAGCATCATC-1" "PBMC8K_AAACCTGAGCTAACTC-1" "PBMC8K_AAACCTGAGCTAGTGG-1" "PBMC8K_AAACCTGCACATTAGC-1" ...
  ..@ x       : num [1:11965872] 1 1 1 5 1 1 1 1 18 1 ...
  ..@ factors : list()

We can also looka at the number of molecules per gene, and omit low-expressed genes to save computational time:

hist(log10(rowSums(counts)+1),main='Molecules per gene',xlab='molecules (log10)',col='wheat')
abline(v=1,lty=2,col=2)

Now we have a clean/lean count matrix and are ready to start analysis. First we’ll create pagoda2 object that will maintain all of the results. It will also provide handles for running all operations on the data.

r <- Pagoda2$new(counts,log.scale=FALSE)
Error in setCountMatrix(x, min.cells.per.gene = min.cells.per.gene, trim = trim,  : 
  duplicate gene names are not allowed - please reduce

Yes, we need to make gene names unique:

rownames(counts) <- make.unique(rownames(counts))
r <- Pagoda2$new(counts,log.scale=FALSE)
8786 cells, 13004 genes; normalizing ... using plain model winsorizing ... done.

Next, we’ll adjust the variance, to normalize the extent to which genes with (very) different expression magnitudes will contribute to the downstream anlaysis:

r$adjustVariance(plot=T,gam.k=10)
calculating variance fit ... using gam 
Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-18. For overview type 'help("mgcv-package")'.
743 overdispersed genes ... 743 persisting ... done.

There are many alternative ways of proceeding with the downstream analysis. Below we’ll use the simplest, default scenario, where we first reduce the dataset dimensions by running PCA, and then move into k-nearest neighbor graph space for clustering and visualization calculations. First, the PCA reduction:

r$calculatePcaReduction(nPcs=100,n.odgenes=3e3,maxit = 200)
Loading required package: irlba

Clustering, visualization and many other procedures can take advantage of a cell kNN graph. Let’s calculate it.

Loading required package: igraph

Attaching package: ‘igraph’

The following objects are masked from ‘package:stats’:

    decompose, spectrum

The following object is masked from ‘package:base’:

    union

2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: dummy distance type: INT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: dummy distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: dummy distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: bit_hamming distance type: INT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: leven distance type: INT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: normleven distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivfast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivfast distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivfastrq distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivfastrq distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenfast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenfast distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenslow distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenslow distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenfastrq distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: kldivgenfastrq distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: itakurasaitofast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: itakurasaitofast distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivslow distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivslow distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivfast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivfast distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivfastapprox distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsdivfastapprox distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrslow distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrslow distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrfast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrfast distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrfastapprox distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: jsmetrfastapprox distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: word_embed distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: word_embed distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: lp distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: lp distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: linf distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: linf distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l1 distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l1 distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l2 distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l2 distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: cosinesimil distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: cosinesimil distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: angulardist distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: angulardist distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: lp_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: lp_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: linf_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: linf_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l1_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l1_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l2_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: l2_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: cosinesimil_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: cosinesimil_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: angulardist_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: angulardist_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: negdotprod_sparse distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: querynorm_negdotprod_sparse distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: cosinesimil_sparse_fast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: angulardist_sparse_fast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: negdotprod_sparse_fast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: querynorm_negdotprod_sparse_fast distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: savch distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_heuristic_func distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_heuristic_func distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_minus_func distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_minus_func distance type: DOUBLE
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_gaussian_func distance type: FLOAT
2017-09-14 12:48:34 spacefactory.h:44 (Register) [INFO] Registering at the factory, space: sqfd_gaussian_func distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: dummy distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: dummy distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: dummy distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: bbtree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: bbtree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: ghtree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: ghtree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: ghtree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: list_clusters distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: list_clusters distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: list_clusters distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: lsh_cauchy distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: lsh_gaussian distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: lsh_threshold distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mplsh distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: lsh_multiprobe distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mvptree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mvptree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mvptree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_vptree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_vptree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_vptree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_incsort_bin distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_incsort_bin distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_incsort_bin distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_incsort distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_incsort distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_bin_incsort distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_lsh_bin distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_lsh_bin distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_lsh_bin distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_inv_indx distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_inv_indx distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_inv_indx distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mi-file distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mi-file distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mi-file distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_prefix distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_prefix distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: perm_prefix distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pp-index distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pp-index distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pp-index distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pivot_neighb_invindx distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pivot_neighb_invindx distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: pivot_neighb_invindx distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: napp distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: napp distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: napp distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: omedrank distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: omedrank distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: omedrank distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_vptree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_vptree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_vptree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_incsort distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_incsort distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: proj_incsort distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: brute_force distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: brute_force distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: brute_force distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: seq_search distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: seq_search distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: seq_search distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: small_world_rand distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: small_world_rand distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: small_world_rand distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: hnsw distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: hnsw distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: hnsw distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph-split distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph-split distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: sw-graph-split distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nndes distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nndes distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nndes distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: satree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: satree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: satree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: vptree distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: vptree distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: vptree distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mult_index distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mult_index distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: mult_index distance type: INT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nonmetr_list_clust distance type: FLOAT
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nonmetr_list_clust distance type: DOUBLE
2017-09-14 12:48:34 methodfactory.h:49 (Register) [INFO] Registering at the factory, method: nonmetr_list_clust distance type: INT
reading points ... done (8786 points)
building the index ... 
2017-09-14 12:48:34 hnsw.cc:160 (CreateIndex) [INFO] M                   = 20
2017-09-14 12:48:34 hnsw.cc:161 (CreateIndex) [INFO] indexThreadQty      = 30
2017-09-14 12:48:34 hnsw.cc:162 (CreateIndex) [INFO] efConstruction      = 100
2017-09-14 12:48:34 hnsw.cc:163 (CreateIndex) [INFO] maxM                     = 20
2017-09-14 12:48:34 hnsw.cc:164 (CreateIndex) [INFO] maxM0                    = 40
2017-09-14 12:48:34 hnsw.cc:166 (CreateIndex) [INFO] mult                = 0.333808
2017-09-14 12:48:34 hnsw.cc:167 (CreateIndex) [INFO] skip_optimized_index= 0
2017-09-14 12:48:34 hnsw.cc:168 (CreateIndex) [INFO] delaunay_type       = 2
2017-09-14 12:48:34 hnsw.cc:434 (SetQueryTimeParams) [INFO] Set HNSW query-time parameters:
2017-09-14 12:48:34 hnsw.cc:435 (SetQueryTimeParams) [INFO] ef(Search)         =20
2017-09-14 12:48:34 hnsw.cc:436 (SetQueryTimeParams) [INFO] algoType           =2

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************2017-09-14 12:48:34 hnsw.cc:320 (CreateIndex) [INFO] 
The vectorspace is Cosine Similarity
2017-09-14 12:48:34 hnsw.cc:322 (CreateIndex) [INFO] Vector length=100
2017-09-14 12:48:34 hnsw.cc:325 (CreateIndex) [INFO] Thus using an optimised function for base 4
2017-09-14 12:48:34 hnsw.cc:347 (CreateIndex) [INFO] searchMethod             = 4
2017-09-14 12:48:34 hnsw.cc:358 (CreateIndex) [INFO] Making optimized index
2017-09-14 12:48:34 hnsw.cc:402 (CreateIndex) [INFO] Finished making optimized index
2017-09-14 12:48:34 hnsw.cc:403 (CreateIndex) [INFO] Maximum level = 2
2017-09-14 12:48:34 hnsw.cc:404 (CreateIndex) [INFO] Total memory allocated for optimized index+data: 4 Mb
2017-09-14 12:48:34 hnsw.cc:434 (SetQueryTimeParams) [INFO] Set HNSW query-time parameters:
2017-09-14 12:48:34 hnsw.cc:435 (SetQueryTimeParams) [INFO] ef(Search)         =100
2017-09-14 12:48:34 hnsw.cc:436 (SetQueryTimeParams) [INFO] algoType           =2

done (0.109416s)
running queries with k=40 ... 
0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

done (0.575964s)
creating report data frame ... done

Determine clusters using “multilevel.community” algorithm.

r$getKnnClusters(method=multilevel.community,type='PCA')

Determine a largeVis embedding:

M <- 30; r$getEmbedding(type = 'PCA',embeddingType = 'largeVis', M = M,  perplexity = 30,  gamma = 1 / M,  alpha = 1)
Loading required package: largeVis
largeVis was compiled with 32-bit types. This will limit the size of the datasets it can process. Consider recompiling with -DARMA_64BIT_WORD
Estimating embeddings.
0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
**************************************************|

Now we can visualize the embedding using the determined clusters:

r$plotEmbedding(type='PCA',show.legend=F,mark.clusters=T,min.group.size=50,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (lV)')

r$getEmbedding(type='PCA',embeddingType='tSNE',perplexity=50,verbose=F,n.cores=n.cores)
Loading required package: Rtsne
calculating distance ... pearson ... done
running tSNE using 30 cores:
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')

Or load precalculated tSNE

tSNE <- readRDS("tSNE.rds")
r$embeddings$PCA$tSNE <- tSNE;

We can use the same plotEmbedding() function to show all kinds of other values. For instance, let’s look at depth, or an expresson pattern of a gene:

str(r$depth)
 Named num [1:8786] 2376 1659 4517 2779 4656 ...
 - attr(*, "names")= chr [1:8786] "PBMC8K_AAACCTGAGCATCATC-1" "PBMC8K_AAACCTGAGCTAACTC-1" "PBMC8K_AAACCTGAGCTAGTGG-1" "PBMC8K_AAACCTGCACATTAGC-1" ...
par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$depth,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='depth')
treating colors as a gradient with zlim: 1440 8903.75 

Or expression of a given gene:

par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')
gene <-"LYZ"
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$counts[,gene],shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main=gene)
treating colors as a gradient with zlim: 0 16.79779 

We can generate multiple potential clusterings, with different names. Here we’ll use multilevel clustering:

r$getKnnClusters(method=infomap.community,type='PCA',name='infomap')
str(r$clusters)
List of 1
 $ PCA:List of 2
  ..$ community: Factor w/ 12 levels "1","2","3","4",..: 9 10 6 2 10 10 12 2 10 2 ...
  .. ..- attr(*, "names")= chr [1:8786] "PBMC8K_AAACCTGAGCATCATC-1" "PBMC8K_AAACCTGAGCTAACTC-1" "PBMC8K_AAACCTGAGCTAGTGG-1" "PBMC8K_AAACCTGCACATTAGC-1" ...
  ..$ infomap  : Factor w/ 25 levels "1","2","3","4",..: 4 6 5 2 6 3 15 2 24 2 ...
  .. ..- attr(*, "names")= chr [1:8786] "PBMC8K_AAACCTGAGCATCATC-1" "PBMC8K_AAACCTGAGCTAACTC-1" "PBMC8K_AAACCTGAGCTAGTGG-1" "PBMC8K_AAACCTGCACATTAGC-1" ...

Compare with infomap:

par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=r$clusters$PCA$community,show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='multilevel clusters (tSNE)')
using provided groups as a factor
r$plotEmbedding(type='PCA',embeddingType='tSNE',clusterType='infomap',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='infomap clusters (tSNE)')

You can try other cluster methods (e.g. walktrap.community), or change k on the gene kNN graph.

Run differential expression on the infomap clusters:

r$getDifferentialGenes(type='PCA',verbose=T,clusterType='community')
running differential expression with  12  clusters ... adjusting p-values ... done.

Visualize top genes:

names(r$diffgenes)
[1] "PCA"
de <- r$diffgenes$PCA[[1]][['4']];
r$plotGeneHeatmap(genes=rownames(de)[1:15],groups=r$clusters$PCA[[1]])

Spot-check a gene:

gene <-"IGHM"
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$counts[,gene],shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main=gene)
treating colors as a gradient with zlim: 0 1.940716 

Pathway overdispersion analysis (a la PAGODA1)

First, build GO->gene environment:

suppressMessages(library(org.Hs.eg.db))
# translate gene names to ids
ids <- unlist(lapply(mget(colnames(r$counts),org.Hs.egALIAS2EG,ifnotfound=NA),function(x) x[1]))
# reverse map
rids <- names(ids); names(rids) <- ids;
# list all the ids per GO category
go.env <- list2env(eapply(org.Hs.egGO2ALLEGS,function(x) as.character(na.omit(rids[x]))))

Now run overdispersion anlaysis

r$testPathwayOverdispersion(go.env,verbose=T,correlation.distance.threshold=0.95,recalculate.pca=F,top.aspects=15)

We’ll use hierarchical differential expression results instead:

r$getHierarchicalDiffExpressionAspects(type='PCA',clusterName='community',z.threshold=3)
using community  clustering for PCA space

We’ll make an app with that, ordering the “differential expression aspects” explicitly (otherwise if row clustering is omitted they’ll be clustered by similarity)

app <- p2.make.pagoda1.app(r,inner.clustering=TRUE,embeddingType='tSNE',clusterType='community',min.group.size=50,row.clustering=list(order=rev(1:nrow(r$misc$pathwayOD$xv))))

Show app:

show.app(app,'pbmc',browse=T)
---
title: "Heterogeneity analysis III"
output: html_notebook
---

Modern high-throughput datasets. Here we'll analyze a 10x Chromium dataset using pagoda2.

```{r}
library(pagoda2)
```


10x count matrix is normally split into three files. Let's write a utility function to load it:
```{r}
# set names euqal to the values
sn <- function(x) { names(x) <- x; return(x); }

# load 10x matrices from a named list of result folders
t.load.10x.data <- function(matrixPaths,n.cores=1) {
  require(parallel)
  require(Matrix)
  mclapply(sn(names(matrixPaths)),function(nam) {
    matrixPath <- matrixPaths[nam];
    # read all count files (*_unique.counts) under a given path
    #cat("loading data from ",matrixPath, " ");
    x <- as(readMM(paste(matrixPath,'matrix.mtx',sep='/')),'dgCMatrix'); # convert to the required sparse matrix representation
    cat(".")
    gs <- read.delim(paste(matrixPath,'genes.tsv',sep='/'),header=F)
    rownames(x) <- gs[,2]
    cat(".")
    gs <- read.delim(paste(matrixPath,'barcodes.tsv',sep='/'),header=F)
    colnames(x) <- gs[,1]
    cat(".")
    colnames(x) <- paste(nam,colnames(x),sep='_');
    x
  },mc.cores=n.cores)
}
n.cores <- 30;
```


Load (10x PBMC data)[https://support.10xgenomics.com/single-cell-gene-expression/datasets/pbmc8k]:
```{r}
data.path <- "~/workshop_materials/transcriptomics/10x_pbmc8k"
#data.path <- "/d0-mendel/home/pkharchenko/p2/walkthrough/pbmc8k/raw_gene_bc_matrices/GRCh38"
cd <- t.load.10x.data(list(PBMC8K=data.path))
```
```{r}
str(cd)

```


Look at the summary counts
```{r}
cd <- cd[[1]]
par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0)
hist(log10(colSums(cd)+1),main='reads per cell',col='wheat')
hist(log10(rowSums(cd)+1),main='reads per gene',col='wheat')
```



Define a quick cell filtering function based on cell depth vs. number of genes relationship:
```{r}
# filter cells based on the gene/molecule dependency
t.filter.for.valid.cells <- function(countMatrix,min.cell.size=500, max.cell.size=5e4,p.level=min(1e-3,1/ncol(countMatrix)),alpha=0.1,do.par=T) {
  if(do.par) { par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);}
  hist(log10(colSums(countMatrix)),col='wheat',xlab='log10[ molecules ]',main='') 
  # some of the cells are very large .. those can skew the analysis of more subtle populations (too much bias) .. letting them in here though
  
  abline(v=log10(c(min.cell.size,max.cell.size)),lty=2,col=2)
  # look at the number of genes vs. molecule size depenency
  df <- data.frame(molecules=colSums(countMatrix),genes=colSums(countMatrix>0)); 
  df <- df[df$molecules>=min.cell.size,];
  df <- log10(df);
  df <- df[order(df$molecules,decreasing=F),]
  plot(df,col=adjustcolor(1,alpha=alpha),cex=0.5,ylab='log10[ gene counts]',xlab='log10[ molecule counts]')
  abline(v=log10(c(min.cell.size,max.cell.size)),lty=2,col=2)
  #abline(lm(genes ~ molecules, data=df),col=4)
  require(MASS)  
  m <- rlm(genes~molecules,data=df)
  suppressWarnings(pb <- data.frame(predict(m,interval='prediction',level = 1-p.level,type="response")))
  polygon(c(df$molecules,rev(df$molecules)),c(pb$lwr,rev(pb$upr)),col=adjustcolor(2,alpha=0.1),border = NA)
  outliers <- rownames(df)[df$genes > pb$upr | df$genes < pb$lwr];
  points(df[outliers,],col=2,cex=0.6)
  # set of filtered cells to move forward with  
  valid.cells <- colSums(countMatrix)>min.cell.size & colSums(countMatrix)<max.cell.size & !(colnames(countMatrix) %in% outliers)
  countMatrix[,valid.cells,drop=F]
}

```


Run the filtering procedure:
```{r}
counts <- t.filter.for.valid.cells(cd,min.cell.size=500)
```


```{r}
str(counts)
```

We can also looka at the number of molecules per gene, and omit low-expressed genes to save computational time:

```{r}
hist(log10(rowSums(counts)+1),main='Molecules per gene',xlab='molecules (log10)',col='wheat')
abline(v=1,lty=2,col=2)
```

Now we have a clean/lean count matrix and are ready to start analysis. First we’ll create pagoda2 object that will maintain all of the results. It will also provide handles for running all operations on the data.

```{r error=TRUE}
r <- Pagoda2$new(counts,log.scale=FALSE)
```


Yes, we need to make gene names unique:
```{r}
rownames(counts) <- make.unique(rownames(counts))
r <- Pagoda2$new(counts,log.scale=FALSE)
```

Next, we’ll adjust the variance, to normalize the extent to which genes with (very) different expression magnitudes will contribute to the downstream anlaysis:

```{r}
r$adjustVariance(plot=T,gam.k=10)
```

There are many alternative ways of proceeding with the downstream analysis. Below we’ll use the simplest, default scenario, where we first reduce the dataset dimensions by running PCA, and then move into k-nearest neighbor graph space for clustering and visualization calculations. First, the PCA reduction:
```{r}
r$calculatePcaReduction(nPcs=100,n.odgenes=3e3,maxit = 200)
```


Clustering, visualization and many other procedures can take advantage of a cell kNN graph. Let's calculate it.
```{r echo=FALSE}
r$makeKnnGraph(k=40,type='PCA',center=T,distance='cosine')
```


Determine clusters using "multilevel.community" algorithm.
```{r}
r$getKnnClusters(method=multilevel.community,type='PCA')
```


Determine a largeVis embedding:
```{r}
M <- 30; r$getEmbedding(type = 'PCA',embeddingType = 'largeVis', M = M,  perplexity = 30,  gamma = 1 / M,  alpha = 1)
```
Now we can visualize the embedding using the determined clusters:

```{r}
r$plotEmbedding(type='PCA',show.legend=F,mark.clusters=T,min.group.size=50,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (lV)')
```

```{r}
r$getEmbedding(type='PCA',embeddingType='tSNE',perplexity=50,verbose=F,n.cores=n.cores)
```
```{r}
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')
```

Or load precalculated tSNE
```{r eval=F}
tSNE <- readRDS("tSNE.rds")
r$embeddings$PCA$tSNE <- tSNE;
```



We can use the same plotEmbedding() function to show all kinds of other values. For instance, let’s look at depth, or an expresson pattern of a gene:

```{r}
str(r$depth)
```
```{r}
par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$depth,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='depth')
```
Or expression of a given gene:
```{r}
par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='clusters (tSNE)')

gene <-"LYZ"
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$counts[,gene],shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main=gene)
```

We can generate multiple potential clusterings, with different names. Here we’ll use multilevel clustering:
```{r}
r$getKnnClusters(method=infomap.community,type='PCA',name='infomap')
str(r$clusters)
```

Compare with infomap:
```{r}
par(mfrow=c(1,2))
r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=r$clusters$PCA$community,show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='multilevel clusters (tSNE)')
r$plotEmbedding(type='PCA',embeddingType='tSNE',clusterType='infomap',show.legend=F,mark.clusters=T,min.group.size=1,shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main='infomap clusters (tSNE)')

```

You can try other cluster methods (e.g. walktrap.community), or change k on the gene kNN graph.


Run differential expression on the infomap clusters:
```{r}
r$getDifferentialGenes(type='PCA',verbose=T,clusterType='community')
```
Visualize top genes:
```{r}
names(r$diffgenes)
```

```{r}
de <- r$diffgenes$PCA[[1]][['4']];
r$plotGeneHeatmap(genes=rownames(de)[1:15],groups=r$clusters$PCA[[1]])
```

Spot-check a gene:
```{r}
gene <-"IGHM"
r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=r$counts[,gene],shuffle.colors=F,mark.cluster.cex=1,alpha=0.1,main=gene)
```

Pathway overdispersion analysis (a la PAGODA1)

First, build GO->gene environment:
```{r eval=F}
suppressMessages(library(org.Hs.eg.db))
# translate gene names to ids
ids <- unlist(lapply(mget(colnames(r$counts),org.Hs.egALIAS2EG,ifnotfound=NA),function(x) x[1]))
# reverse map
rids <- names(ids); names(rids) <- ids;
# list all the ids per GO category
go.env <- list2env(eapply(org.Hs.egGO2ALLEGS,function(x) as.character(na.omit(rids[x]))))
```

Now run overdispersion anlaysis
```{r eval=F}
r$testPathwayOverdispersion(go.env,verbose=T,correlation.distance.threshold=0.95,recalculate.pca=F,top.aspects=15)
```


We'll use hierarchical differential expression results instead:
```{r}
r$getHierarchicalDiffExpressionAspects(type='PCA',clusterName='community',z.threshold=3)
```

We'll make an app with that, ordering the "differential expression aspects" explicitly (otherwise if row clustering is omitted they'll be clustered by similarity)
```{r}
app <- p2.make.pagoda1.app(r,inner.clustering=TRUE,embeddingType='tSNE',clusterType='community',min.group.size=50,row.clustering=list(order=rev(1:nrow(r$misc$pathwayOD$xv))))
```

Show app:
```{r eval=F}
show.app(app,'pbmc',browse=T)
```

