Here we’ll use a recent dataset on chromaffin cell differentiation to look at different visualization and pseudotime techniques.

Loading data

We generally start with count matrices. These are obtained from alignment results and protocol-specific demultiplexing routines. Chromaffin dataset was measured using SMART-seq2 protocol, and the reads for each cell are then reported in a separate bam file. We use featureCounts to extract the read counts per gene:

for name in ``find ./ -name \"*.bam\" -print``; do sem -j 50 featureCounts -p -a mm10.genes.gtf -o ${name%.bam}.counts $name; done;

data.path <- '~/workshop_materials/transcriptomics/chromaffin';
counts.dir <- paste(data.path,'counts',sep='/')
str(list.files(path=counts.dir))
 chr [1:384] "A10_unique.counts" "A11_unique.counts" "A12_unique.counts" "A13_unique.counts" "A14_unique.counts" ...

We’ll define a quick function to read them in in parallel:

t.read.in.counts <- function(path,n.cores) {
  require(parallel)
  fs <- list.files(path=path,pattern=".*_unique.counts$",recursive=T)[1:6]
  names(fs) <- gsub("_unique.counts","",fs)
  names(fs) <- gsub(".+?/","",names(fs))
  dat <- do.call(cbind,mclapply(fs,function(file) { df <- read.delim(paste(path,file,sep="/"),header=F,stringsAsFactors=F,skip=2); x <- df[,7]; names(x) <- df[,1]; x;},mc.cores=n.cores))
  colnames(dat) <- paste(path,colnames(dat),sep=".");
  dat
}
dat <- t.read.in.counts(counts.dir,2)
Loading required package: parallel
str(dat)
 int [1:23420, 1:6] 0 0 0 43 26 5 0 18 0 0 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
  ..$ : chr [1:6] "~/workshop_materials/transcriptomics/chromaffin/counts.A10" "~/workshop_materials/transcriptomics/chromaffin/counts.A11" "~/workshop_materials/transcriptomics/chromaffin/counts.A12" "~/workshop_materials/transcriptomics/chromaffin/counts.A13" ...
colnames(dat) <- gsub("~/workshop_materials/transcriptomics/chromaffin/counts.","",colnames(dat))
str(dat)
 int [1:23420, 1:6] 0 0 0 43 26 5 0 18 0 0 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
  ..$ : chr [1:6] "A10" "A11" "A12" "A13" ...
rm(dat);

The data was the processed using scde/PAGODA:

source("pagoda.helpers.r")
library(Cairo); library(parallel)
n.cores <- 30;
min.cell.genes <- 3e3; min.cell.reads <- 1e5; min.gene.reads <- 10; min.gene.cells <- 5; min.nonfailed <- 8;
n.groups <- 10; trim <- 3;
res <- t.process.dataset(cd,nam)
res <- c(res,t.postprocess(res,name="E12_5_c",port = NULL,perplexity = 10,include.aspects = TRUE,distance.threshold = 0.9, n.clusters = 5, top.aspects = 7,return.details=T))

But let’s read in results of the processing:

suppressMessages(require(scde))
res <- readRDS(paste(data.path,"processed.rds",sep='/'))
names(res)
 [1] "cd"               "knn"              "prior"            "varinfo"          "pwpca"            "clpca"           
 [7] "batch"            "app"              "tam"              "hc"               "cl"               "clcol"           
[13] "colcols"          "tamr"             "emb"              "tamr2"            "cols"             "ct"              
[19] "ctcol"            "mat"              "matw"             "vc"               "igw"              "wgm"             
[25] "wgwm"             "ag"               "bpvt"             "btvg"             "gene.df"          "background.genes"
show.app(res$app,name='chromaffin',browse=F)
Loading required package: Rook
[1] "app loaded at: http://127.0.0.1:8787/custom/chromaffin/index.html"

Let’s recapitulate the t-SNE embedding plot:

par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(res$emb,col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();

Let’s try different visualizations of the same dataset.

PCA

# get an FPM matrix as log10(FPM+1)
#fpm <- log10(exp(scde.expression.magnitude(res$knn, res$cd))+1)
lib.size <- colSums(res$cd);
fpm <- log10(t(t(res$cd)/lib.size)+1)

Run PCA:

suppressMessages(require(pcaMethods))
base.pca <- pca(t(fpm),nPcs = 2)

And visualize:

par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(scores(base.pca),col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();