Here we’ll use a recent dataset on chromaffin cell differentiation to look at different visualization and pseudotime techniques.
We generally start with count matrices. These are obtained from alignment results and protocol-specific demultiplexing routines. Chromaffin dataset was measured using SMART-seq2 protocol, and the reads for each cell are then reported in a separate bam file. We use featureCounts to extract the read counts per gene:
for name in ``find ./ -name \"*.bam\" -print``; do sem -j 50 featureCounts -p -a mm10.genes.gtf -o ${name%.bam}.counts $name; done;
data.path <- '~/workshop_materials/transcriptomics/chromaffin';
counts.dir <- paste(data.path,'counts',sep='/')
str(list.files(path=counts.dir))
chr [1:384] "A10_unique.counts" "A11_unique.counts" "A12_unique.counts" "A13_unique.counts" "A14_unique.counts" ...
We’ll define a quick function to read them in in parallel:
t.read.in.counts <- function(path,n.cores) {
require(parallel)
fs <- list.files(path=path,pattern=".*_unique.counts$",recursive=T)[1:6]
names(fs) <- gsub("_unique.counts","",fs)
names(fs) <- gsub(".+?/","",names(fs))
dat <- do.call(cbind,mclapply(fs,function(file) { df <- read.delim(paste(path,file,sep="/"),header=F,stringsAsFactors=F,skip=2); x <- df[,7]; names(x) <- df[,1]; x;},mc.cores=n.cores))
colnames(dat) <- paste(path,colnames(dat),sep=".");
dat
}
dat <- t.read.in.counts(counts.dir,2)
Loading required package: parallel
str(dat)
int [1:23420, 1:6] 0 0 0 43 26 5 0 18 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
..$ : chr [1:6] "~/workshop_materials/transcriptomics/chromaffin/counts.A10" "~/workshop_materials/transcriptomics/chromaffin/counts.A11" "~/workshop_materials/transcriptomics/chromaffin/counts.A12" "~/workshop_materials/transcriptomics/chromaffin/counts.A13" ...
colnames(dat) <- gsub("~/workshop_materials/transcriptomics/chromaffin/counts.","",colnames(dat))
str(dat)
int [1:23420, 1:6] 0 0 0 43 26 5 0 18 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:23420] "Xkr4" "Rp1" "Sox17" "Mrpl15" ...
..$ : chr [1:6] "A10" "A11" "A12" "A13" ...
rm(dat);
The data was the processed using scde/PAGODA:
source("pagoda.helpers.r")
library(Cairo); library(parallel)
n.cores <- 30;
min.cell.genes <- 3e3; min.cell.reads <- 1e5; min.gene.reads <- 10; min.gene.cells <- 5; min.nonfailed <- 8;
n.groups <- 10; trim <- 3;
res <- t.process.dataset(cd,nam)
res <- c(res,t.postprocess(res,name="E12_5_c",port = NULL,perplexity = 10,include.aspects = TRUE,distance.threshold = 0.9, n.clusters = 5, top.aspects = 7,return.details=T))
But let’s read in results of the processing:
suppressMessages(require(scde))
res <- readRDS(paste(data.path,"processed.rds",sep='/'))
names(res)
[1] "cd" "knn" "prior" "varinfo" "pwpca" "clpca"
[7] "batch" "app" "tam" "hc" "cl" "clcol"
[13] "colcols" "tamr" "emb" "tamr2" "cols" "ct"
[19] "ctcol" "mat" "matw" "vc" "igw" "wgm"
[25] "wgwm" "ag" "bpvt" "btvg" "gene.df" "background.genes"
show.app(res$app,name='chromaffin',browse=F)
Loading required package: Rook
[1] "app loaded at: http://127.0.0.1:8787/custom/chromaffin/index.html"
Let’s recapitulate the t-SNE embedding plot:
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(res$emb,col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
Let’s try different visualizations of the same dataset.
# get an FPM matrix as log10(FPM+1)
#fpm <- log10(exp(scde.expression.magnitude(res$knn, res$cd))+1)
lib.size <- colSums(res$cd);
fpm <- log10(t(t(res$cd)/lib.size)+1)
Run PCA:
suppressMessages(require(pcaMethods))
base.pca <- pca(t(fpm),nPcs = 2)
And visualize:
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(scores(base.pca),col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
Let’s use PAGODA library size estimation instead:
# get an FPM matrix as log10(FPM+1)
fpm <- log10(exp(scde.expression.magnitude(res$knn, res$cd))+1)
base.pca <- pca(t(fpm),nPcs = 2)
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(scores(base.pca),col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
We can run PCA on just a set of overdispersed genes to make things faster:
od.genes <- names(which(res$varinfo$arv>1.2))
length(od.genes)
[1] 2412
od.pca <- pca(t(fpm[od.genes,]),nPcs = 2)
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(scores(od.pca),col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
By default, tSNE will measure cell similarity using Euclidean distance in PCA space:
suppressMessages(require(Rtsne))
tsne <- Rtsne(t(fpm[od.genes,]), perplexity=50, verbose = FALSE, num_threads=2)
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(tsne$Y,col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
Another good alternative is correlation distance (on od genes, or on PCs):
d <- as.dist(1-cor(fpm[od.genes,]))
tsne2 <- Rtsne(d, is_distance = TRUE, perplexity=50, verbose = F, num_threads=2)
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(tsne2$Y,col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
We can see what increasing or decreasing perplexity does:
tsne3 <- Rtsne(d, is_distance = TRUE, perplexity=3, verbose = F, num_threads=2)
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(tsne3$Y,col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
Calculate diffusion map using overdispersed genes:
require("destiny")
Loading required package: destiny
dm <- DiffusionMap(t(fpm[od.genes,]), verbose = FALSE,k=15,sigma=100)
visualize:
par(mfrow=c(1,1), mar = c(0.5,0.5,0.5,0.5), mgp = c(2,0.65,0), cex = 0.9);
plot(dm@eigenvectors[,c(1,2)],col=1,bg=adjustcolor(res$cols,alpha=0.6),cex=2,pch=21,panel.first=grid(),axes=F); box();
Monocle aims to reconstruct a tree and determine embedding that best represents that tree. First we have to represent a dataset in a different way:
require("monocle")
Loading required package: monocle
Loading required package: Matrix
Loading required package: ggplot2
Loading required package: VGAM
Loading required package: stats4
Loading required package: splines
Loading required package: DDRTree
Loading required package: irlba
fpm <- res$cd[,colnames(res$mat)];
pd <- as.data.frame(cbind(res$cols ));
rownames(pd) <- colnames(fpm); colnames(pd) <- c("color")
pd = new("AnnotatedDataFrame", data = pd )
fd <- data.frame(ifelse(rownames(fpm) %in% od.genes,1,0),rownames(fpm));
rownames(fd) <- rownames(fpm); colnames(fd) <- c("od","gene_short_name")
fd = new("AnnotatedDataFrame", data = fd)
fpm1 <- newCellDataSet(fpm, phenoData = pd, featureData = fd)
fpm1 <- estimateSizeFactors(fpm1)
fpm1 <- estimateDispersions(fpm1)
Deprecated, use tibble::rownames_to_column() instead.Removing 368 outliers
fpm1 <- setOrderingFilter(fpm1,od.genes)
mncl <- reduceDimension(fpm1)
suppressWarnings(mncl <- orderCells(mncl))
Visualize the resulting tree
plot_cell_trajectory(mncl)
Check correspondence with pre-defined clusters:
plot(mncl@reducedDimS[1,],mncl@reducedDimS[2,],col=as.character(mncl@phenoData@data$col))