library(pagoda2)
Loading required package: Matrix
Loading required package: igraph
package ‘igraph’ was built under R version 3.5.3
Attaching package: ‘igraph’
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(ggplot2)
library(Matrix)
library(magrittr)
Read data
read.velo <- function(path,prefix='') {
t <- readr:::read_delim(paste(path,"matrix.mtx",sep='/'),delim=' ',skip=3,col_names=F)
f <- read.delim(paste(path,'features.tsv',sep='/'),header=F,stringsAsFactors=F)
b <- read.delim(paste(path,'barcodes.tsv',sep='/'),header=F,stringsAsFactors=F)
dims <- c(nrow(f),nrow(b))
spliced <- Matrix::sparseMatrix(i=t$X1,j=t$X2,x=t$X3,dims=dims)
unspliced <- Matrix::sparseMatrix(i=t$X1,j=t$X2,x=t$X4,dims=dims)
colnames(spliced) <- colnames(unspliced) <- paste0(prefix,b[,1]);
rownames(spliced) <- rownames(unspliced) <- make.unique(f[,2]);
list(spliced=spliced,unspliced=unspliced)
}
m1 <- read.velo("r8/Velocyto/raw",prefix='r8')
Parsed with column specification:
cols(
X1 = [32mcol_double()[39m,
X2 = [32mcol_double()[39m,
X3 = [32mcol_double()[39m,
X4 = [32mcol_double()[39m,
X5 = [32mcol_double()[39m
)
#m2 <- read.velo("r9/Velocyto/raw",prefix='r9')
#cd <- cbind(m1$spliced,m2$spliced)
#batch.f <- setNames(ifelse(grepl("r8",colnames(cd)),'r8','r9'),colnames(cd)) %>% as.factor
Initial processing - to select the relevant populations
r <- basicP2proc(m1$spliced,n.cores=20,nPcs=50,make.geneknn=F,n.odgenes=5e3,get.tsne = F, get.largevis = F)
5747 cells, 31053 genes; normalizing ... using plain model winsorizing ... log scale ... done.
calculating variance fit ... using gam 637 overdispersed genes ... 637persisting ... done.
running PCA using 5000 OD genes .... done
creating space of type angular done
adding data ... done
building index ... done
querying ... done
r$getKnnClusters(type='PCA',method=function(x) conos::leiden.community(x,r=1),name='leiden')
set.seed(1)
r$getEmbedding(type='PCA',embeddingType='tSNE',perplexity=70)
calculating distance ... pearson ...running tSNE using 20 cores:
r$plotEmbedding(type = 'PCA', embedding = 'tSNE', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
set.seed(1)
r$getEmbedding(type='PCA',embeddingType='UMAP',n_neighbors=100,min_dist=0.4,spread=1,n.cores=30)
14:30:37 UMAP embedding parameters a = 0.7669 b = 1.223
14:30:37 Read 5747 rows and found 50 numeric columns
14:30:37 Using Annoy for neighbor search, n_neighbors = 100
14:30:37 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:30:38 Writing NN index file to temp file /tmp/Rtmpaayt91/filed2515364e31f
14:30:38 Searching Annoy index using 30 threads, search_k = 10000
14:30:39 Annoy recall = 100%
14:30:40 Commencing smooth kNN distance calibration using 30 threads
14:30:42 Initializing from normalized Laplacian + noise
14:30:43 Commencing optimization for 500 epochs, with 838026 positive edges using 30 threads
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:30:47 Optimization finished
r$plotEmbedding(type = 'PCA', embedding = 'UMAP', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
gl <- c('Mki67', # cc
# progenitors
'Fos',
'Sox2',
# neroblasts
'Prc1',
'Sstr2',
'Penk',
#rgc
'Isl1',
'Pou4f2',
#photoreceptors
'Crx',
# amacrine/horizontal
'Lhx1',
'Onecut2'
)
lapply(gl,function(gn) {
r$plotEmbedding(type = 'PCA', embedding = 'UMAP', colors=r$counts[,gn],alpha=0.5,cex=0.2,main=gn)
})
treating colors as a gradient with zlim: 0 0.5648841
treating colors as a gradient with zlim: 0 1.861343
treating colors as a gradient with zlim: 0 0.3296107
treating colors as a gradient with zlim: 0 0.7482324
treating colors as a gradient with zlim: 0 1.465659
treating colors as a gradient with zlim: 0 1.383004
treating colors as a gradient with zlim: 0 1.243379
treating colors as a gradient with zlim: 0 0.1277146
treating colors as a gradient with zlim: 0 1.62815
treating colors as a gradient with zlim: 0 0.5593899
treating colors as a gradient with zlim: 0 0.6832949
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
vic <- names(r$clusters$PCA$leiden)[!r$clusters$PCA$leiden %in% c(13,1,3,7,9,6)]
r2 <- basicP2proc(m1$spliced[,colnames(m1$spliced) %in% vic],n.cores=20,nPcs=30,make.geneknn=F,get.tsne = F, get.largevis = F)
2797 cells, 31053 genes; normalizing ... using plain model winsorizing ... log scale ... done.
calculating variance fit ... using gam 392 overdispersed genes ... 392persisting ... done.
running PCA using 3000 OD genes .... done
creating space of type angular done
adding data ... done
building index ... done
querying ... done
r2$getKnnClusters(type='PCA',method=function(x) conos::leiden.community(x,r=1.5),name='leiden')
r2$getEmbedding(type='PCA',embeddingType='tSNE',perplexity=50)
calculating distance ... pearson ...running tSNE using 20 cores:
r2$plotEmbedding(type = 'PCA', embedding = 'tSNE', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
r2$plotEmbedding(type = 'PCA', embedding = 'tSNE', groups=r$clusters$PCA$leiden,alpha=0.5,cex=0.2,mark.clusters = T,mark.cluster.cex = 1)
using provided groups as a factor
set.seed(1)
r2$getEmbedding(type='PCA',embeddingType='UMAP',n_neighbors=100,min_dist=0.4,spread=1,n.cores=30)
14:33:37 UMAP embedding parameters a = 0.7669 b = 1.223
14:33:37 Read 2797 rows and found 30 numeric columns
14:33:37 Using Annoy for neighbor search, n_neighbors = 100
14:33:37 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:33:38 Writing NN index file to temp file /tmp/Rtmpaayt91/filed25111e51eef
14:33:38 Searching Annoy index using 30 threads, search_k = 10000
14:33:38 Annoy recall = 100%
14:33:39 Commencing smooth kNN distance calibration using 30 threads
14:33:41 Initializing from normalized Laplacian + noise
14:33:41 Commencing optimization for 500 epochs, with 324128 positive edges using 30 threads
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:33:45 Optimization finished
r2$getKnnClusters(type='PCA',method=function(x) conos::leiden.community(x,r=5),name='leiden2')
r2$plotEmbedding(type = 'PCA', embedding = 'UMAP', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
r2$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=r$clusters$PCA$leiden, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
r2$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=r2$clusters$PCA$leiden2, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
r2$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=ann, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
r2$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=batch.f, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
Remove mature cluster 15
vic2 <- names(r2$clusters$PCA$leiden)[!r2$clusters$PCA$leiden2 %in% c(26,28)]
r3 <- basicP2proc(m1$spliced[,colnames(m1$spliced) %in% vic2],n.cores=20,nPcs=20,make.geneknn=F,get.tsne = F, get.largevis = F)
2726 cells, 31053 genes; normalizing ... using plain model winsorizing ... log scale ... done.
calculating variance fit ... using gam 323 overdispersed genes ... 323persisting ... done.
running PCA using 3000 OD genes .... done
creating space of type angular done
adding data ... done
building index ... done
querying ... done
set.seed(0)
r3$getKnnClusters(type='PCA',method=function(x) conos::leiden.community(x,r=1.5),name='leiden')
set.seed(8) # 5/100/0.4
emb <- r3$getEmbedding(type='PCA',embeddingType='UMAP',n_neighbors=100,min_dist=0.4,spread=1,n.cores=1)
18:26:17 UMAP embedding parameters a = 0.7669 b = 1.223
18:26:17 Read 2726 rows and found 20 numeric columns
18:26:17 Using Annoy for neighbor search, n_neighbors = 100
18:26:17 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
18:26:18 Writing NN index file to temp file /tmp/Rtmpaayt91/filed2512ac48430
18:26:18 Searching Annoy index using 1 thread, search_k = 10000
18:26:20 Annoy recall = 100%
18:26:21 Commencing smooth kNN distance calibration using 1 thread
18:26:23 Initializing from normalized Laplacian + noise
18:26:23 Commencing optimization for 500 epochs, with 285834 positive edges using 1 thread
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
18:26:32 Optimization finished
emb[,1] <- -1*emb[,1]; emb[,2] <- -1*emb[,2]
r3$embeddings$PCA$UMAP <- emb
embs <- mclapply(1:30,function(x) { set.seed(x); r3$getEmbedding(type='PCA',embeddingType='UMAP',n_neighbors=100,min_dist=0.4,spread=1,n.cores=1)},mc.cores=30)
lapply(1:length(embs),function(i){
emb <- embs[[i]]
#emb[,1] <- -1*emb[,1]; emb[,2] <- -1*emb[,2]
r3$embeddings$PCA$UMAP <- emb;
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1,main=i)})
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', clusterType = 'leiden', mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=r$clusters$PCA$leiden, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', groups=ann, mark.clusters = T,alpha=0.2,cex=0.3,mark.cluster.cex = 1)
using provided groups as a factor
gn <- 'Mki67'; r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', colors=r3$counts[,gn],alpha=0.5,cex=0.2,main=gn)
treating colors as a gradient with zlim: 0 0.6875693
Clean plots
require(entropy)
entropy.scores <- apply(r3$misc$rawCounts,1,entropy)
# with cell depth subsampling
# subsample columns of a sparse matrix to approximately a desired depth
subsample.cell.depth <- function(m,depth=1e3) {
p.sample <- pmin(1,rep(depth/Matrix::colSums(m),diff(m@p)))
m@x <- as.numeric(rbinom(length(m@x),m@x,p.sample))
m
}
entropy.scores.sub <- apply(subsample.cell.depth(t(r3$misc$rawCounts)),2,entropy)
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', colors=pagoda2:::val2col(entropy.scores.sub,zlim=c(6.1,6.2)),alpha=0.5,cex=0.2)
using supplied colors as is
x <- entropy.scores.sub
x <- x-mean(x)
r3$plotEmbedding(type = 'PCA', embedding = 'UMAP', colors=x,alpha=0.2,cex=0.5,gradient.range.quantile=0.7)
treating colors as a gradient with zlim: -0.1487894 0.1487894
p2c.plot <- function(emb, ...) {
theme <- theme_void() + theme(panel.border = element_rect(fill=NA,color = 1, size=0.2,linetype=1),axis.line=element_blank(),plot.margin = margin(.1, .1, .1, .1, "cm"))
conos::embeddingPlot(emb, plot.theme=theme, ...)
}
emb <- r3$embeddings$PCA$UMAP
p2c.plot(emb,groups=r3$clusters$PCA$leiden, alpha=0.2,size=1)
Make an annotation
fann <- setNames(rep('Progenitor',length(r3$clusters$PCA$leiden)),names(r3$clusters$PCA$leiden))
fann[r3$clusters$PCA$leiden %in% c('10','9')] <- 'Neuroblast'
fann[r3$clusters$PCA$leiden %in% c('2','1')] <- 'RGC'
fann[r3$clusters$PCA$leiden %in% c('14','6','8')] <- 'AC/HC'
fann[r3$clusters$PCA$leiden %in% c('12','11')] <- 'PR'
fann <- as.factor(fann)
set.seed(0)
fann.pal <- setNames(sample(rainbow(length(levels(fann)),s=0.8,v=0.8)),levels(fann))
p <- p2c.plot(emb,groups=fann, alpha=0.5,size=0.02,palette=fann.pal,font.size=c(4,5),raster.width=3,raster.height=3,raster=T)
pdf(file='ret_ann.pdf',width=3,height=3); print(p); dev.off();
null device
1
p
Export for scanpy:
save(raw.count.matrix.merged,metadata.df,embedding.df,file=paste0(output.path,'/data.RData'))
Warning message:
package ‘igraph’ was built under R version 3.5.3
Cell cycle genes
gns <- c('Mcm6','Ccne2',"Esco2","Cdk1","Aurka","Cenpa")
pp <- lapply(gns,function(g) p2c.plot(emb,colors=r3$counts[,g],alpha=0.2,size=0.5,gradient.range.quantile=0.9,raster.width=2,raster.height=2,raster=T)+annotate('text',x=Inf,y=Inf,hjust=1.05,vjust=1.1,label=g,size=9))
#pp <- lapply(gns,function(g) p2c.plot(emb,color=r3$counts[g,],gradient.range.quantile=0.9)+annotate('text',x=Inf,y=Inf,vjust=1,hjust=0,label=g))
p <- cowplot::plot_grid(plotlist=pp,nrow=1)
pdf(file='ret_cc_genes.pdf',width=15,height=2.5); print(p); dev.off();
png
2
p
pdf(file='ret_cc_genes2.pdf',width=5.2,height=1); print(p); dev.off();
null device
1
p2c.plot2 <- function(emb, ...) {
theme <- theme_void() + theme(axis.line=element_blank(),plot.margin = margin(.1, .1, .1, .1, "cm"))
conos::embeddingPlot(emb, plot.theme=theme, ...)
}
gns <- c('Mcm6','Ccne2',"Esco2","Cdk1","Aurka","Cenpa")
pp <- lapply(gns,function(g) p2c.plot2(emb,colors=r3$counts[,g],alpha=0.2,size=0.5,gradient.range.quantile=0.9,raster.width=1,raster.height=1,raster=T)+
xlim(-6,3) + ylim(-9,4) +
annotate('text',x=-Inf,y=Inf,hjust=0,vjust=1.1,label=g,size=6))
#pp <- lapply(gns,function(g) p2c.plot(emb,color=r3$counts[g,],gradient.range.quantile=0.9)+annotate('text',x=Inf,y=Inf,vjust=1,hjust=0,label=g))
p <- cowplot::plot_grid(plotlist=pp,nrow=1)
Removed 1220 rows containing missing values (geom_point_rast).Removed 1220 rows containing missing values (geom_point_rast).Removed 1220 rows containing missing values (geom_point_rast).Removed 1220 rows containing missing values (geom_point_rast).Removed 1220 rows containing missing values (geom_point_rast).Removed 1220 rows containing missing values (geom_point_rast).
pdf(file='ret_cc_genes3.pdf',width=5.2,height=1); print(p); dev.off();
png
2
p
p2c.plot(emb,colors=r3$counts[,'Cdk1'],alpha=0.2,size=0.5,gradient.range.quantile=0.9,raster.width=1,raster.height=1,raster=T) + xlim(-6,3) + ylim(-9,4)
Reduced version:
gns <- c('Mcm6','Ccne2',"Esco2","Cdk1","Aurka","Cenpa")
pp <- lapply(gns,function(g) p2c.plot(emb,colors=r3$counts[,g],alpha=0.2,size=0.5,gradient.range.quantile=0.9,raster.width=2,raster.height=2,raster=T)+annotate('text',x=Inf,y=Inf,hjust=1.05,vjust=1.1,label=g,size=9))
#pp <- lapply(gns,function(g) p2c.plot(emb,color=r3$counts[g,],gradient.range.quantile=0.9)+annotate('text',x=Inf,y=Inf,vjust=1,hjust=0,label=g))
p <- cowplot::plot_grid(plotlist=pp,nrow=1)
pdf(file='ret_cc_genes.pdf',width=15,height=2.5); print(p); dev.off();
p
Marker genes:
gl <- c(#'Mki67', # cc
# progenitors
'Fos',
#'Sox2',
# neroblasts
#'Prc1',
'Sstr2',
'Penk',
#rgc
'Isl1',
'Pou4f2',
#photoreceptors
'Crx',
# amacrine/horizontal
'Lhx1',
'Onecut2'
)
pp <- lapply(gl,function(g) p2c.plot(emb,colors=r3$counts[,g],alpha=0.2,size=0.5,gradient.range.quantile=0.9,raster.width=2,raster.height=2,raster=T)+annotate('text',x=Inf,y=Inf,hjust=1.05,vjust=1.1,label=g,size=9))
p <- cowplot::plot_grid(plotlist=pp,nrow=2)
p
pdf(file='ret_genes.pdf',width=10,height=5); print(p); dev.off();
set.seed(2)
x <- entropy.scores.sub
x <- x-mean(x)
a1 <- p2c.plot(emb,colors=x[rownames(emb)],alpha=0.2,size=0.5,gradient.range.quantile=0.8,raster.width=2,raster.height=2,raster=T) + theme(legend.position=c(0.75,0.8)) +
guides(color = guide_colourbar(barwidth = 6, barheight = 0.8, direction="horizontal", frame.colour=c("black"),title.position='top',ticks=F,label.theme=element_text(size=12))) +
theme(legend.title=element_text(size=15))+
scale_color_gradient(name='Entropy',low='darkgreen',high='darkorange',limits=c(-0.1,0.1),oob=scales::squish,breaks=c(-0.08,0.08),labels=c('low','high'))
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
pdf(file='ret_entropy.pdf',width=3,height=3); print(a1); dev.off()
null device
1
a1
p <- p2c.plot(emb,groups=fann, alpha=0.5,size=0.5,palette=fann.pal,font.size=c(3,4),raster.width=3,raster.height=3,raster=T)
pdf(file='ret_ann2.pdf',width=1.5,height=1.5); print(p); dev.off();
p
source("functions_copy.R")
in 2 dimensions
z2 <- t.ppt.tree(X=t(emb),emb=emb,M=200,lambda=100,sigma=0.02,metrics='euclidean',plot=F,output=F)
a1 <- p <- p2c.plot(emb,groups=fann, alpha=0.2,size=0.5,palette=fann.pal,font.size=c(4,5),raster.width=3,raster.height=3,raster=T,mark.groups=F)+
t.ggplot.ppt.tree(z2,emb,size=0.65,col=adjustcolor(1,alpha=0.9))
pdf(file='ret_ann_tree.pdf',width=1.5,height=1.5); print(a1); dev.off();
null device
1
a1
High-dimensional tree
hde <- r3$getEmbedding(type='PCA',name='hd',embeddingType = 'UMAP',n_neighbors=100,min_dist=0.4,spread=1,n_components = 20)
z20 <- t.ppt.tree(X=t(hde),emb=emb,M=200,lambda=100,sigma=0.02,metrics='euclidean',plot=F,output=F)
zb <- t.bootstrap.ppt(X=t(hde),emb=emb, M=200,err.cut=1e-3,n.steps=100,plot=F,output=F,lambda=200,sigma=0.01, seed=9,metrics='euclidean',n.cores=30,n.samples=30)
a1 <- p2c.plot(emb,groups=fann, alpha=0.2,size=0.5,palette=fann.pal,font.size=c(4,5),raster.width=3,raster.height=3,raster=T,mark.groups=F)
for(i in 1:30) a1 <- a1+t.ggplot.ppt.tree(zb[[i]],emb,size=0.5,col=adjustcolor(1,alpha=0.05))
pdf(file='ret_ann_trees.pdf',width=1.5,height=1.5); print(a1); dev.off();
null device
1
a1