Skip to content
Snippets Groups Projects
Commit d74f670b authored by passt's avatar passt
Browse files

Initial commit

parent f083acfa
No related branches found
No related tags found
No related merge requests found
Showing
with 865 additions and 0 deletions
......@@ -31,3 +31,4 @@ vignettes/*.pdf
# Temporary files created by R markdown
*.utf8.md
*.knit.md
.Rproj.user
# function hexbin::grid.hexagons - modified in line 50 to discount NA values
grid.hexagons2 <- function (dat, style = c("colorscale", "centroids", "lattice",
"nested.lattice", "nested.centroids", "constant.col"), use.count = TRUE,
cell.at = NULL, minarea = 0.05, maxarea = 0.8, check.erosion = TRUE,
mincnt = 1, maxcnt = max(dat@count), trans = NULL, colorcut = seq(0,
1, length = 22), density = NULL, border = NULL, pen = NULL,
colramp = function(n) {
LinGray(n, beg = 90, end = 15)
}, def.unit = "native", verbose = getOption("verbose"))
{
if (!is(dat, "hexbin"))
stop("first argument must be a hexbin object")
style <- match.arg(style)
if (minarea <= 0)
stop("hexagons cannot have a zero area, change minarea")
if (maxarea > 1)
warning("maxarea > 1, hexagons may overplot")
if (use.count) {
cnt <- dat@count
}
else {
cnt <- cell.at
if (is.null(cnt)) {
if (is.null(dat@cAtt))
stop("Cell attribute cAtt is null")
else cnt <- dat@cAtt
}
}
xbins <- dat@xbins
shape <- dat@shape
tmp <- hcell2xy(dat, check.erosion = check.erosion)
good <- mincnt <= cnt & cnt <= maxcnt
xnew <- tmp$x[good]
ynew <- tmp$y[good]
cnt <- cnt[good]
sx <- xbins/diff(dat@xbnds)
sy <- (xbins * shape)/diff(dat@ybnds)
switch(style, centroids = , lattice = , constant.col = ,
colorscale = {
if (is.null(trans)) {
if (min(cnt, na.rm = TRUE) < 0) {
pcnt <- cnt + min(cnt)
rcnt <- {
if (maxcnt == mincnt) rep.int(1, length(cnt)) else (pcnt -
mincnt)/(maxcnt - mincnt)
}
} else rcnt <- {
# MODIFICATION: remove NA values
if (maxcnt == mincnt) rep.int(1, length(cnt)) else (cnt - min(cnt, na.rm=T)) / (max(cnt, na.rm=T) - min(cnt, na.rm=T))
}
} else {
rcnt <- (trans(cnt) - trans(mincnt))/(trans(maxcnt) -
trans(mincnt))
if (any(is.na(rcnt))) stop("bad count transformation")
}
area <- minarea + rcnt * (maxarea - minarea)
}, nested.lattice = , nested.centroids = {
diffarea <- maxarea - minarea
step <- 10^floor(log10(cnt))
f <- (cnt/step - 1)/9
area <- minarea + f * diffarea
area <- pmax(area, minarea)
})
area <- pmin(area, maxarea)
radius <- sqrt(area)
switch(style, centroids = , constant.col = , lattice = {
if (length(pen) != length(cnt)) {
if (is.null(pen)) pen <- rep.int(1, length(cnt)) else if (length(pen) ==
1) pen <- rep.int(pen, length(cnt)) else stop("'pen' has wrong length")
}
}, nested.lattice = , nested.centroids = {
if (!is.null(pen) && length(dim(pen)) == 2) {
dp <- dim(pen)
lgMcnt <- ceiling(log10(max(cnt)))
if (dp[1] != length(cnt) && dp[1] != lgMcnt) {
stop("pen is not of right dimension")
}
if (dp[1] == lgMcnt) {
ind <- ceiling(log10(dat@count))
ind[ind == 0] <- 1
pen <- pen[ind, ]
}
} else {
pen <- floor(log10(cnt)) + 2
pen <- cbind(pen, pen + 10)
}
}, colorscale = {
nc <- length(colorcut)
if (colorcut[1] > colorcut[nc]) {
colorcut[1] <- colorcut[1] + 1e-06
colorcut[nc] <- colorcut[nc] - 1e-06
} else {
colorcut[1] <- colorcut[1] - 1e-06
colorcut[nc] <- colorcut[nc] + 1e-06
}
colgrp <- cut(rcnt, colorcut, labels = FALSE)
if (any(is.na(colgrp))) colgrp <- ifelse(is.na(colgrp),
0, colgrp)
clrs <- colramp(length(colorcut) - 1)
pen <- clrs[colgrp]
})
inner <- 0.5
outer <- (2 * inner)/sqrt(3)
dx <- inner/sx
dy <- outer/(2 * sy)
rad <- sqrt(dx^2 + dy^2)
hexC <- hexcoords(dx, dy, sep = NULL)
switch(style, constant.col = , colorscale = {
hexpolygon(xnew, ynew, hexC, density = density, fill = pen,
border = if (!is.null(border)) border else pen)
return(invisible(paste("done", sQuote(style))))
}, nested.lattice = , nested.centroids = {
hexpolygon(xnew, ynew, hexC, density = density, fill = if (is.null(border) ||
border) 1 else pen[, 1], border = pen[, 1])
})
if (style == "centroids" || style == "nested.centroids") {
xcm <- dat@xcm[good]
ycm <- dat@ycm[good]
k <- sqrt(3)/2
cosx <- c(1, k, 0.5, 0, -0.5, -k, -1, -k, -0.5, 0, 0.5,
k, 1)/sx
siny <- c(0, 0.5, k, 1, k, 0.5, 0, -0.5, -k, -1, -k,
-0.5, 0)/sy
dx <- sx * (xcm - xnew)
dy <- sy * (ycm - ynew)
dlen <- sqrt(dx^2 + dy^2)
cost <- ifelse(dlen > 0, dx/dlen, 0)
tk <- (6 * acos(cost))/pi
tk <- round(ifelse(dy < 0, 12 - tk, tk)) + 1
hrad <- ifelse(tk%%2 == 1, inner, outer)
fr <- pmin(hrad * (1 - radius), dlen)
xnew <- xnew + fr * cosx[tk]
ynew <- ynew + fr * siny[tk]
}
n <- length(radius)
if (verbose)
cat("length = ", length(pen), "\n", "pen = ", pen + 1,
"\n")
n6 <- rep.int(6:6, n)
pltx <- rep.int(hexC$x, n) * rep.int(radius, n6) + rep.int(xnew,
n6)
plty <- rep.int(hexC$y, n) * rep.int(radius, n6) + rep.int(ynew,
n6)
switch(style, centroids = , lattice = {
grid.polygon(pltx, plty, default.units = def.unit, id = NULL,
id.lengths = n6, gp = gpar(fill = pen, col = border))
}, nested.lattice = , nested.centroids = {
grid.polygon(pltx, plty, default.units = def.unit, id = NULL,
id.lengths = n6, gp = gpar(fill = pen[, 2], col = if (!is.null(border)) border else pen[,
2]))
})
}
\ No newline at end of file
---
title: "Eigen-Networks"
author: "PS STUMPF"
date: "22/01/2019"
output: html_document
knit: (function(input_file, encoding) {
out_dir <- 'docs';
rmarkdown::render(input_file,
encoding=encoding,
output_file=file.path(dirname(input_file), out_dir, 'index.html'))})
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, cache=TRUE, auto.dep=TRUE, echo=TRUE, tidy=FALSE, width.cutoff=50, fig.width = 4, fig.asp = 1)
options(digits=6)
suppressPackageStartupMessages({
library(marray)
library(mclust)
library(flowCore)
library(flowViz)
library(beanplot)
})
# specific hexbin plot function
source('./hexpress.R')
```
This markdown file contains source code related to Stumpf & MacArthur *Machine learning of stem cell identities from single-cell expression data via regulatory network archetypes*. Frontiers in Genetics 10:2 (2019) https://doi.org/10.3389/fgene.2019.00002
Please cite this paper if you re-use any of the code below.
## Load and transform CyTOF data
Load regulatory network (manually curated using the ChIP-Seq database ChIPBasev2.0 and KEGG).
```{r load_adjacency_matrices}
IRN <- read.table('./RegulatoryNetwork', header=T, sep="\t", stringsAsFactors = FALSE)
colix <- unique(c(IRN$Source.Index, IRN$Target.Index))
```
Load CyTOF data from Zunder et al. (2015) Cell Stem Cell doi: 10.1016/j.stem.2015.01.015. (cytobank accession: 43324).
Data contains 47 features (including 6 condition barcodes, some DNA markers etc.).
```{r load_expression_data, results='hide', warning=FALSE}
mESC.NG <- read.flowSet(path='./cytobank43324/', pattern='NG')
```
Perform logicle transformation on data.
```{r transform_data}
# create transformation function with parameters
lgcl <- logicleTransform(w = 0.6, t = 10000, m = 4.5)
# create transformList to apply to non-linear features
trans <- transformList(parameters(mESC.NG[[1]])$name[9:46], lgcl)
# apply transformation
mESC.NG <- transform(mESC.NG, trans)
```
## Eigen-networks
Integrate single-cell expression data with regulatory network. Derived expression values can be regarded as the edge-expression of the regulatory network.
```{r scIRN, eval=TRUE}
# Apply function(X) to each frameset and apply function(x) to each row. Returns a list
IRN.NG <- fsApply(mESC.NG,
function(X){ apply(exprs(X), # expression values of cells in flowFrame X
1, # work along first dimenions (one row [cell] at a time)
function(x){ x[IRN$Source.Index] * (x[IRN$Target.Index] ^ IRN$Sign) }) },
simplify = F)
```
Obtain regulatory network archetypes (aka Eigen-Networks, aka principal components). Run *prcomp* on regulatory network data in 0i conditions, project all other conditions.
```{r PCA_spIRN_2iprojectElse, eval=TRUE}
# Run prcomp to obtain principal components
pcEdge0i <- prcomp(t(IRN.NG$'NG_CGR8-2i.fcs'), center=T, scale.=T)
```
Calculate the proportion of variance explained.
```{r screeplot, fig.width = 4, fig.asp = 1}
VarExpl <- (pcEdge0i$sdev^2) / sum(pcEdge0i$sdev^2)
# Screeplot
par(mar=c(4,4,.5,.5))
plot(cumsum(VarExpl), type='l', ylim=c(0,1), las=1,
xlab='Component', ylab='Variance Explained', bty='n'); abline(h=0.95, lty=2, col=2)
```
Function to project unseen data onto principal components.
```{r predictPCA}
# Function predictPCA()
predictPCA <- function(object, newdata, cmpnts=1:3) {
# function to project newdata onto principal components (object)
# newdata: matrix with samples (rows) and features (columns)
predictNewData <- apply(newdata, 2, function(iSample) { ((iSample - object$center) / object$scale) %*% object$rotation[,cmpnts] })
return(t(predictNewData))
}
```
Extract data of choice and project in principal component space of mESC.NG in 0i conditions.
```{r extractAndProject}
# specify PCs
pcomp = 1:2
# extract expression matrix (select flowFrame of choice, for instance 2i)
exprmat <- exprs(mESC.NG$'NG_CGR8-2i.fcs')
# project data (select corresponding integrated regulatory network of choice)
xy <- predictPCA(pcEdge0i, IRN.NG$'NG_CGR8-2i.fcs', cmpnts = pcomp)
colnames(xy) <- colnames(pcEdge0i$x[,pcomp])
```
Calculate density of cells in principal component space and visualise using hexbin.
```{r hexbinDensity}
hexpress(xy=xy, exprmat=exprmat, feats='density', pcomp=1:2)
```
Calculate overall expression and visualise using hexbin.
```{r hexbinOverall}
hexpress(xy=xy, exprmat=exprmat, feats='overall', pcomp=1:2, colix=colix)
```
Display node expression using hexbin.
```{r hexbinPCexprmat}
hexpress(xy=xy, exprmat=exprmat, feats='Nanog(Gd156)Dd', pcomp=1:2)
```
Display color bar.
```{r hexbinColorbar, fig.width = 3, fig.asp = 1}
col <- maPalette(low = 'blue', high = 'green', k = 11)
par(mar=c(3 ,7, 3, 7))
image(t(as.matrix(1:11)), col=col, axes=F)
axis(4, at = c(0,1),labels = c('low','high'),line = 0.25, las=1)
mtext(side=2, text='Expression',line = 0.5)
```
## Classification and surprisal analysis
```{r LoadPreviousData, eval=TRUE, echo=FALSE, cache.lazy=FALSE}
# Load data when knitting markdown
load(file = '../../Single Cell Networks Rnalyses/CyTOF/Mclust-GMM-Revised.RData')
load(file = '../../Single Cell Networks Rnalyses/CyTOF/Mclust-Density-Revised.RData')
load(file = '../../Single Cell Networks Rnalyses/CyTOF/Mclust-Classification-Revised.RData')
```
Fit multivariate Gaussian Mixture Model (GMM) with unconstrained co-variance.
```{r Mclust3D_gmm, eval=F}
# Fit multivariate gaussian mixture model
xyzMc <- Mclust(pcEdge0i$x[,1:3], G=4, modelNames = 'VVV')
# obtain density
xyzMc.density <- densityMclust(pcEdge0i$x[,1:3], G=4, modelNames = 'VVV')
```
Display total variance.
```{r TotalVariance, fig.width = 4, fig.asp = 1}
# total variance (trace of covariance matrix + and sum of across remaining off-diagona covariance matrix)
GMMvar <- rbind( apply(xyzMc$parameters$variance$sigma, 3,
function(x){sum(diag(x))}),
apply(xyzMc$parameters$variance$sigma, 3,
function(x){sum(abs(x))}) - apply(xyzMc$parameters$variance$sigma, 3,
function(x){sum(diag(x))})
)[,c(3,1,2,4)]
rownames(GMMvar) <- c('Trace','OffDiagonal')
# Plot results
par(mar=c(4,4,.5,.5))
barplot(GMMvar, las=1, names=1:4, xlab='Cluster', ylab='Total Variance')
```
Calculate centroid Eigen-Network for each cluster and export for external visualisation (e.g. using Cytoscape or yEd). See figure 4B in Stumpf & MacArthur (2019).
```{r CentroidEigenNetwork, eval=F}
# prepare data for export 1/3
clusterface <- cbind(( pcEdge0i$center + pcEdge0i$scale * t(xyzMc$parameters$mean[,1] %*% t(pcEdge0i$rotation[,1:3])) ),
( pcEdge0i$center + pcEdge0i$scale * t(xyzMc$parameters$mean[,2] %*% t(pcEdge0i$rotation[,1:3])) ),
( pcEdge0i$center + pcEdge0i$scale * t(xyzMc$parameters$mean[,3] %*% t(pcEdge0i$rotation[,1:3])) ),
( pcEdge0i$center + pcEdge0i$scale * t(xyzMc$parameters$mean[,4] %*% t(pcEdge0i$rotation[,1:3])) ))
colnames(clusterface) <- c('centroid-c1', 'centroid-c2', 'centroid-c3', 'centroid-c4')
rownames(clusterface) <- c()
# prepare data for export 2/3
pcexport <- cbind(pcEdge0i$center, pcEdge0i$scale)
colnames(pcexport) <- c('center', 'scale')
# prepare data for export 3/3
pcexport <- cbind(pcexport, pcEdge0i$rotation)
rownames(pcexport) <- c()
# write matrix for network
write.table(cbind(IRN, clusterface, pcexport),
file='IRN-NG.tgf', sep='\t', quote=F, row.names = F, col.names = T)
```
Calculate surprisal and visualise as beanplots.
```{r Mclust3D_surprisal, fig.width = 6, fig.asp = 2/3}
# Density - Surprisal
a <- lapply( IRN.NG, function(x) {-log(predict(xyzMc.density, newdata=predictPCA(pcEdge0i, x[,])))})
par(mar=c(4,4,.5,.5))
beanplot(a, what=c(F,T,F,F), cutmin=0, cutmax=40, ylim=c(5.5,28), log='y', las=2, bw=0.05, maxwidth = 1, border=NA, ylab='-log(p[x,y,z])')
```
Classification of points using Gaussian Mixture Model.
```{r Mclust3D_classification, eval=FALSE}
# Classification
xyzDA <- MclustDA(pcEdge0i$x[,1:3], xyzMc$classification, G=4, modelNames='VVV')
xyzDAcv <- cvMclustDA(xyzDA, nfold=10)
pcEdge0i.classification <- do.call(cbind, lapply(IRN.NG, function(x){table(predict(xyzDA, newdata=predictPCA(pcEdge0i, x))$classification)}))
# which cells are classified correctly?
pcEdge0i.classification.retained <- lapply(IRN.NG, function(x){rejectOption(xyzMc, predictPCA(pcEdge0i, x), G=1:4, alpha=0.1) })
```
Include reject-option (discard points that fall outside of a given percentile threshold).
```{r function_rejectOption}
##### function to reject points outside interval of any of the gaussian components
rejectOption <- function(gmModel, testData, G, alpha) {
# for each multivariate gaussian component calculate the F-value to compare against the quantile function
# and check against the chi-square distribution.
d = ncol(testData)
I = apply(gmModel$parameters$variance$sigma, 3, solve)
# calculate chi-square statistic
chiStat <- apply(testData, 1, function(x) { sapply(G, function(g) {
t(x - gmModel$parameters$mean[,g]) %*%
matrix(I[,g], d) %*%
(x - gmModel$parameters$mean[,g]) } ) } )
# check if chi-square statistic is larger (smaller) than corresponding quantile at the level of alpha (1-alpha)
retained = chiStat < qchisq(1-alpha, df=d)
# if G comprises multiple components (e.g. G=1:4), check whether cells are members of ANY component and return single logical vector
if (length(G)>1) {
retained = apply(retained, 2, any)
}
return(retained)
}
```
Illustrate percentiles of Gaussian components at PC3=0.
```{r Mclust3D_classification_percentile, fig.width = 4, fig.asp = 1}
# visualise the interval in 2D for z=0
xyzPlane <- cbind(as.vector(matrix(rep(seq(-20,20,0.1), 201), ncol = 401, byrow=T)), as.vector(matrix(rep(seq(-10,10, 0.1), 401), ncol = 201, byrow=F)), 0)
colnames(xyzPlane) <- c('x','y','z')
# plot an example - 90th percentile
par(mar=c(4,4,.5,.5))
# G=1
a <- rejectOption(xyzMc, xyzPlane, G=1, alpha=0.1)
z <- matrix(a, 201)
image(x=seq(-20,20,0.1), y=seq(-10,10,0.1), z=t(z), col=c(NA, '#1f78b466'), las=1, xlab='PC1', ylab='PC2')
# G=2
a <- rejectOption(xyzMc, xyzPlane, G=2, alpha=0.05)
z <- matrix(a, 201)
image(x=seq(-20,20,0.1), y=seq(-10,10,0.1), z=t(z), col=c(NA, '#33a02c66'), add=T, axes=F)
# G=3
a <- rejectOption(xyzMc, xyzPlane, G=3, alpha=0.05)
z <- matrix(a, 201)
image(x=seq(-20,20,0.1), y=seq(-10,10,0.1), z=t(z), col=c(NA, '#6a3d9a66'), add=T, axes=F)
# data points
points(pcEdge0i$x[,1:2], pch='.', col='black')
text(-12,9, labels='90th percentile')
```
Classify cells across entire reprogramming time-course.
```{r Mclust3D_classification_timecourse}
# Include reject option in classification
gmmClassification <- function(testData, gmModel, gmDA, pcaModel, tab=TRUE){
a=predictPCA(pcaModel, testData)
b=factor(predict(gmDA, newdata=a)$classification, levels=1:5)
c=rejectOption(gmModel, a, G=1:4, alpha=0.25)
b[!c]<-5
if (tab){ return(table(b))
} else { return(b) }
}
pcEdge0i.classification <- do.call(cbind, lapply(IRN.NG, gmmClassification, xyzMc, xyzDA, pcEdge0i ))
```
Visualise results as bar chart.
```{r Mclust3D_classification_timecourse_VIZ, fig.width = 6, fig.asp = 2/3}
# Viz timecourse classification results
par(mar=c(4,4,.5,.5))
barplot(t(t(pcEdge0i.classification) / colSums(pcEdge0i.classification)), las=2,
ylab='Fraction', col=c('#6a3d9a', '#1f78b4', '#33a02c', '#ff7f00', '#FFFFFF'))
```
Source diff could not be displayed: it is too large. Options to address this: view the blob.
docs/index_files/figure-html/Mclust3D_classification_percentile-1.png

82.3 KiB

docs/index_files/figure-html/Mclust3D_classification_timecourse_VIZ-1.png

56 KiB

docs/index_files/figure-html/Mclust3D_surprisal-1.png

109 KiB

docs/index_files/figure-html/TotalVariance-1.png

35.9 KiB

docs/index_files/figure-html/hexbinColorbar-1.png

17 KiB

docs/index_files/figure-html/hexbinDensity-1.png

66.1 KiB

docs/index_files/figure-html/hexbinOverall-1.png

81 KiB

docs/index_files/figure-html/hexbinPCexprmat-1.png

93.3 KiB

docs/index_files/figure-html/screeplot-1.png

44.1 KiB

hexpress <- function(xy, exprmat, feats, pcomp, colix) {
require(grid)
require(hexbin)
require(marray)
# modified hexagon bin function from hexbin package
source('./3rdparty/gridhexagons2.R')
# xy-boundaries
bndLo = c(-22, -9, -23, -11)
bndUp = c( 22, 12, 9, 11) # NG
# minimum data per bin
mincnt <- 5
# remove points outside some boundary.
is.na(xy[,1]) <- xy[,1] < bndLo[pcomp[1]] | xy[,1] > bndUp[pcomp[1]]
is.na(xy[,2]) <- xy[,2] < bndLo[pcomp[2]] | xy[,2] > bndUp[pcomp[2]]
if (feats == 'density') {
# Calculate density
hb12 <- hexbin::hexbin(xy, IDs=TRUE, xbins=40,
xbnds=c(bndLo[pcomp[1]], bndUp[pcomp[1]]),
ybnds=c(bndLo[pcomp[2]], bndUp[pcomp[2]]))
par(mar=c(2,2,.5,.5), bty='n')
P <- plot(hb12, type='n', legend=F, xaxt='n', yaxt='n', clip='on', main='Density')
P$plot.vp@hexVp.off$xscale=c(bndLo[pcomp[1]], bndUp[pcomp[1]])
P$plot.vp@hexVp.off$yscale=c(bndLo[pcomp[2]], bndUp[pcomp[2]])
pushViewport(P$plot.vp@hexVp.off)
grid.hexagons2(hb12, use.count=T, style= "colorscale",
colramp=function(n){maPalette(low='#EEEEEE', high='#000000', k = n)}, mincnt=mincnt)
da <- xy[hb12@cID %in% hb12@cell[hb12@count < mincnt],]
grid.xaxis(); grid.yaxis()
grid.points(da[,1], da[,2], pch='.')
popViewport()
}
else if (feats == 'overall'){
# Calculate total expression level
a <- rowSums(exprmat[,colix])
# Visualise as hexbin
hb12 <- hexbin::hexbin(xy, IDs=TRUE)
hb12@cAtt <- hexTapply(hbin = hb12, FUN = median, dat = a)
par(mar=c(2,2,.5,.5))
P <- plot(hb12, type='n', legend=F, xlab=paste0("PC", pcomp[1]), ylab=paste0("PC", pcomp[2]), main='Overall expression')
pushHexport(P$plot.vp)
grid.hexagons2(hb12, use.count=F, cell.at=NULL, style= "colorscale",
colramp=function(n){maPalette(low='blue', high='green', k = n)})
popViewport()
}
else {
nodeix <- which(colnames(exprmat) == feats)
hb12 <- hexbin::hexbin(xy, IDs=TRUE)
hb12@cAtt <- hexTapply(hbin = hb12, FUN = mean, dat = exprmat[,nodeix])
par(mar=c(2,2,.5,.5))
P <- plot(hb12, type='n', legend=F, xlab=paste0("PC", pcomp[1]), ylab=paste0("PC", pcomp[2]), main=feats)
pushHexport(P$plot.vp)
grid.hexagons2(hb12, use.count=F, cell.at=NULL, style= "colorscale", mincnt=0, colramp=function(n){maPalette(low='blue', high='green', k = n)})
popViewport()
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment