男, 一个长大了才会遇到的帅哥, 稳健,潇洒,大方,靠谱。 一段生信缘,一棵技能树, 一枚大型测序工厂的螺丝钉, 一个随机森林中提灯觅食的津门旅客。 library(Seurat) library(SeuratData) levels(pbmc3k.final)
[1] "Naive CD4 T" "Memory CD4 T" "CD14+ Mono" "B" "CD8 T" [6] "FCGR3A+ Mono" "NK" "DC" "Platelet"
VlnPlot(pbmc3k.final, "CD4",slot = "data")
作为一个生物信息工程师,看到这样的图,请解释。 为什么CD14+ Mono和 Memory CD4 T 有怎么多的点,却没有小提琴呢? 那, 我们要看看作图细节了。 > VlnPlot function (object, features, cols = NULL, pt.size = 1, idents = NULL, sort = FALSE, assay = NULL, group.by = NULL, split.by = NULL, adjust = 1, y.max = NULL, same.y.lims = FALSE, log = FALSE, ncol = NULL, combine = TRUE, slot = "data", ...) { return(ExIPlot(object = object, type = "violin", features = features, idents = idents, ncol = ncol, sort = sort, assay = assay, y.max = y.max, same.y.lims = same.y.lims, adjust = adjust, pt.size = pt.size, cols = cols, group.by = group.by, split.by = split.by, log = log, slot = slot, combine = combine, ...)) } <bytecode: 0x1a20fce0> <environment: namespace:Seurat>`
可惜并没有细节,再看ExIPlot。 > ExIPlot Error: object 'ExIPlot' not found
不是显式函数啊。我们通过debug的方式进入函数内部。 > debug(VlnPlot) > VlnPlot(pbmc3k.final, "CD4",slot = "data") debugging in: VlnPlot(pbmc3k.final, "CD4", slot = "data") debug: { return(ExIPlot(object = object, type = "violin", features = features, idents = idents, ncol = ncol, sort = sort, assay = assay, y.max = y.max, same.y.lims = same.y.lims, adjust = adjust, pt.size = pt.size, cols = cols, group.by = group.by, split.by = split.by, log = log, slot = slot, combine = combine, ...)) } Browse[2]> debug: return(ExIPlot(object = object, type = "violin", features = features, idents = idents, ncol = ncol, sort = sort, assay = assay, y.max = y.max, same.y.lims = same.y.lims, adjust = adjust, pt.size = pt.size, cols = cols, group.by = group.by, split.by = split.by, log = log, slot = slot, combine = combine, ...)) Browse[2]> ExIPlot function (object, features, type = "violin", idents = NULL, ncol = NULL, sort = FALSE, assay = NULL, y.max = NULL, same.y.lims = FALSE, adjust = 1, cols = NULL, pt.size = 0, group.by = NULL, split.by = NULL, log = FALSE, combine = TRUE, slot = "data", ...) { assay <- assay %||% DefaultAssay(object = object) DefaultAssay(object = object) <- assay ncol <- ncol %||% ifelse(test = length(x = features) > 9, yes = 4, no = min(length(x = features), 3)) data <- FetchData(object = object, vars = features, slot = slot) features <- colnames(x = data) if (is.null(x = idents)) { cells <- colnames(x = object) } else { cells <- names(x = Idents(object = object)[Idents(object = object) %in% idents]) } data <- data[cells, , drop = FALSE] idents <- if (is.null(x = group.by)) { Idents(object = object)[cells] } else { object[[group.by, drop = TRUE]][cells] } if (!is.factor(x = idents)) { idents <- factor(x = idents) } if (is.null(x = split.by)) { split <- NULL } else { split <- object[[split.by, drop = TRUE]][cells] if (!is.factor(x = split)) { split <- factor(x = split) } if (is.null(x = cols)) { cols <- hue_pal()(length(x = levels(x = idents))) cols <- Interleave(cols, InvertHex(hexadecimal = cols)) } else if (length(x = cols) == 1 && cols == "interaction") { split <- interaction(idents, split) cols <- hue_pal()(length(x = levels(x = idents))) } else { cols <- Col2Hex(cols) } if (length(x = cols) < length(x = levels(x = split))) { cols <- Interleave(cols, InvertHex(hexadecimal = cols)) } cols <- rep_len(x = cols, length.out = length(x = levels(x = split))) names(x = cols) <- sort(x = levels(x = split)) } if (same.y.lims && is.null(x = y.max)) { y.max <- max(data) } plots <- lapply(X = features, FUN = function(x) { return(SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents = idents, split = split, sort = sort, y.max = y.max, adjust = adjust, cols = cols, pt.size = pt.size, log = log, ...)) }) label.fxn <- switch(EXPR = type, violin = ylab, ridge = xlab, stop("Unknown ExIPlot type ", type, call. = FALSE)) for (i in 1:length(x = plots)) { key <- paste0(unlist(x = strsplit(x = features[i], split = "_"))[1], "_") obj <- names(x = which(x = Key(object = object) == key)) if (length(x = obj) == 1) { if (inherits(x = object[[obj]], what = "DimReduc")) { plots[[i]] <- plots[[i]] + label.fxn(label = "Embeddings Value") } else if (inherits(x = object[[obj]], what = "Assay")) { next } else { warning("Unknown object type ", class(x = object), immediate. = TRUE, call. = FALSE) plots[[i]] <- plots[[i]] + label.fxn(label = NULL) } } else if (!features[i] %in% rownames(x = object)) { plots[[i]] <- plots[[i]] + label.fxn(label = NULL) } } if (combine) { combine.args <- list(plots = plots, ncol = ncol) combine.args <- c(combine.args, list(...)) if (!"legend" %in% names(x = combine.args)) { combine.args$legend <- "none" } plots <- do.call(what = "CombinePlots", args = combine.args) } return(plots) } <bytecode: 0x19dc8580> <environment: namespace:Seurat>
这一层函数也没讲小提琴如何画的,再看SingleExIPlot。 Browse[2]> SingleExIPlot function (data, idents, split = NULL, type = "violin", sort = FALSE, y.max = NULL, adjust = 1, pt.size = 0, cols = NULL, seed.use = 42, log = FALSE) { if (!is.null(x = seed.use)) { set.seed(seed = seed.use) } if (!is.data.frame(x = data) || ncol(x = data) != 1) { stop("'SingleExIPlot requires a data frame with 1 column") } feature <- colnames(x = data) data$ident <- idents if ((is.character(x = sort) && nchar(x = sort) > 0) || sort) { data$ident <- factor(x = data$ident, levels = names(x = rev(x = sort(x = tapply(X = data[, feature], INDEX = data$ident, FUN = mean), decreasing = grepl(pattern = paste0("^", tolower(x = sort)), x = "decreasing"))))) } if (log) { noise <- rnorm(n = length(x = data[, feature]))/200 data[, feature] <- data[, feature] + 1 } else { noise <- rnorm(n = length(x = data[, feature]))/1e+05 } if (all(data[, feature] == data[, feature][1])) { warning(paste0("All cells have the same value of ", feature, ".")) } else { data[, feature] <- data[, feature] + noise } axis.label <- "Expression Level" y.max <- y.max %||% max(data[, feature]) if (is.null(x = split) || type != "violin") { vln.geom <- geom_violin fill <- "ident" } else { data$split <- split vln.geom <- geom_split_violin fill <- "split" } switch(EXPR = type, violin = { x <- "ident" y <- paste0("`", feature, "`") xlab <- "Identity" ylab <- axis.label geom <- list(vln.geom(scale = "width", adjust = adjust, trim = TRUE), theme(axis.text.x = element_text(angle = 45, hjust = 1))) jitter <- geom_jitter(height = 0, size = pt.size) log.scale <- scale_y_log10() axis.scale <- ylim }, ridge = { x <- paste0("`", feature, "`") y <- "ident" xlab <- axis.label ylab <- "Identity" geom <- list(geom_density_ridges(scale = 4), theme_ridges(), scale_y_discrete(expand = c(0.01, 0)), scale_x_continuous(expand = c(0, 0))) jitter <- geom_jitter(width = 0, size = pt.size) log.scale <- scale_x_log10() axis.scale <- function(...) { invisible(x = NULL) } }, stop("Unknown plot type: ", type)) plot <- ggplot(data = data, mapping = aes_string(x = x, y = y, fill = fill)[c(2, 3, 1)]) + labs(x = xlab, y = ylab, title = feature, fill = NULL) + theme_cowplot() + theme(plot.title = element_text(hjust = 0.5)) plot <- do.call(what = "+", args = list(plot, geom)) plot <- plot + if (log) { log.scale } else { axis.scale(min(data[, feature]), y.max) } if (pt.size > 0) { plot <- plot + jitter } if (!is.null(x = cols)) { if (!is.null(x = split)) { idents <- unique(x = as.vector(x = data$ident)) splits <- unique(x = as.vector(x = data$split)) labels <- if (length(x = splits) == 2) { splits } else { unlist(x = lapply(X = idents, FUN = function(pattern, x) { x.mod <- gsub(pattern = paste0(pattern, "."), replacement = paste0(pattern, ": "), x = x, fixed = TRUE) x.keep <- grep(pattern = ": ", x = x.mod, fixed = TRUE) x.return <- x.mod[x.keep] names(x = x.return) <- x[x.keep] return(x.return) }, x = unique(x = as.vector(x = data$split)))) } if (is.null(x = names(x = labels))) { names(x = labels) <- labels } } else { labels <- levels(x = droplevels(data$ident)) } plot <- plot + scale_fill_manual(values = cols, labels = labels) } return(plot) } <bytecode: 0x1a735330> <environment: namespace:Seurat>
我们看到核心了,ggplot的代码。 geom <- list(vln.geom(scale = "width", adjust = adjust, trim = TRUE), theme(axis.text.x = element_text(angle = 45, hjust = 1))) jitter <- geom_jitter(height = 0, size = pt.size) log.scale <- scale_y_log10() axis.scale <- ylim
这句子写的真美啊。 那我们就要来试一下了。记得退出debug模式哦。 undebug(VlnPlot) p<-VlnPlot(pbmc3k.final, "CD4",slot = "data") #ggplot对象中记录了一张图的所有信息,为了方便演示,我们只取数据出来。
head(p$data) # 从图中抠数据,学会了吗?
CD4 ident AAACATACAACCAC 1.370958e-05 Memory CD4 T AAACATTGAGCTAC -5.646982e-06 B AAACATTGATCAGC 3.631284e-06 Memory CD4 T AAACCGTGCTTCCG 6.328626e-06 CD14+ Mono AAACCGTGTATGCG 4.042683e-06 NK AAACGCACTGGTAC -1.061245e-06 Memory CD4 T
原汁原味的啊: ggplot(p$data,aes(ident,CD4)) + geom_violin() + theme_bw()
啥也没有: 加个抖动吧: ggplot(p$data,aes(ident,CD4)) + geom_violin() + geom_jitter()+ theme_bw()
这下有点了。所以我们看到的点有左右的区分其实是抖出来的,本身数据的点应该是在一条直线上。然而,小提琴呢? ggplot(p$data,aes(ident,CD4)) + geom_violin(scale = "width", adjust =1, trim = TRUE) + geom_jitter()+ theme_bw()
这里我们用seurat内部绘制小提琴图的方式还原了我们问题:为什么CD14+ Mono和 Memory CD4 T 有怎么多的点,却没有小提琴呢?经过上面演示我们知道,其实默认的情况下,我们的数据是都没有小提琴的。所以,当务之急是抓紧时间看看geom_violin的帮助文档。 ?geom_violin ?geom_violin ?geom_violin
好了,我们知道一个关键的参数scale = "width" 导致了这种局面,其他没有出现小提琴的应该是零值比例太多。 作为好奇,我们看看改一下adjust会有什么改变。 ggplot(p$data,aes(ident,CD4)) + geom_violin(scale = "width", adjust =.5, trim = TRUE) + geom_jitter()+ theme_bw()
腰变细了,好玩。 既然已经基本锁定问题,我们如何画出都有小提琴的小提琴图呢?也许可以用的方法之一就是,数据过滤。 VlnPlot(subset(pbmc3k.final,CD4 > 0 ), "CD4")+ theme_bw()
什么?改一下腰围? VlnPlot(subset(pbmc3k.final,CD4 > 0 ), "CD4",adjust = .2)+ theme_bw()
Seurat小提琴图为什么有的只有点儿?那是因为还有更多的点没忽视。
|