【原】表观调控13张图之三。。。

健明 2021-07-14

展开全文

现在我们再解读一下第三张图，如果你对视频感兴趣，还是可以继续留邮箱，我们在圣诞节（明天）统一发邮件给大家全部的视频云盘链接和配套代码哈！

本章节我们的视频审查员（刘博-二货潜）将继续带领大家学习视频，并且复现附件中Figure S13的一张图，如下：

本文的目录如下：

DESeq2 寻找差异基因
可视化

(1)MAplot：图a和图b

(2)差异基因韦恩图:UpSetR/VennDiagram

(3)两样本 log2FC 相关性散点图

DESeq2寻找差异基因

万事开头前先看包的说明书：DESeq2 manual ，里面是讲的很详细很全面的，所以下面与文章不相关的图就不展示出来了。

rm(list = ls())
options(stringsAsFactors = F)
a = read.table('../figure-01-check-gene-expression/all.counts.id.txt', header = T)
dim(a)
dat = a[,7:16]

# 第一列为基因名，将其赋值给行名, 做韦恩图需要
rownames(dat)=a[, 1]

# 查看前四行和前四列信息
dat[1:4, 1:4]
library(stringr)

# 要擅用 str_split 切割字符串，表示按照下划线 "_" 对列名进行切割，取第一列；即样本名
# 三组，每个样品一组，即 PhoKO、SppsKO、WT
group_list = str_split(colnames(dat), '_', simplify = T)[, 1] 
table(group_list)

######################################################################
###################      Firstly for DEseq2      #####################
######################################################################
# 一步运行
# 这里我们主要是得到差异基因中间部分就不细说
if(T){
  # 赋值表达矩阵和分组信息为一个新的变量，方便以后直接调用
  exprSet = dat
  suppressMessages(library(DESeq2)) # 加载包

  (colData <- data.frame(row.names = colnames(exprSet), 
                         group_list = group_list) )

  # 构建一个表达矩阵
  dds <- DESeqDataSetFromMatrix(countData = exprSet,
                                colData = colData,
                                design = ~ group_list)
  dds <- DESeq(dds)
  png("qc_dispersions.png", 1000, 1000, pointsize = 20)
  plotDispEsts(dds, main="Dispersion plot")
  dev.off()


  rld <- rlogTransformation(dds)
  exprMatrix_rlog = assay(rld) 
  #write.csv(exprMatrix_rlog,'exprMatrix.rlog.csv' )

  normalizedCounts1 <- t( t(counts(dds)) / sizeFactors(dds) )
  # normalizedCounts2 <- counts(dds, normalized=T) # it's the same for the tpm value
  # we also can try cpm or rpkm from edgeR pacage
  exprMatrix_rpm = as.data.frame(normalizedCounts1) 
  head(exprMatrix_rpm)
  #write.csv(exprMatrix_rpm,'exprMatrix.rpm.csv' )

  png("DEseq_RAWvsNORM.png", height = 800, width = 800)
  par(cex = 0.7)
  n.sample = ncol(exprSet)
  if(n.sample > 40) par(cex = 0.5)
  cols <- rainbow(n.sample*1.2)
  par(mfrow = c(2, 2))
  boxplot(exprSet, col = cols,main = "expression value", las = 2)
  boxplot(exprMatrix_rlog, col = cols, main = "expression value", las = 2)
  hist(as.matrix(exprSet))
  hist(exprMatrix_rlog)
  dev.off()

  library(RColorBrewer)
  (mycols <- brewer.pal(8, "Dark2")[1:length(unique(group_list))])
  cor(as.matrix(exprSet))
  # Sample distance heatmap
  sampleDists <- as.matrix(dist(t(exprMatrix_rlog)))
  #install.packages("gplots",repos = "http://cran.us.")
  library(gplots)

  png("qc-heatmap-samples.png", w = 1000, h = 1000, pointsize = 20)
  heatmap.2(as.matrix(sampleDists), key = F, trace = "none",
            col = colorpanel(100, "black", "white"),
            ColSideColors = mycols[group_list], RowSideColors = mycols[group_list],
            margin = c(10, 10), main="Sample Distance Matrix")
  dev.off()

  cor(exprMatrix_rlog) 

  table(group_list)
  res <- results(dds, 
                 contrast=c("group_list", "SppsKO", "WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_SppsKO = as.data.frame(resOrdered)
  DEG_SppsKO = na.omit(DEG_SppsKO)

  table(group_list)
  res <- results(dds, 
                 contrast = c("group_list","PhoKO","WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_PhoKO = as.data.frame(resOrdered)
  DEG_PhoKO = na.omit(DEG_PhoKO)
  save(DEG_PhoKO, DEG_SppsKO,file = 'deg_output.Rdata')
}

简化一下，如果不需要中间的信息，我们只需要差异基因的话，那么只需要运行以下代码：

if(T){
  # 赋值表达矩阵和分组信息为一个新的变量，方便以后直接调用
  exprSet = dat
  suppressMessages(library(DESeq2)) # 加载包

  (colData <- data.frame(row.names = colnames(exprSet), 
                         group_list = group_list) )

  # 构建一个表达矩阵
  dds <- DESeqDataSetFromMatrix(countData = exprSet,
                                colData = colData,
                                design = ~ group_list)
  dds <- DESeq(dds)

# 下面我们得到 Spps 突变后的差异基因    
  res <- results(dds, 
                 contrast=c("group_list", "SppsKO", "WT")) 
# 注意这里是前面比后面，别把顺序搞反了，到时候一不注意结果就是反的。所以拿差异倍数对着原始 reads 比较一下。

  resOrdered <- res[order(res$padj),] # 按照 padj 值进行排序
  head(resOrdered)
  DEG_SppsKO = as.data.frame(resOrdered) # 将数据转变为 data.frame 数据框
  DEG_SppsKO = na.omit(DEG_SppsKO) # 去除包含 NA 值的行

# 下面我们得到 Pho 突变后的差异基因 
  table(group_list)
  res <- results(dds, 
                 contrast = c("group_list","PhoKO","WT"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_PhoKO = as.data.frame(resOrdered)
  DEG_PhoKO = na.omit(DEG_PhoKO)

# 将关键结果以 Rdata 形式保存到本地，下次如有需要就不需要重新用 DESeq2 计算了    
  save(DEG_PhoKO, DEG_SppsKO,file = 'deg_output.Rdata')
}

好了，上面我们就得到了差异基因。

可视化

MAplot: 图 a 和图 b

什么是 MAplot ?DESeq2 包说明书中的一段话：

In DESeq2, the function plotMA shows the log₂ fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet. Points will be colored red if the adjusted p value is less than 0.1. Points which fall out of the window are plotted as open triangles pointing either up or down.

也就是说表示的是变化倍数 log₂(Fold change) 与所有样本标准化后的 counts 数的平均值之间的关系。那么怎么画呢？如果看过 DESeq2 说明书就知道这是 MA-plot 部分。由于我们这里是将三组都放在一个 dds 中，所以我们得分别挑出 Pho 和 Spps 进行可视化。

首先查看 dds 中的分组情况：

resultsNames(dds)

[1] "Intercept"                  "group_list_SppsKO_vs_PhoKO" "group_list_WT_vs_PhoKO"

lfcShrink 收缩 FC 三种方法如下（这里直接放原文）：

normal is the the original DESeq2 shrinkage estimator, an adaptive Normal distribution as prior. This is currently the default, although the default will likely change to apeglm in the October 2018 release given apeglm’s superior performance.
apeglm is the adaptive t prior shrinkage estimator from the apeglm package (Zhu, Ibrahim, and Love 2018).
ashr is the adaptive shrinkage estimator from the ashr package (Stephens 2016). Here DESeq2 uses the ashr option to fit a mixture of Normal distributions to form the prior, with method="shrinkage".

绘制 Spps

# apeglm 方法需要安装 apeglm 包
# BiocManager::install("apeglm")

# ashr 方法同样需要安装额外依赖的包
# BiocManager::install("ashr")

Spps_resLFC <- lfcShrink(dds, coef  = "group_list_SppsKO_vs_PhoKO", type = "apeglm")
Spps_resNorm <- lfcShrink(dds, coef = "group_list_SppsKO_vs_PhoKO", type = "normal")
Spps_resAsh <- lfcShrink(dds, coef  = "group_list_SppsKO_vs_PhoKO", type = "ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)

plotMA(Spps_resLFC, xlim  = xlim, ylim = ylim, main = "apeglm")
plotMA(Spps_resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(Spps_resAsh, xlim  = xlim, ylim = ylim, main = "ashr")

dev.off()

绘制 Pho

Pho_resLFC <- lfcShrink(dds, coef  = "group_list_WT_vs_PhoKO", type = "apeglm")
Pho_resNorm <- lfcShrink(dds, coef = "group_list_WT_vs_PhoKO", type = "normal")
Pho_resAsh <- lfcShrink(dds, coef  = "group_list_WT_vs_PhoKO", type = "ashr")

par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)

plotMA(Pho_resLFC, xlim  = xlim, ylim = ylim, main = "apeglm")
plotMA(Pho_resNorm, xlim = xlim, ylim = ylim, main = "normal")
plotMA(Pho_resAsh, xlim  = xlim, ylim = ylim, main = "ashr")
dev.off()

好了，我们选取 normal （开心就好，你选哪个），来绘制图 a 和 b

par(mfrow=c(1,2), mar=c(4,4,2,1))
plotMA(Spps_resNorm, xlim = xlim, ylim = ylim, main = "Spps_normal")
plotMA(Pho_resNorm, xlim = xlim, ylim = ylim, main = "Pho_normal")
dev.off()

差异基因韦恩图:UpSetR/VennDiagram

我们下面将用两种方式来展示交集

第一种：我们使用 R 包 UpSetR 来绘制差异基因之间的韦恩图（多组时候，这种更加一目了然）

library(UpSetR)
load(file = 'deg_output.Rdata')

colnames(DEG_PhoKO)
DEG_PhoKO$change = ifelse(DEG_PhoKO$padj > 0.05,'stable',
                          ifelse(DEG_PhoKO$log2FoldChange > 0, 'up', 'down'))

DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05, 'stable',
                         ifelse(DEG_SppsKO$log2FoldChange > 0, 'up', 'down'))

SppsKO_up   = rownames(DEG_SppsKO[DEG_SppsKO$change == 'up',])
SppsKO_down = rownames(DEG_SppsKO[DEG_SppsKO$change == 'down',])
PhoKO_up    = rownames(DEG_PhoKO[DEG_PhoKO$change == 'up',])
PhoKO_down  = rownames(DEG_PhoKO[DEG_PhoKO$change == 'down',])

allG = unique(c(SppsKO_up, SppsKO_down, PhoKO_up, PhoKO_down))

df = data.frame(allG = allG,
              SppsKO_up   = as.numeric(allG %in% SppsKO_up),
              SppsKO_down = as.numeric(allG %in% SppsKO_down),
              PhoKO_up    = as.numeric(allG %in% PhoKO_up),
              PhoKO_down  = as.numeric(allG %in% PhoKO_down))

upset(df)

第二种：我们使用 VennDiagram来展示，也是就是文中那种图

# 这里直接 copy 琪同学的
# 链接: https://mp.weixin.qq.com/s/Pg0mjz7mD73atMnHz7jv1A

# 导入本地字体，不然 `Arial` 字体会警告
library("extrafont")
font_import()
loadfonts()

load(file = 'deg_output.Rdata')

colnames(DEG_PhoKO)
DEG_PhoKO$change = ifelse(DEG_PhoKO$padj > 0.05,'stable',
                          ifelse(DEG_PhoKO$log2FoldChange > 0, 'up', 'down'))

DEG_SppsKO$change=ifelse(DEG_SppsKO$padj>0.05, 'stable',
                         ifelse(DEG_SppsKO$log2FoldChange > 0, 'up', 'down'))

SppsKO_up   = rownames(DEG_SppsKO[DEG_SppsKO$change == 'up',])
SppsKO_down = rownames(DEG_SppsKO[DEG_SppsKO$change == 'down',])
PhoKO_up    = rownames(DEG_PhoKO[DEG_PhoKO$change == 'up',])
PhoKO_down  = rownames(DEG_PhoKO[DEG_PhoKO$change == 'down',])

library(VennDiagram)
venn.diagram(
  x = list(
    "Up in phoKO"    = PhoKO_up,
    "Down in phoKO"  = PhoKO_down,
    "Up in SppsKO"   = SppsKO_up,
    "Down in SppsKO" = SppsKO_down
    ),
  filename       = "Venn.png", # 保存到当前目录
  # stroke 颜色 形式 粗细
  col            = "#000000", lwd = 3, lty = 1,
  # 填充 透明度
  # 颜色可以参考前一篇，使用 takecolor 自己取色
  fill           = c("#D3E7F0", "#9FBEDB", "#A0D191", "#6DAE8A"),
  alpha          = 0.50,
  # 里外字体大小形式
  cex            = 1.5,
  fontfamily     = "Arial",
  fontface       = "bold",
  cat.cex        = 1.5,
  cat.dist       = 0.2,
  cat.fontfamily = "Arial",
  # 图像整体位置 分辨率
  margin         = 0.2,
  resolution     = 300)

与文章趋势基本一致。可以看出 Spps 和 Pho 共同调控许多基因，说明这两基因有一定的关系。

两样本 log2FC 相关性散点图

load(file = 'deg_output.Rdata')
library(ggpubr)
DEG_PhoKO = DEG_PhoKO[rownames(DEG_SppsKO),]
po = data.frame(SppsKO = DEG_SppsKO$log2FoldChange,
              PhoKO = DEG_PhoKO$log2FoldChange)

sp <- ggscatter(po, 'SppsKO', 'PhoKO',
                add        = "reg.line",  # Add regressin line
                add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
                conf.int   = TRUE # Add confidence interval
)
# Add correlation coefficient
sp + stat_cor(method  = "pearson", 
              label.x = -10, label.y = 10) # 相关系数显示位置