最近我们 最优秀的R语言讲师 小洁也开启了TCGA知识库打卡之旅,分享一下她其中一个学习成果,TCGA(转录组)差异分析三大R包及其结果对比。
如果你跟着她的教程学会了相关分析,可以尝试完成一个学徒作业:理解RNA-seq表达矩阵的两个形式
TCGA数据库背景知识推文合辑 众所周知,TCGA数据库是目前最综合最全面 的癌症病人相关组学数据库,包括:
DNA Sequencing
miRNA Sequencing
Protein Expression array
mRNA Sequencing
Total RNA Sequencing
Array-based Expression
DNA Methylation
Copy Number array
知名的肿瘤研究机构都有着自己的TCGA数据库探索工具 ,比如:
Broad Institute FireBrowse portal, The Broad Institute
cBioPortal for Cancer Genomics, Memorial Sloan-Kettering Cancer Center
所以我挑选了部分,写了6个数据下载系列教程 :
虽然说,教程是关于TCGA数据库的不同数据的下载,实际上是希望可以帮助大家认识TCGA数据库的全貌 ,然后根据大家的提问,我也扩充了部分常见的TCGA数据库用法 :
1.准备R包 if (!require (stringr))install.packages('stringr' )if (!require (ggplotify))install.packages("ggplotify" )if (!require (patchwork))install.packages("patchwork" )if (!require (cowplot))install.packages("cowplot" )if (!require (DESeq2))install.packages('DESeq2' )if (!require (edgeR))install.packages('edgeR' )if (!require (limma))install.packages('limma' )
## 点评:这样的R包安装方法是有问题,大家自行思考一下
2.准备数据 本示例的数据是TCGA-KIRC的miRNA表达矩阵 。tcga样本编号14-15位是隐藏分组信息的,详见: 准考证号,身份证号码,TCGA样本条形码的区别
这里需要注意的是miRNA也是测序拿到的表达矩阵,所以分析等同于RNA-seq得到表达矩阵,一定要跟芯片数据分析区分开来哦! ! !
rm(list = ls()) load("tcga_kirc_exp.Rdata" ) #表达矩阵expr dim(expr)
# # [1] 552 593
group_list <- ifelse(as.numeric(str_sub(colnames(expr),14 ,15 ))<10 ,"tumor" ,"normal" ) group_list <- factor(group_list,levels = c("normal" ,"tumor" )) table(group_list)
# # group_list # # normal tumor # # 71 522
3.三大R包的差异分析 3.1 Deseq2 library (DESeq2) colData <- data.frame(row.names =colnames(expr), condition=group_list) dds <- DESeqDataSetFromMatrix( countData = expr, colData = colData, design = ~ condition)#参考因子应该是对照组 dds$condition <- relevel(dds$condition, ref = "untrt") dds <- DESeq(dds)# 两两比较 res <- results(dds, contrast = c("condition" ,rev(levels(group_list)))) resOrdered <- res[order(res$pvalue),] # 按照P值排序 DEG <- as.data.frame(resOrdered) head(DEG)# 去除NA值 DEG <- na.omit(DEG)#添加change列标记基因上调下调 #logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) ) logFC_cutoff <- 1 DEG$change = as.factor( ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff, ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP' ,'DOWN' ),'NOT' ) ) head(DEG) DESeq2_DEG <- DEG
3.2 edgeR library (edgeR) dge <- DGEList(counts=expr,group=group_list) dge$samples$lib.size <- colSums(dge$counts) dge <- calcNormFactors(dge) design <- model.matrix(~0 +group_list) rownames(design)<-colnames(dge) colnames(design)<-levels(group_list) dge <- estimateGLMCommonDisp(dge,design) dge <- estimateGLMTrendedDisp(dge, design) dge <- estimateGLMTagwiseDisp(dge, design) fit <- glmFit(dge, design) fit2 <- glmLRT(fit, contrast=c(-1 ,1 )) DEG=topTags(fit2, n=nrow(expr)) DEG=as.data.frame(DEG) logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2 *sd(abs(logFC)) ) logFC_cutoff <- 1 DEG$change = as.factor( ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff, ifelse(DEG$logFC > logFC_cutoff ,'UP' ,'DOWN' ),'NOT' ) ) head(DEG)
## logFC logCPM LR PValue FDR ## hsa-mir-508 -4 .264945 5.3610815 825.7952 1.329948e-181 7.341313e-179 ## hsa-mir-514-3 -4 .262325 3.5005425 674.3829 1.112883e-148 3.071556e-146 ## hsa-mir-514-2 -4 .258203 3.4771070 658.6855 2.885406e-145 5.309148e-143 ## hsa-mir-506 -5 .522829 0.7477531 654.6812 2.143124e-144 2.957511e-142 ## hsa-mir-514-1 -4 .271951 3.4852217 642.0128 1.219493e-141 1.346320e-139 ## hsa-mir-514b -5 .956182 0.3742949 579.5893 4.606971e-128 4.238413e-126 ## change ## hsa-mir-508 DOWN ## hsa-mir-514-3 DOWN ## hsa-mir-514-2 DOWN ## hsa-mir-506 DOWN ## hsa-mir-514-1 DOWN ## hsa-mir-514b DOWN
table(DEG$change)
# # # # DOWN NOT UP # # 64 368 120
edgeR_DEG <- DEG
3.limma-voom library (limma) design <- model.matrix(~0 +group_list) colnames(design)=levels(group_list) rownames(design)=colnames(expr) dge <- DGEList(counts=expr) dge <- calcNormFactors(dge) logCPM <- cpm(dge, log=TRUE , prior.count=3 ) v <- voom(dge,design, normalize="quantile" ) fit <- lmFit(v, design) constrasts = paste(rev(levels(group_list)),collapse = "-" ) cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) fit2=contrasts.fit(fit,cont.matrix) fit2=eBayes(fit2) DEG = topTable(fit2, coef=constrasts, n=Inf ) DEG = na.omit(DEG)#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) ) logFC_cutoff <- 1 DEG$change = as.factor( ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff, ifelse(DEG$logFC > logFC_cutoff ,'UP' ,'DOWN' ),'NOT' ) ) head(DEG)
## logFC AveExpr t P .Value adj .P .Val ## hsa-mir-141 -5 .492612 2.990323 -31 .22459 1.280624e-127 7.069043e-125 ## hsa-mir-200c -5 .333666 5.687063 -30 .87441 8.224995e-126 2.270099e-123 ## hsa-mir-3613 2.199074 3.862900 23.32209 5.152888e-86 9.481314e-84 ## hsa-mir-15a 1.335460 7.014647 22.83389 2.008313e-83 2.771472e-81 ## hsa-mir-934 -3 .234590 -1 .930201 -22 .39709 4.148098e-81 4.579500e-79 ## hsa-mir-122 5.554068 3.112250 22.33183 9.192713e-81 8.457296e-79 ## B change ## hsa-mir-141 280.9396 DOWN ## hsa-mir-200c 276.7881 DOWN ## hsa-mir-3613 185.4023 UP ## hsa-mir-15a 179.4222 UP ## hsa-mir-934 174.1455 DOWN ## hsa-mir-122 173.3486 UP
limma_voom_DEG <- DEG#save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,group_list,file = "DEG.Rdata")
4.差异分析结果的可视化 #作图函数是我自己写的,在3-plotfunction.R 里,在示例数据中提供了,需放在工作目录下。
rm(list = ls()) load("tcga_kirc_exp.Rdata" ) load("DEG.Rdata" )source ("3-plotfunction.R" ) logFC_cutoff <- 1 expr[1 :4 ,1 :4 ]
# # TCGA-A3-3307-01A-01T-0860-13 TCGA-A3-3308-01A-02R-1324-13 # # hsa-let-7a-1 5056 14503 # # hsa-let-7a-2 10323 29238 # # hsa-let-7a-3 5429 14738 # # hsa-let-7b 17908 37062 # # TCGA-A3-3311-01A-02R-1324-13 TCGA-A3-3313-01A-02R-1324-13 # # hsa-let-7a-1 8147 7138 # # hsa-let-7a-2 16325 14356 # # hsa-let-7a-3 8249 7002 # # hsa-let-7b 28984 6909
dat = log(expr+1 ) pca.plot = draw_pca(dat,group_list) cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT" ] cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT" ] cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT" ] h1 = draw_heatmap(expr[cg1,],group_list) h2 = draw_heatmap(expr[cg2,],group_list) h3 = draw_heatmap(expr[cg3,],group_list)
v1 = draw_volcano(test = DESeq2_DEG[,c(2 ,5 ,7 )],pkg = 1 ) v2 = draw_volcano(test = edgeR_DEG[,c(1 ,4 ,6 )],pkg = 2 ) v3 = draw_volcano(test = limma_voom_DEG[,c(1 ,4 ,7 )],pkg = 3 ) ## 点评: 这个 拼图语法很棒,通俗易懂library (patchwork) (h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect' )
#(v1 + v2 + v3) +plot_layout(guides = 'collect') ggsave("heat_volcano.png" ,width = 21 ,height = 9 )
5.三大R包差异基因对比 # 三大R包差异基因交集 UP=function (df){ rownames(df)[df$change=="UP" ] } DOWN=function (df){ rownames(df)[df$change=="DOWN" ] } up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG)) down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG)) hp = draw_heatmap(expr[c(up,down),],group_list)#上调、下调基因分别画维恩图 up.plot <- venn(UP(DESeq2_DEG),UP(edgeR_DEG),UP(limma_voom_DEG), "UPgene" )
down.plot <- venn(DOWN(DESeq2_DEG),DOWN(edgeR_DEG),DOWN(limma_voom_DEG), "DOWNgene" )#维恩图拼图,终于搞定 library (cowplot)library (ggplotify) up.plot = as.ggplot(as_grob(up.plot)) down.plot = as.ggplot(as_grob(down.plot))library (patchwork)#up.plot + down.plot
# 就爱玩拼图 pca.plot + hp+up.plot +down.plot
ggsave("deg.png" ,height = 10 ,width = 10 )
三个R包差异分析结果的交集共有50个上调和51个下调, 可以作为最终结果提交。当然,这三个包没有对错之分,你拿其中任意一个包的分析结果都是对的。取交集的方法更可靠,但不是必须的,有些数据取交集会很可怜的。
本文出自小洁的 TCGA
学习记录,跟着 生信技能树B站课程 学的,已获得授权(嗯,真的)。 课程链接: https://www.bilibili.com/video/av49363776
历史目录: