一、背景了解 芯片数据:首选limma 。
二代测序(RNA_seq):如果是counts 可选择limma的voom算法或者edgeR或者DESeq2。 如果是FPKM或TPM可选择limma,注意: edgeR和DESeq2只能处理count注意: count做差异分析计算上下调,FPKM或TPM进行下游可视化
二. GEO 数据处理流程 0.需要安装各种包if (!require ("BiocManager" )) install.packages("BiocManager" ,update = F ,ask = F ) options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/" ) cran_packages <- c('tidyr' , 'tibble' , 'dplyr' , 'stringr' , 'ggplot2' , 'ggpubr' , 'factoextra' , 'FactoMineR' , 'devtools' , 'cowplot' , 'patchwork' , 'basetheme' , 'paletteer' , 'AnnoProbe' , 'ggthemes' , 'VennDiagram' , 'tinyarray' ) Biocductor_packages <- c('GEOquery' , 'hgu133plus2.db' , 'ggnewscale' , "limma" , "impute" , "GSEABase" , "GSVA" , "clusterProfiler" , "org.Hs.eg.db" , "preprocessCore" , "enrichplot" , "ggplotify" )for (pkg in cran_packages){ if (! require (pkg,character.only=T ) ) { install.packages(pkg,ask = F ,update = F ) require (pkg,character.only=T ) } }for (pkg in Biocductor_packages){ if (! require (pkg,character.only=T ) ) { BiocManager::install(pkg,ask = F ,update = F ) require (pkg,character.only=T ) } }#前面的所有提示和报错都先不要管。主要看这里 for (pkg in c(Biocductor_packages,cran_packages)){ require (pkg,character.only=T ) }#没有任何提示就是成功了,如果有warningxx包不存在,用library检查一下。 #library报错,就单独安装。
1. 下载数据#数据下载 rm(list = ls())library (GEOquery)#先去网页确定是否是表达芯片数据,不是的话不能用本流程。 gse_number = "GSE56649" eSet <- getGEO(gse_number, destdir = '.' , getGPL = F ) class(eSet) length(eSet) eSet = eSet[[1 ]]#(1)提取表达矩阵exp exp <- exprs(eSet) dim(exp) exp[1 :4 ,1 :4 ]#检查矩阵是否正常,如果是空的就会报错,空的和有负值的、有异常值的矩阵需要处理原始数据。 #如果表达矩阵为空,大多数是转录组数据,不能用这个流程(后面另讲)。 #自行判断是否需要log exp = log2(exp+1 ) boxplot(exp)# 将原始数据标准化(针对画出的图水平不一致) exp =normalizeBetweenArrays(exp) boxplot(exp)#(2)提取临床信息 pd <- pData(eSet)#(3)让exp列名与pd的行名顺序完全一致 p = identical(rownames(pd),colnames(exp));pif (!p) exp = exp[,match(rownames(pd),colnames(exp))]#(4)提取芯片平台编号 gpl_number <- eSet@annotation;gpl_number save(gse_number,pd,exp,gpl_number,file = "step1output.Rdata" )
2. 进行分组Q:如何进行分组?
A:三种方法:芯片中最常用的是str_detect()函数;转录组数据中最常用的是Group = c(rep(“RA”,times=13),rep(“control”,times=9))注意 :需要把Group转换成因子,并设置参考水平,指定levels,对照组在前,处理组在后
# Group(实验分组)和ids(探针注释) rm(list = ls()) load(file = "step1output.Rdata" )library (stringr)# 标准流程代码是二分组,多分组数据的分析后面另讲 # 生成Group向量的三种常规方法,三选一,选谁就把第几个逻辑值写成T,另外两个为F。如果三种办法都不适用,可以继续往后写else if if (F ){ # 1.Group---- # 第一种方法,有现成的可以用来分组的列 Group = pd$`disease state:ch1` }else if (F ){ # 第二种方法,自己生成 Group = c(rep("RA" ,times=13 ), rep("control" ,times=9 )) Group = rep(c("RA" ,"control" ),times = c(13 ,9 )) }else if (T ){ # 第三种方法,使用字符串出理的函数获取分组 Group=ifelse(str_detect(pd$source_name_ch1,"control" ), "control" , "RA" ) }# 需要把Group转换成因子,并设置参考水平,指定levels,对照组在前,处理组在后 Group = factor(Group,levels = c("control" ,"RA" )) Group#2.探针注释的获取----------------- #捷径 library (tinyarray) # 这个包为生信技能树老师所写 find_anno(gpl_number) ids <- AnnoProbe::idmap('GPL570' )#四种方法,方法1里找不到就从方法2找,以此类推。 #方法1 BioconductorR包(最常用) gpl_number #http://www./1399.html if (!require (hgu133plus2.db))BiocManager::install("hgu133plus2.db" )library (hgu133plus2.db) ls("package:hgu133plus2.db" ) ids <- toTable(hgu133plus2SYMBOL) head(ids)# 方法2 读取GPL网页的表格文件,按列取子集 ##https://www.ncbi.nlm./geo/query/acc.cgi?acc=GPL570 if (F ){ #注:表格读取参数、文件列名不统一,活学活用,有的表格里没有symbol列,也有的GPL平台没有提供注释表格 b = read.table("GPL570-55999.txt" ,header = T , quote = "\"" ,sep = "\t" ,check.names = F ) colnames(b) ids2 = b[,c("ID" ,"Gene Symbol" )] colnames(ids2) = c("probe_id" ,"symbol" ) ids2 = ids2[ids2$symbol!="" & !str_detect(ids2$symbol,"///" ),] }# 方法3 官网下载注释文件并读取 ##http://www./support/technical/byproduct.affx?product=hg-u133-plus # 方法4 自主注释 #https://mp.weixin.qq.com/s/mrtjpN8yDKUdCSvSUuUwcA save(exp,Group,ids,gse_number,file = "step2output.Rdata" )
3.PCA和热图Q:画PCA和热图需要使用什么样的数据,使用什么函数呢?
A:(1)PCA:加载FactoMineR和factoextra包,使用PCA()和 fviz_pca_ind()函数;数据:需要对exp矩阵进行t转换,将行名设置为样本名,列名设置为基因名,并转换成数据框的形式,此外,这里还需要输入分组信息。
(2)热图:加载pheatmap()包,数据一般使用log(count+1),这样画出来的图较显著。
rm(list = ls()) load(file = "step1output.Rdata" ) load(file = "step2output.Rdata" )#输入数据:exp和Group #Principal Component Analysis #http://www./english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials # 1.PCA 图---- dat=as.data.frame(t(exp))library (FactoMineR)library (factoextra) dat.pca <- PCA(dat, graph = FALSE ) pca_plot <- fviz_pca_ind(dat.pca, geom.ind = "point" , # show points only (nbut not "text") col.ind = Group, # color by groups palette = c("#00AFBB" , "#E7B800" ), addEllipses = TRUE , # Concentration ellipses legend.title = "Groups" ) pca_plot save(pca_plot,file = "pca_plot.Rdata" )# 2.top 1000 sd 热图---- cg=names(tail(sort(apply(exp,1 ,sd)),1000 )) n=exp[cg,]# 直接画热图,对比不鲜明 library (pheatmap) annotation_col=data.frame(group=Group) rownames(annotation_col)=colnames(n) pheatmap(n, show_colnames =F , show_rownames = F , annotation_col=annotation_col )# 按行标准化 pheatmap(n, show_colnames =F , show_rownames = F , annotation_col=annotation_col, scale = "row" , breaks = seq(-3 ,3 ,length.out = 100 ) ) dev.off()# 关于scale的进一步学习:zz.scale.R
按行标准化 关于scale的进一步学习上面的因为行名是基因,所以对行进行标准化,是为了让基因在不同的样本中进行标准化。
require (stats)# 1.示例数据 x <- matrix(sample(1 :30 ,30 ), ncol = 6 ) rownames(x) = paste0("gene" ,1 :5 ) colnames(x) = paste0("sample" ,1 :6 )# 2.标准化 scale(x) #函数只能按列标准化,但是我们需要按行 x y = t(scale(t(x)))# 3.标准化前后,某gene的表达量点图比较,大小趋势不变。 par(mfrow = c(2 ,2 )) plot(x[1 ,]) plot(y[1 ,]) plot(x[2 ,]) plot(y[2 ,])
zz.去重方式Q:为什么要去重,各种去重方法对结果有什么影响 A:去重是因为行名中不能有重复值,所以需要先将基因名去重,然后再将这个基因名作为行名。各种去重方法没有好坏的定论,一般都可以使用
rm(list = ls()) load("step2output.Rdata" )#保留最大值 exp2 = exp[ids$probe_id,] identical(ids$probe_id,rownames(exp2)) ids = ids[order(rowSums(exp2),decreasing = T ),] ids = ids[!duplicated(ids$symbol),];nrow(ids)# 拿这个ids去inner_join #求平均值 rm(list = ls()) load("step2output.Rdata" ) exp3 = exp[ids$probe_id,] rownames(exp3) = ids$symbol exp3[1 :4 ,1 :4 ] exp4 = limma::avereps(exp3)# 此时拿到的exp4已经是一个基因为行名的表达矩阵,直接差异分析,不再需要inner_join
4.GEG在这个部分才进行id转换,不过也可以提到热图之前,不过在求差异基因后,再进行ID转换,热图可以直接画上调和下调基因了
rm(list = ls()) load(file = "step2output.Rdata" )#差异分析,用limma包来做 #需要表达矩阵和Group,不需要改 library (limma) design=model.matrix(~Group) fit=lmFit(exp,design) fit=eBayes(fit) deg=topTable(fit,coef=2 ,number = Inf )#为deg数据框添加几列 #1.加probe_id列,把行名变成一列 library (dplyr) deg <- mutate(deg,probe_id=rownames(deg))#2.加上探针注释 ids = ids[!duplicated(ids$symbol),]#其他去重方式在zz.去重方式.R deg <- inner_join(deg,ids,by="probe_id" ) nrow(deg)#3.加change列,标记上下调基因 logFC_t=1 P.Value_t = 0.05 k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t) k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t) deg <- mutate(deg,change = ifelse(k1,"down" ,ifelse(k2,"up" ,"stable" ))) table(deg$change)#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join) library (clusterProfiler)library (org.Hs.eg.db) s2e <- bitr(deg$symbol, fromType = "SYMBOL" , toType = "ENTREZID" , OrgDb = org.Hs.eg.db)#人类 #其他物种http:///packages/release/BiocViews.html#___OrgDb deg <- inner_join(deg,s2e,by=c("symbol" ="SYMBOL" )) save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata" )
5.绘图Q1:画火山图需要什么数据 A1:需要差异分析后的数据,即DESeq2、edgeR、limma分析后的数据,需要使用logFC、P.Value。需要加载ggplot2包
Q2:如何画基因的相关性图? A2:需要加载corrplot包,然后筛选自己想要的基因和它在各组的表达量,M = cor(t(exp[g,])),具体看代码
Q3:如何拼图? A3:如果使用ggplot2画出来的图,可以加载patchwork包,如果是其他,可以使用plot_grid()函数,具体如下
rm(list = ls()) load(file = "step1output.Rdata" ) load(file = "step4output.Rdata" )#1.火山图---- library (dplyr)library (ggplot2) dat = deg[!duplicated(deg$symbol),] p <- ggplot(data = dat, aes(x = logFC, y = -log10(P.Value))) + geom_point(alpha=0.4 , size=3.5 , aes(color=change)) + ylab("-log10(Pvalue)" )+ scale_color_manual(values=c("blue" , "grey" ,"red" ))+ geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4 ,col="black" ,lwd=0.8 ) + geom_hline(yintercept = -log10(P.Value_t),lty=4 ,col="black" ,lwd=0.8 ) + theme_bw() p for_label <- dat%>% filter(symbol %in % c("HADHA" ,"LRRFIP1" )) volcano_plot <- p + geom_point(size = 3 , shape = 1 , data = for_label) + ggrepel::geom_label_repel( aes(label = symbol), data = for_label, color="black" ) volcano_plot#2.差异基因热图---- load(file = 'step2output.Rdata' )# 表达矩阵行名替换 exp = exp[dat$probe_id,] rownames(exp) = dat$symbolif (F ){ #全部差异基因 cg = dat$symbol[dat$change !="stable" ] length(cg) }else { #取前10上调和前10下调 library (dplyr) dat2 = dat %>% filter(change!="stable" ) %>% arrange(logFC) cg = c(head(dat2$symbol,10 ), tail(dat2$symbol,10 )) } n=exp[cg,] dim(n)#差异基因热图 library (pheatmap) annotation_col=data.frame(group=Group) rownames(annotation_col)=colnames(n) heatmap_plot <- pheatmap(n,show_colnames =F , scale = "row" , #cluster_cols = F, annotation_col=annotation_col, breaks = seq(-3 ,3 ,length.out = 100 ) ) heatmap_plot#拼图 library (patchwork)library (ggplotify) volcano_plot +as.ggplot(heatmap_plot)# f <- volcano_plot +as.ggplot(heatmap_plot) # ggsave(f,filename = "./pin.pdf",width = 15,height = 15) # 3.感兴趣基因的相关性---- library (corrplot) g = deg$symbol[1 :10 ] # 换成自己感兴趣的基因 g M = cor(t(exp[g,])) pheatmap(M)library (paletteer) my_color = rev(paletteer_d("RColorBrewer::RdYlBu" )) my_color = colorRampPalette(my_color)(10 ) corrplot(M, type="upper" , method="pie" , order="hclust" , col=my_color, tl.col="black" , tl.srt=45 )library (cowplot) cor_plot <- recordPlot() # 拼图 load("pca_plot.Rdata" ) plot_grid(pca_plot,cor_plot, volcano_plot,heatmap_plot$gtable) dev.off()#保存 pdf("deg.pdf" ) plot_grid(pca_plot,cor_plot, volcano_plot,heatmap_plot$gtable) dev.off()
火山图 在火山图的基础上,找到特定的基因 差异基因热图 将火山图和热图拼起来 感兴趣基因的相关性 进一步拼图 6.富集分析(可看RNA_Seq和多个芯片分析,那里画的图好看)#富集分析所有图表默认都是用p.adjust,富集不到可以退而求其次用p值,在文中说明即可
Q:富集分析需要使用到什么数据
A:(1)如果仅仅是画条带图和气泡图,那么只使用基因的ENTREZID值即可;(2)如果需要画弦图(Goplot包),需要Term,logFC,symbol
(3)画Heatmap-like functional classification,Gene-Concept Network,需要geneList 和ego。
(4)画。
rm(list = ls()) load(file = 'step4output.Rdata' )library (clusterProfiler)library (ggthemes)library (org.Hs.eg.db)library (dplyr)library (ggplot2)library (stringr)library (enrichplot)# 1.GO 富集分析---- #(1)输入数据 gene_up = deg$ENTREZID[deg$change == 'up' ] gene_down = deg$ENTREZID[deg$change == 'down' ] gene_diff = c(gene_up,gene_down)#(2)富集 #以下步骤耗时很长,设置了存在即跳过 if (!file.exists(paste0(gse_number,"_GO.Rdata" ))){ ego <- enrichGO(gene = gene_diff, OrgDb= org.Hs.eg.db, ont = "ALL" , readable = TRUE ) ego_BP <- enrichGO(gene = gene_diff, OrgDb= org.Hs.eg.db, ont = "BP" , readable = TRUE ) #ont参数:One of "BP", "MF", and "CC" subontologies, or "ALL" for all three. save(ego,ego_BP,file = paste0(gse_number,"_GO.Rdata" )) } load(paste0(gse_number,"_GO.Rdata" )) ## 这个很有用 #(3)可视化 #条带图 barplot(ego)#气泡图 dotplot(ego) dotplot(ego, split = "ONTOLOGY" , font.size = 10 , showCategory = 5 ) + facet_grid(ONTOLOGY ~ ., space = "free_y" ,scales = "free_y" ) + scale_y_discrete(labels = function (x) str_wrap(x, width = 45 ))#geneList 用于设置下面图的颜色 geneList = deg$logFC names(geneList)=deg$ENTREZID#(3)展示top通路的共同基因,要放大看。 #Gene-Concept Network cnetplot(ego,categorySize="pvalue" , foldChange=geneList,colorEdge = TRUE ) cnetplot(ego, showCategory = 3 ,foldChange=geneList, circular = TRUE , colorEdge = TRUE )#Enrichment Map,这个函数最近更新过,版本不同代码会不同 Biobase::package.version("enrichplot" )if (T ){ emapplot(pairwise_termsim(ego)) #新版本 }else { emapplot(ego)#旧版本 }#(4)展示通路关系 https://zhuanlan.zhihu.com/p/99789859 #goplot(ego) goplot(ego_BP)#(5)Heatmap-like functional classification heatplot(ego,foldChange = geneList,showCategory = 8 )# 2.KEGG pathway analysis---- #上调、下调、差异、所有基因 #(1)输入数据 gene_up = deg[deg$change == 'up' ,'ENTREZID' ] gene_down = deg[deg$change == 'down' ,'ENTREZID' ] gene_diff = c(gene_up,gene_down)#(2)对上调/下调/所有差异基因进行富集分析 if (!file.exists(paste0(gse_number,"_KEGG.Rdata" ))){ kk.up <- enrichKEGG(gene = gene_up, organism = 'hsa' ) kk.down <- enrichKEGG(gene = gene_down, organism = 'hsa' ) kk.diff <- enrichKEGG(gene = gene_diff, organism = 'hsa' ) save(kk.diff,kk.down,kk.up,file = paste0(gse_number,"_KEGG.Rdata" )) } load(paste0(gse_number,"_KEGG.Rdata" ))#(3)看看富集到了吗?https://mp.weixin.qq.com/s/NglawJgVgrMJ0QfD-YRBQg table(kk.diff@result$p.adjust<0.05 ) table(kk.up@result$p.adjust<0.05 ) table(kk.down@result$p.adjust<0.05 )#(4)双向图 # 富集分析所有图表默认都是用p.adjust,富集不到可以退而求其次用p值,在文中说明即可 down_kegg <- kk.down@result %>% filter(pvalue<0.05 ) %>% #筛选行 mutate(group=-1 ) #新增列 up_kegg <- kk.up@result %>% filter(pvalue<0.05 ) %>% mutate(group=1 )source ("kegg_plot_function.R" ) g_kegg <- kegg_plot(up_kegg,down_kegg) g_kegg#g_kegg +scale_y_continuous(labels = c(2,0,2,4,6)) # 3.GSEA作kegg和GO富集分析---- #https://www./xiaojiewanglezenmofenshen/dbwkg1/ytawgg #(1)查看示例数据 data(geneList, package="DOSE" )#(2)将我们的数据转换成示例数据的格式 geneList=deg$logFC names(geneList)=deg$ENTREZID geneList=sort(geneList,decreasing = T )#(3)GSEA富集 kk_gse <- gseKEGG(geneList = geneList, organism = 'hsa' , verbose = FALSE ) down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0 ,];down_kegg$group=-1 up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0 ,];up_kegg$group=1 #(4)可视化 g2 = kegg_plot(up_kegg,down_kegg) g2# 4.能看懂的资料越来越多---- # GSEA学习更多:https://www./docs/share/a67a180f-dd2b-4f6f-96c2-68a4b86fe862?# # 富集分析学习更多:http:///clusterProfiler-book/index.html # 弦图:https://www./xiaojiewanglezenmofenshen/dbwkg1/dgs65p # GOplot:https://mp.weixin.qq.com/s/LonwdDhDn8iFUfxqSJ2Wew # 网上的资料和宝藏无穷无尽,学好R语言慢慢发掘~ # 弦图 ego1 <- data.frame(ego_BP) colnames(ego1) ego1 <- ego1[1 :10 ,c(1 ,2 ,8 ,6 )] ego1$geneID <- str_replace_all(ego1$geneID,"/" ,"," ) names(ego1)=c("ID" ,"Term" ,"Genes" ,"adj_pval" ) ego1$Category = "BP" head(ego1) genes = data.frame(ID=deg$symbol, logFC=deg$logFC) head(genes)# 开始画弦图 library (GOplot) circ <- circle_dat(ego1,genes) chord <- chord_dat(data=circ, genes=genes,process = ego1$Term) # f2 <- GOChord(chord, space = 0.01 , gene.order = 'logFC' , gene.space = 0.15 , gene.size = 3 ) ggsave(f2,filename = "./xiantu.pdf" ,width = 17 ,height = 18 )
kegg_plot_function.R里面的代码
### --------------- ### ### Create: Jianming Zeng ### Date: 2018-07-09 20:11:07 ### Email: jmzeng1314@163.com ### Blog: http://www./ ### Forum: http://www./thread-1376-1-1.html ### CAFS/SUSTC/Eli Lilly/University of Macau ### Update Log: 2018-07-09 First version ### ### Update: Xiaojie Liang ### Date: 2021-07-12 23:38:07 ### Email: liangxiaojiecom@163.com ### --------------- library (ggthemes)library (ggplot2) kegg_plot <- function (up.data,down.data){ dat=rbind(up.data,down.data) colnames(dat) dat$pvalue = -log10(dat$pvalue) dat$pvalue=dat$pvalue*dat$group dat=dat[order(dat$pvalue,decreasing = F ),] gk_plot <- ggplot(dat,aes(reorder(Description, pvalue), y=pvalue)) + geom_bar(aes(fill=factor((pvalue>0 )+1 )),stat="identity" , width=0.7 , position=position_dodge(0.7 )) + coord_flip() + scale_fill_manual(values=c("#0072B2" , "#B20072" ), guide="none" ) + labs(x="" , y="" ) + theme_pander() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), #axis.ticks.x = element_blank(), axis.line.x = element_line(size = 0.3 , colour = "black" ),#x轴连线 axis.ticks.length.x = unit(-0.20 , "cm" ),#修改x轴刻度的高度,负号表示向上 axis.text.x = element_text(margin = margin(t = 0.3 , unit = "cm" )),##线与数字不要重叠 axis.ticks.x = element_line(colour = "black" ,size = 0.3 ) ,#修改x轴刻度的线 axis.ticks.y = element_blank(), axis.text.y = element_text(hjust=0 ), panel.background = element_rect(fill=NULL , colour = 'white' ) ) }
条带图 点图
Gene-Concept Network 展示通路关系Heatmap-like functional classification
KEGG富集分析(针对下调和上调基因)
GSEA画出的图 弦图 7.STRING数据的提前准备rm(list = ls()) load("step4output.Rdata" ) gene_up= deg[deg$change == 'up' ,'symbol' ] gene_down=deg[deg$change == 'down' ,'symbol' ] gene_diff = c(gene_up,gene_down)# 1.制作string的输入数据 write.table(gene_diff, file="diffgene.txt" , row.names = F , col.names = F , quote = F )# 从string网页获得string_interactions.tsv # 2.准备cytoscape的输入文件 p = deg[deg$change != "stable" , c("symbol" ,"logFC" )] head(p) write.table(p, file = "deg.txt" , sep = "\t" , quote = F , row.names = F )# string_interactions.tsv是网络文件 # deg.txt是属性表格