【原】GEO实操|limma分析差异基因

医科研 2021-01-25

展开全文

欢迎来到医科研，这里是白介素2的读书笔记，跟我一起聊临床与科研的故事, 生物医学数据挖掘，R语言，TCGA、GEO数据挖掘。

---limma分析差异基因---

在经过了前两期中的数据下载，数据基本处理之后，解决了一个探针对应多个基因数的以及多个探针对应一个基因求平均值，在此基础上运用limma包分析差异基因；
除此以外，包括绘制火山图，热图，PCA等，都在本文中解决

数据载入

 1if(T){
 2Sys.setlocale('LC_ALL','C')
 3library(dplyr)
 4##
 5if(T){
 6load("expma.Rdata")
 7load("probe.Rdata")
 8}
 9expma[1:5,1:5]
10boxplot(expma)##看下表达情况
11metdata[1:5,1:5]
12head(probe)
13
14## 查看Gene Symbol是否有重复
15table(duplicated(probe$`Gene Symbol`))##12549 FALSE
16
17## 整合注释信息到表达矩阵
18ID<-rownames(expma)
19expma<-apply(expma,1,function(x){log2(x+1)})
20expma<-t(expma)
21eset<-expma[ID %in% probe$ID,] %>% cbind(probe)
22eset[1:5,1:5]
23colnames(eset)
24}

1## [1] "GSM188013"      "GSM188014"      "GSM188016"      "GSM188018"     
2## [5] "GSM188020"      "GSM188022"      "ID"             "Gene Symbol"   
3## [9] "ENTREZ_GENE_ID"

多个探针求平均值

 1test1<-aggregate(x=eset[,1:6],by=list(eset$`Gene Symbol`),FUN=mean,na.rm=T) 
 2test1[1:5,1:5]##与去重结果相吻合
 3##   Group.1 GSM188013 GSM188014 GSM188016 GSM188018
 4## 1          8.438846  8.368513  7.322442  7.813573
 5## 2    A1CF 10.979025 10.616926  9.940773 10.413311
 6## 3     A2M  6.565276  6.422112  8.142194  5.652593
 7## 4  A4GALT  7.728628  7.818966  8.679885  7.048563
 8## 5   A4GNT 10.243388 10.182382  9.391991  8.779887
 9dim(test1)##
10## [1] 12549     7
11colnames(test1)
12## [1] "Group.1"   "GSM188013" "GSM188014" "GSM188016" "GSM188018" "GSM188020"
13## [7] "GSM188022"
14rownames(test1)<-test1$Group.1
15test1<-test1[,-1]
16eset_dat<-test1

PCA plot

Principal Component Analysis (PCA)分析使用的是基于R语言的 prcomp() and princomp()函数.
完成PCA分析一般有两种方法：princomp()使用的是一种的spectral decomposition方法
prcomp() and PCA()[FactoMineR]使用的是SVD法

 1data<-eset_dat
 2data[1:5,1:5]##表达矩阵
 3##        GSM188013 GSM188014 GSM188016 GSM188018 GSM188020
 4##         8.438846  8.368513  7.322442  7.813573  7.244615
 5## A1CF   10.979025 10.616926  9.940773 10.413311  9.743305
 6## A2M     6.565276  6.422112  8.142194  5.652593  5.550033
 7## A4GALT  7.728628  7.818966  8.679885  7.048563  5.929258
 8## A4GNT  10.243388 10.182382  9.391991  8.779887  9.431585
 9metdata[1:5,1:5]
10##                                                              title
11## GSM188013   DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 1
12## GSM188014 Dioxin treated MCF7 breast cancer cells [HG-U133A] Exp 1
13## GSM188016   DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 2
14## GSM188018 Dioxin treated MCF7 breast cancer cells [HG-U133A] Exp 2
15## GSM188020   DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 3
16##           geo_accession                status submission_date
17## GSM188013     GSM188013 Public on May 12 2007     May 08 2007
18## GSM188014     GSM188014 Public on May 12 2007     May 08 2007
19## GSM188016     GSM188016 Public on May 12 2007     May 08 2007
20## GSM188018     GSM188018 Public on May 12 2007     May 08 2007
21## GSM188020     GSM188020 Public on May 12 2007     May 08 2007
22##           last_update_date
23## GSM188013      May 11 2007
24## GSM188014      May 11 2007
25## GSM188016      May 11 2007
26## GSM188018      May 11 2007
27## GSM188020      May 11 2007
28##构建group_list
29group_list<-rep(c("Treat","Control"),3)
30colnames(data)<-group_list
31library(factoextra)
32## Warning: package 'factoextra' was built under R version 3.5.3
33## Loading required package: ggplot2
34## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https:///13EFCZ
35## 计算PCA
36data<-t(data)##转换数据至行为sample,列为gene
37data<-as.data.frame(data)##注意数据要转换为数据框
38res.pca <- prcomp(data, scale = TRUE)
39##展示主成分对差异的贡献
40fviz_eig(res.pca)

1## 可视化结果
2fviz_pca_ind(res.pca,
3             col.ind = group_list, # 颜色对应group信息
4             palette = c("#00AFBB",  "#FC4E07"),
5             addEllipses = TRUE, # Concentration ellipses
6             ellipse.type = "confidence",
7             legend.title = "Group",## Legend名称
8             repel = TRUE
9             )

层次聚类图-聚类结果也与PCA相似，结果并不好

聚类分析的结果也同样可以进一步美化，但这里不做
计算距离时同样需进行转置，但在前一步PCA分析中的data已经经过转置，故未重复

1dd <- dist(data, method = "euclidean")##data是经过行列转换的
2hc <- hclust(dd, method = "ward.D2")
3plot(hc)

1##对结果进行美化
2# Convert hclust into a dendrogram and plot
3hcd <- as.dendrogram(hc)
4# Define nodePar
5nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 
6                cex = 0.7, col = "blue")
7# Customized plot; remove labels
8plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")

limma分析差异基因

limma包的具体用法参考 limma Users Guide
构建分组信息，构建好比较矩阵是关键
注意这里的表达矩阵信息 eset_dat是经过处理后的，为转置，行为gene，列为sample

 1library(limma)
 2library(dplyr)
 3group_list
 4## [1] "Treat"   "Control" "Treat"   "Control" "Treat"   "Control"
 5design <- model.matrix(~0+factor(group_list))
 6colnames(design)=levels(factor(group_list))
 7design
 8##   Control Treat
 9## 1       0     1
10## 2       1     0
11## 3       0     1
12## 4       1     0
13## 5       0     1
14## 6       1     0
15## attr(,"assign")
16## [1] 1 1
17## attr(,"contrasts")
18## attr(,"contrasts")$`factor(group_list)`
19## [1] "contr.treatment"
20## 比较信息
21contrast.matrix<-makeContrasts("Treat-Control",
22                               levels = design)
23contrast.matrix##查看比较矩阵的信息，这里我们设置的是Treat Vs. control
24##          Contrasts
25## Levels    Treat-Control
26##   Control            -1
27##   Treat               1
28## 拟合模型
29fit <- lmFit(eset_dat,design)
30fit2 <- contrasts.fit(fit, contrast.matrix) 
31fit2 <- eBayes(fit2) 
32DEG<-topTable(fit2, coef=1, n=Inf) %>% na.omit()  ## coef比较分组 n基因数
33head(DEG)
34##             logFC   AveExpr          t      P.Value adj.P.Val         B
35## ALDH3A1 -3.227263 10.302323 -10.710306 4.482850e-05 0.3134585 -4.048355
36## CYP1B1  -3.033684 13.287607 -10.505888 4.995753e-05 0.3134585 -4.049713
37## CYP1A1  -9.003353 11.481268  -8.371476 1.762905e-04 0.7374232 -4.069681
38## HHLA2   -1.550587  6.595658  -7.443431 3.337672e-04 0.9308066 -4.083411
39## SLC7A5  -2.470333 13.628775  -7.298868 3.708688e-04 0.9308066 -4.085966
40## TIPARP  -1.581274 12.764218  -7.024252 4.552834e-04 0.9522252 -4.091192
41dim(DEG)
42## [1] 12549     6
43save(DEG,file = "DEG_all.Rdata")

绘制火山图

火山图其实仅仅是一种可视化的方式，能够从整体上让我们对整体的差异分析情况有个了解
筛选到差异基因后，可以直接绘制出火山图
火山图的横坐标为logFC, 纵坐标为-log10(pvalue)，因此其实理论上讲plot即可完成火山图绘制

最简单原始的画法

1colnames(DEG)
2## [1] "logFC"     "AveExpr"   "t"         "P.Value"   "adj.P.Val" "B"
3plot(DEG$logFC,-log10(DEG$P.Value))

高级的画法

借助于有人开发的更高级的包，用于完成某些特殊的功能，或者更美观

 1require(EnhancedVolcano)
 2EnhancedVolcano(DEG,
 3
 4                lab = rownames(DEG),
 5
 6                x = "logFC",
 7
 8                y = "P.Value",
 9
10                selectLab = rownames(DEG)[1:5],
11
12                xlab = bquote(~Log[2]~ "fold change"),
13
14                ylab = bquote(~-Log[10]~italic(P)),
15
16                pCutoff = 0.05,## pvalue阈值
17
18                FCcutoff = 1,## FC cutoff
19
20                xlim = c(-5,5),
21
22                transcriptPointSize = 1.8,
23
24                transcriptLabSize = 5.0,
25
26                colAlpha = 1,
27
28                legend=c("NS","Log2 FC"," p-value",
29                         " p-value & Log2 FC"),
30
31                legendPosition = "bottom",
32
33                legendLabSize = 10,
34
35                legendIconSize = 3.0)

绘制热图

热图的使用比较频繁，得到差异基因后可以直接绘制热图
相对简单好用的要属pheatmap包了
管道中的常规提取需要加上特殊的占位符.

 1## 首先提取出想要画的数据
 2head(DEG)
 3## 提取FC前50
 4up_50<-DEG %>% as_tibble() %>% 
 5  mutate(genename=rownames(DEG)) %>% 
 6  dplyr::arrange(desc(logFC)) %>% 
 7  .$genename %>% .[1:50] ## 管道符中的提取
 8## FC低前50
 9down_50<-DEG %>% as_tibble() %>% 
10  mutate(genename=rownames(DEG)) %>% 
11  dplyr::arrange(logFC) %>% 
12  .$genename %>% .[1:50] ## 管道符中的提取
13index<-c(up_50,down_50)  
14
15## 开始绘图-最简单的图
16library(pheatmap)
17pheatmap(eset_dat[index,],show_colnames =F,show_rownames = F)

稍微调整细节

 1index_matrix<-t(scale(t(eset_dat[index,])))##归一化
 2index_matrix[index_matrix>1]=1
 3index_matrix[index_matrix<-1]=-1
 4head(index_matrix)
 5
 6## 添加注释
 7anno=data.frame(group=group_list)
 8rownames(anno)=colnames(index_matrix)
 9anno##注释信息的数据框
10pheatmap(index_matrix,
11           show_colnames =F,
12           show_rownames = F,
13           cluster_cols = T, 
14           annotation_col=anno)
15