【原】9步骤完成单细胞数据挖掘文章全部图表复现

健明 2023-08-04 发布于广东

展开全文

两个月前我们更新了优秀的马拉松学员笔记：单细胞+芯片+转录组测序的数据挖掘文章一比一复现，内容非常详实，理论上该流程可以复用到然后一个癌症或者其它疾病，大纲也很清晰（单细胞数据分析本身往往是数据挖掘课题的一个环节而已），接下来继续更新另外一个癌症的复现，大纲不一样哦，大家一定要抽空刷完这波：

文章图表复现： Integration of scRNA-Seq and Bulk RNA-Seq to Analyse the Heterogeneity of Ovarian Cancer Immune Cells and Establish a Molecular Risk Model

文章大概思路如下：

对ssRNA数据降维、分群、注释
用GSVA对鉴定到的T细胞，B细胞，髓系细胞继续细分，发现了两种关键的细胞类型
对TCGA数据进行免疫浸润分析，发现其中一种细胞的比例与生存密切相关
用WGCNA分析找到与该细胞相关的hub gene
下载imvigor 210队列数据，用SVM模型预测免疫表型
用NMF把TCGA样本分为两个亚组，该细胞在这两个亚组的比例有显著差异
批量单因素cox回归找出hub gene中与生存相关的gene
lasso回归进一步筛选gene
多因素cox回归，计算riskScore，riskScore与生存信息的相关性

具体复现流程

1. 细胞分群注释

首先下载数据，文章中用了两个scRNA数据集，GSE154600和GES158937

普通点击下载太慢了，点进NCBI的FTP站点，在linux上用axel下载会快很多

网页显示

把数据整理成下图标准格式就可以读取了

数据标准格式

library(Seurat)
library(stringr)
library(clustree)
library(dplyr)
library(GSVA)
library(msigdbr)
library(GSEABase)
library(pheatmap)
library(harmony)
rm(list = ls())
gc()

# 1. 循环读取每个样本的数据
filename <- paste('processed_data/',list.files('processed_data/'),sep = '')
sceList <- lapply(filename, function(x){
  obj <- CreateSeuratObject(counts = Read10X(x),
                            project = str_split(x,'/')[[1]][2])
})
names(sceList) <- list.files('processed_data/')
sce <- merge(sceList[[1]],sceList[-1],add.cell.ids = names(sceList),project='OC')
dim(sce) # 38224个gene， 67323个cell

简单过滤后，看下PCA，有明显批次效应

# 2. 查看线粒体(MT开头)、核糖体(RPS/RPL开头)基因占比
grep('^MT',x=rownames(sce@assays$RNA@data),value = T)
grep('^RP[SL]',x=rownames(sce@assays$RNA@data),value = T)

sce <- PercentageFeatureSet(sce,pattern = '^MT',col.name = 'percent.MT')
sce <- PercentageFeatureSet(sce,pattern = '^RP[SL]',col.name = 'percent.RP')
VlnPlot(sce,features = 'percent.MT',pt.size = 0)
VlnPlot(sce,features = 'percent.RP',pt.size = 0)
VlnPlot(sce,features = 'nCount_RNA',pt.size = 0)
VlnPlot(sce,features = 'nFeature_RNA',pt.size = 0)
# 3. 过滤细胞
sce <- subset(sce,subset = nCount_RNA>3000 & nFeature_RNA>300 & percent.MT<40)
dim(sce) # 38224个gene， 29389个cell
# 4. 过滤gene
sce <- sce[rowSums(sce@assays$RNA@counts>0)>3,]
dim(sce) # 28074个gene， 29389个cell

# 5. 看下批次效应
sce <- NormalizeData(sce) # 默认用logNormalize
sce <- FindVariableFeatures(sce) # 默认选Top2000作为高变gene
sce <- ScaleData(sce) # 默认用variableFeature进行scale
sce <- RunPCA(sce) # 默认用variableFeature进行PCA降维，降维前一定要做scale
DimPlot(sce,reduction = 'pca',group.by = 'orig.ident')

原文中用SCTransform进行数据标准化，那再用SCTransform做一遍

运行完了后，在assay下面，除了RNA外还多了一个SCT，然后默认的assay也变成了'SCT’

再看看PCA结果，批次效应貌似也好了些

# 6. SCTransform进行标准化
sce <- SCTransform(sce) # 这一步包含了NormalizeData 、ScaleData、FindVariableFeatures三步
DefaultAssay(sce)
sce <- RunPCA(sce)
DimPlot(sce,reduction = 'pca',group.by = 'orig.ident')
ElbowPlot(sce,ndims = 40)

这里选择resolution为0.4，20个cluster，跟文章中差不多

从样本进行区分的umap图上来看，貌似还是有批次效应，其实可以再SCTransform之前再增加一步去批次效应的步骤，这里就略过了

# 7. 在PCA基础上进行umap降维,然后分群
sce <- RunUMAP(sce,reduction = 'pca',dims = 1:40)
sce <- FindNeighbors(sce,dims = 1:40)

sce_res <- sce
for (i in c(0.01, 0.05, 0.1, 0.15, 0.2, 0.3,0.4, 0.5,0.8,1)){
  sce_res <- FindClusters(sce_res,resolution = i)
}
clustree(sce_res,prefix = 'SCT_snn_res.')

sce <- FindClusters(sce,resolution = 0.4)
DimPlot(sce,reduction = 'umap',group.by = 'seurat_clusters',label = T)

细胞注释完了后，把每个细胞的注释结果添加到metadata里

# 8. 手动进行细胞注释

# 免疫细胞  5,6,8,9,15,20 
DotPlot(sce,features = c('PTPRC','CD45'))
# T细胞   0,[8],9,13
DotPlot(sce,features = c('CD3D','CD3E','CD8A')) 
# B细胞   14,16,18,[20]
DotPlot(sce,features = c('CD79A', 'CD37', 'CD19', 'CD79B', 'MS4A1','CD20'))
# 浆细胞 [14,16,18]
DotPlot(sce,features = c('IGHG1','MZB1','SDC1','CD79A'))
# NK细胞  5,[9,15]
DotPlot(sce,features = c('FGFBP2','FCG3RA','CX3CR1'))
# NK细胞  8,15
DotPlot(sce,features = c('CD160','NKG7','GNLY','CD247','CCL3','GZMB','CXCR1','TYOB','PRF1'))
# 内皮细胞 [11]
DotPlot(sce,features = c('PECAM1','VWF'))
# 成纤维细胞 [0,7],10,11
DotPlot(sce,features = c('FGF7','MME','DCN','LUM','GSN','PF4','PPBP'))
# 上皮细胞 1,2,[3],4,[12,13,17,19]
DotPlot(sce,features = c('EPCAM','KRT19','PROM1','ALDH1A1','CD24'))
# 单核细胞和巨噬细胞 5,6,9,15
DotPlot(sce,features = c('CD68','CD163','CD14','LYZ'))
# Ovarian somatic cell  [13]
DotPlot(sce,features = c('LGR5'))

marker <- data.frame(cluster = 0:20,cell = 0:20)
marker[marker$cluster %in% c(5,6,8,9,15,20),2] <- 'Immune cells'
marker[marker$cluster %in% c(0,8,9,13),2] <- 'T cells'
marker[marker$cluster %in% c(14,16,18,20),2] <- 'B cells'
marker[marker$cluster %in% c(14,16,18),2] <- 'plasma cells'
marker[marker$cluster %in% c(5,9,15),2] <- 'NK cells'
marker[marker$cluster %in% c(11),2] <- 'Endothelial cells'
marker[marker$cluster %in% c(5,6),2] <- 'myeloid cells'
marker[marker$cluster %in% c(1,2,3,4,12,13,17,19),2] <- 'epithelial cell'
marker[marker$cluster %in% c(0,7,10),2] <- 'fibroblast'
marker[marker$cluster %in% c(13),2] <- 'Ovarian somatic cell'

sce@meta.data$cell_type <- sapply(sce@meta.data$seurat_clusters,function(x){marker[x,2]})
DimPlot(sce,reduction = 'umap',group.by = 'cell_type',label = T)

# 9. marker gene 热图展示
heatmap_marker_gene <- c('CD3D','CD3E','CD8A',
                         'CD79A', 'CD19', 'CD79B', 'MS4A1','CD20',
                         'MZB1','SDC1','CD79A',
                         'CD68','CD163','CD14','LYZ')
heatmap_cells <- rownames(sce@meta.data[sce@meta.data$cell_type %in% c("myeloid cells","B cells","T cells"),])
sub_sce <- subset(sce,cells = heatmap_cells)
DoHeatmap(sub_sce,features = heatmap_marker_gene,group.by = 'cell_type',size = 3)

2.用GSVA对鉴定到的T细胞，B细胞，髓系细胞继续细分，发现了两种关键的细胞类型

# 10. GSVA分析

Hallmark_gene_set <- msigdbr(species = 'Homo sapiens',category = 'H')
Hallmark_list <- split(Hallmark_gene_set$gene_symbol,Hallmark_gene_set$gs_name)

gsva_res <- gsva(expr = sce@assays$SCT@scale.data,
                 gset.idx.list = Hallmark_list,
                 method = 'ssgsea',
                 kcdf='Poisson',
                 parallel.sz = parallel::detectCores())
pheatmap(gsva_res,fontsize_row = 4,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,
         show_rownames = T)

文中作者鉴定出T细胞，B细胞和髓系细胞后，用GSVA再进行细分，采用的基因集就是msigdb的Hallmark gene set。

这里读入的hallmark_gene_set是一个8209*15的data frame，找出它的gene symbol和gene set name这两列，用split函数把它转换成一个列表，这样hallmark_list是一个长度为length(unique(Hallmark_gene_set$gs_name))的list，其中hallmark_list的每一项都是某一gs_name对应的所有gene_symbol，只需要把表达矩阵和gene list传递给gsva函数即可进行gsva分析

但是简单看了下hallmark gene set的名字，根据这些gene set似乎没法对T细胞和B细胞再进行细分，不太清楚作者是怎么做的

于是发现了另一个基因集C8

一共700个基因集，看下和T细胞，B细胞相关的基因集

并不是很多，和OV相关的也没有，感觉也不太合适，于是决定上cellmarker自己下载基因集

cell_type_gene_set <- msigdbr(species = 'Homo sapiens',category = 'C8')
cell_type_list <- split(cell_type_gene_set$gene_symbol,cell_type_gene_set$gs_name)
names(cell_type_list)

搜出来直接点击就可以下载了

t_cell_marker <- read.table('CellMarker_T_cell.csv',sep = ',',header = 1)
b_cell_marker <- read.table('CellMarker_B_cell.csv',sep = ',',header = 1)
macrophage_marker <- read.table('CellMarker_Macrophage.csv',sep = ',',header = 1)

读进来是这样的一个表格，species中除了人还有小鼠的，就根据cell type和cell marker这两列去构建gene list，只要保证最后构建出来的gene list的每一项是一个包含gene symbol的vector，然后每一项的名字是对应的细胞类型就可以了，我写的比较复杂，大家可以自己发挥

需要注意的有几点：

cellmarker这一列里，有多个gene的话，多个gene是一个字符串，需要自己拆开，添加到一个vector里
CD4+ T cell可能包含Activated CD4+ T cell和Effector CD4+ T cell，可以把它们对应的gene都合并成一个vector

t_cell_marker <- t_cell_marker[t_cell_marker$Species=='Human',]
b_cell_marker <- b_cell_marker[b_cell_marker$Species=='Human',]
macrophage_marker <- macrophage_marker[macrophage_marker$Species=='Human',]

# T cell 挑了4种进行热图展示
cytotoxic <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'cytotoxic')]
naive <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'naive')]
regulatory <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'regulatory')]
exhausted <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'exhausted')]
t_cell_df <- t_cell_marker[t_cell_marker$Cell.Type %in% c(cytotoxic,naive,regulatory,exhausted),]
t_cell_df[t_cell_df$Cell.Type %in% naive,"Cell.Type"] <- 'naive'
t_cell_df[t_cell_df$Cell.Type %in% cytotoxic,"Cell.Type"] <- 'cytotoxic'
t_cell_df[t_cell_df$Cell.Type %in% regulatory,"Cell.Type"] <- 'regulatory'
t_cell_df[t_cell_df$Cell.Type %in% exhausted,"Cell.Type"] <- 'exhausted'
t_cell_list <- split(t_cell_df$Cell.Marker,t_cell_df$Cell.Type)

# B cell 挑了5种进行热图展示
unique(b_cell_marker$Cell.Type)
b_plasma <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'plasma')]
b_memory <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'memory')]
b_naive <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'naive')]
b_activated <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'activated')]
b_transitional <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'transitional')]

b_cell_df <- b_cell_marker[b_cell_marker$Cell.Type %in% c(b_plasma,b_memory,b_naive,b_activated,b_transitional),]
b_cell_df[b_cell_df$Cell.Type %in% b_plasma,"Cell.Type"] <- 'plasma'
b_cell_df[b_cell_df$Cell.Type %in% b_memory,"Cell.Type"] <- 'memory'
b_cell_df[b_cell_df$Cell.Type %in% b_naive,"Cell.Type"] <- 'naive'
b_cell_df[b_cell_df$Cell.Type %in% b_activated,"Cell.Type"] <- 'activated'
b_cell_df[b_cell_df$Cell.Type %in% b_transitional,"Cell.Type"] <- 'transitional'
b_cell_list <- split(b_cell_df$Cell.Marker,b_cell_df$Cell.Type)

# Macrophage cell
unique(macrophage_marker$Cell.Type)
M1 <- unique(macrophage_marker$Cell.Type)[str_detect(str_to_lower(unique(macrophage_marker$Cell.Type)),'m1')]
M2 <- unique(macrophage_marker$Cell.Type)[str_detect(str_to_lower(unique(macrophage_marker$Cell.Type)),'m2')]
macrophage_df <- macrophage_marker[macrophage_marker$Cell.Type %in% c(M1,M2),]
macrophage_list <- split(macrophage_df$Cell.Marker,macrophage_df$Cell.Type)

# 到这一步，gene list里的每一项，是一个字符串（gene symbol组成的）组成的vector，我写个函数，把每个字符串里的gene symbol拆出来，
# 最后组合成一个由gene symbol组成的vector，然后去个重
myfun <- function(x){
  res <- c()
  for (i in x){
    temp <- unlist(str_split(gsub(' ','',i),','))
    res <- c(res,temp)
  }
  unique(res)
}

t_cell_list <- lapply(t_cell_list, function(x){myfun(x)})
b_cell_list <- lapply(b_cell_list, function(x){myfun(x)})
macrophage_list <- lapply(macrophage_list, function(x){myfun(x)})

# 找到三种细胞分别对应的表达矩阵
t_cell <- colnames(sce[,sce@meta.data$cell_type=='T cells'])
b_cell <- colnames(sce[,sce@meta.data$cell_type=='B cells'])
macrophage_cell <- colnames(sce[,sce@meta.data$cell_type=='myeloid cells'])

t_cell_exp <- sce@assays$SCT@scale.data[,t_cell]
b_cell_exp <- sce@assays$SCT@scale.data[,b_cell]
macrophage_cell_exp <- sce@assays$SCT@scale.data[,macrophage_cell]

t_cell_gsva_res <- gsva(expr = t_cell_exp,
                           gset.idx.list = t_cell_list,
                           method = 'ssgsea',
                           kcdf='Poisson',
                           parallel.sz = parallel::detectCores())
pheatmap(t_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

b_cell_gsva_res <- gsva(expr = b_cell_exp,
                        gset.idx.list = b_cell_list,
                        method = 'ssgsea',
                        kcdf='Poisson',
                        parallel.sz = parallel::detectCores())

pheatmap(b_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

macrophage_cell_gsva_res <- gsva(expr = macrophage_cell_exp,
                        gset.idx.list = macrophage_list,
                        method = 'ssgsea',
                        kcdf='Poisson',
                        parallel.sz = parallel::detectCores())

pheatmap(macrophage_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

3. 对TCGA数据进行免疫浸润分析，发现其中一种细胞的比例与生存密切相关

首先下载TCGA数据

在Xena的datasets找到OV相关的数据集，点进去后下载了FPKM数据

把行名从emsembl改成symbol之后，就可以分析了

#加载需要的R包
library(TCGAbiolinks)
library(WGCNA)
library(stringr)
library(SummarizedExperiment)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tibble)
library(CIBERSORT)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
library(readxl)
library(survival)
library(survminer)
library(WGCNA)
library(data.table)
library(dplyr)
library(IMvigor210CoreBiologies)
library(e1071)
library(pROC)
library(NMF)
library(pheatmap)
library(clusterProfiler)
library(ggsignif)
library(limma)
library(org.Hs.eg.db)
library(glmnet)
library(tibble)
library(ggrisk)

rm(list = ls())
gc()exp <- fread('TCGA-OV.htseq_fpkm.tsv',header = T)
exp <- column_to_rownames(exp,var = 'Ensembl_ID')

# 行名转换
exp <- as.data.frame(exp)
exp$ensembl_id <- str_split(rownames(exp),'\\.',simplify = T)[,1]
id_df <- bitr(exp$ensembl_id,fromType = 'ENSEMBL',toType = 'SYMBOL',OrgDb = org.Hs.eg.db)
id_df <- id_df[!duplicated(id_df$ENSEMBL),]
index <- match(exp$ensembl_id,id_df$ENSEMBL)
# 对symbol id这一列去重去空，设为行名
exp$symbol_id <- id_df[index,'SYMBOL']
exp <- exp[!is.na(exp$symbol_id),]
exp <- exp[!duplicated(exp$symbol_id),]
rownames(exp) <- exp$symbol_id
exp <- exp[,1:(ncol(exp)-2)]

save(exp,file = 'TCGA_OV.Rdata')

免疫浸润

# 免疫浸润
load('TCGA_OV.Rdata')
lm22f <- system.file('extdata','LM22.txt',package = 'CIBERSORT')
TME.result <- cibersort(lm22f,as.matrix(exp),perm = 100,QN=T)
TME.result <- as.data.frame(TME.result)
save(result,file = 'TME_result.Rdata')

group <- ifelse(as.numeric(substr(colnames(exp),14,15))<10,'tumor','normal')
patient_exp <- exp[,which(group=='tumor')]
result <- TME.result[,-c(23:25)] %>% as.data.frame() %>% 
  rownames_to_column('Sample') %>% gather(key = cell_type,value = proportion,-Sample)

my_pallet <- colorRampPalette(brewer.pal(8,'Set1'))
ggplot(result,aes(Sample,proportion,fill=cell_type))+geom_bar(stat = 'identity')+
  scale_fill_manual(values = my_pallet(22))+
  labs(x='',y = "Estiamted Proportion",fill='cell_type')+theme_bw()+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom")

cibersort计算了每种免疫细胞在每个样本中的丰度值，接下来要看看M1和M2这两种细胞对生存的影响

首先下载临床信息数据，从里面找到OV相关的临床信息即可

文中作者应该是分析了M1和M2两种细胞，然后发现只有M1与生存信息显著相关

# KM-plot
load('TCGA_OV.Rdata')
load('TME_result.Rdata')
# 准备生存数据，包含OS,OS.time,DFI,DFI.time这些信息
# https://gdc./about-data/publications/pancanatlas
sur_data <- read_xlsx('TCGA-CDR-SupplementalTableS1.xlsx',sheet = 1)
sur_data <- column_to_rownames(sur_data,var = '...1')
sur_data <- sur_data[sur_data$type=='OV',c('bcr_patient_barcode','OS','OS.time','DFI','DFI.time')]
rownames(sur_data) <- sur_data$bcr_patient_barcode
save(sur_data,file = 'survival_data.Rdata')
# 建立sample和patient对应关系，把cibersort结果中Macrophages M1在每个样本中的信息转移过来
sample_patient_df <- data.frame(sample=colnames(exp),patient=substr(colnames(exp),1,12))
sample_patient_df$TAM_M1 <- TME.result$`Macrophages M1`
sample_patient_df$TAM_M1_type <- ifelse(sample_patient_df$TAM_M1>median(sample_patient_df$TAM_M1),'high','low')
sample_patient_df <- sample_patient_df[!duplicated(sample_patient_df$patient),]
# 下载的生存数据的临床信息中patient id 与 exp中patient id 取交集
sur_data <- sur_data[intersect(sample_patient_df$patient,sur_data$bcr_patient_barcode),]
# 把sample_patient_df中Macrophages M1的信息对应到sur_data中每个patient id上
index <- match(sur_data$bcr_patient_barcode,sample_patient_df$patient)
sur_data$TAM_M1 <- sample_patient_df[index,'TAM_M1_type']

M1_OS <- survfit(Surv(OS.time,OS)~TAM_M1,data = sur_data)
ggsurvplot(M1_OS,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Overall survival',xlab='Days')
M1_DFI <- survfit(Surv(DFI.time,DFI)~TAM_M1,data = sur_data)
ggsurvplot(M1_DFI,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Disease Free Interval',xlab='Days')
save(sur_data,file = 'sur_data.Rdata')

生存信息中的行名是patient这一列，TCGA表达矩阵exp中列名是sample这一列，这两列不是一一对应的，一个patient可能对应多个sample。可以考虑直接去重或者手动删除一个病人的多个样本，这样一个病人只对应一个样本，同样也只有一个TAM_M1的丰度信息，就可以进行后续分析了

这里我根据TAM_M1的中位数，把样本分为了M1高表达和低表达

最后的sur_data长这样

4. 用WGCNA分析找到与该细胞相关的hub gene

# WGCNA
load('TCGA_OV.Rdata')
load('TME_result.Rdata')
load('sur_data.Rdata')
exp <- as.data.frame(exp)

# 过滤异常值
exp <- exp[rowMads(as.matrix(exp))>0.01,]
dim(exp)

# 查找并删除异常值
temp <- exp
colnames(temp) <- 1:ncol(exp)
plot(hclust(dist(t(temp)))) # 无异常样本

# 构建基因共表达网络
exp <- as.data.frame(t(exp))

# 选择合适的power
sft <- pickSoftThreshold(exp,networkType = 'signed')

a <- ggplot(sft$fitIndices,aes(x=Power,y=-sign(slope)*SFT.R.sq))+
  geom_text(label=sft$fitIndices$Power,color='red')+
  geom_hline(yintercept = 0.9,color='red')+
  xlab("Soft Threshold (power)")+ylab("Scale Free Topology Model Fit,signed R^2")+
  ggtitle("Scale independence")+theme(plot.title = element_text(hjust = 0.5))

b <- ggplot(sft$fitIndices,aes(x=Power,y=mean.k.))+
  geom_text(label=sft$fitIndices$Power,color='red')+
  xlab("Soft Threshold (power)")+ylab("Mean Connectivity")+
  ggtitle('Mean connectivity')+theme(plot.title = element_text(hjust = 0.5))

a+b
power <- sft$powerEstimate
power <- 5

# 检验是否符合无尺度网络
k <- softConnectivity(exp,power = power)
scaleFreePlot(k)

# 构建邻接矩阵
adj <- adjacency(exp,power = power,type = 'signed')

# 计算TOM矩阵
TOM <- TOMsimilarity(adj,TOMType = 'signed')
save(TOM,file = 'TOM.Rdata')
load('TOM.Rdata')
# 计算gene间相异性
dissTOM <- 1-TOM

# 对gene进行聚类和可视化
geneTree <- hclust(as.dist(dissTOM),method = 'average')
sizeGrWindow(12,9)
plot(geneTree,labels = F,hang=0.04,xlab='',sub='')

# 将聚类树进行修剪，把gene归到不同模块里
dynamic_module <- cutreeDynamic(dendro = geneTree,distM = dissTOM,minClusterSize = 50,pamRespectsDendro = F)
table(dynamic_module)
module_color <- labels2colors(dynamic_module)
plotDendroAndColors(dendro = geneTree,colors = module_color,dendroLabels = F,hang=0.03,addGuide = T,
                    guideHang = 0.05,groupLabels='Dynamic color tree',
                    main='gene dendrogram and module colors')

随着power的增加，gene之间的连通性降低，取log后成线性关系，符合无尺度网络分布

这里分出来20多个模块，太多了，后面进行合并

# 随机选择400个基因画拓扑重叠热图
set.seed(10)
select = sample(5000, size = 400)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average") 
selectColors = module_color[select]

sizeGrWindow(9,9) 
plotDiss = selectTOM^power
diag(plotDiss) = NA
TOMplot(plotDiss, 
        selectTree, 
        selectColors, 
        main = "Network heatmap plot, selected genes")

默认画出来的是第一张图，如果把plotDiss改成1-plotDiss，可以得到第二张图，颜色越深表示gene相关性越强。可以看到相关性较高的gene都是在同一个模块内的

默认结果图

改掉参数后

# 计算每个模块特征gene向量
MEList <- moduleEigengenes(exp,dynamic_module)
MEs <- MEList$eigengenes

# 计算特征基因的相异度，基于相异度对原有模块进行层次聚类
dissME <- 1-cor(MEs)
MEtree <- hclust(as.dist(dissME),method = 'average')

# 对模块进行合并
cut_height <- 0.6
sizeGrWindow(12,8)
plot(MEtree)
abline(h=cut_height,col='red')

merge_module <- mergeCloseModules(exprData = exp,colors=module_color,cutHeight = cut_height)
merged_color <- merge_module$colors
# merged_MEs每一行是一个病人ID，每一列是一个模块的特征gene向量
merged_MEs <- merge_module$newMEs

sizeGrWindow(12,8)
plotDendroAndColors(dendro = geneTree,colors = cbind(module_color,merged_color),dendroLabels = F,hang=0.03,addGuide = T,
                    guideHang = 0.05,groupLabels='Dynamic color tree',
                    main='gene dendrogram and module colors')
table(merged_color)

将height在0.6以下的模块进行合并

合并模块后，模块数量减少到13个

# 计算模块与TAM_M1，TAM_M2间的相关性
trait <- data.frame(row.names = rownames(exp),ID = substr(rownames(exp),1,12),
                    M1 = TME.result$`Macrophages M1`,M2 = TME.result$`Macrophages M2`)
index <- match(trait$ID,sur_data$bcr_patient_barcode)
trait$os <- sur_data[index,'OS']


module_trait_cor <- cor(merged_MEs,trait[,2:3],use = 'p')
module_trait_pvalue <- corPvalueStudent(module_trait_cor,ncol(exp))
trait_colors <- numbers2colors(trait[,2:3],signed = T,centered = T)
a <- paste(signif(module_trait_cor, 2), "\n(", signif(module_trait_pvalue, 1), ")", sep = "")
sizeGrWindow(8,6)
labeledHeatmap(module_trait_cor,xLabels = colnames(trait[,2:3]),yLabels = colnames(merged_MEs),colorLabels = FALSE, 
               colors = blueWhiteRed(50),textMatrix = a,
               setStdMargins = FALSE,cex.text = 0.65,zlim = c(-1,1), 
               main = paste("Module-trait relationships"))

蓝色模块和M1的相关性最高

# 找到M1相关的关键模块
module <- 'MEblue'
module_membership <- signedKME(exp,merged_MEs)
# 计算表达矩阵exp和OS之间相关性
gene_os_sig <- apply(exp,2,function(x){cor(x,trait$os)})

verboseScatterplot(abs(module_membership$kMEblue),
                   abs(gene_os_sig),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for OS",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2)

# 找出MEblue模块内所有基因
module_membership <- module_membership[!is.na(module_membership$kMEblue),]
# 筛选出MM>0.5 和 exp-OS相关性>0.1 的作为hub gene
MM_gene <- rownames(module_membership[abs(module_membership$kMEblue)>0.5,])
OS_sig_gene <- names(gene_os_sig[abs(gene_os_sig)>0.1])
hub_gene <- intersect(MM_gene,OS_sig_gene)

save(hub_gene,file = 'WGCNA_hub_gene.Rdata')

5. 下载imvigor 210队列数据，用SVM模型预测免疫表型

# SVM预测免疫类型
# DEseq安装 http:///packages/3.8/bioc/html/DESeq.html
# IMvigor210CoreBiologies安装：http://research-pub./IMvigor210CoreBiologies/packageVersions/
data(cds)
expreSet <- as.data.frame(counts(cds))
annoData <- fData(cds)
phenoData <- pData(cds)

这里需要安装两个包，e1071和IMvigor210CoreBiologies，后者比较难安，需要手动下载安装包到本地，然后再进行安装，然后就可以得到数据了

# 清洗数据
expreSet <- expreSet[rowSums(expreSet>0)>3,]

phenoData$`Immune phenotype` <- as.character(phenoData$`Immune phenotype`)
phenoData <- phenoData[!is.na(phenoData$`Immune phenotype`),]
phenoData$`Immune phenotype` <- as.factor(phenoData$`Immune phenotype`)

expreSet <- expreSet[,rownames(phenoData)]
# count转TPM
index <- match(rownames(expreSet),annoData$entrez_id)
expreSet$gene_length <- annoData[index,'length']
kb <- expreSet$gene_length/1000
rpk <- expreSet[,-ncol(expreSet)] / kb
tpm <- t(t(rpk) / colSums(rpk) * 1E6)
exp <- tpm

# 取log
exp <- log2(exp+1)
exp <- as.data.frame(exp)

用SVM建模的大体思路是这样的:

首先把数据分成训练集和测试集
训练集包含表达矩阵和每个样本对应的标签（免疫类型），以此构建svm模型
把测试集的表达矩阵带入到svm模型中，得到预测的结果
用测试集的真实标签和预测的结果即可画roc曲线

注意这里我把测试集的标签由factor改为了numeric，这样可以得到每个标签预测的概率，画出来的roc曲线点很多，如果不改，画出来的roc曲线点会很少，可能只有3个点

# 划分训练集和测试集
test_size <- 0.2
test_sample <- sample(colnames(exp),size = as.integer(ncol(exp)*test_size))
train_sample <- setdiff(colnames(exp),test_sample)
# 训练集和测试集的表达矩阵
test_exp <- as.data.frame(t(exp[,test_sample]))
train_exp <- as.data.frame(t(exp[,train_sample]))
# 训练集和测试集的标签  
test_label <- phenoData[test_sample,"Immune phenotype"]
train_label <- phenoData[train_sample,"Immune phenotype"]
# 训练集标签转换为二分类
train_label_desert <- as.factor(ifelse(train_label=='desert','desert','not desert'))
train_label_inflamed <- as.factor(ifelse(train_label=='inflamed','inflamed','not inflamed'))
train_label_excluded <- as.factor(ifelse(train_label=='excluded','excluded','not excluded'))
# 测试集标签转换为二分类
test_label_desert <- as.factor(ifelse(test_label=='desert','desert','not desert'))
test_label_inflamed <- as.factor(ifelse(test_label=='inflamed','inflamed','not inflamed'))
test_label_excluded <- as.factor(ifelse(test_label=='excluded','excluded','not excluded'))
# 测试集标签转换为数字
test_label_desert <- ifelse(test_label=='desert',1,0)
test_label_inflamed <- ifelse(test_label=='inflamed',1,0)
test_label_excluded <- ifelse(test_label=='excluded',1,0)

# 训练模型
desert_model <- svm(x = train_exp, 
                    y = train_label_desert,
                    type = 'C',kernel = 'radial',probability = T)
inflamed_model <- svm(x = train_exp, 
                    y = train_label_inflamed,
                    type = 'C',kernel = 'radial',probability = T)
excluded_model <- svm(x = train_exp, 
                    y = train_label_excluded,
                    type = 'C',kernel = 'radial',probability = T)
# 进行预测
test_predict_desert <- predict(desert_model,test_exp,prob = T)
# 得出预测结果的概率
desert_prob <- attr(test_predict_desert,'probabilities')[,2]
test_predict_inflamed <- predict(inflamed_model,test_exp)
inflamed_prob <- attr(test_predict_desert,'probabilities')[,2]
test_predict_excluded <- predict(excluded_model,test_exp)
excluded_prob <- attr(test_predict_desert,'probabilities')[,2]

desert_roc_obj <- roc(response = as.numeric(test_label_desert),
                      predictor = as.numeric(desert_prob),na.rm=T)
desert_auc <- round(auc(test_label_desert,desert_prob),2)
inflamed_roc_obj <- roc(response = as.numeric(test_label_inflamed),
                        predictor = as.numeric(inflamed_prob),na.rm=T)
inflamed_auc <- round(auc(test_label_inflamed,inflamed_prob),2)
excluded_roc_obj <- roc(response = as.numeric(test_label_excluded),
                        predictor = as.numeric(excluded_prob),na.rm=T)
excluded_auc <- round(auc(test_label_excluded,excluded_prob),2)

desert <- ggroc(desert_roc_obj)
inflamed <- ggroc(inflamed_roc_obj)
excluded <- ggroc(excluded_roc_obj)

roc_data <- list(desert_roc_obj,inflamed_roc_obj,excluded_roc_obj)
names(roc_data) <- c('desert','inflamed','excluded')

## sensitivity(敏感性): 也称recall或TPR(真阳性率，实际为正例并且模型预测为正例 占 所有所有正例 的比例)，越接近1越好
## specificity(特异性)：也称TNR(真阴性率，实际为负例并且模型预测为负例 占 所有所有负例 的比例)，越接近1越好
ggroc(roc_data)+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="red",linetype=6)+
  annotate('text',x=c(0.12,0.12,0.12),y=c(0.1,0.06,0.02),label=c(paste0('desert_auc: ',desert_auc),
                                                              paste0('inflamed_auc: ',inflamed_auc),
                                                              paste0('excluded_auc: ',excluded_auc))

6. 用NMF把TCGA样本分为两个亚组，该细胞在这两个亚组的比例有显著差异

load('TCGA_OV.Rdata')
load('WGCNA_hub_gene.Rdata')

hubgene_exp <- exp[hub_gene,]

estimate <- nmf(hubgene_exp,rank = 2:10,method = 'brunet')
save(estimate,file = 'estimate.Rdata')
plot(estimate)

rank <- 6
nmf.rank <- nmf(hubgene_exp,rank = rank,method = 'brunet')
group <- predict(nmf.rank)
index <- extractFeatures(nmf.rank,'max')
index <- unlist(index)

nmf.exp <- hubgene_exp[index,]
nmf.exp <- na.omit(nmf.exp)

jco <- c("#2874C5","#EABF00","#C6524A","#868686")
consensusmap(nmf.rank,labRow=NA,labCol=NA,
             annCol = data.frame("cluster"=group[colnames(nmf.exp)]))

cophenetic值在6-7之间变化最大，因此rank选6，就会把379个样本分为6类

anno_col <- data.frame(sample = colnames(nmf.exp),group = group)
anno_col <- anno_col[order(anno_col$group),]
pheatmap(nmf.exp[,rownames(anno_col)],show_colnames = F,annotation_col = select(anno_col,group),
         cluster_cols = F)

这里把annotation_col调整了顺序，这样每一个类的样本就在一起，如果按默认的就会非常混乱

这里annotation_col是一个一列的dataframe，行名是样本名，唯一的一列是样本分组信息

# 挑出两个组分析
group12 <- group[group==2 | group==1]
group12 <- sort(group12)
exp_temp <- nmf.exp[,names(group12)]
pheatmap(exp_temp,show_colnames = F,annotation_col = data.frame(row.names = colnames(exp_temp),group = group12),
         cluster_cols = F)

# KM-plot
load('survival_data.Rdata')
load('TME_result.Rdata')
temp <- data.frame(row.names = colnames(exp_temp),bcr_patient_barcode = substr(colnames(exp_temp),1,12),
                   group=group12)
temp <- left_join(temp,sur_data,by='bcr_patient_barcode')
group_OS <- survfit(Surv(OS.time,OS)~group,data = temp)
ggsurvplot(group_OS,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Overall survival',xlab='Days')

# M1细胞的小提琴图
vln_df <- data.frame(row.names = names(group12),M1=TME.result[names(group12),'Macrophages M1'],group = group12)
ggplot(vln_df,aes(x=group,y=M1))+geom_violin(aes(fill=group))+geom_boxplot(width=0.2)+
    geom_signif(comparisons = list(c('1','2')),map_signif_level = F)

# KEGG富集
kegg_exp <- exp[,names(group12)]
design <- model.matrix(~factor(as.character(group12),levels = c('1','2')))
fit <- lmFit(kegg_exp,design=design)
fit <- eBayes(fit)
deg <- topTable(fit,coef = 2,number = Inf)
deg$change <- ifelse((deg$logFC>log2(2))&(deg$adj.P.Val<0.05),'up',
                     ifelse((deg$logFC<log2(0.5))&(deg$adj.P.Val<0.05),'down','nochange'))
save(deg,file = 'TCGA-deg.Rdata')
up_1_id <- bitr(rownames(deg[deg$change=='up',]),fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Hs.eg.db)
kegg <- enrichKEGG(up_1_id$ENTREZID,organism = 'hsa',keyType = 'ncbi-geneid')
dotplot(kegg)

up_2_id <- bitr(rownames(deg[deg$change=='down',]),fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Hs.eg.db)
kegg <- enrichKEGG(up_2_id$ENTREZID,organism = 'hsa',keyType = 'ncbi-geneid')
dotplot(kegg)

7. 批量单因素cox回归找出hub gene中与生存相关的gene

load('TCGA_OV.Rdata')
load('survival_data.Rdata')
load('TCGA-deg.Rdata')
# 数据处理
deg_gene <- rownames(deg[deg$change!='nochange',])
exp <- as.data.frame(t(exp))
exp <- exp[,deg_gene]
exp$ID <- substr(rownames(exp),1,12)
exp <- exp[!duplicated(exp$ID),]
index <- match(rownames(sur_data),exp$ID)
index <- index[!is.na(index)]

sur_data <- merge(sur_data,exp,by.x='bcr_patient_barcode',by.y='ID')

# 批量单因素COX回归
gene <- c()
p_value <- c()
HR <- c()
lower95 <- c()
upper95 <- c()
for (i in 6:ncol(sur_data)){
  res <- coxph(Surv(OS.time,OS)~sur_data[,i],data=sur_data)
  sum_res <- summary(res)
  p <- sum_res$coefficients[,'Pr(>|z|)']
  if (p<0.05){
    gene <- c(gene,colnames(sur_data)[i])
    p_value <- c(p_value,p)
    HR <- c(HR,sum_res$conf.int[,'exp(coef)'])
    lower95 <- c(lower95,sum_res$conf.int[,'lower .95'])
    upper95 <- c(upper95,sum_res$conf.int[,'upper .95'])
  }
}
cox_df <- data.frame(row.names = gene,pvalue=p_value,HR=HR,lower.95=lower95,upper.95=upper95)

用批量单因素cox回归，找到pvalue小于0.05的gene

8. lasso回归进一步筛选gene

# LASSO进一步筛选gene
sur_data <- column_to_rownames(sur_data,'bcr_patient_barcode')
sur_data <- select(sur_data,c(OS,OS.time,rownames(cox_df)))
sur_data <- sur_data[!is.na(sur_data$OS.time),]
x <- as.matrix(sur_data[,5:ncol(sur_data)])
y <- Surv(sur_data$OS.time,sur_data$OS)
glm.fit <- glmnet(x,y,family = 'cox',alpha = 1)
plot(glm.fit)
cv.fit <- cv.glmnet(x,y,alpha = 1,nfolds = 10,family="cox")
plot(cv.fit)
lambda <- cv.fit$lambda.min

glm.fit <- glmnet(x,y,family = 'cox',alpha = 1,lambda = lambda)
lasso_filter_gene <- names(glm.fit$beta[glm.fit$beta[,1]!=0,1])
coef <- glm.fit$beta[glm.fit$beta[,1]!=0,1]

9. 多因素cox回归，计算riskScore，riskScore与生存信息的相关性

# 筛选后的gene用多因素cox回归建模
sur_data_temp <- select(sur_data,c(OS,OS.time,lasso_filter_gene))
# gene名IGKV4-1，在ggrisk时会报错，改成.
colnames(sur_data_temp) <- c("OS","OS.time","LAMP3","STAT1","BATF2","CXCL10","CXCL11","IGKV4.1")
cox.res <- coxph(Surv(OS.time,OS)~.,data = sur_data_temp)
riskScore <- predict(cox.res,type = 'risk',newdata=sur_data_temp)
riskScore <- as.data.frame(riskScore)
riskScore$ID <- rownames(riskScore)
riskScore$risk <- ifelse(riskScore$riskScore>median(riskScore$riskScore),'high','low')
riskScore$OS <- sur_data$OS
riskScore$OS.time <- sur_data$OS.time

ggrisk(cox.res)

# riskscore做生存分析
surv_fit <- survfit(Surv(OS.time,OS)~risk,data = riskScore)
ggsurvplot(surv_fit,data = riskScore,pval = T,risk.table = T,surv.median.line = 'hv',
           legend.title = 'RiskScore',title = 'Overall survival',
           ylab='Cummulative survival',xlab='Time(Days)')