【原】单细胞分析如何确定合适的resolution

健明 2022-08-17 发布于广东

展开全文

在进行单细胞分析时，非常重要的一步就是聚类分群，而在这个过程当中经常令人困惑的是如何选择一个合适的分辨率（resolution），因为分辨率设置过大或过小都不利于我们后续的分析：设置过大会导致分群“过度”，理论上来讲每个细胞都是不同的，但这样过于细化的初步分群显然是没有意义的；相反设置过小会导致我们无法发现一些对我们可能比较重要的细胞亚群，这也丧失了单细胞数据本身的意义。

在这里分享3个可以供大家参考的单细胞数据降维聚类分群的resolution参数选定评估方法：

clustree
maker gene AUC 值
ROGUE

1. 上游分析

本次分析所使用的示例数据集为GSE197778，包含四个样本，首先使用Harmony进行批次效应的去除：

library(Seurat)
library(harmony)
library(dplyr)
library(ROGUE)

PATH = 'GSE197778'
samples = list.files(path = PATH)

sce_list = lapply(X = samples, FUN = function(sample){
  sce <- CreateSeuratObject(counts = Read10X(data.dir = file.path(PATH, sample)), 
                            project = sample,
                            min.cells = 10, 
                            min.features = 10)
  sce[['percent.mt']] <- PercentageFeatureSet(sce, pattern = '^MT-')
  return(sce)
})
#data qc
sce_list[[1]] <- subset(sce_list[[1]], subset = nFeature_RNA > 500 & nFeature_RNA < 5000)
sce_list[[2]] <- subset(sce_list[[2]], subset = nFeature_RNA > 500 & nFeature_RNA < 3000)
sce_list[[3]] <- subset(sce_list[[3]], subset = nFeature_RNA > 500 & nFeature_RNA < 3000)
sce_list[[4]] <- subset(sce_list[[4]], subset = nFeature_RNA > 500 & nFeature_RNA < 3000)
#merge sample
sce_all <- merge(x = sce_list[[1]], 
                 y = c(sce_list[[2]], sce_list[[3]], sce_list[[4]]),
                 add.cell.ids = samples)
sce_all <- subset(sce_all, subset = percent.mt < 10)
sce_all <- sce_all %>% 
  NormalizeData() %>% 
  FindVariableFeatures() %>% 
  ScaleData()
sce_all <- RunPCA(sce_all, features = VariableFeatures(sce_all), npcs = 30)
#run harmony
sce_all <- RunHarmony(sce_all, group.by.vars = 'orig.ident')

sce_all <- sce_all %>% 
  RunUMAP(reduction = "harmony", dims = 1:20) %>% 
  FindNeighbors(reduction = "harmony", dims = 1:20) 

seq <- seq(0.1, 1, by = 0.1)
for(res in seq){
  sce_all <- FindClusters(sce_all, resolution = res)
}

2. clustree

clustree通过可视化不同分辨率下不同cluster之间的交互关系来帮助我们选择合适的分辨率来进行下游分析。

library(clustree)
library(patchwork)
p1 <- clustree(sce_all, prefix = 'RNA_snn_res.') + coord_flip()
p2 <- DimPlot(sce_all, group.by = 'RNA_snn_res.0.3', label = T)
p1 + p2 + plot_layout(widths = c(3, 1))

通过这个图我们会发现当resolution大于0.3后不同cluster在不同的分辨率下会存在越来越多的相互交织，这暗示着我们可能存在分群过度的情况了，所以在这里我们可以选择0.3作为初步的分群resolution来进行后面的分析。

3. marker gene AUC 值

这个方法来自于今年6月发表于Nature上的一篇文章：Single-nucleus profiling of human dilated and hypertrophic cardiomyopathy。在其Method部分提到了他们确定resolution的方法：

Then, Leiden resolution was increased from 0.05 to 1.0 in increments of 0.05 until a cluster emerged with no genes having area under the receiver operating characteristic curve (AUC) > 0.60 in predicting that class.

这个AUC值使用FindAllMarkers()函数就能实现，只需要设置test.use = 'roc’即可，其实原理也很简单，主要是通过ROC分析来对marker基因的表达特异性进行评估，也就是将marker基因作为分类器来对细胞进行分类，用AUC值来评估分类的好坏，显然AUC值越接近于1或0就表示这个基因有非常好的细胞表达特异性，相反，如果鉴定出来的marker基因AUC值接近0.5就表明这个marker基因没有很好的细胞分类能力，这也提示我们这两个细胞亚群之间可能并没有非常显著的生物学差异，我们可能分群“分过度”了。

根据文章的描述，分群“恰到好处”的标准是，增加resolution直至出现一个细胞亚群，其所有marker基因的AUC值均低于0.6，我们就认为resolution再增加就会出现“过度分群”的状况。我们看一下不同分辨率下每个细胞亚群marker基因的AUC值情况：

data("pbmc3k.final")

seq <- seq(0.1, 1.5, by = 0.1)
for (res in seq){
  pbmc3k.final <- FindClusters(pbmc3k.final, resolution = res)
}

#res = 0.5
pbmc3k.final %>% 
  SetIdent(value = 'RNA_snn_res.0.5') %>% 
  FindAllMarkers(test.use = 'roc') %>% 
  filter(myAUC > 0.6) %>% 
  count(cluster, name = 'number')

##   cluster number
## 1       0     45
## 2       1      6
## 3       2     87
## 4       3     31
## 5       4     19
## 6       5    147
## 7       6     62
## 8       7    111
## 9       8    140

#res = 1.0
pbmc3k.final %>% 
  SetIdent(value = 'RNA_snn_res.1') %>% 
  FindAllMarkers(test.use = 'roc') %>% 
  filter(myAUC > 0.6) %>% 
  count(cluster, name = 'number')

##    cluster number
## 1        0     49
## 2        1      6
## 3        2     31
## 4        3     20
## 5        4    117
## 6        5     49
## 7        6    147
## 8        7     68
## 9        8      1
## 10       9    111
## 11      10    140

#res = 1.5
pbmc3k.final %>% 
  SetIdent(value = 'RNA_snn_res.1.5') %>% 
  FindAllMarkers(test.use = 'roc') %>% 
  filter(myAUC > 0.6) %>% 
  count(cluster, name = 'number')

##    cluster number
## 1        0      6
## 2        1     38
## 3        2     31
## 4        3     19
## 5        4    117
## 6        5     35
## 7        6     49
## 8        7    147
## 9        8      2
## 10       9     68
## 11      10    111
## 12      11    140

ROGUE

这个方法是由张泽民老师团队开发的，一个很好的、单纯的细胞亚群应该是其中的细胞基因表达具有相似的水平，没有具有显著差异的差异表达基因，ROGUE使用一个基于熵的方法来对一个细胞类群的“纯度”来进行评估。一个细胞类群的ROGUE值表征了该细胞类群的“纯度”：越接近于1表示细胞类群越“纯”，具体细节可以去看张泽民老师的文章：An entropy-based metric for assessing the purity of single cell populations。

首先使用ROGUE内置数据集进行分析：

library(ROGUE)
# inner data of package
expr <- readRDS('DC.rds')
meta <- readRDS('info.rds')

# filter low quality data
expr <- matr.filter(expr, min.cells = 10, min.genes = 10)

p1 <- rogue(expr, labels = meta$ct, samples = meta$Patient, 
            platform = 'UMI', span = 0.6) %>% 
      rogue.boxplot() + ggtitle('4 clusters')
p2 <- rogue(expr, labels = meta$`Major cell type`, samples = meta$Patient, 
            platform = 'UMI', span = 0.6) %>% 
      rogue.boxplot() + ggtitle('2 clusters')
p1 + p2

可以看到这里的分群在不同的样本中都具有比较好的均一性。

下面用我们自己的GSE197778数据集进行测试：

sce_sub <- sce_all %>% subset(., downsample = 100)
expr <- GetAssayData(sce_sub, slot = 'counts') %>% as.matrix()
meta <- sce_sub@meta.data

rogue(expr = expr, 
      labels = meta$RNA_snn_res.0.3, 
      samples = meta$orig.ident, 
      platform = 'UMI',
      span = 0.6) %>% rogue.boxplot()

最后，我们就可以利用细胞类型的基因marker对细胞进行注释啦，最终可以得到下面的结果：

sce_all <- SetIdent(object = sce_all, value = 'RNA_snn_res.0.3')
DotPlot(object = sce_all,
        features = c('PTPRC', 'CD3D', #T cell
                     'MS4A1', 'CD79A', #B cell
                     'JCHAIN', 'IGHG1', #plasma cell
                     'LYZ', 'CD68', 'FCGR3A', 'CD14', #myeloid cell
                     'EPCAM', 'CDH1', 'KRT19', #epithelial cell
                     'PECAM1', 'VWF', #endothelial cell
                     'COL1A1', 'COL1A2', 'DCN' #fibroblast cell
                     )) + 
  coord_flip()
#add cell type
#T cell
T_cluster <- c(0, 1, 4, 8, 14)
#B cell
B_cluster <- 2
#plasma cell
Plasma_cluster <- 9
#myeloid cell
Myeloid_cluster <- c(5, 13)
#epithelial cell
Epithelial_cluster <- 11
#endothelial
Endothelial_cluster <- c(7, 10)
#fibroblast
Fibroblast_cluster <- c(3, 6, 12)

sce_all@meta.data <- sce_all@meta.data %>% 
  mutate(celltype = case_when(
    RNA_snn_res.0.3 %in% T_cluster ~ 'T',
    RNA_snn_res.0.3 %in% B_cluster ~ 'B',
    RNA_snn_res.0.3 %in% Plasma_cluster ~ 'Plasma',
    RNA_snn_res.0.3 %in% Myeloid_cluster ~ 'Myeloid',
    RNA_snn_res.0.3 %in% Epithelial_cluster ~ 'Epithelial',
    RNA_snn_res.0.3 %in% Endothelial_cluster ~ 'Endotheliall',
    RNA_snn_res.0.3 %in% Fibroblast_cluster ~ 'Fibroblast'
  ))
sce_all@meta.data$celltype <- factor(sce_all@meta.data$celltype,
                                     levels = c('T', 'B', 'Plasma',
                                                'Myeloid', 'Epithelial',
                                                'Endotheliall', 'Fibroblast'))
p3 <- DimPlot(sce_all, group.by = 'celltype', label = T) + theme(legend.position = 'none')
p4 <- DotPlot(sce_all, 
              features = c('PTPRC', 'CD3D', #T cell
                           'MS4A1', 'CD79A', #B cell
                           'JCHAIN', 'IGHG1', #plasma cell
                           'LYZ', 'CD68', 'FCGR3A', 'CD14', #myeloid cell
                           'EPCAM', 'CDH1', 'KRT19', #epithelial cell
                           'PECAM1', 'VWF', #endothelial cell
                           'COL1A1', 'COL1A2', 'DCN' #fibroblast cell
                           ),
              group.by = 'celltype') + 
  coord_flip() + 
  theme(axis.text.x = element_text(hjust = 1, angle = 45),
        axis.title = element_blank()) +
  ggtitle(label = 'GSE197778')
p3 + p4