【原】MOVICS：分子分型一站式R包(03)

阿越就是我 2023-11-11 发布于上海

展开全文

💡专注R语言在🩺生物医学中的使用

设为“星标”，精彩不错过

本文主要参考官方介绍：https://xlucpu./MOVICS/MOVICS-VIGNETTE.html

前两篇请见：

MOVICS：分子分型一站式R包(01)

MOVICS：分子分型一站式R包(02)

本文目录：

RUN Module

差异分析
识别特定分子
GSEA
GSVA
NTP预测外部数据
PAM预测外部数据
一致性评价

RUN Module

接下来是第3部分的功能介绍。

差异分析

支持edgeR、DESeq2、limma3种差异分析方法。

对于多组数据，比如这个示例是分成5组，会自动进行每一个组和其他4组的比较。

# run DEA with edgeR
runDEA(dea.method = "edger",
       expr       = count, # raw count data
       moic.res   = cmoic.brca,
       prefix     = "TCGA-BRCA") # prefix of figure name
## --all samples matched.
## --you choose edger and please make sure an RNA-Seq count data was provided.
## edger of CS1_vs_Others exists and skipped...
## edger of CS2_vs_Others exists and skipped...
## edger of CS3_vs_Others exists and skipped...
## edger of CS4_vs_Others exists and skipped...
## edger of CS5_vs_Others exists and skipped...

# run DEA with DESeq2
runDEA(dea.method = "deseq2",
       expr       = count, # deseq2也需要count
       moic.res   = cmoic.brca,
       prefix     = "TCGA-BRCA")
## --all samples matched.
## --you choose deseq2 and please make sure an RNA-Seq count data was provided.
## deseq2 of CS1_vs_Others done...
## deseq2 of CS2_vs_Others done...
## deseq2 of CS3_vs_Others done...
## deseq2 of CS4_vs_Others done...
## deseq2 of CS5_vs_Others done...

# run DEA with limma
runDEA(dea.method = "limma",
       expr       = fpkm, # normalized expression data
       moic.res   = cmoic.brca,
       prefix     = "TCGA-BRCA")
## --all samples matched.
## --you choose limma and please make sure a microarray profile or a normalized expression data [FPKM or TPM without log2 transformation is recommended] was provided.
## --log2 transformation done for expression data.
## limma of CS1_vs_Others done...
## limma of CS2_vs_Others done...
## limma of CS3_vs_Others done...
## limma of CS4_vs_Others done...
## limma of CS5_vs_Others done...

结果会自动保存在当前工作目录下。

识别特定分子

在这个过程中，按照log2FoldChange排序选择差异表达最大的基因作为每个亚型的生物标志物（默认情况下每个亚型选择200个生物标志物）。这些生物标志物应该通过显著性阈值（例如，名义P值<0.05和调整后的P值<0.05），并且不能与其他亚型识别出的生物标志物重叠。

这一步需要使用上一步得到的结果，比如下面使用edgeR识别上调的100个基因：

# choose edgeR result to identify subtype-specific up-regulated biomarkers
marker.up <- runMarker(moic.res      = cmoic.brca,
                       dea.method    = "edger", # name of DEA method
                       prefix        = "TCGA-BRCA", # MUST be the same of argument in runDEA()
                       dat.path      = getwd(), # path of DEA files
                       res.path      = getwd(), # path to save marker files
                       p.cutoff      = 0.05, # p cutoff to identify significant DEGs
                       p.adj.cutoff  = 0.05, # padj cutoff to identify significant DEGs
                       dirct         = "up", # direction of dysregulation in expression
                       n.marker      = 100, # number of biomarkers for each subtype
                       doplot        = TRUE, # generate diagonal heatmap
                       norm.expr     = fpkm, # use normalized expression as heatmap input
                       annCol        = annCol, # sample annotation in heatmap
                       annColors     = annColors, # colors for sample annotation
                       show_rownames = FALSE, # show no rownames (biomarker name)
                       fig.name      = "UPREGULATED BIOMARKER HEATMAP")
## --all samples matched.
## --log2 transformation done for expression data.

unnamed-chunk-39-186542957

# check the upregulated biomarkers
head(marker.up$templates)
##     probe class dirct
## 1    PNMT   CS1    up
## 2 AKR1B15   CS1    up
## 3    DLK1   CS1    up
## 4    ACE2   CS1    up
## 5  CRISP3   CS1    up
## 6   ACSM1   CS1    up

使用limma识别下调的50个基因：

# choose limma result to identify subtype-specific down-regulated biomarkers
marker.dn <- runMarker(moic.res      = cmoic.brca,
                       dea.method    = "limma",
                       prefix        = "TCGA-BRCA",
                       dirct         = "down",
                       n.marker      = 50, # switch to 50
                       doplot        = TRUE,
                       norm.expr     = fpkm,
                       annCol        = annCol,
                       annColors     = annColors,
                       fig.name      = "DOWNREGULATED BIOMARKER HEATMAP")
## --all samples matched.
## --log2 transformation done for expression data.

unnamed-chunk-41-186542957

GSEA

富集分析我们写过很多了：

富集分析常见类型 enrichplot富集分析可视化 GSEA富集分析可视化 Goplot富集分析可视化 GseaVis富集分析可视化 simplifyEnrichment的使用示例单基因富集分析 GSVA和ssGSEA

这里用的是C5进行GSEA：

# MUST locate ABSOLUTE path of msigdb file
MSIGDB.FILE <- system.file("extdata", "c5.bp.v7.1.symbols.xls", package = "MOVICS", mustWork = TRUE)

会自动在每个亚型内进行GSEA分析，我们使用edger的结果：

# run GSEA to identify up-regulated GO pathways using results from edgeR
gsea.up <- runGSEA(moic.res     = cmoic.brca,
                   dea.method   = "edger", # name of DEA method
                   prefix       = "TCGA-BRCA", # MUST be the same of argument in runDEA()
                   dat.path     = getwd(), # path of DEA files
                   res.path     = getwd(), # path to save GSEA files
                   msigdb.path  = MSIGDB.FILE, # MUST be the ABSOLUTE path of msigdb file
                   norm.expr    = fpkm, # use normalized expression to calculate enrichment score
                   dirct        = "up", # direction of dysregulation in pathway
                   p.cutoff     = 0.05, # p cutoff to identify significant pathways
                   p.adj.cutoff = 0.25, # padj cutoff to identify significant pathways
                   gsva.method  = "gsva", # method to calculate single sample enrichment score
                   norm.method  = "mean", # normalization method to calculate subtype-specific enrichment score
                   fig.name     = "UPREGULATED PATHWAY HEATMAP")
## --all samples matched.
## GSEA done...
## --log2 transformation done for expression data.
## Estimating GSVA scores for 50 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   2%
  |                                                                                 
  |======================================================================| 100%
## gsva done...
## heatmap done...

unnamed-chunk-43-186542957

查看第一个亚型的部分结果：

print(gsea.up$gsea.list$CS1[1:6,3:6])
##                                                     setSize enrichmentScore
## GO_CORNIFICATION                                         51      -0.7747559
## GO_KERATINIZATION                                        58      -0.7392665
## GO_ADULT_LOCOMOTORY_BEHAVIOR                             57      -0.7048011
## GO_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY      16      -0.8535591
## GO_HEMIDESMOSOME_ASSEMBLY                                12      -0.8908059
## GO_WALKING_BEHAVIOR                                      21      -0.8153693
##                                                           NES      pvalue
## GO_CORNIFICATION                                    -2.117694 0.001219512
## GO_KERATINIZATION                                   -2.056651 0.001197605
## GO_ADULT_LOCOMOTORY_BEHAVIOR                        -1.956867 0.001200480
## GO_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY -1.928063 0.001351351
## GO_HEMIDESMOSOME_ASSEMBLY                           -1.924928 0.001428571
## GO_WALKING_BEHAVIOR                                 -1.917848 0.001333333

查看每个通路中不同亚型的富集分数：

head(round(gsea.up$grouped.es,3))
##                                                          CS1    CS2    CS3
## GO_REGULATION_OF_MONOCYTE_CHEMOTAXIS                   0.392 -0.474 -0.329
## GO_INDOLE_CONTAINING_COMPOUND_METABOLIC_PROCESS        0.191 -0.457 -0.425
## GO_BENZENE_CONTAINING_COMPOUND_METABOLIC_PROCESS       0.153 -0.092  0.142
## GO_POSITIVE_REGULATION_OF_MONONUCLEAR_CELL_MIGRATION   0.358 -0.430 -0.236
## GO_AMINE_CATABOLIC_PROCESS                             0.174 -0.379 -0.204
## GO_REGULATION_OF_CELLULAR_AMINO_ACID_METABOLIC_PROCESS 0.332  0.053 -0.278
##                                                           CS4    CS5
## GO_REGULATION_OF_MONOCYTE_CHEMOTAXIS                    0.260  0.040
## GO_INDOLE_CONTAINING_COMPOUND_METABOLIC_PROCESS         0.144  0.430
## GO_BENZENE_CONTAINING_COMPOUND_METABOLIC_PROCESS        0.154 -0.503
## GO_POSITIVE_REGULATION_OF_MONONUCLEAR_CELL_MIGRATION    0.223 -0.012
## GO_AMINE_CATABOLIC_PROCESS                              0.192  0.132
## GO_REGULATION_OF_CELLULAR_AMINO_ACID_METABOLIC_PROCESS -0.524  0.440

也可以使用edger的结果：

# run GSEA to identify down-regulated GO pathways using results from DESeq2
gsea.dn <- runGSEA(moic.res     = cmoic.brca,
                   dea.method   = "deseq2",
                   prefix       = "TCGA-BRCA",
                   msigdb.path  = MSIGDB.FILE,
                   norm.expr    = fpkm,
                   dirct        = "down",
                   p.cutoff     = 0.05,
                   p.adj.cutoff = 0.25,
                   gsva.method  = "ssgsea", # switch to ssgsea
                   norm.method  = "median", # switch to median
                   fig.name     = "DOWNREGULATED PATHWAY HEATMAP")

GSVA

基因集变异分析，和上面的GSEA分析一样的流程：

# MUST locate ABSOLUTE path of gene set file
GSET.FILE <- 
  system.file("extdata", "gene sets of interest.gmt", package = "MOVICS", mustWork = TRUE)

# run GSVA to estimate single sample enrichment score based on given gene set of interest
gsva.res <- 
  runGSVA(moic.res      = cmoic.brca,
          norm.expr     = fpkm,
          gset.gmt.path = GSET.FILE, # ABSOLUTE path of gene set file
          gsva.method   = "gsva", # method to calculate single sample enrichment score
          annCol        = annCol,
          annColors     = annColors,
          fig.path      = getwd(),
          fig.name      = "GENE SETS OF INTEREST HEATMAP",
          height        = 5,
          width         = 8)
## --all samples matched.
## --log2 transformation done for expression data.
## Estimating GSVA scores for 21 gene sets.
## Estimating ECDFs with Gaussian kernels
## 
  |                                                                            
  |                                                                      |   0%       
  |===                                                                   |   5%
  |======================================================================| 100%
## gsva done...

unnamed-chunk-48-186542957

# check raw enrichment score
print(gsva.res$raw.es[1:3,1:3])
##                    BRCA-A03L-01A BRCA-A04R-01A BRCA-A075-01A
## Adhesion              0.09351280    -0.2125523    0.10362446
## Antigen_Processing    0.05451682    -0.3909617    0.05508321
## B-Cell_Functions     -0.01931423    -0.6120864    0.18301567

NTP预测外部数据

哦，等等，我们是否忘记了什么？是的，还有一个尚未使用的数据集，让我们看看我们是否可以使用这些亚型特异性的生物标志物来验证外部Yau队列中的当前乳腺癌亚型。在这部分中，我们的核心目的是预测外部数据集中每个样本的可能亚型。在大多数情况下，这是一个多分类问题，并且在外部队列中，识别出的生物标志物可能很难整体匹配，因此使用基于模型的预测算法可能不可靠。因此，MOVICS为验证队列中的亚型预测提供了两种无模型方法。首先，MOVICS切换到最近模板预测（NTP）方法，该方法可以灵活应用于跨平台、跨物种和多类别的预测，而无需进行任何分析参数的优化。唯一需要做的就是生成一个模板文件，幸运的是，这已经准备好了。

# run NTP in Yau cohort by using up-regulated biomarkers
yau.ntp.pred <- runNTP(expr       = brca.yau$mRNA.expr,
                       templates  = marker.up$templates, # the template has been already prepared in runMarker()
                       scaleFlag  = TRUE, # scale input data (by default)
                       centerFlag = TRUE, # center input data (by default)
                       doPlot     = TRUE, # to generate heatmap
                       fig.name   = "NTP HEATMAP FOR YAU") 
## --original template has 500 biomarkers and 272 are matched in external expression profile.
## cosine correlation distance
## 682 samples; 5 classes; 39-66 features/class
## serial processing; 1000 permutation(s)...
## predicted samples/class (FDR<0.05)

unnamed-chunk-50-186542957

## 
##  CS1  CS2  CS3  CS4  CS5 <NA> 
##  104   85   90  120  140  143

head(yau.ntp.pred$ntp.res)
##     prediction  d.CS1  d.CS2  d.CS3  d.CS4  d.CS5 p.value    FDR
## 107        CS2 0.7344 0.5165 0.7377 0.7471 0.7512   0.001 0.0020
## 109        CS1 0.5816 0.7581 0.7172 0.7308 0.7295   0.001 0.0020
## 11         CS1 0.6527 0.7307 0.7430 0.7557 0.7731   0.001 0.0020
## 110        CS2 0.7619 0.5418 0.7583 0.7655 0.7567   0.001 0.0020
## 111        CS1 0.6806 0.7157 0.7198 0.7889 0.7911   0.007 0.0106
## 113        CS4 0.6785 0.7505 0.7394 0.6389 0.7308   0.007 0.0106

在yau队列中做生存分析：

# compare survival outcome in Yau cohort
surv.yau <- compSurv(moic.res         = yau.ntp.pred,
                     surv.info        = brca.yau$clin.info,
                     convt.time       = "m", # switch to month
                     surv.median.line = "hv", # switch to both
                     fig.name         = "KAPLAN-MEIER CURVE OF NTP FOR YAU") 
## --a total of 682 samples are identified.

unnamed-chunk-52-186542957

print(surv.yau)
## $fitd
## Call:
## survdiff(formula = Surv(futime, fustat) ~ Subtype, data = mosurv.res, 
##     na.action = na.exclude)
## 
##               N Observed Expected (O-E)^2/E (O-E)^2/V
## Subtype=CS1 136       51     43.8     1.169     1.452
## Subtype=CS2 117       51     38.5     4.037     4.876
## Subtype=CS3 119       33     42.3     2.045     2.517
## Subtype=CS4 159       43     57.3     3.590     4.820
## Subtype=CS5 151       50     46.0     0.351     0.441
## 
##  Chisq= 11.2  on 4 degrees of freedom, p= 0.02 
## 
## $fit
## Call: survfit(formula = Surv(futime, fustat) ~ Subtype, data = mosurv.res, 
##     na.action = na.exclude, error = "greenwood", type = "kaplan-meier", 
##     conf.type = "plain")
## 
##       n events median 0.95LCL 0.95UCL
## CS1 136     51     NA      NA      NA
## CS2 117     51    205     109      NA
## CS3 119     33    236      NA      NA
## CS4 159     43    222     191      NA
## CS5 151     50     NA      NA      NA
## 
## $xyrs.est
## [1] "[Not Available]: argument of xyrs.est was not specified."
## 
## $overall.p
## [1] 0.02410534
## 
## $pairwise.p
## 
##  Pairwise comparisons using Log-Rank test 
## 
## data:  mosurv.res and Subtype 
## 
##     CS1   CS2   CS3   CS4  
## CS2 0.699 -     -     -    
## CS3 0.158 0.059 -     -    
## CS4 0.100 0.039 0.901 -    
## CS5 0.804 0.504 0.244 0.158
## 
## P value adjustment method: BH

比较一致性：

# compare agreement in Yau cohort
agree.yau <- compAgree(moic.res  = yau.ntp.pred,
                       subt2comp = brca.yau$clin.info[, "PAM50", drop = FALSE],
                       doPlot    = TRUE,
                       fig.name  = "YAU PREDICTEDMOIC WITH PAM50")
## --all samples matched.

unnamed-chunk-54-186542957

print(agree.yau)
##   current.subtype other.subtype        RI       AMI        JI        FM
## 1         Subtype         PAM50 0.7830213 0.3819096 0.3318526 0.4994425

PAM预测外部数据

除了NTP方法，MOVICS还提供了另一种无模型方法来预测亚型。具体来说，首先在发现（训练）队列（即TCGA-BRCA队列）中使用PAM（partition around medoids）分类器来训练模型，以预测验证（测试）队列（即BRCA-Yau队列）中患者的亚型，并将验证队列中的每个样本分配给与其质心具有最高的Pearson相关性的亚型标签17。最后，将使用in-group proportion (IGP) 统计量来评估发现队列和验证队列之间获得的亚型的相似性和可靠性。

yau.pam.pred <- runPAM(train.expr  = fpkm,
                       moic.res    = cmoic.brca,
                       test.expr   = brca.yau$mRNA.expr)
## --all samples matched.
## --a total of 7303 genes shared and used.
## --log2 transformation done for training expression data.
## --testing expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.

查看IGP：

print(yau.pam.pred$IGP)
##       CS1       CS2       CS3       CS4       CS5 
## 0.4545455 0.6416667 0.5616438 0.7814570 0.9225806

一致性评价

想要知道在使用发现队列时，NTP或PAM的准确性如何？想要知道不同预测结果的一致性如何？可以使用runKappa()函数来进行评估。

# predict subtype in discovery cohort using NTP
tcga.ntp.pred <- runNTP(expr      = fpkm,
                        templates = marker.up$templates,
                        doPlot    = FALSE) 
## --original template has 500 biomarkers and 500 are matched in external expression profile.
## cosine correlation distance
## 643 samples; 5 classes; 100-100 features/class
## serial processing; 1000 permutation(s)...
## predicted samples/class (FDR<0.05)
## 
##  CS1  CS2  CS3  CS4  CS5 <NA> 
##   99  105  138  155  107   39
#> --original template has 500 biomarkers and 500 are matched in external expression profile.
#> cosine correlation distance
#> 643 samples; 5 classes; 100-100 features/class
#> serial processing; 1000 permutation(s)...
#> predicted samples/class (FDR<0.05)
#> 
#>  CS1  CS2  CS3  CS4  CS5 <NA> 
#>   99  105  138  155  107   39

# predict subtype in discovery cohort using PAM
tcga.pam.pred <- runPAM(train.expr  = fpkm,
                        moic.res    = cmoic.brca,
                        test.expr   = fpkm)
## --all samples matched.
## --a total of 13771 genes shared and used.
## --log2 transformation done for training expression data.
## --log2 transformation done for testing expression data.
#> --all samples matched.
#> --a total of 13771 genes shared and used.
#> --log2 transformation done for training expression data.
#> --log2 transformation done for testing expression data.

# check consistency between current and NTP-predicted subtype in discovery TCGA-BRCA
runKappa(subt1     = cmoic.brca$clust.res$clust,
         subt2     = tcga.ntp.pred$clust.res$clust,
         subt1.lab = "CMOIC",
         subt2.lab = "NTP",
         fig.name  = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and NTP")

unnamed-chunk-58-186542957

# check consistency between current and PAM-predicted subtype in discovery TCGA-BRCA
runKappa(subt1     = cmoic.brca$clust.res$clust,
         subt2     = tcga.pam.pred$clust.res$clust,
         subt1.lab = "CMOIC",
         subt2.lab = "PAM",
         fig.name  = "CONSISTENCY HEATMAP FOR TCGA between CMOIC and PAM")

unnamed-chunk-58-286542957

# check consistency between NTP and PAM-predicted subtype in validation Yau-BRCA
runKappa(subt1     = yau.ntp.pred$clust.res$clust,
         subt2     = yau.pam.pred$clust.res$clust,
         subt1.lab = "NTP",
         subt2.lab = "PAM",
         fig.name  = "CONSISTENCY HEATMAP FOR YAU")