分享

学习笔记:一个下午的时间完成了一区文章生信分析(一)

 生物_医药_科研 2019-07-21

这是肿瘤学领域的重要学术刊物,中科院分区一区,影响因子目前8.911分。

链接:https://www.ncbi.nlm./pmc/articles/PMC4911309/

下面我来还原本文章全部的分析过程。

下载RNAseq,me450k和来自TCGA黑素瘤肿瘤的临床数据

介绍

本文档概述了我下载数据3级TCGA数据的方法。使用RTCGAToolbox(Samur 2014)包下载RNA-seq,甲基化450K数据和临床信息

#Load the librarylibrary(RTCGAToolbox)
getFirehoseDatasets()
##  [1] 'ACC'      'BLCA'     'BRCA'     'CESC'     'CHOL'     'COADREAD'##  [7] 'COAD'     'DLBC'     'ESCA'     'FPPP'     'GBMLGG'   'GBM'     ## [13] 'HNSC'     'KICH'     'KIPAN'    'KIRC'     'KIRP'     'LAML'    ## [19] 'LGG'      'LIHC'     'LUAD'     'LUSC'     'MESO'     'OV'      ## [25] 'PAAD'     'PCPG'     'PRAD'     'READ'     'SARC'     'SKCM'    ## [31] 'STAD'     'STES'     'TGCT'     'THCA'     'THYM'     'UCEC'    ## [37] 'UCS'      'UVM'
head(getFirehoseRunningDates())
## [1] '20160128' '20151101' '20150821' '20150601' '20150402' '20150204'
head(getFirehoseAnalyzeDates())
## [1] '20160128' '20150821' '20150402' '20141017' '20140715' '20140416'

临床和RNA-seq数据 - 下载和处理

readDataSKCM <- getFirehoseData (dataset='SKCM',                                 runDate='20160128',                                forceDownload = TRUE,                                 clinical = TRUE,                                 RNASeq2GeneNorm = TRUE,                                 Methylation = FALSE, fileSizeLimit= 3000)
load('~/Dropbox/GitHub/Downloading-TCGA-data/RDatafiles/TCGA_readDataSKCM.RData')
#extract the dataclinSKCM <- getData(readDataSKCM, 'clinical')rnaseqSKCM <- getData(readDataSKCM, 'RNASeq2GeneNorm')

在进行任何下游分析之前,处理(或清理)临床和RNA-seq数据。

  1. TCGA条形码在临床和RNA-seq数据集之间的结构不同,因此需要进行匹配。例如,RNAseq数据中的第一个TCGA条形码是“TCGA-3N-A9WB-06A-11R-A38C-07”,而在临床数据中它是“tcga.d3.a2je”。

  2. 一些样品具有2x RNA-seq数据。删除重复的RNA-seq数据。

  3. 一些样本包含RNA-seq数据,但没有临床信息。仅保留具有RNA-seq和临床信息的那些样品用于下游分析。

#Changing patient identifier namesdim(clinSKCM)
## [1] 470  18
head(clinSKCM)
##              Composite Element REF years_to_birth vital_status## tcga.d3.a2je                 value             75            1## tcga.d3.a2jf                 value             74            0## tcga.d3.a3c8                 value             58            0## tcga.d3.a3ml                 value             70            1## tcga.d3.a51g                 value                       0## tcga.d3.a8gi                 value             68            1##              days_to_death days_to_last_followup## tcga.d3.a2je           841                  ## tcga.d3.a2jf                            1888## tcga.d3.a3c8                            1409## tcga.d3.a3ml           422                  ## tcga.d3.a51g                            ## tcga.d3.a8gi          1780                  ##              days_to_submitted_specimen_dx pathologic_stage## tcga.d3.a2je                           140       stage iiic## tcga.d3.a2jf                           544         stage ia## tcga.d3.a3c8                             0       stage iiic## tcga.d3.a3ml                           230       stage iiia## tcga.d3.a51g                                    stage 0## tcga.d3.a8gi                          1653         stage ia##              pathology_T_stage pathology_N_stage pathology_M_stage## tcga.d3.a2je                tx                n3                m0## tcga.d3.a2jf               t1a                n0                m0## tcga.d3.a3c8                tx                n3                m0## tcga.d3.a3ml               t3a               n2a                m0## tcga.d3.a51g               tis                n0                m0## tcga.d3.a8gi               t1a                n0                m0##              melanoma_ulceration melanoma_primary_known Breslow_thickness## tcga.d3.a2je                                    yes              ## tcga.d3.a2jf                  no                    yes              0.28## tcga.d3.a3c8                                    yes              ## tcga.d3.a3ml                  no                    yes               2.3## tcga.d3.a51g                                    yes                 0## tcga.d3.a8gi                  no                    yes              0.98##              gender date_of_initial_pathologic_diagnosis radiation_therapy## tcga.d3.a2je female                                 2009                no## tcga.d3.a2jf   male                                 2008                no## tcga.d3.a3c8 female                                 2009               yes## tcga.d3.a3ml   male                                 2003                no## tcga.d3.a51g   male                                                 no## tcga.d3.a8gi   male                                 2008                no##               race              ethnicity## tcga.d3.a2je white not hispanic or latino## tcga.d3.a2jf white not hispanic or latino## tcga.d3.a3c8 white not hispanic or latino## tcga.d3.a3ml white not hispanic or latino## tcga.d3.a51g white not hispanic or latino## tcga.d3.a8gi white not hispanic or latino
dim(rnaseqSKCM)
## [1] 20501   473
rnaseqSKCM[1:5,1:5]
##       TCGA-3N-A9WB-06A-11R-A38C-07 TCGA-3N-A9WC-06A-11R-A38C-07## A1BG                      381.0662                     195.1822## A1CF                        0.0000                       0.0000## A2BP1                       0.0000                       0.0000## A2LD1                     250.1979                     160.7548## A2ML1                       7.2698                       0.0000##       TCGA-3N-A9WD-06A-11R-A38C-07 TCGA-BF-A1PU-01A-11R-A18S-07## A1BG                      360.8794                     176.3994## A1CF                        0.7092                       0.0000## A2BP1                       6.3830                       1.2987## A2LD1                      97.1986                     163.2338## A2ML1                       0.0000                       7.7922##       TCGA-BF-A1PV-01A-11R-A18U-07## A1BG                      216.8470## A1CF                        0.0000## A2BP1                       0.0000## A2LD1                      60.8727## A2ML1                       0.5977
#The identifiers in the RNA-seq data are transformed to be the same as the ones in the clinical data. This includes substringing the first 12 characters and replacing '-' with '.'rid = tolower(substr(colnames(rnaseqSKCM),1,12))rid = gsub('-', '.', rid)  colnames(rnaseqSKCM) = rid length(intersect(rid,rownames(clinSKCM))) # 469 samples intersect between RNAseq and clinSKCM
## [1] 469
#Remove duplicated samples#Samples with duplicated names are removed. The data between the replicates are very similar however looking into the original barcodes, some of the duplicates are primary or metastatic, or a normal solid tumour. Tumours that are labeled primary, normal solid tumour or additional metastatic are removed. duplicatedSamples <- which(duplicated(colnames(rnaseqSKCM))) # 4  duplicate samplesduplicatedSampleNames <- colnames(rnaseqSKCM)[duplicated(colnames(rnaseqSKCM))]rnaseqMel_duplicated <- rnaseqSKCM[,colnames(rnaseqSKCM) %in% duplicatedSampleNames] #matrix of only those duplicated samplescolnames(rnaseqMel_duplicated)
## [1] 'tcga.d3.a1qa' 'tcga.d3.a1qa' 'tcga.er.a19t' 'tcga.er.a19t'## [5] 'tcga.er.a2nf' 'tcga.er.a2nf' 'tcga.gn.a4u8' 'tcga.gn.a4u8'
par(mfrow=c(2,2))plot(log2(rnaseqMel_duplicated[1001:2000,1:2]))plot(log2(rnaseqMel_duplicated[1001:2000,3:4]))plot(log2(rnaseqMel_duplicated[1001:2000,5:6]))plot(log2(rnaseqMel_duplicated[1001:2000,7:8]))

图1:图1:将重复样品一起绘制以评估相关性。它们看起来很相似,并且不明显要保留哪些副本。

研究了这些重复样品的完整TCGA条形码名称。此处给出了有关TCGA条形码的信息,此处提供了有关TCGA条形码的样本类型(例如,初级,转移,附加的metastsatic)的信息。

index <- which(colnames(rnaseqSKCM) %in% duplicatedSampleNames)rnaseqSKCM2 <- getData(readDataSKCM, 'RNASeq2GeneNorm')original_rnaseq_barcode <- colnames(rnaseqSKCM2)original_rnaseq_barcode[index]
## [1] 'TCGA-D3-A1QA-07A-11R-A37K-07' 'TCGA-D3-A1QA-06A-11R-A18T-07'## [3] 'TCGA-ER-A19T-06A-11R-A18U-07' 'TCGA-ER-A19T-01A-11R-A18T-07'## [5] 'TCGA-ER-A2NF-06A-11R-A18T-07' 'TCGA-ER-A2NF-01A-11R-A18T-07'## [7] 'TCGA-GN-A4U8-11A-11R-A32P-07' 'TCGA-GN-A4U8-06A-11R-A32P-07'

这些重复样品的样品类型是原发性实体(01)肿瘤,转移性(06),其他转移性(07)或实体组织(11)正常。

在这里,从重复,我将仅保留转移性肿瘤,因此我去除了重复的原发性实体瘤,另外的转移性肿瘤和实体组织正常。第1次重复样本:去除其他转移性第2次重复样本:去除原发肿瘤第3次重复样本:去除原发性第4次重复样本:去除实体组织正常

remove_index <- c('TCGA-D3-A1QA-07A-11R-A37K-07', 'TCGA-ER-A19T-01A-11R-A18T-07',  'TCGA-ER-A2NF-01A-11R-A18T-07', 'TCGA-GN-A4U8-11A-11R-A32P-07') #excluding these remove_index <- which(original_rnaseq_barcode %in% remove_index)rnaseqSKCM = rnaseqSKCM[,-remove_index] # getting rid of the duplicatedim(rnaseqSKCM) # from 473 samples to 469
## [1] 20501   469
length(intersect(colnames(rnaseqSKCM),rownames(clinSKCM)))  #469 samples interect between rnaseqSKCM and clinSKCM
## [1] 469
length(rownames(clinSKCM)) # there is 1 sample in clinMel which there is absent in rnaseqMel
## [1] 470
clinSKCM <- clinSKCM[intersect(colnames(rnaseqSKCM),rownames(clinSKCM)),]dim(clinSKCM)
## [1] 469  18
table(colnames(rnaseqSKCM)==rownames(clinSKCM)) # patient names are in the same order
## ## TRUE ##  469
#You may wish to create an expression set which contains both RNA-seq and expression matrix. library(Biobase)readES = ExpressionSet(as.matrix(log2(rnaseqSKCM+1)))pData(readES) = clinSKCM

生存数据清理和分析

生存分析:背景

为了分析总体生存,clinMel需要数据集中的3个变量,即“vital_status”,“days_to_death”和“days_to_last_followup”。

这里的谷歌论坛页面提供了相关信息

dim(clinSKCM)
## [1] 469  18
str(clinSKCM[,c('vital_status','days_to_death','days_to_last_followup')])
## 'data.frame':    469 obs. of  3 variables:##  $ vital_status         : chr  '1' '0' '1' '0' ...##  $ days_to_death        : chr  '518' NA '395' NA ...##  $ days_to_last_followup: chr  NA '2022' NA '387' ...
clinSKCM[1:5,c('vital_status','days_to_death','days_to_last_followup')]
##              vital_status days_to_death days_to_last_followup## tcga.3n.a9wb            1           518                  ## tcga.3n.a9wc            0                            2022## tcga.3n.a9wd            1           395                  ## tcga.bf.a1pu            0                             387## tcga.bf.a1pv            0                              14
  • vital_status: “1”表示已故,“0”表示仍然存在。

  • days_to_death:对于被除名的患者,days_to_death变量给出死亡前的天数。

  • days_to_last_followup:对于仍然活着的患者,days_to_last_followup变量给出了最后一次随访前的天数。

生存数据:探索性分析

在469名患者中,460名患者中,days_to_death和days_to_last_followup是相互驱散的; 如果在days_to_death中有一个NA,那么DaystoLastfollowup会有一个数字,反之亦然。其余的天数为day_to_death和days_to_last_followup。

table(!is.na(clinSKCM[,'days_to_death']) & is.na(clinSKCM[,'days_to_last_followup'])) #There are 220 patients who does not contain (NA) days_to_last_followup however contains days_to_death
## ## FALSE  TRUE ##   249   220
table(is.na(clinSKCM[,'days_to_death']) & !is.na(clinSKCM[,'days_to_last_followup'])) #There are 240 patients that do not contain (NA) days_to_death but contain days_to_last_followup
## ## FALSE  TRUE ##   229   240
table(is.na(clinSKCM$'days_to_death') & is.na(clinSKCM$'days_to_last_followup')) #there are 9 patinets that contain NA for both days_to_death and days_to_last_followup
## ## FALSE  TRUE ##   460     9

有9名患者同时将“days_to_death”和“days_to_last_followup”作为NA。

survivalVariables <- c('days_to_last_followup','vital_status','days_to_death')index <- is.na(clinSKCM[,'days_to_death']) & is.na(clinSKCM[,'days_to_last_followup'])clinSKCM[index,survivalVariables]
##              days_to_last_followup vital_status days_to_death## tcga.d3.a3c1                              0          ## tcga.d3.a3c3                              0          ## tcga.d3.a51g                              0          ## tcga.d3.a8go                              1          ## tcga.er.a19o                              1          ## tcga.fr.a3yo                              0          ## tcga.rp.a695                              0          ## tcga.rp.a6k9                              0          ## tcga.yd.a9tb                              0
dim(clinSKCM[index,survivalVariables])
## [1] 9 3

还有1名患者为负天数_to_last_followup。这是什么意思?

survivalVariables <- c('days_to_last_followup','vital_status','days_to_death')index <- which(clinSKCM[,'days_to_death'] < 0 | clinSKCM[,'days_to_last_followup'] < 0)clinSKCM[index,survivalVariables]
##              days_to_last_followup vital_status days_to_death## tcga.eb.a430                    -2            0

未完接第二篇  https://antonioahn./post/tcgadata_download/

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多