就是一篇文章并没有使用TCGA数据库的指定癌症的生存信息去看自己感兴趣的基因的生存效应,反而舍近求远去下载BMC Cancer. 2011 文章数据,所以我怀疑TCGA应该是该基因在该癌症里面的生存效果不显著! 1.从UCSC Xena下载TCGA BRCA的表达矩阵HiSeqV2.gz,临床信息BRCA_clinicalMatrix,生存信息BRCA_survival.txt.gz,读入R。。 rm(list=ls()) options(stringsAsFactors = F) options(warn = -1)
library(data.table) ##读入TCGA_BRCA表达信息 exprSet <- fread("HiSeqV2.gz",header = T,sep = '\t') exprSet <- as.data.frame(exprSet) rownames(exprSet) <- exprSet[,1] exprSet <- exprSet[,-1] ## 从exprSet中提取PTP4A3的表达情况 dat <- exprSet["PTP4A3",]
##读入TCGA_BRCA生存信息 surdata <- read.table("BRCA_survival.txt.gz",header = T,sep = '\t') rownames(surdata) <- surdata$sample surdata <- surdata[,-1]
##读入TCGA_BRCA临床信息 pheno <- read.table("BRCA_clinicalMatrix",header = T,sep = '\t') rownames(pheno) <- pheno$sampleID pheno <- pheno[,-1]
如果你不知道如何下载TCGA数据库,可以看我以前的教程,我挑选了部分,写了6个数据下载系列教程:1)病人数据去重 table(duplicated(surdata$X_PATIENT)) #FALSE TRUE # 1097 139 choose_samples <- rownames(surdata[!duplicated(surdata$X_PATIENT),]) choose_samples[1:3] length(choose_samples) surdata <- surdata[choose_samples,] pheno <- pheno[choose_samples,] dat <- dat[,(colnames(dat) %in% choose_samples)]
choose_samples <- colnames(dat) surdata <- surdata[choose_samples,] pheno <- pheno[choose_samples,] dat <- dat[,(colnames(dat) %in% choose_samples)]
2)选择Primary Tumor 样本 pheno[1:3,1:3] table(pheno$sample_type) # Metastatic Primary Tumor Solid Tissue Normal # 7 1086 2 choose_samples <- rownames(pheno[pheno$sample_type=='Primary Tumor',]) surdata <- surdata[choose_samples,] pheno <- pheno[choose_samples,] dat <- dat[,(colnames(dat) %in% choose_samples)]
参考:【生信技能树】TCGA的28篇教程- 对TCGA数据库的任意癌症中任意基因做生存分析 dat_bak <- dat dat <- t(dat) dat <- as.data.frame(dat) dat$sampleID <- rownames(dat) surdata$sampleID <- rownames(surdata) surdata <- merge(surdata,dat,by='sampleID') surdata$PTP4A3 <- as.numeric(surdata$PTP4A3) surdata$g <- ifelse(surdata$PTP4A3 > median(surdata$PTP4A3),'high','low') table(surdata$g) #high low # 543 543
library(survival) table(surdata$OS) # 0 1 #939 147 table(surdata$OS.time>0) #FALSE TRUE # 13 1072 my.surv <- Surv(surdata$OS.time,surdata$OS==1) kmfit2 <- survfit(my.surv~surdata$g,data = surdata) library("survminer") ggsurvplot(kmfit2,conf.int =F, pval = T,risk.table =F, ncensor.plot = F) p=0.91,结果不显著。按照文献里写的用三阴性乳腺癌样本分析。 参考:TCGA数据库中三阴性乳腺癌在亚洲人群中的差异表达 colnames_num_tnbc <- grep('receptor_status',colnames(pheno)) colnames(pheno)[colnames_num_tnbc] eph <- pheno[,colnames_num_tnbc[1:3]] eph$tnbc_row_num <- apply(eph, 1, function(x) sum(x =='Negative')) ## 均为阴性的为三阴性乳腺癌 tnbc_samples <- rownames(pheno)[eph$tnbc_row_num == 3] length(tnbc_samples) #[1] 115 ##TCGA中tnbc病人只有115个
tnbc_surdata <- surdata[surdata$sampleID %in% tnbc_samples,] tnbc.surv <- Surv(tnbc_surdata$OS.time,tnbc_surdata$OS==1) tnbc.kmfit <- survfit(tnbc.surv~tnbc_surdata$g,data=tnbc_surdata) ggsurvplot(tnbc.kmfit,conf.int = F,pval = T)
p=0.47。还是不显著,参考公众号【生信技能树】文章《生存分析就是一个任人打扮的小姑凉》继续折腾。 主要就是使用survminer包 的surv_cutpoint函数 找基因表达量的分组阈值,根据这个阈值把基因分为high 和low 两组,而不是按照前面的median(surdata$PTP4A3) 分组。 surv.cut <- surv_cutpoint( surdata, time = "OS.time", event = "OS", variables = "PTP4A3" ) summary(surv.cut) plot(surv.cut, "PTP4A3", palette = "npg")
surv.cat <- surv_categorize(surv.cut)
surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3, data = surv.cat) ggsurvplot( surv.fit, # survfit object with calculated statistics. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = TRUE, # show confidence intervals for # point estimaes of survival curves. #xlim = c(0,3000), # present narrower X axis, but not affect # survival estimates. break.time.by = 1000, # break X axis in time intervals by 500. risk.table.y.text.col = T, # colour risk table text annotations. risk.table.y.text = FALSE # show bars instead of names in text annotations # in legend of risk table )
p=0.01!!! 再来分析一遍三阴性乳腺癌样本。 tnbc.surv.cut <- surv_cutpoint( tnbc_surdata, time = "OS.time", event = "OS", variables = "PTP4A3" ) summary(tnbc.surv.cut) plot(tnbc.surv.cut, "PTP4A3", palette = "npg")
tnbc.surv.cat <- surv_categorize(tnbc.surv.cut)
tnbc.surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3, data = tnbc.surv.cat) ggsurvplot( tnbc.surv.fit, # survfit object with calculated statistics. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = TRUE, # show confidence intervals for # point estimaes of survival curves. # xlim = c(0,3000), # present narrower X axis, but not affect # survival estimates. break.time.by = 1000, # break X axis in time intervals by 500. risk.table.y.text.col = T, # colour risk table text annotations. risk.table.y.text = FALSE # show bars instead of names in text annotations # in legend of risk table )
p=0.078。也离0.05比较接近了,大概数据量太少了吧(尬笑)
4.网页工具分析TCGA BRCA中PTP4A3基因的生存分析 TCGA数据库肯定不仅仅是生存分析那么简单啦,同样的
我写了部分常见的TCGA数据库用法:
|