大家好,我是生信技能树学徒,前面我们带来了大量的表达数据挖掘实战演练,但是TCGA数据库之丰富程度,值得我们花费多年时间继续探索,现在带来的是突变全景图,如果你对之前的教程感兴趣,可以点击学习 菜鸟团(周一数据挖掘专栏)成果展 

标题: Comprehensive and Integrative Genomic Characterization of Hepatocellular Carcinoma 链接: https://www./cell/fulltext/S0092-8674(17)30639-6 绘制全景图需要maf格式的突变信息文件以及临床信息文件。 还是从XENA上进行下载

需要注意,这里储存突变信息的文件需要是maf格式,和我们之前根据是否存在该基因的突变对样本进行分类的文件不同。 
tumor_type <- "LIHC" Rdata_file <- paste('./data/', tumor_type, '.phenoData.Rdata', sep = '') if (!file.exists( Rdata_file )) { phenoData <- read.table( destfile, header = T, sep = '\t', quote = '' ) rownames( phenoData ) <- phenoData[ , 1] colnames( phenoData )[1] <- "Tumor_Sample_Barcode" phenoData[1:5, 1:5] save( phenoData, file = Rdata_file ) }else{ load( Rdata_file ) }
这里是遇到的第一个坑:我们看一下临床信息的“Tumor_Sample_Barcode”,是16位的短ID,但是后来在使用read.maf读取maf文件时,发现下载的maf文件的“Tumor_Sample_Barcode”是长ID,就存在了两个ID不匹配,从而导致临床信息被直接略过了。我去github上翻看了一下作者的代码,read.maf也可以接受数据框。所以就把maf文件先读取进来,处理一下ID。 maf <- data.table::as.data.table(read.csv(file = "./raw_data/TCGA.LIHC.mutect.DR-10.0.somatic.maf.gz", header = TRUE, sep = '\t', stringsAsFactors = FALSE, comment.char = "#")) maf$Tumor_Sample_Barcode <- substr(maf$Tumor_Sample_Barcode, 1, 16)
require(maftools) ## 作者用到了HBV和HCV的临床信息 phenoData$HBV <- ifelse(phenoData$hist_hepato_carc_fact == 'Hepatitis B', 'HBV', 'others') phenoData$HCV <- ifelse(phenoData$hist_hepato_carc_fact == 'Hepatitis C', 'HCV', 'others') phenoData[phenoData$neoplasm_histologic_grade == ""] <- 'no_reported' ## 这个函数不强求直接读取文本文件,也可以读取数据变量 laml <- read.maf(maf, clinicalData = phenoData) laml
laml@data <- laml@data[grepl('PASS', laml@data$FILTER), ]
接下来绘图遇到了第二个坑,关于factor的问题,以及颜色的对应关系的列表如何制作,绘图的函数怎么调用颜色信息。 library(RColorBrewer) png(paste0('oncoplot_top26_phone', tumor_type, '.png'), res = 150, width = 1500, height = 1080)
## 文章中这些driver gene是Mutsig挑选出来的,文章里面提供了,就直接使用了这个数据 genes = c("TP53", "CTNNB1", "ALB", "AXIN1", "BAP1", "KEAP1", "NFE2L2", "LZTR1", "RB1", "PIK3CA", "RPS6KA3", "AZIN1", "KRAS", "IL6ST", "RP1L1", "CDKN2A", "EEF1A1", "ARID2", "ARID1A", "GPATCH4", "ACVR2A", "APOB", "CREB3L3", "NRAS", "AHCTF1", "HIST1H1C")
## 为突变类型的分类数据设置颜色 variantClass <- names(table(laml@data$Variant_Classification)) col = c(RColorBrewer::brewer.pal(n = 4, name = 'Set1'), RColorBrewer::brewer.pal(n = 5, name = 'Set2')) names(col) = variantClass col
## 绘图的时候我们使用的数据是laml,临床信息在clinical.data里面 ## 绘图函数要求这些设置颜色的数据是factor,所以我们要把加到图上的 ## 临床信息转变为因子 laml@clinical.data$neoplasm_histologic_grade <- as.factor(laml@clinical.data$neoplasm_histologic_grade) gradecolors = RColorBrewer::brewer.pal(n = 4,name = 'Spectral') names(gradecolors) = levels(laml@clinical.data$neoplasm_histologic_grade)
laml@clinical.data$race.demographic <- as.factor(laml@clinical.data$race.demographic) Racecolors = RColorBrewer::brewer.pal(n = 5,name = 'Spectral') names(Racecolors) = levels(laml@clinical.data$race.demographic)
laml@clinical.data$gender.demographic <- as.factor(laml@clinical.data$gender.demographic) Gendercolors = c("#b3e2cd", "#fb9a99") names(Gendercolors) = levels(laml@clinical.data$gender.demographic)
laml@clinical.data$HBV <- as.factor(laml@clinical.data$HBV) HBVcolors = c("#ffffb3", "#e31a1c") names(HBVcolors) = levels(laml@clinical.data$HBV)
laml@clinical.data$HCV <- as.factor(laml@clinical.data$HCV) HCVcolors = c("#1b9e77", "#fc8d62") names(HCVcolors) = levels(laml@clinical.data$HCV)
## 绘图函数需要一个list phecolors = list(neoplasm_histologic_grade = gradecolors, race.demographic = Racecolors, gender.demographic = Gendercolors, HBV = HBVcolors, HCV = HCVcolors) ## clinicalFeatures是从laml@clinical.data里面挑取数据,所以 ## 一定要是laml@clinical.data里面的列名 oncoplot(maf = laml, colors = col, bgCol = "#ebebeb", borderCol = "#ebebeb", genes = genes, GeneOrderSort = F, keepGeneOrder = T, fontSize = 7 , legendFontSize = 7, annotationFontSize = 7, annotationTitleFontSize = 7, sortByMutation = T, showTumorSampleBarcodes = F, annotationColor = phecolors, clinicalFeatures = c("neoplasm_histologic_grade", "race.demographic", "gender.demographic", "HBV", "HCV")) dev.off()
|