你现在看到的,仍然是学徒大作,上个星期我带来的挖掘过程展现非常的受欢迎,所以再接再厉,希望大家喜欢 5分的生信数据挖掘文章全部图表复现(纯R代码)
https:///10.1371/journal.pone.0189675葡萄膜黑色素瘤是成人最常见的原发性眼内恶性肿瘤,在国外其发病率占眼内肿瘤的首位,在国内则仅次于视网膜母细胞瘤,居眼内肿瘤的第二位。主要经血循环转移,全身转移主要见于肝脏,肝转移所致死亡率高达50%。高表达的lncRNA-PVT1能够独立且有效地预测葡萄膜黑色素瘤(uveal melanoma, UVM)患者的差预后。由于最近接触TCGA数据比较多,每次都要去XENA官网上下载很麻烦,就解析了一下官网。 保存一下Xena的网页你会发现,源代码全是js,找不要想要的数据,搜索发现了“Selenium”这个强大的包,在正式处理这次数据之前,先介绍一下提取的结果。 1 “JDK”[ https://download.oracle.com/otn-pub/java/jdk/12+33/312335d836a34c7c8bba9d963e26dc23/jdk-12_windows-x64_bin.exe ] 2 “selenium” [https://selenium-release.storage.googleapis.com/3.9/selenium-server-standalone-3.9.1.jar ] 3 浏览器驱动任意选择一个(我的浏览器都是最新版,所以驱动我也选择了最新的): Chrome: http://npm./mirrors/chromedriver/2.9/ Firefox: https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-win64.zip 注:下载得到的都是exe,需要移动到你的浏览器安装目录下,并且将浏览器安装目录添加到你的环境变量。 4 “RSelenium”是一个R包,直接install.packages就可以安装。
library(RCurl) library(XML) library(RSelenium) ## 不想专门打开一个cmd,就在R里直接运行了,这些是我安装软件的路径,改成你自己的就可以啦 system(paste(''C:\Program Files\Java\jdk-10.0.2\bin\java.exe'', '-Dwebdriver.chrome.driver='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'', '-jar 'C:\Program Files\Java\selenium-server-standalone-3.9.1.jar''), wait = FALSE) ## 最开始没有找到 “RSelenium”包,所以是用 XML解析下面的网址,F12可以看到元素,我就 ## 直接复制出了源代码,在文件开头加上“<?xml version='1.0'?>” ## https:///datapages/?hub=https://tcga.:44 
if (!file.exists( './data/xena_url.Rdata' )) { webpage <- readLines('./Xena.htm') pagetree <- htmlTreeParse(webpage) # parse the tree by tables a <- getNodeSet(pagetree$children$html, '/html/body/li/a') href <- sapply(a, function(el) xmlGetAttr(el, 'href')) href <- paste('https:///datapages/', href, sep = '') abbr <- getNodeSet(pagetree$children$html, '/html/body/li/a/abbr') abbr <- as.data.frame(unlist(abbr)) abbr <- abbr[seq(3, nrow(abbr), by = 3), ] xena_url <- data.frame(url = href, tumor = abbr) save( xena_url, file = './data/xena_url.Rdata' ) }else{ load('./data/xena_url.Rdata') }
## 得到的对应关系如下:

如果上面这一段教程你们没有看懂,就放弃吧,是我刻意在秀技巧,总之,前面我们讲解过了
太多的TCGA数据库下载,你肯定学会了!
5分的生信数据挖掘文章全部图表复现(纯R代码) ## 下面就要开始下载我们这次的UVM数据了
tumor_type <- 'UVM' url <- xena_url[grep(tumor_type, xena_url$tumor), 1]
remDr <- remoteDriver(browserName = 'firefox', remoteServerAddr = 'localhost', port = 4444L) remDr$open() remDr$navigate(url)
webElems <- remDr$findElements(using = 'css selector', 'a') sub_type <- unlist(lapply(webElems, function(x) {x$getElementText()})) sub_type targetElem <- webElems[[grep('Phenotypes', sub_type)]] targetElem$clickElement() Sys.sleep(5) webElems <- remDr$findElements(using = 'css selector', 'a') sub_url <- unlist(lapply(webElems, function(x) {x$getElementText()})) phenoData_url <- sub_url[grep('http', sub_url)[1]] phenoData_url remDr$goBack()
webElems <- remDr$findElements(using = 'css selector', 'a') sub_type <- unlist(lapply(webElems, function(x) {x$getElementText()})) sub_type targetElem <- webElems[[grep('IlluminaHiSeq', sub_type)[2]]] targetElem$clickElement() Sys.sleep(5) webElems <- remDr$findElements(using = 'css selector', 'a') sub_url <- unlist(lapply(webElems, function(x) {x$getElementText()})) raw_data_url <- sub_url[grep('http', sub_url)[1]] raw_data_url
## 最终得到了两个URL,临床数据的下载地址phenoData_url和表达矩阵的下载地址raw_data_url
表达矩阵和临床信息 tumor_type <- 'UVM' Rdata_file <- paste('./data/TCGA.', tumor_type, '.Rdata', sep = '') if (!file.exists( Rdata_file )) { url <- raw_data_url url_word <- strsplit(url, split = '/') gz_file <- url_word[[1]][length(url_word[[1]])] destfile <- paste('./raw_data/TCGA.', tumor_type, '.', gz_file, sep = '') download.file(url, destfile) library(R.utils) gunzip(destfile, remove = F) library(data.table) raw_data <- fread( destfile, sep = ' ', header = T) raw_data <- as.data.frame( raw_data ) raw_data[1:5, 1:6] rownames( raw_data ) <- raw_data[, 1] raw_data <- raw_data[, -1] raw_data[1:5, 1:6] raw_data <- 2^raw_data - 1 raw_data <- ceiling( raw_data ) raw_data[1:5, 1:6] pick_row <- apply( raw_data, 1, function(x){ sum(x == 0) < 10 }) raw_data <- raw_data[pick_row, ] dim(raw_data ) save( raw_data, file = Rdata_file ) }else{ load( Rdata_file ) }
Rdata_file <- paste('./data/TCGA.', tumor_type, '.phenoData.Rdata', sep = '') if (!file.exists( Rdata_file )) { url <- phenoData_url url_word <- strsplit(url, split = '/') gz_file <- url_word[[1]][length(url_word[[1]])] destfile <- paste('./raw_data/TCGA.', gz_file, sep = '') download.file(url, destfile) phenoData <- read.table( destfile, header = T, sep = ' ', quote = '' ) name <- phenoData[ , 1] phenoData <- phenoData[ , -1] rownames( phenoData ) <- name phenoData[1:5, 1:3] save( phenoData, file = Rdata_file ) }else{ load( Rdata_file ) }
## Category : PVT1 gene expression
picked_gene_data <- as.data.frame(t(raw_data[rownames(raw_data) == 'PVT1',])) picked_gene_data[,2] <- rownames(picked_gene_data) picked_gene_data <- picked_gene_data[order(picked_gene_data[,1]),] PVT1_group <- c(rep('Low', floor(length(rownames(picked_gene_data))/2)), rep('High', ceiling(length(rownames(picked_gene_data))/2))) names(PVT1_group) <- factor(picked_gene_data[,2])
## OS library(survival) os.model <- phenoData[, c('OS.time', 'OS')]
os.model <- os.model[rownames(os.model) %in% names(PVT1_group),] os.model <- os.model[names(PVT1_group),] os.model$type <- PVT1_group
fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)
library(survminer) ggsurvplot(fit, pval = TRUE, conf.int = TRUE, linetype = 'type', surv.median.line = 'hv', ggtheme = theme_bw(), palette = c('#E7B800', '#2E9FDF') )

这可是相当之显著啊!!!
5分的生信数据挖掘文章全部图表复现(纯R代码) 代码由生信技能树学徒独立完成,我并没有进行任何干预!
|