# HMDB database scraper ---------------------------------------------------
# HMDB holds several hundred thousand metabolites, but the well-annotated
# ones are organised into sub-databases (~10k compounds in total). We scrape
# each sub-database for metabolite names and HMDB accessions, then use the
# accessions to fetch pathway / taxonomy classifications.

# Load the packages required to read websites.
library(XML)
library("httr")
library("magrittr")
library("rvest")
library("xml2")
library("stringr")

# HMDBScraper lives on GitHub, so note the install method.
devtools::install_github("ChristianAmes/HMDBScraper")
library(HMDBScraper)

# Smoke test: read the first page of the BMI metabolomics table.
# NOTE: the original URL was missing its host ("https:///bmi_metabolomics").
web <- read_html("https://hmdb.ca/bmi_metabolomics")
pag1 <- web %>% html_table() %>% .[[1]]
pag1
# Part 1: BMI metabolomics -------------------------------------------------
dir.create("./HMDB爬虫/", showWarnings = FALSE)

# 294 is the total row count shown on the site; 25 rows are shown per page.
pages <- seq_len(ceiling(294 / 25))
# NOTE: the original URL was missing its host ("http://www./...").
url_all <- paste0("https://hmdb.ca/bmi_metabolomics?page=", pages)

# Read every page's first table, then bind once at the end
# (avoids the O(n^2) rbind-inside-a-loop pattern).
dat <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

head(dat)
write.csv(dat, "./HMDB爬虫/bim_HMDB.csv")
# Part 2: age metabolomics -------------------------------------------------
# 418 total rows, 25 per page.
pages <- seq_len(ceiling(418 / 25))
# NOTE: the original URL was missing its host ("https:///age_metabolomics").
url_all <- paste0("https://hmdb.ca/age_metabolomics?page=", pages)

# Fetch each page's table and bind once (no rbind-in-loop growth).
dat1 <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

dim(dat1)
head(dat1)
write.csv(dat1, "./HMDB爬虫/age_HMDB.csv")
# Part 3: gender metabolomics ----------------------------------------------
# 515 total rows, 25 per page.
pages <- seq_len(ceiling(515 / 25))
# NOTE: the original URL was missing its host ("https:///gender_metabolomics").
url_all <- paste0("https://hmdb.ca/gender_metabolomics?page=", pages)

# Fetch each page's table and bind once (no rbind-in-loop growth).
dat2 <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

dim(dat2)
head(dat2)
write.csv(dat2, "./HMDB爬虫/gender_HMDB.csv")
# Part 4: geno metabolomics ------------------------------------------------
# 6777 total rows, 25 per page.
pages <- seq_len(ceiling(6777 / 25))
# NOTE: the original URL was missing its host ("https:///geno_metabolomics").
url_all <- paste0("https://hmdb.ca/geno_metabolomics?page=", pages)

# Fetch each page's table and bind once (no rbind-in-loop growth).
dat3 <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

head(dat3)
dim(dat3)
write.csv(dat3, "./HMDB爬虫/geno_HMDB.csv")
# Part 5: pharmaco metabolomics --------------------------------------------
# 2497 total rows, 25 per page.
pages <- seq_len(ceiling(2497 / 25))
# NOTE: the original URL was missing its host ("https:///pharmaco_metabolomics").
url_all <- paste0("https://hmdb.ca/pharmaco_metabolomics?page=", pages)

# Fetch each page's table and bind once (no rbind-in-loop growth).
dat4 <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

head(dat4)
dim(dat4)
write.csv(dat4, "./HMDB爬虫/pharmaco_HMDB.csv")
# Part 6: diurnal metabolomics ---------------------------------------------
# 2315 total rows, 25 per page.
pages <- seq_len(ceiling(2315 / 25))
# BUG FIX: the original copy-pasted the *pharmaco* URL here, so this section
# re-downloaded pharmaco data and saved it as diurnal. Point it at the
# diurnal sub-database (and restore the missing host).
url_all <- paste0("https://hmdb.ca/diurnal_metabolomics?page=", pages)

# Fetch each page's table and bind once (no rbind-in-loop growth).
dat5 <- do.call(
  rbind,
  lapply(url_all, function(u) u %>% read_html() %>% html_table() %>% .[[1]])
)

head(dat5)
dim(dat5)
write.csv(dat5, "./HMDB爬虫/diurnal_HMDB.csv")
# Merge and clean all sub-database tables ----------------------------------
# Each Metabolite entry looks like "Name (HMDB0000001)"; split it into the
# plain metabolite name and the HMDB accession.
datall <- data.frame(
  ID = c(
    dat$Metabolite, dat1$Metabolite, dat2$Metabolite,
    dat3$Metabolite, dat4$Metabolite, dat5$Metabolite
  )
)
tail(datall, 20)

# Split on the literal "(HMDB" marker: part 1 is the name, part 2 the
# accession digits (still carrying a trailing ")").
id_parts <- strsplit(datall$ID, "[(]HMDB")
datall$metabolite <- sapply(id_parts, `[`, 1)
datall$HMDB_ID <- sapply(id_parts, `[`, 2)

# Drop the trailing ")" and restore the "HMDB" prefix removed by the split.
datall$HMDB_ID <- gsub(")", "", datall$HMDB_ID)
datall$HMDB_ID <- paste("HMDB", datall$HMDB_ID, sep = "")

write.csv(datall, "./HMDB爬虫/all_HMDB_ID_metabolites.csv")
# get.entry: taken from the in-development HMDBScraper package, which could
# not be used reliably as a whole, so this single function is inlined here.
#
# Fetch the full XML record for one HMDB metabolite and return it as a
# nested list.
#
# Args:
#   id:                 HMDB accession, e.g. "HMDB0000001".
#   prefix:             base URL of the metabolite XML endpoint.
#                       NOTE: the original default was missing its host
#                       ("http://www./metabolites/").
#   check_availability: if TRUE, verify the id exists first via
#                       check.availability() (from HMDBScraper).
#
# Returns: the parsed record as a nested list (XML::xmlToList output).
# Raises:  an error if the id cannot be found or the server is unreachable
#          after 3 retries.
get.entry <- function(id,
                      prefix = "https://hmdb.ca/metabolites/",
                      check_availability = TRUE) {
  if (check_availability) {
    if (!check.availability(id)) {
      stop(paste(id, " id could not be found"))
    }
  }

  # Create the download link for this record's XML.
  link <- paste(prefix, id, ".xml", sep = "")

  # Download the data; on failure retry up to 3 more times.
  txt <- try(readLines(link))
  i <- 1
  while (inherits(txt, "try-error")) {
    print(paste("Retrieving has failed for ", i, ". time", sep = ""))
    txt <- try(readLines(link))
    i <- i + 1
    if (i > 3) stop("Could not find Server or ID")
  }

  # Parse the XML text into a usable nested list.
  data <- XML::xmlTreeParse(txt, asText = TRUE)
  data <- XML::xmlToList(data)
  return(data)
}
# Annotate every HMDB accession with its KEGG id and taxonomy ---------------
tem <- read.csv("./HMDB爬虫/all_HMDB_ID_metabolites.csv", row.names = 1)
head(tem)
id0 <- tem$HMDB_ID

# Preallocate result vectors instead of growing them element by element.
n <- length(id0)
A <- character(n)  # KEGG id
B <- character(n)  # taxonomy description
C <- character(n)  # kingdom
D <- character(n)  # super_class
E <- character(n)  # class
G <- character(n)  # sub_class

# BUG FIXES vs the original loop:
#  * it started at a hard-coded resume index (373) instead of 1;
#  * every "not found" fallback wrote into B[i] (copy-paste error) instead
#    of the matching vector C/D/E/G;
#  * a failed try() was not checked, so `rec$kegg_id` could error out.
for (i in seq_len(n)) {
  rec <- try(get.entry(id = id0[i]), silent = FALSE)
  if (inherits(rec, "try-error")) next  # leave "" for unreachable records

  A[i] <- if (!is.null(rec$kegg_id)) rec$kegg_id else ""
  B[i] <- if (!is.null(rec$taxonomy$description)) rec$taxonomy$description else ""
  C[i] <- if (!is.null(rec$taxonomy$kingdom)) rec$taxonomy$kingdom else ""
  D[i] <- if (!is.null(rec$taxonomy$super_class)) rec$taxonomy$super_class else ""
  E[i] <- if (!is.null(rec$taxonomy$class)) rec$taxonomy$class else ""
  G[i] <- if (!is.null(rec$taxonomy$sub_class)) rec$taxonomy$sub_class else ""

  print(i)  # progress indicator
}

# Combine everything into one annotation table.
tax.hmdb <- data.frame(
  KEGGID = A,
  Descrip = B,
  Kingdom = C,
  Super_class = D,
  Class = E,
  Sub_class = G
)
# 根际互作生物学研究室 简介 (Rhizosphere Interaction Biology Lab — introduction)