背景 KEGG.db的包自2011年后不再更新,clusterprofiler做KEGG分析的时候,可以使用KEGG提供的API对所需物种信息进行获取,但是KEGG的服务器有时候会断网,这对我们批量进行数据分析的时候,会发生错误而分析中断。另外,目前我们在云上使用docker是无法联网的(不是云平台的原因)。因此我需要自己自备数据分析来获得更为全面正确的分析结果。
KEGG.db基本信息查看 > library ('KEGG.db' ) > ls('package:KEGG.db' ) # [1 ] 'KEGG' 'KEGG_dbconn' 'KEGG_dbfile' 'KEGG_dbInfo' 'KEGG_dbschema' [6 ] 'KEGGENZYMEID2GO' 'KEGGEXTID2PATHID' 'KEGGGO2ENZYMEID' 'KEGGMAPCOUNTS' 'KEGGPATHID2EXTID' [11 ] 'KEGGPATHID2NAME' 'KEGGPATHNAME2ID' > columns(KEGG.db) Error in columns(KEGG.db) : object 'KEGG.db' not found > # columns(org.Hs.eg.db) # show colums in .db > # columns(KEGG.db) # Error in columns(KEGG.db) : object 'KEGG.db' not found > # available.dbschemas() > frame = toTable(KEGGPATHID2EXTID) > head(frame) pathway_id gene_or_orf_id1 hsa00232 10 2 hsa00983 10 3 hsa01100 10 4 hsa00230 100 5 hsa01100 100 6 hsa05340 100 > frame = toTable(KEGGPATHID2NAME) > head(frame) path_id path_name1 hsa00010 Glycolysis / Gluconeogenesis2 hsa00020 Citrate cycle (TCA cycle)3 hsa00030 Pentose phosphate pathway4 hsa00040 Pentose and glucuronate interconversions5 hsa00051 Fructose and mannose metabolism6 hsa00052 Galactose metabolism > class(KEGGPATHID2EXTID) [1 ] 'AnnDbBimap' attr(,'package' ) [1 ] 'AnnotationDbi'
ls('package:KEGG.db')
返回的结果是一些AnnObObj
(Bimap objects)。是一种老的AnnotationDbi
的interface,现在已经不怎么建议使用了。现在推荐使用select方法(columns()
、cols()
、keytypes()
等)。
生成简易版KEGG.dbR包的代码 0. 最终的目录结构如下: KEGG(目录,手动创建) ├── DESCRIPTION(文件,手动创建) ├── inst(目录) │ └── extdata(目录) │ └── KEGG.sqlite(文件,通过代码生成) ├── LICENSE(文件,手动创建) ├── NAMESPACE(文件,手动创建) └── R(目录,手动创建) └── zzz.R(文件,手动创建)
1. DESCRIPTION文件 参考2011版本的KEGG.db来写:
Package: KEGG.db Title: KEGG.db for KEGG enrichment analysis. Description: KEGG.db for KEGG enrichment analysis. Version: 1.0 Author: xxx <xxx@xxx.xxx.com> Maintainer: xxx <xxx@xxx.xxx.com> Depends: R (>= 2.7 .0 ), methods, AnnotationDbi (>= 1.44 .0 ) Imports: methods, AnnotationDbi Suggests: DBI License: BSD License_restricts_use: yes biocViews: AnnotationData, FunctionalAnnotation
2. LICENSE文件 参考2011版本的KEGG.db来写:
Free for academic use. Non-academic users are requested to obtain a license agreement with KEGG.
3. NAMESPACE文件 参考2011版本的KEGG.db来写:
import(methods) import(AnnotationDbi)### Only put what is statically exported here. All the AnnObj instances ### created at load time are dynamically exported (refer to R/zzz.R for ### the details). export( KEGG, KEGG_dbconn, KEGG_dbfile, KEGG_dbschema, KEGG_dbInfo )
4. Bimap的生成 查看clusterprofiler的enrichKEGG函数,当使用KEGG.db的数据时,使用了get_data_from_KEGG_db()函数进行数据获取。
get_data_from_KEGG_db <- function (species) { PATHID2EXTID <- as.list(get_KEGG_db('KEGGPATHID2EXTID' )) if (!any(grepl(species, names(PATHID2EXTID)))) { stop ('input species is not supported by KEGG.db...' ) } idx <- grep(species, names(PATHID2EXTID)) PATHID2EXTID <- PATHID2EXTID[idx] PATHID2EXTID.df <- stack(PATHID2EXTID) PATHID2EXTID.df <- PATHID2EXTID.df[, c(2 ,1 )] PATHID2NAME <- as.list(get_KEGG_db('KEGGPATHID2NAME' )) PATHID2NAME.df <- data.frame(path=paste0(species, names(PATHID2NAME)), name=unlist(PATHID2NAME)) build_Anno(PATHID2EXTID.df, PATHID2NAME.df) }
get_KEGG_db <- function (kw) { annoDb <- 'KEGG.db' suppressMessages(requireNamespace(annoDb)) eval(parse(text=paste0(annoDb, '::' , kw))) }
get_data_from_KEGG_db()使用get_KEGG_db()函数从'KEGG.db'中获取了'KEGGPATHID2EXTID'和'KEGGPATHID2NAME'这两个AnnDbBimap。
具体来说,对于get_KEGG_db('KEGGPATHID2EXTID')其实就是执行KEGG.db::KEGGPATHID2EXTID这一条命令。as.list(get_KEGG_db('KEGGPATHID2EXTID'))就是将KEGG.db::KEGGPATHID2EXTID这一条命令返回的结果转成list。
> x <- as.list(KEGG.db::KEGGPATHID2EXTID) > head(x,2 ) $hsa00232 [1 ] '10' '1544' '1548' '1549' '1553' '7498' '9' $hsa00983 [1 ] '10' '1066' '10720' '10941' '151531' '1548' '1549' '1551' '1553' [10 ] '1576' '1577' '1806' '1807' '1890' '221223' '2990' '3251' '3614' [19 ] '3615' '3704' '51733' '54490' '54575' '54576' '54577' '54578' '54579' [28 ] '54600' '54657' '54658' '54659' '54963' '574537' '64816' '7083' '7084' [37 ] '7172' '7363' '7364' '7365' '7366' '7367' '7371' '7372' '7378' [46 ] '7498' '79799' '83549' '8824' '8833' '9' '978'
那AnnDbBimap是怎么生成的?在AnnotationDbi的createAnnObjs.KEGG_DB.R文件里有生成bimap的代码:
KEGG_DB_AnnDbBimap_seeds <- list( list( objName='PATHID2NAME' , Class='AnnDbBimap' , L2Rchain=list( list( tablename='pathway2name' , Lcolname='path_id' , Rcolname='path_name' ) ) ), list( objName='PATHID2EXTID' , Class='AnnDbBimap' , L2Rchain=list( list( tablename='pathway2gene' , Lcolname='pathway_id' , Rcolname='gene_or_orf_id' ) ) ), list( objName='ENZYMEID2GO' , Class='AnnDbBimap' , L2Rchain=list( list( tablename='ec2go' , Lcolname='ec_no' , Rcolname='go_id' ) ) ) ) createAnnObjs.KEGG_DB <- function (prefix, objTarget, dbconn, datacache) { checkDBSCHEMA(dbconn, 'KEGG_DB' ) ## AnnDbBimap objects seed0 <- list( objTarget=objTarget, datacache=datacache ) ann_objs <- createAnnDbBimaps(KEGG_DB_AnnDbBimap_seeds, seed0) ## Reverse maps ann_objs$PATHNAME2ID <- revmap(ann_objs$PATHID2NAME, objName='PATHNAME2ID' ) ann_objs$EXTID2PATHID <- revmap(ann_objs$PATHID2EXTID, objName='EXTID2PATHID' ) ann_objs$GO2ENZYMEID <- revmap(ann_objs$ENZYMEID2GO, objName='GO2ENZYMEID' ) ## 1 special map that is not an AnnDbBimap object (just a named integer vector) ann_objs$MAPCOUNTS <- createMAPCOUNTS(dbconn, prefix) prefixAnnObjNames(ann_objs, prefix) }
5. R/zzz.R 具体参考2011版本的KEGG.db。
datacache <- new.env(hash=TRUE , parent=emptyenv()) KEGG <- function () showQCData('KEGG' , datacache) KEGG_dbconn <- function () dbconn(datacache) KEGG_dbfile <- function () dbfile(datacache) KEGG_dbschema <- function (file='' , show.indices=FALSE ) dbschema(datacache, file=file, show.indices=show.indices) KEGG_dbInfo <- function () dbInfo(datacache) .onLoad <- function (libname, pkgname) { ## Connect to the SQLite DB dbfile <- system.file('extdata' , 'KEGG.sqlite' , package=pkgname, lib.loc=libname) assign('dbfile' , dbfile, envir=datacache) dbconn <- dbFileConnect(dbfile) assign('dbconn' , dbconn, envir=datacache) ## Create the AnnObj instances ann_objs <- createAnnObjs.SchemaChoice('KEGG_DB' , 'KEGG' , 'KEGG' , dbconn, datacache) mergeToNamespaceAndExport(ann_objs, pkgname) packageStartupMessage(AnnotationDbi:::annoStartupMessages('KEGG.db' )) } .onUnload <- function (libpath) { dbFileDisconnect(KEGG_dbconn()) }
这里,为了简化过程,我们通过AnnotationDbi包的createAnnObjs.SchemaChoice()函数调用了AnnotationDbi里的createAnnObjs.KEGG_DB.R来产生AnnDbBimap。你也可以自己自定义一个加到zzz.R里。具体操作可以参考AnnotationForge,version 2.11 of Bioconductor的Creating an annotation package with a new database schema
6. 添加clusterProfiler做KEGG富集分析所需数据 packagedir <- '你创建的KEGG目录所在的路径' sqlite_path <- paste(packagedir, sep=.Platform$file.sep, 'inst' , 'extdata' )if (!dir.exists(sqlite_path)){dir.create(sqlite_path,recursive = TRUE )} dbfile <- file.path(sqlite_path, 'KEGG.sqlite' ) unlink(dbfile)################################################### ### download data ################################################### species <- 'hsa' # KEGG的物种列表查看: # https://www./kegg/catalog/org_list.html # 或者 # http://rest./list/organism kegg <- clusterProfiler::download_KEGG(species) KEGGPATHID2NAME <- kegg$KEGGPATHID2NAME colnames(KEGGPATHID2NAME) <- c('path_id' ,'path_name' ) KEGGPATHID2NAME$path_id <- sub(species,'' ,KEGGPATHID2NAME$path_id) KEGGPATHID2EXTID <- kegg$KEGGPATHID2EXTID colnames(KEGGPATHID2EXTID) <- c('pathway_id' ,'gene_or_orf_id' )################################################### ### create database ################################################### ## Create the database file library (RSQLite) drv <- dbDriver('SQLite' ) db <- dbConnect(drv, dbname=dbfile)## dbDisconnect(db) ################################################### ### put the data into the tables ################################################### dbWriteTable(conn = db, 'pathway2name' , KEGGPATHID2NAME, row.names=FALSE ) dbWriteTable(conn = db, 'pathway2gene' , KEGGPATHID2EXTID, row.names=FALSE ) dbListTables(db) test <- dbReadTable(conn = db,'pathway2name' ) head(test) test <- dbReadTable(conn = db,'pathway2gene' ) head(test)################################################### ### append the metadata ################################################### #dbSendQuery(conn = db,'drop table if exists metadata') metadata <- rbind(c('PATHNAMESOURCENAME' , 'KEGG PATHWAY' ), c('PATHNAMESOURCEURL' , 'ftp://ftp./pub/kegg/pathway' ), c('PATHNAMESOURCEDATE' , format(Sys.Date(), '%Y%m%d' )), c('KEGGSOURCENAME' , 'KEGG GENOME' ), c('KEGGSOURCEURL' , 'ftp://ftp./pub/kegg/genomes' ), c('KEGGSOURCEDATE' , format(Sys.Date(), '%Y%m%d' )), c('GOEXTSOURCEDATE' , '2015-Sepec2go27' ), c('GOEXTSOURCENAME' , 'Gene Ontology External Link' ), c('GOEXTSOURCEURL' , 'http://www./external2go' ), c('Db type' , 'KEGGDB' ), c('DBSCHEMA' , 'KEGG_DB' ), c('DBSCHEMAVERSION' , '2.1' )) metadata <- as.data.frame(metadata) colnames(metadata) <- c('name' , 'value' ) #makeAnnDbPkg规定的 dbWriteTable(conn = db, 'metadata' , metadata, row.names=FALSE ) map.counts <- rbind(c('pathway2name' , nrow(KEGGPATHID2NAME)), c('pathway2gene' , nrow(KEGGPATHID2EXTID))) map.counts <- as.data.frame(map.counts) colnames(map.counts) <- c('map_name' ,'count' ) dbWriteTable(conn = db, 'map_counts' , map.counts, row.names=FALSE ) dbListTables(db) dbListFields(conn = db, 'metadata' ) dbReadTable(conn = db,'metadata' ) map.metadata <- rbind(c('ENZYMEID2GO' ,'Gene Ontology External Link' ,'http://www./external2go' ,'2015-Sepec2go27' ), c('GO2ENZYMEID' ,'Gene Ontology External Link' ,'http://www./external2go' ,'2015-Sepec2go27' ), c('EXTID2PATHID' ,'KEGG GENOME' ,'ftp://ftp./pub/kegg/genomes' ,'2011-Mar15' ), c('PATHID2EXTID' ,'KEGG GENOME' ,'ftp://ftp./pub/kegg/genomes' ,'2011-Mar15' ), c('PATHNAME2ID' ,'KEGG PATHWAY' ,'ftp://ftp./pub/kegg/pathway' ,format(Sys.Date(),'%Y%m%d' )), c('PATHID2NAME' ,'KEGG PATHWAY' ,'ftp://ftp./pub/kegg/pathway' ,format(Sys.Date(),'%Y%m%d' ))) map.metadata <- as.data.frame(map.metadata) colnames(map.metadata) <- c('map_name' ,'source_name' ,'source_url' ,'source_date' ) dbWriteTable(conn = db, 'map_metadata' , map.metadata, row.names=FALSE ) dbListTables(db) dbListFields(conn = db, 'map_metadata' ) dbReadTable(conn = db,'map_metadata' ) dbDisconnect(db)
7. 生成R包 新开一个RStudio :File → New Project
有1条warning,我给忽略了。
8. 测试刚刚生成的包 install.packages('D:/test/KEGG.db_1.0.tar.gz' , repos=NULL ,type = 'source' )library (KEGG.db)library (clusterProfiler) data(geneList, package='DOSE' ) head(geneList) gene <- names(geneList)[abs(geneList) > 2 ] head(gene) kk <- enrichKEGG(gene = gene, organism = 'hsa' , pvalueCutoff = 0.05 , qvalueCutoff = 0.05 , use_internal_data =T )# 使用线上拉取的数据,则use_internal_data = F head(kk) nrow(kk)
结果对比:
好了,打完收工。
注:最简单粗暴的方式其实是在安装了KEGG.db后,使用本文添加数据的方法,将sqlite里的数据直接替换掉。