将比对好的fasta序列转换成relaxed phylip格式 | Public Library of Bioinformatics

勤悦轩 2015-10-23

展开全文

有时,在推断进化树的过程中,我们希望phylip格式的文件能够将序列的名称完整得保留下来。通常情况下,我们的名称会超过10字符, 而ClustalX和MUSCLE等软件给出的phylip文件是严格的phylip格式,物种名称不能超过10个字符。 Clustal和MUSCLE还给出了FASTA格式,这种格式则完整得保留了各序列的名称.

为了从比对好的FASTA文件直接生成 relaxed phylip格式的文件, 这里提供了R的源代码. 以供有兴趣的同仁使用,并提出宝贵意见.

### Convert Aligned Fasta to relax phylip
 
getsequence.fasta <- function (x = NULL)
{
    if (!inherits(x, "fasta")) {
        stop("Make sure the data is a fasta object.")
    }
    if (is.null(x)) {
        stop("You have to specify the input data.")
    }
    result = x[!grepl(">", x)]
    return(result)
}
 
getnames.fasta <- function (x = NULL)
{
    if (!inherits(x, "fasta")) {
        stop("Make sure the data is a fasta object.")
    }
    if (is.null(x)) {
        stop("You have to specify the input data.")
    }
    result = x[grepl(">", x)]
    result = gsub(">", "", result)
    return(result)
}
 
dat2relax.phylip <- function (input, write = TRUE)
{
    row1.1 <- nrow(input)
    row1.2 <- nchar(as.character(input[1, 2]))
    row1 <- paste(row1.1, row1.2)
    space <- as.vector(max(nchar(as.character(input[,1]))) +
                        1 - nchar(as.character(input[, 1])))
    res <- c()
    for(i in 1:length(space)){
       res[i] <-  paste(input[i, 1],
                  paste(rep(" ", space[i]), collapse = ""),
                        input[i, 2],  collapse = "")
    }
    res <- c(row1, res)
    return(res)
}
 
 
read.fasta <- function (file = NULL)
{
    if (is.null(file)) {
        stop("Please specify the input fasta file.")
    }
    result <- readLines(file)
    nameline <- result[grepl("[>]", result)]
    test <- regexpr(">", nameline) > 1
    if (any(test)) {
        warning(paste("\">\" in line(s)", which(test),
                "\n appeared not at the beginning.
                 \n Please remove any character(s) before \">\"."))
    }
    result = result[grepl("[A-Za-z0-9]", result)]
    result <- ConvFas(result, "fas")
    if (any(regexpr(">", result[seq(1, length(result), by = 2)]) <
        0)) {
        xx <- 2 * which(regexpr(">", result[seq(1, length(result),
            by = 2)]) < 0)
        if (length(xx) > 10) {
            xx <- xx[1:10]
            stop(paste("readfasta could not find \">\" in row: \n",
                paste(xx, collapse = ", "), "... \n", "Make sure the file ",
                file, " is in fasta format.\n"))
        }
    }
    class(result) <- "fasta"
    return(result)
}
 
ConvFas <- function (fil = NULL, type = c("fas", "nxs", "phy"))
{
    match.arg(type)
    dna = fil
    if (type == "fas") {
        seqNamPos = grep("^>", dna)
        pos = c(seqNamPos, length(dna) + 1)
        seqNam = dna[seqNamPos]
    }
    if (type == "nxs") {
        dna = dna[(grep("matrix", dna, ignore.case = TRUE) +
            1):length(dna)]
        dna = dna[-which(dna == "" | dna == "end;" | dna == ";")]
        seqNam = unique(substr(dna, 1, regexpr(" ", dna) - 1))
    }
    if (type == "phy") {
        dna = dna[regexpr("[ATGC-]", dna) > 0]
        seqNam = substr(dna, 1, regexpr(" ", dna) - 1)
        seqNam = seqNam[-which(seqNam == "")]
    }
    nSeq = length(seqNam)
    for (i in 1:nSeq) {
        if (type == "fas") {
            st = pos[i] + 1
            ed = pos[i + 1] - 1
            stri = gsub(" ", "", paste(dna[st:ed], collapse = ""))
        }
        if (type == "nxs" | type == "phy") {
            nBlock = length(dna)/length(seqNam)
            rNam = ((1:nBlock) - 1) * nSeq + i
            stri = gsub("[ -]", "", gsub(seqNam[i], "", paste(dna[rNam],
                collapse = "")))
            stri = toupper(stri)
        }
        Nam = paste(">", seqNam[i], sep = "")
        if (i == 1) {
            DNA = c(Nam, stri)
        }
        if (i > 1) {
            DNA = c(DNA, Nam, stri)
        }
    }
    DNA = gsub(">>", ">", DNA)
    class(DNA) <- "fasta"
    return(DNA)
}
 
dat <- read.fasta("input.fasta")
nam <- getnames.fasta(dat)
seq <- getsequence.fasta(dat)
dat2 <- data.frame(nam, seq)
res <- dat2relax.phylip(dat2)
writeLines(res, "output.phy")

来自：http://blog.sciencenet.cn/blog-255662-529962.html