Anders S, McCarthy DJ, Chen Y,
Okoniewski M, Smyth GK, Huber W, Robinson MD: Count-based
differential expression analysis of RNA sequencing data using R and
Bioconductor. Nature protocols 2013,
8(9):1765-1786.
1、 Assess
sequence quality control with shortread
>
library("ShortRead") #加载ShortRead包
> fqQC
= qa(dirPath = ".", pattern = ".fastq$", type =
"fastq") #调用ShortRead包里的qa()质量评估函数,文件的路径、文件名匹配模式、文件类型
> report(fqQC, type =
"html", dest = "fastqQAreport")
#调用ShortRead包里的函数report()总结质量评估
注:利用浏览器查看html文件
2、collect metadata of
experimental design
从NCBI上输入数据的
accession号,导出SraRunInfo.csv文件
> sri =
read.csv("SraRunInfo.csv", stringsAsFactors=FALSE)
#读入要处理的文件列表
> keep = grep("CG8144|Untreated-", sri$
LibraryName)#取出需要分析的文件
> sri = sri[keep,]
# trim
label利用模式匹配去除sri中LibraryName名称中的S2_DRSC
>
sri$LibraryName =
gsub("S2_DRSC_","",sri$LibraryName)
#取出LibraryName和LibraruLayout唯一的值
> samples
=
unique(sri[,c("LibraryName","LibraryLayout")])
#根据LibraryName值把sri中的文件分类到samples中,并根据LibraryLayout的值是单链还是双链,
给双链文件的文件命名,在原有名称后面加上_1.fastq或_2.fastq,并用,分开
> for(i in
seq_len(nrow(samples))) {
rw =
(sri$LibraryName == samples$LibraryName[i])
if(samples$LibraryLayout[i] ==
"PAIRED") {
samples$fastq1[i] =
paste0(sri$Run[rw],"_1.fastq",collapse = ",")
samples$fastq2[i] =
paste0(sri$Run[rw],"_2.fastq",collapse = ",")
} else {
samples$fastq1[i] =
paste0(sri$Run[rw],".fastq",collapse = ",")
samples$fastq2[i] = ""
}
}
##给samples增加condition和shortname两列(检查确保samples中的内容正确)
>
samples$condition = "CTL"
>
samples$condition[grep("RNAi",samples$LibraryName)]
= "KD"
>
samples$shortname =
paste(substr(samples$condition,1,2),
substr(samples$LibraryLayout,1,2),
seq_len(nrow(samples)), sep = ".")
3、align the reads
(using tophat2) to the reference genome
##利用R构建tophat的比对命令,并在命令行运行这些命令
> gf
=
"Drosophila_melanogaster.BDGP5.70.gtf"
#基因的注释文件
> bowind
= "Dme1_BDGP5_70" #bowtie的索引文件
> cmd
= with(samples, paste("tophat2 -G", gf, "-p 5
-o",
LibraryName, bowind, fastq1, fastq2))
#eg:tophat2 -G
Drosophila_melanogaster.BDGP5.70.gtf -p 5 -o Untreated-3
Dme1_BDGP5_70 \
SRR031714_1.fastq,SRR031715_1.fastq
SRR031714_2.fastq,SRR031715_2.fastq
4、organize, sort and index
the BaM files and create saM files
#利用R产生用于htseq-count和IGV的文件的samtools工具
> for(i in
seq_len(nrow(samples))) {
lib =
samples$LibraryName[i]
ob =
?le.path(lib, "accepted_hits.bam")
# sort by name, convert to SAM for
htseq-count
cat(paste0("samtools sort -n ",ob,"
",lib,"_sn"),"\n")
cat(paste0("samtools view -o ",lib,"_sn.sam
",lib,"_sn.bam"),"\n")
# sort by position and index for IGV
cat(paste0("samtools sort ",ob,"
",lib,"_s"),"\n")
cat(paste0("samtools index
",lib,"_s.bam"),"\n\n")
}
5、count
reads using htseq-count
#利用R生成htseq-count的命令
>
samples$countf =
paste(samples$LibraryName, "count", sep = ".")
> gf
= "Drosophila_melanogaster.BDGP5.70.gtf"
> cmd
= paste0("htseq-count -s no -a 10 ",
samples$LibraryName,
"_sn.sam ", gf," > ", samples$countf)
> cmd
R output:
htseq-count -s no -a 10 Untreated-3_sn.sam
Drosophila_melanogaster.BDGP5.70.gtf
> Untreated-3.count
6、利用biocouductor的edgeR包进行差异表达分析
|