任务
<font color=orange>差异基因分析背后的统计学知识</font>
一:样品间(组间的数据)的表达标准化
二:假设检验进行差异表达基因的鉴定
<font color=orange>用DESeq2进行差异表达分析</font>一:DESeq2的安装source("https:///biocLite.R") 载入安装工具bioconductor biocLite("DESeq2") bioconductor安装DESEQ2 library(DESeq2) 二:DESeq2差异分析基本流程
三:构建dds矩阵
dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design= ~ batch + condition)
KD1 <- read.table("mus_E14_cell_Akap95_shRNA_rep1_SRR3589959.count",sep = "\t",col.names = c("gene_id","rep1")) KD2 <- read.table("mus_E14_cell_Akap95_shRNA_rep2_SRR3589960_matrix.count",sep = "\t",col.names = c("gene_id","rep2")) control1 <- read.table("mus_E14_cell_control_shRNA_rep1_SRR3589961_matrix.count",sep = "\t",col.names = c("gene_id","control1")) control2 <- read.table("mus_E14_cell_control_shRNA_rep2_SRR3589962_matrix.count",sep = "\t",col.names = c("gene_id","control2"))
raw_count <- merge(merge(merge(KD1,KD2,by="gene_id"),control1,by="gene_id"),control2,by="gene_id")
count_data <- raw_count[,2:5] row.names(count_data) <- raw_count[,1] condition <- factor(c("rep","rep","condition","condition")) col_data <- data.frame(row.names = colnames(count_data), condition) dds <- DESeqDataSetFromMatrix(countData = count_data,colData = col_data,design = ~ condition)
nrow(dds) ## 显示52636行 dds_filter <- dds[ rowSums(counts(dds))>1, ] ##将所有样本基因表达量之和小于1的基因过滤掉 nrow(dds_filter) ##显示27101行,过滤掉了近50% 四:对dds矩阵进行DESeq分析:
dds_out <- DESeq(dds_filter) res <- results(dds_out) summary(res)
五:提取差异基因分析的结果
table(res$padj<0.05) res_deseq <- res[order(res$padj),] diff_gene_deseq2 <- subset(res_deseq, padj<0.05 & (log2FoldChange > 1 | log2FoldChange < -1)) ## or diff_gene_deseq2 <- subset(res_desq, padj<0.05 & abs(log2FoldChange) >=1) diff_gene_deseq2 <- row.names(diff_gene_deseq2) res_diff_data <- merge(as.data.frame(res),as.data.frame(counts(dds_out,normalize=TRUE)),by="row.names",sort=FALSE) write.csv(res_diff_data,file = "11_30_mouse_data.csv",row.names = F)
六 MA图
library(ggplot2) plotMA(res,ylim=c(-5,5)) topGene <- rownames(res)[which.min(res$padj)] with(res[topGene, ], { points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2) text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue") }) image
resLFC <- lfcShrink(dds_out,coef = 2,res=res) plotMA(resLFC, ylim=c(-5,5)) topGene <- rownames(resLFC)[which.min(res$padj)] with(resLFC[topGene, ], { points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2) text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue") }) idx <- identify(res$baseMean, res$log2FoldChange) image 七:gene 聚类热图
library(genefilter) library(pheatmap) rld <- rlogTransformation(dds_out,blind = F) write.csv(assay(rld),file="mm.DESeq2.pseudo.counts.csv") topVarGene <- head(order(rowVars(assay(rld)),decreasing = TRUE),20) mat <- assay(rld)[ topVarGene, ] ### mat <- mat - rowMeans(mat) 减去一个平均值,让数值更加集中。第二个图 anno <- as.data.frame(colData(rld)[,c("condition","sizeFactor")]) pheatmap(mat, annotation_col = anno) 结果两个热图: image image 八:火山图#### perl 单行:第三列log2foldchange大于1为上调,小于-1下调,其他的不显著。 perl -F'\t' -alne '$F[5]=~s/\r//;if(/baseMean/){print "gene\tlog2FoldChange\tpadj\tsignificant"} else{unless(/NA/){if($F[2] >=1 && $F[5]<0.05){print "$F[0]\t$F[2]\t$F[5]\tup"} elsif($F[2]<=-1 && $F[5]<0.05){print "$F[0]\t$F[2]\t$F[5]\tdown"} else{print "$F[0]\t$F[2]\t$F[5]\tno"}}}' mm_deseq2_resLFS.csv >volcano.txt ### 火山图 data <-read.table(file="volcano.txt",header = TRUE, row.names =1,sep = "\t") volcano <-ggplot(data = volcano_data,aes(x=log2FoldChange,y= -1*log10(padj)))+geom_point(aes(color=significant))+scale_color_manual(values = c("red","grey","blue")) + labs(title="Volcano_Plot",x=expression((log[2](FC)), y=expression(-log[10](padj)) ))+geom_hline(yintercept=1.3,linetype=4)+geom_vline(xintercept=c(-1,1),linetype=4) volcano image
参考文章
|
|
来自: 新用户3677sdB0 > 《转录组学习》