【原】1-easy_microbiome项目-微生物数据整理和基本统计分析

微生信生物 2021-06-12

展开全文

写在前面：

easy_microbiome上线已经一年多了，但是一直挂到github上，星星也已经超过20个了

我一直也没有进行一个推送，这次进行一个推送，整体图片包含许多

https://github.com/taowenmicro/easy_microbiome

2. 基于数据处理结果的整体总计分析

提供多种传统流程转化phyloseq的方式，方便使用后续流程。
提供将追踪序列文件可视化的代码；
基于phyloseq对样品和OTU使用多种标准进行过滤（目前是我见过最为方便的过滤方式）；
使用代码直接书写文章中扩增子部分结果分析的基本统计部分的段落内容。

原始序列处理过程的track分析，统计序列数量的变化

track文件即为序列追踪文件，追踪每个步骤保留序列的数量本track使用的是R全套原始数据处理流程。

library("phyloseq")
track = read.delim("./a9_sum_and_track/dada2_track.txt")

ps = readRDS("./a3_DADA2_table/ps.rds")
design = as.data.frame(sample_data(ps))
head(design)

##                    ID SampleType env1  env2 env3 env4   env5  env6  env7
## SRR8247254 SRR8247254          C 7.25 13.35 1.55 8.64 131.44 50.64 13.83
## SRR8247256 SRR8247256          D 6.65 12.20 1.47 8.31 130.21 38.24 12.70
## SRR8247258 SRR8247258          D 6.74 13.13 1.47 8.91 124.07 39.26 13.84
## SRR8247260 SRR8247260          B 7.46 11.90 1.32 9.00 124.07 16.90 12.13
## SRR8247262 SRR8247262          B 7.26 11.50 1.32 8.68 103.19  5.72 12.55
## SRR8247264 SRR8247264          C 7.49 13.00 1.59 8.20 108.10 44.66 12.82
##             env8   env9 env10 env11 env12
## SRR8247254 67.69 599.74  5.72 29.71 32.27
## SRR8247256 53.29 518.65  3.69 23.23 26.38
## SRR8247258 59.29 556.72  4.71 25.86 28.73
## SRR8247260 55.69 538.29  3.69 20.80 31.21
## SRR8247262 41.29 508.62  1.65 17.97 21.67
## SRR8247264 65.29 660.90  4.71 33.16 27.43

index = merge(track,design[,c("ID","SampleType")],by = "row.names",all = F)
head(index)

##    Row.names input filtered denoised nonchim         ID SampleType
## 1 SRR8247254 27158    24638     5798    5596 SRR8247254          C
## 2 SRR8247256 39850    35623     9296    8861 SRR8247256          D
## 3 SRR8247258 25980    23502     6156    5799 SRR8247258          D
## 4 SRR8247260 21948    19815     3077    2917 SRR8247260          B
## 5 SRR8247262 23112    20080     2413    2335 SRR8247262          B
## 6 SRR8247264 30831    27254     4999    4643 SRR8247264          C

row.names(index) = index$Row.names
index$Row.names  = NULL
library(reshape2)
library("ggplot2")
track_l<- melt(index, id.vars=c("ID","SampleType"),value.name = "value")

head(track_l)

##           ID SampleType variable value
## 1 SRR8247254          C    input 27158
## 2 SRR8247256          D    input 39850
## 3 SRR8247258          D    input 25980
## 4 SRR8247260          B    input 21948
## 5 SRR8247262          B    input 23112
## 6 SRR8247264          C    input 30831

p = ggplot(track_l,aes(x = variable,y = value, fill = ID,color = SampleType)) +
  geom_line(aes(group = ID,color = SampleType))+
  geom_point(pch = 21,size = 3,color = "black")+
  labs(x = "step", y = "count")

  
p = p +theme_bw()+
  #theme_classic()+
  # scale_color_brewer( guide = guide_legend(title = NULL))+
  # scale_fill_brewer(guide = guide_legend(title = NULL))+
  theme(
    
    panel.grid.major=element_blank(),
    panel.grid.minor=element_blank(),
    
    plot.title = element_text(vjust = -8.5,hjust = 0.1),
    axis.title.y =element_text(size = 20,face = "bold",colour = "black"),
    axis.title.x =element_text(size = 24,face = "bold",colour = "black"),
    axis.text = element_text(size = 20,face = "bold"),
    axis.text.x = element_text(colour = "black",size = 14),
    axis.text.y = element_text(colour = "black",size = 14),
    legend.text = element_text(size = 15,face = "bold"))+
  #theme(legend.position = c(0.1,0.2))+
  
  theme(strip.text.x = element_text(size=15, angle=0),
        strip.text.y = element_text(size=12, face="bold"),
        strip.background = element_rect(colour="blue", fill="#CCCCFF"))+
  guides(fill=FALSE)
  # guides(color=FALSE)
p

if (length(unique(track_l$ID))>3){  p=p+theme(axis.text.x=element_text(angle=45,vjust=1, hjust=1))}

ggsave("./a0_track_for_all_count.pdf", p, width = 12, height =6 )

phyloseq对象取子集(样品过滤)

读入phyloseq数据并进行原始count整理取舍（过滤OTU）

我们整理完成的ps为原始文件。这个文件导入后还需要做一些事情：1.对otu表格整理：取舍一些otu或者不同分类等级的数据

ps = readRDS("./a3_DADA2_table//ps.rds")
sample_data(ps)

##                    ID SampleType env1  env2 env3  env4   env5  env6  env7
## SRR8247254 SRR8247254          C 7.25 13.35 1.55  8.64 131.44 50.64 13.83
## SRR8247256 SRR8247256          D 6.65 12.20 1.47  8.31 130.21 38.24 12.70
## SRR8247258 SRR8247258          D 6.74 13.13 1.47  8.91 124.07 39.26 13.84
## SRR8247260 SRR8247260          B 7.46 11.90 1.32  9.00 124.07 16.90 12.13
## SRR8247262 SRR8247262          B 7.26 11.50 1.32  8.68 103.19  5.72 12.55
## SRR8247264 SRR8247264          C 7.49 13.00 1.59  8.20 108.10 44.66 12.82
## SRR8247266 SRR8247266          C 6.67 13.85 1.51  9.18 117.93 39.36 13.13
## SRR8247268 SRR8247268          B 7.13 11.91 1.09 10.89 101.96  4.22 11.97
## SRR8247271 SRR8247271          D 6.58 14.75 1.69  8.75 128.99 49.48 12.83
## SRR8247272 SRR8247272          C 7.21 12.41 1.54  8.09  95.82 45.32 13.11
## SRR8247274 SRR8247274          D 7.49 13.00 1.59  8.20 108.10 44.66 12.82
## SRR8247276 SRR8247276          D 7.13 11.91 1.09 10.89 101.96  4.22 11.97
## SRR8247278 SRR8247278          B 7.70 12.15 1.06 11.50 105.64 14.82 12.53
## SRR8247280 SRR8247280          C 7.59 12.99 1.65  7.86 108.10 33.78 13.08
## SRR8247282 SRR8247282          C 6.74 13.13 1.47  8.91 124.07 39.26 13.84
## SRR8247284 SRR8247284          A 7.67 12.18 1.65  7.38 113.02 10.11 12.89
## SRR8247286 SRR8247286          A 7.61 12.51 1.61  7.78  97.05 13.20 13.97
## SRR8247288 SRR8247288          A 7.15 13.63 1.44  9.47 111.79 14.87 12.84
## SRR8247290 SRR8247290          A 6.89 13.45 1.54  8.74 115.47  9.40 12.10
## SRR8247292 SRR8247292          A 7.01 14.95 1.63  9.15 117.93 50.36 12.85
## SRR8247294 SRR8247294          A 7.39 13.06 1.82  7.17 100.73 35.62 12.80
## SRR8247296 SRR8247296          B 6.58 14.75 1.69  8.75 128.99 49.48 12.83
## SRR8247298 SRR8247298          B 6.81 13.58 1.51  8.99 125.30 43.56 12.72
## SRR8247299 SRR8247299          D 6.75 13.09 1.49  8.76  94.59 44.94 13.23
##             env8   env9 env10 env11 env12
## SRR8247254 67.69 599.74  5.72 29.71 32.27
## SRR8247256 53.29 518.65  3.69 23.23 26.38
## SRR8247258 59.29 556.72  4.71 25.86 28.73
## SRR8247260 55.69 538.29  3.69 20.80 31.21
## SRR8247262 41.29 508.62  1.65 17.97 21.67
## SRR8247264 65.29 660.90  4.71 33.16 27.43
## SRR8247266 61.69 605.74  5.72 27.27 28.70
## SRR8247268 50.89 513.71  7.76 15.51 27.63
## SRR8247271 78.49 677.08  9.79 35.37 33.34
## SRR8247272 61.69 554.32  5.72 26.06 29.91
## SRR8247274 65.29 660.90  4.71 33.16 27.43
## SRR8247276 50.89 513.71  7.76 15.51 27.63
## SRR8247278 67.69 636.46  5.72 28.49 33.48
## SRR8247280 61.69 627.77  5.72 28.49 27.48
## SRR8247282 59.29 556.72  4.71 25.86 28.73
## SRR8247284 71.29 651.23  6.74 29.91 34.65
## SRR8247286 60.49 643.66  3.69 29.31 27.50
## SRR8247288 66.49 696.42  5.21 31.43 29.85
## SRR8247290 61.69 642.46  5.72 27.27 28.70
## SRR8247292 67.69 680.53  5.72 30.92 31.05
## SRR8247294 65.29 646.21  5.72 30.92 28.65
## SRR8247296 78.49 677.08  9.79 35.37 33.34
## SRR8247298 66.49 571.56 10.81 23.41 32.28
## SRR8247299 64.09 640.06  7.76 27.67 28.67

ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2525 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 14 sample variables ]
## tax_table()   Taxonomy Table:    [ 2525 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2525 tips and 2524 internal nodes ]

library("tidyverse")
##如果我们分析细菌数据，可以选择去除非细菌成分，还有我们无法区分的叶绿体和线粒体序列
ps1 <- ps %>%
  subset_taxa(
    Kingdom == "Bacteria" &
      Family  != "mitochondria" &
      Class   != "Chloroplast"
  )
ps1

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1309 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 14 sample variables ]
## tax_table()   Taxonomy Table:    [ 1309 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1309 tips and 1308 internal nodes ]

#去除count值小于1的序列
ps1 = filter_taxa(ps1, function(x) sum(x ) > 1 , TRUE)
ps1

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1291 taxa and 24 samples ]
## sample_data() Sample Data:       [ 24 samples by 14 sample variables ]
## tax_table()   Taxonomy Table:    [ 1291 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1291 tips and 1290 internal nodes ]

# ##去除在一半样品中出现count数量少于三条的otu
# ps1 = filter_taxa(ps1, function(x) sum(x > 1) > (0.5*length(x)), TRUE);ps1

标准化phyloseq对象

标准化otu表格有多重方法，这里需要做一个总结，并且要不断完善：1. 抽平 2.相对丰度标准化 3.log转化

基于群落数据进行统计分析

准备一份报告，将测序序列数量，otu统计信息，最多的五个门类统计信息等做一个综述。

########基于整体的测序文件我们需要制作测序数据的整体评估###########
ps_sum <- ps1 <- ps
#step_1 ：ps02准备好phyloseq对象并可以使用下面代码来书写群落分析的前四句话。
where <- "soil"
method <- "using 16S rRNA gene amplicon sequencing."
rep = 12
step_1 <- paste("We analyzed the composition of microbial communities in the",where,method)

#step_2 统计每个样品中的序列数量
repead <- paste("For this analysis, we collected",rep,"repeats for each samples from culture",sep = "")
each_count <- paste(repead,"We obtained an average read count per sample of ",round(mean(sample_sums(ps1)),0),"(standard deviation (SD)",round(sd(sample_sums(ps1)),2),").",sep = "")
each_count

## [1] "For this analysis, we collected12repeats for each samples from cultureWe obtained an average read count per sample of 3935(standard deviation (SD)1658.64)."

#step_3 统计每个样品中的OTU数量
vegan_otu <-  function(physeq){
  OTU <-  otu_table(physeq)
  if(taxa_are_rows(OTU)){
    OTU <-  t(OTU)
  }
  return(as(OTU,"matrix"))
}
aa = vegan_otu(ps1)
otu_table = as.data.frame(t(aa))
otu_table = as.matrix(otu_table)
otu_table [otu_table > 0] <-1
OTU_sum <- colSums(otu_table)
sample_tax <- paste("the numbers of OTU, generally ranged between ",
                    min(OTU_sum)," and ",max(OTU_sum)," per sample with an average of ",
                    round(mean(OTU_sum),0),"(SD ",round(sd(OTU_sum)),")",sep = "")
sample_tax

## [1] "the numbers of OTU, generally ranged between 43 and 224 per sample with an average of 150(SD 43)"

###step 4 统计门水平的总体丰度信息
library("tidyverse")
Taxonomies <- ps1 %>%
  tax_glom(taxrank = "Phylum") %>% 
  transform_sample_counts(function(x) {x/sum(x)} )%>% 
  psmelt() %>%
  #filter(Abundance > 0.05) %>%
  arrange(Phylum)
iris_groups<- group_by(Taxonomies, Phylum)
ps0_sum <- summarise(iris_groups, mean(Abundance), sd(Abundance))
ps0_sum[is.na(ps0_sum)] <- 0
head(ps0_sum)

## # A tibble: 6 x 3
##   Phylum                    `mean(Abundance)` `sd(Abundance)`
##   <fct>                                 <dbl>           <dbl>
## 1 Acidobacteria                      0.0275           0.0206 
## 2 Actinobacteria                     0.0684           0.0334 
## 3 Armatimonadetes                    0.00110          0.00194
## 4 Bacteroidetes                      0.00205          0.00386
## 5 BJ-169                             0.000383         0.00138
## 6 Candidatus_Berkelbacteria          0.00214          0.00298

colnames(ps0_sum) = c("ID","mean","sd")
ps0_sum <- arrange(ps0_sum,desc(mean))
ps0_sum$mean <- ps0_sum$mean *100
ps0_sum <- as.data.frame(ps0_sum)
a = paste(ps0_sum[1,1],"(",round(ps0_sum[1,2],2),"%"," with sd ",round(ps0_sum[1,3],3),")",sep = "")
b = paste(ps0_sum[2,1],"(",round(ps0_sum[2,2],2),"%"," with sd ",round(ps0_sum[2,3],3),")",sep = "")
c = paste(ps0_sum[3,1],"(",round(ps0_sum[3,2],2),"%"," with sd ",round(ps0_sum[3,3],3),")",sep = "")
d = paste(ps0_sum[4,1],"(",round(ps0_sum[4,2],2),"%"," with sd ",round(ps0_sum[4,3],3),")",sep = "")
e = paste(ps0_sum[5,1],"(",round(ps0_sum[5,2],2),"%"," with sd ",round(ps0_sum[5,3],3),")",sep = "")
tax_sum = paste("The majority of OTU belonged to the phyla",a,b,c,d,"and",e,".",sep = " ")

##all_first 
line = paste(step_1,each_count ,sample_tax ,tax_sum,sep = "")
line

## [1] "We analyzed the composition of microbial communities in the soil using 16S rRNA gene amplicon sequencing.For this analysis, we collected12repeats for each samples from cultureWe obtained an average read count per sample of 3935(standard deviation (SD)1658.64).the numbers of OTU, generally ranged between 43 and 224 per sample with an average of 150(SD 43)The majority of OTU belonged to the phyla Proteobacteria(30.59% with sd 0.088) Chloroflexi(27.03% with sd 0.16) Saccharibacteria(13.93% with sd 0.13) Parcubacteria(8.35% with sd 0.048) and Actinobacteria(6.84% with sd 0.033) ."

write.table(line,"./a9_sum_and_track/microbiome_first_line.txt",quote = F)

导入数据（提供多种导入数据方式）

第一种

这里提供从普通txt数据导入成为phyloseq的代码：注意：默认的otu_table为otu在行名，如果反向，则需指定taxa_are_rows = F，mapping文件需要有一列样品名，并且行名为样品名

提供Qiime输出导入phyloseq的流程

#清空内存#######
rm(list=ls()) 
library("phyloseq")
library("ggplot2")
library("dada2")
library("tidyverse")
# ###制作phyloseq对象并保
aa = "~/Desktop/DATA_get_wilt/a3_together_result_gg135/merged190604//a9_usearch_otu_table//otu_table_tax_phylosep.txt"
aab = import_qiime(aa)

metadata <- import_qiime_sample_data("./mapping.txt")
dim(metadata)
head(metadata)

colnames(metadata)[1] <- "ID"
metadata$ID

# match(sample_names(aab),metadata$ID)
# setdiff(sample_names(aab),metadata$ID)
# setdiff(metadata$ID,sample_names(aab))

#colnames(tax_table(China_phyloseq))<- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
tree = import_qiime(treefilename = "./merged190604//a9_tree/rep_set.tree")

ps <- merge_phyloseq(metadata, aab,tree)
ps

saveRDS(ps, "./ps.rds")

原始序列处理过程的track分析，统计序列数量的变化

由于并不是所有人都是用的R语言DADA2流程，所以这部分数据处理过程可视化的代码放到后面。默认不执行

track文件即为序列追踪文件，追踪每个步骤保留序列的数量本track使用的是R全套原始数据处理流程。

library("phyloseq")
track = read.delim("./a9_sum_and_track/dada2_track.txt")

ps = readRDS("./a3_DADA2_table/ps.rds")
design = as.data.frame(sample_data(ps))
head(design)
index = merge(track,design[,c("ID","SampleType")],by = "row.names",all = F)
head(index)
row.names(index) = index$Row.names
index$Row.names  = NULL
library(reshape2)
library("ggplot2")
track_l<- melt(index, id.vars=c("ID","SampleType"),value.name = "value")

head(track_l)
p = ggplot(track_l,aes(x = variable,y = value, fill = ID,color = SampleType)) +
  geom_line(aes(group = ID,color = SampleType))+
  geom_point(pch = 21,size = 3,color = "black")+
  labs(x = "step", y = "count")


p = p +theme_bw()+
  #theme_classic()+
  # scale_color_brewer( guide = guide_legend(title = NULL))+
  # scale_fill_brewer(guide = guide_legend(title = NULL))+
  theme(

    panel.grid.major=element_blank(),
    panel.grid.minor=element_blank(),

    plot.title = element_text(vjust = -8.5,hjust = 0.1),
    axis.title.y =element_text(size = 20,face = "bold",colour = "black"),
    axis.title.x =element_text(size = 24,face = "bold",colour = "black"),
    axis.text = element_text(size = 20,face = "bold"),
    axis.text.x = element_text(colour = "black",size = 14),
    axis.text.y = element_text(colour = "black",size = 14),
    legend.text = element_text(size = 15,face = "bold"))+
  #theme(legend.position = c(0.1,0.2))+

  theme(strip.text.x = element_text(size=15, angle=0),
        strip.text.y = element_text(size=12, face="bold"),
        strip.background = element_rect(colour="blue", fill="#CCCCFF"))+
  guides(fill=FALSE)
  # guides(color=FALSE)
p

if (length(unique(track_l$ID))>3){    p=p+theme(axis.text.x=element_text(angle=45,vjust=1, hjust=1))}

ggsave("./a0_track_for_all_count.pdf", p, width = 12, height =6 )

进过这么多次的分析，发现基于R语言DADA2跑出来的otu表格中OTU名称是一条序列影响整体美观，我准备做一个转换

psOTU.ps:为转换后的phyloseq封装文件


ps = readRDS("./a3_DADA2_table/ps.rds")
vegan_otu <-  function(physeq){
  OTU <-  otu_table(physeq)
  if(taxa_are_rows(OTU)){
    OTU <-  t(OTU)
  }
  return(as(OTU,"matrix"))
}
otu_table = as.data.frame(t(vegan_otu(ps)))
head(otu_table)

### 进过这么多次的分析，发现基于R语言DADA2跑出来的otu表格中OTU名称是一条序列影响整体美观，我准备做一个转换

vegan_tax <-  function(physeq){
  tax <-  tax_table(physeq)

  return(as(tax,"matrix"))
}
tax_table = as.data.frame(vegan_tax(ps))
head(tax_table)

#合并两个表格
otu_tax = merge(otu_table,tax_table,by = "row.names", all = F)
row.names(otu_tax) = paste("OTU_",1:dim(otu_tax)[1],sep = "")
otu_tax$rep = otu_tax$Row.names
otu_tax$Row.names = NULL
colnames(otu_tax)
head(otu_tax)

if (length(colnames(tax_table(ps))) == 6) {
  OTU_a = ncol(otu_tax)-7
  seqtab.nochim  = t(otu_tax[1:OTU_a])
  str(seqtab.nochim)
  seqtab.nochim = as.matrix(seqtab.nochim)
  #tax文件制作
  taxa = otu_tax[-c(1:OTU_a)]
  taxa = as.matrix(taxa)
  mapping = as.data.frame(sample_data(ps))
}

if (length(colnames(tax_table(ps))) == 7) {
  OTU_a = ncol(otu_tax)-8
  seqtab.nochim  = t(otu_tax[1:OTU_a])
  str(seqtab.nochim)
  seqtab.nochim = as.matrix(seqtab.nochim)
  #tax文件制作
  taxa = otu_tax[-c(1:OTU_a)]
  taxa = as.matrix(taxa)
  mapping = as.data.frame(sample_data(ps))
}

ps = phyloseq(otu_table(seqtab.nochim,taxa_are_rows = F), 
              sample_data(mapping), 
              tax_table(taxa))
ps

saveRDS(ps,"./a3_DADA2_table/ps_OTU_.ps")

数据整理附件


metadata <- import_qiime_sample_data("./mapping.txt")
dim(metadata)
head(metadata)
colnames(metadata)[1] <- "ID"

sample_data(ps) = metadata

saveRDS(ps,"./a3_DADA2_table/ps.rds")
saveRDS(ps,"./ps.rds")