R语言代码：用于微生物生态数据R code for ecological data analysis...

生物_医药_科研 2021-12-22

展开全文

NMDS_ordisurf.R

# ============================================================
# Tutorial on drawing an NMDS plot with environmental contours using ggplot2
# by Umer Zeeshan Ijaz (http://userweb.eng./umer.ijaz)
# =============================================================

abund_table<-read.csv('SPE_pitlatrine.csv',row.names=1,check.names=FALSE)
#Transpose the data to have sample names on rows
abund_table<-t(abund_table)
meta_table<-read.csv('ENV_pitlatrine.csv',row.names=1,check.names=FALSE)
#Just a check to ensure that the samples in meta_table are in the same order as in abund_table
meta_table<-meta_table[rownames(abund_table),]
#Get grouping information
grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),'_'))))
# > head(grouping_info)
# X1 X2 X3
# T_2_1 T 2 1
# T_2_10 T 2 10
# T_2_12 T 2 12
# T_2_2 T 2 2
# T_2_3 T 2 3
# T_2_6 T 2 6

#Let us group on countries
groups<-grouping_info[,1]

library(vegan)
#Get MDS stats
sol<-metaMDS(abund_table,distance = 'bray', k = 2, trymax = 50)

#We use meta_table$Temp to plot temperature values on the plot. You can select
#any other variable
#> names(meta_table)
#[1] 'pH' 'Temp' 'TS' 'VS' 'VFA' 'CODt'
#[7] 'CODs' 'perCODsbyt' 'NH4' 'Prot' 'Carbo'
#Reference:http://oliviarata./2014/07/17/ordinations-in-ggplot2-v2-ordisurf/
ordi<-ordisurf(sol,meta_table$Temp,plot = FALSE, bs='ds')
ordi.grid <- ordi$grid #extracts the ordisurf object
str(ordi.grid) #it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) #get x and ys
ordi.mite$z <- as.vector(ordi.grid$z) #unravel the matrix for the z scores
ordi.mite.na <- data.frame(na.omit(ordi.mite)) #gets rid of the nas

NMDS=data.frame(x=sol$point[,1],y=sol$point[,2],Type=groups)

p<-ggplot()+
stat_contour(data = ordi.mite.na, aes(x = x, y = y, z = z, colour = ..level..),positon='identity')+ #can change the binwidth depending on how many contours you want
geom_point(data=NMDS,aes(x,y,fill=Type),pch=21,size=3,colour=NA)
p<-p+scale_colour_continuous(high = 'darkgreen', low = 'darkolivegreen1') #here we set the high and low of the colour scale. Can delete to go back to the standard blue, or specify others
p<-p+labs(colour = 'Temperature') #another way to set the labels, in this case, for the colour legend
p<-p+theme_bw()
#p<-p+theme(legend.key = element_blank(), #removes the box around each legend item
# legend.position = 'bottom', #legend at the bottom
# legend.direction = 'horizontal',
# legend.box = 'horizontal',
# legend.box.just = 'centre')
pdf('NMDS_ordisurf.pdf',width=10,height=10)
print(p)
dev.off()

以上代码产生下图：

KW.R

# ============================================================
# Tutorial on finding significant taxa and then plotting them using ggplot2
# by Umer Zeeshan Ijaz (http://userweb.eng./umer.ijaz)
# =============================================================
library(reshape)
library(ggplot2)
 
#Load the abundance table 
abund_table<-read.csv('SPE_pitlatrine.csv',row.names=1,check.names=FALSE)
 
#Transpose the data to have sample names on rows
abund_table<-t(abund_table)
 
#Get grouping information
grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),'_'))))
# > head(grouping_info)
# X1 X2 X3
# T_2_1   T  2  1
# T_2_10  T  2 10
# T_2_12  T  2 12
# T_2_2   T  2  2
# T_2_3   T  2  3
# T_2_6   T  2  6
 
#Use countries as grouping information
groups<-as.factor(grouping_info[,1])
 
#Apply normalisation (either use relative or log-relative transformation)
#data<-abund_table/rowSums(abund_table)
data<-log((abund_table+1)/(rowSums(abund_table)+dim(abund_table)[2]))
data<-as.data.frame(data)
 
#Reference: http://www.bigre./courses/statistics_bioinformatics/practicals/microarrays_berry_2010/berry_feature_selection.html
kruskal.wallis.alpha=0.01
kruskal.wallis.table <- data.frame()
for (i in 1:dim(data)[2]) {
  ks.test <- kruskal.test(data[,i], g=groups)
  # Store the result in the data frame
  kruskal.wallis.table <- rbind(kruskal.wallis.table,
                                data.frame(id=names(data)[i],
                                           p.value=ks.test$p.value
                                ))
  # Report number of values tested
  cat(paste('Kruskal-Wallis test for ',names(data)[i],' ', i, '/', 
            dim(data)[2], '; p-value=', ks.test$p.value,'\n', sep=''))
}
 
 
kruskal.wallis.table$E.value <- kruskal.wallis.table$p.value * dim(kruskal.wallis.table)[1]
 
kruskal.wallis.table$FWER <- pbinom(q=0, p=kruskal.wallis.table$p.value, 
                                    size=dim(kruskal.wallis.table)[1], lower.tail=FALSE)
 
kruskal.wallis.table <- kruskal.wallis.table[order(kruskal.wallis.table$p.value,
                                                   decreasing=FALSE), ]
kruskal.wallis.table$q.value.factor <- dim(kruskal.wallis.table)[1] / 1:dim(kruskal.wallis.table)[1]
kruskal.wallis.table$q.value <- kruskal.wallis.table$p.value * kruskal.wallis.table$q.value.factor
pdf('KW_correction.pdf')
plot(kruskal.wallis.table$p.value,
     kruskal.wallis.table$E.value,
     main='Multitesting corrections',
     xlab='Nominal p-value',
     ylab='Multitesting-corrected statistics',
     log='xy',
     col='blue',
     panel.first=grid(col='#BBBBBB',lty='solid'))
lines(kruskal.wallis.table$p.value,
      kruskal.wallis.table$FWER,
      pch=20,col='darkgreen', type='p'
)
lines(kruskal.wallis.table$p.value,
      kruskal.wallis.table$q.value,
      pch='+',col='darkred', type='p'
)
abline(h=kruskal.wallis.alpha, col='red', lwd=2)
legend('topleft', legend=c('E-value', 'p-value', 'q-value'), col=c('blue', 'darkgreen','darkred'), lwd=2,bg='white',bty='o')
dev.off()
 
last.significant.element <- max(which(kruskal.wallis.table$q.value <= kruskal.wallis.alpha))
selected <- 1:last.significant.element
diff.cat.factor <- kruskal.wallis.table$id[selected]
diff.cat <- as.vector(diff.cat.factor)
 
print(kruskal.wallis.table[selected,])
 
#Now we plot taxa significantly different between the categories
df<-NULL
for(i in diff.cat){
  tmp<-data.frame(data[,i],groups,rep(paste(i,' q = ',round(kruskal.wallis.table[kruskal.wallis.table$id==i,'q.value'],5),sep=''),dim(data)[1]))
  if(is.null(df)){df<-tmp} else { df<-rbind(df,tmp)} 
}
colnames(df)<-c('Value','Type','Taxa')
 
p<-ggplot(df,aes(Type,Value,colour=Type))+ylab('Log-relative normalised')
p<-p+geom_boxplot()+geom_jitter()+theme_bw()+
  facet_wrap( ~ Taxa , scales='free', ncol=3)
p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
pdf('KW_significant.pdf',width=10,height=14)
print(p)
dev.off()

以上代码产生以下两个图：

NB.R

The DESeq {DESeq2} package allows negative binomial GLM fitting and Wald statistics for abundance data. You can use this script as an alternative to KW.R to find taxa that are significantly different between different conditions.

# ============================================================
# Tutorial on finding significant taxa and then plotting them using ggplot2
# by Umer Zeeshan Ijaz (http://userweb.eng./umer.ijaz)
# =============================================================

library(ggplot2)
library(DESeq2)

#Load the abundance table
abund_table<-read.csv('SPE_pitlatrine.csv',row.names=1,check.names=FALSE)

#Transpose the data to have sample names on rows
abund_table<-t(abund_table)

#Get grouping information
grouping_info<-data.frame(row.names=rownames(abund_table),t(as.data.frame(strsplit(rownames(abund_table),'_'))))
# > head(grouping_info)
# X1 X2 X3
# T_2_1 T 2 1
# T_2_10 T 2 10
# T_2_12 T 2 12
# T_2_2 T 2 2
# T_2_3 T 2 3
# T_2_6 T 2 6

#We will convert our table to DESeqDataSet object
countData = round(as(abund_table, 'matrix'), digits = 0)
# We will add 1 to the countData otherwise DESeq will fail with the error:
# estimating size factors
# Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, :
# every gene contains at least one zero, cannot compute log geometric means
countData<-(t(countData+1))

dds <- DESeqDataSetFromMatrix(countData, grouping_info, as.formula(~ X1))

#Reference:https://github.com/MadsAlbertsen/ampvis/blob/master/R/amp_test_species.R

#Differential expression analysis based on the Negative Binomial (a.k.a. Gamma-Poisson) distribution
#Some reason this doesn't work: data_deseq_test = DESeq(dds, test='wald', fitType='parametric')
data_deseq_test = DESeq(dds)

## Extract the results
res = results(data_deseq_test, cooksCutoff = FALSE)
res_tax = cbind(as.data.frame(res), as.matrix(countData[rownames(res), ]), OTU = rownames(res))

sig = 0.001
fold = 0
plot.point.size = 2
label=T
tax.display = NULL
tax.aggregate = 'OTU'

res_tax_sig = subset(res_tax, padj < sig & fold < abs(log2FoldChange))

res_tax_sig <- res_tax_sig[order(res_tax_sig$padj),]

## Plot the data
### MA plot
res_tax$Significant <- ifelse(rownames(res_tax) %in% rownames(res_tax_sig) , 'Yes', 'No')
res_tax$Significant[is.na(res_tax$Significant)] <- 'No'
p1 <- ggplot(data = res_tax, aes(x = baseMean, y = log2FoldChange, color = Significant)) +
geom_point(size = plot.point.size) +
scale_x_log10() +
scale_color_manual(values=c('black', 'red')) +
labs(x = 'Mean abundance', y = 'Log2 fold change')+theme_bw()
if(label == T){
if (!is.null(tax.display)){
rlab <- data.frame(res_tax, Display = apply(res_tax[,c(tax.display, tax.aggregate)], 1, paste, collapse='; '))
} else {
rlab <- data.frame(res_tax, Display = res_tax[,tax.aggregate])
}
p1 <- p1 + geom_text(data = subset(rlab, Significant == 'Yes'), aes(label = Display), size = 4, vjust = 1)
}
pdf('NB_MA.pdf')
print(p1)
dev.off()

res_tax_sig_abund = cbind(as.data.frame(countData[rownames(res_tax_sig), ]), OTU = rownames(res_tax_sig), padj = res_tax[rownames(res_tax_sig),'padj'])

#Apply normalisation (either use relative or log-relative transformation)
#data<-abund_table/rowSums(abund_table)
data<-log((abund_table+1)/(rowSums(abund_table)+dim(abund_table)[2]))
data<-as.data.frame(data)

#Now we plot taxa significantly different between the categories
df<-NULL
for(i in res_tax[rownames(res_tax_sig),'OTU']){
tmp<-data.frame(data[,i],groups,rep(paste(i,' padj = ',round(res_tax[i,'padj'],5),sep=''),dim(data)[1]))
if(is.null(df)){df<-tmp} else { df<-rbind(df,tmp)}
}
colnames(df)<-c('Value','Type','Taxa')

p<-ggplot(df,aes(Type,Value,colour=Type))+ylab('Log-relative normalised')
p<-p+geom_boxplot()+geom_jitter()+theme_bw()+
facet_wrap( ~ Taxa , scales='free', ncol=3)
p<-p+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
pdf('NB_significant.pdf',width=10,height=14)
print(p)

dev.off()

使用以上代码，我们将得到以下图：