散点图之线性回归caolong2019年4月10日用excel打开scatter_plot2.csv文件,初步查看数据矩阵结构组组成,如下图,我们用散点图分析MCODE_Score与Degree度的关系,如果学习生信的同学知道MCODE是是cytoscape软件的一个聚类插件,感兴趣的同学可以百度搜索STRING网址和MCODE插件。此excel表格是我筛选的基因集合后,选出MCODE_Score与Degree度列用来分析其关系的散点图实例。
data<-read.csv("G:/scatter_plot2.csv",row.names = 1)
data
## MCODE_Score Degree
## PTPRC 11.5428571 31
## TYROBP 11.5428571 29
## CD86 11.5428571 26
## ITGAM 11.7362637 26
## ITGB2 11.5428571 25
## C3AR1 11.5428571 24
## CCR1 9.9916667 23
## C1QC 9.8484848 23
## FCGR2B 11.5428571 21
## CD53 11.5428571 21
## C1QB 10.6373626 21
## LAPTM5 11.5428571 20
## LY86 11.7362637 20
## MMP9 5.7272727 19
## CSF2RB 11.5428571 17
## CTSS 10.6373626 17
## CD52 10.0000000 15
## CXCR4 6.2363636 15
## CD14 8.5909091 14
## NCF2 10.8589744 14
## CCL19 5.1538462 13
## CCL18 6.0000000 11
## CXCL2 6.8055556 11
## RAC2 7.0000000 11
## RNASE6 8.0000000 10
## EVI2B 6.5333333 10
## APOE 3.7333333 10
## TNFSF13B 3.8888889 9
## FPR3 3.7777778 9
## SPP1 5.0000000 9
## CLEC5A 6.8055556 9
## SLAMF8 4.7619048 8
## MSR1 5.0000000 7
## ACP5 3.7333333 6
## VAMP8 5.0000000 6
## SERPINA1 3.0000000 5
## IGLL5 3.0000000 5
## CD84 4.0000000 4
## CD37 4.0000000 4
## FCGR1B 4.0000000 4
## CHI3L1 3.0000000 4
## MS4A7 3.0000000 3
## ACTN2 0.5000000 3
## NTN1 0.6666667 2
## RBP4 2.0000000 2
## ADAMDEC1 2.0000000 2
## CNTN4 0.6666667 2
## KCNMA1 0.0000000 1
## KCNT2 0.0000000 1
## PLXNC1 0.0000000 1
## BAMBI 0.0000000 1
## IER3 0.0000000 1
## BTC 0.0000000 1
## ATP1A2 0.0000000 1
## NEXN 0.0000000 1
## FCGBP 0.0000000 1
## PIP5K1B 0.0000000 1
## PIK3AP1 0.0000000 1
## IGJ 0.0000000 1
knitr::opts_chunk$set(echo = TRUE)
加载ggplot2 package,然后利用ggplot2画一个简单的线性回归图library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
p <- ggplot(data,aes(x=Degree,y=MCODE_Score)) + geom_point(shape=19) +
xlab("Degree") + ylab("MCODE_Score")
p+geom_smooth(method = lm)

p+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

knitr::opts_chunk$set(echo = TRUE)
计算method=lm的简单线性回归相关参数k值,b值,以及r平方值
data.lm<-lm(data$MCODE_Score~data$Degree,data=data)
formula <- sprintf("italic(y) == %.2f %+.2f * italic(x)",
round(coef(data.lm)[1],4),round(coef(data.lm)[2],4))
r2 <- sprintf("italic(R^2) == %.2f",summary(data.lm)$r.squared)
labels <- data.frame(formula=formula,r2=r2,stringsAsFactors = FALSE)
knitr::opts_chunk$set(echo = TRUE)
分类画分类标注degree>=12的散点图pp <- ggplot(data,aes(x=Degree,y=MCODE_Score,colour=Degree>=12)) + geom_point(shape=19) +
xlab("Degree") + ylab("MCODE_Score")
pp

knitr::opts_chunk$set(echo = TRUE)
注释degree>12的那些点pp+
geom_abline(intercept = coef(data.lm)[1],slope = coef(data.lm)[2])+
geom_text(data=labels,mapping=aes(x = 15,y=20,label=formula),parse = TRUE,inherit.aes = FALSE, size = 6) +
geom_text(data=labels,mapping=aes(x = 20,y=18,label=r2),parse = TRUE,inherit.aes = FALSE, size = 4)+
annotate(geom = "text",x=data$Degree[1:21],y=data$MCODE_Score[1:21],label=rownames(data)[1:21], size=2.0)

knitr::opts_chunk$set(echo = TRUE)
注意:此条代码annotate(geom = “text”,x=dataDegree[1:10],y=dataMCODE_Score[1:10],label=rownames(dat)[1:10], size=2.0)可以拓展其应用范围用于标注差异基因火山图最显著的10个基因变更为annotate(geom = “text”,x=DElogFC[1:10],y=(−log10(DElogFC[1:10])),label=DE$Gene[1:10],size=2.0)其中DE表示different genes
|