一、数据类型和向量
1. 数据类型
1.1 判断数据类型class()
1.2 按Tab键自动补全
1.3 数据类型的判断和转换
(1)is 族函数,判断,返回值为TRUE或FALSE
is.numeric("123")
is.character("a")
is.logical(TRUE)
(2)as族函数实现数据类型之间转换
as.matrix()
as.numeric()
as.character()
as.logical()
2. 向量
(1)有重复的用rep(),有规律的用seq(),随机数用rnorm
# Repeated values: rep(); regular sequences: seq(); random normal draws: rnorm().
# Console output is kept as `#>` comments so the snippet can be run as-is.
rep("sample", 6)
#> [1] "sample" "sample" "sample" "sample" "sample" "sample"
seq(4, 30, 3)
#> [1]  4  7 10 13 16 19 22 25 28
rnorm(3)
#> e.g. [1]  0.1511196  1.1105814 -0.8626667  (random: differs on every run)
(2)组合
# paste0() is vectorised: a length-one string is recycled along 1:3, so the
# explicit rep() is optional.
paste0(rep("x", 3), 1:3) # same result as paste0("x", 1:3)
#> [1] "x1" "x2" "x3"
paste0("sample", seq(1, 5, 2))
#> [1] "sample1" "sample3" "sample5"
paste() 和paste0()区别:(1)paste()中的sep=将两个或多个向量字符串分别对应连接
paste(v1,v1,sep = " ")
paste0()与paste()的区别是无法设定sep:paste0()的分隔符固定为""(没有空格),而paste()的sep默认是一个空格。
# paste() takes a custom separator through `sep`.
paste("x", 1:3, sep = "~")
#> [1] "x~1" "x~2" "x~3"
(2)两个向量的操作
重点:
# Example vectors, defined first so that every call below can actually run.
x <- c(1, 5, 3, 4)
y <- c(5, 12, 24, 3)

# Key operations
x %in% y # is each element of x present anywhere in y?
#> [1] FALSE  TRUE  TRUE FALSE
x[x %in% y] # elements of x that occur in y (the order of x and y matters)
#> [1] 5 3
x == y # position-by-position comparison of x and y
#> [1] FALSE FALSE FALSE FALSE

# Set operations
intersect(x, y)
#> [1] 5 3
union(x, y)
#> [1]  1  5  3  4 12 24
setdiff(x, y) # in x but not in y
#> [1] 1 4
setdiff(y, x) # in y but not in x
#> [1] 12 24
- 当两个向量长度不一致
循环补齐

- match(x,y)
x[match(y,x)]
match : 谁在外面,谁就在后面,以y为模板,给x调整顺序
# Reorder x so that it follows the order given by y.
x <- LETTERS[1:5]
y <- c("E", "C", "B", "A")
match(y, x) # position in x of each element of y
x[match(y, x)] # x rearranged to follow y
二、数据框、矩阵和列表
1.区别
(1)Vector向量——一维;matrix矩阵——二维,只允许一种数据类型;data.frame数据框——二维,每列只允许一种数据类型
2.练习题
(1)#求c1第一列数值的中位数 #筛选c1中,最后一列值为a或c的行
# Load the exercise table.
c1 <- read.csv("./exercise.csv")

# Median of the Petal.Length column.
median(c1$Petal.Length)
# Alternative given in the notes (assumes Petal.Length is column 1 - confirm):
# median(c1[, 1])

# Rows whose Species value is "a" or "c".
c1[c1$Species %in% c("c", "a"), ]
# Same filter written with explicit comparisons:
# c1[c1$Species == "a" | c1$Species == "c", ]
错误形式如下:
c1[c1$Species == c("c","a"),] # 一长一短,无法比较,他们发生了循环补齐
(2)修改行名和列名
# Replace all row names at once.
rownames(df) <- c("r1", "r2", "r3", "r4")
# Rename a single column, selected by position.
colnames(df)[2] <- "CHANGE"
(3)两个数据框的连接
# Join on a key column that has the same name in both data frames.
# (Plain ASCII quotes: typographic quotes are a syntax error in R.)
merge(test1, test2, by = "name")
# Join when the key column is named differently on each side.
merge(test1, test3, by.x = "name", by.y = "NAME")
(4) 练习
1.统计内置数据iris最后一列有哪几个取值,每个取值重复了多少次
2.提取内置数据iris的前5行,前4列,并转换为矩阵,赋值给a。
3.将a的行名改为flower1,flower2…flower5。
# Exercise 1: distinct values in the last column of iris and how often each occurs.
table(iris[, ncol(iris)])

# Exercise 2: first 5 rows and first 4 columns of iris, converted to a matrix
# (the task asks for a matrix, so as.matrix() rather than as.data.frame()).
a <- as.matrix(iris[1:5, 1:4])

# Exercise 3: name the rows flower1 ... flower5.
# seq_len(nrow(a)) follows the actual row count; here it equals 1:5.
rownames(a) <- paste0("flower", seq_len(nrow(a)))
(5) match() 函数的使用
## Use y as the template: look up every column name of y in x$file_name and
## relabel the columns of y with the x$ID found at those positions.
# Intermediate views of the same lookup, kept for reference:
# match(colnames(y), x$file_name)
# x[match(colnames(y), x$file_name), ]
# x$ID[match(colnames(y), x$file_name)]
colnames(y) <- x$ID[match(colnames(y), x$file_name)]
三、几种加载包的方法
# Method 1: CRAN packages.
install.packages("tidyr")
install.packages("BiocManager") # BiocManager itself is installed from CRAN
# Method 2: through BiocManager.
BiocManager::install("ggplot2")
# Method 3: from GitHub, given as "user/repository".
devtools::install_github("jmzeng1314/idmap1")
# Method 4: install only when missing, then attach.
# require() only returns FALSE on failure, and the one-line form
# `if (!require(pkg)) install.packages(pkg)` never attaches the package on the
# run that installs it; check with requireNamespace() and attach explicitly.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
镜像源推荐:
# 清华镜像
# http://mirrors.tuna.tsinghua.edu.cn/CRAN/
# http://mirrors.tuna.tsinghua.edu.cn/bioconductor/
# 中科大镜像
# http://mirrors.ustc.edu.cn/CRAN/
# http://mirrors.ustc.edu.cn/bioc/
R语言中的符号
[ ] :向量,数据框,矩阵取子集 [[ ]]:列表取子集
四、读取,写入数据
txt 和csv
read.csv():一般读取csv格式 read.table():一般读取txt格式
# Text file read with read.table(); the first line holds the column names.
ex1 <- read.table("./ex1.txt", header = TRUE)

# CSV file; the first column supplies the row names.
ex2 <- read.csv("./ex2.csv", row.names = 1)

# Tab-separated file.
soft <- read.table(
  "./soft.txt",
  sep = "\t", # fields are separated by tabs
  fill = TRUE, # rows with fewer fields are padded with blanks
  header = TRUE
)

# Write the tables back to disk: one call per line, plain ASCII quotes.
write.table(ex1, file = "./ex1.txt")
write.csv(ex2, file = "./ex2.csv")
Rdata
save() --- > 保存
load() --- > 加载
save(ex1,file = "./ex1.Rdata")
load("./ex1.Rdata")
读入数据,ID转换
案例:
# Annotation table; its first column becomes the row names.
soft <- read.csv("./soft.csv", row.names = 1)
head(soft)

# Look up a gene symbol for every row of exp by matching its row name
# against soft$ID.
exp$symbol <- soft$GeneName[match(rownames(exp), soft$ID)]

# Rows without a match get NA, and NA is not allowed as a row name,
# so drop them before the symbols are used as row names.
exp <- exp[!is.na(exp$symbol), ]
# Keep only the first row for each symbol.
exp <- exp[!duplicated(exp$symbol), ]
# Drop rows whose symbol starts with "ENST".
exp <- exp[!grepl("^ENST", exp$symbol), ]

rownames(exp) <- exp$symbol
# Remove the helper column (it was appended last); drop = FALSE keeps a data
# frame even when only one column remains.
exp <- exp[, -ncol(exp), drop = FALSE]
五. 画图
(1)绘图
(1)作图:ggplot2、ggpubr、base
(2)拼图:patchwork包、par里的mfrow、grid.arrange、cowplot
(3)导出:
# Saving and exporting figures
# 1. ggplot2 family
# NOTE(review): filename is left empty here - presumably a placeholder to fill in.
ggsave(p,filename = "")
# 2. General approach, in three steps
# Step 1: open a graphics device; this sets the output format and file name
pdf("test.pdf")
# Step 2: the plotting code goes here (not shown in these notes)
dev.off() # Step 3: close the device
(2)ggplot2语法
- 属性设置
映射:根据数据的某一列的内容分配颜色
手动设置:把图形设置为一个或N个颜色,与数据类型无关


# The plotting examples need ggplot2 attached; nothing earlier in these notes
# loads it.
library(ggplot2)

# 1. Basic template: the data plus the x/y mapping.
ggplot(data = iris) +
  geom_point(mapping = aes(x = Sepal.Length, y = Petal.Length))

# 2. Fixed attributes (colour, size, transparency, point shape, line type ...).
ggplot(data = iris) +
  geom_point(
    mapping = aes(x = Sepal.Length, y = Petal.Length),
    size = 5, # point size
    alpha = 0.5, # 50% transparency
    shape = 8 # point shape
  )

## Choosing the concrete colours used by a mapped variable.
ggplot(data = iris) +
  geom_point(
    mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species)
  ) +
  scale_color_manual(values = c("blue", "grey", "red"))
## Telling the color and fill attributes apart
### 1. Hollow shapes and solid shapes both take their colour from `color`.
ggplot(iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(shape = 17) # shape 17: a solid example

ggplot(iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(shape = 2) # shape 2: a hollow example

### 2. Only shapes with both a border and an interior need color and fill.
ggplot(iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(shape = 24, fill = "black") # shape 24: a two-colour example
# 3. Facets: one panel per species.
ggplot(data = iris) +
  geom_point(mapping = aes(x = Sepal.Length, y = Petal.Length)) +
  facet_wrap(~Species)

# Two-way facets: add a random grouping column to a copy of iris.
dat <- iris
# One label per row; TRUE is spelled out because T can be reassigned.
dat$Group <- sample(letters[1:5], nrow(dat), replace = TRUE)
ggplot(data = dat) +
  geom_point(mapping = aes(x = Sepal.Length, y = Petal.Length)) +
  facet_grid(Group ~ Species)
# 4. Geometric objects
# Local mapping: each layer declares its own aesthetics.
ggplot(iris) +
  geom_smooth(aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(aes(x = Sepal.Length, y = Petal.Length))

# Global mapping: aesthetics declared once in ggplot() and used by every layer.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_smooth() +
  geom_point()
# 5. Statistical transformations
# 5.1 No statistics: plot pre-computed counts directly.
fre <- as.data.frame(table(diamonds$cut))
fre
ggplot(data = fre) +
  geom_bar(mapping = aes(x = Var1, y = Freq), stat = "identity")

# 5.2 Show proportions instead of counts.
# after_stat(prop) replaces the deprecated `..prop..` notation; group = 1 makes
# the proportions relative to the whole data set rather than to each bar.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))
# 6. Position adjustments
# 6.1 Points over a box plot: plain points first, then jittered points.
ggplot(iris, aes(Species, Sepal.Width, fill = Species)) +
  geom_boxplot() +
  geom_point()

ggplot(iris, aes(Species, Sepal.Width, fill = Species)) +
  geom_boxplot() +
  geom_jitter()

# 6.2 Stacked bars
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity))

# 6.3 Side-by-side bars
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "dodge")
# 7. Coordinate systems
# Flipped axes with coord_flip()
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()

# Polar coordinates with coord_polar()
# A bar chart with full-width bars, square aspect ratio and no axis titles,
# stored so it can be shown under different coordinate systems.
bar <- ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = cut), width = 1) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)

bar
bar + coord_flip()
bar + coord_polar()
# Exercise: violin plot plus box plot, with jittered points whose shape
# follows the species.
ggplot(iris, aes(x = Sepal.Width, y = Species)) +
  geom_violin(mapping = aes(fill = Species)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(shape = Species))
单分面
双分面
统计变换
堆叠直方图
并列直方图
**
小提琴+箱线图
(3)ggpubr.R语法
# The STHDA site has many example figures made with ggpubr.
library(ggpubr)

# Scatter plot coloured by species.
ggscatter(iris, x = "Sepal.Length", y = "Petal.Length", color = "Species")

# Box plot with jittered points, stored in p so layers can be added to it.
p <- ggboxplot(
  iris,
  x = "Species",
  y = "Sepal.Length",
  color = "Species",
  shape = "Species",
  add = "jitter"
)
p

# The three pairs of species to compare.
my_comparisons <- list(
  c("setosa", "versicolor"),
  c("setosa", "virginica"),
  c("versicolor", "virginica")
)

p +
  stat_compare_means(comparisons = my_comparisons) + # pairwise comparison p-values
  stat_compare_means(label.y = 9) # second label placed at y = 9
(4) 图片的保存
# Saving and exporting figures
# 1. ggplot2 family (fill in the output file name)
ggsave(filename = "", plot = p)
# 2. General approach, in three steps
# Step 1: open a graphics device; this sets the output format and file name
pdf("test.pdf")
# Step 2: the plotting code goes here
# ...
dev.off() # Step 3: close the device to finish the file
(5)拼图
# Combining plots with the patchwork package
p1.1 <- violin_plot(dat = dat, gene = dat$CCL5)
p1.2 <- violin_plot(dat = dat, gene = dat$MMP9)
p1.4 <- violin_plot(dat = dat, gene = dat$RAC2)
p1.5 <- violin_plot(dat = dat, gene = dat$CORO1A)
p1.6 <- violin_plot(dat = dat, gene = dat$CCL2)

library(patchwork)
# `|` places plots side by side and `/` stacks rows:
# two plots on the first row, three on the second.
top_row <- p1.1 | p1.2
bottom_row <- p1.4 | p1.5 | p1.6
p1 <- top_row / bottom_row

library(ggplot2)
ggsave(
  filename = "./vertify/GSE100927_vertify.pdf",
  plot = p1,
  width = 15,
  height = 18
)
1234567891011
六、专题
1.数据框的排序
- order 或者 tidyverse中的arrange()函数
# order() returns the positions that put a vector in sorted order, so it can be
# used to sort a vector and also the rows of a data frame.
sort(test$Sepal.Length)
test$Sepal.Length[order(test$Sepal.Length)]
test[order(test$Sepal.Length), ]
test[order(test$Sepal.Length, decreasing = TRUE), ]

# arrange(): more flexible sorting
library(tidyverse) # must be loaded for arrange() and desc()
arrange(test, Sepal.Length)
arrange(test, desc(Sepal.Length))
# Sort by Sepal.Width in descending order; rows with equal Sepal.Width are
# then ordered by Sepal.Length.
arrange(test, desc(Sepal.Width), Sepal.Length)
dplyr包中的mutate、select、filter、rename
mutate():新增列,rename():重命名列名
select():筛选列;filter():筛选行
管道符号:%>%:ctrl + shift +m
2.表达矩阵画箱线图
如下图,根据这样的表达矩阵,画出这个图,如果不变换表,是无法成功的
将宽表(表达矩阵)转换成长表,变换操作如下
library(tidyr)
library(tibble)
library(dplyr)

# Transpose exp, turn the result into a data frame, move its row names into a
# column, then append the group labels as a `group` column.
dat <- exp %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(group = group_list)



3. 连接
library(dplyr)

# Rows whose key appears in both tables.
inner_join(test1, test2, by = "name")
# Same join when the key column is named differently on each side
# (the closing parenthesis was missing in the original line).
inner_join(test1, test2, by = c("name" = "Name"))
# All rows of test2, with matching columns from test1.
right_join(test1, test2, by = "name")
# All rows of both tables.
full_join(test1, test2, by = "name")
# Filtering joins: rows of test1 with / without a match in test2.
semi_join(test1, test2, by = "name")
anti_join(test1, test2, by = "name")
merge():函数
4. 字符串函数:加载stringr包





# The str_*() functions below come from stringr, so attach it first.
library(stringr)

x <- "The birch canoe slid on the smooth planks."
x

### 1. String length
str_length(x) # number of characters in the string
length(x) # number of elements in the vector (1 here), not characters

### 2. Splitting strings
str_split(x, " ") # returns a list with one element per input string
x2 <- str_split(x, " ")[[1]] # take the character vector of words out of the list
x2
y <- c("jimmy 150", "nicker 140", "tony 152")
str_split(y, " ")
str_split(y, " ", simplify = TRUE) # character matrix instead of a list

### 3. Extracting by position
str_sub(x, 5, 9)

### 4. Pattern detection
str_detect(x2, "h")

### 5. Replacement
str_replace(x2, "o", "A") # first match in each string
str_replace_all(x2, "o", "A") # every match

### 6. Removal
str_remove(x, " ") # first match only
str_remove_all(x, " ") # every match