【原】R包基础实操—tidyverse包

健明 2021-10-08

展开全文

核心软件包是ggplot2、dplyr、tidyr、readr、purrr、tibble、stringr和forcats，它们提供了建模、转换和可视化数据的功能。

其中，readr包用于读取数据，tidyr包用于整理数据，dplyr包用于数据转换，ggplot2包用于数据可视化，purrr包用于函数式编程。

1 readr包：快速读写

1-1 readr包提供了几个新函数，能够更快的读取文件

readr包中的主要的函数有：

read_csv，read_tsv，read_table，read_delim，
write_csv，write_tsv, write_excel_csv，write_delim

library(tidyverse)
library(readr)
library(feather)
library(Seurat)

metadata <- pbmc_small@meta.data

# If you add an extension to the file name, write_()* will automatically compress the output.
write_csv(metadata, 'metadata.csv')
write_tsv(metadata, 'metadata2.csv')
write_tsv(metadata, 'metadata.tsv.gz')

# write_rds()联合read_rds()使用，write_rds()将数据保存为自定义的二进制形式(RDS格式)
write_rds(metadata,"challenge.rds")
head(read_rds("challenge.rds"))
##                   orig.ident nCount_RNA nFeature_RNA RNA_snn_res.0.8
## ATGCCAGAACGACT SeuratProject         70           47               0
## CATGGCCTGTGCAT SeuratProject         85           52               0
## GAACCTGATGAACC SeuratProject         87           50               1
## TGACTGGATTCTCA SeuratProject        127           56               0
## AGTCAGACTGCACA SeuratProject        173           53               0
## TCTGATACACGTGT SeuratProject         70           48               0
##                letter.idents groups RNA_snn_res.1
## ATGCCAGAACGACT             A     g2             0
## CATGGCCTGTGCAT             A     g1             0
## GAACCTGATGAACC             B     g2             0
## TGACTGGATTCTCA             A     g2             0
## AGTCAGACTGCACA             A     g2             0
## TCTGATACACGTGT             A     g1             0

# saveRDS()联合readRDS()使用，saveRDS()将数据保存为自定义的压缩后的二进制形式(RDS格式)
saveRDS(metadata, "challenge.rds")
metadata <- readRDS("challenge.rds")

# write_rds(metadata, "compressed_mtc.rds", "xz", compression = 9L)
# write_rds(metadata,"challenge.rds", compress = 'gz') = saveRDS(metadata, "challenge.rds", compress = TRUE)

# feather包也是实现一种二进制形式，可以在多个编程语言之间共享；相比于RDS，速度更快。
write_feather(metadata,"challenge.feather")
read_feather("challenge.feather")
## # A tibble: 80 x 7
##    orig.ident    nCount_RNA nFeature_RNA RNA_snn_res.0.8 letter.idents groups
##    <fct>              <dbl>        <int> <fct>           <fct>         <chr> 
##  1 SeuratProject         70           47 0               A             g2    
##  2 SeuratProject         85           52 0               A             g1    
##  3 SeuratProject         87           50 1               B             g2    
##  4 SeuratProject        127           56 0               A             g2    
##  5 SeuratProject        173           53 0               A             g2    
##  6 SeuratProject         70           48 0               A             g1    
##  7 SeuratProject         64           36 0               A             g1    
##  8 SeuratProject         72           45 0               A             g1    
##  9 SeuratProject         52           36 0               A             g1    
## 10 SeuratProject        100           41 0               A             g1    
## # ... with 70 more rows, and 1 more variable: RNA_snn_res.1 <fct>

1-2 将R数据写入Excel

write.xlsx(x, file, sheetName = “Sheet1”, col.names = TRUE, row.names = TRUE, append = FALSE)
write.xlsx2(x, file, sheetName = “Sheet1”, col.names = TRUE, row.names = TRUE, append = FALSE)

library("xlsx")
# Write the first data set in a new workbook
write.xlsx(USArrests, file = "myworkbook.xlsx", sheetName = "USA-ARRESTS", append = FALSE)
# Add a second data set in a new worksheet
write.xlsx(mtcars, file = "myworkbook.xlsx", sheetName="MTCARS", append=TRUE)
# Add a third data set
write.xlsx(iris, file = "myworkbook.xlsx", sheetName="IRIS", append=TRUE)

2 tidyr包：整理数据

2-1 tidyr包提供了几个新函数，能够有效整理数据

gather(): makes “wide” data longer
spread(): makes “long” data wider
separate(): splits a single column into multiple columns
unite(): combines multiple columns into a single column

library(tidyr)
library(dplyr)
DF <- data.frame(Group=rep(1:3, each=4), Year=rep(2006:2009, times=3), 
                 Qtr.1 = rep(seq(14, 20, 2), 3),  Qtr.2 = rep(seq(12, 18, 2), 3),
                 Qtr.3 = rep(seq(16, 22, 2), 3),  Qtr.4 = rep(seq(18, 24, 2), 3))

long_DF <- DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4)
head(long_DF)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr.1      14
## 2     1 2007   Qtr.1      16
## 3     1 2008   Qtr.1      18
## 4     1 2009   Qtr.1      20
## 5     2 2006   Qtr.1      14
## 6     2 2007   Qtr.1      16

wide_DF <- long_DF %>% spread(Quarter, Revenue)
head(wide_DF, 10)
##    Group Year Qtr.1 Qtr.2 Qtr.3 Qtr.4
## 1      1 2006    14    12    16    18
## 2      1 2007    16    14    18    20
## 3      1 2008    18    16    20    22
## 4      1 2009    20    18    22    24
## 5      2 2006    14    12    16    18
## 6      2 2007    16    14    18    20
## 7      2 2008    18    16    20    22
## 8      2 2009    20    18    22    24
## 9      3 2006    14    12    16    18
## 10     3 2007    16    14    18    20

separate_DF <- long_DF %>% separate(Quarter, c("Time_Interval", "Interval_ID"))
head(separate_DF, 10)
##    Group Year Time_Interval Interval_ID Revenue
## 1      1 2006           Qtr           1      14
## 2      1 2007           Qtr           1      16
## 3      1 2008           Qtr           1      18
## 4      1 2009           Qtr           1      20
## 5      2 2006           Qtr           1      14
## 6      2 2007           Qtr           1      16
## 7      2 2008           Qtr           1      18
## 8      2 2009           Qtr           1      20
## 9      3 2006           Qtr           1      14
## 10     3 2007           Qtr           1      16

unite_DF <- separate_DF %>% unite(Quarter, Time_Interval, Interval_ID, sep = ".")
head(unite_DF, 10)
##    Group Year Quarter Revenue
## 1      1 2006   Qtr.1      14
## 2      1 2007   Qtr.1      16
## 3      1 2008   Qtr.1      18
## 4      1 2009   Qtr.1      20
## 5      2 2006   Qtr.1      14
## 6      2 2007   Qtr.1      16
## 7      2 2008   Qtr.1      18
## 8      2 2009   Qtr.1      20
## 9      3 2006   Qtr.1      14
## 10     3 2007   Qtr.1      16

DF %>% gather(Quarter, Revenue, Qtr.1:Qtr.4) %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr.1      14
## 2     1 2007   Qtr.1      16
DF %>% gather(Quarter, Revenue, -Group, -Year) %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr.1      14
## 2     1 2007   Qtr.1      16
DF %>% gather(Quarter, Revenue, 3:6) %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr.1      14
## 2     1 2007   Qtr.1      16
DF %>% gather(Quarter, Revenue, Qtr.1, Qtr.2, Qtr.3, Qtr.4) %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr.1      14
## 2     1 2007   Qtr.1      16

long_DF %>% separate(Quarter, c("Time_Interval", "Interval_ID")) %>% head(2)
##   Group Year Time_Interval Interval_ID Revenue
## 1     1 2006           Qtr           1      14
## 2     1 2007           Qtr           1      16
long_DF %>% separate(Quarter, c("Time_Interval", "Interval_ID"), sep = "\\.") %>% head(2)
##   Group Year Time_Interval Interval_ID Revenue
## 1     1 2006           Qtr           1      14
## 2     1 2007           Qtr           1      16
separate_DF %>% unite(Quarter, Time_Interval, Interval_ID, sep = "_") %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr_1      14
## 2     1 2007   Qtr_1      16
separate_DF %>% unite(Quarter, Time_Interval, Interval_ID) %>% head(2)
##   Group Year Quarter Revenue
## 1     1 2006   Qtr_1      14
## 2     1 2007   Qtr_1      16

3 dplyr包：有效数据操作

3-1 tidyr包提供了几个新函数，能够有效操作数据

filter() picks cases based on their values.
select() picks variables based on their names.
arrange() changes the ordering of the rows.
mutate() adds new variables that are functions of existing variables.
summarise() reduces multiple values down to a single summary.

Rows:

filter()chooses rows based on column values.
arrange()changes the order of the rows.
slice()chooses rows based on location.

Columns:

select()changes whether or not a column is included.
rename()changes the name of columns.
mutate()changes the values of columns and creates new columns.
relocate()changes the order of the columns.

Groups of rows:

summarise()collapses a group into a single row.

library(dplyr)
head(starwars)
## # A tibble: 6 x 14
##   name     height  mass hair_color  skin_color eye_color birth_year sex   gender
##   <chr>     <int> <dbl> <chr>       <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sk~    172    77 blond       fair       blue            19   male  mascu~
## 2 C-3PO       167    75 <NA>        gold       yellow         112   none  mascu~
## 3 R2-D2        96    32 <NA>        white, bl~ red             33   none  mascu~
## 4 Darth V~    202   136 none        white      yellow          41.9 male  mascu~
## 5 Leia Or~    150    49 brown       light      brown           19   fema~ femin~
## 6 Owen La~    178   120 brown, grey light      blue            52   male  mascu~
## # ... with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>

starwars %>% 
  dplyr::select(name, ends_with("color"))
## # A tibble: 87 x 4
##    name               hair_color    skin_color  eye_color
##    <chr>              <chr>         <chr>       <chr>    
##  1 Luke Skywalker     blond         fair        blue     
##  2 C-3PO              <NA>          gold        yellow   
##  3 R2-D2              <NA>          white, blue red      
##  4 Darth Vader        none          white       yellow   
##  5 Leia Organa        brown         light       brown    
##  6 Owen Lars          brown, grey   light       blue     
##  7 Beru Whitesun lars brown         light       blue     
##  8 R5-D4              <NA>          white, red  red      
##  9 Biggs Darklighter  black         light       brown    
## 10 Obi-Wan Kenobi     auburn, white fair        blue-gray
## # ... with 77 more rows

starwars %>% 
  mutate(name, bmi = mass / ((height / 100)  ^ 2)) %>%
  dplyr::filter(species == "Droid") %>%
  dplyr::select(name:mass, bmi) %>%
  arrange(desc(mass))
## # A tibble: 6 x 4
##   name   height  mass   bmi
##   <chr>   <int> <dbl> <dbl>
## 1 IG-88     200   140  35  
## 2 C-3PO     167    75  26.9
## 3 R2-D2      96    32  34.7
## 4 R5-D4      97    32  34.0
## 5 R4-P17     96    NA  NA  
## 6 BB8        NA    NA  NA

starwars %>%
  group_by(species) %>%
  summarise(n = n(), mass = mean(mass, na.rm = TRUE)) %>%
  filter(n > 1, mass > 50)
## # A tibble: 8 x 3
##   species      n  mass
##   <chr>    <int> <dbl>
## 1 Droid        6  69.8
## 2 Gungan       3  74  
## 3 Human       35  82.8
## 4 Kaminoan     2  88  
## 5 Mirialan     2  53.1
## 6 Twi'lek      2  55  
## 7 Wookiee      2 124  
## 8 Zabrak       2  80

4 purrr包：函数式编程

用R写循环从低到高有三种境界：手动 for 循环，apply 函数族，purrr 包泛函式编程。

map(.x, .f, …): Apply a function to each element of a list or vector. map(x, is.logical)
map2(.x, .y, .f, …): Apply a function to pairs of elements from two lists, vectors. map2(x, y, sum)
pmap(.l, .f, …): Apply a function to groups of elements from list of lists, vectors. pmap(list(x, y, z), sum, na.rm = TRUE)

map系列函数的返回值如下：

map_chr(.x, .f): 返回字符型向量
map_lgl(.x, .f): 返回逻辑型向量
map_dbl(.x, .f): 返回实数型向量
map_int(.x, .f): 返回整数型向量
map_dfr(.x, .f): 返回数据框列表，再 bind_rows 按行合并为一个数据框
map_dfc(.x, .f): 返回数据框列表，再 bind_cols 按列合并为一个数据框

library(purrr)

infos <- tibble(
  born=c(1990, 1992, 2000, 1985), 
  family=c("张", "李", "王", "赵"),
  name=c("三", "四", "五", "六"))
infos
## # A tibble: 4 x 3
##    born family name 
##   <dbl> <chr>  <chr>
## 1  1990 张     三   
## 2  1992 李     四   
## 3  2000 王     五   
## 4  1985 赵     六
infos2 <- infos

age <- purrr::map(infos$born, function(x){2020-x}) %>% unlist()
infos$age <- age
infos
## # A tibble: 4 x 4
##    born family name    age
##   <dbl> <chr>  <chr> <dbl>
## 1  1990 张     三       30
## 2  1992 李     四       28
## 3  2000 王     五       20
## 4  1985 赵     六       35

fullname <- purrr::map2(infos$family, infos$name, function(x, y){paste0(x,y)}) %>% unlist()
infos$fullname <- fullname
infos
## # A tibble: 4 x 5
##    born family name    age fullname
##   <dbl> <chr>  <chr> <dbl> <chr>   
## 1  1990 张     三       30 张三    
## 2  1992 李     四       28 李四    
## 3  2000 王     五       20 王五    
## 4  1985 赵     六       35 赵六

fullname <- purrr::pmap(list(x=infos$family, y=infos$name, z=infos$born), function(x, y, z) paste0(x, y, z)) %>% unlist()
infos$fullname2 <- fullname
infos
## # A tibble: 4 x 6
##    born family name    age fullname fullname2
##   <dbl> <chr>  <chr> <dbl> <chr>    <chr>    
## 1  1990 张     三       30 张三     张三1990 
## 2  1992 李     四       28 李四     李四1992 
## 3  2000 王     五       20 王五     王五2000 
## 4  1985 赵     六       35 赵六     赵六1985

# 
purrr::pmap(list(x=infos$born), function(x){2020-x}) %>% unlist()
## [1] 30 28 20 35
purrr::pmap(list(x=infos$born, y=infos$name), function(x, y){paste0(x,y)}) %>% unlist()
## [1] "1990三" "1992四" "2000五" "1985六"
purrr::pmap(list(x=infos$family, y=infos$name, z=infos$born), function(x, y, z) paste0(x, y, z)) %>% unlist()
## [1] "张三1990" "李四1992" "王五2000" "赵六1985"

names(infos2) <- c('x', 'y', 'z')
infos2
## # A tibble: 4 x 3
##       x y     z    
##   <dbl> <chr> <chr>
## 1  1990 张    三   
## 2  1992 李    四   
## 3  2000 王    五   
## 4  1985 赵    六

purrr::pmap(infos2['x'], function(x){2020-x}) %>% unlist()
## [1] 30 28 20 35
purrr::pmap(infos2[c('x', 'y')], function(x, y){paste0(x,y)}) %>% unlist()
## [1] "1990张" "1992李" "2000王" "1985赵"
purrr::pmap(infos2, function(x, y, z){paste0(x,y, z)}) %>% unlist()
## [1] "1990张三" "1992李四" "2000王五" "1985赵六"

比较匿名函数和公式

df <- mtcars %>% 
  dplyr::select(mpg, cyl, wt) %>% 
  group_nest(cyl) 

# formula
df %>% mutate(model = map(data, ~ lm(mpg ~ wt, data = .x) ))
## # A tibble: 3 x 3
##     cyl               data model 
##   <dbl> <list<tibble[,2]>> <list>
## 1     4           [11 x 2] <lm>  
## 2     6            [7 x 2] <lm>  
## 3     8           [14 x 2] <lm>
map_dbl(mtcars, ~ length(unique(.x)))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6

# anonymous function
df %>% mutate(model = map(data, function(x) {lm(mpg ~ wt, x)} ))
## # A tibble: 3 x 3
##     cyl               data model 
##   <dbl> <list<tibble[,2]>> <list>
## 1     4           [11 x 2] <lm>  
## 2     6            [7 x 2] <lm>  
## 3     8           [14 x 2] <lm>
map_dbl(mtcars, function(x) length(unique(x)))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6


mtcars %>%
  split(.$cyl) %>%
  map(~ lm(mpg ~ wt, data = .)) %>%
  map(summary) %>%
  map("r.squared")
## $`4`
## [1] 0.5086326
## 
## $`6`
## [1] 0.4645102
## 
## $`8`
## [1] 0.4229655

mtcars %>%
  split(.$cyl) %>%
  map(~ lm(mpg ~ wt, data = .)) %>%
  map(summary) %>%
  map_dbl("r.squared")
##         4         6         8 
## 0.5086326 0.4645102 0.4229655

mtcars %>%
  split(.$cyl) %>% 
  map(~ lm(mpg ~ wt, data = .)) %>%
  map(summary) %>%
  map_dfr("r.squared")
## # A tibble: 1 x 3
##     `4`   `6`   `8`
##   <dbl> <dbl> <dbl>
## 1 0.509 0.465 0.423

mtcars %>%
  mutate(cyl = factor(cyl),
         am = factor(am)) %>%
  dplyr::select(mpg, disp, hp) %>%
  map(~ aov(.x ~ cyl * am, data = mtcars)) %>%
  map_dfr(~ broom::tidy(.), .id = 'source') %>%
  mutate(p.value = round(p.value, 5))
## # A tibble: 12 x 7
##    source term         df    sumsq    meansq statistic p.value
##    <chr>  <chr>     <dbl>    <dbl>     <dbl>     <dbl>   <dbl>
##  1 mpg    cyl           1    818.     818.      94.6    0     
##  2 mpg    am            1     37.0     37.0      4.28   0.0479
##  3 mpg    cyl:am        1     29.4     29.4      3.41   0.0755
##  4 mpg    Residuals    28    242.       8.64    NA     NA     
##  5 disp   cyl           1 387454.  387454.     138.     0     
##  6 disp   am            1   9405.    9405.       3.35   0.0779
##  7 disp   cyl:am        1    688.     688.       0.245  0.624 
##  8 disp   Residuals    28  78637.    2808.      NA     NA     
##  9 hp     cyl           1 100984.  100984.      91.3    0     
## 10 hp     am            1   7378.    7378.       6.67   0.0153
## 11 hp     cyl:am        1   6403.    6403.       5.79   0.0230
## 12 hp     Residuals    28  30961.    1106.      NA     NA

参考资料

[1]

R如何实现更快读取数据——使用redr包: https://www.jianshu.com/p/71b4fd0f0a19

[2]

Writing Data From R to Excel Files (xls|xlsx): http://www./english/wiki/writing-data-from-r-to-excel-files-xls-xlsx#writing-excel-files-using-xlsx-package

[3]

Reshaping Your Dat with tidyr: https://uc-r./tidyr

[4]

数据重塑之tidyr包: https://zhuanlan.zhihu.com/p/22265154

[5]

Introduction to dplyr: https://cran./web/packages/dplyr/vignettes/dplyr.html

[6]

dplyr包: https://www.jianshu.com/p/f8b9e6bd52a2

[7]

dplyr新功能解读: https://zhuanlan.zhihu.com/p/145839517

[8]

优雅的循环迭代：purrr包: https://zhuanlan.zhihu.com/p/168772624