【原】dplyr总结篇

医科研 2021-01-25

展开全文

欢迎来到医科研，这里是白介素2的读书笔记，跟我一起聊临床与科研的故事, 生物医学数据挖掘，R语言，TCGA、GEO数据挖掘。

dplyr-总结

有必要对dplyr进行一个总结

对行处理

数据处理中的行也称为观测（observation）。对行的操作主要包括：filter、arrange 和 slice
- filter 用于筛选行：只保留条件为 TRUE 的行，其中 m %in% (1:10) 这类用法比较重要
- arrange 用于排列行，desc() 用于设定降序排列，作用类似于 sort 函数的 decreasing = TRUE
- slice 用于按位置选取或删减行：正数索引保留对应行，负数索引删去对应行

举基因表达矩阵的例子来说明更生动

# Attach the tidyverse packages (dplyr, tibble and stringr functions are used below).
library(tidyverse)
# Restore the objects saved in expma.Rdata; the code below expects one named `expma`.
load("expma.Rdata")
# Preview the first rows of the expression matrix.
head(expma)
##
  GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## 1007_s_at 15630.200 17048.800 13667.500 15138.800 10766.600 15680.800
## 1053_at    3614.400  3563.220  2604.650  1945.710  3371.290  3406.660
## 117_at     1032.670  1164.150   510.692  5061.200   452.166   400.477
## 121_at     5917.800  6826.670  4562.440  5870.130  3869.480  3680.440
## 1255_g_at   224.525   395.025   207.087   164.835   111.609   130.123
## 1294_at     799.786   839.787   592.434   593.632   431.526   332.962
##

# Report the dimensions of the raw expression matrix.
dim(expma)
# Build the working tibble: drop rows containing NA, convert to a data
# frame, and move the row names (probe IDs) into a regular "ID" column.
data <- expma %>%
  na.omit() %>%
  as.data.frame() %>%
  rownames_to_column(var = "ID") %>%
  as_tibble()

## filter
# filter() keeps only the rows whose condition evaluates to TRUE.
# The expression values are doubles, so compare with near() (tolerance-based)
# rather than ==, which can miss a value because of floating-point rounding.
data %>%
  filter(near(GSM188013, 3614.4))
##
## # A tibble: 1 x 7
##   ID      GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
##   <chr>       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 1053_at     3614.     3563.     2605.     1946.     3371.     3407.
##

# Keep the first row for every ID (filter() retains rows where the
# condition is TRUE), then tabulate the same flag with count(), which
# works much like table().
data %>%
  filter(!duplicated(ID)) %>%
  count(!duplicated(ID))

## arrange
# Sort by GSM188013 in descending order and keep the nine highest rows.
# The previous chain slice(1:n()) %>% slice(-10:-n()) %>% slice(1:9) was
# redundant, and -10:-n() counts upward when fewer than 10 rows remain
# (n() = 9 gives c(-10, -9)), which drops a row that should be kept.
# slice(1:9) alone is enough: positions past the last row are ignored.
# Negative positions are still the way to remove rows, e.g. slice(-1).
data %>%
  filter(!is.na(ID)) %>%
  arrange(desc(GSM188013)) %>%
  slice(1:9)
##
## # A tibble: 9 x 7
##   ID          GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 211542_x_at    115359   112557     95911.   115259    104002    119307 
## 2 212869_x_at    111036    98746     86384.   109036     94431.   101598 
## 3 201429_s_at    110207   106494     77982.   104826     86286.    95794.
## 4 200801_x_at    107297   103409     81221     88992.    80843.   105781 
## 5 200817_x_at    107276   100417     87705     94664.    89916.   106468 
## 6 207783_x_at    106812   103959     78483.    99711.    88342.    98408.
## 7 212661_x_at    104459   104613     86729.   106335     98321.   109809 
## 8 217740_x_at    104423    97527.    79070.   101645     85800.    91210.
## 9 201257_x_at    104330   103574     82088.    98103.    99495.   105121
##

对列处理

数据处理时，经常需要选择自己感兴趣的列，我们也叫变量。主要包括 select 和 mutate
select函数可以做到筛选列，几乎可以做到所有筛选，对大批量数据还可以应用正则匹配
select可以筛选，也可以反向选择删除一些变量；可配合 starts_with()、ends_with()、contains()、matches() 等辅助函数使用。容易忽略的是，select 配合 everything() 可以把自己感兴趣的变量移动到前面去

# Rename the sample columns to A1, A2, ... for brevity. The names are derived
# from the actual number of columns instead of a hard-coded 1:6, so the
# assignment cannot go out of step with the data.
colnames(data)[-1] <- str_c("A", seq_len(ncol(data) - 1))
data %>%
  select(1:6) %>%                   # keep the first six columns
  select(ID, A5, everything()) %>%  # move A5 right behind ID
  select(starts_with("A")) %>%      # keep only columns whose name starts with "A"
  # NOTE: mean(A1 + A2) is one number (the mean over all rows) that is
  # recycled to every row; use (A1 + A2) / 2 for a per-row average.
  mutate(A6 = mean(A1 + A2)) %>%    # add a new column
  transmute(A2 = mean(A1 + A2))     # keep only the newly computed column
  ##
## # A tibble: 22,283 x 1
##       A2
##    <dbl>
##  1 6910.
##  2 6910.
##  3 6910.
##  4 6910.
##  5 6910.
##  6 6910.
##  7 6910.
##  8 6910.
##  9 6910.
## 10 6910.
## # ... with 22,273 more rows
##

分组摘要

对于数据的分析解释常需要对其进行分组计算
主要的函数有group_by 与 summarize

# Ungrouped summary: the overall mean of A1 (of limited use on its own).
# na.rm = TRUE has to be an argument of mean(); when passed to summarize()
# it only creates an extra logical column named na.rm.
data %>%
  summarize(x = mean(A1, na.rm = TRUE))
##
## # A tibble: 1 x 1
##       x
##   <dbl>
## 1 3459.
##
## Load the probe annotation; the code below expects an object named `probe`
load("probe.Rdata")
head(probe)
##
##          ID Gene Symbol ENTREZ_GENE_ID
## 2   1053_at        RFC2           5982
## 3    117_at       HSPA6           3310
## 4    121_at        PAX8           7849
## 5 1255_g_at      GUCA1A           2978
## 7   1316_at        THRA           7067
## 8   1320_at      PTPN21          11099
##
# Preview the expression tibble before it is joined with the annotation.
head(data)
##
## # A tibble: 6 x 7
##   ID            A1     A2     A3     A4     A5     A6
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 1007_s_at 15630. 17049. 13668. 15139. 10767. 15681.
## 2 1053_at    3614.  3563.  2605.  1946.  3371.  3407.
## 3 117_at     1033.  1164.   511.  5061.   452.   400.
## 4 121_at     5918.  6827.  4562.  5870.  3869.  3680.
## 5 1255_g_at   225.   395.   207.   165.   112.   130.
## 6 1294_at     800.   840.   592.   594.   432.   333.
##
# Attach the probe annotation by probe ID, give the annotation columns
# short names, and move the identifier columns to the front.
data <- data %>%
  inner_join(probe, by = "ID") %>%
  rename(genename = "Gene Symbol", geneid = ENTREZ_GENE_ID) %>%
  select(ID, genename, geneid, everything())
head(data)
##
## # A tibble: 6 x 9
##   ID        genename geneid    A1    A2    A3    A4    A5    A6
##   <chr>     <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1053_at   RFC2     5982   3614. 3563. 2605. 1946. 3371. 3407.
## 2 117_at    HSPA6    3310   1033. 1164.  511. 5061.  452.  400.
## 3 121_at    PAX8     7849   5918. 6827. 4562. 5870. 3869. 3680.
## 4 1255_g_at GUCA1A   2978    225.  395.  207.  165.  112.  130.
## 5 1316_at   THRA     7067    722.  934.  455.  592.  350.  525.
## 6 1320_at   PTPN21   11099   135.  100.  353.  230.  230.  341.
##
dim(data)## the author reports 20878 rows (noted as "genes"); each row is a probe, and several probes can map to one gene

## group_by() combined with summarize(): per-group summaries
data %>%
  group_by(genename) %>%  # only defines the groups; nothing is computed yet
  summarize(
    count = n(),  # number of rows in each group
    # TRUE is spelled out: T is an ordinary variable that can be reassigned.
    # Only one output column (a1) is produced here, so this form does not
    # scale to summarising every sample column at once.
    a1 = mean(A1, na.rm = TRUE)
  )

## Average the probes that map to the same gene
data %>%
  # Keep the grouping column plus the sample columns only; every remaining
  # non-grouping column is handed to the summary function.
  select(genename, A1:A6) %>%
  group_by(genename) %>%  # only defines the groups; nothing is computed yet
  summarise_all(.funs = mean)  # mean can be swapped for another function
##
## # A tibble: 12,549 x 7
##    genename     A1     A2     A3     A4     A5     A6
##    <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 ""       1335.  1207.   756.   936.   800.   849. 
##  2 A1CF     2017.  1569.   982.  1363.   856.   839. 
##  3 A2M        93.7   84.8  282.    49.3   45.9   46.9
##  4 A4GALT    211.   225.   409.   131.    59.9  383. 
##  5 A4GNT    1211.  1161.   671.   439.   690.   572. 
##  6 AAAS      616.   593.  3334.  2370.  1581.  1714. 
##  7 AACS     5697.  5413.  4494.  4389.  4659.  4447. 
##  8 AADAC      89.0   57.4   30.3   58.7   51.6   47.7
##  9 AAGAB    1895.  1824.  2340.  2128.  2777.  2921. 
## 10 AAK1      419.   484.   309.   520.   389.   446. 
## # ... with 12,539 more rows
##
## Example
# Summarise every non-grouping column per species in one call; no new
# column names have to be supplied.
iris %>%
  group_by(Species) %>%
  summarise_all(.funs = mean)
##
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03
##
## Same pattern as the previous example, but with a list of functions:
## several summaries (here min and max) are computed at once, and each
## output column carries the function name as a suffix.
iris %>%
  group_by(Species) %>%
  summarise_all(.funs = list(~min(.), ~max(.)))
##
## # A tibble: 3 x 9
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               4.3             2.3              1               0.1
## 2 versic~              4.9             2                3               1  
## 3 virgin~              4.9             2.2              4.5             1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## #   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
##
## summarise_at(): summarise only the selected columns
starwars %>%
  summarise_at(vars(height, mass), mean, na.rm = TRUE)

group_by与filter联用

例如我们要找出有 3 个以上探针对应同一个基因的分组：group_by 与 filter 联用即可按分组进行筛选

# Keep only the genes represented by more than three probes; the result
# stays grouped by genename.
data %>%
  group_by(genename) %>%
  filter(n() > 3)
## # A tibble: 4,081 x 9
## # Groups:   genename [662]
##    ID          genename geneid     A1     A2     A3     A4     A5     A6
##    <chr>       <chr>    <chr>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 121_at      PAX8     7849    5918.  6827.  4562.  5870.  3869.  3680.
##  2 1316_at     THRA     7067     722.   934.   455.   592.   350.   525.
##  3 1320_at     PTPN21   11099    135.   100.   353.   230.   230.   341.
##  4 1494_f_at   CYP2A6   1548    1128.  1495.   965.  1567.   647.   731.
##  5 160020_at   MMP14    4323    2415.  2017.  2408.  2618.  1696.  1605.
##  6 177_at      PLD1     5337    1384.  1506.   586.   561.   608.   400.
##  7 200014_s_at HNRNPC   3183   13273. 12936. 15264  13516. 19769. 17326.
##  8 200047_s_at YY1      7528   22706. 23938. 18820. 18727. 22232. 20630.
##  9 200067_x_at SNX3     8724   10194. 10602.  9283.  9883. 11139. 10550.
## 10 200073_s_at HNRNPD   3184   15528  15212. 11880.  9753. 18304  16338.
## # ... with 4,071 more rows

distinct函数

去重函数，可以发现它们也是有配套的distinct_all等函数

# Two columns of 100 random draws from 1:10 taken with replacement, so
# repeated (x, y) pairs are expected. `replace` is written out in full
# instead of relying on partial argument matching of `rep`.
df <- tibble(
  x = sample(10, 100, replace = TRUE),
  y = sample(10, 100, replace = TRUE)
)
head(df)
## # A tibble: 6 x 2
##       x     y
##   <int> <int>
## 1    10     4
## 2     3     3
## 3     6     2
## 4     5     9
## 5     8     9
## 6     3     2
nrow(df)
## [1] 100
## De-duplicate on the combination of x and y
nrow(distinct(df))## 68 distinct rows remain in this run (the data are random)
## [1] 68
nrow(distinct(df, x, y))
## [1] 68
## De-duplicate on a single column
distinct(df, x)## 10 distinct values
## # A tibble: 10 x 1
##        x
##    <int>
##  1    10
##  2     3
##  3     6
##  4     5
##  5     8
##  6     7
##  7     2
##  8     4
##  9     1
## 10     9
distinct(df, y)## distinct values of y
## # A tibble: 10 x 1
##        y
##    <int>
##  1     4
##  2     3
##  3     2
##  4     9
##  5     5
##  6     7
##  7    10
##  8     6
##  9     1
## 10     8
# De-duplicate on x only and keep the other columns (the first row seen for
# each x value is retained); this is the form to use for dropping duplicates.
distinct(df, x, .keep_all = TRUE)
## # A tibble: 10 x 2
##        x     y
##    <int> <int>
##  1    10     4
##  2     3     3
##  3     6     2
##  4     5     9
##  5     8     9
##  6     7     7
##  7     2     4
##  8     4     3
##  9     1     4
## 10     9     9
## distinct_all() and the related scoped variants
# Small example in which every row appears twice.
df <- tibble(x = rep(2:5, each = 2) / 2, y = rep(2:3, each = 4) / 2)
df
## # A tibble: 8 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1     1  
## 3   1.5   1  
## 4   1.5   1  
## 5   2     1.5
## 6   2     1.5
## 7   2.5   1.5
## 8   2.5   1.5
distinct_all(df)## same as de-duplicating on the combination of x and y
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
# De-duplicate on the columns selected with vars().
distinct_at(df, vars(x,y))
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
# De-duplicate on the columns for which the predicate (is.numeric) is TRUE.
distinct_if(df, is.numeric)
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
## Apply a function (here round) to every column before de-duplicating
distinct_all(df, round)
## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     2     2
arrange_all(df, list(~round(.)))## sort by all columns together, using the rounded values as sort keys
## # A tibble: 8 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1     1  
## 3   1.5   1  
## 4   1.5   1  
## 5   2     1.5
## 6   2     1.5
## 7   2.5   1.5
## 8   2.5   1.5