R语言实用函数整理

gearss 2018-04-16

展开全文

初始化

options(stringsAsFactors=F,scipen=99)
rm(list=ls());gc()
getwd()  # 获得工作路径信息
setwd()  # 设置工作路径（需传入目标路径）
2
3
4

清空控制台

快捷键 Control+L

获取目录下所有文件名

filenames=dir("/Users/yuyin/Downloads/数据/Excel数据")
##or推荐第二种
setwd("/Users/yuyin/Downloads/数据/Excel数据")
filenames=dir()1
2
3
4

读取文件输出文件

require(data.table)
library(data.table)
da<- fread("/Users/yuyin/Downloads/train_all_weekday.csv",header = FALSE)
#读取gbk编码文件
u<- read.csv("JData_User.csv",fileEncoding='gbk',header = TRUE)
write.table (out, file ="/Users/yuyin/Downloads/2.csv",sep =",",row.names = F,col.names=F,quote =F)1
2
3
4
5
6

读写xlsx文件

library("xlsx")
t=read.xlsx('吉林2014.xlsx',sheetIndex=1)
write.xlsx(t, file="./s.xlsx")1
2
3

SQL查询

library(sqldf)
re=sqldf("select V1,V2,V6 from da where V2>=20161004 and V2<=20161017 order by V1,V2")1
2

绘图

library(recharts)
echartr(tmp,as.character(tmp$V2),V6,type = 'line')1
2

分位数

#默认返回五个分位点（0%、25%、50%、75%、100%）
quantile(ck)  
#自定义分位数 
quantile(ck,  probs = c(0.85,0.95))
median(ck) #中位数
mean(ck) #均值
2
3
4
5
6

查看行数

nrow(df) #df为数据框对象

字符串操作

拼接字符串

##方法一
paste(Y,'/',m,'/',d,sep='')
##方法二
library(stringr)
pout=str_c(path,name,collapse='')1
2
3
4
5

替换字符串

name=str_replace_all(name,"/","_")1

DF去重

tt=unique(tt)1

合并数据框

合并行
rbind(t1,t2)
合并列
cbind(t1,t2)1
2
3
4

DF排序

x=x[order(x$bad_comment_rate,decreasing=F),]1

生成随机数

runif(n, min=0, max=1) 均匀分布
rnorm(n, mean=0, sd=1) 正态分布
sample(seq(0,100,by=1),1,replace=TRUE) 抽样生成随机数1
2
3

最大最小归一化

b1=(data[,1]-min(data[,1]))/(max(data[,1])-min(data[,1]))  
b1=(d-min(d))/(max(d)-min(d))  1
2

日期转换

dateChar<-("2014-04-06")
dtV<-as.POSIXct(dateChar,format="%Y-%m-%d")
##或者dtV<-as.Date(dateChar,format="%Y-%m-%d")
format(dtV,"%Y/%m/%d %H:%M:%S")
#转换为2014/4/6
Y=format(dtV,"%Y")
m=as.character(as.numeric(format(dtV,"%m")))
d=as.character(as.numeric(format(dtV,"%d")))
dt<-paste(Y,'/',m,'/',d,sep='')1
2
3
4
5
6
7
8
9

计算时间差

d <- c('2013-12-05 18:43:00','2013-08-23 22:29:00')
difftime(d[2],d[1])
difftime(strptime(d, "%Y-%m-%d %H:%M:%S")[2],strptime(d, "%Y-%m-%d %H:%M:%S")[1],units='secs')1
2
3

高效数据清洗包dplyr代替sqldf

速度比sqldf快很多适合数据量大处理

library(dplyr)
#将数据整理成tibble（tbl_df）格式，打印更简洁；tbl_df()已在dplyr 1.0.0中弃用，改用as_tibble()
iris <- as_tibble(iris)
##变量筛选select  对应select  删除-
select(iris,Sepal.Length,Sepal.Width)
select(iris,-Species)
##对数据运算并添加为新列mutate() 对应SQL中的 select Sepal.Length*2 as t1
mutate(iris,t1=Sepal.Length*2)
##计算
#n(): 计算当前分组的行数
#n_distinct(x): 计算 x 中唯一值的个数
#first(x), last(x) 和 nth(x, n): 返回对应位置的值, 类似于 x[1], x[length(x)] 和 x[n]
##过滤filter  对应 where
filter(iris,Sepal.Length>5,Sepal.Width<4)
filter(iris,Sepal.Length>5 & Sepal.Width<4 & (Species == "setosa" | Species == "versicolor"))
##数据排序arrange  对应 order by
arrange(iris,Sepal.Length)
arrange(iris,desc(Sepal.Length))
##汇总group_by() 分组-汇总
group_by(iris, Species)
group_by(iris,Species,Petal.Width)  %>% summarise(c1=n(),c2=n_distinct(Species))
##计算summarise()
summarise(iris,c1=n(),c2=mean(Sepal.Length))
##多步操作连接符%>%
filter(iris,Sepal.Length>5,Sepal.Width<4) %>% summarise(c1=n(),c2=mean(Sepal.Length)) 
##抽样sample_n sample_frac
sample_n(iris,20) 
##连接（左/右/内/全连接）、半连接/反连接，以及按行的交集、并集、差集
left_join(a, b, by="x1")
right_join(a, b, by="x1")
inner_join(a, b, by="x1")##保留匹配的数据
full_join(a, b, by="x1")##保留所有数据（dplyr中没有outer_join，全外连接用full_join）
semi_join(a, b, by="x1") # 数据集a中能与数据集b匹配的记录
anti_join(a, b, by="x1") # 数据集a中与数据集b不匹配的记录
intersect(x, y): x 和 y 的交集（按行）
union(x, y): x 和 y 的并集（按行）
setdiff(x, y): x 和 y 的差集 （在x中不在y中）
##列合并
bind_cols(y, z)
##行合并
bind_rows(y, z)1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

参考(特别是文章后面的翻译图片)

查询相关R包

library(sos)
findFn('onehot')
##具体用法参见sos包的帮助文档
2
3

本站是提供个人知识管理的网络存储空间，所有内容均由用户发布，不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息，谨防诈骗。如发现有害或侵权内容，请点击一键举报。

转藏分享

QQ空间 QQ好友新浪微博微信

献花（0） +1

来自： gearss > 《R语言情况和基本各种函数简单介绍》

举报/认领

0条评论

发表

请遵守用户评论公约

类似文章 更多

gearss

关注对话

TA的最新馆藏

[转] 特别关注·憨佗评《三国演义》
名言集
一只不会飞的鹦鹉（深度好文）
老鼠偷了人类的大米，人们说它狡猾；人类偷了蜜蜂的蜂蜜，却说它很勤劳
2022国际数学奥赛，中国少年全员满分摘金！那些藏在奥数背后的故事
丙吉为什么只问牛喘？

喜欢该文的人也喜欢更多

热门阅读换一换