# R Study Note 6
# Source: 争子俱乐部 blog post, 2013-06-27
# WARNING: clears the entire global workspace. Acceptable at the top of an
# interactive study session, but avoid rm(list = ls()) in reusable scripts —
# it silently destroys the caller's objects.
rm(list=ls())

################回归分析###########

#### Simple linear regression ####
# Fit y on x with an explicit intercept term (y ~ 1 + x is the same model
# as y ~ x; the "1" just makes the intercept visible in the formula).
x <- c(0.10, 0.11, 0.12, 0.13, 0.14, 0.15,
       0.16, 0.17, 0.18, 0.20, 0.21, 0.23)
y <- c(42.0, 43.5, 45.0, 45.5, 45.0, 47.5,
       49.0, 53.0, 50.0, 55.0, 55.0, 60.0)
lm.sol <- lm(y ~ 1 + x)
summary(lm.sol)

#### Interval estimates for the coefficients ####
# beta.int: (1 - alpha) confidence intervals for the coefficients of a
# fitted lm model `fm`. Returns a matrix with one row per coefficient and
# columns Estimate, Left, Right.
beta.int <- function(fm, alpha = 0.05) {
  coef_tab <- summary(fm)$coefficients      # estimates and standard errors
  crit <- qt(1 - alpha / 2, fm$df.residual) # two-sided t critical value
  est <- coef_tab[, 1]
  half_width <- coef_tab[, 2] * crit
  out <- cbind(Estimate = est,
               Left  = est - half_width,
               Right = est + half_width)
  # dimnames(coef_tab)[[1]] is the first component of the dimnames list,
  # i.e. the row (coefficient) names of the coefficient table.
  rownames(out) <- dimnames(coef_tab)[[1]]
  out
}
beta.int(lm.sol)
beta.int(lm.sol)

#### Prediction ####
# 95% prediction interval for the response at a new x value.
new <- data.frame(x = 0.16)  # data frame holding the prediction point
lm.pred <- predict(lm.sol, new, interval = "prediction", level = 0.95)
lm.pred



#### Example: Forbes data ####
# Columns: F (boiling point), h, log, and log100 (= 100 * log column).
X <- matrix(
  c(194.5, 20.79, 1.3179, 131.79,
    194.3, 20.79, 1.3179, 131.79,
    197.9, 22.40, 1.3502, 135.02,
    198.4, 22.67, 1.3555, 135.55,
    199.4, 23.15, 1.3646, 136.46,
    199.9, 23.35, 1.3683, 136.83,
    200.9, 23.89, 1.3782, 137.82,
    201.1, 23.99, 1.3800, 138.00,
    201.4, 24.02, 1.3806, 138.06,
    201.3, 24.01, 1.3805, 138.05,
    203.6, 25.14, 1.4004, 140.04,
    204.6, 26.57, 1.4244, 142.44,
    209.5, 28.49, 1.4547, 145.47,
    208.6, 27.76, 1.4434, 144.34,
    210.7, 29.04, 1.4630, 146.30,
    211.9, 29.88, 1.4754, 147.54,
    212.2, 30.06, 1.4780, 147.80),
  ncol = 4, byrow = TRUE,                              # fill row by row
  dimnames = list(1:17, c("F", "h", "log", "log100"))  # row/column names
)
X
forbes <- as.data.frame(X)       # as a data frame for modelling
plot(forbes$F, forbes$log100)    # scatter plot

# Regress log100 on boiling point F.
lm.for <- lm(log100 ~ F, data = forbes)
summary(lm.for)

abline(lm.for, col = "red")  # overlay fitted line on the scatter plot

# Residuals and residual plot (two equivalent extraction methods).
res.for <- residuals(lm.for); plot(res.for)
lm.for$residuals; plot(lm.for$residuals)

# Label residual no. 12 on the plot.
text(12, res.for[12], labels = 12, adj = 1.2)

# Refit with observation 12 removed.
i <- 1:17; forbes12 <- as.data.frame(X[i != 12, ])
lm.for12 <- lm(log100 ~ F, data = forbes12)
summary(lm.for12)


#### The lm() function ####
# Reference notes only — these were bare (non-executable) signatures in the
# original and aborted source(); they are kept here as comments.
# lm(formula, data, subset, weights, na.action,
#    method = "qr", model = TRUE, x = FALSE,
#    y = FALSE, qr = TRUE, singular.ok = TRUE,
#    contrasts = NULL, offset, ...)
#
# Typical usage: lm(formula, data = data.frame)

# Extracting model information (generic accessor functions):
# anova(object, ...)          # analysis-of-variance table
# coefficients(object, ...)   # model coefficients (fixed typo: "obeject")
# deviance(object)            # residual sum of squares
# formula(object)             # model formula
# plot(object)                # diagnostic plots
# predict(object, newdata = data.frame)  # predictions; newdata gives the
#                                        # points at which to predict
# print(object)               # print the fitted model
# residuals(object,           # residuals (fixed typo: "rediduals")
#           type = c("working", "response", "deviance",
#                    "pearson", "partial"))
# step(object)                # stepwise regression
# summary(object)             # model fit summary


#### Multiple linear regression ####
# The `blood` data set: response Y modelled on predictors X1 and X2.
blood <- data.frame(
  X1 = c(76.0, 91.5, 85.5, 82.5, 79.0, 80.5, 74.5,
         79.0, 85.0, 76.5, 82.0, 95.0, 92.5),
  X2 = c(50, 20, 20, 30, 30, 50, 60, 50, 40, 55,
         40, 40, 20),
  Y  = c(120, 141, 124, 126, 117, 125, 123, 125,
         132, 123, 132, 155, 147)
)
lm.sol <- lm(Y ~ X1 + X2, data = blood)
summary(lm.sol)

# Interval estimates for the coefficients (uses beta.int() defined above).
beta.int(lm.sol)

# Prediction at X1 = 80, X2 = 40.
new <- data.frame(X1 = 80, X2 = 40)
# Fixed typo: "lecvel" -> "level". The misspelled argument was silently
# swallowed by predict()'s "..." and the default level = 0.95 was used,
# so results happened to match — but the intent is now explicit.
lm.pred <- predict(lm.sol, new, interval = "prediction", level = 0.95)
lm.pred

# Refitting a model with update() — schematic examples only (not executable
# here: "production" and the fm* models are not defined in this file).
# Fixed typos: "mew.model" -> "new.model", "updata" -> "update".
# new.model <- update(old.model, new.formula)
# fm3 <- lm(y ~ x1 + x2 + x3, data = production)
# fm2 <- update(fm3, . ~ . - x3)       # drop x3
# fm4 <- update(fm3, . ~ . + x4)       # add x4; in new.formula "." stands
#                                      # for the corresponding old part
# sqrtfm4 <- update(fm4, sqrt(.) ~ .)  # transform the response


#### Worked example: toothpaste data ####
# Response Y modelled on predictors X1 and X2 (30 observations).
toothpaste <- data.frame(
  X1 = c(-0.05, 0.25, 0.60, 0,    0.25, 0.20, 0.15, 0.05, -0.15, 0.15,
          0.20, 0.10, 0.40, 0.45, 0.35, 0.30, 0.50, 0.50,  0.40, -0.05,
         -0.05, -0.10, 0.20, 0.10, 0.50, 0.60, -0.05, 0,   0.05,  0.55),
  X2 = c(5.50, 6.75, 7.25, 5.50, 7.00, 6.50, 6.75, 5.25, 5.25, 6.00,
         6.50, 6.25, 7.00, 6.90, 6.80, 6.80, 7.10, 7.00, 6.80, 6.50,
         6.25, 6.00, 6.50, 7.00, 6.80, 6.80, 6.50, 5.75, 5.80, 6.80),
  Y  = c(7.38, 8.51, 9.52, 7.50, 9.33, 8.28, 8.75, 7.87, 7.10, 8.00,
         7.89, 8.15, 9.10, 8.86, 8.90, 8.87, 9.26, 9.00, 8.75, 7.95,
         7.65, 7.27, 8.00, 8.50, 8.75, 9.21, 8.27, 7.67, 7.93, 9.26)
)

lm.sol <- lm(Y ~ X1 + X2, data = toothpaste)
summary(lm.sol)

# Scatter plots and fitted curves.
# attach(toothpaste) replaced with explicit data= / $ references so the
# data frame is not left on the search path (attach() is an anti-pattern;
# nothing later in the file relies on the attached copy).
plot(Y ~ X1, data = toothpaste)
abline(lm(Y ~ X1, data = toothpaste))

# Quadratic fit of Y on X2, drawn over a fine grid.
lm2.sol <- lm(Y ~ X2 + I(X2^2), data = toothpaste)
x <- seq(min(toothpaste$X2), max(toothpaste$X2), len = 200)
y <- predict(lm2.sol, data.frame(X2 = x))
plot(Y ~ X2, data = toothpaste)
lines(x, y)

# Model refinement with update().

# Add a quadratic term in X2.
lm.new <- update(lm.sol, . ~ . + I(X2^2))
summary(lm.new)

# Interval estimates for the extended model.
beta.int(lm.new)

# Drop the linear X2 term.
lm.new2 <- update(lm.new, . ~ . - X2)
summary(lm.new2)

# Add the X1:X2 interaction.
lm.new3 <- update(lm.new, . ~ . + X1 * X2)
summary(lm.new3)



#### Stepwise regression ####
#### step(): model selection by the AIC criterion ####
# Signature (reference only — was a bare non-executable call in the original):
# step(object, scope, scale = 0,      # fitted model, search scope, AIC scale
#      direction = c("both", "backward", "forward"),  # search direction
#      trace = 1, keep = NULL, steps = 1000, k = 2, ...)

# Cement data: 13 observations, four predictors X1-X4 and response Y.
cement <- data.frame(
  X1 = c( 7,  1, 11, 11,  7, 11,  3,  1,  2, 21,  1, 11, 10),
  X2 = c(26, 29, 56, 31, 52, 55, 71, 31, 54, 47, 40, 66, 68),
  X3 = c( 6, 15,  8,  8,  6,  9, 17, 22, 18,  4, 23,  9,  8),
  X4 = c(60, 52, 20, 47, 33, 22,  6, 44, 22, 26, 34, 12, 12),
  Y  = c(78.5, 74.3, 104.3, 87.6, 95.9, 109.2, 102.7, 72.5,
         93.1, 115.9, 83.8, 113.3, 109.4)
)
# Full model with all four predictors.
lm.sol <- lm(Y ~ X1 + X2 + X3 + X4, data = cement)
summary(lm.sol)

# Stepwise selection by AIC.
lm.step <- step(lm.sol)
summary(lm.step)

# Can any remaining term be dropped? Then remove X4 by hand.
drop1(lm.step)
lm.opt <- update(lm.step, . ~ . - X4, data = cement); summary(lm.opt)


#### Regression diagnostics ####
# Why plots matter: Anscombe's quartet — four data sets whose regression
# summaries are (nearly) identical although the data look very different.
Anscombe <- data.frame(
  X  = c(10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0),
  Y1 = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68),
  Y2 = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74),
  Y3 = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.44, 5.73),
  X4 = c(rep(8, 7), 19, rep(8, 3)),
  Y4 = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89)
)
summary(lm(Y1 ~ X,  data = Anscombe))
summary(lm(Y2 ~ X,  data = Anscombe))
summary(lm(Y3 ~ X,  data = Anscombe))
summary(lm(Y4 ~ X4, data = Anscombe))

# Plot the four Anscombe data sets on a common 2x2 layout.
# attach(Anscombe) replaced with explicit column references so the search
# path is left unchanged (attach() is an anti-pattern; later code already
# uses data= / Anscombe$ explicitly).
par(mfrow = c(2, 2))

plot(c(3, 20), c(3, 13), type = "n", xlab = "X", ylab = "Y1", main = "No.1")
points(Anscombe$X, Anscombe$Y1)
abline(lm(Y1 ~ X, data = Anscombe))

plot(c(3, 20), c(3, 13), type = "n", xlab = "X", ylab = "Y2", main = "No.2")
points(Anscombe$X, Anscombe$Y2)
abline(lm(Y2 ~ X, data = Anscombe))

plot(c(3, 20), c(3, 13), type = "n", xlab = "X", ylab = "Y3", main = "No.3")
points(Anscombe$X, Anscombe$Y3)
abline(lm(Y3 ~ X, data = Anscombe))

plot(c(3, 20), c(3, 13), type = "n", xlab = "X4", ylab = "Y4", main = "No.4")
points(Anscombe$X4, Anscombe$Y4)
abline(lm(Y4 ~ X4, data = Anscombe))

# No.2: the relationship is smooth/curved — add a quadratic term.
lm2.sol <- lm(Y2 ~ X + I(X^2), data = Anscombe); summary(lm2.sol)

# No.3: drop the outlying observation and refit.
# (The original comment was garbled across lines — "remove the outlier
# (the 10th?? point)" — and those stray fragments were parse errors.
# Note the code actually removes row 3, where Y3 = 12.74 stands out.)
i <- 1:11; Y31 <- Anscombe$Y3[i != 3]; X3 <- Anscombe$X[i != 3]
lm3.sol <- lm(Y31 ~ X3); summary(lm3.sol)


#### Residuals ####
# Normality test for the residuals via shapiro.test().
# (The original comment said "Forbes data", but lm.sol here is whatever
# model was fitted most recently — verify against the run order.)
y.res <- residuals(lm.sol)
shapiro.test(y.res)

# rstandard(): standardized residuals.
# rstudent(): externally studentized residuals.

# Residual plots.
y.res <- resid(lm.sol)       # ordinary residuals
y.rst <- rstandard(lm.sol)   # standardized residuals
y.fit <- predict(lm.sol)     # fitted values
par(mfrow = c(2, 1))
plot(y.res ~ y.fit, main = "残差")
plot(y.rst ~ y.fit, main = "标准化残差")


#### Example 6.14 (product marketing strategy) ####
# The original used interactive scan() input, which only works at a console
# (sourcing the script would read nothing and leave X/Y empty). The data are
# given as explicit vectors so the script runs non-interactively.
X <- c( 679,  292, 1012,  493,  582, 1156,  997, 2189, 1097, 2078,
       1818, 1700,  747, 2030, 1643,  414,  354, 1276,  745,  435,
        540,  874, 1543, 1029,  710, 1434,  837, 1748, 1381, 1428,
       1255, 1777,  370, 2316, 1130,  463,  770,  724,  808,  790,
        783,  406, 1242,  658, 1746,  468, 1114,  413, 1787, 3560,
       1495, 2221, 1526)

Y <- c(0.79, 0.44, 0.56, 0.79, 2.70, 3.64, 4.73, 9.50, 5.34, 6.85,
       5.84, 5.21, 3.25, 4.43, 3.16, 0.50, 0.17, 1.88, 0.77, 1.39,
       0.56, 1.56, 5.28, 0.64, 4.00, 0.31, 4.20, 4.88, 3.48, 7.58,
       2.63, 4.99, 0.59, 8.19, 4.79, 0.51, 1.74, 4.10, 3.94, 0.96,
       3.29, 0.44, 3.24, 2.14, 5.71, 0.64, 1.90, 0.51, 8.33, 14.94,
       5.11, 3.85, 3.93)

lm.sol <- lm(Y ~ X); summary(lm.sol)

# Residual diagnostics: standardized residuals vs fitted values.
y.rst <- rstandard(lm.sol); y.fit <- predict(lm.sol); plot(y.rst ~ y.fit)
abline(0.1, 0.5); abline(-0.1, -0.5)

# Refit with sqrt(Y) as the response (the "." stands for the old formula
# parts), then re-check the residuals.
lm.new <- update(lm.sol, sqrt(.) ~ .); coef(lm.new)
yn.rst <- rstandard(lm.new); yn.fit <- predict(lm.new)
plot(yn.rst ~ yn.fit)  # residual diagnostics (comment was garbled across lines)

plot(lm.new, 2)  # normal Q-Q plot of the residuals

# Diagnostic plots via plot() on an lm object — signature for reference only
# (the original was a bare, unclosed call mixed with wrapped prose, which
# was a parse error):
# plot(x, which = 1:4,
#      caption = c("Residuals vs Fitted", "Normal Q-Q plot",
#                  "Scale-Location plot", "Cook's distance plot"), ...)
# x: the fitted model. which: 1 = ordinary residuals vs fitted values,
# 2 = normal Q-Q plot, 3 = sqrt(|standardized residuals|) vs fitted values,
# 4 = Cook's distance.

#### Influence analysis ####
# Hat matrix / leverage — signatures for reference only (bare calls on an
# undefined "model" would error, so they are kept as comments):
# hatvalues(model, infl = lm.influence(model, do.coef = FALSE), ...)
# hat(x, intercept = TRUE)

# DFFITS criterion:
# dffits(model, infl = , res = )
p <- 1; n <- nrow(forbes); d <- dffits(lm.for)
cf <- 1:n; cf[d > 2 * sqrt((p + 1) / n)]  # observation 12 flagged as influential

# Cook's distance — signature for reference only:
# cooks.distance(model, infl = lm.influence(model, do.coef = FALSE),
#                res = weighted.residuals(model),
#                sd = sqrt(deviance(model) / df.residual(model)),
#                hat = infl$hat, ...)

# COVRATIO criterion — signature for reference only:
# covratio(model, infl = lm.influence(model, do.coef = FALSE),
#          res = weighted.residuals(model))

#### Regression diagnostics helper ####
# Reg_Diag: one-stop diagnostic table for a fitted lm model `fm`.
# Returns a data frame with, per observation: ordinary, standardized and
# externally studentized residuals, leverage (hat values), DFFITS, Cook's
# distance and COVRATIO. Columns s1..s7 carry "*" where the corresponding
# statistic flags the observation.
Reg_Diag <- function(fm) {
  n <- nrow(fm$model)
  df <- fm$df.residual
  p <- n - df - 1                  # number of predictors
  blank <- rep(" ", n)

  res <- residuals(fm)             # s1: largest absolute ordinary residual
  s1 <- blank; s1[abs(res) == max(abs(res))] <- "*"

  sta <- rstandard(fm)             # s2: |standardized residual| > 2
  s2 <- blank; s2[abs(sta) > 2] <- "*"

  stu <- rstudent(fm)              # s3: |studentized residual| > 2
  # Bug fix: the original tested abs(sta) here, so s3 merely duplicated s2
  # and the studentized residuals were never actually checked.
  s3 <- blank; s3[abs(stu) > 2] <- "*"

  h <- hatvalues(fm)               # s4: leverage above 2(p+1)/n
  s4 <- blank; s4[h > 2 * (p + 1) / n] <- "*"

  d <- dffits(fm)                  # s5: |DFFITS| above 2*sqrt((p+1)/n)
  s5 <- blank; s5[abs(d) > 2 * sqrt((p + 1) / n)] <- "*"

  ck <- cooks.distance(fm)         # s6: largest Cook's distance
  # (local renamed from `c`, which shadowed base::c inside the function)
  s6 <- blank; s6[ck == max(ck)] <- "*"

  co <- covratio(fm)               # s7: COVRATIO farthest from 1
  abs_co <- abs(co - 1)
  s7 <- blank; s7[abs_co == max(abs_co)] <- "*"

  data.frame(residual = res, s1, standard = sta, s2,
             student = stu, s3, hat_matrix = h, s4,
             DFFITS = d, s5, cooks_distance = ck, s6,
             COVRATIO = co, s7)
}

# influence.measures(): computes the standard influence diagnostics at once.
# Signature for reference only ("model" is undefined here, so the bare call
# in the original errored):
# influence.measures(model)



#### Multicollinearity ####
# Example data with strongly correlated predictors X1..X6.
collinear <- data.frame(
  Y  = c(10.006, 9.737, 15.087, 8.422, 8.625, 16.289,
         5.958, 9.313, 12.960, 5.541, 8.756, 10.937),
  X1 = rep(c(8, 0, 2, 0), c(3, 3, 3, 3)),
  X2 = rep(c(1, 0, 7, 0), c(3, 3, 3, 3)),
  X3 = rep(c(1, 9, 0), c(3, 3, 6)),
  X4 = rep(c(1, 0, 1, 10), c(1, 2, 6, 3)),
  X5 = c(0.541, 0.130, 2.116, -2.397, -0.046, 0.365,
         1.996, 0.228, 1.38, -0.798, 0.257, 0.440),
  X6 = c(-0.099, 0.070, 0.115, 0.252, 0.017, 1.504,
         -0.865, -0.055, 0.502, -0.399, 0.101, 0.432)
)
XX <- cor(collinear[2:7])  # correlation matrix of the predictors
kappa(XX, exact = TRUE)    # condition number (large => collinearity)
eigen(XX)                  # eigenvalues and eigenvectors


#### Generalized linear models ####
# glm() — schematic usage (reference only; the bare calls in the original
# referenced an undefined "formula" and errored, so they are comments here):
# fitted.model <- glm(formula, family = family.generator, data = data.frame)
#
# Gaussian family:
# fm <- glm(formula, family = gaussian(link = identity), data = data.frame)
#
# Binomial family (logistic regression):
# fm <- glm(formula, family = binomial(link = logit), data = data.frame)

# Dose-response data: x = current (mA), n = number of cows tested,
# success = number responding at each level.
norell <- data.frame(
  x = 0:5,
  n = rep(70, 6),
  success = c(0, 9, 21, 47, 60, 63)
)
norell

# Two-column (successes, failures) response matrix for the binomial glm().
norell$Ymat <- cbind(norell$success, norell$n - norell$success)
glm.sol <- glm(Ymat ~ x, family = binomial, data = norell)
summary(glm.sol)

# Predicted probability of a response at x = 3.5 mA
# (inverse logit of the linear predictor).
pre <- predict(glm.sol, data.frame(x = 3.5))
p <- exp(pre) / (1 + exp(pre)); p

# Fitted dose-response curve over the observed range.
d <- seq(0, 5, len = 100)
pre <- predict(glm.sol, data.frame(x = d))
p <- exp(pre) / (1 + exp(pre))
norell$y <- norell$success / norell$n  # observed response proportions
plot(norell$x, norell$y); lines(d, p)

# Example 6.20 (the original had this heading as a bare prose line, which
# was a parse error — it is a comment now).
# Survival data: Y = 1 means surviving more than one year; X3 = 1 means
# consolidation therapy was given (per the original comments below).
life <- data.frame(
  X1 = c(2.5, 173, 119, 10, 502, 4, 14.4, 2, 40, 6.6,
         21.4, 2.8, 2.5, 6, 3.5, 62.2, 10.8, 21.6, 2, 3.4,
         5.1, 2.4, 1.7, 1.1, 12.8, 1.2, 3.5, 39.7, 62.4, 2.4,
         34.7, 28.4, 0.9, 30.6, 5.8, 6.1, 2.7, 4.7, 128, 35,
         2, 8.5, 2, 2, 4.3, 244.8, 4, 5.1, 32, 1.4),
  X2 = rep(c(0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
             0, 2, 0, 2, 0, 2, 0),
           c(1, 4, 2, 2, 1, 1, 8, 1, 5, 1, 5, 1, 1, 1, 2, 1,
             1, 1, 3, 1, 2, 1, 4)),
  X3 = rep(c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1),
           c(6, 1, 3, 1, 3, 1, 1, 5, 1, 3, 7, 1, 1, 3, 1, 1, 2, 9)),
  Y  = rep(c(0, 1, 0, 1),
           c(15, 10, 15, 10))
)
glm.sol <- glm(Y ~ X1 + X2 + X3, family = binomial, data = life)
summary(glm.sol)

# For a patient with X1 = 5, X2 = 2: probability of surviving > 1 year.
pre <- predict(glm.sol, data.frame(X1 = 5, X2 = 2, X3 = 0))
p <- exp(pre) / (1 + exp(pre)); p  # without consolidation therapy
pre <- predict(glm.sol, data.frame(X1 = 5, X2 = 2, X3 = 1))
p <- exp(pre) / (1 + exp(pre)); p  # with consolidation therapy

# --- Blog-platform footer (not part of the study note; commented out so the
# --- file parses as R) ---
# 本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。
# 请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。
# 如发现有害或侵权内容,请点击一键举报。
# 转藏 分享 献花(0) · 0条评论 · 发表 · 请遵守用户评论公约