## ======================= Regression analysis =======================

## ---- Simple linear regression ----
## (Removed `rm(list = ls())`: wiping the caller's workspace from inside
## a script is an anti-pattern; run the script in a fresh session instead.)
x <- c(0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.20, 0.21, 0.23)
y <- c(42.0, 43.5, 45.0, 45.5, 45.0, 47.5, 49.0, 53.0, 50.0, 55.0, 55.0, 60.0)
lm.sol <- lm(y ~ 1 + x)
summary(lm.sol)

## ---- Interval estimates for the coefficients ----
## beta.int: (1 - alpha) confidence intervals for the coefficients of a
## fitted model `fm` (anything whose summary() has a $coefficients matrix
## with estimates in column 1 and standard errors in column 2).
## Returns a matrix with one row per coefficient and columns
## Estimate / Left / Right.
beta.int <- function(fm, alpha = 0.05) {
  A <- summary(fm)$coefficients          # col 1 = estimate, col 2 = std. error
  df <- fm$df.residual
  left  <- A[, 1] - A[, 2] * qt(1 - alpha / 2, df)
  right <- A[, 1] + A[, 2] * qt(1 - alpha / 2, df)
  rowname <- dimnames(A)[[1]]            # [[1]] extracts the row names
  colname <- c("Estimate", "Left", "Right")
  matrix(c(A[, 1], left, right), ncol = 3,
         dimnames = list(rowname, colname))
}
beta.int(lm.sol)

## ---- Prediction ----
new <- data.frame(x = 0.16)              # prediction point
lm.pred <- predict(lm.sol, new, interval = "prediction", level = 0.95)
lm.pred

## ---- Worked example: Forbes' data ----
X <- matrix(c(
  194.5, 20.79, 1.3179, 131.79,
  194.3, 20.79, 1.3179, 131.79,
  197.9, 22.40, 1.3502, 135.02,
  198.4, 22.67, 1.3555, 135.55,
  199.4, 23.15, 1.3646, 136.46,
  199.9, 23.35, 1.3683, 136.83,
  200.9, 23.89, 1.3782, 137.82,
  201.1, 23.99, 1.3800, 138.00,
  201.4, 24.02, 1.3806, 138.06,
  201.3, 24.01, 1.3805, 138.05,
  203.6, 25.14, 1.4004, 140.04,
  204.6, 26.57, 1.4244, 142.44,
  209.5, 28.49, 1.4547, 145.47,
  208.6, 27.76, 1.4434, 144.34,
  210.7, 29.04, 1.4630, 146.30,
  211.9, 29.88, 1.4754, 147.54,
  212.2, 30.06, 1.4780, 147.80),
  ncol = 4, byrow = TRUE,                # fill row-wise (was byrow=T; T is reassignable)
  dimnames = list(1:17, c("F", "h", "log", "log100")))
X
forbes <- as.data.frame(X)
plot(forbes$F, forbes$log100)            # scatter plot
lm.for <- lm(log100 ~ F, data = forbes)
summary(lm.for)
abline(lm.for, col = "red")
res.for <- residuals(lm.for); plot(res.for)   # residuals and residual plot
## Fixed: was lm.for$residual, which only worked via $ partial matching.
lm.for$residuals; plot(lm.for$residuals)
text(12, res.for[12], labels = 12, adj = 1.2) # label residual no. 12
i <- 1:17; forbes12 <- as.data.frame(X[i != 12, ])  # drop observation 12
lm.for12 <- lm(log100 ~ F, data = forbes12)
summary(lm.for12)

## ---- lm() reference (notes, not runnable code; now commented out so
## the script can be sourced end to end) ----
# lm(formula, data, subset, weights, na.action, method = "qr",
#    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
#    contrasts = NULL, offset, ...)
# lm(formula, data = data.frame)
## Extracting model information:
# anova(object, ...)            # analysis-of-variance table
# coefficients(object, ...)     # (typo fixed: was "obeject")
## Model coefficients (continuation of the extractor list above).
## Reference notes, commented out so they do not error when sourced:
# deviance(object)                       # residual sum of squares
# formula(object)                        # model formula
# plot(object)                           # diagnostic plots
# predict(object, newdata = data.frame)  # predictions; newdata = points to predict at
# print(object)                          # print the fitted model
# residuals(object, type = c("working", "response", "deviance",
#                            "pearson", "partial"))  # residuals (typo fixed: was "rediduals")
# step(object)                           # stepwise regression
# summary(object)                        # summary of the fit

## ---- Multiple linear regression ----
blood <- data.frame(
  X1 = c(76.0, 91.5, 85.5, 82.5, 79.0, 80.5, 74.5, 79.0, 85.0,
         76.5, 82.0, 95.0, 92.5),
  X2 = c(50, 20, 20, 30, 30, 50, 60, 50, 40, 55, 40, 40, 20),
  Y  = c(120, 141, 124, 126, 117, 125, 123, 125, 132, 123, 132, 155, 147)
)
lm.sol <- lm(Y ~ X1 + X2, data = blood)
summary(lm.sol)
## Interval estimates (beta.int is defined earlier in this file)
beta.int(lm.sol)
## Prediction.  Fixed: the argument was misspelled "lecvel", so it was
## silently swallowed by `...` and the default level was used.
new <- data.frame(X1 = 80, X2 = 40)
lm.pred <- predict(lm.sol, new, interval = "prediction", level = 0.95)
lm.pred

## ---- Refining a fitted model: update() (reference notes) ----
## Typos fixed in the notes: was "mew.model<-updata(...)".
# new.model <- update(old.model, new.formula)
# fm3 <- lm(y ~ x1 + x2 + x3, data = production)
# fm2 <- update(fm3, . ~ . - x3)          # drop x3
# fm4 <- update(fm3, . ~ . + x4)          # add x4
## In new.formula, "." stands for the corresponding part of the old formula:
# smf4 <- update(fm4, sqrt(.) ~ .)

## ---- Worked example: toothpaste sales ----
toothpaste <- data.frame(
  X1 = c(-0.05, 0.25, 0.60, 0, 0.25, 0.20, 0.15, 0.05, -0.15, 0.15,
         0.20, 0.10, 0.40, 0.45, 0.35, 0.30, 0.50, 0.50, 0.40, -0.05,
         -0.05, -0.10, 0.20, 0.10, 0.50, 0.60, -0.05, 0, 0.05, 0.55),
  X2 = c(5.50, 6.75, 7.25, 5.50, 7.00, 6.50, 6.75, 5.25, 5.25, 6.00,
         6.50, 6.25, 7.00, 6.90, 6.80, 6.80, 7.10, 7.00, 6.80, 6.50,
         6.25, 6.00, 6.50, 7.00, 6.80, 6.80, 6.50, 5.75, 5.80, 6.80),
  Y  = c(7.38, 8.51, 9.52, 7.50, 9.33, 8.28, 8.75, 7.87, 7.10, 8.00,
         7.89, 8.15, 9.10, 8.86, 8.90, 8.87, 9.26, 9.00, 8.75, 7.95,
         7.65, 7.27, 8.00, 8.50, 8.75, 9.21, 8.27, 7.67, 7.93, 9.26)
)
lm.sol <- lm(Y ~ X1 + X2, data = toothpaste)
summary(lm.sol)

## Scatter plots and fitted curves.  attach(toothpaste) was replaced by
## explicit data = / $ references to avoid search-path side effects.
plot(Y ~ X1, data = toothpaste); abline(lm(Y ~ X1, data = toothpaste))
lm2.sol <- lm(Y ~ X2 + I(X2^2), data = toothpaste)
x <- seq(min(toothpaste$X2), max(toothpaste$X2), len = 200)
y <- predict(lm2.sol, data.frame(X2 = x))
plot(Y ~ X2, data = toothpaste)
lines(x, y)

## Modify the model: add a quadratic term
lm.new <- update(lm.sol, . ~ . + I(X2^2))
summary(lm.new)
## Interval estimates
beta.int(lm.new)
## Modify the model: drop the linear X2 term
lm.new2 <- update(lm.new, . ~ . - X2)
summary(lm.new2)
## Modify the model: add an interaction term
lm.new3 <- update(lm.new, . ~ . + X1*X2)
summary(lm.new3)

#### Stepwise regression ####
#### step(): AIC-based model search ####
## NOTE(review): reference signature with placeholder arguments; it
## continues on the next source line and errors when evaluated.
step(object, scope, scale = 0,                      # model, search scope, AIC scale
     direction = c("both", "backward", "forward"), # search direction
     trace = 1,
keep = NULL, steps = 1000, k = 2, ...) # remaining arguments of the step() reference signature begun above (placeholders; errors when evaluated)

#### Worked example: Hald cement data ####
cement<-data.frame(
  X1=c( 7, 1, 11, 11, 7, 11, 3, 1, 2, 21, 1, 11, 10),
  X2=c(26, 29, 56, 31, 52, 55, 71, 31, 54, 47, 40, 66, 68),
  X3=c( 6, 15, 8, 8, 6, 9, 17, 22, 18, 4, 23, 9, 8),
  X4=c(60, 52, 20, 47, 33, 22, 6, 44, 22, 26, 34, 12, 12),
  Y =c(78.5, 74.3, 104.3, 87.6, 95.9, 109.2, 102.7, 72.5,
       93.1,115.9, 83.8, 113.3, 109.4)
)
lm.sol<-lm(Y~X1+X2+X3+X4,data=cement)
summary(lm.sol)
lm.step<-step(lm.sol)    # stepwise regression (AIC-based)
summary(lm.step)
drop1(lm.step)           # effect of dropping each remaining term
lm.opt<-update(lm.step,.~.-X4,data=cement);summary(lm.opt)

#### Regression diagnostics ####
## Anscombe's quartet — why plotting matters: four data sets with nearly
## identical regression summaries but very different shapes.
Anscombe<-data.frame(
  X =c(10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0),
  Y1=c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68),
  Y2=c(9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74),
  Y3=c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.44, 5.73),
  X4=c(rep(8,7), 19, rep(8,3)),
  Y4=c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89)
)
summary(lm(Y1~X, data=Anscombe))
summary(lm(Y2~X, data=Anscombe))
summary(lm(Y3~X, data=Anscombe))
summary(lm(Y4~X4,data=Anscombe))
attach(Anscombe)         # NOTE(review): attach() is discouraged; columns are used below by bare name
par(mfrow=c(2,2))        # 2 x 2 grid of plots
plot(c(3,20), c(3,13), type="n", xlab = "X", ylab = "Y1",main="No.1")
points(X,Y1)
abline(lm(Y1~X))
plot(c(3,20), c(3,13), type="n", xlab = "X", ylab = "Y2",main="No.2")
points(X,Y2)
abline(lm(Y2~X))
plot(c(3,20), c(3,13), type="n", xlab = "X", ylab = "Y3",main="No.3")
points(X,Y3)
abline(lm(Y3~X))
plot(c(3,20), c(3,13), type="n", xlab = "X4", ylab = "Y4",main="No.4")
points(X4,Y4)
abline(lm(Y4~X4))
## Data set 2: a smooth (quadratic) curve fits better than a line
lm2.sol<-lm(Y2~X+I(X^2),data=Anscombe);summary(lm2.sol)
## Data set 3: refit without the outlier.  The code drops index 3;
## the original note said "point 10??" — TODO confirm which was intended.
i<-1:11;Y31<-Anscombe$Y3[i!=3];X3<-Anscombe$X[i!=3]
lm3.sol<-lm(Y31~X3);summary(lm3.sol)

#### Residuals ####
## Normality test of residuals: shapiro.test().
## NOTE(review): the original comment said "Forbes data", but at this
## point lm.sol is the cement model fitted in the previous section —
## confirm which model was intended.
y.res<-residuals(lm.sol)
shapiro.test(y.res)
## rstandard(): standardized residuals of a fitted model.
## rstudent(): (externally) studentized residuals.

## Residual plots
y.res<-resid(lm.sol)       # ordinary residuals
y.rst<-rstandard(lm.sol)   # standardized residuals
y.fit<-predict(lm.sol)     # fitted values
par(mfrow=c(2,1))
plot(y.res~y.fit,main="残差");plot(y.rst~y.fit,main="标准化残差")

#### Worked example (product marketing strategy), Example 6.14 ####
## NOTE(review): scan() with no arguments reads the numbers that follow
## from the console — this works when the block is pasted interactively,
## but breaks under source().  Confirm how the script is meant to be run.
X<-scan()
679 292 1012 493 582 1156 997 2189 1097 2078 1818 1700 747
2030 1643 414 354 1276 745 435 540 874 1543 1029 710 1434
837 1748 1381 1428 1255 1777 370 2316 1130 463 770 724 808
790 783 406 1242 658 1746 468 1114 413 1787 3560 1495 2221 1526

Y<-scan()
0.79 0.44 0.56 0.79 2.70 3.64 4.73 9.50 5.34 6.85 5.84 5.21
3.25 4.43 3.16 0.50 0.17 1.88 0.77 1.39 0.56 1.56 5.28 0.64
4.00 0.31 4.20 4.88 3.48 7.58 2.63 4.99 0.59 8.19 4.79 0.51
1.74 4.10 3.94 0.96 3.29 0.44 3.24 2.14 5.71 0.64 1.90 0.51
8.33 14.94 5.11 3.85 3.93

lm.sol<-lm(Y~X);summary(lm.sol)
## Residual diagnostics
y.rst<-rstandard(lm.sol);y.fit<-predict(lm.sol);plot(y.rst~y.fit)
abline(0.1,0.5);abline(-0.1,-0.5)   # reference lines on the residual plot
## Variance-stabilizing transform: refit with sqrt(Y)
lm.new<-update(lm.sol,sqrt(.)~.);coef(lm.new)
## Residual diagnostics after the transform
yn.rst<-rstandard(lm.new);yn.fit<-predict(lm.new);plot(yn.rst~yn.fit)
plot(lm.new,2)   # normal Q-Q plot of the residuals

## Residual plots via plot.lm() (reference notes — the original
## pseudo-call was unclosed and broke parsing, so it is commented out):
# plot(x, which = 1:4,
#      caption = c("Residuals vs Fitted", "Normal Q-Q plot",
#                  "Scale-Location plot", "Cook's distance plot"))
# x: the fitted model.  which: 1 = residuals vs fitted values,
# 2 = normal Q-Q plot, 3 = sqrt(|standardized residuals|) vs fitted,
# 4 = Cook's distance.

#### Influence analysis ####
## Hat matrix / leverage (reference notes) — flags high-leverage points:
# hatvalues(model, infl = lm.influence(model, do.coef = FALSE), ...)
# hat(x, intercept = TRUE)

## DFFITS criterion (reference notes):
# dffits(model, infl = , res = )
## Flag observations of the Forbes fit with |DFFITS| > 2*sqrt((p+1)/n).
## Fixed: the original compared d itself rather than abs(d), so
## influential points with negative DFFITS were missed.
p<-1;n<-nrow(forbes);d<-dffits(lm.for)
cf<-1:n;cf[abs(d)>2*sqrt((p+1)/n)]   # observation 12 is flagged as a possible outlier

## Cook's distance (reference notes):
# cooks.distance(model, infl = lm.influence(model, do.coef = FALSE),
#                res = weighted.residuals(model),
#                sd = sqrt(deviance(model)/df.residual(model)),
#                hat = infl$hat, ...)
#### COVRATIO criterion (reference notes) ####
# covratio(model, infl = lm.influence(model, do.coef = FALSE),
#          res = weighted.residuals(model))

#### Regression-diagnostics helper ####
## Reg_Diag: one-stop diagnostic table for a fitted lm model `fm`.
## For each observation it reports the raw, standardized and studentized
## residuals, the hat (leverage) values, DFFITS, Cook's distance and
## COVRATIO; the s1..s7 character columns carry "*" where the usual rule
## of thumb flags the observation.
## Fixed: the studentized-residual flag (s3) tested `sta` (standardized)
## instead of `stu` (studentized); the local variable `c` was renamed to
## `cook` so it no longer shadows base::c.
Reg_Diag <- function(fm) {
  n <- nrow(fm$model); df <- fm$df.residual
  p <- n - df - 1                            # number of predictors
  s <- rep(" ", n)
  res <- residuals(fm)                       # raw residuals; flag the largest
  s1 <- s; s1[abs(res) == max(abs(res))] <- "*"
  sta <- rstandard(fm)                       # standardized residuals; flag |.| > 2
  s2 <- s; s2[abs(sta) > 2] <- "*"
  stu <- rstudent(fm)                        # studentized residuals; flag |.| > 2
  s3 <- s; s3[abs(stu) > 2] <- "*"
  h <- hatvalues(fm)                         # leverage; flag > 2(p+1)/n
  s4 <- s; s4[h > 2 * (p + 1) / n] <- "*"
  d <- dffits(fm)                            # DFFITS; flag |.| > 2*sqrt((p+1)/n)
  s5 <- s; s5[abs(d) > 2 * sqrt((p + 1) / n)] <- "*"
  cook <- cooks.distance(fm)                 # Cook's distance; flag the largest
  s6 <- s; s6[cook == max(cook)] <- "*"
  co <- covratio(fm); abs_co <- abs(co - 1)  # COVRATIO; flag farthest from 1
  s7 <- s; s7[abs_co == max(abs_co)] <- "*"
  data.frame(residual = res, s1, standard = sta, s2, student = stu, s3,
             hat_matrix = h, s4, DFFITS = d, s5, cooks_distance = cook,
             s6, COVRATIO = co, s7)
}
## All of the above in one call (reference): influence.measures(model)

#### Multicollinearity ####
collinear<-data.frame(
  Y=c(10.006, 9.737, 15.087, 8.422, 8.625, 16.289,
      5.958, 9.313, 12.960, 5.541, 8.756, 10.937),
  X1=rep(c(8, 0, 2, 0), c(3, 3, 3, 3)),
  X2=rep(c(1, 0, 7, 0), c(3, 3, 3, 3)),
  X3=rep(c(1, 9, 0), c(3, 3, 6)),
  X4=rep(c(1, 0, 1, 10), c(1, 2, 6, 3)),
  X5=c(0.541, 0.130, 2.116, -2.397, -0.046, 0.365,
       1.996, 0.228, 1.38, -0.798, 0.257, 0.440),
  X6=c(-0.099, 0.070, 0.115, 0.252, 0.017, 1.504,
       -0.865, -0.055, 0.502, -0.399, 0.101, 0.432)
)
XX<-cor(collinear[2:7])   # correlation matrix of the predictors
kappa(XX,exact=TRUE)      # condition number of the matrix
eigen(XX)                 # eigenvalues and eigenvectors

#### Generalized linear models ####
## glm() usage (reference notes, commented out so they do not error):
# fitted.model <- glm(formula, family = family.generator, data = data.frame)
## Gaussian family:
# fm <- glm(formula, family = gaussian(link = identity), data = data.frame)
## Binomial family — logistic regression:
# fm <- glm(formula, family = binomial(link = logit), data = data.frame)

## Worked example: dose-response (cattle vs. electric current)
norell<-data.frame(
  x=0:5,n=rep(70,6),success=c(0,9,21,47,60,63)
)
norell
## Two-column response matrix: (successes, failures) per dose
norell$Ymat<-cbind(norell$success,norell$n-norell$success)
glm.sol<-glm(Ymat~x,family=binomial,data=norell)
summary(glm.sol)
## Prediction: predict() returns the linear predictor (log-odds);
## transform back to a probability.
pre<-predict(glm.sol,data.frame(x=3.5))
p<-exp(pre)/(1+exp(pre));p   # probability of a response at 3.5 mA
## Fitted probability curve (plotted on the next source line)
d<-seq(0,5,len=100)
pre<-predict(glm.sol,data.frame(x = d))
p<-exp(pre)/(1+exp(pre))
norell$y<-norell$success/norell$n    # observed response proportions
plot(norell$x,norell$y);lines(d,p)   # data points with the fitted logistic curve

#### Example 6.20: survival after treatment (logistic regression) ####
## (Fixed: the bare token `例子6.20` was evaluated as an undefined object
## and errored when the script was sourced; it is now a comment.)
life<-data.frame(
  X1=c(2.5, 173, 119, 10, 502, 4, 14.4, 2, 40, 6.6,
       21.4, 2.8, 2.5, 6, 3.5, 62.2, 10.8, 21.6, 2, 3.4,
       5.1, 2.4, 1.7, 1.1, 12.8, 1.2, 3.5, 39.7, 62.4, 2.4,
       34.7, 28.4, 0.9, 30.6, 5.8, 6.1, 2.7, 4.7, 128, 35,
       2, 8.5, 2, 2, 4.3, 244.8, 4, 5.1, 32, 1.4),
  X2=rep(c(0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
           0, 2, 0, 2, 0),
         c(1, 4, 2, 2, 1, 1, 8, 1, 5, 1, 5, 1, 1, 1, 2, 1, 1, 1,
           3, 1, 2, 1, 4)),
  X3=rep(c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1),
         c(6, 1, 3, 1, 3, 1, 1, 5, 1, 3, 7, 1, 1, 3, 1, 1, 2, 9)),
  Y=rep(c(0, 1, 0, 1), c(15, 10, 15, 10))
)
glm.sol<-glm(Y~X1+X2+X3,family=binomial,data=life)
summary(glm.sol)
## Probability of surviving more than one year for a patient with
## x1 = 5, x2 = 2, x3 = 0, i.e. without consolidation treatment...
pre<-predict(glm.sol,data.frame(X1=5,X2=2,X3=0))
p<-exp(pre)/(1+exp(pre));p   # without consolidation treatment
## ...and with consolidation treatment (x3 = 1)
pre<-predict(glm.sol,data.frame(X1=5,X2=2,X3=1))
p<-exp(pre)/(1+exp(pre));p   # with consolidation treatment