
R Code for ROC and AUC Analysis

 生物_医药_科研 2019-05-03

library(pROC) # install with install.packages('pROC')
library(randomForest) # install with install.packages('randomForest')
 
#######################################
##
## Generate weight and obesity datasets.
##
#######################################
set.seed(420) # this will make my results match yours
 
num.samples <- 100
 
## generate 100 values from a normal distribution with
## mean 172 and standard deviation 29, then sort them
weight <- sort(rnorm(n=num.samples, mean=172, sd=29))
 
## Now we will decide if a sample is obese or not.
## NOTE: This method for classifying a sample as obese or not
## was made up just for this example.
## rank(weight) returns 1 for the lightest, 2 for the second lightest, ...
##              ... and it returns 100 for the heaviest.
## So what we do is generate a random number between 0 and 1. Then we see if
## that number is less than rank/100. So, for the lightest sample, rank = 1.
## This sample will be classified 'obese' if we get a random number less than
## 1/100. For the second lightest sample, rank = 2, we get another random
## number between 0 and 1 and classify this sample 'obese' if that random
## number is < 2/100. We repeat that process for all 100 samples
obese <- ifelse(test=(runif(n=num.samples) < (rank(weight)/num.samples)),
  yes=1, no=0)
obese ## print out the contents of 'obese' to show us which samples were
      ## classified 'obese' with 1, and which samples were classified
      ## 'not obese' with 0.
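
## A quick sanity check: table() counts how many samples were classified
## 'obese' (1) and how many 'not obese' (0), so we can see the class balance
## before fitting anything.
table(obese)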
 
## plot the data
plot(x=weight, y=obese)
 
## fit a logistic regression to the data...
glm.fit <- glm(obese ~ weight, family=binomial)
lines(weight, glm.fit$fitted.values)
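
## Once the model is fit, predict() with type='response' returns the predicted
## probability of being obese for any weight. The weights below (150, 172, 200)
## are just example values chosen for illustration.
predict(glm.fit, newdata=data.frame(weight=c(150, 172, 200)), type='response')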
 
#######################################
##
## draw ROC and AUC using pROC
##
#######################################
 
## NOTE: By default, the graphs come out looking terrible
## The problem is that ROC graphs should be square, since the x and y axes
## both go from 0 to 1. However, the window in which I draw them isn't square
## so extra whitespace is added to pad the sides.
roc(obese, glm.fit$fitted.values, plot=TRUE)
 
## Now let's configure R so that it prints the graph as a square.
##
par(pty = 's') ## pty sets the aspect ratio of the plot region. Two options:
##                's' - creates a square plotting region
##                'm' - (the default) creates a maximal plotting region
roc(obese, glm.fit$fitted.values, plot=TRUE)
 
## NOTE: By default, roc() uses specificity on the x-axis and the values range
## from 1 to 0. This makes the graph look like what we would expect, but the
## x-axis itself might induce a headache. To use 1-specificity (i.e. the
## False Positive Rate) on the x-axis, set 'legacy.axes' to TRUE.
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE)
 
## If you want to rename the x and y axes...
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage')
 
## We can also change the color of the ROC line, and make it wider...
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage', col='#377eb8', lwd=4)
 
## If we want to find out the optimal threshold we can store the
## data used to make the ROC graph in a variable...
roc.info <- roc(obese, glm.fit$fitted.values, legacy.axes=TRUE)
str(roc.info)
 
## and then extract just the information that we want from that variable.
roc.df <- data.frame(
  tpp=roc.info$sensitivities*100, ## tpp = true positive percentage
  fpp=(1 - roc.info$specificities)*100, ## fpp = false positive percentage
  thresholds=roc.info$thresholds)
 
head(roc.df) ## head() will show us the values for the upper right-hand corner
             ## of the ROC graph, when the threshold is so low
             ## (negative infinity) that every single sample is called 'obese'.
             ## Thus TPP = 100% and FPP = 100%
 
tail(roc.df) ## tail() will show us the values for the lower left-hand corner
             ## of the ROC graph, when the threshold is so high (infinity)
             ## that every single sample is called 'not obese'.
             ## Thus, TPP = 0% and FPP = 0%
 
## now let's look at the thresholds between TPP 60% and 80%
roc.df[roc.df$tpp > 60 & roc.df$tpp < 80,]
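
## pROC can also suggest a single 'best' threshold for us. coords() with
## x='best' returns the threshold that maximizes Youden's J statistic
## (sensitivity + specificity - 1); best.method='closest.topleft' is an
## alternative criterion.
coords(roc.info, x='best', best.method='youden',
  ret=c('threshold', 'sensitivity', 'specificity'))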
 
## We can calculate the area under the curve...
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage', col='#377eb8', lwd=4, print.auc=TRUE)
 
## ...and the partial area under the curve.
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage', col='#377eb8', lwd=4, print.auc=TRUE, print.auc.x=45, partial.auc=c(100, 90), auc.polygon = TRUE, auc.polygon.col = '#377eb822')
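
## To get a sense of how precise the AUC estimate is, ci.auc() computes a
## 95% confidence interval for it (DeLong's method by default).
ci.auc(roc.info)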
 
#######################################
##
## Now let's fit the data with a random forest...
##
#######################################
rf.model <- randomForest(factor(obese) ~ weight)
 
## ROC for random forest
roc(obese, rf.model$votes[,1], plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage', col='#4daf4a', lwd=4, print.auc=TRUE)
 
#######################################
##
## Now layer logistic regression and random forest ROC graphs..
##
#######################################
roc(obese, glm.fit$fitted.values, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab='False Positive Percentage', ylab='True Positive Percentage', col='#377eb8', lwd=4, print.auc=TRUE)
 
plot.roc(obese, rf.model$votes[,1], percent=TRUE, col='#4daf4a', lwd=4, print.auc=TRUE, add=TRUE, print.auc.y=40)
legend('bottomright', legend=c('Logistic Regression', 'Random Forest'), col=c('#377eb8', '#4daf4a'), lwd=4)
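
## The two AUCs can also be compared formally. roc.test() runs DeLong's test
## for two correlated (paired) ROC curves by default; we rebuild both roc
## objects here just so the comparison is self-contained.
glm.roc <- roc(obese, glm.fit$fitted.values, legacy.axes=TRUE)
rf.roc <- roc(obese, rf.model$votes[,1], legacy.axes=TRUE)
roc.test(glm.roc, rf.roc)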
 
#######################################
##
## Now that we're done with our ROC fun, let's reset the par() variables.
## There are two ways to do it...
##
#######################################
par(pty = 'm') ## way 1: set pty back to its default value, 'm'
## way 2: close the current graphics device with dev.off(); the next device
## that opens will start with the default par() settings.
## dev.off()
