library (pROC) # install with install.packages('pROC')
library (randomForest) # install with install.packages('randomForest') #######################################
##
## Generate weight and obesity datasets.
##
#######################################
set.seed (420) # this will make my results match yours num.samples <- 100 ## genereate 100 values from a normal distribution with
## mean 172 and standard deviation 29, then sort them
weight <- sort ( rnorm (n=num.samples, mean=172, sd=29)) ## Now we will decide if a sample is obese or not.
## NOTE: This method for classifying a sample as obese or not
## was made up just for this example.
## rank(weight) returns 1 for the lightest, 2 for the second lightest, ...
## ... and it returns 100 for the heaviest.
## So what we do is generate a random number between 0 and 1. Then we see if
## that number is less than rank/100. So, for the lightest sample, rank = 1.
## This sample will be classified 'obese' if we get a random number less than
## 1/100. For the second lightest sample, rank = 2, we get another random
## number between 0 and 1 and classify this sample 'obese' if that random
## number is < 2/100. We repeat that process for all 100 samples
obese <- ifelse (test=( runif (n=num.samples) < ( rank (weight)/num.samples)),
yes=1, no=0)
obese ## print out the contents of 'obese' to show us which samples were
## classified 'obese' with 1, and which samples were classified
## 'not obese' with 0. ## plot the data
plot (x=weight, y=obese) ## fit a logistic regression to the data...
glm.fit= glm (obese ~ weight, family=binomial)
lines (weight, glm.fit$fitted.values) #######################################
##
## draw ROC and AUC using pROC
##
####################################### ## NOTE: By default, the graphs come out looking terrible
## The problem is that ROC graphs should be square, since the x and y axes
## both go from 0 to 1. However, the window in which I draw them isn't square
## so extra whitespace is added to pad the sides.
roc (obese, glm.fit$fitted.values, plot= TRUE ) ## Now let's configure R so that it prints the graph as a square.
##
par (pty = 's' ) ## pty sets the aspect ratio of the plot region. Two options:
## 's' - creates a square plotting region
## 'm' - (the default) creates a maximal plotting region
roc (obese, glm.fit$fitted.values, plot= TRUE ) ## NOTE: By default, roc() uses specificity on the x-axis and the values range
## from 1 to 0. This makes the graph look like what we would expect, but the
## x-axis itself might induce a headache. To use 1-specificity (i.e. the
## False Positive Rate) on the x-axis, set 'legacy.axes' to TRUE.
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE ) ## If you want to rename the x and y axes...
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' ) ## We can also change the color of the ROC line, and make it wider...
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' , col= '#377eb8' , lwd=4) ## If we want to find out the optimal threshold we can store the
## data used to make the ROC graph in a variable...
roc.info <- roc (obese, glm.fit$fitted.values, legacy.axes= TRUE )
str (roc.info) ## and then extract just the information that we want from that variable.
roc.df <- data.frame (
tpp=roc.info$sensitivities*100, ## tpp = true positive percentage
fpp=(1 - roc.info$specificities)*100, ## fpp = false positive precentage
thresholds=roc.info$thresholds) head (roc.df) ## head() will show us the values for the upper right-hand corner
## of the ROC graph, when the threshold is so low
## (negative infinity) that every single sample is called 'obese'.
## Thus TPP = 100% and FPP = 100% tail (roc.df) ## tail() will show us the values for the lower left-hand corner
## of the ROC graph, when the threshold is so high (infinity)
## that every single sample is called 'not obese'.
## Thus, TPP = 0% and FPP = 0% ## now let's look at the thresholds between TPP 60% and 80%
roc.df[roc.df$tpp > 60 & roc.df$tpp < 80,] ## We can calculate the area under the curve...
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' , col= '#377eb8' , lwd=4, print.auc= TRUE ) ## ...and the partial area under the curve.
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' , col= '#377eb8' , lwd=4, print.auc= TRUE , print.auc.x=45, partial.auc= c (100, 90), auc.polygon = TRUE , auc.polygon.col = '#377eb822' ) #######################################
##
## Now let's fit the data with a random forest...
##
#######################################
rf.model <- randomForest ( factor (obese) ~ weight) ## ROC for random forest
roc (obese, rf.model$votes[,1], plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' , col= '#4daf4a' , lwd=4, print.auc= TRUE ) #######################################
##
## Now layer logistic regression and random forest ROC graphs..
##
#######################################
roc (obese, glm.fit$fitted.values, plot= TRUE , legacy.axes= TRUE , percent= TRUE , xlab= 'False Positive Percentage' , ylab= 'True Postive Percentage' , col= '#377eb8' , lwd=4, print.auc= TRUE ) plot.roc (obese, rf.model$votes[,1], percent= TRUE , col= '#4daf4a' , lwd=4, print.auc= TRUE , add= TRUE , print.auc.y=40)
legend ( 'bottomright' , legend= c ( 'Logisitic Regression' , 'Random Forest' ), col= c ( '#377eb8' , '#4daf4a' ), lwd=4) #######################################
##
## Now that we're done with our ROC fun, let's reset the par() variables.
## There are two ways to do it...
##
#######################################
par (pty = 'm' )
|