From faa5c24264a6dac42a87eafd394c0cb95f403896 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Mon, 7 Sep 2020 15:30:34 +0100 Subject: [PATCH 01/10] Improvements to ROC plots. ROC plot examples in Vignette. --- R/dCVnet_plotting.R | 60 +++++++++--- vignettes/merging-ROC.Rmd | 195 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 244 insertions(+), 11 deletions(-) create mode 100644 vignettes/merging-ROC.Rmd diff --git a/R/dCVnet_plotting.R b/R/dCVnet_plotting.R index a2a1277..3f44fb0 100644 --- a/R/dCVnet_plotting.R +++ b/R/dCVnet_plotting.R @@ -132,6 +132,12 @@ plot.dCVnet <- function(x, type = "tuning", ...) { #' @param legend logical. Display a legend? #' @param alphalabel should certain alpha values (probability thresholds) #' on the curve be highlighted with symbols indicating threshold? +#' @param guide_labels a named list of 3 labels used in the legend: +#' \itemize{ +#' \item{group = label for the Grouping factor} +#' \item{threshold = label for the threshold factor} +#' \item{refline = label for the reference line} +#' } #' @param ... additional arguments (unused) #' @return a ROC plot, as above. #' @@ -139,8 +145,13 @@ plot.dCVnet <- function(x, type = "tuning", ...) { plot.rocdata <- function(x, legend = TRUE, alphalabel = c(0.25, 0.5, 0.75), + guide_labels = c(group = "Model", + threshold = expression(P[Threshold]), + refline = "Chance\nPerformance"), ...) { - hasalphas <- any(!is.na(alphalabel)) + # were thresholds supplied? + hasalphas <- !any(is.na(alphalabel)) && !identical(FALSE, alphalabel) + print(hasalphas) .closest <- function(vals, x) { locs <- vapply(vals, function(v) which.min(abs(x - v)), FUN.VALUE = 1L) r <- rep(NA_character_, NROW(x)) @@ -169,25 +180,52 @@ plot.rocdata <- function(x, x$PThreshold <- factor(x$PThreshold, levels = names(alphalabel)) } - p <- ggplot2::ggplot(x, ggplot2::aes_string(y = "Sens", - x = "InvSpec", - group = "run", - colour = "run")) + - ggplot2::geom_abline(slope = 1, intercept = 0, - colour = "black") + - ggplot2::geom_line(show.legend = legend) + + refline <- data.frame(Sens = 0, + InvSpec = 0, + SensEnd = 1, + InvSpecEnd = 1, + lty = guide_labels$refline) + + p <- ggplot2::ggplot(x, + ggplot2::aes_string(y = "Sens", + x = "InvSpec")) + + ggplot2::geom_segment( + ggplot2::aes_( + x = ~InvSpec, + y = ~Sens, + xend = ~InvSpecEnd, + yend = ~SensEnd, + lty = ~lty), + show.legend = TRUE, + data = refline, + inherit.aes = FALSE) + + ggplot2::geom_line(ggplot2::aes_string(colour = "run"), + show.legend = legend) + ggplot2::xlab("False positive rate") + ggplot2::ylab("True positive rate") + ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) + ggplot2::scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) + + ggplot2::scale_size_manual(values = c(0.7)) + + ggplot2::guides( + colour = ggplot2::guide_legend(title = guide_labels$group), + lty = ggplot2::guide_legend(title = NULL, + override.aes = list( + linetype = 1, + shape = 32, + alpha = 1))) + ggplot2::coord_equal() + ggplot2::theme_light() - if ( hasalphas ) { + if ( hasalphas ) { p <- p + ggplot2::geom_point(data = x[!is.na(x$PThreshold), ], - mapping = ggplot2::aes_string(shape = "PThreshold"), - show.legend = legend) + mapping = ggplot2::aes_string(shape = "PThreshold", + colour = "run"), + show.legend = legend) + + ggplot2::guides( + shape = ggplot2::guide_legend(title = guide_labels$threshold, + override.aes = list(linetype = NA)) + ) } print(p) return(invisible(list(plot = p, diff --git a/vignettes/merging-ROC.Rmd b/vignettes/merging-ROC.Rmd 
new file mode 100644 index 0000000..32764b3 --- /dev/null +++ b/vignettes/merging-ROC.Rmd @@ -0,0 +1,195 @@ +--- +title: "Merging and Manipulating ROC plots" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Merging and Manipulating ROC plots} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) +``` + +***THIS VIGNETTE IS CURRENTLY WORK IN PROGRESS*** + +This example will demonstrate: + + * plotting a ROC curve for dCVnet + * ROC plots showing data from different dCVnet models. + * ROC plot customisation + +First, setup the R environment and make some example dCVnet objects: + +```{r setup} +knitr::opts_chunk$set( + echo = FALSE, + message = FALSE, + warning = FALSE +) +library(tidyverse) +ggplot2::theme_set(theme_minimal()) +library(dCVnet) +data(prostate, package = "dCVnet") + +prostate$agecat <- factor(prostate$age > 65, levels = c(FALSE, TRUE), labels = c("le65", "over65")) + +set.seed(42) + +# Model 1 predicts SVI (seminal vesicle invasion): +mod1 <- dCVnet::dCVnet(y = prostate$svi, + data = prostate %>% select(-train, -svi), + nrep_outer = 3, k_outer = 3, nrep_inner = 1, k_inner = 10) + +# Model 2 predicts whether Age > 65: +mod2 <- dCVnet::dCVnet(y = prostate$agecat, + data = prostate %>% select(-train, -age, -agecat), + nrep_outer = 3, k_outer = 3, nrep_inner = 1, k_inner = 10) + +``` + +# The default ROC plot +By default, calling `plot(my_dCVnet, type = "ROC")` on a dCVnet object +produces a plot of the cross-validated ROCs over the different outer loop +repetitions (to show the variability). + +Under the hood this function is composing three steps: + + 1. getting performance information from the outer-loop cross-validation (using `dCVnet::performance`) + 2. extracting sensitivity, specificity and threholds and turning this into a + standardised *rocdata* format using `dCVnet::extract_rocdata` + 3. running the plot method for rocdata objects (`plot.rocdata`). + + + +```{r default plot, echo=TRUE, fig.height=7, fig.width=7} +p1 <- plot(mod1, type = "ROC") + +# Note this is identical to: +# p1 <- plot(extract_rocdata(dCVnet::performance(mod1))) + +``` + +# Plots with averaged ROC +To plot overall (average) cross-validated ROC taken over the repetitions of the +repeated k-fold outer-loop cross-validation, use `dCVnet::average_rocdata()` on +the *rocdata* object. + +```{r average plot, echo=TRUE, fig.height=7, fig.width=7} +p2 <- plot(average_rocdata(extract_rocdata(performance(mod1)))) + +# This nested function is more readable if you use the piping function (%>%): +# p2 <- mod1 %>% +# performance %>% +# extract_rocdata %>% +# average_rocdata %>% +# plot() + +``` + +# Combining average and CV-variability +Perhaps we want to see both the average performance and its variability over +outerloop-cv repetitions. We can combine two rocdata objects with rbind +(they are really just data.frames with particular columns): + +```{r combined plot, echo=TRUE, fig.height=7, fig.width=7} +combined_roc_data <- rbind(extract_rocdata(performance(mod1)), + average_rocdata(extract_rocdata(performance(mod1)))) +p3 <- plot(combined_roc_data) + +# Written (paritally) using pipes: +# p3 <- rbind(mod1 %>% +# performance() %>% +# extract_rocdata(), +# mod1 %>% +# performance() %>% +# extract_rocdata() %>% +# average_rocdata()) + + +``` + +# Plotting from multiple models +Sometimes we want to display the results from two or more models +in the same ROC plot. 
Do this by `rbind`-ing the separate rocdata objects. +However, we first need to give informative names in the run column: + +```{r multimodel plot, echo=TRUE, fig.height=7, fig.width=7} +d1 <- mod1 %>% + performance() %>% + extract_rocdata() %>% + average_rocdata() %>% + mutate(run = "Model 1") # labels the data from this model. + +d2 <- mod2 %>% + performance() %>% + extract_rocdata() %>% + average_rocdata() %>% + mutate(run = "Model 2") # labels the data from this model + +p4 <- plot(rbind(d1, d2)) + + +``` + +Looking at `?plot.dCVnet` we can see that this is a call to `dCVnet::plot.rocdata` +From `?plot.rocdata` we see that this method uses a "rocdata" object which is +generated by `dCVnet::extract_rocdata`. + +# Plot options + +As set out below, most customisation is done via ggplot, but there are two +options for plot customisation in `plot.rocdata`. The legend can be toggled off +with `legend = FALSE`, and the threshold labels can be altered +(`alphalabel = c(0.05, 0.5, 0.95)`) or omitted (`alphalabel = FALSE`). + +```{r options1, echo=TRUE, fig.height=7, fig.width=7} + +# Make a version of p4 without a legend, and no markers for threshold: +p5 <- plot(rbind(d1, d2), alphalabel = FALSE, legend = FALSE) + + +``` + +# Customising plots + +dCVnet ROC plots are ggplot2 objects, so quite a few manipuations can be done +without recalculating the plot. In the following example I will modify `p5` the +multiple-models plot to... + + 1. Add a plot title + 2. Change the legend headings + 3. Manually set colours + 4. Change the plot appearance with a different ggplot theme + +```{r options2, echo=TRUE, fig.height=7, fig.width=7} + +p4$plot + + scale_colour_manual(values = c(`Model 2` = "goldenrod", + `Model 1` = "cadetblue")) + + labs(title = "Model Comparison", + colour = "Prediction\nModel", + shape = "Thresh.") + + theme_classic() + +``` + +# Further Customisation + +Somes changes are not easy to make in ggplot after the plot has been generated. +Things like remapping aesthetics. + +If you need a lot of control over the display, or would prefer a base R plot +rather than ggplot, the underlying data are returned by the call to +`plot.rocdata`. + +```{r plotdata} + +head(p4$data) + +``` + + From d2ac773322aca24cde941fa8a2ff98c2280723c2 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Mon, 7 Sep 2020 15:48:09 +0100 Subject: [PATCH 02/10] docfix: omitted from previous --- man/plot.rocdata.Rd | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/man/plot.rocdata.Rd b/man/plot.rocdata.Rd index 20ea9ff..92cfaa4 100644 --- a/man/plot.rocdata.Rd +++ b/man/plot.rocdata.Rd @@ -4,7 +4,14 @@ \alias{plot.rocdata} \title{plot.rocdata} \usage{ -\method{plot}{rocdata}(x, legend = TRUE, alphalabel = c(0.25, 0.5, 0.75), ...) +\method{plot}{rocdata}( + x, + legend = TRUE, + alphalabel = c(0.25, 0.5, 0.75), + guide_labels = c(group = "Model", threshold = expression(P[Threshold]), refline = + "Chance\\nPerformance"), + ... 
+) } \arguments{ \item{x}{\code{\link[=extract_rocdata]{rocdata}} object} @@ -14,6 +21,13 @@ \item{alphalabel}{should certain alpha values (probability thresholds) on the curve be highlighted with symbols indicating threshold?} +\item{guide_labels}{a named list of 3 labels used in the legend: +\itemize{ +\item{group = label for the Grouping factor} +\item{threshold = label for the threshold factor} +\item{refline = label for the reference line} +}} + \item{...}{additional arguments (unused)} } \value{ From ff2c6588016b0cde6f5f2c0219ce761f3a2f387b Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Tue, 8 Sep 2020 13:32:29 +0100 Subject: [PATCH 03/10] Further ROC improvements. --- R/dCVnet_plotting.R | 44 +++++++--- man/plot.rocdata.Rd | 4 +- vignettes/merging-ROC.Rmd | 164 +++++++++++++++++++++++--------------- 3 files changed, 132 insertions(+), 80 deletions(-) diff --git a/R/dCVnet_plotting.R b/R/dCVnet_plotting.R index 3f44fb0..097a9aa 100644 --- a/R/dCVnet_plotting.R +++ b/R/dCVnet_plotting.R @@ -135,8 +135,8 @@ plot.dCVnet <- function(x, type = "tuning", ...) { #' @param guide_labels a named list of 3 labels used in the legend: #' \itemize{ #' \item{group = label for the Grouping factor} -#' \item{threshold = label for the threshold factor} -#' \item{refline = label for the reference line} +#' \item{threshold = label for the Threshold factor} +#' \item{refline = label for the Reference line} #' } #' @param ... additional arguments (unused) #' @return a ROC plot, as above. @@ -149,9 +149,18 @@ plot.rocdata <- function(x, threshold = expression(P[Threshold]), refline = "Chance\nPerformance"), ...) { + guide_labels_expect <- c("group", "threshold", "refline") + if ( ! all( guide_labels_expect %in% names(guide_labels)) ) { + warning(paste("guide_labels must contain named elements:", + "\t\t'group', 'threshold' and 'refline'", + " Using defaults instead", sep = "\n")) + guide_labels <- c(group = "Model", + threshold = expression(P[Threshold]), + refline = "Chance\nPerformance") + } # were thresholds supplied? 
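  #   (threshold markers are drawn unless alphalabel is FALSE or contains NA)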
hasalphas <- !any(is.na(alphalabel)) && !identical(FALSE, alphalabel) - print(hasalphas) + .closest <- function(vals, x) { locs <- vapply(vals, function(v) which.min(abs(x - v)), FUN.VALUE = 1L) r <- rep(NA_character_, NROW(x)) @@ -196,36 +205,44 @@ plot.rocdata <- function(x, xend = ~InvSpecEnd, yend = ~SensEnd, lty = ~lty), - show.legend = TRUE, + show.legend = legend, data = refline, inherit.aes = FALSE) + ggplot2::geom_line(ggplot2::aes_string(colour = "run"), show.legend = legend) + - ggplot2::xlab("False positive rate") + - ggplot2::ylab("True positive rate") + + ggplot2::xlab("False Positive Rate") + + ggplot2::ylab("True Positive Rate") + ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) + ggplot2::scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) + ggplot2::scale_size_manual(values = c(0.7)) + - ggplot2::guides( - colour = ggplot2::guide_legend(title = guide_labels$group), + ggplot2::coord_equal() + + ggplot2::theme_light() + + if ( legend ) { + p <- p + ggplot2::guides( + colour = ggplot2::guide_legend(title = guide_labels$group, order = 1), lty = ggplot2::guide_legend(title = NULL, + order = 3, override.aes = list( linetype = 1, shape = 32, - alpha = 1))) + - ggplot2::coord_equal() + - ggplot2::theme_light() + alpha = 1))) + } if ( hasalphas ) { p <- p + ggplot2::geom_point(data = x[!is.na(x$PThreshold), ], mapping = ggplot2::aes_string(shape = "PThreshold", colour = "run"), - show.legend = legend) + - ggplot2::guides( + show.legend = legend) + + if ( legend ) { + p <- p + ggplot2::guides( shape = ggplot2::guide_legend(title = guide_labels$threshold, + order = 2, override.aes = list(linetype = NA)) ) + } } print(p) return(invisible(list(plot = p, @@ -233,6 +250,7 @@ plot.rocdata <- function(x, } + # ~ non-S3 plotters ------------------------------------------------------ diff --git a/man/plot.rocdata.Rd b/man/plot.rocdata.Rd index 92cfaa4..5616530 100644 --- a/man/plot.rocdata.Rd +++ b/man/plot.rocdata.Rd @@ -24,8 +24,8 @@ on the curve be highlighted with symbols indicating threshold?} \item{guide_labels}{a named list of 3 labels used in the legend: \itemize{ \item{group = label for the Grouping factor} -\item{threshold = label for the threshold factor} -\item{refline = label for the reference line} +\item{threshold = label for the Threshold factor} +\item{refline = label for the Reference line} }} \item{...}{additional arguments (unused)} diff --git a/vignettes/merging-ROC.Rmd b/vignettes/merging-ROC.Rmd index 32764b3..cd64df1 100644 --- a/vignettes/merging-ROC.Rmd +++ b/vignettes/merging-ROC.Rmd @@ -14,61 +14,59 @@ comment = "#>" ) ``` -***THIS VIGNETTE IS CURRENTLY WORK IN PROGRESS*** +This vignette will demonstrate: -This example will demonstrate: - - * plotting a ROC curve for dCVnet - * ROC plots showing data from different dCVnet models. + * Plotting a ROC curve for a dCVnet object + * ROC plots with data from different dCVnet models. 
* ROC plot customisation First, setup the R environment and make some example dCVnet objects: -```{r setup} -knitr::opts_chunk$set( - echo = FALSE, - message = FALSE, - warning = FALSE -) -library(tidyverse) -ggplot2::theme_set(theme_minimal()) +```{r setup, echo=TRUE, warning=FALSE, message=FALSE, results='hide'} +library(ggplot2) +library(dplyr) library(dCVnet) + data(prostate, package = "dCVnet") -prostate$agecat <- factor(prostate$age > 65, levels = c(FALSE, TRUE), labels = c("le65", "over65")) +prostate$agecat <- factor(prostate$age > 65, + levels = c(FALSE, TRUE), + labels = c("le65", "over65")) set.seed(42) - # Model 1 predicts SVI (seminal vesicle invasion): mod1 <- dCVnet::dCVnet(y = prostate$svi, - data = prostate %>% select(-train, -svi), - nrep_outer = 3, k_outer = 3, nrep_inner = 1, k_inner = 10) + data = subset(prostate, + select = c(-train, -svi)), + nrep_outer = 3, k_outer = 3, + nrep_inner = 1, k_inner = 10) # Model 2 predicts whether Age > 65: mod2 <- dCVnet::dCVnet(y = prostate$agecat, - data = prostate %>% select(-train, -age, -agecat), - nrep_outer = 3, k_outer = 3, nrep_inner = 1, k_inner = 10) + data = subset(prostate, + select = c(-train, -age, -agecat)), + nrep_outer = 3, k_outer = 3, + nrep_inner = 1, k_inner = 10) ``` # The default ROC plot By default, calling `plot(my_dCVnet, type = "ROC")` on a dCVnet object -produces a plot of the cross-validated ROCs over the different outer loop +produces a plot of the cross-validated ROCs for the different outer-loop repetitions (to show the variability). Under the hood this function is composing three steps: - 1. getting performance information from the outer-loop cross-validation (using `dCVnet::performance`) - 2. extracting sensitivity, specificity and threholds and turning this into a + 1. getting performance information from the outer-loop cross-validation + (using `dCVnet::performance`) + 2. extracting sensitivity, specificity and thresholds and turning this into a standardised *rocdata* format using `dCVnet::extract_rocdata` 3. running the plot method for rocdata objects (`plot.rocdata`). - - ```{r default plot, echo=TRUE, fig.height=7, fig.width=7} p1 <- plot(mod1, type = "ROC") -# Note this is identical to: +# Note this is the same as: # p1 <- plot(extract_rocdata(dCVnet::performance(mod1))) ``` @@ -80,8 +78,7 @@ the *rocdata* object. ```{r average plot, echo=TRUE, fig.height=7, fig.width=7} p2 <- plot(average_rocdata(extract_rocdata(performance(mod1)))) - -# This nested function is more readable if you use the piping function (%>%): +# The above nested function is more readable if you use pipes (%>%): # p2 <- mod1 %>% # performance %>% # extract_rocdata %>% @@ -91,16 +88,16 @@ p2 <- plot(average_rocdata(extract_rocdata(performance(mod1)))) ``` # Combining average and CV-variability -Perhaps we want to see both the average performance and its variability over +Perhaps we want to see both the average performance, and its variability over outerloop-cv repetitions. 
We can combine two rocdata objects with rbind -(they are really just data.frames with particular columns): +(they are just data.frame format with a particular set of columns): ```{r combined plot, echo=TRUE, fig.height=7, fig.width=7} combined_roc_data <- rbind(extract_rocdata(performance(mod1)), average_rocdata(extract_rocdata(performance(mod1)))) p3 <- plot(combined_roc_data) -# Written (paritally) using pipes: +# Or, as above, but (mostly) using pipes: # p3 <- rbind(mod1 %>% # performance() %>% # extract_rocdata(), @@ -112,6 +109,21 @@ p3 <- plot(combined_roc_data) ``` +# Plotting uncrossvalidated ROC +The 'final model' of dCVnet is the model we cross-validated, it would be used +to make a prediction for new data. However its performance in the training data +not crossvalidated, and so is typically an overestimate. + +Sometimes we want to see this training set performance all the same - it forms +an upper bound on cross-validated performance. *However, care must always be * +*taken not to interpret this as a cross-validated estimate.* + +To plot the uncrossvalidated performance of a final model: +```{r final model, echo=TRUE, fig.height=7, fig.width=7} +plot(extract_rocdata(performance(mod1$final))) +``` + + # Plotting from multiple models Sometimes we want to display the results from two or more models in the same ROC plot. Do this by `rbind`-ing the separate rocdata objects. @@ -130,66 +142,88 @@ d2 <- mod2 %>% average_rocdata() %>% mutate(run = "Model 2") # labels the data from this model -p4 <- plot(rbind(d1, d2)) +d3 <- mod1$final %>% + performance() %>% + extract_rocdata() %>% + average_rocdata() %>% + mutate(run = "Model 1 (train)") # labels the data from this model. + +d4 <- mod2$final %>% + performance() %>% + extract_rocdata() %>% + average_rocdata() %>% + mutate(run = "Model 2 (train)") # labels the data from this model. +p4 <- plot(rbind(d1, d2, d3, d4)) -``` -Looking at `?plot.dCVnet` we can see that this is a call to `dCVnet::plot.rocdata` -From `?plot.rocdata` we see that this method uses a "rocdata" object which is -generated by `dCVnet::extract_rocdata`. +``` # Plot options -As set out below, most customisation is done via ggplot, but there are two -options for plot customisation in `plot.rocdata`. The legend can be toggled off -with `legend = FALSE`, and the threshold labels can be altered -(`alphalabel = c(0.05, 0.5, 0.95)`) or omitted (`alphalabel = FALSE`). +A good deal of customisation can be done with ggplot (see below), but there are +a few options provided for plot customisation in `plot.rocdata`. The legend can +be toggled off with `legend = FALSE`, different threshold label points can be +chosen (`alphalabel = c(0.05, 0.5, 0.95)`) or these can be omitted +(`alphalabel = FALSE`). Finally alternative labels can be provided for +legend headings with the `guide_labels` argument. ```{r options1, echo=TRUE, fig.height=7, fig.width=7} - -# Make a version of p4 without a legend, and no markers for threshold: -p5 <- plot(rbind(d1, d2), alphalabel = FALSE, legend = FALSE) - +# Make a version of p4 without a legend or markers for threshold: +p5 <- plot(rbind(d1, d2), + alphalabel = FALSE, + legend = FALSE) + +# And one with custom threshold markers and legend headings: +p5 <- plot(rbind(d1, d2), + alphalabel = c(0.5), + guide_labels = list(group = "plotdata", + threshold = "50% Threshold", + refline = "line")) ``` # Customising plots -dCVnet ROC plots are ggplot2 objects, so quite a few manipuations can be done -without recalculating the plot. 
In the following example I will modify `p5` the -multiple-models plot to... +dCVnet ROC plots are ggplot2 objects. As a result many manipulations can be done +without recalculating the plot. In the following example I will modify `p4$plot` +(the multiple-models plot) in order to... - 1. Add a plot title - 2. Change the legend headings - 3. Manually set colours - 4. Change the plot appearance with a different ggplot theme + 1. Add a plot title and change axis labels + 2. Manually set colours + 3. Change the plot appearance with a different ggplot2 theme + 4. Suppress the legend for the reference line but not other plot features + +Note: ggplot2 legends are determined by aesthetics which are mapped to the data, +in this this plot Model is mapped to *colour*, Threshold markers to *shape* +and the reference line to *lty* (linetype). ```{r options2, echo=TRUE, fig.height=7, fig.width=7} - p4$plot + - scale_colour_manual(values = c(`Model 2` = "goldenrod", - `Model 1` = "cadetblue")) + - labs(title = "Model Comparison", - colour = "Prediction\nModel", - shape = "Thresh.") + + # add a title, change axis labels: + labs(title = "Comparison of ROC Performance") + + xlab("1 - Specificity") + + ylab("Sensitivity") + + # set manual colours: + scale_colour_manual(values = c(`Model 2` = "hotpink4", + `Model 2 (train)` = "hotpink", + `Model 1` = "blue4", + `Model 1 (train)` = "blue")) + + # suppress part of the legend: + guides(lty = guide_none()) + + # use a different theme: theme_classic() ``` # Further Customisation -Somes changes are not easy to make in ggplot after the plot has been generated. -Things like remapping aesthetics. - -If you need a lot of control over the display, or would prefer a base R plot -rather than ggplot, the underlying data are returned by the call to -`plot.rocdata`. - -```{r plotdata} +If you need more control over the display, or would prefer to plot using +base R, or different software entirely, the underlying data are +returned by the call to `plot.rocdata`. +```{r plotdata, echo=TRUE} head(p4$data) - ``` From bace9c234c2bcb1dba88ecc47f2009b3fdc1c5bd Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Tue, 8 Sep 2020 13:32:57 +0100 Subject: [PATCH 04/10] Spelling correction --- R/dCVnet_utilities.R | 2 +- inst/WORDLIST | 2 ++ man/tidy_predict.multialpha.repeated.cv.glmnet.Rd | 2 +- vignettes/dCVnet-vs-glm.Rmd | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/R/dCVnet_utilities.R b/R/dCVnet_utilities.R index 2754b40..c946669 100644 --- a/R/dCVnet_utilities.R +++ b/R/dCVnet_utilities.R @@ -861,7 +861,7 @@ tidy_predict.glmnet <- function(mod, #' #' @param alpha specify an alpha, or leave NULL to use the optimal alpha #' identified by \code{\link{multialpha.repeated.cv.glmnet}}. -#' @param s specfy a lambda, or leave NULL to use the optimal lambda +#' @param s specify a lambda, or leave NULL to use the optimal lambda #' identified by \code{\link{multialpha.repeated.cv.glmnet}}. 
#' #' @inheritParams tidy_predict.glmnet diff --git a/inst/WORDLIST b/inst/WORDLIST index 2353e1d..b854585 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -23,12 +23,14 @@ Elasticnet enet errorbars foldid +ggplot Github glm glmlist glmnet innerloop JMLR +lm Maudsley mgaussian MLR diff --git a/man/tidy_predict.multialpha.repeated.cv.glmnet.Rd b/man/tidy_predict.multialpha.repeated.cv.glmnet.Rd index 8c505e4..d9aa220 100644 --- a/man/tidy_predict.multialpha.repeated.cv.glmnet.Rd +++ b/man/tidy_predict.multialpha.repeated.cv.glmnet.Rd @@ -21,7 +21,7 @@ tidy_predict.multialpha.repeated.cv.glmnet( \item{newx}{new values of x for which predictions are desired.} -\item{s}{specfy a lambda, or leave NULL to use the optimal lambda +\item{s}{specify a lambda, or leave NULL to use the optimal lambda identified by \code{\link{multialpha.repeated.cv.glmnet}}.} \item{alpha}{specify an alpha, or leave NULL to use the optimal alpha diff --git a/vignettes/dCVnet-vs-glm.Rmd b/vignettes/dCVnet-vs-glm.Rmd index b0a8677..3ef73e8 100644 --- a/vignettes/dCVnet-vs-glm.Rmd +++ b/vignettes/dCVnet-vs-glm.Rmd @@ -260,7 +260,7 @@ however repeated cross-validation will not be liberal when this is not possible. ## Random Labels We have shown that regularisation with elastic net can produce models which -generalise better, but is the proceedure robust? What happens +generalise better, but is the procedure robust? What happens when it is asked to predict noise/nonsense? In the next section (note: output suppressed for brevity) the class labels of From 726540b238c7940fd58e304d3856f2fec0a64731 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Tue, 8 Sep 2020 14:26:58 +0100 Subject: [PATCH 05/10] Add suggests for ROC vignette. --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index fe9f168..9505b3a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Suggests: rmarkdown, openxlsx, car, + dplyr, psych, boot, tidyverse, From 3758136b609c749d81e20c0feec733650ea973d6 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Fri, 11 Sep 2020 15:27:31 +0100 Subject: [PATCH 06/10] FEAT: Add Brier Score to summary.performance --- R/dCVnet_performance.R | 16 ++++++++++++-- tests/testthat/test-dCVnet_classperformance.R | 21 +++++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/R/dCVnet_performance.R b/R/dCVnet_performance.R index d227a62..5657876 100644 --- a/R/dCVnet_performance.R +++ b/R/dCVnet_performance.R @@ -334,7 +334,9 @@ summary.performance <- function(object, # However glm / model.matrix uses treatment-coding: the # first level of y is the reference class (e.g. control subjects). # This behaviour is helpful when interpreting model coefficients - # as they represent the deviation from the reference. + # as they represent the deviation from the reference such that + # a positive coefficient indicates an increased probability of + # the 'positive' class. # glmnet: to add to the confusion glmnet will reorder binary factors # such that levels are alphabetical. 
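  #   (i.e. with alphabetical levels, the second level acts as the 'positive' class)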
# dCVnet coerces input data into an alphabetical factor with the @@ -348,8 +350,18 @@ summary.performance <- function(object, # Next add the AUC: B <- ModelMetrics::auc(actual = performance$reference, predicted = performance$prediction) + # and the Brier Score: + if ( is.numeric(performance$reference) ) { + Bs <- ModelMetrics::brier(actual = performance$reference, + predicted = performance$prediction) + } else { + Bs <- ModelMetrics::brier(actual = as.numeric(performance$reference) - 1, + predicted = performance$prediction) + } # following hack removed: B <- pmax(B, 1 - B) - B <- data.frame(Measure = "AUROC", Value = B, stringsAsFactors = FALSE) + B <- data.frame(Measure = c("AUROC", "Brier"), + Value = c(B, Bs), + stringsAsFactors = FALSE) B$label <- A$label <- unique(performance$label) return(rbind(A, B)) diff --git a/tests/testthat/test-dCVnet_classperformance.R b/tests/testthat/test-dCVnet_classperformance.R index 02bbf72..edc7e94 100644 --- a/tests/testthat/test-dCVnet_classperformance.R +++ b/tests/testthat/test-dCVnet_classperformance.R @@ -3,6 +3,7 @@ context("tests of model performance objects") +# Setup perfect_classification <- structure( data.frame( reference = c("A", "A", "B", "B"), @@ -13,19 +14,31 @@ perfect_classification <- structure( ), class = c("performance", "data.frame") ) +imperfect_classification <- perfect_classification +imperfect_classification$reference <- rev(perfect_classification$reference) + perfect_classification.s <- summary(perfect_classification) +imperfect_classification.s <- summary(imperfect_classification) class_measures <- c("Accuracy", "Kappa", "Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value", "Precision", "Recall", "F1", "Prevalence", "Detection Rate", - "Detection Prevalence", "Balanced Accuracy", "AUROC") - + "Detection Prevalence", "Balanced Accuracy", + "AUROC", "Brier") -test_that("prediction measures work", { +# The tests: +test_that("perfect binomial classification is as expected", { expect_equal(perfect_classification.s$Value[ perfect_classification.s$Measure %in% class_measures], - c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 1, 1) + c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 1, 1, 0.0875) + ) +}) + +test_that("imperfect binomial classification is as expected", { + expect_equal(imperfect_classification.s$Value[ + imperfect_classification.s$Measure %in% class_measures], + c(0, -1, 0, 0, 0, 0, 0, 0, NaN, 0.5, 0.0, 0.5, 0, 0, 0.5875) ) }) From 11ad63cde25b01b5951b942e69286c949bfbfd0c Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Thu, 17 Sep 2020 13:01:12 +0100 Subject: [PATCH 07/10] FEATURE: added Calibration --- NAMESPACE | 4 + R/dCVnet_calibration.R | 116 ++++++++++++++++++ R/dCVnet_performance.R | 1 - R/dCVnet_utilities.R | 50 +++++--- man/calibration.Rd | 41 +++++++ man/cv_performance_glm.Rd | 4 + tests/testthat/test-dCVnet_classperformance.R | 28 ++++- 7 files changed, 220 insertions(+), 24 deletions(-) create mode 100644 R/dCVnet_calibration.R create mode 100644 man/calibration.Rd diff --git a/NAMESPACE b/NAMESPACE index d24b934..0c90b4c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +S3method(calibration,default) S3method(coef,dCVnet) S3method(coef,multialpha.repeated.cv.glmnet) S3method(performance,dCVnet) @@ -12,17 +13,20 @@ S3method(plot,multialpha.repeated.cv.glmnet) S3method(plot,rocdata) S3method(predict,dCVnet) S3method(predict,multialpha.repeated.cv.glmnet) +S3method(print,calcoefs) S3method(print,dCVnet) S3method(print,multialpha.repeated.cv.glmnet) 
S3method(print,performance) S3method(print,reflogreg) S3method(reflogreg,dCVnet) S3method(reflogreg,default) +S3method(summary,calcoefs) S3method(summary,dCVnet) S3method(summary,multialpha.repeated.cv.glmnet) S3method(summary,performance) export(amalgamate_cv.glmnet) export(average_rocdata) +export(calibration) export(casesummary.performance) export(coef_reflogreg) export(coefficients_summary) diff --git a/R/dCVnet_calibration.R b/R/dCVnet_calibration.R new file mode 100644 index 0000000..d090474 --- /dev/null +++ b/R/dCVnet_calibration.R @@ -0,0 +1,116 @@ + +# calibration S3 generic --------------------------------------------- + +# implements 'weak' (i.e. linear) calibration. + + +#' calibration +#' +#' calculates 'weak' (i.e. intercept + slope) calibration equivalent to the +#' values returned by the val.prob function in the rms package +#' (which accompanies Frank Harrell's Regression Modelling Strategies book). +#' +#' Calibration is not returned via \code{\link{performance}} because of the +#' computational overhead of model fitting. +#' +#' @name calibration +#' +#' @param x an object containing predicted probabilities and observed outcomes, +#' from which calibration can be extracted. +#' @param ... arguments to pass on +#' +#' @return calibration intercept and calibration slope +#' +#' @export +calibration <- function(x, ...) { + UseMethod("calibration") +} + +# Internal function to run the actual calculation: +calibration_ <- function(predictedprobability, + observedoutcome) { + # xx is data.frame with reference & prediction (no group) + m <- glm(formula = observedoutcome ~ qlogis(predictedprobability), + family = "binomial") + cc <- coef(m) + return(c(Intercept = cc[[1]], Slope = cc[[2]])) +} + + +#' calibration.default +#' +#' @rdname calibration +#' @description Default function behaviour assumes input is a +#' list/data.frame with required vectors as elements. +#' @param ppid indicator for predicted probability element in x +#' (column "name" or index) +#' @param ooid indicator for observed outcome element in x +#' (column "name" or index) +#' @param gid indicator for grouping variable in x +#' (column "name" or index). Set to NA to force no grouping. +#' @export +calibration.default <- function(x, + ppid = "prediction", + ooid = "reference", + gid = "label", + ...) { + if ( length(unique(c(ppid, ooid, gid))) != 3 ) { + stop("Predictions and observations must be distinct elements in x") + } + pp <- x[[ppid]] + oo <- x[[ooid]] + + # No implicit recyling + if ( length(pp) != length(oo) ) { + stop("numbers of predicted probabilities and + observed outcomes must match") + } + + # grouping variable? + if ( is.na(gid) ) { + g <- rep(deparse(substitute(x)), times = length(pp)) + } else { + g <- x[[gid]] + } + + # iterable for groups: + glist <- setNames(unique(g), unique(g)) + + # calibration for each group: + result <- vapply( + glist, + FUN = function(i) { + calibration_(predictedprobability = pp[g %in% i], + observedoutcome = oo[g %in% i]) + }, + FUN.VALUE = c(Intercept = 0.0, + Slope = 1.0), + USE.NAMES = TRUE) + + class(result) <- c("calcoefs") + result +} + +# Simple print function for calibration result +#' @export +print.calcoefs <- function(x, ...) { + cat("Calibration coefficents\n") + print(unclass(x)) + invisible(x) +} + +# Simple summary function for calibration result +#' @export +summary.calcoefs <- function(object, + FUNS = c(mean = mean, + sd = sd, + min = min, + max = max), + ...) 
{ + if ( NCOL(object) < 2 ) return(object) + return(sapply(FUNS, function(f) { + + apply(object, 1, f) + + })) +} diff --git a/R/dCVnet_performance.R b/R/dCVnet_performance.R index 5657876..d0739fd 100644 --- a/R/dCVnet_performance.R +++ b/R/dCVnet_performance.R @@ -398,7 +398,6 @@ summary.performance <- function(object, return(R) } - #' report_performance_summary #' #' extracts performance from a dCVnet object diff --git a/R/dCVnet_utilities.R b/R/dCVnet_utilities.R index c946669..e3a68bf 100644 --- a/R/dCVnet_utilities.R +++ b/R/dCVnet_utilities.R @@ -177,14 +177,14 @@ parseddata_summary <- function(object) { sprintf(ytab, fmt = "%i"), " (", yptab, "%)") } else if ( object$family == "cox") { - stry <- aggregate(list(Time = object$y[, 1]), - by = list(Outcome = object$y[, 2]), - summary) + stry <- aggregate(list(Time = object$y[, 1]), + by = list(Outcome = object$y[, 2]), + summary) } else { - # should be gaussian (1d mat / vector), - # poisson (1d mat / vector) or - # mgaussian (data.frame) - stry <- summary(object$y) + # should be gaussian (1d mat / vector), + # poisson (1d mat / vector) or + # mgaussian (data.frame) + stry <- summary(object$y) } # Next the predictor matrix: @@ -1044,6 +1044,8 @@ predict_cat.glm <- function(glm, threshold = 0.5) { #' @param y outcome vector (numeric or factor) #' @param data predictors in a data.frame #' @param f a formula to apply to x +#' @param return_summary bool. return summarised performance (default), or +#' \code{\link{performance}} objects for further analysis (set to FALSE) #' @param ... other arguments #' @inheritParams multialpha.repeated.cv.glmnet #' @inheritParams repeated.cv.glmnet @@ -1059,15 +1061,16 @@ predict_cat.glm <- function(glm, threshold = 0.5) { #' @seealso \code{\link[boot]{cv.glm}}, \code{\link{performance}} #' @export cv_performance_glm <- function(y, - data, - f = "~.", # nolint - folds = NULL, - k = 10, - nrep = 2, - family = "binomial", - opt.ystratify = TRUE, - opt.uniquefolds = FALSE, - ...) { + data, + f = "~.", # nolint + folds = NULL, + k = 10, + nrep = 2, + family = "binomial", + opt.ystratify = TRUE, + opt.uniquefolds = FALSE, + return_summary = TRUE, + ...) { cl <- as.list(match.call())[-1] parsed <- parse_dCVnet_input(data = data, y = y, f = f, family = family) @@ -1135,13 +1138,22 @@ cv_performance_glm <- function(y, class = c("performance", "data.frame")) - pp <- report_performance_summary(pp) + ppp <- report_performance_summary(pp) + + if ( return_summary ) { + return(list( + glm.performance = p0, + cv.performance = ppp, + folds = folds, + call = cl)) + } return(list( - glm.performance = p0, + glm.performance = performance(m0), cv.performance = pp, folds = folds, - call = cl)) + call = cl + )) } # function was removed from glmnet in 3.0 update. diff --git a/man/calibration.Rd b/man/calibration.Rd new file mode 100644 index 0000000..aeb6b88 --- /dev/null +++ b/man/calibration.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dCVnet_calibration.R +\name{calibration} +\alias{calibration} +\alias{calibration.default} +\title{calibration} +\usage{ +calibration(x, ...) + +\method{calibration}{default}(x, ppid = "prediction", ooid = "reference", gid = "label", ...) 
+} +\arguments{ +\item{x}{an object containing predicted probabilities and observed outcomes, +from which calibration can be extracted.} + +\item{...}{arguments to pass on} + +\item{ppid}{indicator for predicted probability element in x +(column "name" or index)} + +\item{ooid}{indicator for observed outcome element in x +(column "name" or index)} + +\item{gid}{indicator for grouping variable in x +(column "name" or index). Set to NA to force no grouping.} +} +\value{ +calibration intercept and calibration slope +} +\description{ +calculates 'weak' (i.e. intercept + slope) calibration equivalent to the +values returned by the val.prob function in the rms package +(which accompanies Frank Harrell's Regression Modelling Strategies book). + +Default function behaviour assumes input is a +list/data.frame with required vectors as elements. +} +\details{ +Calibration is not returned via \code{\link{performance}} because of the +computational overhead of model fitting. +} diff --git a/man/cv_performance_glm.Rd b/man/cv_performance_glm.Rd index 045aafb..2bfcfee 100644 --- a/man/cv_performance_glm.Rd +++ b/man/cv_performance_glm.Rd @@ -14,6 +14,7 @@ cv_performance_glm( family = "binomial", opt.ystratify = TRUE, opt.uniquefolds = FALSE, + return_summary = TRUE, ... ) } @@ -44,6 +45,9 @@ In most circumstances folds will be unique. This requests that random folds are checked for uniqueness in inner and outer loops. Currently it warns if non-unique values are found.} +\item{return_summary}{bool. return summarised performance (default), or +\code{\link{performance}} objects for further analysis (set to FALSE)} + \item{...}{other arguments} } \value{ diff --git a/tests/testthat/test-dCVnet_classperformance.R b/tests/testthat/test-dCVnet_classperformance.R index edc7e94..2fd747a 100644 --- a/tests/testthat/test-dCVnet_classperformance.R +++ b/tests/testthat/test-dCVnet_classperformance.R @@ -20,6 +20,12 @@ imperfect_classification$reference <- rev(perfect_classification$reference) perfect_classification.s <- summary(perfect_classification) imperfect_classification.s <- summary(imperfect_classification) + +perfect_classification.c <- suppressWarnings( + calibration(perfect_classification)) +imperfect_classification.c <- suppressWarnings( + calibration(imperfect_classification)) + class_measures <- c("Accuracy", "Kappa", "Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value", @@ -28,17 +34,31 @@ class_measures <- c("Accuracy", "Kappa", "Detection Prevalence", "Balanced Accuracy", "AUROC", "Brier") +test_that("classes are correct", { + expect_s3_class(perfect_classification, "performance") + expect_s3_class(imperfect_classification, "performance") + expect_s3_class(perfect_classification.c, "calcoefs") + expect_s3_class(imperfect_classification.c, "calcoefs") +}) + # The tests: -test_that("perfect binomial classification is as expected", { +test_that("binomial classification is as expected", { + # perfect: expect_equal(perfect_classification.s$Value[ perfect_classification.s$Measure %in% class_measures], c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0.5, 0.5, 0.5, 1, 1, 0.0875) ) -}) - -test_that("imperfect binomial classification is as expected", { + # imperfect: expect_equal(imperfect_classification.s$Value[ imperfect_classification.s$Measure %in% class_measures], c(0, -1, 0, 0, 0, 0, 0, 0, NaN, 0.5, 0.0, 0.5, 0, 0, 0.5875) ) }) + +known_cal <- c(-11.62047, 55.67869) +test_that("binomial calibration is as expected", { + # perfect: + expect_equal(as.vector(perfect_classification.c), known_cal) + # imperfect: + 
expect_equal(as.vector(imperfect_classification.c), known_cal * -1) +}) From 324eb5048f658e48898f13159a8fbbe3c4acee50 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Tue, 6 Oct 2020 15:38:01 +0100 Subject: [PATCH 08/10] Infinity handling for Calibration --- R/dCVnet_calibration.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R/dCVnet_calibration.R b/R/dCVnet_calibration.R index d090474..b21936a 100644 --- a/R/dCVnet_calibration.R +++ b/R/dCVnet_calibration.R @@ -29,8 +29,20 @@ calibration <- function(x, ...) { # Internal function to run the actual calculation: calibration_ <- function(predictedprobability, observedoutcome) { + logit <- qlogis(predictedprobability) + finite <- is.finite(logit) + if ( any(!finite) ) { + # remove from fit: + logit <- logit[finite] + observedoutcome <- observedoutcome[finite] + # warn: + n <- length(finite) + i <- sum(!finite) + warning(paste0(i, "/", n, + "values removed due to predictions equal to 0 / 1")) + } # xx is data.frame with reference & prediction (no group) - m <- glm(formula = observedoutcome ~ qlogis(predictedprobability), + m <- glm(formula = observedoutcome ~ logit, family = "binomial") cc <- coef(m) return(c(Intercept = cc[[1]], Slope = cc[[2]])) From a219a2c5f034dedcf64d53d4496d693ca6e11641 Mon Sep 17 00:00:00 2001 From: Andrew Lawrence Date: Tue, 6 Oct 2020 15:38:54 +0100 Subject: [PATCH 09/10] Improve documentation, testing, and handling of factor levels --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/dCVnet_calibration.R | 2 + R/dCVnet_main.R | 2 + R/dCVnet_utilities.R | 137 ++++++++----- man/cv_performance_glm.Rd | 3 +- man/dCVnet.Rd | 34 +++- man/multialpha.repeated.cv.glmnet.Rd | 3 +- man/parse_dCVnet_input.Rd | 43 ++-- man/repeated.cv.glmnet.Rd | 3 +- tests/testthat/test-dCVnet_classperformance.R | 3 +- tests/testthat/test-dCVnet_parse_data.R | 189 ++++++++++++++++++ 12 files changed, 345 insertions(+), 77 deletions(-) create mode 100644 tests/testthat/test-dCVnet_parse_data.R diff --git a/DESCRIPTION b/DESCRIPTION index 9505b3a..18352d3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,7 @@ Imports: methods, stats Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 0c90b4c..0752f61 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ importFrom(stats,model.matrix) importFrom(stats,prcomp) importFrom(stats,predict) importFrom(stats,predict.glm) +importFrom(stats,qlogis) importFrom(stats,relevel) importFrom(stats,sd) importFrom(stats,setNames) diff --git a/R/dCVnet_calibration.R b/R/dCVnet_calibration.R index b21936a..ba0f8dc 100644 --- a/R/dCVnet_calibration.R +++ b/R/dCVnet_calibration.R @@ -21,6 +21,8 @@ #' #' @return calibration intercept and calibration slope #' +#' @importFrom stats qlogis +#' #' @export calibration <- function(x, ...) { UseMethod("calibration") diff --git a/R/dCVnet_main.R b/R/dCVnet_main.R index 1e9ba0c..ce2cd88 100644 --- a/R/dCVnet_main.R +++ b/R/dCVnet_main.R @@ -14,6 +14,8 @@ #' and \bold{alpha} (the balance of L1 and L2 regularisation types). #' #' @inheritParams parse_dCVnet_input +#' @inheritSection parse_dCVnet_input Factor Outcomes +#' @inheritSection parse_dCVnet_input Notes #' @inheritParams multialpha.repeated.cv.glmnet #' #' @param k_inner an integer, the k in the inner k-fold CV. 
diff --git a/R/dCVnet_utilities.R b/R/dCVnet_utilities.R index e3a68bf..fe39326 100644 --- a/R/dCVnet_utilities.R +++ b/R/dCVnet_utilities.R @@ -1,6 +1,30 @@ # Utility Functions ------------------------------------------------------- +check_categorical_outcome <- function(cat) { + + .is_alphabetical <- function(factor) { + lvl <- levels(factor) + identical(lvl, sort(lvl)) + } + + if ( is.numeric(cat) ) return(cat) + if ( is.factor(cat) ) { + if ( ! .is_alphabetical(cat) ) { + stop("The order of factor levels is not alphabetical. + glmnet ignores levels and treats factor levels in y + alphabetically.") + } + return(cat) + } + if ( is.character(cat) ) return(factor(cat, levels = sort(unique(cat)))) + # at this point we have already returned for: + # is.numeric, is.factor and is.character. + # failure mode: co-erce factor + return(as.factor(cat)) +} + + #' parse_dCVnet_input #' #' Collate an outcome (y) predictor matrix (x) into a standardised object ready @@ -9,21 +33,28 @@ #' transformations and expansions using R formula notation. #' #' @section Factor Outcomes: -#' For binomial and multinomial families \code{\link[glmnet]{glmnet}} -#' coerces non-numeric y data to a factor with labels sorted alphabetically. -#' Numeric y is left unchanged (behaviour verified in version 2.0.18). -#' This matters because most R functions (e.g. glm) honour the ordering -#' of the factor levels. A binomial glm fit to a factor with levels -#' \code{c("control", "case")}, will predict "case" and the model will return -#' predicted probabilities of being a "case". \code{\link[glmnet]{glmnet}} -#' will return the predicted probabilities of being "control" because it -#' follows "case" alphabetically. Currently cannot be avoided while using -#' glmnet. -#' To ensure dCVnet results are comparable between glmnet and glm / other -#' packages an error will be thrown if y is a factor and the levels are not -#' sorted alphabetically. Factor levels should be corrected such that -#' levels(y) and sort(levels(y)) agree. -#' e.g. \code{y <- forcats::fct_recode(y, case = "zcase")} +#' For categorical families (binomial, multinomial) input can be: +#' \itemize{ +#' \item{numeric (integer): c(0,1,2)} +#' \item{factor: factor(1:3, labels = c("A", "B", "C")))} +#' \item{character: c("A", "B", "C")} +#' \item{other} +#' } +#' These are treated differently. +#' +#' Numeric data is used as provided. +#' Character data will be coerced to a factor: +#' \code{factor(x, levels = sort(unique(x)))}. +#' Factor data will be used as provided, but *must* have levels in +#' alphabetical order. +#' +#' In all cases *the reference category must be ordered first*, +#' this means for the binomial family the 'positive' category is second. +#' +#' Why alphabetical? Previously bugs arose due to different handling +#' of factor levels between functions called by dCVnet. These appear to be +#' resolved in the latest versions of the packages, but this restriction will +#' stay until I can verify. #' #' @section Notes: #' Sparse matrices are not supported by dCVnet. @@ -33,6 +64,7 @@ #' @param data a data.frame containing variables needed for the formula (f). #' @param y the outcome (can be numeric vector, #' a factor (for binomial / multinomial) or a matrix for cox/mgaussian) +#' For factors see Factor Outcomes section below. #' @param f a one sided formula. 
#' The RHS must refer to columns in \code{data} and may include #' interactions, transformations or expansions (like \code{\link{poly}}, or @@ -41,7 +73,8 @@ #' @param family the model family (see \code{\link{glmnet}}) #' @param offset optional model offset (see \code{\link{glmnet}}) #' @param yname an optional label for the outcome / y variable. -#' @param passNA should NA values be excluded (FALSE) or passed through (TRUE)? +#' @param passNA should NA values in data be excluded (FALSE) +#' or passed through (TRUE)? #' #' @return a list containing #' \itemize{ @@ -76,44 +109,42 @@ parse_dCVnet_input <- function(data, data <- data[, vars, drop = FALSE] - # Custom, paired x/y removal of incomplete data: + # Paired removal of incomplete data from x/y + # missing y is never passed, + # missing x (data) is optionally passed according to passNA + ycomplete <- stats::complete.cases(y) + dcomplete <- stats::complete.cases(data) + # only worry about missing data if y is not missing: + dmiss <- any( (!dcomplete) & ycomplete ) + + # special behaviour for missing in data not y if passNA: + if ( passNA && dmiss ) { + dcomplete <- rep(TRUE, length(dcomplete)) + dmiss <- FALSE + warning("Passing NAs - missing data (non-outcome)") + } + + # joint removal: + complete <- ycomplete & dcomplete + miss <- any(!complete) - # remove missing in x/data (unless pass NAs for imputing): - if ( !passNA && any(!stats::complete.cases(data)) ) { - cat(paste0("Removing ", sum(!stats::complete.cases(data)), - " of ", NROW(data), + if ( miss ) { + y <- subset(y, complete) + data <- subset(data, complete) + warning(paste0("Removing ", sum(!complete), + " of ", length(complete), " subjects due to missing data.\n")) - y <- subset(y, stats::complete.cases(data)) - data <- subset(data, stats::complete.cases(data)) - } - # always remove missing in y: - if ( any(!stats::complete.cases(y)) ) { - cat(paste0("Removing ", sum(!stats::complete.cases(y)), - " of ", NROW(y), - " subjects due to missing y.\n")) - data <- subset(data, stats::complete.cases(y)) - y <- subset(y, stats::complete.cases(y)) } # coerce y into factor and check, because... # - glmnet (for its own infernal reasons) does not honour factor ordering, # the factor levels are converted to character and sorted alphabetically - # - previously there was functionality to change factor ordering according - # to a reference category, but this is inconsistent with glmnet. + # - previously in dCVnet there was an argument ('positive') to change factor + # ordering from the call to dCVnet. This was not honoured by glmnet. # - if y is numeric it is assumed the user knows what they are doing. - if ( family %in% c("binomial", "multinomial") && !is.numeric(y)) { - - y <- as.factor(y) # coerce y to factor - - # check if - lvl <- levels(y) - alvl <- sort(lvl) - - if ( !identical(lvl, alvl) ) { - stop("The order of factor levels is not alphabetical. - glmnet ignores levels and treats factor levels in y - alphabetically.") - } + # NOTE: now not certain if this was my bug rather than glmnet. + if ( family %in% c("binomial", "multinomial") ) { + y <- check_categorical_outcome(y) } # Make a model matrix of RHS variables @@ -786,23 +817,21 @@ tidy_predict.glmnet <- function(mod, exact = FALSE, newoffset = newoffset, ...) - if ( class(p) == "array" ) { - p <- p[,,1] # nolint # extra dims not needed. 
- } - - p <- as.data.frame(p, stringAsFactors = FALSE) + # remove excess dims, convert to data.frame + p <- as.data.frame(drop(p), stringAsFactors = FALSE) if ( !is.null(newy)) { a <- as.data.frame(newy, stringsAsFactors = FALSE) } + cn <- "prediction" # different rules for column names of p: if ( family %in% c("mgaussian", "multinomial") ) { - colnames(p) <- paste0("prediction", colnames(p)) - } else { - colnames(p) <- "prediction" + cn <- paste0("prediction", colnames(p)) } + colnames(p) <- cn + p$rowid <- rownames(p) if ( family %in% c("gaussian", "poisson") ) { diff --git a/man/cv_performance_glm.Rd b/man/cv_performance_glm.Rd index 2bfcfee..98fb737 100644 --- a/man/cv_performance_glm.Rd +++ b/man/cv_performance_glm.Rd @@ -34,7 +34,8 @@ and the k in repeated k-fold cv.} \item{nrep}{the number of repetitions} -\item{family}{Response type (see above)} +\item{family}{Response type (see above). Either a character string representing +one of the built-in families, or else a \code{glm()} family object.} \item{opt.ystratify}{Boolean. Outer and inner sampling is stratified by outcome. diff --git a/man/dCVnet.Rd b/man/dCVnet.Rd index f62532f..1f2e421 100644 --- a/man/dCVnet.Rd +++ b/man/dCVnet.Rd @@ -26,7 +26,8 @@ dCVnet( } \arguments{ \item{y}{the outcome (can be numeric vector, -a factor (for binomial / multinomial) or a matrix for cox/mgaussian)} +a factor (for binomial / multinomial) or a matrix for cox/mgaussian) +For factors see Factor Outcomes section below.} \item{data}{a data.frame containing variables needed for the formula (f).} @@ -98,6 +99,37 @@ Elasticnet hyperparameters are \bold{lambda} (the total regularisation penalty) and \bold{alpha} (the balance of L1 and L2 regularisation types). } +\section{Factor Outcomes}{ + +For categorical families (binomial, multinomial) input can be: +\itemize{ +\item{numeric (integer): c(0,1,2)} +\item{factor: factor(1:3, labels = c("A", "B", "C")))} +\item{character: c("A", "B", "C")} +\item{other} +} +These are treated differently. + +Numeric data is used as provided. +Character data will be coerced to a factor: +\code{factor(x, levels = sort(unique(x)))}. +Factor data will be used as provided, but \emph{must} have levels in +alphabetical order. + +In all cases \emph{the reference category must be ordered first}, +this means for the binomial family the 'positive' category is second. + +Why alphabetical? Previously bugs arose due to different handling +of factor levels between functions called by dCVnet. These appear to be +resolved in the latest versions of the packages, but this restriction will +stay until I can verify. +} + +\section{Notes}{ + +Sparse matrices are not supported by dCVnet. +} + \examples{ \dontrun{ diff --git a/man/multialpha.repeated.cv.glmnet.Rd b/man/multialpha.repeated.cv.glmnet.Rd index de16e59..ec4aff1 100644 --- a/man/multialpha.repeated.cv.glmnet.Rd +++ b/man/multialpha.repeated.cv.glmnet.Rd @@ -65,7 +65,8 @@ In most circumstances folds will be unique. This requests that random folds are checked for uniqueness in inner and outer loops. Currently it warns if non-unique values are found.} -\item{family}{Response type (see above)} +\item{family}{Response type (see above). Either a character string representing +one of the built-in families, or else a \code{glm()} family object.} \item{opt.keep_models}{The models take up memory. What should we return? 
 \itemize{
diff --git a/man/parse_dCVnet_input.Rd b/man/parse_dCVnet_input.Rd
index 6569341..afd6215 100644
--- a/man/parse_dCVnet_input.Rd
+++ b/man/parse_dCVnet_input.Rd
@@ -18,7 +18,8 @@ parse_dCVnet_input(
 \item{data}{a data.frame containing variables needed for the formula (f).}
 
 \item{y}{the outcome (can be numeric vector,
-a factor (for binomial / multinomial) or a matrix for cox/mgaussian)}
+a factor (for binomial / multinomial) or a matrix for cox/mgaussian).
+For factors see the Factor Outcomes section below.}
 
 \item{family}{the model family (see \code{\link{glmnet}})}
 
@@ -32,7 +33,8 @@ The formula \emph{must} include an intercept.}
 
 \item{yname}{an optional label for the outcome / y variable.}
 
-\item{passNA}{should NA values be excluded (FALSE) or passed through (TRUE)?}
+\item{passNA}{should NA values in data be excluded (FALSE)
+or passed through (TRUE)?}
 }
 \value{
 a list containing
@@ -52,21 +54,28 @@ transformations and expansions using R formula notation.
 }
 \section{Factor Outcomes}{
 
-For binomial and multinomial families \code{\link[glmnet]{glmnet}}
-coerces non-numeric y data to a factor with labels sorted alphabetically.
-Numeric y is left unchanged (behaviour verified in version 2.0.18).
-This matters because most R functions (e.g. glm) honour the ordering
-of the factor levels. A binomial glm fit to a factor with levels
-\code{c("control", "case")}, will predict "case" and the model will return
-predicted probabilities of being a "case". \code{\link[glmnet]{glmnet}}
-will return the predicted probabilities of being "control" because it
-follows "case" alphabetically. Currently cannot be avoided while using
-glmnet.
-To ensure dCVnet results are comparable between glmnet and glm / other
-packages an error will be thrown if y is a factor and the levels are not
-sorted alphabetically. Factor levels should be corrected such that
-levels(y) and sort(levels(y)) agree.
-e.g. \code{y <- forcats::fct_recode(y, case = "zcase")}
+For categorical families (binomial, multinomial) input can be:
+\itemize{
+\item{numeric (integer): c(0,1,2)}
+\item{factor: factor(1:3, labels = c("A", "B", "C"))}
+\item{character: c("A", "B", "C")}
+\item{other}
+}
+These are treated differently.
+
+Numeric data is used as provided.
+Character data will be coerced to a factor:
+\code{factor(x, levels = sort(unique(x)))}.
+Factor data will be used as provided, but \emph{must} have levels in
+alphabetical order.
+
+In all cases \emph{the reference category must be ordered first};
+for the binomial family this means the 'positive' category is second.
+
+Why alphabetical? Previously bugs arose due to different handling
+of factor levels between functions called by dCVnet. These appear to be
+resolved in the latest versions of the packages, but this restriction will
+stay until I can verify.
 }
 
 \section{Notes}{
diff --git a/man/repeated.cv.glmnet.Rd b/man/repeated.cv.glmnet.Rd
index de61b13..37cc56a 100644
--- a/man/repeated.cv.glmnet.Rd
+++ b/man/repeated.cv.glmnet.Rd
@@ -57,7 +57,8 @@ cross-validation? The default (nfolds=NULL) uses 10-fold.}
 
 \item{nreps}{if folds are not specified, how many times to repeat k-fold
 cross-validation? The default (nreps=NULL) uses 5 repeats.}
 
-\item{family}{Response type (see above)}
+\item{family}{Response type (see above). Either a character string representing
+one of the built-in families, or else a \code{glm()} family object.}
 
 \item{...}{arguments passed to \code{\link[glmnet]{cv.glmnet}}}
diff --git a/tests/testthat/test-dCVnet_classperformance.R b/tests/testthat/test-dCVnet_classperformance.R
index 2fd747a..3a77f16 100644
--- a/tests/testthat/test-dCVnet_classperformance.R
+++ b/tests/testthat/test-dCVnet_classperformance.R
@@ -1,8 +1,9 @@
-# class performance object (from scratch)
 
 context("tests of model performance objects")
 
+# class performance object (from scratch)
+
 # Setup
 perfect_classification <- structure(
   data.frame(
diff --git a/tests/testthat/test-dCVnet_parse_data.R b/tests/testthat/test-dCVnet_parse_data.R
new file mode 100644
index 0000000..6b5a074
--- /dev/null
+++ b/tests/testthat/test-dCVnet_parse_data.R
@@ -0,0 +1,189 @@
+
+context("tests of data parsing")
+
+nested_anyna <- function(x) {
+  any(sapply(x, function(k) any(is.na(k))))
+}
+
+# init some data:
+y1 <- rep(0:1, each = 10)
+set.seed(42)
+x1 <- data.frame(matrix(rnorm(20 * 10), nrow = 20, ncol = 10))
+
+# Versions with missing data:
+y2 <- y1
+y2[c(8, 12)] <- NA
+
+x2 <- x1
+x2[sample(NROW(x2), size = 3), sample(NCOL(x2), size = 3)] <- NA
+
+
+# minimal call:
+p1.basic <- dCVnet::parse_dCVnet_input(y = y1, data = x1, family = "binomial")
+
+# yname:
+yn <- paste0(sample(letters, size = 7), collapse = "")
+p1.yname <- dCVnet::parse_dCVnet_input(y = y1, data = x1, family = "binomial",
+                                       yname = yn)
+
+
+# basic tests -------------------------------------------------------------
+
+test_that("parsed data equals input for simple calls", {
+  # outcome values:
+  expect_equal(y1, p1.basic$y)
+  expect_equal(y1, p1.yname$y)
+  # data values:
+  expect_equivalent(as.matrix(x1), p1.basic$x_mat)
+  expect_equivalent(as.matrix(x1), p1.yname$x_mat)
+  # data colnames:
+  expect_equal(colnames(p1.basic$x_mat), colnames(x1))
+  expect_equal(colnames(p1.yname$x_mat), colnames(x1))
+
+  # yname:
+  expect_equal("y", p1.basic$yname)
+  expect_equal(yn, p1.yname$yname)
+
+  # output when yname changes:
+  expect_equal(p1.basic[names(p1.basic) != "yname"],
+               p1.yname[names(p1.yname) != "yname"])
+})
+
+
+# missing data ------------------------------------------------------------
+
+test_that("parsed data works with missing data", {
+  # First y only missing:
+  expect_warning((p2.y <- dCVnet::parse_dCVnet_input(y = y2,
+                                                     data = x1,
+                                                     family = "binomial")))
+  expect_equal(NROW(p2.y$y), NROW(p2.y$x_mat))
+  expect_equal(NROW(p2.y$y), sum(complete.cases(y2)))
+  expect_equal(nested_anyna(p2.y), FALSE)
+  # Second x only missing:
+  expect_warning((p2.x <- dCVnet::parse_dCVnet_input(y = y1,
+                                                     data = x2,
+                                                     family = "binomial")))
+  expect_equal(NROW(p2.x$y), NROW(p2.x$x_mat))
+  expect_equal(NROW(p2.x$y), sum(complete.cases(x2)))
+  expect_equal(nested_anyna(p2.x), FALSE)
+
+  # Third x&y missing:
+  expect_warning((p2.xy <- dCVnet::parse_dCVnet_input(y = y2,
+                                                      data = x2,
+                                                      family = "binomial")))
+  expect_equal(NROW(p2.xy$y), NROW(p2.xy$x_mat))
+  expect_equal(NROW(p2.xy$y), sum(complete.cases(x2) & complete.cases(y2)))
+  expect_equal(nested_anyna(p2.xy), FALSE)
+})
+
+# as above, but now passNA:
+test_that("missing x can be passed (but not y)", {
+  # First y only missing (no effect of passNA):
+  expect_warning((p2.y <- dCVnet::parse_dCVnet_input(y = y2,
+                                                     data = x1,
+                                                     family = "binomial",
+                                                     passNA = FALSE)))
+  expect_warning((p3.y <- dCVnet::parse_dCVnet_input(y = y2,
+                                                     data = x1,
+                                                     family = "binomial",
+                                                     passNA = TRUE)))
+
+  expect_identical(p3.y, p2.y)
+  # Second x only missing:
+  expect_warning((p3.x <- dCVnet::parse_dCVnet_input(y = y1,
+                                                     data = x2,
+                                                     family = "binomial",
+                                                     passNA = TRUE)))
+  expect_equal(NROW(p3.x$y), NROW(p3.x$x_mat))
+  expect_gt(NROW(p3.x$y), sum(complete.cases(x2)))
+  expect_equal(nested_anyna(p3.x), TRUE)
+
+  # Third x&y missing:
+  expect_warning((p3.xy <- dCVnet::parse_dCVnet_input(y = y2,
+                                                      data = x2,
+                                                      family = "binomial",
+                                                      passNA = TRUE)))
+  expect_equal(NROW(p3.xy$y), NROW(p3.xy$x_mat))
+  expect_gt(NROW(p3.xy$y), sum(complete.cases(x2) & complete.cases(y2)))
+  expect_equal(nested_anyna(p3.xy), TRUE)
+})
+
+
+# Family Handling ---------------------------------------------------------
+
+# ~ binomial --------------------------------------------------------------
+
+# make some factor labels
+set.seed(42)
+y_fac_labs <- sort(replicate(2,
+                             paste(sample(letters, size = 7), collapse = "")))
+y_fac_rlabs <- y_fac_labs[2:1]
+
+# outcome variables:
+ovars <- list(
+  int = y1,                    # integer format
+  rint = -y1 + 1,              # reverse coded
+  fac = factor(y1,
+               levels = 0:1,
+               labels = y_fac_labs),   # factor (alphabetical)
+  rfac = factor(y1,
+                levels = 0:1,
+                labels = y_fac_rlabs), # factor (non-alpha) - error expected.
+  char = y_fac_labs[y1 + 1],   # character (one way)
+  rchar = y_fac_rlabs[y1 + 1]  # character (the other)
+)
+
+# get all output:
+res <- lapply(ovars, function(y) {
+  try(parse_dCVnet_input(y = y, data = x1, family = "binomial"),
+      silent = TRUE)
+})
+
+test_that("binomial y is formatted as expected", {
+  expect_identical(res[[1]]$y, ovars[[1]])
+  expect_identical(res[[2]]$y, ovars[[2]])
+  expect_identical(res[[3]]$y, ovars[[3]])
+  expect_identical(class(res[[4]]), "try-error")
+  expect_identical(as.character(res[[5]]$y), ovars[[5]])
+  expect_identical(as.character(res[[6]]$y), ovars[[6]])
+})
+
+# ~ multinomial -------------------------------------------------------------
+
+# make some factor labels
+set.seed(42)
+m1 <- sample(1:5, size = 100, replace = TRUE)
+m1.x <- matrix(0, nrow = length(m1), ncol = 5)
+m_fac_labs <- sort(replicate(5,
+                             paste(sample(letters, size = 7), collapse = "")))
+m_fac_rlabs <- m_fac_labs[5:1]
+
+# outcome variables:
+ovars <- list(
+  int = m1,                    # integer format
+  rint = -m1 + 6,              # reverse coded
+  fac = factor(m1,
+               levels = 1:5,
+               labels = m_fac_labs),   # factor (alphabetical)
+  rfac = factor(m1,
+                levels = 1:5,
+                labels = m_fac_rlabs), # factor (non-alpha) - error expected.
+  char = m_fac_labs[m1],      # character (one way)
+  rchar = m_fac_rlabs[m1]     # character (the other)
+)
+
+# get all output:
+res <- lapply(ovars, function(y) {
+  try(parse_dCVnet_input(y = y, data = m1.x, family = "multinomial"),
+      silent = TRUE)
+})
+
+test_that("multinomial y is formatted as expected", {
+  expect_identical(res[[1]]$y, ovars[[1]])
+  expect_identical(res[[2]]$y, ovars[[2]])
+  expect_identical(res[[3]]$y, ovars[[3]])
+  expect_identical(class(res[[4]]), "try-error")
+  expect_identical(as.character(res[[5]]$y), ovars[[5]])
+  expect_identical(as.character(res[[6]]$y), ovars[[6]])
+})

From aeadfa3053d0f772d470ff656a99f1e36e4f6df7 Mon Sep 17 00:00:00 2001
From: Andrew Lawrence
Date: Wed, 7 Oct 2020 12:28:31 +0100
Subject: [PATCH 10/10] improve README.md
 increment version

---
 DESCRIPTION |  2 +-
 README.md   | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 18352d3..085eb14 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dCVnet
 Type: Package
 Title: doubly cross-validated elastic-net regularised generalised linear models
-Version: 1.0.6
+Version: 1.0.7
 Authors@R: c(person("Andrew J.", "Lawrence",
                     email = "lawrenceajl@gmail.com",
                     role = c("aut", "cre")))
diff --git a/README.md b/README.md
index 303aea1..e7156a6 100644
--- a/README.md
+++ b/README.md
@@ -24,14 +24,19 @@ The commands below will install missing package dependencies
 (see the [DESCRIPTION](DESCRIPTION) file *Imports* section).
 It will then run a toy example from the package's main function.
 
-#### Install dCVnet from GitHub:
+#### Install dCVnet (from GitHub):
 ```
-devtools::install_github("AndrewLawrence/dCVnet", dependencies = TRUE)
+devtools::install_github("AndrewLawrence/dCVnet", dependencies = TRUE, build_vignettes = TRUE)
 ```
 
-#### Install dCVnet from an Archive:
+#### -or- install the dev version of dCVnet (from GitHub):
 ```
-devtools::install_local("path/to/dCVnet_1.0.6.tar.gz", dependencies = TRUE)
+devtools::install_github("AndrewLawrence/dCVnet@dev", dependencies = TRUE, build_vignettes = TRUE)
+```
+
+#### Install dCVnet (from an Archive):
+```
+devtools::install_local("path/to/dCVnet_1.0.7.tar.gz", dependencies = TRUE, build_vignettes = TRUE)
 ```
 
 #### Run a simple example:
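
A minimal sketch of preparing a binomial outcome under the Factor Outcomes rule documented in the dCVnet.Rd / parse_dCVnet_input.Rd changes above (alphabetical levels; reference category first, 'positive' category second). The outcome vector and labels here are made up for illustration, not taken from the package:

```
# A two-level character outcome is coerced as documented:
#   factor(x, levels = sort(unique(x)))
x <- c("control", "case", "case", "control")
y <- factor(x, levels = sort(unique(x)))
levels(y)  # "case" "control": "case" becomes the reference level,
           # so "control" would be treated as the 'positive' class.

# If "case" should instead be the positive class, relabel so that the
# alphabetical ordering puts the intended reference category first:
y <- factor(x,
            levels = c("control", "case"),
            labels = c("a_control", "b_case"))
identical(levels(y), sort(levels(y)))  # TRUE, so the alphabetical check passes
```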
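The column-naming refactor at the top of this patch (the `cn <- "prediction"` block) gives single-response families one `prediction` column while mgaussian/multinomial keep one prefixed column per response or class. A standalone sketch of that behaviour with a made-up 3-class probability array (glmnet's multinomial predictions typically come back as an nobs x nclass x nlambda array, hence the `drop()`); variable names here are illustrative only:

```
# Made-up multinomial 'predictions': 2 observations x 3 classes x 1 lambda.
p <- array(runif(6), dim = c(2, 3, 1),
           dimnames = list(NULL, c("A", "B", "C"), NULL))
family <- "multinomial"

p <- as.data.frame(drop(p), stringsAsFactors = FALSE)  # drop the length-1 dim

cn <- "prediction"                              # single-column default
if (family %in% c("mgaussian", "multinomial")) {
  cn <- paste0("prediction", colnames(p))       # one column per class
}
colnames(p) <- cn  # "predictionA" "predictionB" "predictionC"
```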