Merge pull request #2 from AndrewLawrence/dev

Version 1.07
AndrewLawrence · Oct 7, 2020 · 3be9cc7 · 3be9cc7
2 parents 13f6a27 + aeadfa3
commit 3be9cc7
Show file tree

Hide file tree

Showing 21 changed files with 931 additions and 123 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dCVnet
 Type: Package
 Title: doubly cross-validated elastic-net regularised generalised linear models
-Version: 1.0.6
+Version: 1.0.7
 Authors@R: c(person("Andrew J.", "Lawrence", 
                     email = "lawrenceajl@gmail.com",
                     role = c("aut", "cre")))
@@ -27,12 +27,13 @@ Imports:
     methods,
     stats
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Suggests:
     knitr,
     rmarkdown,
     openxlsx,
     car,
+    dplyr,
     psych,
     boot,
     tidyverse,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(calibration,default)
 S3method(coef,dCVnet)
 S3method(coef,multialpha.repeated.cv.glmnet)
 S3method(performance,dCVnet)
@@ -12,17 +13,20 @@ S3method(plot,multialpha.repeated.cv.glmnet)
 S3method(plot,rocdata)
 S3method(predict,dCVnet)
 S3method(predict,multialpha.repeated.cv.glmnet)
+S3method(print,calcoefs)
 S3method(print,dCVnet)
 S3method(print,multialpha.repeated.cv.glmnet)
 S3method(print,performance)
 S3method(print,reflogreg)
 S3method(reflogreg,dCVnet)
 S3method(reflogreg,default)
+S3method(summary,calcoefs)
 S3method(summary,dCVnet)
 S3method(summary,multialpha.repeated.cv.glmnet)
 S3method(summary,performance)
 export(amalgamate_cv.glmnet)
 export(average_rocdata)
+export(calibration)
 export(casesummary.performance)
 export(coef_reflogreg)
 export(coefficients_summary)
@@ -71,6 +75,7 @@ importFrom(stats,model.matrix)
 importFrom(stats,prcomp)
 importFrom(stats,predict)
 importFrom(stats,predict.glm)
+importFrom(stats,qlogis)
 importFrom(stats,relevel)
 importFrom(stats,sd)
 importFrom(stats,setNames)

diff --git a/R/dCVnet_calibration.R b/R/dCVnet_calibration.R
@@ -0,0 +1,130 @@
+
+# calibration S3 generic ---------------------------------------------
+
+# implements 'weak' (i.e. linear) calibration.
+
+
+#' calibration
+#'
+#' calculates 'weak' (i.e. intercept + slope) calibration equivalent to the
+#'     values returned by the val.prob function in the rms package
+#'     (which accompanies Frank Harrell's Regression Modelling Strategies book).
+#'
+#' Calibration is not returned via \code{\link{performance}} because of the
+#'     computational overhead of model fitting.
+#'
+#' @name calibration
+#'
+#' @param x an object containing predicted probabilities and observed outcomes,
+#'     from which calibration can be extracted.
+#' @param ... arguments to pass on
+#'
+#' @return calibration intercept and calibration slope
+#'
+#' @importFrom stats qlogis
+#'
+#' @export
+calibration <- function(x, ...) {
+  UseMethod("calibration")
+}
+
+# Internal function to run the actual calculation:
+calibration_ <- function(predictedprobability,
+                         observedoutcome) {
+  logit <- qlogis(predictedprobability)
+  finite <- is.finite(logit)
+  if ( any(!finite) ) {
+    # remove from fit:
+    logit <- logit[finite]
+    observedoutcome <- observedoutcome[finite]
+    # warn:
+    n <- length(finite)
+    i <- sum(!finite)
+    warning(paste0(i, "/", n,
+                   "values removed due to predictions equal to 0 / 1"))
+  }
+  # xx is data.frame with reference & prediction (no group)
+  m <- glm(formula = observedoutcome ~ logit,
+           family = "binomial")
+  cc <- coef(m)
+  return(c(Intercept = cc[[1]], Slope = cc[[2]]))
+}
+
+
+#' calibration.default
+#'
+#' @rdname calibration
+#' @description Default function behaviour assumes input is a
+#'     list/data.frame with required vectors as elements.
+#' @param ppid indicator for predicted probability element in x
+#'     (column "name" or index)
+#' @param ooid indicator for observed outcome element in x
+#'     (column "name" or index)
+#' @param gid indicator for grouping variable in x
+#'     (column "name" or index). Set to NA to force no grouping.
+#' @export
+calibration.default <- function(x,
+                                ppid = "prediction",
+                                ooid = "reference",
+                                gid = "label",
+                                ...) {
+  if ( length(unique(c(ppid, ooid, gid))) != 3 ) {
+    stop("Predictions and observations must be distinct elements in x")
+  }
+  pp <- x[[ppid]]
+  oo <- x[[ooid]]
+
+  # No implicit recyling
+  if ( length(pp) != length(oo) ) {
+    stop("numbers of predicted probabilities and
+         observed outcomes must match")
+  }
+
+  # grouping variable?
+  if ( is.na(gid) ) {
+    g <- rep(deparse(substitute(x)), times = length(pp))
+  } else {
+    g <- x[[gid]]
+  }
+
+  # iterable for groups:
+  glist <- setNames(unique(g), unique(g))
+
+  # calibration for each group:
+  result <- vapply(
+    glist,
+    FUN = function(i) {
+      calibration_(predictedprobability = pp[g %in% i],
+                   observedoutcome = oo[g %in% i])
+    },
+    FUN.VALUE = c(Intercept = 0.0,
+                  Slope = 1.0),
+    USE.NAMES = TRUE)
+
+  class(result) <- c("calcoefs")
+  result
+}
+
+# Simple print function for calibration result
+#' @export
+print.calcoefs <- function(x, ...) {
+  cat("Calibration coefficents\n")
+  print(unclass(x))
+  invisible(x)
+}
+
+# Simple summary function for calibration result
+#' @export
+summary.calcoefs <- function(object,
+                             FUNS = c(mean = mean,
+                                      sd = sd,
+                                      min = min,
+                                      max = max),
+                             ...) {
+  if ( NCOL(object) < 2 ) return(object)
+  return(sapply(FUNS, function(f) {
+
+    apply(object, 1, f)
+
+  }))
+}
diff --git a/R/dCVnet_main.R b/R/dCVnet_main.R
@@ -14,6 +14,8 @@
 #'     and \bold{alpha} (the balance of L1 and L2 regularisation types).
 #'
 #' @inheritParams parse_dCVnet_input
+#' @inheritSection parse_dCVnet_input Factor Outcomes
+#' @inheritSection parse_dCVnet_input Notes
 #' @inheritParams multialpha.repeated.cv.glmnet
 #'
 #' @param k_inner an integer, the k in the inner k-fold CV.

diff --git a/R/dCVnet_performance.R b/R/dCVnet_performance.R
@@ -334,7 +334,9 @@ summary.performance <- function(object,
         #   However glm / model.matrix uses treatment-coding: the
         #   first level of y is the reference class (e.g. control subjects).
         #   This behaviour is helpful when interpreting model coefficients
-        #   as they represent the deviation from the reference.
+        #   as they represent the deviation from the reference such that
+        #   a positive coefficient indicates an increased probability of
+        #   the 'positive' class.
         # glmnet: to add to the confusion glmnet will reorder binary factors
         #   such that levels are alphabetical.
         # dCVnet coerces input data into an alphabetical factor with the
@@ -348,8 +350,18 @@ summary.performance <- function(object,
     # Next add the AUC:
     B <- ModelMetrics::auc(actual = performance$reference,
                            predicted = performance$prediction)
+    # and the Brier Score:
+    if ( is.numeric(performance$reference) ) {
+      Bs <- ModelMetrics::brier(actual = performance$reference,
+                                predicted = performance$prediction)
+    } else {
+      Bs <- ModelMetrics::brier(actual = as.numeric(performance$reference) - 1,
+                                predicted = performance$prediction)
+    }
     # following hack removed: B <- pmax(B, 1 - B)
-    B <- data.frame(Measure = "AUROC", Value = B, stringsAsFactors = FALSE)
+    B <- data.frame(Measure = c("AUROC", "Brier"),
+                    Value = c(B, Bs),
+                    stringsAsFactors = FALSE)
 
     B$label <- A$label <- unique(performance$label)
     return(rbind(A, B))
@@ -386,7 +398,6 @@ summary.performance <- function(object,
   return(R)
 }
 
-
 #' report_performance_summary
 #'
 #' extracts performance from a dCVnet object

diff --git a/R/dCVnet_plotting.R b/R/dCVnet_plotting.R
@@ -132,15 +132,35 @@ plot.dCVnet <- function(x, type = "tuning", ...) {
 #' @param legend logical. Display a legend?
 #' @param alphalabel should certain alpha values (probability thresholds)
 #'     on the curve be highlighted with symbols indicating threshold?
+#' @param guide_labels a named list of 3 labels used in the legend:
+#'     \itemize{
+#'     \item{group = label for the Grouping factor}
+#'     \item{threshold = label for the Threshold factor}
+#'     \item{refline = label for the Reference line}
+#'     }
 #' @param ... additional arguments (unused)
 #' @return a ROC plot, as above.
 #'
 #' @export
 plot.rocdata <- function(x,
                          legend = TRUE,
                          alphalabel = c(0.25, 0.5, 0.75),
+                         guide_labels = c(group = "Model",
+                                          threshold = expression(P[Threshold]),
+                                          refline = "Chance\nPerformance"),
                          ...) {
-  hasalphas <- any(!is.na(alphalabel))
+  guide_labels_expect <- c("group", "threshold", "refline")
+  if ( ! all( guide_labels_expect %in% names(guide_labels)) ) {
+    warning(paste("guide_labels must contain named elements:",
+                  "\t\t'group', 'threshold' and 'refline'",
+                  " Using defaults instead", sep = "\n"))
+    guide_labels <- c(group = "Model",
+                      threshold = expression(P[Threshold]),
+                      refline = "Chance\nPerformance")
+  }
+  # were thresholds supplied?
+  hasalphas <- !any(is.na(alphalabel)) && !identical(FALSE, alphalabel)
+
   .closest <- function(vals, x) {
     locs <- vapply(vals, function(v) which.min(abs(x - v)), FUN.VALUE = 1L)
     r <- rep(NA_character_, NROW(x))
@@ -169,32 +189,68 @@ plot.rocdata <- function(x,
     x$PThreshold <- factor(x$PThreshold, levels = names(alphalabel))
   }
 
-  p <- ggplot2::ggplot(x, ggplot2::aes_string(y = "Sens",
-                                              x = "InvSpec",
-                                              group = "run",
-                                              colour = "run")) +
-    ggplot2::geom_abline(slope = 1, intercept = 0,
-                         colour = "black") +
-    ggplot2::geom_line(show.legend = legend) +
-    ggplot2::xlab("False positive rate") +
-    ggplot2::ylab("True positive rate") +
+  refline <- data.frame(Sens = 0,
+                        InvSpec = 0,
+                        SensEnd = 1,
+                        InvSpecEnd = 1,
+                        lty = guide_labels$refline)
+
+  p <- ggplot2::ggplot(x,
+                       ggplot2::aes_string(y = "Sens",
+                                           x = "InvSpec")) +
+    ggplot2::geom_segment(
+      ggplot2::aes_(
+        x = ~InvSpec,
+        y = ~Sens,
+        xend = ~InvSpecEnd,
+        yend = ~SensEnd,
+        lty = ~lty),
+      show.legend = legend,
+      data = refline,
+      inherit.aes = FALSE) +
+    ggplot2::geom_line(ggplot2::aes_string(colour = "run"),
+                       show.legend = legend) +
+    ggplot2::xlab("False Positive Rate") +
+    ggplot2::ylab("True Positive Rate") +
     ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) +
     ggplot2::scale_y_continuous(breaks = seq(0, 1, 0.1), minor_breaks = NULL) +
+    ggplot2::scale_size_manual(values = c(0.7)) +
     ggplot2::coord_equal() +
     ggplot2::theme_light()
 
-  if ( hasalphas  ) {
+  if ( legend ) {
+    p <- p + ggplot2::guides(
+      colour = ggplot2::guide_legend(title = guide_labels$group, order = 1),
+      lty = ggplot2::guide_legend(title = NULL,
+                                  order = 3,
+                                  override.aes = list(
+                                    linetype = 1,
+                                    shape = 32,
+                                    alpha = 1)))
+  }
+
+  if ( hasalphas ) {
     p <- p +
       ggplot2::geom_point(data = x[!is.na(x$PThreshold), ],
-                          mapping = ggplot2::aes_string(shape = "PThreshold"),
+                          mapping = ggplot2::aes_string(shape = "PThreshold",
+                                                        colour = "run"),
                           show.legend = legend)
+
+    if ( legend ) {
+      p <- p + ggplot2::guides(
+        shape = ggplot2::guide_legend(title = guide_labels$threshold,
+                                      order = 2,
+                                      override.aes = list(linetype = NA))
+      )
+    }
   }
   print(p)
   return(invisible(list(plot = p,
                         data = x)))
 }
 
 
+
 #  ~ non-S3 plotters ------------------------------------------------------