
Commit: Margins (#105)
* trying marginal effects; fixes #101

* adding in REmargins function

* sketch out remargins vignette

* RE margins implemented and demonstrated

* closes #92

* closes #101
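
For context, a minimal sketch of the new API (mirroring the example in the `REmargins()` docs; `sleepstudy` ships with `lme4`):

```r
library(merTools)
library(lme4)

# Fit a model with one grouping factor, then marginalize predictions for the
# first ten rows across the expected-rank distribution of the Subject effects
fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10, ])
head(mfx)
```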
jknowles committed May 11, 2019
1 parent 91b360e commit 083fb07
Showing 31 changed files with 502 additions and 984 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: merTools
Title: Tools for Analyzing Mixed Effect Regression Models
Version: 0.4.3
Version: 0.5.0
Authors@R: c(
person(c("Jared", "E."), "Knowles", email = "jknowles@gmail.com", role = c("aut", "cre")),
person("Carl", "Frederick", email="carlbfrederick@gmail.com", role = c("aut")),
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -14,6 +14,7 @@ export(ICC)
export(REcorrExtract)
export(REextract)
export(REimpact)
export(REmargins)
export(REquantile)
export(REsdExtract)
export(REsim)
@@ -97,6 +98,7 @@ importFrom(stats,pt)
importFrom(stats,qnorm)
importFrom(stats,quantile)
importFrom(stats,reformulate)
importFrom(stats,reshape)
importFrom(stats,residuals)
importFrom(stats,rgamma)
importFrom(stats,rnorm)
124 changes: 74 additions & 50 deletions inst/REmargins.R → R/REmargins.R
@@ -1,12 +1,14 @@
#' Calculate the weighted mean of fitted values for various combinations of
#' random effect terms.
#' Calculate the predicted value for each observation across the distribution
#' of the random effect terms.
#'
#' \code{REmargins} calculates the average predicted value for each row of a
#' new data frame across the distribution of \code{\link{expectedRank}} for a
#' merMod object. This allows the user to make meaningful comparisons about the
#' influence of random effect terms on the scale of the response variable,
#' for user-defined inputs, while accounting for the variability in grouping terms.
#'
#' The function simulates the
#'
#' The function predicts the response at every level in the random effect term
#' specified by the user. Then, the expected rank of each group level is binned
#' to the number of bins specified by the user. Finally, a weighted mean of the
@@ -42,15 +44,26 @@
#' @param ... additional arguments to pass to \code{\link{predictInterval}}
#'
#' @return A data.frame with all unique combinations of the number of cases, rows
#' in the newdata element, and number of bins:
#' in the newdata element:
#' \describe{
#' \item{case}{The row number of the observation from newdata.}
#' \item{bin}{The ranking bin for the expected rank, the higher the bin number,
#' the greater the expected rank of the groups in that bin.}
#' \item{AvgFitWght}{The weighted mean of the fitted values for case i in bin k}
#' \item{AvgFitWghtSE}{The standard deviation of the mean of the fitted values
#' for case i in bin k.}
#' \item{nobs}{The number of group effects contained in that bin.}
#' \item{...}{The columns of the original data taken from \code{newdata}}
#' \item{case}{The row number of the observation from newdata. Each row in newdata will be
#' repeated for all unique levels of the grouping_var, term, and breaks.}
#' \item{grouping_var}{The grouping variable the random effect is being marginalized over.}
#' \item{term}{The term for the grouping variable the random effect is being marginalized over.}
#' \item{breaks}{The ntile of the effect size for \code{grouping_var} and \code{term}}
#' \item{original_group_level}{The original grouping value for this \code{case}}
#' \item{fit_combined}{The predicted value from \code{predictInterval} for this case simulated
#' at the Nth ntile of the expected rank distribution of \code{grouping_var} and \code{term}}
#' \item{upr_combined}{The upper bound of the predicted value.}
#' \item{lwr_combined}{The lower bound of the predicted value.}
#' \item{fit_XX}{For each grouping term in newdata, the component of the predicted
#' value attributable to that term, as decomposed by \code{predictInterval}}
#' \item{upr_XX}{The upper bound for the effect of each grouping term}
#' \item{lwr_XX}{The lower bound for the effect of each grouping term}
#' \item{fit_fixed}{The predicted fit with all the grouping terms set to 0 (average)}
#' \item{upr_fixed}{The upper bound fit with all the grouping terms set to 0 (average)}
#' \item{lwr_fixed}{The lower bound fit with all the grouping terms set to 0 (average)}
#' }
#'
#'
@@ -65,7 +78,8 @@
#' @seealso \code{\link{expectedRank}}, \code{\link{predictInterval}}
#' @importFrom stats reshape
#' @examples
#' # For a model with a single grouping factor
#' fm1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy)
#' mfx <- REmargins(merMod = fm1, newdata = sleepstudy[1:10,])
#' \donttest{
#' # You can also pass additional arguments to predictInterval through REmargins
#' g1 <- lmer(y ~ lectage + studage + (1|d) + (1|s), data=InstEval)
@@ -75,32 +89,42 @@
#' breaks = 3)
#' }
#' @export
REmargins <- function(merMod, newdata = NULL, groupFctr = NULL, term = NULL, breaks = 3,
REmargins <- function(merMod, newdata = NULL, groupFctr = NULL, term = NULL, breaks = 4,
.parallel = FALSE, ...){
if(is.null(groupFctr)){
# Validate inputs
if (is.null(groupFctr)) {
# If the user doesn't tell us which term to use, we take the first term
groupFctr <- names(ranef(merMod))[1]
}
if (is.null(newdata)) {
# If the user doesn't give us data, we take the whole dataset
# TODO - how does performance scale to a large number of observations?
newdata <- merMod@frame
}

# If the user doesn't tell us what term to use, we take all the terms
if (is.null(term)) {
term <- names(ranef(merMod)[[groupFctr]])
# Sub out intercept
term[term == "(Intercept)"] <- "Intercept"
}

# This is a rough way to break the ER distribution into quantiles
brks <- ceiling(seq(1, 100, by = 100/breaks))
if(! 99 %in% brks) {
# Fallback so we always take a 99th percentile effect (for the maximum)
if (!99 %in% brks) {
brks <- c(brks, 99)
}
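# For example (a sketch): with breaks = 4, ceiling(seq(1, 100, by = 25)) gives
# c(1, 26, 51, 76), and appending 99 yields brks = c(1, 26, 51, 76, 99)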
# Inputs are validated - now we get the effect distribution
# Generate the expected rank distribution
ER_DF <- expectedRank(merMod, groupFctr = groupFctr)
ER_DF <- expectedRank(merMod, groupFctr = groupFctr, term = term)
# With many effects there is a lot of duplication - drop duplicated pctER
ER_DF <- ER_DF[!duplicated(ER_DF[, c("groupFctr", "term", "pctER")]), ]
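# ER_DF carries (among other columns) groupFctr, groupLevel, term, and pctER,
# the percentile expected rank used below to match group levels to brks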


if (is.null(term)) {
term <- unique(ER_DF$term)
}

out_df <- expand.grid("grouping_var" = groupFctr, "term" = term, "breaks" = 1:breaks)
out_df$groupLevel <- NA

# Now we create a data frame to capture the factor levels of each groupFctr that
# correspond to the right break in the expectedRank distribution of the random
# effect grouping factor and term
par_df <- expand.grid("grouping_var" = groupFctr, "term" = term, "breaks" = 1:breaks)

# Keep only factor levels that have effects at the margins
# Need to match closest value here
@@ -109,53 +133,53 @@ REmargins <- function(merMod, newdata = NULL, groupFctr = NULL, term = NULL, bre
# For each combination build an index of candidate rows/effect levels
# Then choose the level that has the most precise estimate within a
# tolerance of the effect size


for (trm in term) {
for (i in seq_along(brks)) {

zz <- abs(ER_DF$pctER[ER_DF$term == trm] - brks[i])
tmp <- which(zz %in% zz[order(zz)][1])
out_df$groupLevel[out_df$breaks == i & out_df$term == trm] <- ER_DF$groupLevel[tmp]
# Compute each term's distance from the break
rank_dist <- abs(ER_DF$pctER[ER_DF$term == trm] - brks[i])
# Get the index for the rank that minimizes the distance
# TODO - how to break ties here?
tmp <- which(rank_dist %in% rank_dist[order(rank_dist)][1])
# Store the result in the par_df object
par_df$groupLevel[par_df$breaks == i & par_df$term == trm] <- ER_DF$groupLevel[tmp]
}
}
# Need to repeat
# TODO - Figure out how to keep the effects separate for each term
# TODO - order case by magnitude of ER

# Get ready to expand the data
zed <- as.data.frame(lapply(newdata, rep, each = nrow(out_df)))
zed$case <- rep(1:nrow(newdata), each = nrow(out_df))
zed <- cbind(zed, out_df)
zed$original_group_level <- zed[, groupFctr]
zed[, groupFctr] <- zed$groupLevel
zed$groupLevel <- NULL
sim_data <- as.data.frame(lapply(newdata, rep, each = nrow(par_df)))
# sim_data now repeats each row of newdata by the number of rows in par_df
# case labels the rows with an integer for later mapping
sim_data$case <- rep(1:nrow(newdata), each = nrow(par_df))
sim_data <- cbind(sim_data, par_df)
sim_data$original_group_level <- sim_data[, groupFctr]
sim_data[, groupFctr] <- sim_data$groupLevel
sim_data$groupLevel <- NULL
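# A sketch of the expansion: with 10 rows of newdata, one grouping factor with
# two terms, and breaks = 4, par_df has 2 * 4 = 8 rows and sim_data has
# 10 * 8 = 80 rows, one per case/term/break combination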
#
# zed[, "case"] <- rep(seq(1, nrow(newdata)), times = nrow(lvls))

# Maybe strongly recommend parallel here?
if ( .parallel & requireNamespace("foreach", quietly=TRUE)) {
if (.parallel & requireNamespace("foreach", quietly = TRUE)) {
# TODO use future here
merTools:::setup_parallel()
out <- predictInterval(merMod, newdata = zed,
which = "all")
setup_parallel()
out <- predictInterval(merMod, newdata = sim_data,
which = "all", ...)
out_w <- stats::reshape(out, direction = "wide",
idvar = "obs", timevar = "effect",
v.names = c("fit", "upr", "lwr"), sep = "_")
zed <- cbind(zed, out_w)
} else if ( .parallel & !requireNamespace("foreach", quietly=TRUE)) {
out_w$obs <- NULL
sim_data <- cbind(sim_data, out_w)
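# A sketch of the wide layout: with a single grouping factor "Subject", out_w
# holds fit_combined/upr_combined/lwr_combined, fit_Subject/upr_Subject/
# lwr_Subject, and fit_fixed/upr_fixed/lwr_fixed, one row per row of sim_data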
} else if ( .parallel & !requireNamespace("foreach", quietly = TRUE)) {
warning("foreach package is unavailable, parallel computing not available")
} else {
out <- predictInterval(merMod, newdata = zed,
which = "all")
out <- predictInterval(merMod, newdata = sim_data,
which = "all", ...)
out_w <- stats::reshape(out, direction = "wide",
idvar = "obs", timevar = "effect",
v.names = c("fit", "upr", "lwr"), sep = "_")
zed <- cbind(zed, out_w)
out_w$obs <- NULL
sim_data <- cbind(sim_data, out_w)
}
# Case is the number of the row in newdata
# obs indexes the selected random effect levels we marginalize over for each case
# So we want to collapse by case if we can

return(zed)
return(sim_data)
}
2 changes: 2 additions & 0 deletions R/merFastDisplay.R
Expand Up @@ -137,12 +137,14 @@ fastdisp.merModList <- function(x, ...){
cat(paste(paste(names(out$ngrps), out$ngrps, sep = ", "),
collapse = "; "))
cat(sprintf("\nAIC = %g", round(out$AIC, 1)))
cat("---\n")
# cat(round(out$DIC, 1))
# cat("\ndeviance =", fround(out$deviance, 1), "\n")
if (useScale < 0) {
out$sigma.hat <- sigma(x)
cat("overdispersion parameter =", fround(out$sigma.hat,
1), "\n")
cat("---\n")
}
return(invisible(out))
}
Expand Down
36 changes: 28 additions & 8 deletions R/merPredict.R
@@ -149,6 +149,8 @@ predictInterval <- function(merMod, newdata, which=c("full", "fixed", "random",
}

newdata.modelMatrix <- buildModelMatrix(model= merMod, newdata = newdata)
# When there is no fixed-effect intercept but there is a group-level intercept,
# an intercept column still needs to be added for the random effects (see below)

re.xb <- vector(getME(merMod, "n_rfacs"), mode = "list")
names(re.xb) <- names(ngrps(merMod))
@@ -168,12 +170,14 @@
# Add switch if no random groups are observed to avoid indexing errors,
# we burn 1 sample of 1 group of all coefficients that will eventually
# be multiplied by zero later on
if(length(keep) > 0 & !identical(keep, alllvl)) {
if (length(keep) > 0 & !identical(keep, alllvl)) {
reMeans <- reMeans[keep, , drop=FALSE]
dimnames(reMatrix)[[3]] <- alllvl
reMatrix <- reMatrix[, , keep, drop = FALSE]
} else if (length(keep) > 0 & identical(keep, alllvl)){
dimnames(reMatrix)[[3]] <- alllvl
# dimnames(reMeans)[[2]] <- j # we need to get the variable name into this object
reMatrix <- reMatrix[, , keep, drop = FALSE]
} else{
reMeans <- reMeans[1, , drop=FALSE]
reMatrix <- reMatrix[, , 1, drop = FALSE]
@@ -189,22 +193,32 @@
sigma=matrixTmp, method = "chol"))
}

REcoefs <- sapply(tmpList, identity, simplify="array"); rm(tmpList)
REcoefs <- sapply(tmpList, identity, simplify="array")
# rm(tmpList)
dimnames(REcoefs) <- list(1:n.sims,
attr(reMeans, "dimnames")[[2]],
attr(reMeans, "dimnames")[[1]])
attr(reMeans, "dimnames")[[1]]
)
if (j %in% names(newdata)) { # get around if names do not line up because of nesting
tmp <- cbind(as.data.frame(newdata.modelMatrix), var = newdata[, j])
tmp <- tmp[, !duplicated(colnames(tmp))]
keep <- names(tmp)[names(tmp) %in% dimnames(REcoefs)[[2]]]
if (length(keep) == 0) {
keep <- grep(dimnames(REcoefs)[[2]], names(tmp), value = TRUE)
}
if (length(keep) == 0) {
tmp <- cbind(model.frame(subbars(formula(merMod)), data = newdata),
var = newdata[, j])
keep <- grep(dimnames(REcoefs)[[2]], names(tmp), value = TRUE)
}
}
if ( length(keep) == 0) {
# Add in an intercept for RE purposes
tmp <- cbind(as.data.frame(newdata.modelMatrix), var = newdata[, j])
tmp <- tmp[, !duplicated(colnames(tmp))]
tmp <- cbind(data.frame(1), tmp)
names(tmp)[1] <- "(Intercept)"
keep <- "(Intercept)"
}
tmp <- tmp[, c(keep, "var"), drop = FALSE]
tmp[, "var"] <- as.character(tmp[, "var"])
colnames(tmp)[which(names(tmp) == "var")] <- names(newdata[, j, drop = FALSE])
@@ -244,16 +258,16 @@ predictInterval <- function(merMod, newdata, which=c("full", "fixed", "random",
coefs_new <- array(NA, dim = c(dim(coefs)[1], colLL,
dim(coefs)[3]))
dimnames(coefs_new)[c(1, 3)] <- dimnames(coefs)[c(1, 3)]
dimnames(coefs_new)[[2]] <- rep(dimnames(coefs)[[2]], colLL)

dimnames(coefs_new)[[2]] <- rep(dimnames(coefs)[[2]], dim(coefs_new)[2])
for (k in 1:colLL) {
coefs_new[, k, 1:dim(coefs)[3]] <- coefs[, 1, 1:dim(coefs)[3]]
}
coefs <- coefs_new
}

for(i in 1:nrow(data)){
# TODO - Fix issue 101 which trips up here
lvl <- as.character(data[, group][i])
lvl <- as.character(data[, group][i])
if(!lvl %in% new.levels){
yhatTmp[i, ] <- as.numeric(data[i, 1:colIdx]) %*% t(coefs[, 1:colIdx, lvl])
} else{
Expand Down Expand Up @@ -297,7 +311,7 @@ predictInterval <- function(merMod, newdata, which=c("full", "fixed", "random",
# for now, ignore this check
if (include.resid.var==FALSE) {
# if (length(new.levels)==0)
sigmahat <- rep(1,n.sims)
sigmahat <- rep(1, n.sims)
# else {
# include.resid.var=TRUE
# warning(" \n Since new levels were detected resetting include.resid.var to TRUE.")
@@ -315,6 +329,12 @@ predictInterval <- function(merMod, newdata, which=c("full", "fixed", "random",
fix.intercept.variance <- FALSE
message("No intercept detected, setting fix.intercept.variance to FALSE")
}
# If intercept is not in fixed terms
if (!"(Intercept)" %in% names(fixef(merMod)) && fix.intercept.variance) {
# TODO - decide if this is an error or if we should allow it to continue with warning
warning("No fixed-effect intercept detected. Variance adjustment may be unreliable.")
}

if (fix.intercept.variance) {
#Assuming all random effects include intercepts.
intercept.variance <- vcov.tmp[1,1]
39 changes: 38 additions & 1 deletion README.Rmd
@@ -51,6 +51,23 @@ install.packages("merTools")

## Recent Updates

### merTools 0.5.0

#### New Features

- `subBoot` now works with `glmerMod` objects as well
- `REmargins` is a new function that allows the user to marginalize predictions over breaks in the
  distribution of the random effects; see `?REmargins` and the new `REmargins` vignette (closes #73)

#### Bug fixes

- Fixed an issue where known convergence errors were issuing warnings and causing the test suite
  to fail
- Fixed an issue where models with a random slope, no intercept, and no fixed term could not
  be predicted (#101); see the sketch after this list
- Fixed an issue with shinyMer not working with substantive fixed effects (#93)
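
As a minimal sketch (hypothetical model, not drawn from the package tests), the class of model fixed by #101 looks like this:

```{r, eval=FALSE}
# A random slope with no fixed or random intercept and no fixed slope term -
# predictInterval() previously failed on models of this form
m_slope <- lmer(Reaction ~ 0 + (0 + Days | Subject), sleepstudy)
predictInterval(m_slope, newdata = sleepstudy[1:5, ])
```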


### merTools 0.4.1

#### New Features
@@ -179,7 +196,7 @@ plotdfb <- predictInterval(m1, newdata = InstEval[1:10, ], n.sims = 2000,
level = 0.9, stat = 'median', which = "all",
include.resid.var = TRUE)
plotdf <- bind_rows(plotdf, plotdfb, .id = "residVar")
plotdf <- dplyr::bind_rows(plotdf, plotdfb, .id = "residVar")
plotdf$residVar <- ifelse(plotdf$residVar == 1, "No Model Variance",
"Model Variance")
@@ -338,3 +355,23 @@ ggplot(plotdf, aes(y = fit, x = Anger, color = btype, group = btype)) +
facet_wrap(~situ) + theme_bw() +
labs(y = "Predicted Probability")
```

## Marginalizing Random Effects

```{r}
# get cases
case_idx <- sample(1:nrow(VerbAgg), 10)
mfx <- REmargins(fmVA, newdata = VerbAgg[case_idx,], breaks = 4, groupFctr = "item",
type = "probability")
ggplot(mfx, aes(y = fit_combined, x = breaks, group = case)) +
geom_point() + geom_line() +
theme_bw() +
scale_y_continuous(breaks = 1:10/10, limits = c(0, 1)) +
coord_cartesian(expand = FALSE) +
labs(x = "Quartile of item random effect Intercept for term 'item'",
y = "Predicted Probability",
title = "Simulated Effect of Item Intercept on Predicted Probability for 10 Random Cases")
```
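
The margins frame can also be collapsed by case. As a sketch (assuming the `mfx` object from the chunk above), the spread of `fit_combined` within each case shows how much the item effect moves the predicted probability:

```{r}
# For each case, the range of predicted probabilities across the quartiles
# of the item random effect distribution
library(dplyr)
mfx %>%
  group_by(case) %>%
  summarize(pred_range = max(fit_combined) - min(fit_combined))
```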


