diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..c4045df Binary files /dev/null and b/.DS_Store differ diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..6243a3a --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,7 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^old_files* +^\.gitignore$ +^.DS_Store +^src/.DS_Store +^.git diff --git a/.gitignore b/.gitignore index fae8299..5b6a065 100644 --- a/.gitignore +++ b/.gitignore @@ -1,39 +1,4 @@ -# History files +.Rproj.user .Rhistory -.Rapp.history - -# Session Data files .RData - -# User-specific files .Ruserdata - -# Example code in package build process -*-Ex.R - -# Output files from R CMD build -/*.tar.gz - -# Output files from R CMD check -/*.Rcheck/ - -# RStudio files -.Rproj.user/ - -# produced vignettes -vignettes/*.html -vignettes/*.pdf - -# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 -.httr-oauth - -# knitr and R markdown default cache directories -*_cache/ -/cache/ - -# Temporary files created by R markdown -*.utf8.md -*.knit.md - -# R Environment Variables -.Renviron diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..b89962b --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: c060 +Version: 0.2-6 +Date: 2021-02-27 +Author: Martin Sill, Thomas Hielscher, Manuela Zucknick, Natalia Becker. +Maintainer: Frederic Bertrand +Title: Extended Inference for Lasso and Elastic-Net Regularized Cox and + Generalized Linear Models +Depends: +Imports: glmnet, survival, parallel, mlegp, tgp, peperr, penalizedSVM, + lattice +Suggests: +Description: The c060 package provides additional functions to perform stability selection, model validation and parameter tuning for glmnet models. +License: GPL-2 +LazyLoad: yes +NeedsCompilation: no diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..66175fd --- /dev/null +++ b/MD5 @@ -0,0 +1,33 @@ +3b1a03fcb51c4f98fe6f79e20f169979 *DESCRIPTION +8723f691e371abaebe8e5b4089be3975 *NAMESPACE +14e7421de46ec2d51f97547613958eca *R/EPSGO.R +b1e7b5d4f74093c73f58e5413058994e *R/auc.R +4ca09c9e7a01557a832bae2b97ed1d3f *R/balancedFolds.R +b75c2c69378c782173ab4a38aca06a91 *R/coef_intsearch.R +4978d8c02b55ebe56023b39397704e40 *R/nonzeroCoef.R +5aeffa3cb87ae3ca4c2a2ba516fbb23e *R/peperr_glmnet.R +50123ec5b8c9cda4524a0725d9e5dfd3 *R/plotCoef.R +87479f0f199b359f6f14fb0a005f46d7 *R/plot_coef_glmnet.R +f13a06d844b4e98b65cf1fef7a593a43 *R/plot_summary_inter_search.R +764f57f1f8279cd798c876475e42126c *R/stabilityselection.R +828689648276bf83b98b3a830014de56 *R/summary_inter_search.R +04cc7054551a3e7cf1fdbbeb10b2c2b5 *R/tune_glmnet_interval.R +601e225f64250baecacd420b87d9c533 *inst/CITATION +1bbfbff9c45b78b439740d30b3b4ed93 *inst/doc/c060_vignette_nojss.pdf +1335532e0cfbc776e37678fdb952df7e *man/EPSGO.Rd +63306392b3b85f6e550bf395f02b83f9 *man/PLL.coxnet.Rd +46a994ab464367a5406a562eb9906e00 *man/Plot.coef.glmnet.Rd +1e056ffd644d66b7eac7652e9058c5e3 *man/Plot.peperr.curves.Rd +3e28aa4c1450b18c8ba1bd7b2ed2c916 *man/aggregation.auc.Rd +8e694da5b53d433511b8f3044b2b08d7 *man/balancedFolds.Rd +2429191ba9782c4cd38e70bfede43159 *man/coef.intsearch.Rd +3dc6c991e1157bf327dc505075fe6355 *man/complexity.glmnet.Rd +8237b6cd3a4551ef834f199fc86537a8 *man/fit.glmnet.Rd +9d37d6ae2241a86dd6d1b5ebeb8f4e45 *man/plot.stabpath.Rd +9dd7fe6f7fd33d9585358f50af9c268f *man/plot.summary.int.Rd +4b3ee8af84c3adcea709fe1f8b3893bc *man/predictProb.coxnet.Rd +62e8d584ca831a1af783e250bf302208 *man/predictProb.glmnet.Rd +c7f5d32a3ddd8ea90cb9e4d081a65507 *man/stabpath.Rd 
+2eeeee63b9ea0c6fbfbce834a975d05d *man/stabsel.Rd
+c7fa442f367258f30c613b9d15e0b0ad *man/summary.intsearch.Rd
+d66db9676c465dfd3126b32856c62951 *man/tune.glmnet.interval.Rd
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..cbb1018
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,28 @@
+import(glmnet, survival, parallel, peperr, penalizedSVM, lattice, mlegp, tgp)
+
+export(stabpath,
+       stabsel,
+       aggregation.auc,
+       complexity.glmnet,
+       fit.glmnet,
+       Plot.peperr.curves,
+       epsgo,
+       tune.glmnet.interval,
+       balancedFolds,
+       Plot.coef.glmnet
+       )
+
+S3method("print","stabpath")
+S3method("plot","stabpath")
+S3method("PLL","coxnet")
+S3method("predictProb","glmnet")
+S3method("predictProb","coxnet")
+S3method("summary","intsearch")
+S3method("plot","sum.intsearch")
+S3method("coef","sum.intsearch")
+
+importFrom("grDevices", "dev.off", "gray.colors", "pdf")
+importFrom("graphics", "abline", "grid", "legend", "lines", "matplot",
+           "par", "plot", "polygon", "text", "axis")
+importFrom("stats", "approx", "coef", "predict", "quantile", "runif",
+           "sd")
\ No newline at end of file
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..1e18557
--- /dev/null
+++ b/NEWS
@@ -0,0 +1 @@
+0.2-6 The package was about to be archived, which led to a maintainer change.
diff --git a/R/EPSGO.R b/R/EPSGO.R
new file mode 100644
index 0000000..cc02b18
--- /dev/null
+++ b/R/EPSGO.R
@@ -0,0 +1,468 @@
+
+###########################################################################################################
+###########################################################################################################
+epsgo<- function(
+               # function to be minimized
+               Q.func,
+               # bounds for parameters
+               bounds,
+               # round.n - number of digits after the decimal point
+               round.n=5,
+               # parms.coding="none", # or log2
+               parms.coding="none", # or log2
+               # min value for the function Q.func
+               fminlower=0,
+               # do you want to find one min value and stop?
+               flag.find.one.min =FALSE,
+               # show plots ? none, final iteration, all iterations
+               show=c("none", "final", "all"),
+               # define the number of start points
+               N= NULL,
+               #% maximum # of function evaluations
+               maxevals = 500,
+               # plot parameter
+               pdf.name=NULL,
+               pdf.width=12, pdf.height=12,
+               my.mfrow=c(1,1),
+               # verbose?
+               verbose=TRUE,
+               seed=123,
+               ... ){
+
+
+
+  # The EPSGO algorithm (from Holger's paper)
+  #
+  # EPSGO = Efficient Parameter Selection via Global Optimization
+
+  # Input: function Q.func to measure gen. error
+  #        bounds: data frame with parameter bounds
+  #                rownames: parameter names
+  #                colnames: "lower" and "upper"
+  #                Example: bounds=t(data.frame(lambda1=c(0, 10), lambda2=c(0,8)));colnames(bounds)<-c("lower", "upper")
+  #        fminlower: min Q value
+  # Output: Qmin, p*, number of visited points neval
+
+  ## Scheme:
+  # 1. number of tuning parameters
+  #      D = dim(P)
+  # 2. create N = 10D sample points p1, ..., pN
+  #      in [bounds] using Latin hypercube sampling
+  # 3. compute Q(p_i), i = 1, ..., N
+  #
+  # 4. train Online GP
+  # 5. number of visited points p_obs
+  #      neval = N
+  # 6. REPEAT
+  #  6.1 Find a new p, with max E[I(p)] ( the same as min -E[I(p)] )
+  #       p* = argmax_p E[I(p)] (computed by DIRECT)
+  #       # Important! Direct.R calculates the global min! --> change Problem function
+  #  6.2 compute std. dev. and mean of E[I(p)]
+  #  6.3 new p, new Q(p)
+  #       Qnew = Q(p*)
+  #  6.4 Add the new p*
+  #       if Qnew < Qmin
+  #         Qmin = Qnew
+  #         p* = p
+  #       end
+  #  6.5 update Online GP
+  #       neval = neval + 1
+  # UNTIL convergence
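  # Editor's note on the Q.func contract implied by the scheme above: Q.func
  # receives one row of the sampled parameter matrix (plus the arguments passed
  # through "...") and must return a list whose element "q.val" holds the value
  # to be minimized; see tune.glmnet.interval() in this package, which returns
  # list(q.val = ..., model = ...).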
+
+  if (verbose){
+    print("parms.coding")
+    print(parms.coding)
+  }
+
+
+  ###################################################################################################################
+  ## 1. ## number of tuning parameters D = dim(P)
+  ###################################################################################################################
+
+  D<- nrow(bounds)
+
+  ###################################################################################################################
+  ## 2. ## create N = 10D sample points p1, ..., pN
+  #in [l, u] using Latin hypercube sampling
+  ###################################################################################################################
+
+  set.seed(seed)
+  # in ego.m X = start.points
+  # N - number of start points in parameter space
+  # as in ego.m
+
+  # define the number of start points
+  if (is.null(N)){
+    ns<-c(21, 21, 33, 41, 51, 65, 65) ##
+    # N = 10D, D= number of parameters, but for high dim data with more than 6 dim, restrict the initial set of p to 65
+    N<- ifelse ( D <= length(ns),ns[D], 65 )
+    #N <- mult.factor * D
+  }
+
+  # start points X (= p_1,..., p_N)
+
+  X<- lhs(N, bounds)
+
+  # round.n - number of digits after the decimal point
+  X<-round(X,round.n)
+
+  if (verbose) print(X)
+
+  if(show !="none" & !is.null(pdf.name)) {
+    pdf(pdf.name, pdf.width, pdf.height)
+    par(mfrow=my.mfrow)
+  }
+
+  if ( (show !="none") & (D<=2 )){
+    # 1D plot
+    if (D==1) {
+      plot(X, xlab="Index", ylab=rownames(bounds)[1], col="orange", pch=20,
+           main=paste( "Latin hypercube sampling, n=",nrow(X)*D) )
+      abline(h=seq(bounds[1,1],bounds[1,2],length=(nrow(X)+1)), lty=2, col=3)
+    } else {
+      # 2D plot
+      plot(X, xlab=rownames(bounds)[1], ylab=rownames(bounds)[2], col="orange", pch=20,
+           main=paste( "Latin hypercube sampling, n=",nrow(X)))
+      abline(v=seq(bounds[1,1],bounds[1,2],length=(nrow(X)+1)), lty=2, col=3)
+      abline(h=seq(bounds[2,1],bounds[2,2],length=(nrow(X)+1)), lty=2, col=3)
+    }
+  }
+
+
+  ###################################################################################################################
+  ## 3. ## compute Q(p_i), i = 1, ...,N
+  ###################################################################################################################
+
+  # model.list<-apply(X, 1, eval(Q.func), x.svm=x.svm, y.svm=y.svm, maxIter=maxIter,parms.coding=parms.coding, inner.val.method=inner.val.method, cross.inner=cross.inner, seed=seed, verbose=verbose )
+  model.list<-apply(X, 1, eval(Q.func), parms.coding=parms.coding,
+                    maxevals=maxevals, seed=seed, verbose=verbose, ...)
+  #debugging
+  #save(model.list,file="model_list.RData")
+
+
+  # take the Q.values
+  Q<- as.numeric(unlist( sapply(model.list, "[", "q.val")))
+  Q.min <- min (Q, na.rm=T)
+  min.p<- X[which.min(Q), , drop=FALSE]
+
+  if (verbose) print(data.frame(X,Q))
+
+  # delete the point(s) with no Q value(s)
+  X <- X[!is.na(Q), ,drop=FALSE]
+  Q <-Q[!is.na(Q)]
+
+  # add start.q.values to the plot
+  if (show !="none" & D ==1) text( c(1: nrow(X)),X, labels=round(Q,6), pos=1, cex=0.5 )
+  if (show !="none" & D ==2) text( X, labels=signif(Q,6), pos=1, cex=0.5 )
+
+  # # 4. train Online GP
+
+  # train Gaussian Process
+  # Input: collection of random variables G(x)(here: x= points in tuning parameter space = start.points X )
+  #
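  # Editor's note: with the defaults above this gives, for example, N = 21
  # start points for D = 1 or 2 tuning parameters, N = 33 for D = 3, and a cap
  # of N = 65 for D >= 6; the Gaussian Process surrogate fitted below only
  # needs this coarse coverage of the bounds to locate promising regions, not
  # a dense grid.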
+  ###################################################################################################################
+  ## 4. ## train Gaussian Process
+  ###################################################################################################################
+
+  gp.seed.new<- seed
+
+  # if we have tried 5 times and are still not able to fit --> break
+  # the reason is in having a new point in Ytrain very close to one of the old ones.
+  # --> matrix is singular!
+
+  if (exists("fit.gp")) rm(fit.gp)
+  flag.fit.gp<- FALSE
+  tmp.i<-1
+
+  while( ! flag.fit.gp ) {
+
+    if (tmp.i >5) {
+      print( "At least one of the initial points is very close to the other points. It is not possible to fit the model via a Gaussian Process. ")
+      break
+    }
+    try(fit.gp<-mlegp(X, Q,constantMean=0, seed=gp.seed.new, verbose= 0 ))
+
+    flag.fit.gp<- FALSE
+    # if fit.gp exists AND is not null
+    if (exists("fit.gp"))
+      if (!is.null(fit.gp))
+        flag.fit.gp<- TRUE
+
+    # if the fit fails, change the seed
+    if(!flag.fit.gp ) {
+      print("failed to fit the Gaussian Process (fit.gp), changing the seed!")
+      gp.seed.new<- round(runif(1,min=1, max=10^3))
+      tmp.i<-tmp.i + 1
+    }
+  } # end of while
+
+  gp.seed<- gp.seed.new
+
+  # define the expected improvement function --> function ExpImprovement.R
+  # E[I(p)] = (Q_min - mu(p)) * Phi( (Q_min - mu(p)) / sig(p) )
+  #           + sig(p) * phi( (Q_min - mu(p)) / sig(p) )
+  # with mu(p), sig(p) the estimated GP mean and standard deviation at p,
+  # and Phi, phi the standard normal CDF and density.
+
+  ###################################################################################################################
+  # 5. number of visited points p_obs
+  ###################################################################################################################
+  neval <- length(Q)
+
+  ###################################################################################################################
+  ##.6. REPEAT - main block
+  ###################################################################################################################
+
+  # Initialization for observed points in parameter space Xtrain in R^D and
+  # for quality value Ytrain = Q
+  Xtrain<- X
+  Ytrain <- Q
+  finished <- FALSE
+  EImax <- Inf
+
+  loop<-1
+  fcalls<-length(Q)
+  # Point with max E[I(p)]
+  xmax=X[1, ,drop=FALSE]
+  #fmin <- Inf
+  # change to already calculated!
+
+  # info for point with min Q.func
+  fminold<- Inf
+  fmin <- Q.min
+  xmin<- min.p
+  not_changed = 0
+
+  set.seed(seed)
+
+  while (!finished){
+    if (verbose ) print(paste("loop", loop))
+    # calculate Q for the new points
+    if (loop >1) {
+      EIold <- EImax
+      fminold <- fmin
+
+
+      #model.list.new<-apply(X, 1, eval(Q.func), x.svm=x.svm, y.svm=y.svm,
+      #      maxIter=maxIter,parms.coding=parms.coding, inner.val.method=inner.val.method, cross.inner=cross.inner ,verbose=verbose )
+      #model.list.new<-apply(X, 1, eval(Q.func), x=x, y=y, family=family, nfolds=nfolds, maxit=maxit, seed=seed, verbose=verbose ,type.min=type.min)
+      model.list.new<-apply(X, 1, eval(Q.func), parms.coding=parms.coding,
+                            maxevals=maxevals, seed=seed, verbose=verbose, ...)
+      # take the Q.values
+      model.list<-c(model.list, model.list.new )
+      Q<- as.numeric(unlist( sapply(model.list.new, "[", "q.val")))
+
+      fcalls = fcalls + length(Q);
+      Xtrain = rbind(Xtrain, X)
+      Ytrain = c(Ytrain, Q )
+
+      if (verbose) print(data.frame(Xtrain,Ytrain))
+
+      # fmin = current min of Q.func at point xmin.
+      fmin<- min(Ytrain)
+      xmin = Xtrain[which.min(Ytrain),]
+    }
+
+    if (verbose) print(paste("fmin=",fmin))
+    if (fmin < fminlower){
+      break
+    }
+
+    if (flag.find.one.min){
+      ### skip it (in case of having more than 1 points with global min )
+      # break if we reach the min value
+      if (fmin <= fminlower){
+        break
+      }
+    }
+
+
+    # break if no changes in the last 10 iterations
+    if (fmin == fminold){
+      not_changed = not_changed + 1
+      if (not_changed >= 10){
+        print("No changes in the last 10 iterations, break iterations")
+        break
+      }
+    }else {
+      not_changed = 0
+    }
+
+    # train GP
+    if (loop >1) {
+      # sometimes error: Error in solve.default(gp$invVarMatrix) :
+      #   system is computationally singular: reciprocal condition number = 7.502e-17
+      # Solution: change seed
+
+      gp.seed.new<- c(seed + loop-1)
+
+
+      if (exists("fit.gp")) rm(fit.gp)
+
+      # if we have tried 5 times and are still not able to fit --> break
+      # the reason is in having a new point in Ytrain very close to one of the old ones.
+      # --> matrix is singular!
+
+      tmp.i<-1
+      flag.fit.gp<- FALSE
+
+      while(!flag.fit.gp ) {
+
+        if (tmp.i >5) {
+          print( "The new point X is very close to one of the visited points.")
+          finished <- TRUE
+          break
+        }
+        try(fit.gp<-mlegp(Xtrain, Ytrain,constantMean=0, seed=gp.seed.new, verbose= 0 ))
+
+        flag.fit.gp<- FALSE
+        # if fit.gp exists AND is not null
+        if (exists("fit.gp"))
+          if (!is.null(fit.gp))
+            flag.fit.gp<- TRUE
+
+        # if the fit fails, change the seed
+        if(!flag.fit.gp ) {
+          print("failed to fit the Gaussian Process (fit.gp), changing the seed!")
+          gp.seed.new<- round(runif(1,min=1, max=10^3))
+          tmp.i<-tmp.i + 1
+        }
+      } # end of while (!flag.fit.gp )
+
+      if (finished) break
+
+      gp.seed<-c(gp.seed, gp.seed.new)
+      #str(fit.gp)
+    }
+
+    if (show !="none" ) plot(fit.gp, main=paste("Gaussian Process", "iter ", loop))
+
+    Problem<- list(f = "ExpImprovement" )
+
+    #6.1 Find a new p, with max E[I(p)] ( the same as min -E[I(p)] )
+    #[EImax xmax history] = Direct(Problem,bounds,options)
+    Dir.list<- Direct(Problem=Problem,
+                      bounds=bounds,
+                      # options
+                      #% maximum of iterations
+                      maxits = 50,
+                      #% maximum # of function evaluations
+                      maxevals = maxevals,
+                      # the optimal value is unknown
+                      testflag= 0,
+                      # minimum value of function , min (ExpImprovement) = 0
+                      globalmin = 0,
+                      #% print and plot iteration stats
+                      showits= show,
+                      # verbose?
+                      verbose=FALSE,
+                      tol= 0.01,
+                      pdf.name=NULL, #"test.pdf" #
+                      pdf.width=12, pdf.height=12,
+                      my.mfrow=c(1,1), #c(3,3), #
+                      #
+                      # additional args for the problem function
+                      fmin=fmin,
+                      fit.gp=fit.gp
+    )
+
+    # E[I(p)], xmax - p with max EI
+    EImax <- (-1 ) * Dir.list$minval
+    xmax<- Dir.list$final_point.xatmin
+    History <- Dir.list$History
+
+    if (verbose ){
+      print(paste("EImax=", EImax ) )
+      print( "xmax:")
+      print( xmax)
+      print( "history:")
+      print( History)
+    }
+
+    # 6.2 compute std. dev. and mean of E[I(p)]
+    ### draw another random sample of the same size: Latin hypercube sampling over the
+    ## whole parameter space. The idea is to make sure that the expected improvement over the whole space is almost equally small.
+    Xsam <- lhs(N, bounds)
+    EIall=rep(0,N)
+    for (i in 1:N ){
+      EIall[i] = - ExpImprovement(Xsam[i, , drop=FALSE], fmin, fit.gp, muX=NULL, muY=NULL)
+    }
+
+    EIstd = sd( c(EIall, EImax) )
+    EImean = mean(c(EIall, EImax))
+
+    # if E[I(p)] is maximized by one of the random sample points --> take it!
+    if ( max(EIall) > EImax ){
+      xmax = t(Xsam[which.max(EIall),, drop=FALSE])
+      rownames(xmax)<- rownames(bounds)
+      EImax <- max(EIall)
+    }
+
+    # 6.4 Add the new p*
+    # if the new p is one of the already-observed points --> break
+    # or if the difference in function values is small,
+    # stop iterations
+
+    # Is xmax already in the set of visited points?
+    visited <- Xtrain - matrix(xmax, nrow=nrow(Xtrain), ncol= ncol(Xtrain), byrow=TRUE)
+    visited<- any ( apply (visited == 0 , 1, all ) )
+
+    if ( (visited ) | (abs(fmin - fminold) < 0.01) & ((EImax - EImean)^2 <= 0.1*EIstd) ){
+      if (visited) print( "The new point with max E[I(p)] is already in the set of visited points.")
+      if ((abs(fmin - fminold) < 0.01) & ((EImax - EImean)^2 <= 0.1*EIstd) ) {
+        print( "the difference in function values between the last two iterations is small, stop iterations")
+      }
+      finished = TRUE
+      #    } else {
+      #      # print("Take the second best point (candidate) ")
+      #
+      #      # filter out E[I(p)]=fc==0,
+      #      cand<-Dir.list$fc[Dir.list$fc != 0 ]
+      #      # mult by (-1) because of changing from max to min problem in Direct
+      #      EImax2<-sort((-1) * cand, decr=T)[2]
+      #
+      #      xmax <- Dir.list$c[ , which((-1)*Dir.list$fc == EImax2) ]
+      #      X = t(xmax)
+      #      EImax<- EImax2
+      #      neval<- neval + 1
+
+
+    }else{
+      X = t(xmax)
+      X<-round(X,round.n)
+      rownames(X)<- NULL
+      neval<- neval + 1
+    }
+    if (verbose ) print(paste("iteration :", loop, " fmin = ", fmin ))
+    print(paste("finished?", finished))
+    print("X")
+    print(X)
+    loop = loop + 1
+
+    print(data.frame(Xtrain,Ytrain))
+
+  } # end of while (!finished)
+
+  if(show !="none" & !is.null(pdf.name)) dev.off()
+
+  # define the set of points with the same fmin
+  tmp.set<-data.frame(Xtrain, f=Ytrain)
+  points.fmin<- tmp.set[tmp.set$f == fmin, ,drop=FALSE ]
+
+  out <- list(fmin =fmin,
+              xmin = xmin,
+              iter = loop,
+              neval =neval,
+              maxevals= maxevals,
+              seed =seed,
+              bounds = bounds,
+              Q.func =Q.func,
+              points.fmin =points.fmin,
+              Xtrain = Xtrain,
+              Ytrain= Ytrain,
+              gp.seed=gp.seed,
+              model.list = model.list
+  )
+
+  class(out) <- "intsearch"
+  return(out)
+
+}
diff --git a/R/auc.R b/R/auc.R
new file mode 100644
index 0000000..cde139a
--- /dev/null
+++ b/R/auc.R
@@ -0,0 +1,24 @@
+# this function was copied from the glmnet package
+# https://cran.r-project.org/web/packages/glmnet/index.html
+
+auc=function(y,prob,w){
+  if(missing(w)){
+    rprob=rank(prob)
+    n1=sum(y);n0=length(y)-n1
+    u=sum(rprob[y==1])-n1*(n1+1)/2
+    exp(log(u) - log(n1) - log(n0))
+  }
+  else{
+    rprob=runif(length(prob))
+    op=order(prob,rprob)#randomize ties
+    y=y[op]
+    w=w[op]
+    cw=cumsum(w)
+    w1=w[y==1]
+    cw1=cumsum(w1)
+    wauc = log(sum(w1*(cw[y==1]-cw1)))
+    sumw1 = cw1[length(cw1)]
+    sumw2 = cw[length(cw)] - sumw1
+    exp(wauc - log(sumw1) - log(sumw2))
+  }
+}
diff --git a/R/balancedFolds.R b/R/balancedFolds.R
new file mode 100644
index 0000000..37b4c6b
--- /dev/null
+++ b/R/balancedFolds.R
@@ -0,0 +1,15 @@
+
+balancedFolds <- function(class.column.factor, cross.outer)
+{
+  #stolen from MCREstimate package
+  # used for stratified (balanced) classification or regression
+  # get balanced folds from pamr
+  sampleOfFolds <- get("balanced.folds",envir=asNamespace("pamr"))(class.column.factor, nfolds=cross.outer)
+  permutated.cut <- rep(0,length(class.column.factor))
+  for (sample in 1:cross.outer)
+  {
+    cat(sample,"\n")
+    permutated.cut[sampleOfFolds[[sample]]] <- sample
+  }
+  return(permutated.cut)
+}
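An editor's aside on R/EPSGO.R above: the expected-improvement criterion E[I(p)] that epsgo() maximizes (via ExpImprovement.R, which is not part of this diff) can be sketched in a few lines. This is an illustration only, not the package's implementation; it assumes fit.gp is an mlegp model and that mlegp's predict method returns the posterior mean and standard error when called with se.fit = TRUE.

library(mlegp)

# Sketch of E[I(p)] for minimization: large when the GP predicts values
# below the current best fmin, or when its uncertainty at p is high.
expImprovementSketch <- function(p, fmin, fit.gp) {
  pred <- predict(fit.gp, newData = matrix(p, nrow = 1), se.fit = TRUE)
  mu   <- as.numeric(pred$fit)     # GP posterior mean at p
  sig  <- as.numeric(pred$se.fit)  # GP posterior standard error at p
  if (sig <= 0) return(max(fmin - mu, 0))
  z <- (fmin - mu) / sig
  (fmin - mu) * pnorm(z) + sig * dnorm(z)  # the formula quoted in EPSGO.R
}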
diff --git a/R/coef_intsearch.R b/R/coef_intsearch.R
new file mode 100644
index 0000000..c186c29
--- /dev/null
+++ b/R/coef_intsearch.R
@@ -0,0 +1,16 @@
+
+###########################################################################################################
+coef.sum.intsearch<-function(object,...){
+  # get the coefficients for a fitted object after running the interval search
+
+  f1 <- object$cvreg
+  res<-f1$glmnet.fit
+  cof <- as.vector(coef(res, s=object$lambda))
+  names.cof <- rownames(res$beta)
+  cofn <- cof[which(cof != 0)]
+  names(cofn) <- names.cof[which(cof != 0)]
+  bet <- res$beta[match(names(cofn), rownames(res$beta)),]
+
+  return(cofn)
+}
+
diff --git a/R/nonzeroCoef.R b/R/nonzeroCoef.R
new file mode 100644
index 0000000..4534ae2
--- /dev/null
+++ b/R/nonzeroCoef.R
@@ -0,0 +1,46 @@
+# this function was copied from the glmnet package
+# https://cran.r-project.org/web/packages/glmnet/index.html
+
+nonzeroCoef = function (beta, bystep = FALSE)
+{
+### bystep = FALSE means which variables were ever nonzero
+### bystep = TRUE means which variables are nonzero for each step
+  nr=nrow(beta)
+  if (nr == 1) {#degenerate case
+    if (bystep)
+      apply(beta, 2, function(x) if (abs(x) > 0)
+        1
+      else NULL)
+    else {
+      if (any(abs(beta) > 0))
+        1
+      else NULL
+    }
+  }
+  else {
+    beta=abs(beta)>0 # this is sparse
+    which=seq(nr)
+    ones=rep(1,ncol(beta))
+    nz=as.vector((beta%*%ones)>0)
+    which=which[nz]
+    if (bystep) {
+      if(length(which)>0){
+        beta=as.matrix(beta[which,,drop=FALSE])
+        nzel = function(x, which) if (any(x))
+          which[x]
+        else NULL
+        which=apply(beta, 2, nzel, which)
+        if(!is.list(which))which=data.frame(which)# apply can return a matrix!!
+        which
+      }
+      else{
+        dn=dimnames(beta)[[2]]
+        which=vector("list",length(dn))
+        names(which)=dn
+        which
+      }
+
+    }
+    else which
+  }
+}
diff --git a/R/peperr_glmnet.R b/R/peperr_glmnet.R
new file mode 100644
index 0000000..0fa8d5a
--- /dev/null
+++ b/R/peperr_glmnet.R
@@ -0,0 +1,233 @@
+###############################################################
+# baseline survival / hazard: Breslow estimator
+# function essentially based on gbm::basehaz.gbm
+###############################################################
+basesurv <- function (response, lp, times.eval = NULL, centered = FALSE)
+{
+  if (is.null(times.eval)) times.eval <- sort(unique(response[,1]))
+
+  t.unique <- sort(unique(response[,1][response[,2] == 1]))
+  alpha <- length(t.unique)
+
+  for (i in 1:length(t.unique)) {
+    alpha[i] <- sum(response[,1][response[,2] == 1] == t.unique[i])/sum(exp(lp[response[,1] >= t.unique[i]]))
+  }
+
+  obj <- approx(t.unique, cumsum(alpha), yleft=0, xout = times.eval, rule=2)
+
+  if (centered) obj$y <- obj$y * exp(mean(lp))
+  obj$z <- exp(-obj$y)
+
+  names(obj) <- c("times","cumBaseHaz","BaseSurv")
+  return(obj)
+}
+
+###############################################
+### wrapper for glmnet
+###############################################
+
+fit.glmnet <- function (response, x, cplx, ...)
+{
+  #require(glmnet)
+  res <- NULL
+  tryerr <- try(res <- glmnet(y = response, x = data.matrix(x), lambda = cplx, ...), silent=TRUE)
+
+  # use inherits(): comparing class(tryerr) != 'try-error' fails when the fit
+  # succeeds and the result carries more than one class
+  if(!inherits(tryerr, 'try-error') && "coxnet" %in% class(res)) {
+    res$linear.predictor <- as.numeric(predict(res, newx=data.matrix(x), type="link"))
+    res$response <- response
+  }
+  res
+}
+
+complexity.glmnet <- function (response, x, full.data, ...)
+{
+  #require(glmnet)
+  lambda <- NULL
+  tryerr <- try(cv <- cv.glmnet(y = response, x = data.matrix(x), ...), silent=TRUE)
+
+  if(!inherits(tryerr, 'try-error')){
+    lambda <-cv$lambda.min
+  }
+  lambda
+}
+
+predictProb.coxnet <- predictProb.glmnet <- function (object, response, x, times, complexity, ...)
+{ + #require(glmnet) + lp <- as.numeric(predict(object, newx=data.matrix(x),s=complexity, type="link")) + basesurv <- basesurv(object$response,object$linear.predictor, sort(unique(times))) + p <- exp(exp(lp) %*% -t(basesurv$cumBaseHaz)) + + if (NROW(p) != NROW(x) || NCOL(p) != length(times)) + stop("Prediction failed") + p +} + +PLL.coxnet <- function(object, newdata, newtime, newstatus, complexity, ...) +{ + #require(glmnet) + PLL <- glmnet::coxnet.deviance(pred = NULL, Surv(newtime,newstatus), x = data.matrix(newdata), offset = NULL, weights = NULL, beta = coef(object,s=complexity)) + PLL / -2 +} + + +################################################## +###### classification aggregation functions #### +################################################## + +aggregation.misclass <- function (full.data = NULL, response, x, model, cplx = NULL, + type = c("apparent", "noinf"), fullsample.attr = NULL, ...) +{ + data <- as.data.frame(x) + data$response <- response + + if ("glmnet" %in% class(model)) { + probs <- as.numeric(predict(model, newx = data.matrix(x), type="response", ...)) + } + else if (class(model)[1] == "penfit") { + probs <- predict(model, data = data, penalized = x, ...) + } + else if (class(model)[1]=="glm") { + probs <- predict(model, newdata = data, type="response", ...) + } + else { + probs <- predict(model, data = data, type = "response", + ...) + } + type <- match.arg(type) + if (type == "apparent") { + mr <- sum(abs(round(probs) - response))/length(response) + } + if (type == "noinf") { + mr <- mean(abs((matrix(response, length(response), length(response), + byrow = TRUE) - round(probs)))) + } + mr +} + +aggregation.brier <- function (full.data = NULL, response, x, model, cplx = NULL, + type = c("apparent", "noinf"), fullsample.attr = NULL, ...) +{ + data <- as.data.frame(x) + data$response <- response + + if ("glmnet" %in% class(model)) { + probs <- as.numeric(predict(model, newx = data.matrix(x), type="response", ...)) + } + else if (class(model)[1] == "penfit") { + probs <- predict(model, data = data, penalized = x, ...) + } + else if (class(model)[1]=="glm") { + probs <- predict(model, newdata = data, type="response", ...) + } + else { + probs <- predict(model, data = data, type = "response", + ...) + } + type <- match.arg(type) + if (type == "apparent") { + brier.score <- sum((probs - response)^2)/length(response) + } + if (type == "noinf") { + brier.score <- mean((matrix(response, length(response), + length(response), byrow = TRUE) - probs)^2) + } + brier.score +} + +aggregation.auc <- function (full.data = NULL, response, x, model, cplx = NULL, + type = c("apparent", "noinf"), fullsample.attr = NULL, ...) +{ + data <- as.data.frame(x) + data$response <- response + if ("glmnet" %in% class(model)) { + probs <- as.numeric(predict(model, newx = data.matrix(x), type="response", ...)) + } + else if (class(model)[1] == "penfit") { + probs <- predict(model, data = data, penalized = x, ...) + } + else if (class(model)[1]=="glm") { + probs <- predict(model, newdata = data, type="response", ...) + } + else { + probs <- predict(model, data = data, type = "response", + ...) 
+ } + type <- match.arg(type) + if (type == "apparent") { + auc <- auc(response,probs) + } + if (type == "noinf") { + resp.mat <- matrix(response, length(response), length(response), byrow = TRUE) + auc <- mean(apply(resp.mat, 1, function(d) auc(d,probs))) + } + auc +} + +######################## +### plot pecs ### +######################## + +Plot.peperr.curves <- function(x,at.risk=TRUE,allErrors=FALSE, bootRuns=FALSE, bootQuants=TRUE, bootQuants.level=0.95, leg.cex=0.7, ...) { + + #require(peperr) + + if (bootRuns) bootQuants <- FALSE + + plot(x$attribute, x$null.model, type = "n", las=1, + col = "blue", xlab = "Evaluation time points", + ylab = "Prediction error", main = "Prediction error curves", + ylim = c(0, max(perr(x), x$full.apparent, x$null.model) + 0.1)) + + if (length(x$sample.error) > 1 & bootRuns==TRUE) { + for (i in 1:(length(x$sample.error))) { + lines(x$attribute, x$sample.error[[i]], type = "l", col = "light grey", lty = 1) + } + } + + if (length(x$sample.error) > 1 & bootQuants==TRUE) { + boots <- do.call("rbind",x$sample.error) + quants <- apply(boots, 2, function(d) quantile(d, probs=c((1-bootQuants.level)/2,1 - (1-bootQuants.level)/2))) + polygon(c(x$attribute,rev(x$attribute)),c(quants[1,],rev(quants[2,])), col="light grey", border="light grey") + } + + if (allErrors==FALSE) { + lines(x$attribute, x$null.model, type = "l", col = "blue", lwd = 2, lty = 1) + lines(x$attribute, perr(x, "632p"), type = "l", col= "black", lty = 1, lwd = 2) + lines(x$attribute, x$full.apparent, type = "l", col = "red", lty = 1, lwd = 2) + if (bootRuns==TRUE) { + legend(x = "topleft", col = c("blue", "black", "red", "light grey"), lwd=c(2,2,2,1), cex=leg.cex, + lty = 1, legend = c("Null model", ".632+ estimate", "Full apparent", "Bootstrap samples")) + } else { + legend(x = "topleft", col = c("blue", "black", "red"), lwd=c(2,2,2), cex=leg.cex, + lty = 1, legend = c("Null model", ".632+ estimate", "Full apparent")) + } + } + + if (allErrors==TRUE) { + lines(x$attribute, x$null.model, type = "l", col = "blue", lwd = 2, lty = 1) + lines(x$attribute, perr(x, "632p"), type = "l", col= "black", lty = 1, lwd = 2) + lines(x$attribute, perr(x, "632"), type = "l", col= "brown", lty = 1, lwd = 2) + lines(x$attribute, perr(x, "NoInf"), type = "l", col= "green", lty = 1, lwd = 2) + lines(x$attribute, perr(x, "resample"), type = "l", col= "dark grey", lty = 1, lwd = 2) + lines(x$attribute, x$full.apparent, type = "l", col = "red", lty = 1, lwd = 2) + if (bootRuns==TRUE) { + legend(x = "topleft", ncol=2, col = c("blue", "black","brown","green","dark grey","red", "light grey"), lwd=c(2,2,2,2,2,2,1), cex=leg.cex, + lty = 1, legend = c("Null model", ".632+ estimate",".632 estimate", "No Information","Out-of-bag average","Full apparent", "Bootstrap samples")) + } else { + legend(x = "topleft", ncol=2, col = c("blue", "black","brown","green","dark grey","red"), lwd=c(2,2,2,2,2,2), cex=leg.cex, + lty = 1, legend = c("Null model", ".632+ estimate",".632 estimate", "No Information","Out-of-bag average","Full apparent")) + } + } + + if (at.risk) { + tmpxaxp <- par("xaxp") + tmpusr <- par("usr") + at.loc <- seq(tmpxaxp[1],tmpxaxp[2],length=tmpxaxp[3]+1) + n.at.risk <- summary(survfit(x$response ~ 1),times=at.loc)$n.risk + + text(x=at.loc, y=tmpusr[3], labels=n.at.risk, cex=0.8, pos=3) + text(x=tmpxaxp[2]+(tmpusr[2]-tmpxaxp[2])/2, y=tmpusr[3], labels="at\nrisk", cex=0.8, pos=3) + } +} + diff --git a/R/plotCoef.R b/R/plotCoef.R new file mode 100644 index 0000000..9520ada --- /dev/null +++ b/R/plotCoef.R @@ -0,0 
+1,59 @@ +# this function was copied from the glmnet package +# https://cran.r-project.org/web/packages/glmnet/index.html + +plotCoef=function(beta,norm,lambda,df,dev,label=FALSE,xvar=c("norm","lambda","dev"),xlab=iname,ylab="Coefficients",...){ + ##beta should be in "dgCMatrix" format + which=nonzeroCoef(beta) + nwhich=length(which) + switch(nwhich+1,#we add one to make switch work + "0"={ + warning("No plot produced since all coefficients zero") + return() + }, + "1"=warning("1 or less nonzero coefficients; glmnet plot is not meaningful") + ) + beta=as.matrix(beta[which,,drop=FALSE]) + xvar=match.arg(xvar) + switch(xvar, + "norm"={ + index=if(missing(norm))apply(abs(beta),2,sum)else norm + iname="L1 Norm" + approx.f=1 + }, + "lambda"={ + index=log(lambda) + iname="Log Lambda" + approx.f=0 + }, + "dev"= { + index=dev + iname="Fraction Deviance Explained" + approx.f=1 + } + ) + dotlist=list(...) + type=dotlist$type + if(is.null(type)) + matplot(index,t(beta),lty=1,xlab=xlab,ylab=ylab,type="l",...) + else matplot(index,t(beta),lty=1,xlab=xlab,ylab=ylab,...) + atdf=pretty(index) + ### compute df by interpolating to df at next smaller lambda + ### thanks to Yunyang Qian + prettydf=approx(x=index,y=df,xout=atdf,rule=2,method="constant",f=approx.f)$y +# prettydf=ceiling(approx(x=index,y=df,xout=atdf,rule=2)$y) + axis(3,at=atdf,labels=prettydf,tcl=NA) + if(label){ + nnz=length(which) + xpos=max(index) + pos=4 + if(xvar=="lambda"){ + xpos=min(index) + pos=2 + } + xpos=rep(xpos,nnz) + ypos=beta[,ncol(beta)] + text(xpos,ypos,paste(which),cex=.5,pos=pos) + } + +} + diff --git a/R/plot_coef_glmnet.R b/R/plot_coef_glmnet.R new file mode 100644 index 0000000..0f00e92 --- /dev/null +++ b/R/plot_coef_glmnet.R @@ -0,0 +1,30 @@ +Plot.coef.glmnet <- function(cvfit, betas){ + +op <- par(no.readonly = TRUE) +par(mar=c(4,4,2.5,1), mgp=c(2.5,1,0), mfrow=c(2,2)) +fit <- cvfit$glmnet.fit +bet <- fit$beta[match(betas, rownames(fit$beta)),] + +plot(fit, xvar="lambda", col="gray") +plotCoef(bet, lambda = fit$lambda, df = fit$df, dev = fit$dev.ratio, xvar = "lambda", add=TRUE, col="red") +abline(v=log(cvfit$lambda.min), lty=3) +abline(v=log(cvfit$lambda.1se), lty=3) + +plotCoef(bet, lambda = fit$lambda, df = fit$df, dev = fit$dev.ratio, xvar = "lambda", add=FALSE, col="red") +abline(v=log(cvfit$lambda.min), lty=3) +abline(v=log(cvfit$lambda.1se), lty=3) + +norm <- apply(abs(fit$beta), 2, sum) +plot(fit, xvar="norm", col="gray") +plotCoef(bet, xvar = "norm", add=TRUE, col="red", + norm = norm, lambda = fit$lambda, df = fit$df, dev = fit$dev.ratio) +abline(v=norm[match(cvfit$lambda.min, cvfit$lambda)], lty=3) +abline(v=norm[match(cvfit$lambda.1se, cvfit$lambda)], lty=3) + +plot(fit, xvar="dev", col="gray") +plotCoef(bet, lambda = fit$lambda, df = fit$df, dev = fit$dev.ratio, xvar = "dev", add=TRUE, col="red") +abline(v=fit$dev.ratio[match(cvfit$lambda.min, cvfit$lambda)], lty=3) +abline(v=fit$dev.ratio[match(cvfit$lambda.1se, cvfit$lambda)], lty=3) + +par(op) +} \ No newline at end of file diff --git a/R/plot_summary_inter_search.R b/R/plot_summary_inter_search.R new file mode 100644 index 0000000..77d9cf4 --- /dev/null +++ b/R/plot_summary_inter_search.R @@ -0,0 +1,50 @@ + +plot.sum.intsearch <-function(x,type="summary",startN=21,...){ + if(type=="summary"){ + summary.int <- x + summary.int$info$log_lambda <- log(summary.int$info$lambda) + breaks <- do.breaks(range(summary.int$info$deviance), 20) + n_init <- 21 # number of initial alpha values at iteration zero + summary.int$info$cols <- 
level.colors(summary.int$info$deviance,at=breaks, col.regions = gray.colors) + n.features <- summary.int$info$n.features + + print(my.plot <- xyplot(log_lambda ~ alpha, + data = summary.int$info, + groups = summary.int$info$cols, + cex = 1, cex.axis=1.5, + col = "black", + jitter.y=T, amount=0.01, + ylab=list(expression(paste("log ",lambda)),cex=1.5), + xlab=list(expression(alpha),cex=1.5), + # scales=list(x=list(log=T, equispaced.log = FALSE)), # x axis on log-scale + panel = function(x, y, groups, ..., subscripts) { + fill <- groups[subscripts] + panel.grid(h = -1, v = -1) + panel.abline(h = log(summary.int$opt.lambda), + v = summary.int$opt.alpha, + col="red", lty = 1, lwd=2 ) + panel.xyplot(x, y, pch = rep(c(22,21),c(n_init,nrow(summary.int$info)-n_init)), + fill = fill, ...) ; + ltext(x=x, y=y, labels=n.features, pos=ifelse(y<0.1,3,4), offset=1.5, cex=1,col=1) + }, + legend = list(top = list(fun = draw.colorkey, + args = list(key = list(space = "top", + col = gray.colors, + at = breaks), + draw = FALSE))), + main="Cross-validated partial log likelihood deviance", + scales=list(cex=1) + #sub="number of selected features are printed next to symbol \n rectangles show initial alpha values" + )) + } + if(type=="points"){ + # plot visited points vs. iteration steps + summary.int <- x + niter<- nrow(summary.int$info) - startN + iter<-c(rep(0,startN), c(1:niter)) + plot(summary.int$info$alpha, iter, xlab=expression(alpha), ylab="Iteration", pch=20,cex=1.5,cex.axis=1.5) + grid(NA, niter+1, lwd=2) + abline(v=summary.int$opt.alpha, col="red") + } +} + diff --git a/R/stabilityselection.R b/R/stabilityselection.R new file mode 100644 index 0000000..e34fe95 --- /dev/null +++ b/R/stabilityselection.R @@ -0,0 +1,138 @@ +stabpath <- function(y,x,size=0.632,steps=100,weakness=1,mc.cores=getOption("mc.cores", 2L),...){ + fit <- glmnet(x,y,...) + if(class(fit)[1]=="multnet"|class(fit)[1]=="lognet") y <- as.factor(y) + #if(class(fit)[1]=="lognet") y <- as.logical(y) + p <- ncol(x) + #draw subsets + subsets <- sapply(1:steps,function(v){sample(1:nrow(x),nrow(x)*size)}) + + # parallel computing depending on OS + # UNIX/Mac + if (.Platform$OS.type!="windows") { + res <- mclapply(1:steps,mc.cores=mc.cores,glmnet.subset,subsets,x,y,lambda=fit$lambda,weakness,p,...) + } else { + # Windows + cl <- makePSOCKcluster(mc.cores) + clusterExport(cl,c("glmnet","drop0")) + res <- parLapply(cl, 1:steps,glmnet.subset,subsets,x,y,lambda=fit$lambda,weakness,p,...) 
+ stopCluster(cl) + } + + #merging + res <- res[unlist(lapply(lapply(res,dim),function(x) x[2]==dim(res[[1]])[2]))] + x <- as.matrix(res[[1]]) + qmat <- matrix(ncol=ncol(res[[1]]),nrow=length(res)) + qmat[1,] <- colSums(as.matrix(res[[1]])) + for(i in 2:length(res)){ + qmat[i,] <- colSums(as.matrix(res[[i]])) + x <- x + as.matrix(res[[i]]) + } + x <- x/length(res) + qs <- colMeans(qmat) + out <- list(fit=fit,x=x,qs=qs) + class(out) <- "stabpath" + return(out) +} + +#internal function used by lapply +glmnet.subset <- function(index,subsets,x,y,lambda,weakness,p,...){ + if(length(dim(y))==2|class(y)=="Surv"){ + glmnet(x[subsets[,index],],y[subsets[,index],],lambda=lambda + ,penalty.factor= 1/runif(p,weakness,1),...)$beta!=0 + }else{ + if(is.factor(y)&length(levels(y))>2){ + temp <- glmnet(x[subsets[,index],],y[subsets[,index]],lambda=lambda + ,penalty.factor= 1/runif(p,weakness,1),...)[[2]] + temp <- lapply(temp,as.matrix) + Reduce("+",lapply(temp,function(x) x!=0)) + + } + else{ + glmnet(x[subsets[,index],],y[subsets[,index]],lambda=lambda + ,penalty.factor= 1/runif(p,weakness,1),...)$beta!=0 + } + } +} + + +#performs error control and returns estimated set of stable variables and corresponding lambda +stabsel <- function(x,error=0.05,type=c("pfer","pcer"),pi_thr=0.6){ + if(pi_thr <= 0.5 | pi_thr >= 1) stop("pi_thr needs to be > 0.5 and < 1!") + if(class(x$fit)[1]=="multnet"){ + p <- dim(x$fit$beta[[1]])[1] + }else{ + p <- dim(x$fit$beta)[1] + } + type <- match.arg(type) + switch(type, + "pcer"={ + if(error>=1 | error<=0)stop("pcer needs to be > 0 and < 1!") + qv <- ceiling(sqrt(error* p * (2*pi_thr-1)*p)) }, + "pfer"={ + qv <- ceiling(sqrt(error * (2*pi_thr-1)*p)) } + ) + if(x$qs[length(x$qs)]<=qv){ lpos <- length(x$qs) + }else{ + lpos <- which(x$qs>qv)[1] + } + if(!is.na(lpos)){stable <- which(x$x[,lpos]>=pi_thr)}else{ + stable <- NA + } + out <- list(stable=stable,lambda=x$fit$lambda[lpos],lpos=lpos,error=error,type=type) + return(out) +} + +print.stabpath <- function(x,...){ + cat(" stabilitypath","\n", + dim(x$x)[1],"variables","\n", + dim(x$x)[2],"lambdas","\n") +} + +#plot penalization and stability path +plot.stabpath <- function(x,error=0.05,type=c("pfer","pcer"),pi_thr=0.6,xvar=c("lambda", "norm", "dev") + , col.all="black", col.sel="red",...){ + sel <- stabsel(x,error,type,pi_thr) + if(class(x$fit)[1]=="multnet"){ + beta = as.matrix(Reduce("+",x$fit$beta)) + }else{ + beta = as.matrix(x$fit$beta) + } + p <- dim(beta)[1] + which = nonzeroCoef(beta) + nwhich = length(which) + switch(nwhich + 1, `0` = { + warning("No plot produced since all coefficients zero") + return() + }, `1` = warning("1 or less nonzero coefficients; glmnet plot is not meaningful")) + xvar = match.arg(xvar) + switch(xvar, norm = { + index = apply(abs(beta), 2, sum) + iname = "L1 Norm" + }, lambda = { + index = log(x$fit$lambda) + iname = expression(paste("log ",lambda)) + }, dev = { + index = x$fit$dev + iname = "Fraction Deviance Explained" + }) + #} + #stability path + cols <- rep(col.all,p) + cols[sel$stable] <- col.sel + lwds <- rep(1,p) + lwds[sel$stable] <- 2 + if(!class(x$fit)[1]=="multnet"){ + par(mfrow=c(2,1)) + matplot(y=t(beta), x=index + ,type="l",col=cols,lwd=lwds,lty=1,ylab=expression(paste(hat(beta)[i])) + ,xlab=iname,main="Penalization Path",cex.lab=1,cex.axis=1,las=1,...) + } + matplot(y=as.matrix(t(x$x)), x=index + ,type="l",col=cols,lwd=lwds,lty=1,ylab=expression(paste(hat(Pi))) + ,xlab=iname,main="Stability Path",ylim=c(0,1),cex.lab=1,cex.axis=1,las=1,...) 
+  abline(h=pi_thr,col="darkred",lwd=1,lty=1)
+  abline(v=index[sel$lpos],col="darkred",lwd=1,lty=1)
+  #text(x=20,y=0.9,paste(expression(paste(lambda)),"=",paste(round(sel[[2]],digits=3)),sep=""),cex=0.75)
+  return(sel)
+}
+
diff --git a/R/summary_inter_search.R b/R/summary_inter_search.R
new file mode 100644
index 0000000..803aeb2
--- /dev/null
+++ b/R/summary_inter_search.R
@@ -0,0 +1,40 @@
+
+###########################################################################################################
+summary.intsearch<-function(object,digits = max(3, getOption("digits") - 3), verbose=TRUE, first.n=5, ...){
+  fit <- object
+  alphas <- fit$Xtrain[,1]
+  lambdas <- unlist(sapply(sapply(fit$model, "[", "model"), "[", "lambda"))
+  deviances <- fit$Ytrain
+  # because of rounding problems, take the lambdas from the fit object
+  # number of selected features in the models; dfs
+  tmp.models<-sapply(sapply(sapply(fit$model, "[", "model"), "[", "cvreg"), "[", "glmnet.fit")
+
+  n.features<-mapply( function(List, lam) List$df[which(List$lambda %in% lam)], tmp.models, lambdas)
+
+  # optimal models
+  #print("chose the model with min num of FS ")
+  opt.models <- sapply(fit$model.list, "[", "model") [fit$Ytrain == fit$fmin ]
+
+  opt.alpha <- opt.models[[1]]$alpha
+  opt.lambda <- opt.models[[1]]$lambda
+  opt.error <- fit$fmin
+
+  out <- list(info=data.frame(alpha=alphas,lambda=lambdas,deviance=deviances,n.features=n.features),
+              opt.alpha=opt.alpha, opt.lambda=opt.lambda, opt.error=opt.error,
+              opt.models=opt.models)
+  class(out) <- "sum.intsearch"
+
+  if(verbose){
+    cat("Summary interval search \n\n")
+    cat(paste("show the first", first.n,"out of",nrow(out$info),"entries\n"))
+    print(out$info[1:first.n,])
+    cat("\n..............................")
+
+    cat("\n\n Optimal parameters found are: \n\n")
+    cat(paste("alpha = ",round(out$opt.alpha,digits),
+              "\t",
+              "lambda = ",round(out$opt.lambda,digits),
+              "deviance = ",round(out$opt.error,digits)))
+  }
+  invisible(out)
+}
\ No newline at end of file
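A worked example for the stability selection code in R/stabilityselection.R above (an editor's sketch, assuming c060 and glmnet are loaded; the data and settings are illustrative). With p = 50 variables, pi_thr = 0.6 and a per-family error rate of 0.05, the "pfer" branch of stabsel() allows on average qv = ceiling(sqrt(0.05 * (2*0.6 - 1) * 50)) = 1 selected variable per subsample:

library(glmnet)
set.seed(1234)
x <- matrix(rnorm(100 * 50), 100, 50)
y <- drop(x[, 1:5] %*% rep(2, 5) + rnorm(100))   # 5 truly active variables
sp <- stabpath(y, x, steps = 50, mc.cores = 2, family = "gaussian")
ss <- stabsel(sp, error = 0.05, type = "pfer", pi_thr = 0.6)
ss$stable   # indices of the variables deemed stable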
diff --git a/R/tune_glmnet_interval.R b/R/tune_glmnet_interval.R
new file mode 100644
index 0000000..c49fabf
--- /dev/null
+++ b/R/tune_glmnet_interval.R
@@ -0,0 +1,57 @@
+tune.glmnet.interval<-function(parms, x, y,
+                               weights,
+                               offset = NULL,
+                               lambda = NULL,
+                               type.measure = c("mse", "deviance", "class", "auc", "mae"),
+                               seed=12345,
+                               nfolds = 10,
+                               foldid=NULL,
+                               grouped = TRUE,
+                               type.min=c("lambda.min", "lambda.1se"),
+                               family,
+                               verbose=FALSE,
+                               ...){
+
+  # 1. decode the parameters ############################################################
+
+  alpha<-parms[1]
+  names(alpha)<- NULL
+
+  if (verbose) print(paste("alpha=",alpha))
+
+  # 2. find the optimal lambda for the given alpha #######################################################################
+  # for EPSGO the problem is cv.glmnet; the output is the mean cross-validated error
+
+
+  # find optimal lambda for given alpha
+  set.seed(seed)
+  cv<-cv.glmnet(x=x,y=y, family=family,
+                alpha=alpha,
+                offset = NULL,
+                lambda = NULL,
+                type.measure =type.measure,
+                nfolds = nfolds,
+                foldid = foldid,
+                grouped = grouped )
+
+
+  opt.lambda<-ifelse(type.min=="lambda.min", cv$lambda.min, cv$lambda.1se )
+
+  # q.val = mean cross-validated error over the folds
+  q.val<-cv$cvm[which(cv$lambda == opt.lambda) ]
+
+  # 3. fit the model for the given alpha and opt.lambda ########################################################
+  fit<-glmnet(x=x,y=y,
+              family=family,
+              alpha=alpha,
+              lambda=opt.lambda )
+
+
+  ret<-list(q.val=q.val, model=list(alpha=alpha, lambda=opt.lambda, nfolds=nfolds, cvreg=cv, fit=fit) )
+
+  return(ret)
+}
+
+
+
diff --git a/c060.Rproj b/c060.Rproj
new file mode 100644
index 0000000..b1ded45
--- /dev/null
+++ b/c060.Rproj
@@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: knitr
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/inst/CITATION b/inst/CITATION
new file mode 100644
index 0000000..e30f5d8
--- /dev/null
+++ b/inst/CITATION
@@ -0,0 +1,21 @@
+citHeader("To cite c060 in publications use:")
+
+citEntry(entry = "Article",
+         title = "{c060}: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models",
+         author = personList(as.person("Martin Sill"),
+                             as.person("Thomas Hielscher"),
+                             as.person("Natalia Becker"),
+                             as.person("Manuela Zucknick")),
+         journal = "Journal of Statistical Software",
+         year = "2014",
+         volume = "62",
+         number = "5",
+         pages = "1--22",
+         url = "https://www.jstatsoft.org/v62/i05/",
+
+         textVersion =
+           paste("Martin Sill, Thomas Hielscher, Natalia Becker, Manuela Zucknick (2014).",
+                 "c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models.",
+                 "Journal of Statistical Software, 62(5), 1-22.",
+                 "URL https://www.jstatsoft.org/v62/i05/.")
+)
\ No newline at end of file
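To make the Q.func contract in R/tune_glmnet_interval.R above concrete, here is an editor's sketch of a direct call (illustrative data; assumes c060 and glmnet are loaded):

library(glmnet)
set.seed(42)
x <- matrix(rnorm(200 * 30), 200, 30)
y <- rbinom(200, 1, 0.5)
res <- tune.glmnet.interval(parms = c(alpha = 0.5), x = x, y = y,
                            family = "binomial", type.measure = "class",
                            nfolds = 5, type.min = "lambda.min")
res$q.val         # mean cross-validated error at the optimal lambda
res$model$lambda  # the lambda chosen for alpha = 0.5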
diff --git a/man/EPSGO.Rd b/man/EPSGO.Rd
new file mode 100644
index 0000000..37230d1
--- /dev/null
+++ b/man/EPSGO.Rd
@@ -0,0 +1,200 @@
+\name{epsgo}
+
+\alias{epsgo}
+
+\title{ Efficient Parameter Selection via Global Optimization }
+\description{
+  Finds an optimal solution for the Q.func function.
+}
+
+\usage{
+epsgo(Q.func, bounds, round.n=5, parms.coding="none",
+      fminlower=0, flag.find.one.min =FALSE,
+      show=c("none", "final", "all"), N= NULL, maxevals = 500,
+      pdf.name=NULL, pdf.width=12, pdf.height=12, my.mfrow=c(1,1),
+      verbose=TRUE, seed=123, \dots )
+}
+
+\arguments{
+  \item{Q.func}{ name of the function to be minimized. }
+  \item{bounds}{ bounds for parameters}
+  \item{round.n}{ number of digits after the decimal point, default: 5}
+  \item{parms.coding}{ parameter coding: none or log2, default: none. }
+  \item{fminlower}{ minimal value for the function Q.func, default is 0. }
+  \item{flag.find.one.min}{ do you want to find one min value and stop? Default: FALSE }
+  \item{show}{ show plots of DIRECT algorithm: none, final iteration, all iterations. Default: none }
+  \item{N}{ define the number of start points, see details. }
+  \item{maxevals}{ the maximum number of DIRECT function evaluations, default: 500. }
+  \item{pdf.name}{pdf name }
+  \item{pdf.width}{ default: 12 }
+  \item{pdf.height}{ default: 12 }
+  \item{my.mfrow}{ default: c(1,1) }
+  \item{verbose}{ verbose? default: TRUE. }
+  \item{seed}{ seed }
+  \item{\dots}{ additional argument(s) }
+}
+
+
+
+
+
+\value{
+  \item{fmin }{minimal value of Q.func on the interval defined by bounds. }
+  \item{xmin }{corresponding parameters for the minimum}
+  \item{iter }{number of iterations}
+  \item{neval }{ number of visited points }
+  \item{maxevals }{ the maximum number of DIRECT function evaluations }
+  \item{seed }{ seed}
+  \item{bounds}{ bounds for parameters}
+  \item{Q.func }{ name of the function to be minimized. }
+  \item{points.fmin }{ the set of points with the same fmin }
+  \item{Xtrain }{ visited points }
+  \item{Ytrain }{ the output of Q.func at the visited points Xtrain }
+  \item{gp.seed }{ seed for the Gaussian Process }
+  \item{model.list }{ detailed information of the search process }
+}
+
+
+\details{
+  If the number of start points (N) is not defined by the user, it is chosen depending on the dimensionality of the parameter space:
+  N=10D+1, where D is the number of parameters; for a high-dimensional parameter space with more than 6 dimensions,
+  the initial set is restricted to 65. For a one-dimensional parameter space, N is set to 21 for stability reasons.
+
+  The idea of EPSGO (Efficient Parameter Selection via Global Optimization): Beginning
+  from an initial Latin hypercube sampling containing N starting points we train
+  an Online GP, look for the point with the maximal expected improvement, sample there and update the Gaussian Process (GP). It is thereby
+  not so important that the GP models the error surface of the SVM in parameter space exactly; what matters is
+  that it gives us information about potentially interesting points in parameter space where we should sample next.
+  We continue sampling points until some convergence criterion is met.
+
+  DIRECT is a sampling algorithm which requires no knowledge of the objective function gradient.
+  Instead, the algorithm samples points in the domain, and uses the information it has obtained to decide where to
+  search next. The DIRECT algorithm will globally converge to the maximal value of the objective function. The name
+  DIRECT comes from the shortening of the phrase 'DIviding RECTangles', which describes the way the algorithm moves
+  towards the optimum.
+
+  The source code was adapted from the MATLAB originals, special thanks to Holger Froehlich.
+}
+
+
+\author{ Natalia Becker natalia.becker at dkfz.de }
+
+\references{
+Froehlich, H. and Zell, A. (2005) "Efficient parameter selection for support vector
+machines in classification and regression via model-based global optimization"
+\emph{In Proc. Int. Joint Conf. Neural Networks, 1431-1438 }.\cr
+
+Sill M., Hielscher T., Becker N. and Zucknick M.
 (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.}
+\url{https://www.jstatsoft.org/v62/i05/}
+}
+
+\examples{
+\dontrun{
+set.seed(1010)
+n=1000;p=100
+nzc=trunc(p/10)
+x=matrix(rnorm(n*p),n,p)
+beta=rnorm(nzc)
+fx= x[,seq(nzc)] \%*\% beta
+eps=rnorm(n)*5
+y=drop(fx+eps)
+px=exp(fx)
+px=px/(1+px)
+ly=rbinom(n=length(px),prob=px,size=1)
+set.seed(1011)
+
+# y - binomial
+y.classes<-ifelse(y>= median(y),1, 0)
+
+nfolds = 10
+set.seed(1234)
+foldid <- balancedFolds(class.column.factor=y.classes, cross.outer=nfolds)
+
+bounds <- t(data.frame(alpha=c(0, 1)))
+colnames(bounds)<-c("lower","upper")
+
+fit <- epsgo(Q.func="tune.glmnet.interval",
+             bounds=bounds,
+             parms.coding="none",
+             seed = 1234,
+             show="none",
+             fminlower = -100,
+             x = x, y = y.classes, family = "binomial",
+             foldid = foldid,
+             type.min = "lambda.1se",
+             type.measure = "mse")
+summary(fit)
+
+# y - multinomial: low - low 25\%, middle - (25,75)-quantiles, high - larger 75\%.
+y.classes<-ifelse(y <= quantile(y,0.25),1, ifelse(y >= quantile(y,0.75),3, 2))
+bounds <- t(data.frame(alpha=c(0, 1)))
+colnames(bounds)<-c("lower","upper")
+
+fit <- epsgo(Q.func="tune.glmnet.interval",
+             bounds=bounds,
+             parms.coding="none",
+             seed = 1234,
+             show="none",
+             fminlower = -100,
+             x = x, y = y.classes, family = "multinomial",
+             foldid = foldid,
+             type.min = "lambda.1se",
+             type.measure = "mse")
+summary(fit)
+
+##poisson
+N=500; p=20
+nzc=5
+x=matrix(rnorm(N*p),N,p)
+beta=rnorm(nzc)
+f = x[,seq(nzc)]\%*\%beta
+mu=exp(f)
+y.classes=rpois(N,mu)
+
+nfolds = 10
+set.seed(1234)
+foldid <- balancedFolds(class.column.factor=y.classes, cross.outer=nfolds)
+
+
+fit <- epsgo(Q.func="tune.glmnet.interval",
+             bounds=bounds,
+             parms.coding="none",
+             seed = 1234,
+             show="none",
+             fminlower = -100,
+             x = x, y = y.classes, family = "poisson",
+             foldid = foldid,
+             type.min = "lambda.1se",
+             type.measure = "mse")
+summary(fit)
+
+#gaussian
+set.seed(1234)
+x=matrix(rnorm(100*1000,0,1),100,1000)
+y <- x[1:100,1:1000]\%*\%c(rep(2,5),rep(-2,5),rep(.1,990))
+
+foldid <- rep(1:10,each=10)
+
+fit <- epsgo(Q.func="tune.glmnet.interval",
+             bounds=bounds,
+             parms.coding="none",
+             seed = 1234,
+             show="none",
+             fminlower = -100,
+             x = x, y = y, family = "gaussian",
+             foldid = foldid,
+             type.min = "lambda.1se",
+             type.measure = "mse")
+summary(fit)
+
+# y - cox: see the package vignette
+}
+}
+
+\keyword{ models }
+\keyword{ multivariate }
+\keyword{ graphs }
+\keyword{ iteration }
+\keyword{ optimize }
+
+
diff --git a/man/PLL.coxnet.Rd b/man/PLL.coxnet.Rd
new file mode 100644
index 0000000..bf8b303
--- /dev/null
+++ b/man/PLL.coxnet.Rd
@@ -0,0 +1,34 @@
+\name{PLL.coxnet}
+\alias{PLL.coxnet}
+\title{Predictive partial log-likelihood for glmnet Cox PH model fit}
+\description{
+Extracts the predictive partial log-likelihood from a glmnet Cox PH model fit.
+}
+\usage{
+\method{PLL}{coxnet}(object, newdata, newtime, newstatus, complexity, ...)
+}
+\arguments{
+  \item{object}{fitted model of class \code{coxnet}.}
+  \item{newdata}{\code{n_new*p} matrix of covariates.}
+  \item{newtime}{\code{n_new}-vector of censored survival times.}
+  \item{newstatus}{\code{n_new}-vector of survival status, coded with 0 and 1.}
+  \item{complexity}{lambda penalty value.}
+  \item{\dots}{additional arguments, not used.}
+}
+\details{
+Used by function \code{peperr} if \code{fit.glmnet} with \code{family="cox"} is used for the model fit, which gives a class \code{coxnet} object.
+This is basically a wrapper based on the \code{coxnet.deviance} function from package \code{glmnet}.
+}
+\value{
+Vector of length \code{n_new}
+}
+\keyword{models} \keyword{penalized regression} \keyword{survival}
+
+\author{
+Thomas Hielscher \
+\email{t.hielscher@dkfz.de}
+}
+\references{
+Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.}
+\url{https://www.jstatsoft.org/v62/i05/}
+}
\ No newline at end of file
diff --git a/man/Plot.coef.glmnet.Rd b/man/Plot.coef.glmnet.Rd
new file mode 100644
index 0000000..dbb6bfd
--- /dev/null
+++ b/man/Plot.coef.glmnet.Rd
@@ -0,0 +1,57 @@
+\name{Plot.coef.glmnet}
+\alias{Plot.coef.glmnet}
+\title{
+function to highlight the path of a pre-specified set of variables within the coefficient path
+}
+\description{
+Creates several plots showing the coefficient path for the final model of a cv.glmnet fit and highlights the path of a pre-specified set of variables within the coefficient path.
+}
+\usage{
+Plot.coef.glmnet(cvfit, betas)
+}
+\arguments{
+  \item{cvfit}{an object of class "cv.glmnet" as returned by the function \code{cv.glmnet}.}
+  \item{betas}{a vector of names of variables; must be a subset of rownames(coef(cvfit)).}
+}
+\value{
+No useful return value; the function is called for its side effect of producing the plots.
+}
+\author{
+Manuela Zucknick \
+\email{m.zucknick@dkfz-heidelberg.de}
+}
+\examples{
+\dontrun{
+set.seed(1010)
+n=1000;p=100
+nzc=trunc(p/10)
+x=matrix(rnorm(n*p),n,p)
+beta=rnorm(nzc)
+fx= x[,seq(nzc)] \%*\% beta
+eps=rnorm(n)*5
+y=drop(fx+eps)
+px=exp(fx)
+px=px/(1+px)
+ly=rbinom(n=length(px),prob=px,size=1)
+set.seed(1011)
+cvob1=cv.glmnet(x,y)
+Plot.coef.glmnet(cvob1, c("V1","V100"))
+}}
+\keyword{coefficient path}
+\references{
+Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.}
+\url{https://www.jstatsoft.org/v62/i05/}
+}
\ No newline at end of file
diff --git a/man/Plot.peperr.curves.Rd b/man/Plot.peperr.curves.Rd
new file mode 100644
index 0000000..c29201b
--- /dev/null
+++ b/man/Plot.peperr.curves.Rd
@@ -0,0 +1,66 @@
+\name{Plot.peperr.curves}
+\alias{Plot.peperr.curves}
+\title{Plot method for prediction error curves of a peperr object}
+\description{Plots individual and aggregated prediction error estimates based on bootstrap samples.}
+\usage{
+Plot.peperr.curves(x, at.risk=TRUE, allErrors=FALSE,
+bootRuns=FALSE, bootQuants=TRUE, bootQuants.level=0.95, leg.cex=0.7,...)
+}
+\arguments{
+\item{x}{\code{peperr} object.}
+\item{at.risk}{Display the numbers at risk. Default is TRUE.}
+\item{allErrors}{Display the .632, no-information and average out-of-bag errors in addition. Default is FALSE.}
+\item{bootRuns}{Display individual out-of-bag bootstrap samples. Default is FALSE.}
+\item{bootQuants}{Display pointwise out-of-bag bootstrap quantiles as a shaded area.
 Default is TRUE.}
+\item{bootQuants.level}{Quantile probabilities for pointwise out-of-bag bootstrap quantiles. Default is 0.95, i.e. 2.5\% and 97.5\% quantiles.}
+\item{leg.cex}{size of legend text}
+\item{\dots}{additional arguments, not used.}
+
+}
+\details{
+This function is literally taken from \code{plot.peperr} in the \code{peperr} package.
+The display of prediction error curves is adapted to allow for numbers at risk and pointwise bootstrap quantiles.
+}
+\examples{
+\dontrun{
+
+# example from glmnet package
+set.seed(10101)
+library(glmnet)
+library(survival)
+library(peperr)
+
+N=1000;p=30
+nzc=p/3
+x=matrix(rnorm(N*p),N,p)
+beta=rnorm(nzc)
+fx=x[,seq(nzc)]\%*\%beta/3
+hx=exp(fx)
+ty=rexp(N,hx)
+tcens=rbinom(n=N,prob=.3,size=1)# censoring indicator
+y=Surv(ty,1-tcens)
+
+peperr.object <- peperr(response=y, x=x,
+                        fit.fun=fit.glmnet, args.fit=list(family="cox"),
+                        complexity=complexity.glmnet,
+                        args.complexity=list(family="cox",nfolds=10),
+                        indices=resample.indices(n=N, method="sub632", sample.n=10))
+
+# pointwise bootstrap quantiles and all error types
+Plot.peperr.curves(peperr.object, allErrors=TRUE)
+
+# individual bootstrap runs and selected error types
+Plot.peperr.curves(peperr.object, allErrors=FALSE, bootRuns=TRUE)
+}
+}
+\keyword{models} \keyword{regression} \keyword{survival}
+
+\author{
+Thomas Hielscher
+\email{t.hielscher@dkfz.de}
+}
+\references{
+Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.}
+\url{https://www.jstatsoft.org/v62/i05/}
+}
+\seealso{\code{\link[peperr]{peperr}}}
diff --git a/man/aggregation.auc.Rd b/man/aggregation.auc.Rd
new file mode 100644
index 0000000..0f8352e
--- /dev/null
+++ b/man/aggregation.auc.Rd
@@ -0,0 +1,84 @@
+\name{aggregation.auc}
+\alias{aggregation.auc}
+\title{Determine the area under the ROC curve for a fitted model}
+\description{
+Evaluate the area under the ROC curve for a fitted model on new data. To be used as argument \code{aggregation.fun} in a \code{peperr} call.
+}
+\usage{
+aggregation.auc(full.data=NULL, response, x, model, cplx=NULL,
+type=c("apparent", "noinf"), fullsample.attr = NULL, \dots)
+}
+\arguments{
+\item{full.data}{passed from \code{peperr}, but not used for calculation.}
+\item{response}{vector of binary response.}
+\item{x}{\code{n*p} matrix of covariates.}
+\item{model}{model fitted as returned by a \code{fit.fun}, as used in a call to \code{peperr}.}
+\item{cplx}{passed from \code{peperr}, but not necessary for calculation.}
+\item{type}{character; \code{"apparent"} or \code{"noinf"}.}
+\item{fullsample.attr}{passed from \code{peperr}, but not necessary for calculation.}
+\item{\dots}{additional arguments, passed to the \code{predict} function.}
+}
+\details{
+The area under the ROC curve is calculated based on the internal \code{glmnet:::auc} function from package \code{glmnet}.
+
+}
+\value{
+Scalar, indicating the area under the ROC curve.
+} + +\author{ +Thomas Hielscher \ +\email{t.hielscher@dkfz.de} +} + +\seealso{\code{\link[peperr]{peperr}}} + +\examples{ +\dontrun{ +# binomial model - classification + +library(c060) +library(gridExtra) +library(ggplot2) + +set.seed(0815) +x <- matrix(rnorm(100*20),100,20) +y <- sample(0:1,100,replace=TRUE) + +peperr_obj <- peperr(response=y, x=x, fit.fun=fit.glmnet, args.fit=list(family="binomial"), + complexity=complexity.glmnet, args.complexity=list(nfolds=10, family="binomial"), + trace=FALSE, RNG="fixed",seed=0815, +# aggregation.fun=c060:::aggregation.misclass, +# aggregation.fun=c060:::aggregation.brier, + aggregation.fun=c060:::aggregation.auc, + indices=resample.indices(n=nrow(x), sample.n = 100, method = "sub632")) + +tmp <- data.frame(grp="",error=unlist(peperr_obj$sample.error)) +errs <- data.frame(error=c(perr(peperr_obj,"resample"), + perr(peperr_obj,"632p"),perr(peperr_obj,"app"), + perr(peperr_obj,"nullmodel")), col = c("red","blue","green","brown"), + row.names=c("mean\nout-of-bag",".632plus","apparent","null model")) + +p <- ggplot(tmp, aes(grp,error)) +pg <- p + geom_boxplot(outlier.colour = rgb(0,0,0,0), outlier.size=0) + + geom_jitter(position=position_jitter(width=.1)) + + theme_bw() + scale_y_continuous("AUC") + scale_x_discrete("") + + geom_hline(aes(yintercept=error, colour=col), data=errs, show.legend=TRUE) + + scale_colour_identity("error type", guide = "legend", breaks=errs$col, + labels=rownames(errs)) + + ggtitle("AUC \n in bootstrap samples ") + +p2 <- ggplot(data.frame(complx=peperr_obj$sample.complexity), aes(x=complx)) +pg2 <- p2 + geom_histogram(binwidth = 0.02, fill = "white", colour="black") + + theme_bw()+ xlab(expression(lambda)) + + ylab("frequency") + + geom_vline(xintercept=peperr_obj$selected.complexity, colour="red") + + ggtitle("Selected complexity \n in bootstrap samples") + + ggplot2::annotate("text", x = 0.12, y = -0.5, + label = "full data", colour="red", size=4) + +grid.arrange(pg2, pg, ncol=2) + +}} + +\keyword{models} \keyword{regression} \keyword{classification} \ No newline at end of file diff --git a/man/balancedFolds.Rd b/man/balancedFolds.Rd new file mode 100644 index 0000000..99caf1f --- /dev/null +++ b/man/balancedFolds.Rd @@ -0,0 +1,50 @@ +\name{balancedFolds} +\alias{balancedFolds} + +\title{ Function producing stratified/balanced folds for cross-validation } +\description{ + Get balanced folds for cross-validation, which are used for tuning penalization parameters +} + +\usage{ +balancedFolds(class.column.factor, cross.outer) +} + + +\arguments{ + \item{class.column.factor}{class labels of length n } + \item{cross.outer}{ number of folds} +} + + + +\value{ + \item{permutated.cut }{vector of length n, indicating which fold each observation belongs to} +} +\author{Natalia Becker (\email{natalia.becker@dkfz.de}) } + +\seealso{\code{\link{EPSGO}}} + +\keyword{models} +\keyword{multivariate} +\keyword{iteration} +\keyword{optimize} +\references{ +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +}
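+\examples{
+\dontrun{
+# A minimal usage sketch on simulated class labels (illustrative only,
+# not taken from the package):
+set.seed(1234)
+y <- factor(sample(c(0,1), 100, replace=TRUE))
+folds <- balancedFolds(class.column.factor=y, cross.outer=5)
+# class proportions should be roughly preserved within each fold
+table(folds, y)
+}
+}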
diff --git a/man/coef.intsearch.Rd b/man/coef.intsearch.Rd new file mode 100644 index 0000000..6f5e035 --- /dev/null +++ b/man/coef.intsearch.Rd @@ -0,0 +1,32 @@ +\name{coef.sum.intsearch} +\alias{coef.sum.intsearch} + +\title{ +Get coefficients for a model +} +\description{ +Get coefficients for a model after applying interval search for tuning parameters. +} +\usage{ +\method{coef}{sum.intsearch}(object,\dots) +} +\arguments{ +\item{object}{ an object as returned by the function \code{summary.intsearch}.} +\item{\dots}{additional argument(s)} +} +\value{ + named vector of non-zero coefficients for the optimal lambda} +\author{ +Natalia Becker \ +\email{natalia.becker@dkfz.de} +} + +\seealso{ \code{\link{EPSGO}}, \code{\link{summary.intsearch}},\code{\link{plot.sum.intsearch}} +} + +\keyword{system} +\references{ +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} \ No newline at end of file diff --git a/man/complexity.glmnet.Rd b/man/complexity.glmnet.Rd new file mode 100644 index 0000000..fa1eb2d --- /dev/null +++ b/man/complexity.glmnet.Rd @@ -0,0 +1,50 @@ +\name{complexity.glmnet} +\alias{complexity.glmnet} +\title{Interface for determination of penalty lambda in penalized regression model via cross-validation} +\description{ +Determines the amount of shrinkage for a penalized regression model fitted by glmnet via cross-validation, conforming to the calling convention required by argument \code{complexity} in a \code{peperr} call. +} +\usage{ +complexity.glmnet(response, x, full.data, ...) +} +\arguments{ + \item{response}{a survival object (with \code{Surv(time, status)}), or a binary vector with entries 0 and 1.} + \item{x}{\code{n*p} matrix of covariates.} + \item{full.data}{data frame containing response and covariates of the full data set.} + \item{\dots}{additional arguments passed to the \code{cv.glmnet} call, such as \code{family}.} + } +\value{ +Scalar value giving the optimal lambda. +} +\details{ +This function is basically a wrapper for \code{cv.glmnet} of package \code{glmnet}. An n-fold cross-validation (default n=10) is performed to determine the optimal penalty lambda. +For Cox PH regression models the deviance based on the penalized partial log-likelihood is used as loss function. For binary endpoints other loss functions are available as well (see \code{type.measure}); deviance is the default. When calling \code{peperr}, the default arguments of \code{cv.glmnet} can be changed by passing a named list containing these as argument \code{args.complexity}. +Note that only penalized Cox PH (\code{family="cox"}) and logistic regression models (\code{family="binomial"}) are sensible for prediction error +evaluation with package \code{peperr}. +}
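+\examples{
+\dontrun{
+# A minimal sketch on simulated binary data (illustrative only, not
+# taken from the package):
+set.seed(0815)
+x <- matrix(rnorm(100*20), 100, 20)
+y <- sample(0:1, 100, replace=TRUE)
+lambda.opt <- complexity.glmnet(response=y, x=x,
+                                full.data=data.frame(response=y, x),
+                                family="binomial")
+}
+}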
+\references{ + Friedman, J., Hastie, T. and Tibshirani, R. (2008) + \emph{Regularization Paths for Generalized Linear Models via Coordinate + Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr + \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr + \url{https://www.jstatsoft.org/v33/i01/}\cr + Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) + \emph{Regularization Paths for Cox's Proportional Hazards Model via + Coordinate Descent, Journal of Statistical Software, Vol. 39(5) + 1-13}\cr + \url{https://www.jstatsoft.org/v39/i05/}\cr + Porzelius, C., Binder, H., and Schumacher, M. (2009) + \emph{Parallelized prediction error estimation for evaluation of high-dimensional models, + Bioinformatics, Vol. 25(6), 827-829.}\cr +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} + +\author{ +Thomas Hielscher \ +\email{t.hielscher@dkfz.de} +} + +\seealso{\code{\link[peperr]{peperr}}, \code{\link[glmnet]{cv.glmnet}}} +\keyword{models} \keyword{penalized regression} diff --git a/man/fit.glmnet.Rd b/man/fit.glmnet.Rd new file mode 100644 index 0000000..513bde6 --- /dev/null +++ b/man/fit.glmnet.Rd @@ -0,0 +1,50 @@ +\name{fit.glmnet} +\alias{fit.glmnet} +\title{Interface function for fitting a penalized regression model with \code{glmnet}} +\description{ +Interface for fitting penalized regression models for binary or survival endpoints using \code{glmnet}, conforming to the requirements for argument \code{fit.fun} in a \code{peperr} call. +} +\usage{ +fit.glmnet(response, x, cplx, ...) +} +\arguments{ + \item{response}{a survival object (with \code{Surv(time, status)}), or a binary vector with entries 0 and 1.} + \item{x}{\code{n*p} matrix of covariates.} + \item{cplx}{lambda penalty value.} + \item{\dots}{additional arguments passed to the \code{glmnet} call, such as \code{family}.} +} +\value{ +glmnet object +} + +\details{ +This function is basically a wrapper for \code{glmnet} of package \pkg{glmnet}. +Note that only penalized Cox PH (\code{family="cox"}) and logistic regression models (\code{family="binomial"}) are sensible for prediction error +evaluation with package \code{peperr}. +}
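+\examples{
+\dontrun{
+# A minimal sketch on simulated binary data (illustrative only; cplx is
+# the lambda penalty, set to an arbitrary value here):
+set.seed(0815)
+x <- matrix(rnorm(100*20), 100, 20)
+y <- sample(0:1, 100, replace=TRUE)
+fit <- fit.glmnet(response=y, x=x, cplx=0.1, family="binomial")
+}
+}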
+\references{ + Friedman, J., Hastie, T. and Tibshirani, R. (2008) + \emph{Regularization Paths for Generalized Linear Models via Coordinate + Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr + \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr + \url{https://www.jstatsoft.org/v33/i01/}\cr + Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) + \emph{Regularization Paths for Cox's Proportional Hazards Model via + Coordinate Descent, Journal of Statistical Software, Vol. 39(5) + 1-13}\cr + \url{https://www.jstatsoft.org/v39/i05/}\cr + Porzelius, C., Binder, H., and Schumacher, M. (2009) + \emph{Parallelized prediction error estimation for evaluation of high-dimensional models, + Bioinformatics, Vol. 25(6), 827-829.}\cr +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} + +\author{ +Thomas Hielscher \ +\email{t.hielscher@dkfz.de} +} + +\seealso{ \code{\link[peperr]{peperr}}, \code{\link[glmnet]{glmnet}}} +\keyword{models} \keyword{penalized regression} \ No newline at end of file diff --git a/man/plot.stabpath.Rd b/man/plot.stabpath.Rd new file mode 100644 index 0000000..d20859d --- /dev/null +++ b/man/plot.stabpath.Rd @@ -0,0 +1,79 @@ +\name{plot.stabpath} +\alias{plot.stabpath} +\title{ +function to plot a stability path +} +\description{ +Given a desired family-wise error rate (FWER) and a stability path calculated with \code{stabpath}, the function selects a stable set of features and plots the stability path and the corresponding regularization path. +} +\usage{ +\method{plot}{stabpath}(x, error=0.05, type=c("pfer","pcer"), pi_thr=0.6, xvar=c("lambda", "norm", "dev"), + col.all="black", col.sel="red", ...) +} +\arguments{ + \item{x}{ + an object of class "stabpath" as returned by the function \code{stabpath}. +} +\item{error}{ +the desired type I error level w.r.t. the chosen type I error rate. +} +\item{type}{ +the type I error rate used for controlling the number of falsely selected variables. If \code{type="pfer"} the per-family error rate is controlled and \code{error} corresponds to the expected number of type I errors. +Selecting \code{type="pfer"} and \code{error} in the range of 0 < \code{error} < 1 will control the family-wise error rate, i.e. the probability that at least one variable in the estimated stable set has been falsely selected. +If \code{type="pcer"} the per-comparison error rate is controlled and \code{error} corresponds to the expected number of type I errors divided by the number of variables. + } + \item{pi_thr}{ +the threshold used for the stability selection, should be in the range of 0.5 < pi_thr < 1. +} + \item{xvar}{ +the variable used for the x-axis, e.g. for "lambda" the selection probabilities are plotted along the log of the penalization parameters, +for "norm" along the L1-norm and for "dev" along the fraction of explained deviance. +} + \item{col.all}{ +the color used for the variables that are not in the estimated stable set +} + \item{col.sel}{ +the color used for the variables in the estimated stable set +} + \item{...}{ +further arguments that are passed to \code{matplot} + } +} +\value{ +a list of five objects + \item{stable}{ + a vector giving the positions of the estimated stable variables + } + \item{lambda}{ + the penalization parameter used for the stability selection + } + \item{lpos}{ + the position of the penalization parameter in the regularization path + } + \item{error}{the desired type I error level w.r.t. the chosen type I error rate + } + \item{type}{the type I error rate + } +} +\author{ +Martin Sill \ +\email{m.sill@dkfz.de} +} +\references{ +Meinshausen N. and B\"uhlmann P. (2010), Stability Selection, Journal of the Royal Statistical Society: Series B (Statistical Methodology) Volume 72, Issue 4, pages 417--473.\cr +Sill M., Hielscher T., Becker N. and Zucknick M.
(2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\seealso{ \code{\link{stabsel},\link{stabpath}} +} +\examples{ +\dontrun{ +#gaussian +set.seed(1234) +x=matrix(rnorm(100*1000,0,1),100,1000) +y <- x[1:100,1:1000]\%*\%c(rep(2,5),rep(-2,5),rep(.1,990)) +res <- stabpath(y,x,weakness=1,mc.cores=2) +plot(res,error=.5,type='pfer') +} +} +\keyword{stability selection} \ No newline at end of file diff --git a/man/plot.summary.int.Rd b/man/plot.summary.int.Rd new file mode 100644 index 0000000..1b91eb9 --- /dev/null +++ b/man/plot.summary.int.Rd @@ -0,0 +1,36 @@ +\name{plot.sum.intsearch} +\alias{plot.sum.intsearch} +\title{ +Plot Summary object for interval search models +} +\description{ +Produces a plot for the summary object of a fitted interval search model, plotting the 'visited' points against the iteration steps. The first \code{startN} points are the initial points selected before the interval search starts. +} +\usage{ +\method{plot}{sum.intsearch}(x,type="summary",startN=21,... ) +} +\arguments{ + \item{x}{ + an object of class \code{sum.intsearch} as returned by the function \code{summary.intsearch}. + } + \item{type}{type of plot to be drawn, \code{type="summary"} will plot the partial log likelihood deviance as a function of both tuning parameters \ifelse{html}{\out{α}}{\eqn{\alpha}{alpha}} and log\ifelse{html}{\out{λ}}{\eqn{\lambda}{lambda}}. The final solution will be highlighted by a solid red line. +Alternatively, \code{type="points"} will draw the distribution of initial and visited points of the interval search plotted in chronological order.} + \item{startN}{number of initial points. Needed if \code{type="points"}.} + \item{...}{additional argument(s)} +} + + +\author{ +Natalia Becker \ +\email{natalia.becker@dkfz.de} +} + +\seealso{ \code{\link{EPSGO}}, \code{\link{summary.intsearch}} +} +\references{ +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\keyword{plot} diff --git a/man/predictProb.coxnet.Rd b/man/predictProb.coxnet.Rd new file mode 100644 index 0000000..246a0e8 --- /dev/null +++ b/man/predictProb.coxnet.Rd @@ -0,0 +1,48 @@ +\name{predictProb.coxnet} +\alias{predictProb.coxnet} +\title{Extract predicted survival probabilities from a glmnet fit} +\description{ +Extracts predicted survival probabilities from a survival model fitted by glmnet, providing an interface as required by \code{pmpec}. +} +\usage{ +\method{predictProb}{coxnet}(object, response, x, times, complexity, ...) +} +\arguments{ +\item{object}{a fitted model of class \code{coxnet}, as returned by \code{glmnet} for \code{family="cox"}.} +\item{response}{a two-column matrix with columns named 'time' and 'status'. The latter is a binary variable, with '1' indicating death, and '0' indicating right censored. The function \code{Surv()} in package \code{survival} produces such a matrix.} +\item{x}{\code{n*p} matrix of covariates.} +\item{times}{vector of evaluation time points.} +\item{complexity}{lambda penalty value.} +\item{\dots}{additional arguments, currently not used.} +} + +\value{ +Matrix with probabilities for each evaluation time point in \code{times} (columns) and each new observation (rows). +}
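+\examples{
+\dontrun{
+# A minimal sketch with simulated survival data (illustrative only,
+# not taken from the package):
+library(glmnet)
+library(survival)
+library(peperr)
+set.seed(10101)
+N <- 100; p <- 30
+x <- matrix(rnorm(N*p), N, p)
+ty <- rexp(N, exp(x[,1]/3))
+tcens <- rbinom(n=N, prob=.3, size=1)
+y <- cbind(time=ty, status=1-tcens)
+fit <- glmnet(x, y, family="cox")
+# survival probabilities at selected time points for lambda=0.1
+probs <- predictProb(fit, response=y, x=x,
+                     times=quantile(ty, c(.25,.5,.75)), complexity=0.1)
+}
+}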
+\references{ + Friedman, J., Hastie, T. and Tibshirani, R. (2008) + \emph{Regularization Paths for Generalized Linear Models via Coordinate + Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr + \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr + \url{https://www.jstatsoft.org/v33/i01/}\cr + Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) + \emph{Regularization Paths for Cox's Proportional Hazards Model via + Coordinate Descent, Journal of Statistical Software, Vol. 39(5) + 1-13}\cr + \url{https://www.jstatsoft.org/v39/i05/}\cr + Porzelius, C., Binder, H., and Schumacher, M. (2009) + \emph{Parallelized prediction error estimation for evaluation of high-dimensional models, + Bioinformatics, Vol. 25(6), 827-829.}\cr + Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} + \url{https://www.jstatsoft.org/v62/i05/} +} + +\author{ +Thomas Hielscher \ +\email{t.hielscher@dkfz.de} +} + + +\seealso{\code{\link[c060]{predictProb.glmnet}},\code{\link[peperr]{peperr}}, \code{\link[glmnet]{glmnet}}} +\keyword{models} \keyword{penalized regression} \keyword{survival} \ No newline at end of file diff --git a/man/predictProb.glmnet.Rd b/man/predictProb.glmnet.Rd new file mode 100644 index 0000000..1958002 --- /dev/null +++ b/man/predictProb.glmnet.Rd @@ -0,0 +1,47 @@ +\name{predictProb.glmnet} +\alias{predictProb.glmnet} +\title{Extract predicted survival probabilities from a glmnet fit} +\description{ +Extracts predicted survival probabilities from a survival model fitted by glmnet, providing an interface as required by \code{pmpec}. +} +\usage{ +\method{predictProb}{glmnet}(object, response, x, times, complexity, ...) +} +\arguments{ +\item{object}{a fitted model of class \code{glmnet}.} +\item{response}{response variable. Quantitative for \code{family="gaussian"}, or \code{family="poisson"} (non-negative counts). For \code{family="binomial"}, should be either a factor with two levels, or a two-column matrix of counts or proportions. For \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a matrix with \code{nc} columns of counts or proportions.} +\item{x}{\code{n*p} matrix of covariates.} +\item{times}{vector of evaluation time points.} +\item{complexity}{lambda penalty value.} +\item{\dots}{additional arguments, currently not used.} +} + +\value{ +Matrix with probabilities for each evaluation time point in \code{times} (columns) and each new observation (rows). +}
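+\examples{
+\dontrun{
+# A minimal sketch, analogous to the example for predictProb.coxnet
+# (illustrative only; calls the unexported S3 method directly):
+library(glmnet)
+library(peperr)
+set.seed(10101)
+N <- 100; p <- 30
+x <- matrix(rnorm(N*p), N, p)
+ty <- rexp(N, exp(x[,1]/3))
+y <- cbind(time=ty, status=rbinom(n=N, prob=.7, size=1))
+fit <- glmnet(x, y, family="cox")
+probs <- c060:::predictProb.glmnet(fit, response=y, x=x,
+                                   times=quantile(ty, c(.25,.5,.75)),
+                                   complexity=0.1)
+}
+}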
+\references{ + Friedman, J., Hastie, T. and Tibshirani, R. (2008) + \emph{Regularization Paths for Generalized Linear Models via Coordinate + Descent}, \url{https://web.stanford.edu/~hastie/Papers/glmnet.pdf}\cr + \emph{Journal of Statistical Software, Vol. 33(1), 1-22 Feb 2010}\cr + \url{https://www.jstatsoft.org/v33/i01/}\cr + Simon, N., Friedman, J., Hastie, T., Tibshirani, R. (2011) + \emph{Regularization Paths for Cox's Proportional Hazards Model via + Coordinate Descent, Journal of Statistical Software, Vol. 39(5) + 1-13}\cr + \url{https://www.jstatsoft.org/v39/i05/}\cr + Porzelius, C., Binder, H., and Schumacher, M. (2009) + \emph{Parallelized prediction error estimation for evaluation of high-dimensional models, + Bioinformatics, Vol. 25(6), 827-829.}\cr + Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} + \url{https://www.jstatsoft.org/v62/i05/} +} + +\author{ +Thomas Hielscher \ +\email{t.hielscher@dkfz.de} +} + +\seealso{\code{\link[c060]{predictProb.coxnet}}, \code{\link[peperr]{peperr}}, \code{\link[glmnet]{glmnet}}} +\keyword{models} \keyword{penalized regression} \keyword{survival} \ No newline at end of file diff --git a/man/stabpath.Rd b/man/stabpath.Rd new file mode 100644 index 0000000..5eff5da --- /dev/null +++ b/man/stabpath.Rd @@ -0,0 +1,121 @@ +\name{stabpath} +\alias{stabpath} +\title{ +Stability path for glmnet models +} +\description{ +The function calculates the stability path for glmnet models, i.e. the selection probabilities of the features along the range of regularization parameters. +} +\usage{ +stabpath(y,x,size=0.632,steps=100,weakness=1,mc.cores=getOption("mc.cores", 2L),...) +} +\arguments{ +\item{y}{ +response variable. As for the glmnet function: Quantitative for \code{family="gaussian"} or \code{family="poisson"} (non-negative counts). For \code{family="binomial"} should be either a factor with two levels, or a two-column matrix of counts or proportions. For \code{family="multinomial"}, can be a \code{nc>=2} level factor, or a matrix with \code{nc} columns of counts or proportions. For \code{family="cox"}, \code{y} should be a two-column matrix with columns named 'time' and 'status'. The latter is a binary variable, with '1' indicating death, and '0' indicating right censored. The function \code{Surv()} in package \code{survival} produces such a matrix. +} + \item{x}{ +input matrix. As for the glmnet function: of dimension nobs x nvars; each row is an observation vector. Can be in sparse matrix format (inherit from class \code{"sparseMatrix"} as in package \code{Matrix}; not yet available for \code{family="cox"}) +} + \item{size}{ +proportion of samples drawn in every subsample used for the stability selection. +} + \item{steps}{ +number of subsamples used for the stability selection. +} + \item{weakness}{ +weakness parameter used for the randomised lasso as described in Meinshausen and B\"uhlmann (2010). +For each subsample the features are reweighted by a random weight uniformly sampled in [weakness,1]. +This additional randomisation leads to a more consistent estimation of the stable set of features. +} + \item{mc.cores}{ +number of cores used for the parallelization. For Unix-like systems the parallelization is done by forking using the function \code{mclapply}; for Windows systems socket clusters are used. +} + \item{...}{ +further arguments that are passed to the \code{glmnet} function. +} +} +\value{ +an object of class "stabpath", which is a list of three objects + \item{fit}{ + the fit object of class "glmnet" as returned from the glmnet function when applied to the complete data set. + } + \item{stabpath}{ + a matrix which represents the stability path. + } + \item{qs}{ + a vector holding the values of the average number of non-zero coefficients w.r.t. the lambdas in the regularization path. + } +} +\author{ +Martin Sill +\email{m.sill@dkfz.de} +} +\references{ +Meinshausen N. and B\"uhlmann P. (2010), \emph{Stability Selection, Journal of the Royal Statistical Society: Series B (Statistical Methodology) Volume 72, Issue 4, pages 417--473.}\cr + +Sill M., Hielscher T., Becker N. and Zucknick M.
(2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\seealso{ \code{\link{glmnet},\link{stabsel},\link{plot.stabpath}} +} +\examples{\dontrun{ +#gaussian +set.seed(1234) +x <- matrix(rnorm(100*1000,0,1),100,1000) +y <- x[1:100,1:1000]\%*\% c(rep(2,5),rep(-2,5),rep(.1,990)) +res <- stabpath(y,x,weakness=1,mc.cores=2) +plot(res) + +#binomial +y=sample(1:2,100,replace=TRUE) +res <- stabpath(y,x,weakness=1,mc.cores=2,family="binomial") +plot(res) + +#multinomial +y=sample(1:4,100,replace=TRUE) +res <- stabpath(y,x,weakness=1,mc.cores=2,family="multinomial") +plot(res) + +#poisson +N=100; p=1000 +nzc=5 +x=matrix(rnorm(N*p),N,p) +beta=rnorm(nzc) +f = x[,seq(nzc)]\%*\%beta +mu=exp(f) +y=rpois(N,mu) +res <- stabpath(y,x,weakness=1,mc.cores=2,family="poisson") +plot(res) + +#Cox +library(survival) +set.seed(10101) +N=100;p=1000 +nzc=p/3 +x=matrix(rnorm(N*p),N,p) +beta=rnorm(nzc) +fx=x[,seq(nzc)]\%*\%beta/3 +hx=exp(fx) +ty=rexp(N,hx) +tcens=rbinom(n=N,prob=.3,size=1) +y=cbind(time=ty,status=1-tcens) +res <- stabpath(y,x,weakness=1,mc.cores=2,family="cox") +plot(res) +} +} +\keyword{stability selection} diff --git a/man/stabsel.Rd b/man/stabsel.Rd new file mode 100644 index 0000000..fd6dd95 --- /dev/null +++ b/man/stabsel.Rd @@ -0,0 +1,66 @@ +\name{stabsel} +\alias{stabsel} +\title{ +function to estimate a stable set of variables +} +\description{ +Given a desired type I error rate and a stability path calculated with \code{stabpath}, the function selects a stable set of variables. +} +\usage{ +stabsel(x,error=0.05,type=c("pfer","pcer"),pi_thr=0.6) +} +\arguments{ + \item{x}{ + an object of class "stabpath" as returned by the function \code{stabpath}. +} + \item{error}{ +the desired type I error level w.r.t. the chosen type I error rate. +} + \item{type}{ +the type I error rate used for controlling the number of falsely selected variables. If \code{type="pfer"} the per-family error rate is controlled and \code{error} corresponds to the expected number of type I errors. +Selecting \code{type="pfer"} and \code{error} in the range of $0 < \code{error} < 1$ will control the family-wise error rate, i.e. the probability that at least one variable in the estimated stable set has been falsely selected. +If \code{type="pcer"} the per-comparison error rate is controlled and \code{error} corresponds to the expected number of type I errors divided by the number of variables. + } + \item{pi_thr}{ +the threshold used for the stability selection, should be in the range of $0.5 < pi_thr < 1$. +} +} +\value{ +a list of five objects + \item{stable}{ + a vector giving the positions of the estimated stable variables + } + \item{lambda}{ + the penalization parameter used for the stability selection + } + \item{lpos}{ + the position of the penalization parameter in the regularization path + } + \item{error}{the desired type I error level w.r.t. the chosen type I error rate + } + \item{type}{the type I error rate + } +} +\author{ +Martin Sill \ +\email{m.sill@dkfz.de} +} +\references{ +Meinshausen N. and B\"uhlmann P. (2010), \emph{Stability Selection, Journal of the Royal Statistical Society: Series B (Statistical Methodology) Volume 72, Issue 4, pages 417--473.}\cr + +Sill M., Hielscher T., Becker N. and Zucknick M.
(2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\seealso{ \code{\link{plot.stabpath},\link{stabpath}} +} +\examples{ +\dontrun{ +#gaussian +set.seed(1234) +x=matrix(rnorm(100*1000,0,1),100,1000) +y <- x[1:100,1:1000]\%*\%c(rep(2,5),rep(-2,5),rep(.1,990)) +res <- stabpath(y,x,weakness=1,mc.cores=2) +stabsel(res,error=0.05,type="pfer") +}} +\keyword{stability selection} diff --git a/man/summary.intsearch.Rd b/man/summary.intsearch.Rd new file mode 100644 index 0000000..7fa6351 --- /dev/null +++ b/man/summary.intsearch.Rd @@ -0,0 +1,48 @@ +\name{summary.intsearch} +\alias{summary.intsearch} +\title{ +Summary method for interval search models +} +\description{ +Produces a summary of a fitted interval search model. +} +\usage{ +\method{summary}{intsearch}(object,digits = max(3, getOption("digits") - 3), verbose=TRUE,first.n=5,...) +} +\arguments{ + \item{object}{ an object of class \code{intsearch} as returned by the function \code{EPSGO}.} + \item{digits}{ number of digits to show after the decimal point } + \item{verbose}{ verbose output; default is TRUE. } + \item{first.n}{ show the first \code{first.n} entries; default is 5. } + \item{\dots}{additional argument(s)} +} + +\value{ + A list of the following elements + \item{info}{ + a data frame of four objects for the optimal models\cr + \itemize{ + \item alpha - a vector of alphas + \item lambda - a vector of penalization parameters lambda + \item deviances - a vector of deviances + \item n.features - a vector of the number of features selected in each optimal model + } + } + \item{opt.alpha}{ the optimal value for alpha} + \item{opt.lambda}{the optimal value for lambda} + \item{opt.error}{ the optimal value for the error, here the minimal deviance} + \item{opt.models}{ a list of optimal models with the same optimal error} +} +\author{ +Natalia Becker \ +\email{natalia.becker@dkfz.de} +} + +\seealso{ \code{\link{EPSGO}},\code{\link{plot.sum.intsearch}} +} +\references{ +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\keyword{summary} diff --git a/man/tune.glmnet.interval.Rd b/man/tune.glmnet.interval.Rd new file mode 100644 index 0000000..c50a985 --- /dev/null +++ b/man/tune.glmnet.interval.Rd @@ -0,0 +1,85 @@ +\name{tune.glmnet.interval} + +\alias{tune.glmnet.interval} + +\title{ Wrapper function for \code{glmnet} objects. } +\description{ + Wrapper function for \code{glmnet} objects, mainly used within the function \code{\link{epsgo}}. +} + + + +\usage{ +tune.glmnet.interval(parms, x, y, + weights, + offset = NULL, + lambda = NULL, + type.measure = c("mse", "deviance", "class", "auc", "mae"), + seed=12345, + nfolds = 10, + foldid=NULL, + grouped = TRUE, + type.min=c("lambda.min", "lambda.1se"), + family, + verbose=FALSE, + \dots) +} + +\arguments{ + \item{parms}{tuning parameter alpha for the \code{glmnet} object} + \item{x,y}{x is a matrix where each row refers to a sample and each + column refers to a gene; y is a factor which includes the class for + each sample} + \item{weights}{observation weights.
Can be total counts if responses are proportion matrices. Default is 1 for each observation.} + \item{offset}{A vector of length nobs that is included in the linear predictor (a nobs x nc matrix for the "multinomial" family). Useful for the "poisson" family (e.g. log of exposure time), or for refining a model by starting at a current fit. Default is NULL. If supplied, then values must also be supplied to the predict function.} + \item{lambda}{A user-supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on nlambda and lambda.min.ratio. Supplying a value of lambda overrides this. WARNING: use with care. Do not supply a single value for lambda (for predictions after CV use predict() instead). Supply instead a decreasing sequence of lambda values. glmnet relies on its warm starts for speed, and it is often faster to fit a whole path than to compute a single fit.} + \item{type.measure}{loss to use for cross-validation. Currently five options, not all available for all models. The default is type.measure="deviance", which uses squared-error for gaussian models (a.k.a type.measure="mse" there), deviance for logistic and poisson regression, and partial-likelihood for the Cox model. type.measure="class" applies to binomial and multinomial logistic regression only, and gives misclassification error. type.measure="auc" is for two-class logistic regression only, and gives area under the ROC curve. type.measure="mse" or type.measure="mae" (mean absolute error) can be used by all models except the "cox"; they measure the deviation from the fitted mean to the response.} + \item{seed}{seed} + \item{nfolds}{number of cross-validation folds, default 10.} + \item{foldid}{an optional vector of values between 1 and nfold identifying what fold each observation is in. If supplied, nfold can be missing.} + \item{grouped}{ This is an experimental argument, with default TRUE, and can be ignored by most users. For all models except the "cox", this refers to computing nfolds separate statistics, and then using their mean and estimated standard error to describe the CV curve. If grouped=FALSE, an error matrix is built up at the observation level from the predictions from the nfold fits, and then summarized (does not apply to type.measure="auc"). For the "cox" family, grouped=TRUE obtains the CV partial likelihood for the Kth fold by subtraction; by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the (K-1)/K dataset. This makes more efficient use of risk sets. With grouped=FALSE the log partial likelihood is computed only on the Kth fold.} + \item{type.min}{parameter for choosing the optimal model: 'lambda.min' - value of lambda that gives the minimum mean cross-validated error (cvm); + 'lambda.1se' - largest value of lambda such that the error is within one standard error of the minimum.} + \item{family}{family of the model, e.g. "cox" or "binomial".} + \item{verbose}{verbose output} + \item{\dots}{further parameters} +} + + + +\value{ + \item{q.val }{minimal value of Q.func on the interval defined by bounds.
Here, q.val is the minimum mean cross-validated error (cvm).} + \item{model }{ model list + \itemize{ + \item alpha - optimal alpha + \item lambda - optimal lambda + \item nfolds - number of cross-validation folds + \item cvreg - \code{cv.glmnet} object for the optimal alpha + \item fit - \code{glmnet} object for the optimal alpha and optimal lambda + } + } +} +\author{Natalia Becker (\email{natalia.becker@dkfz.de}) } + +\seealso{\code{\link{epsgo}}} +\references{ +Sill M., Hielscher T., Becker N. and Zucknick M. (2014), \emph{c060: Extended Inference with Lasso and Elastic-Net Regularized Cox and Generalized Linear Models, Journal of Statistical Software, Volume 62(5), pages 1--22.} +\url{https://www.jstatsoft.org/v62/i05/} +} +\keyword{models} +\keyword{multivariate} +\keyword{iteration} +\keyword{optimize}
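+\examples{
+\dontrun{
+# A minimal sketch of the intended use within epsgo (illustrative only,
+# adapted from the package vignette): tune alpha in [0,1]; for each
+# visited alpha, lambda is tuned internally via cv.glmnet.
+library(glmnet)
+set.seed(1234)
+x <- matrix(rnorm(100*20), 100, 20)
+y <- sample(0:1, 100, replace=TRUE)
+bounds <- t(data.frame(alpha=c(0, 1)))
+colnames(bounds) <- c("lower", "upper")
+fit <- epsgo(Q.func="tune.glmnet.interval", bounds=bounds,
+             parms.coding="none", seed=1234, show="none", fminlower=-100,
+             x=x, y=y, family="binomial",
+             type.min="lambda.1se", type.measure="mse")
+sumint <- summary(fit, verbose=TRUE)
+plot(sumint)        # see plot.sum.intsearch
+coef(sumint)        # see coef.sum.intsearch
+}
+}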