diff --git a/.gitignore b/.gitignore index 42da60f..1a4d98b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ src/*.dll #man/*.Rd *.tar.gz inst/doc +*.pdf + diff --git a/.travis.yml b/.travis.yml index 020bf80..655491f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,8 +22,6 @@ r: # BiocInstaller required for 'graph' package from Bioconductor bioc_required: true -# Install up-to-date dependencies from GitHub -r_github_packages: - - itsrainingdata/sparsebnUtils - - itsrainingdata/ccdrAlgorithm - - gujyjean/discretecdAlgorithm +# Set CXX1X for R-devel, as R-devel does not detect CXX1X support for gcc 4.6.3 +before_install: +- if [[ "$TRAVIS_R_VERSION_STRING" = 'devel' ]]; then mkdir ~/.R && echo 'CXX1X=g++ -std=c++0x -g -O2 -fPIC' > ~/.R/Makevars; fi diff --git a/DESCRIPTION b/DESCRIPTION index 6e55b46..4e65de2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,15 @@ Package: sparsebn Title: Learning Sparse Bayesian Networks from High-Dimensional Data -Version: 0.0.1 -Date: 2016-08-12 +Version: 0.0.2 +Date: 2016-11-28 Authors@R: person("Bryon", "Aragam", email = "sparsebn@gmail.com", role = c("aut", "cre")) Maintainer: Bryon Aragam <sparsebn@gmail.com> Description: Fast methods for learning sparse Bayesian networks from high-dimensional data using sparse regularization. Designed to incorporate mixed experimental and observational data with thousands of variables with either continuous or discrete observations. Depends: R (>= 3.2.3), - sparsebnUtils, - ccdrAlgorithm, - discretecdAlgorithm + sparsebnUtils (>= 0.0.3), + ccdrAlgorithm (>= 0.0.2), + discretecdAlgorithm (>= 0.0.2) Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 4335ef6..83ca040 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,15 @@ # Generated by roxygen2: do not edit by hand +S3method(plotDAG,edgeList) +S3method(plotDAG,sparsebnFit) +S3method(plotDAG,sparsebnPath) export(estimate.covariance) export(estimate.dag) export(estimate.precision) +export(plotDAG) import(ccdrAlgorithm) import(discretecdAlgorithm) import(sparsebnUtils) +importFrom(grDevices,gray) +importFrom(grDevices,rgb) +importFrom(graphics,plot) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..1986590 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,16 @@ +# sparsebn 0.0.2 + +## Features +* Added a `NEWS.md` file to track changes to the package +* Added `plotDAG` to provide a convenient default for plotting large graphs +* `estimate.dag` now takes an optional logical argument `adaptive`: If `TRUE`, then an adaptive version of the CD algorithm will be run for discrete data. This argument is ignored for continuous data. +* Vignette updated and re-written + +## Bug fixes + +* Cytometry data has been fixed to correct some errors in the network structure; now correctly reflects the network learned in Sachs et al. (2005) + +# sparsebn 0.0.1 + +* Initial stable release + diff --git a/R/sparsebn-main.R b/R/sparsebn-main.R index 8260245..9dafbbe 100644 --- a/R/sparsebn-main.R +++ b/R/sparsebn-main.R @@ -40,6 +40,7 @@ #' @param weight.scale (CD only) A positive number to scale the weight matrix. #' @param convLb (CD only) Small positive number used in Hessian approximation. #' @param upperbound (CD only) A large positive value used to truncate the adaptive weights. A -1 value indicates that there is no truncation. +#' @param adaptive (CD only) \code{TRUE / FALSE}. If \code{TRUE}, the adaptive algorithm will be run. #' @param verbose \code{TRUE / FALSE} whether or not to print out progress and summary reports.
#' #' @return A \code{\link[sparsebnUtils]{sparsebnPath}} object. @@ -49,7 +50,7 @@ #' \dontrun{ #' # Estimate a DAG from the cytometry data #' data(cytometryContinuous) -#' dat <- sparsebnData(cytometryContinuous$data, type = "d", ivn = cytometryContinuous$ivn) +#' dat <- sparsebnData(cytometryContinuous$data, type = "c", ivn = cytometryContinuous$ivn) #' estimate.dag(dat) #' } #' @@ -64,6 +65,7 @@ estimate.dag <- function(data, weight.scale = 1.0, convLb = 0.01, upperbound = 100.0, + adaptive = FALSE, verbose = FALSE ){ pp <- ncol(data$data) @@ -107,7 +109,8 @@ estimate.dag <- function(data, error.tol = error.tol, convLb = convLb, weight.scale = weight.scale, - upperbound = upperbound) + upperbound = upperbound, + adaptive = adaptive) } } @@ -132,7 +135,7 @@ estimate.dag <- function(data, #' #' \dontrun{ #' data(cytometryContinuous) -#' dat <- sparsebnData(cytometryContinuous$data, type = "d", ivn = cytometryContinuous$ivn) +#' dat <- sparsebnData(cytometryContinuous$data, type = "c", ivn = cytometryContinuous$ivn) #' estimate.covariance(dat) # estimate covariance #' estimate.precision(dat) # estimate precision #' } diff --git a/R/sparsebn-plotting.R b/R/sparsebn-plotting.R new file mode 100644 index 0000000..3155c1b --- /dev/null +++ b/R/sparsebn-plotting.R @@ -0,0 +1,83 @@ +# +# sparsebn-plotting.R +# sparsebn +# +# Created by Bryon Aragam (local) on 10/2/16. +# Copyright (c) 2016 Bryon Aragam. All rights reserved. +# + +# +# PACKAGE SPARSEBN: Default methods for plotting DAGs +# +# CONTENTS: +#     plotDAG +#     plotDAG.edgeList +#     plotDAG.sparsebnFit +#     plotDAG.sparsebnPath +# + +#' Plot a DAG +#' +#' Plot a DAG object using some sensible defaults for large graphs. Uses the \code{\link[igraph]{igraph}} +#' package by default. +#' +#' This method is not intended for customization. For more control over the output, use +#' \code{\link{plot}} and see \code{\link{setPlotPackage}} for plotting only and/or +#' \code{\link{setGraphPackage}} for even more control. These methods grant the user the full +#' feature set of the selected package. +#' +#' @param x An \code{\link{edgeList}}, \code{\link{sparsebnFit}}, or \code{\link{sparsebnPath}} object. +#' @param ... Additional arguments to \code{\link{plot}}. +#' +#' @export +plotDAG <- function(x, ...){ +    ### Must use igraph for the default method +    current_plot_pkg <- getPlotPackage() +    if(current_plot_pkg != "igraph"){ +        err_msg <- sprintf("This method requires that the 'igraph' package be set as your plotting package. You are currently set to use the '%s' package instead. Please set this package to be 'igraph', or use the default plot() method from the %s package instead. See ?setPlotPackage for more details.", current_plot_pkg, current_plot_pkg) +        stop(err_msg) +    } + +    UseMethod("plotDAG", x) +} + +#' @export +plotDAG.edgeList <- function(x, ...){ + +    # +    # Note that the defaults used here are not the same as for sparsebnPath! +    # +    plot(x, +         vertex.size = 4, +         vertex.label = NA, +         vertex.label.color = gray(0), +         vertex.color = rgb(1,0,0), +         edge.width = 1, +         edge.color = gray(0), +         edge.arrow.size = 0.5, +         ...) +} + +#' @export +plotDAG.sparsebnFit <- function(x, ...){ +    plotDAG(x$edges, ...) +} + +#' @export +plotDAG.sparsebnPath <- function(x, ...){ + +    # +    # This method does not delegate directly to sparsebnFit because +    # plotting a grid of graphs requires a different visual layout.
+ # + + plot(x, + vertex.size = 4, + vertex.label = NA, + vertex.label.color = gray(0), + vertex.color = rgb(1,0,0), + edge.width = 0.5, + edge.color = gray(0), + edge.arrow.size = 0.15, + ...) +} diff --git a/R/zzz.R b/R/zzz.R index 1d15f31..3c670da 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -7,10 +7,12 @@ # #' @import sparsebnUtils ccdrAlgorithm discretecdAlgorithm +#' @importFrom grDevices gray rgb +#' @importFrom graphics plot .onAttach <- function(libname, pkgname){ msg <- paste0("\n", - "sparsebn v0.0.1, Copyright (c) 2016\n", + "sparsebn v0.0.2, Copyright (c) 2016\n", "\tBryon Aragam, University of California, Los Angeles\n", "\tJiaying Gu, University of California, Los Angeles\n", "\tDacheng Zhang, University of California, Los Angeles\n", diff --git a/README.Rmd b/README.Rmd index 2bf3b9a..dcb13dd 100644 --- a/README.Rmd +++ b/README.Rmd @@ -15,6 +15,8 @@ knitr::opts_chunk$set( # sparsebn [![Travis-CI Build Status](https://travis-ci.org/itsrainingdata/sparsebn.svg?branch=master)](https://travis-ci.org/itsrainingdata/sparsebn) +[![](http://www.r-pkg.org/badges/version/sparsebn)](http://www.r-pkg.org/pkg/sparsebn) +[![CRAN RStudio mirror downloads](http://cranlogs.r-pkg.org/badges/sparsebn)](http://www.r-pkg.org/pkg/sparsebn) Methods for learning sparse Bayesian networks and other graphical models from high-dimensional data via sparse regularization. Designed to handle: diff --git a/README.md b/README.md index 46d6495..4393941 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ sparsebn ======== -[![Travis-CI Build Status](https://travis-ci.org/itsrainingdata/sparsebn.svg?branch=master)](https://travis-ci.org/itsrainingdata/sparsebn) +[![Travis-CI Build Status](https://travis-ci.org/itsrainingdata/sparsebn.svg?branch=master)](https://travis-ci.org/itsrainingdata/sparsebn) [![](http://www.r-pkg.org/badges/version/sparsebn)](http://www.r-pkg.org/pkg/sparsebn) [![CRAN RStudio mirror downloads](http://cranlogs.r-pkg.org/badges/sparsebn)](http://www.r-pkg.org/pkg/sparsebn) Methods for learning sparse Bayesian networks and other graphical models from high-dimensional data via sparse regularization. Designed to handle: diff --git a/cran-comments.md b/cran-comments.md index 850bb13..97b6088 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,28 +1,14 @@ ## Test environments -* local OS X install, R 3.3.1 -* ubuntu 12.04 (travis-ci), R 3.3.1 +* local OS X install, R 3.3.2 +* ubuntu 12.04.5 (travis-ci), R 3.3.2 (oldrel, devel, and release) * win-builder (devel and release) ## R CMD check results -There were no ERRORs or WARNINGs. +There were no ERRORs, WARNINGs, or NOTEs. -There was 1 NOTE: +## CRAN notes -* checking CRAN incoming feasibility ... NOTE -Maintainer: ‘Bryon Aragam ’ - -New submission - -This is the first version of this package that has been submitted to CRAN. - -## Re-submission notes -- I get - -Reading CITATION file fails with - $ operator is invalid for atomic vectors -when package is not installed. - -Fixed by replacing packageDecription with meta as described in Section 1.9 of R-exts. +The current warnings listed for v0.0.1 at https://cran.rstudio.com/web/checks/check_results_sparsebn.html are a result of recent updates to the dependencies for this submission. The current submission remedies all of these issues. ## Dependencies @@ -30,4 +16,4 @@ CHECK has been run on all dependencies and passed. ## Reverse dependencies -This is a new release, so there are no reverse dependencies. +There are no reverse dependencies. 
\ No newline at end of file diff --git a/data/cytometryContinuous.rda b/data/cytometryContinuous.rda index 78527fa..5e1589b 100644 Binary files a/data/cytometryContinuous.rda and b/data/cytometryContinuous.rda differ diff --git a/data/cytometryDiscrete.rda b/data/cytometryDiscrete.rda index 9fd3593..dbf6ac4 100644 Binary files a/data/cytometryDiscrete.rda and b/data/cytometryDiscrete.rda differ diff --git a/data/pathfinder.rda b/data/pathfinder.rda index 51cd7e2..29b80d3 100644 Binary files a/data/pathfinder.rda and b/data/pathfinder.rda differ diff --git a/man/estimate.covariance.Rd b/man/estimate.covariance.Rd index 272ccab..6e3baa4 100644 --- a/man/estimate.covariance.Rd +++ b/man/estimate.covariance.Rd @@ -32,7 +32,7 @@ see Sections 2.1 and 2.2 (equation (6)) of Aragam and Zhou (2015) for more detai \dontrun{ data(cytometryContinuous) -dat <- sparsebnData(cytometryContinuous$data, type = "d", ivn = cytometryContinuous$ivn) +dat <- sparsebnData(cytometryContinuous$data, type = "c", ivn = cytometryContinuous$ivn) estimate.covariance(dat) # estimate covariance estimate.precision(dat) # estimate precision } diff --git a/man/estimate.dag.Rd b/man/estimate.dag.Rd index 2889afc..f232abb 100644 --- a/man/estimate.dag.Rd +++ b/man/estimate.dag.Rd @@ -6,7 +6,8 @@ \usage{ estimate.dag(data, lambdas = NULL, lambdas.length = 20, error.tol = 1e-04, max.iters = NULL, edge.threshold = NULL, concavity = 2, - weight.scale = 1, convLb = 0.01, upperbound = 100, verbose = FALSE) + weight.scale = 1, convLb = 0.01, upperbound = 100, adaptive = FALSE, + verbose = FALSE) } \arguments{ \item{data}{Data as \code{\link[sparsebnUtils]{sparsebnData}}.} @@ -36,6 +37,8 @@ will be used and this value is otherwise ignored.} \item{upperbound}{(CD only) A large positive value used to truncate the adaptive weights. A -1 value indicates that there is no truncation.} +\item{adaptive}{(CD only) \code{TRUE / FALSE}. If \code{TRUE}, the adaptive algorithm will be run.} + \item{verbose}{\code{TRUE / FALSE} whether or not to print out progress and summary reports.} } \value{ @@ -54,7 +57,7 @@ and \code{\link[discretecdAlgorithm]{cd.run}}. \dontrun{ # Estimate a DAG from the cytometry data data(cytometryContinuous) -dat <- sparsebnData(cytometryContinuous$data, type = "d", ivn = cytometryContinuous$ivn) +dat <- sparsebnData(cytometryContinuous$data, type = "c", ivn = cytometryContinuous$ivn) estimate.dag(dat) } diff --git a/man/plotDAG.Rd b/man/plotDAG.Rd new file mode 100644 index 0000000..aef1cc1 --- /dev/null +++ b/man/plotDAG.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparsebn-plotting.R +\name{plotDAG} +\alias{plotDAG} +\title{Plot a DAG} +\usage{ +plotDAG(x, ...) +} +\arguments{ +\item{x}{An \code{\link{edgeList}}, \code{\link{sparsebnFit}}, or \code{\link{sparsebnPath}} object.} + +\item{...}{Additional arguments to \code{\link{plot}}.} +} +\description{ +Plot a DAG object using some sensible defaults for large graphs. Uses the \code{\link[igraph]{igraph}} +package by default. +} +\details{ +This method is not intended for customization. For more control over the output, use +\code{\link{plot}} and see \code{\link{setPlotPackage}} for plotting only and/or +\code{\link{setGraphPackage}} for even more control. These methods grant the user the full +feature set of the selected package.
+} + diff --git a/tests/testthat/helper-sparsebnUtils-generate_objects.R b/tests/testthat/helper-sparsebnUtils-generate_objects.R index 0ad8d15..8d974b9 100644 --- a/tests/testthat/helper-sparsebnUtils-generate_objects.R +++ b/tests/testthat/helper-sparsebnUtils-generate_objects.R @@ -1,3 +1,13 @@ +### Helper function to suppress output (form https://stat.ethz.ch/pipermail/r-help/2008-January/151471.html) +suppressFunctionOutput <- function(expr){ + this_will_die <- capture.output({expr}) +} + +### Return node names for use in function +helper_node_names <- function(){ + c("xyz", "abc", "123", "hij", "a1b2c3") +} + ### Generate fixed data -- need 5 columns to match objects below generate_fixed_data_frame <- function(){ x <- runif(5) @@ -44,7 +54,7 @@ generate_empty_SparseBlockMatrixR <- function(){ } generate_empty_sparsebnFit <- function(){ - li <- list(edges = generate_empty_edgeList(), lambda = 1, nedge = 0, pp = 1, nn = 10, time = 1) + li <- list(edges = generate_empty_edgeList(), nodes = "test", lambda = 1, nedge = 0, pp = 1, nn = 10, time = 1) sparsebnFit(li) } @@ -63,7 +73,7 @@ generate_empty_adjacency_matrix <- function(){ # # 0 . . . . # 1 0 . . . -# 0 1 0 . . +# 0 0 0 . . # 5 4 0 . . # 0 3 0 0 . # @@ -71,7 +81,7 @@ generate_fixed_edgeList <- function(){ nnode <- 5 li <- vector("list", length = nnode) li[[1]] <- c(2L,4L) - li[[2]] <- c(3L,4L,5L) + li[[2]] <- c(4L,5L) li[[3]] <- integer(0) li[[4]] <- integer(0) li[[5]] <- integer(0) @@ -80,6 +90,19 @@ generate_fixed_edgeList <- function(){ edgeL } +generate_fixed_graphNEL <- function(){ + V <- helper_node_names() + edL <- vector("list", length=5) + names(edL) <- V + edL[[1]] <- list(edges=c(), weights=runif(1)) # Edge list is + edL[[2]] <- list(edges=c(V[1]), weights=runif(1)) # to-from, not + edL[[3]] <- list(edges=c(), weights=runif(1)) # from-to! 
+ edL[[4]] <- list(edges=c(V[1], V[2]), weights=runif(2)) # + edL[[5]] <- list(edges=c(V[2]), weights=runif(1)) # + + graph::graphNEL(nodes=V, edgeL=edL, edgemode="directed") +} + generate_fixed_SparseBlockMatrixR <- function(){ nnode <- 5 li <- list(rows = vector("list", length = nnode), @@ -90,14 +113,14 @@ generate_fixed_SparseBlockMatrixR <- function(){ ### Parents / rows li$rows[[1]] <- c(2L,4L) - li$rows[[2]] <- c(3L,4L,5L) + li$rows[[2]] <- c(4L,5L) li$rows[[3]] <- integer(0) li$rows[[4]] <- integer(0) li$rows[[5]] <- integer(0) ### Values li$vals[[1]] <- c(1,5) - li$vals[[2]] <- c(1,4,3) + li$vals[[2]] <- c(4,3) li$vals[[3]] <- integer(0) li$vals[[4]] <- integer(0) li$vals[[5]] <- integer(0) @@ -110,16 +133,23 @@ generate_fixed_SparseBlockMatrixR <- function(){ SparseBlockMatrixR(li) } -generate_fixed_sparsebnFit <- function(){ +generate_fixed_sparsebnFit <- function(edges = generate_fixed_edgeList()){ # sbm <- generate_fixed_SparseBlockMatrixR() - edges <- generate_fixed_edgeList() - sbf <- sparsebnFit(list(edges = edges, lambda = 1.54, nedge = num.edges(edges), pp = num.nodes(edges), nn = 10, time = 1)) + # edges <- generate_fixed_edgeList() + # sbf <- sparsebnFit(list(edges = edges, nodes = LETTERS[1:num.nodes(edges)], lambda = 1.54, nedge = num.edges(edges), pp = num.nodes(edges), nn = 10, time = 1)) + sbf <- sparsebnFit(list(edges = edges, + nodes = helper_node_names()[1:num.nodes(edges)], + lambda = 1.54, + nedge = num.edges(edges), + pp = num.nodes(edges), + nn = 10, + time = 1)) sbf } -generate_fixed_sparsebnPath <- function(){ - sbf <- generate_fixed_sparsebnFit() +generate_fixed_sparsebnPath <- function(sbf = generate_fixed_sparsebnFit()){ + # sbf <- generate_fixed_sparsebnFit() sbp <- sparsebnPath(list(sbf, sbf, sbf, sbf)) sbp @@ -129,9 +159,47 @@ generate_fixed_adjacency_matrix <- function(){ ### CCDr output is unweighted adjacency matrix by default m <- rbind(c(0, 0, 0, 0, 0), c(1, 0, 0, 0, 0), - c(0, 1, 0, 0, 0), + c(0, 0, 0, 0, 0), c(1, 1, 0, 0, 0), c(0, 1, 0, 0, 0)) m # Matrix::Matrix(m) } + +generate_nontrivial_sparsebnPath <- function(){ + sbf1 <- sbf2 <- sbf3 <- sbf4 <- generate_fixed_sparsebnFit() + sbf1$edges[[1]] <- sbf1$edges[[2]] <- integer(0) + sbf1$lambda <- 2.1 + sbf1$nedge <- 0 + + sbf3$edges[[3]] <- c(4) + sbf3$lambda <- 0.97 + sbf3$nedge <- sbf3$nedge + 1 + + sbf4$edges[[4]] <- c(1,3,5) + sbf4$lambda <- 0.57 + sbf4$nedge <- sbf4$nedge + 3 + + sbp <- sparsebnPath(list(sbf1, sbf2, sbf3, sbf4)) + + sbp +} + +# Return cytometry graph as test edge list +cyto_edge_list <- function(){ + edges <- list( + "akt" = c("erk", "pip3", "pka"), + "erk" = c("pka", "mek"), + "jnk" = c("pkc", "pka"), + "mek" = c("pkc", "pka", "raf"), + "p38" = c("pkc", "pka"), + "pip2" = c("plc", "pip3"), + "pip3" = character(0), + "pka" = c("pkc"), + "pkc" = c("pip2", "plc"), + "plc" = c("pip3"), + "raf" = c("pkc", "pka") + ) + + edges +} diff --git a/tests/testthat/test-dag.R b/tests/testthat/test-dag.R index 3c9c6d9..7c0b893 100644 --- a/tests/testthat/test-dag.R +++ b/tests/testthat/test-dag.R @@ -10,6 +10,7 @@ test_that("", { test_that("DAG estimation runs without errors", { expect_error(estimate.dag(dat_cts), NA) # continuous data expect_error(estimate.dag(dat_disc), NA) # discrete data + expect_error(estimate.dag(dat_disc, adaptive = TRUE), NA) # adaptive algorithm }) test_that("estimate.dag returns expected output", { diff --git a/tests/testthat/test-plotDAG.R b/tests/testthat/test-plotDAG.R new file mode 100644 index 0000000..335e8e9 --- /dev/null +++ 
b/tests/testthat/test-plotDAG.R @@ -0,0 +1,11 @@ +context("plotDAG") + +el <- generate_fixed_edgeList() +sbp <- generate_fixed_sparsebnPath() +sbf <- generate_fixed_sparsebnFit() + +test_that("plotDAG runs without errors", { +    expect_error(plotDAG(el), NA) +    expect_error(plotDAG(sbf), NA) +    expect_error(plotDAG(sbp), NA) +}) diff --git a/vignettes/sparsebn-vignette.Rmd b/vignettes/sparsebn-vignette.Rmd index 803a820..9584d6e 100644 --- a/vignettes/sparsebn-vignette.Rmd +++ b/vignettes/sparsebn-vignette.Rmd @@ -1,5 +1,5 @@ --- -title: "Introduction to sparsebn" +title: "Introduction to the sparsebn package" author: "Bryon Aragam" date: "`r Sys.Date()`" output: rmarkdown::html_vignette @@ -9,7 +9,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- -`sparsebn` is an `R` package for learning sparse Bayesian networks and other graphical models from high-dimensional data via sparse regularization. Designed to handle: +`sparsebn` is an `R` package for learning sparse Bayesian networks and other graphical models from high-dimensional data via sparse regularization. It is designed to handle: - Experimental data with interventions - Mixed observational / experimental data @@ -17,11 +17,6 @@ vignette: > - Datasets with thousands of variables (tested up to _p_=8000) - Continuous and discrete data -The workhorse behind [`sparsebn`](http://www.github.com/itsrainingdata/sparsebn/) is the [`sparsebnUtils`](http://www.github.com/itsrainingdata/sparsebnUtils/) -package, which provides various S3 classes and methods for representing and manipulating graphs. The basic algorithms are implemented in [`ccdrAlgorithm`](http://www.github.com/itsrainingdata/ccdrAlgorithm/) and [`discretecdAlgorithm`](http://www.github.com/gujyjean/discretecdAlgorithm/). - -## Overview - The main methods for learning graphical models are: * `estimate.dag` for directed acyclic graphs (Bayesian networks). @@ -30,9 +25,113 @@ The main methods for learning graphical models are: Currently, estimation of precision and covariance matrices is limited to Gaussian data. +## tl;dr: Structure learning in five lines of code + +The following example illustrates how to use `sparsebn` to learn a Bayesian network in just a few lines of code. An explanation of this code can be found in the next section. + +```{r message=FALSE, warning=FALSE} +library(sparsebn) +data(pathfinder) +data <- sparsebnData(pathfinder[["data"]], type = "continuous") +dags <- estimate.dag(data) +dags +``` + +## Example: Learning the pathfinder network + +In this section, we will reconstruct the pathfinder network from the [Bayesian network repository](http://www.bnlearn.com/bnrepository/#pathfinder). The pathfinder network has 109 nodes and 195 edges. + +In order to use the methods in the sparsebn package, we need to indicate what kind of data we are working with by wrapping the data into a `sparsebnData` object. First, we load the data and then create a `sparsebnData` object: + +```{r} +data(pathfinder) +dat <- sparsebnData(pathfinder$data, type = "continuous", ivn = NULL) +``` + +The `sparsebnData` object keeps track of what kind of data we are working with: Is it discrete or continuous, does it contain any interventions, and if it is discrete, what are the factor levels in the data? The argument `type = "continuous"` specifies that this data is continuous, and `ivn = NULL` indicates that there are no interventions.
+ +Now we can run the algorithm: + +```{r message=FALSE, warning=FALSE} +dags <- estimate.dag(data = dat) +dags +``` + +Instead of automatically generating a grid of regularization parameters, we can also generate one manually (see `?generate.lambdas`). + +```{r message=FALSE, warning=FALSE} +nn <- num.samples(dat) # number of samples in the dataset / equivalent to nrow(dat$data) +lambdas <- generate.lambdas(sqrt(nn), 0.05, lambdas.length = 50, scale = "linear") +dags <- estimate.dag(data = dat, + lambdas = lambdas, + verbose = FALSE) +dags +``` + +Note that the output is a _solution path_ (stored internally as a `sparsebnPath` object), instead of a single estimate. In order to select a particular DAG, we need to do model selection. For example, we can visualize the solution with 195 edges: +```{r, fig.width = 6, fig.height = 6} +solution <- select(dags, edges = 195) +par(mfrow = c(1,2), oma = rep(0,4)) +plotDAG(solution) +plot(solution, + layout = igraph::layout_(to_igraph(solution$edges), igraph::in_circle()), + vertex.label = NA, + vertex.size = 5, + vertex.label.color = gray(0), + vertex.color = gray(0.9), + edge.color = gray(0), + edge.arrow.size = 0.45 +) +``` + +On the left, we use the `plotDAG` method, which uses some sensible defaults for plotting large graphs. On the right, we use the `plot` method (by default, imported from the `igraph` package) in order to use an organized circular layout and change some graphical parameters. The `sparsebn` package allows the user to use the `igraph`, `network`, or `graph` packages for visualization via the `plot` method. See `?setPlotPackage` for more details. + +For comparison, let's plot the original pathfinder graph; note that the plot on the right makes it easier to compare this to the previous plot. +```{r, fig.width = 6, fig.height = 6} +par(mfrow = c(1,2), oma = rep(0,4)) +plotDAG(pathfinder$dag) +plot(pathfinder$dag, + layout = igraph::layout_(to_igraph(pathfinder$dag), igraph::in_circle()), + vertex.label = NA, + vertex.size = 5, + vertex.label.color = gray(0), + vertex.color = gray(0.9), + edge.color = gray(0), + edge.arrow.size = 0.45 +) +``` + +Alternatively, we can automatically select a good solution using `select.parameter`: +```{r, fig.width = 6, fig.height = 6} +select.idx <- select.parameter(dags, dat) +solution <- select(dags, index = select.idx) # same as dags[[select.idx]] + +par(mfrow = c(1,2), oma = rep(0,4)) +plotDAG(solution) +plot(solution, + layout = igraph::layout_(to_igraph(solution$edges), igraph::in_circle()), + vertex.label = NA, + vertex.size = 5, + vertex.label.color = gray(0), + vertex.color = gray(0.9), + edge.color = gray(0), + edge.arrow.size = 0.45 +) +``` + +The output of `estimate.dag` is a list of _graphs_, i.e. adjacency lists without edge weights. In order to estimate the edge weights, we use `estimate.parameters`: +```{r} +dags.fit <- estimate.parameters(dags, data = dat) +``` + +The output is a list of weights, one for each value of $\lambda$. The weights are given in terms of $(B,\Omega)$, corresponding to the list `list(coefs, vars)`. For example, we can see how the weight of the first node on the second changes as we decrease $\lambda$: +```{r} +unlist(lapply(dags.fit, function(x) x$coefs[1,2])) +``` + ## Example: Learning a Markov Chain -Suppose that the data is generated by a simple Markov chain: +In this example, we illustrate how to learn a simple Markov chain on three nodes. We will also discuss how to use `sparsebn` to do covariance estimation. 
Suppose that the data is generated by the following graphical model: $$X_1\to X_2\to X_3.$$ @@ -64,33 +163,26 @@ covariance.matrix <- rbind(c(3,2,1), c(1,1,1)) ``` -Then we can generate some data using `mvtnorm::rmvnorm`: +Then we can generate some data using the `mvtnorm` package: ```{r} gaussian.data <- mvtnorm::rmvnorm(n = 100, mean = mean.vector, sigma = covariance.matrix) colnames(gaussian.data) <- c("X1", "X2", "X3") ``` -In order to use the methods in the sparsebn package, we need to indicate what kind of data we are working with by wrapping the data into a `sparsebnData` object: -```{r message=FALSE, warning=FALSE} -library("sparsebn") -dat <- sparsebnData(gaussian.data, type = "continuous") -``` - Now we can use this data to estimate $B$: ```{r} -dags.out <- estimate.dag(data = dat, +dat <- sparsebnData(gaussian.data, type = "continuous") +dags <- estimate.dag(data = dat, lambdas.length = 20, edge.threshold = 10, verbose = FALSE) -dags.out +dags ``` -Note that the output is a _solution path_ (stored internally as a `sparsebnPath` object), instead of a single estimate. In order to select a particular DAG, we need to do model selection (not implemented yet). - As expected, the third estimate in our solution path gives the correct estimate: ```{r} -dags.out[[3]] -get.adjacency.matrix(dags.out[[3]]) +dags[[3]] +get.adjacency.matrix(dags[[3]]) ``` We can also use this data to directly estimate the covariance matrix $\Sigma$: @@ -116,84 +208,11 @@ cov.out <- estimate.covariance(data = dat) cov.out[[3]] ``` -## Example: Bayesian network repository - -For a less trivial example, we will try to reconstruct the pathfinder network from the [Bayesian network repository](http://www.bnlearn.com/bnrepository/#pathfinder). The pathfinder network has 109 nodes and 195 edges. - -First, load the data: - -```{r} -data(pathfinder) -dat <- sparsebnData(pathfinder$data, type = "c", ivn = NULL) -``` - -Note that we create a `sparsebnData` object with no interventions by setting `ivn = NULL`. This time, instead of automatically generating a grid of regularization parameters, we generate one manually (see `?generate.lambdas`). Then we run the algorithm: - -```{r message=FALSE, warning=FALSE} -nn <- num.samples(dat) # number of samples in the dataset / equivalent to nrow(dat$data) -lambdas <- generate.lambdas(sqrt(nn), 0.05, lambdas.length = 50, scale = "linear") -dags.out <- estimate.dag(data = dat, - lambdas = lambdas, - edge.threshold = 500, - verbose = FALSE) -dags.out -``` - -Let's visualize the solution with 195 edges. For this, we use the `igraph` package: -```{r} -solution <- get.solution(dags.out, edges = 195) -plot(solution, - layout = igraph::layout.circle(to_igraph(solution$edges)), - vertex.label = NA, - vertex.size = 5, - vertex.color = gray(0.75), - edge.color = gray(0), - edge.width = 1, - edge.arrow.size = .25) -``` - -For comparison, let's plot the original pathfinder graph. 
-```{r} -plot(pathfinder$dag, - layout = igraph::layout.circle(to_igraph(pathfinder$dag)), - vertex.label = NA, - vertex.size = 5, - vertex.color = gray(0.75), - edge.color = gray(0), - edge.width = 1, - edge.arrow.size = .25) -``` - -We can also use the parameter selection technique described in \[[4](#references)\] to automatically select a good index (see `?select.parameter` for more details): -```{r} -select.idx <- select.parameter(dags.out, dat) -solution <- get.solution(dags.out, index = select.idx) -plot(solution, - layout = igraph::layout.circle(to_igraph(solution$edges)), - vertex.label = NA, - vertex.size = 5, - vertex.color = gray(0.75), - edge.color = gray(0), - edge.width = 1, - edge.arrow.size = .25) -``` - - -Note that the output of `estimate.dag` is a list of _graphs_, i.e. without weights. In order to do inference and estimate the weights in these graphs, use `estimate.parameters`: -```{r} -out.fit <- estimate.parameters(dags.out, data = dat) -``` - -The output is a list of weights, one for each value of $\lambda$. The weights are given in terms of $(B,\Omega)$, corresponding to the list `list(coefs, vars)`. For example, we can see how the weight of the first node on the second changes as we decrease $\lambda$: -```{r} -unlist(lapply(out.fit, function(x) x$coefs[1,2])) -``` - ## Appendix: Simulating data from a linear Gaussian SEM -The `pathfinder` dataset has been simulated from a linear Gaussian SEM. In this section, we illustrate how to do this from scratch. +Both datasets above have been simulated from a linear Gaussian SEM. In this section, we illustrate how to do this from scratch. -We first need a DAG; for this example we will use the pathfinder network from the previous section. First, load this DAG: +We first need a DAG; for this example we will use the pathfinder network. First, load this DAG: ```{r} data(pathfinder) @@ -205,7 +224,7 @@ $$ \Sigma = (I-B)^{-T}\Omega(I-B)^{-1}. $$ -Assuming unit influences (i.e. $\beta_{ij}=1$ if $\beta_{ij}\ne 0$) and unit variances (i.e. $\omega_j^2=1$ for all $j$), we can then compute $\Sigma$ directly: +Assuming unit influences (i.e. $\beta_{ij}=1$ if $\beta_{ij}\ne 0$) and unit variances (i.e. $\omega_j^2=1$ for all $j$), we can then compute $\Sigma$ by using the above equation: ```{r} id <- diag(rep(1, num.nodes(pathfinder$dag))) # 109x109 identity matrix @@ -217,23 +236,13 @@ Finally, we can use the `mvtnorm` package to generate random multivariate Gaussi ```{r} set.seed(123) -nn <- 1000 -gaussian.data <- mvtnorm::rmvnorm(nn, sigma = Sigma) +nsamples <- 1000 +gaussian.data <- mvtnorm::rmvnorm(nsamples, sigma = Sigma) ``` Instead of setting $\beta_{ij}=1$, we can also use random edge weights: ```{r} B[B!=0] <- runif(n = num.edges(pathfinder$dag), min = 0.5, max = 2) Sigma <- solve(t(id - B)) %*% Omega %*% solve(id - B) -gaussian.data <- mvtnorm::rmvnorm(nn, sigma = Sigma) +gaussian.data <- mvtnorm::rmvnorm(nsamples, sigma = Sigma) ``` - -## References - -\[1\] Aragam, B. and Zhou, Q. (2015). [Concave penalized estimation of sparse Gaussian Bayesian networks.](http://jmlr.org/papers/v16/aragam15a.html) _The Journal of Machine Learning Research_. 16(Nov):2273−2328. - -\[2\] Fu, F., Gu, J., and Zhou, Q. (2014). [Adaptive penalized estimation of directed acyclic graphs from categorical data.](http://arxiv.org/abs/1403.2310) arXiv: 1403.2310. - -\[3\] Aragam, B., Amini, A. A., and Zhou, Q. (2015). [Learning directed acyclic graphs with penalized neighbourhood regression.](http://arxiv.org/abs/1511.08963) arXiv: 1511.08963. 
- -\[4\] Fu, F. and Zhou, Q. (2013). [Learning sparse causal Gaussian networks with experimental intervention: Regularization and coordinate descent.](http://amstat.tandfonline.com/doi/abs/10.1080/01621459.2012.754359) Journal of the American Statistical Association, 108: 288-300.