references.bib


% Textbook: Bishop (2006), Pattern Recognition and Machine Learning, Springer.
% The bare number exported by Zotero is kept in a non-printing field so that it
% is not rendered as a bibliography note (presumably a citation count -- confirm).
@book{bishop_pattern_2006,
	author = {Bishop, Christopher M.},
	title = {Pattern recognition and machine learning},
	series = {Information science and statistics},
	publisher = {Springer},
	address = {New York},
	year = {2006},
	isbn = {978-0-387-31073-2},
	keywords = {Machine learning, Pattern perception},
	internal-note = {Alternate ISBN-10: 0-387-31073-8. Zotero extra field: 16274}
}

% Textbook: Murphy (2012), Machine Learning: A Probabilistic Perspective, MIT Press.
% The bare number exported by Zotero is kept in a non-printing field so that it
% is not rendered as a bibliography note (presumably a citation count -- confirm).
@book{murphy_machine_2012,
	author = {Murphy, Kevin P.},
	title = {Machine learning: a probabilistic perspective},
	shorttitle = {Machine learning},
	series = {Adaptive computation and machine learning series},
	publisher = {MIT Press},
	address = {Cambridge, MA},
	year = {2012},
	isbn = {978-0-262-01802-9},
	keywords = {Machine learning, Probabilities},
	internal-note = {Zotero extra field: 00643}
}

% arXiv preprint: the identifier lives in the eprint fields rather than in
% journal/note, and the entry is typed as misc because no journal is recorded.
@misc{abraham_machine_2014,
	author = {Abraham, Alexandre and Pedregosa, Fabian and Eickenberg, Michael and Gervais, Philippe and Muller, Andreas and Kossaifi, Jean and Gramfort, Alexandre and Thirion, Bertrand and Varoquaux, Gaël},
	title = {Machine {Learning} for {Neuroimaging} with {Scikit}-{Learn}},
	month = dec,
	year = {2014},
	eprint = {1412.3919},
	archiveprefix = {arXiv},
	url = {http://arxiv.org/abs/1412.3919},
	urldate = {2017-05-28},
	keywords = {Computer Science - Learning, Statistics - Machine Learning, Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Statistical machine learning methods are increasingly used for neuroimaging data analysis. Their main virtue is their ability to model high-dimensional datasets, e.g. multivariate analysis of activation images or resting-state time series. Supervised learning is typically used in decoding or encoding settings to relate brain images to behavioral or clinical observations, while unsupervised learning can uncover hidden structures in sets of images (e.g. resting state functional MRI) or find sub-populations in large cohorts. By considering different functional neuroimaging applications, we illustrate how scikit-learn, a Python machine learning library, can be used to perform some key analysis steps. Scikit-learn contains a very large set of statistical learning algorithms, both supervised and unsupervised, and its application to neuroimaging data provides a versatile tool to study the brain.},
	annote = {Comment: Frontiers in neuroscience, Frontiers Research Foundation, 2013, pp.15},
	file = {arXiv\:1412.3919 PDF:/home/petri/Zotero/storage/TVPGTATK/Abraham et al. - 2014 - Machine Learning for Neuroimaging with Scikit-Lear.pdf:application/pdf;arXiv.org Snapshot:/home/petri/Zotero/storage/7NI4WBID/1412.html:text/html}
}

% Journal article: Friedman, Hastie and Tibshirani (2010),
% Journal of Statistical Software 33(1), pp. 1--22.
@article{friedman_regularization_2010,
	author = {Friedman, Jerome H. and Hastie, Trevor and Tibshirani, Rob},
	title = {Regularization {Paths} for {Generalized} {Linear} {Models} via {Coordinate} {Descent}},
	journal = {Journal of Statistical Software},
	volume = {33},
	number = {1},
	pages = {1--22},
	month = feb,
	year = {2010},
	issn = {1548-7660},
	doi = {10.18637/jss.v033.i01},
	url = {https://www.jstatsoft.org/index.php/jss/article/view/v033i01},
	urldate = {2019-03-03},
	language = {en},
	copyright = {Copyright (c) 2009 Jerome H. Friedman, Trevor Hastie, Rob Tibshirani},
	file = {Full Text:/home/petri/Zotero/storage/5AXSKNPB/Friedman et al. - 2010 - Regularization Paths for Generalized Linear Models.pdf:application/pdf;Snapshot:/home/petri/Zotero/storage/86C9XWKD/v033i01.html:text/html}
}

% Technical report: the report number has its own field and the institution
% name is spelled out in full instead of the truncated scraped string.
@techreport{breiman_arcing_1997,
	author = {Breiman, Leo},
	title = {Arcing the edge},
	number = {486},
	institution = {Statistics Department, University of California, Berkeley},
	year = {1997}
}

% Journal article: Statistics and Its Interface 2(3), pp. 349--360.
% No abstract field: the exported one was the publisher's site tagline,
% not an abstract of the paper.
@article{hastie_multi-class_2009,
	author = {Hastie, Trevor and Rosset, Saharon and Zhu, Ji and Zou, Hui},
	title = {Multi-class {AdaBoost}},
	journal = {Statistics and Its Interface},
	volume = {2},
	number = {3},
	pages = {349--360},
	year = {2009},
	issn = {1938-7997},
	doi = {10.4310/SII.2009.v2.n3.a8},
	url = {http://www.intlpress.com/site/pub/pages/journals/items/sii/content/vols/0002/0003/a008/abstract.php},
	urldate = {2019-10-03},
	language = {en},
	file = {Snapshot:/home/petri/Zotero/storage/77TRBMLT/a008.html:text/html;Full Text PDF:/home/petri/Zotero/storage/CS6XV42C/Hastie et al. - 2009 - Multi-class AdaBoost.pdf:application/pdf}
}

% Conference paper (ICML 1997). The proceedings title is written out in full;
% page range added from the published proceedings -- verify against the source.
@inproceedings{drucker_improving_1997,
	author = {Drucker, Harris},
	title = {Improving {Regressors} using {Boosting} {Techniques}},
	booktitle = {Proceedings of the Fourteenth International Conference on Machine Learning ({ICML})},
	pages = {107--115},
	year = {1997},
	keywords = {Boosting (machine learning), Decision tree, Linear function, Nonlinear system},
	abstract = {In the regression context, boosting and bagging are techniques to build a committee of regressors that may be superior to a single regressor. We use regression trees as fundamental building blocks in bagging committee machines and boosting committee machines. Performance is analyzed on three non-linear functions and the Boston housing database. In all cases, boosting is at least equivalent, and in most cases better than bagging in terms of prediction error.},
	file = {Full Text PDF:/home/petri/Zotero/storage/QVRGCX99/Drucker - 1997 - Improving Regressors using Boosting Techniques.pdf:application/pdf}
}

% Journal article: Journal of Computer and System Sciences 55(1), pp. 119--139.
% The middle initial carries its period so styles render the name consistently.
@article{freund_decision-theoretic_1997,
	author = {Freund, Yoav and Schapire, Robert E.},
	title = {A {Decision}-{Theoretic} {Generalization} of {On}-{Line} {Learning} and an {Application} to {Boosting}},
	journal = {Journal of Computer and System Sciences},
	volume = {55},
	number = {1},
	pages = {119--139},
	month = aug,
	year = {1997},
	issn = {0022-0000},
	doi = {10.1006/jcss.1997.1504},
	url = {http://www.sciencedirect.com/science/article/pii/S002200009791504X},
	urldate = {2019-10-03},
	abstract = {In the first part of the paper we consider the problem of dynamically apportioning resources among a set of options in a worst-case on-line framework. The model we study can be interpreted as a broad, abstract extension of the well-studied on-line prediction model to a general decision-theoretic setting. We show that the multiplicative weight-update Littlestone–Warmuth rule can be adapted to this model, yielding bounds that are slightly weaker in some cases, but applicable to a considerably more general class of learning problems. We show how the resulting learning algorithm can be applied to a variety of problems, including gambling, multiple-outcome prediction, repeated games, and prediction of points in Rn. In the second part of the paper we apply the multiplicative weight-update technique to derive a new boosting algorithm. This boosting algorithm does not require any prior knowledge about the performance of the weak learning algorithm. We also study generalizations of the new boosting algorithm to the problem of learning functions whose range, rather than being binary, is an arbitrary finite set or a bounded segment of the real line.},
	file = {ScienceDirect Full Text PDF:/home/petri/Zotero/storage/6FZGPIYP/Freund and Schapire - 1997 - A Decision-Theoretic Generalization of On-Line Lea.pdf:application/pdf;ScienceDirect Snapshot:/home/petri/Zotero/storage/L4QQNF9W/S002200009791504X.html:text/html}
}

% Journal article in The Annals of Statistics (the DOI prefix 10.1214/aos is
% that journal), so it is typed as an article with journal, volume, issue and pages.
@article{friedman_additive_2000,
	author = {Friedman, Jerome H. and Hastie, Trevor and Tibshirani, Robert},
	title = {Additive logistic regression: {A} statistical view of boosting},
	shorttitle = {Additive logistic regression},
	journal = {The Annals of Statistics},
	volume = {28},
	number = {2},
	pages = {337--407},
	month = apr,
	year = {2000},
	doi = {10.1214/aos/1016218223},
	abstract = {Boosting is one of the most important recent developments in classification methodology. Boosting works by sequentially applying a classification algorithm to reweighted versions of the training data and then taking a weighted majority vote of the sequence of classifiers thus produced. For many classification algorithms, this simple strategy results in dramatic improvements in performance. We show that this seemingly mysterious phenomenon can be understood in terms of well-known statistical principles, namely additive modeling and maximum likelihood. For the two-class problem, boosting can be viewed as an approximation to additive modeling on the logistic scale using maximum Bernoulli likelihood as a criterion. We develop more direct approximations and show that they exhibit nearly identical results to boosting. Direct multiclass generalizations based on multinomial likelihood are derived that exhibit performance comparable to other recently proposed multiclass generalizations of boosting in most situations, and far superior in some. We suggest a minor modification to boosting that can reduce computation, often by factors of 10 to 50. Finally, we apply these insights to produce an alternative formulation of boosting decision trees. This approach, based on best-first truncated tree induction, often leads to better performance, and can provide interpretable descriptions of the aggregate decision rule. It is also much faster computationally, making it more suitable to large-scale data mining applications.},
	file = {Full Text PDF:/home/petri/Zotero/storage/2WHLKZR5/Friedman et al. - 2000 - Additive logistic regression  A statistical view .pdf:application/pdf}
}

% Conference paper (KDD '16): the proceedings title belongs in booktitle, and
% the arXiv identifier in the eprint fields rather than in a printed note.
@inproceedings{chen_xgboost:_2016,
	author = {Chen, Tianqi and Guestrin, Carlos},
	title = {{XGBoost}: {A} {Scalable} {Tree} {Boosting} {System}},
	shorttitle = {{XGBoost}},
	booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
	series = {{KDD} '16},
	pages = {785--794},
	year = {2016},
	doi = {10.1145/2939672.2939785},
	eprint = {1603.02754},
	archiveprefix = {arXiv},
	url = {http://arxiv.org/abs/1603.02754},
	urldate = {2019-09-30},
	keywords = {Computer Science - Machine Learning},
	abstract = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
	annote = {Comment: KDD'16 changed all figures to type1},
	file = {arXiv.org Snapshot:/home/petri/Zotero/storage/PUB73PWD/1603.html:text/html;arXiv\:1603.02754 PDF:/home/petri/Zotero/storage/VMDNQQAW/Chen and Guestrin - 2016 - XGBoost A Scalable Tree Boosting System.pdf:application/pdf}
}

% Journal article: Friedman (2002), Computational Statistics & Data Analysis
% 38(4), pp. 367--378.
@article{friedman_stochastic_2002,
	author = {Friedman, Jerome H.},
	title = {Stochastic gradient boosting},
	journal = {Computational Statistics \& Data Analysis},
	series = {Nonlinear {Methods} and {Data} {Mining}},
	volume = {38},
	number = {4},
	pages = {367--378},
	month = feb,
	year = {2002},
	issn = {0167-9473},
	doi = {10.1016/S0167-9473(01)00065-2},
	url = {http://www.sciencedirect.com/science/article/pii/S0167947301000652},
	urldate = {2019-09-30},
	abstract = {Gradient boosting constructs additive regression models by sequentially fitting a simple parameterized function (base learner) to current “pseudo”-residuals by least squares at each iteration. The pseudo-residuals are the gradient of the loss functional being minimized, with respect to the model values at each training data point evaluated at the current step. It is shown that both the approximation accuracy and execution speed of gradient boosting can be substantially improved by incorporating randomization into the procedure. Specifically, at each iteration a subsample of the training data is drawn at random (without replacement) from the full training data set. This randomly selected subsample is then used in place of the full sample to fit the base learner and compute the model update for the current iteration. This randomized approach also increases robustness against overcapacity of the base learner.},
	file = {ScienceDirect Full Text PDF:/home/petri/Zotero/storage/5RDQFINP/Friedman - 2002 - Stochastic gradient boosting.pdf:application/pdf;ScienceDirect Snapshot:/home/petri/Zotero/storage/3LAP5XDX/S0167947301000652.html:text/html}
}

% Journal article: The Annals of Statistics 29(5), pp. 1189--1232.
% The title has no trailing period; bibliography styles add their own punctuation.
@article{friedman_greedy_2001,
	author = {Friedman, Jerome H.},
	title = {Greedy function approximation: {A} gradient boosting machine},
	shorttitle = {Greedy function approximation},
	journal = {The Annals of Statistics},
	volume = {29},
	number = {5},
	pages = {1189--1232},
	month = oct,
	year = {2001},
	issn = {0090-5364, 2168-8966},
	doi = {10.1214/aos/1013203451},
	url = {https://projecteuclid.org/euclid.aos/1013203451},
	urldate = {2019-09-30},
	mrnumber = {MR1873328},
	zmnumber = {1043.62034},
	language = {en},
	keywords = {boosting, decision trees, Function estimation, robust nonparametric regression},
	abstract = {Function estimation/approximation is viewed from the perspective of numerical optimization in function space, rather than parameter space. A connection is made between stagewise additive expansions and steepest-descent minimization. A general gradient descent “boosting” paradigm is developed for additive expansions based on any fitting criterion. Specific algorithms are presented for least-squares, least absolute deviation, and Huber-M loss functions for regression, and multiclass logistic likelihood for classification. Special enhancements are derived for the particular case where the individual additive components are regression trees, and tools for interpreting such “TreeBoost” models are presented. Gradient boosting of regression trees produces competitive, highly robust, interpretable procedures for both regression and classification, especially appropriate for mining less than clean data. Connections between this approach and the boosting methods of Freund and Shapire and Friedman, Hastie and Tibshirani are discussed.},
	file = {Snapshot:/home/petri/Zotero/storage/PRQE45GT/1013203451.html:text/html;Full Text PDF:/home/petri/Zotero/storage/L8QY5JJV/Friedman - 2001 - Greedy function approximation A gradient boosting.pdf:application/pdf}
}

% Journal article: Wolpert (1992), Neural Networks 5(2), pp. 241--259.
@article{wolpert_stacked_1992,
	author = {Wolpert, David H.},
	title = {Stacked generalization},
	journal = {Neural Networks},
	volume = {5},
	number = {2},
	pages = {241--259},
	month = jan,
	year = {1992},
	issn = {0893-6080},
	doi = {10.1016/S0893-6080(05)80023-1},
	url = {http://www.sciencedirect.com/science/article/pii/S0893608005800231},
	urldate = {2020-04-06},
	language = {en},
	keywords = {Combining generalizers, cross-validation, Error estimation and correction, Generalization and induction, Learning set preprocessing},
	abstract = {This paper introduces stacked generalization, a scheme for minimizing the generalization error rate of one or more generalizers. Stacked generalization works by deducing the biases of the generalizer(s) with respect to a provided learning set. This deduction proceeds by generalizing in a second space whose inputs are (for example) the guesses of the original generalizers when taught with part of the learning set and trying to guess the rest of it, and whose output is (for example) the correct guess. When used with multiple generalizers, stacked generalization can be seen as a more sophisticated version of cross-validation, exploiting a strategy more sophisticated than cross-validation's crude winner-takes-all for combining the individual generalizers. When used with a single generalizer, stacked generalization is a scheme for estimating (and then correcting for) the error of a generalizer which has been trained on a particular learning set and then asked a particular question. After introducing stacked generalization and justifying its use, this paper presents two numerical experiments. The first demonstrates how stacked generalization improves upon a set of separate generalizers for the NETtalk task of translating text to phonemes. The second demonstrates how stacked generalization improves the performance of a single surface-fitter. With the other experimental evidence in the literature, the usual arguments supporting cross-validation, and the abstract justifications presented in this paper, the conclusion is that for almost any real-world generalization problem one should use some version of stacked generalization to minimize the generalization error rate. This paper ends by discussing some of the variations of stacked generalization, and how it touches on other fields like chaos theory.},
	file = {ScienceDirect Snapshot:/home/petri/Zotero/storage/5YHMR7U8/S0893608005800231.html:text/html}
}

% Journal article: Machine Learning 45(1), pp. 5--32. The publisher has its own
% field instead of a printed note, and the journal name matches the casing used
% by the other Machine Learning entries in this file.
@article{breiman_random_2001,
	author = {Breiman, Leo},
	title = {Random forests},
	journal = {Machine Learning},
	volume = {45},
	number = {1},
	pages = {5--32},
	year = {2001},
	publisher = {Springer},
	doi = {10.1023/A:1010933404324}
}

% Journal article: Breiman (1996), Machine Learning 24(2), pp. 123--140.
@article{breiman_bagging_1996,
	author = {Breiman, Leo},
	title = {Bagging predictors},
	journal = {Machine Learning},
	volume = {24},
	number = {2},
	pages = {123--140},
	month = aug,
	year = {1996},
	issn = {1573-0565},
	doi = {10.1007/BF00058655},
	url = {https://doi.org/10.1007/BF00058655},
	urldate = {2020-04-06},
	language = {en},
	abstract = {Bagging predictors is a method for generating multiple versions of a predictor and using these to get an aggregated predictor. The aggregation averages over the versions when predicting a numerical outcome and does a plurality vote when predicting a class. The multiple versions are formed by making bootstrap replicates of the learning set and using these as new learning sets. Tests on real and simulated data sets using classification and regression trees and subset selection in linear regression show that bagging can give substantial gains in accuracy. The vital element is the instability of the prediction method. If perturbing the learning set can cause significant changes in the predictor constructed, then bagging can improve accuracy.},
	file = {Springer Full Text PDF:/home/petri/Zotero/storage/D5PWX8IJ/Breiman - 1996 - Bagging predictors.pdf:application/pdf}
}

% Journal article: Geurts, Ernst and Wehenkel (2006), Machine Learning 63(1), pp. 3--42.
@article{geurts_extremely_2006,
	author = {Geurts, Pierre and Ernst, Damien and Wehenkel, Louis},
	title = {Extremely randomized trees},
	journal = {Machine Learning},
	volume = {63},
	number = {1},
	pages = {3--42},
	month = apr,
	year = {2006},
	issn = {1573-0565},
	doi = {10.1007/s10994-006-6226-1},
	url = {https://doi.org/10.1007/s10994-006-6226-1},
	urldate = {2020-04-06},
	language = {en},
	abstract = {This paper proposes a new tree-based ensemble method for supervised classification and regression problems. It essentially consists of randomizing strongly both attribute and cut-point choice while splitting a tree node. In the extreme case, it builds totally randomized trees whose structures are independent of the output values of the learning sample. The strength of the randomization can be tuned to problem specifics by the appropriate choice of a parameter. We evaluate the robustness of the default choice of this parameter, and we also provide insight on how to adjust it in particular situations. Besides accuracy, the main strength of the resulting algorithm is computational efficiency. A bias/variance analysis of the Extra-Trees algorithm is also provided as well as a geometrical and a kernel characterization of the models induced.},
	file = {Springer Full Text PDF:/home/petri/Zotero/storage/Q5YXR84Z/Geurts et al. - 2006 - Extremely randomized trees.pdf:application/pdf}
}

% Journal article: Journal of the American Statistical Association 88(422),
% pp. 486--494. The publisher has its own field instead of a printed note, and
% the subscripts that were scattered during export are restored as LaTeX math.
@article{shao_linear_1993,
	author = {Shao, Jun},
	title = {Linear {Model} {Selection} by {Cross}-validation},
	journal = {Journal of the American Statistical Association},
	volume = {88},
	number = {422},
	pages = {486--494},
	month = jun,
	year = {1993},
	publisher = {Taylor \& Francis},
	issn = {0162-1459},
	doi = {10.1080/01621459.1993.10476299},
	url = {https://amstat.tandfonline.com/doi/abs/10.1080/01621459.1993.10476299},
	urldate = {2020-04-06},
	abstract = {We consider the problem of selecting a model having the best predictive ability among a class of linear models. The popular leave-one-out cross-validation method, which is asymptotically equivalent to many other model selection methods such as the Akaike information criterion (AIC), the $C_p$, and the bootstrap, is asymptotically inconsistent in the sense that the probability of selecting the model with the best predictive ability does not converge to 1 as the total number of observations $n \to \infty$. We show that the inconsistency of the leave-one-out cross-validation can be rectified by using a leave-$n_v$-out cross-validation with $n_v$, the number of observations reserved for validation, satisfying $n_v/n \to 1$ as $n \to \infty$. This is a somewhat shocking discovery, because $n_v/n \to 1$ is totally opposite to the popular leave-one-out recipe in cross-validation. Motivations, justifications, and discussions of some practical aspects of the use of the leave-$n_v$-out cross-validation method are provided, and results from a simulation study are presented.},
	file = {Snapshot:/home/petri/Zotero/storage/UDA8JLG6/01621459.1993.html:text/html}
}

% Journal article: Multivariate Behavioral Research 24(4), pp. 445--455.
% The publisher has its own field; the exported eprint link duplicated the url
% field and is not repeated in a printed note.
@article{browne_single_1989,
	author = {Browne, M. W. and Cudeck, R.},
	title = {Single {Sample} {Cross}-{Validation} {Indices} for {Covariance} {Structures}},
	journal = {Multivariate Behavioral Research},
	volume = {24},
	number = {4},
	pages = {445--455},
	month = oct,
	year = {1989},
	publisher = {Routledge},
	issn = {0027-3171},
	doi = {10.1207/s15327906mbr2404_4},
	pmid = {26753509},
	url = {https://doi.org/10.1207/s15327906mbr2404_4},
	urldate = {2020-04-06},
	abstract = {This article considers single sample approximations for the cross-validation coefficient in the analysis of covariance structures. An adjustment for predictive validity which may be employed in conjunction with any correctly specified discrepancy function is suggested. In the case of maximum likelihood estimation under normality assumptions the coefficient obtained is a simple linear function of the Akaike Information Criterion. Results of a random sampling experiment are reported.},
	file = {Snapshot:/home/petri/Zotero/storage/CLH83HG2/s15327906mbr2404_4.html:text/html}
}

% Journal article: Journal of the American Statistical Association 79(387),
% pp. 575--583. The publisher has its own field and the exported PDF link sits
% in a non-printing field, so neither is rendered as a bibliography note.
@article{picard_cross-validation_1984,
	author = {Picard, Richard R. and Cook, R. Dennis},
	title = {Cross-{Validation} of {Regression} {Models}},
	journal = {Journal of the American Statistical Association},
	volume = {79},
	number = {387},
	pages = {575--583},
	month = sep,
	year = {1984},
	publisher = {Taylor \& Francis},
	issn = {0162-1459},
	doi = {10.1080/01621459.1984.10478083},
	url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1984.10478083},
	urldate = {2020-04-06},
	keywords = {Data splitting, Model selection, Optimism principle, Prediction},
	abstract = {A methodology for assessment of the predictive ability of regression models is presented. Attention is given to models obtained via subset selection procedures, which are extremely difficult to evaluate by standard techniques. Cross-validatory assessments of predictive ability are obtained and their use illustrated in examples.},
	internal-note = {PDF: https://www.tandfonline.com/doi/pdf/10.1080/01621459.1984.10478083},
	file = {Snapshot:/home/petri/Zotero/storage/U569PUY7/01621459.1984.html:text/html}
}

% Journal article: Journal of the Royal Statistical Society, Series B 36(2),
% pp. 111--133. The exported PDF link sits in a non-printing field so that it
% is not rendered as a bibliography note.
@article{stone_cross-validatory_1974,
	author = {Stone, M.},
	title = {Cross-{Validatory} {Choice} and {Assessment} of {Statistical} {Predictions}},
	journal = {Journal of the Royal Statistical Society: Series B (Methodological)},
	volume = {36},
	number = {2},
	pages = {111--133},
	year = {1974},
	issn = {2517-6161},
	doi = {10.1111/j.2517-6161.1974.tb00994.x},
	url = {https://rss.onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1974.tb00994.x},
	urldate = {2020-04-06},
	language = {en},
	copyright = {© 1974 The Authors},
	keywords = {prediction, analysis of variance, choice of variables, crossvalidation, doublecross, modelmix, multiple regression, prescription, univariate estimation},
	abstract = {A generalized form of the cross-validation criterion is applied to the choice and assessment of prediction using the data-analytic concept of a prescription. The examples used to illustrate the application are drawn from the problem areas of univariate estimation, linear regression and analysis of variance.},
	internal-note = {PDF: https://rss.onlinelibrary.wiley.com/doi/pdf/10.1111/j.2517-6161.1974.tb00994.x},
	file = {Snapshot:/home/petri/Zotero/storage/3NLS79AR/j.2517-6161.1974.tb00994.html:text/html}
}

% Journal article: Shanghai Archives of Psychiatry 27(2), pp. 130--135.
% Surnames are stored in normal case; any capitalisation is the style's job.
@article{song_decision_2015,
	author = {Song, Yan-yan and Lu, Ying},
	title = {Decision tree methods: applications for classification and prediction},
	shorttitle = {Decision tree methods},
	journal = {Shanghai Archives of Psychiatry},
	volume = {27},
	number = {2},
	pages = {130--135},
	month = apr,
	year = {2015},
	issn = {1002-0829},
	doi = {10.11919/j.issn.1002-0829.215044},
	pmid = {26120265},
	pmcid = {PMC4466856},
	url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/},
	urldate = {2020-04-08},
	abstract = {Decision tree methodology is a commonly used data mining method for establishing classification systems based on multiple covariates or for developing prediction algorithms for a target variable. This method classifies a population into branch-like segments that construct an inverted tree with a root node, internal nodes, and leaf nodes. The algorithm is non-parametric and can efficiently deal with large, complicated datasets without imposing a complicated parametric structure. When the sample size is large enough, study data can be divided into training and validation datasets. Using the training dataset to build a decision tree model and a validation dataset to decide on the appropriate tree size needed to achieve the optimal final model. This paper introduces frequently used algorithms used to develop decision trees (including CART, C4.5, CHAID, and QUEST) and describes the SPSS and SAS programs that can be used to visualize tree structure.},
	file = {PubMed Central Full Text PDF:/home/petri/Zotero/storage/ZCEMND7J/SONG and LU - 2015 - Decision tree methods applications for classifica.pdf:application/pdf}
}

% Web page for one section of the book "Machine Learning with Swift" on the
% O'Reilly online library. The title holds only the section title; the book it
% belongs to is named in the note. The ISBN is the one embedded in the URL.
% Author and year are not recorded in the export -- add them if known.
@misc{noauthor_decision_nodate,
	title = {Decision tree learning pros and cons},
	note = {Section of the book Machine Learning with {Swift}, O'Reilly online library},
	isbn = {978-1-78712-151-5},
	url = {https://www.oreilly.com/library/view/machine-learning-with/9781787121515/697c4c5f-1109-4058-8938-d01482389ce3.xhtml},
	urldate = {2020-04-08},
	language = {en},
	abstract = {Decision tree learning pros and cons Advantages: Easy to understand and interpret, perfect for visual representation. This is an example of a white box model, which closely mimics the human … - Selection from Machine Learning with Swift [Book]},
	file = {Snapshot:/home/petri/Zotero/storage/7Z9JX5F9/697c4c5f-1109-4058-8938-d01482389ce3.html:text/html}
}

% Journal article: Huang et al. (2018), Cancer Genomics & Proteomics 15(1), pp. 41--51.
@article{huang_applications_2018,
	author = {Huang, Shujun and Cai, Nianguang and Pacheco, Pedro Penzuti and Narrandes, Shavira and Wang, Yang and Xu, Wayne},
	title = {Applications of {Support} {Vector} {Machine} ({SVM}) {Learning} in {Cancer} {Genomics}},
	journal = {Cancer Genomics \& Proteomics},
	volume = {15},
	number = {1},
	pages = {41--51},
	month = feb,
	year = {2018},
	issn = {1790-6245},
	doi = {10.21873/cgp.20063},
	pmid = {29275361},
	pmcid = {PMC5822181},
	language = {eng},
	keywords = {Humans, kernel function, biomarker discovery, Biomarkers, Tumor, cancer classification, classifier, driver gene, drug discovery, Drug Discovery, gene expression, gene selection, gene-gene interaction, Genes, Neoplasm, genomics, Genomics, Machine learning (ML), Neoplasms, Protein Interaction Mapping, review, Support Vector Machine, support vector machine (SVM)},
	abstract = {Machine learning with maximization (support) of separating margin (vector), called support vector machine (SVM) learning, is a powerful classification tool that has been used for cancer genomic classification or subtyping. Today, as advancements in high-throughput technologies lead to production of large amounts of genomic and epigenomic data, the classification feature of SVMs is expanding its use in cancer genomics, leading to the discovery of new biomarkers, new drug targets, and a better understanding of cancer driver genes. Herein we reviewed the recent progress of SVMs in cancer genomic studies. We intend to comprehend the strength of the SVM learning and its future perspective in cancer genomic applications.},
	file = {Full Text:/home/petri/Zotero/storage/S6S9HQI7/Huang et al. - 2018 - Applications of Support Vector Machine (SVM) Learn.pdf:application/pdf}
}

% Conference paper in Computational and Ambient Intelligence
% (Lecture Notes in Computer Science, Springer, 2007), pp. 782--789.
@inproceedings{rubio_kernel_2007,
	author = {Rubio, Ginés and Pomares, Héctor and Herrera, Luis J. and Rojas, Ignacio},
	editor = {Sandoval, Francisco and Prieto, Alberto and Cabestany, Joan and Graña, Manuel},
	title = {Kernel {Methods} {Applied} to {Time} {Series} {Forecasting}},
	booktitle = {Computational and {Ambient} {Intelligence}},
	series = {Lecture {Notes} in {Computer} {Science}},
	pages = {782--789},
	year = {2007},
	publisher = {Springer},
	address = {Berlin, Heidelberg},
	isbn = {978-3-540-73007-1},
	doi = {10.1007/978-3-540-73007-1_94},
	language = {en},
	keywords = {Support Vector Machine, Kernel Method, Minimum Description Length, Time Series Forecast, Time Series Prediction},
	abstract = {Kernel methods are a class of algorithms whose importance has grown from the 90s in the machine learning field. Their most notable example are Support Vector Machines (SVMs), which are the state of the art for classification problems. Nevertheless, they are applicable to functional approximation problems and there are however several of them available: SVM for regression, Gaussian Process Regression and Least Squares SVM (LS-SVM) for instance. This paper applies and studies these algorithms to a number of Time Series Prediction problems and compares them with some more conventional techniques.}
}

% Journal article: Heliyon 4(11), article e00938.
% The ISSN is written in the standard hyphenated NNNN-NNNN form.
@article{abiodun_state---art_2018,
	author = {Abiodun, Oludare Isaac and Jantan, Aman and Omolara, Abiodun Esther and Dada, Kemi Victoria and Mohamed, Nachaat AbdElatif and Arshad, Humaira},
	title = {State-of-the-art in artificial neural network applications: {A} survey},
	shorttitle = {State-of-the-art in artificial neural network applications},
	journal = {Heliyon},
	volume = {4},
	number = {11},
	pages = {e00938},
	month = nov,
	year = {2018},
	issn = {2405-8440},
	doi = {10.1016/j.heliyon.2018.e00938},
	url = {https://linkinghub.elsevier.com/retrieve/pii/S2405844018332067},
	urldate = {2020-04-08},
	language = {en},
	file = {Full Text:/home/petri/Zotero/storage/6FCRPBAF/Abiodun et al. - 2018 - State-of-the-art in artificial neural network appl.pdf:application/pdf}
}

% Web page on the Pathmind wiki. The site name is in howpublished, which misc
% entries print; the catalogue tag from the export sits in a non-printing field.
% Author and year are not recorded in the export -- add them if known.
@misc{noauthor_beginners_nodate,
	title = {A {Beginner}'s {Guide} to {LSTMs} and {Recurrent} {Neural} {Networks}},
	howpublished = {Pathmind},
	url = {http://pathmind.com/wiki/lstm},
	urldate = {2020-04-08},
	language = {en},
	abstract = {LSTMs are a powerful kind of RNN used for processing sequential data such as sound, time series (sensor) data or written natural language.},
	internal-note = {Library Catalog: pathmind.com},
	file = {Snapshot:/home/petri/Zotero/storage/G9LRWUWA/lstm.html:text/html}
}

% Section of the book "Neural Networks with R", so it is typed as a part of a
% book with the section title and book title in separate fields.
% NOTE(review): authors and publisher were filled in from the publisher's
% catalogue entry for this ISBN -- confirm before citing.
@incollection{noauthor_pros_2017,
	author = {Ciaburro, Giuseppe and Venkateswaran, Balaji},
	title = {Pros and cons of neural networks},
	booktitle = {Neural {Networks} with {R}},
	publisher = {Packt Publishing},
	month = sep,
	year = {2017},
	isbn = {978-1-78839-787-2},
	url = {https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781788397872/1/ch01lvl1sec27/pros-and-cons-of-neural-networks},
	urldate = {2020-04-08},
	language = {en},
	abstract = {Neural networks form the basis of DL, and applications are enormous for DL, ranging from voice recognition to cancer detection.},
	file = {Snapshot:/home/petri/Zotero/storage/MBFY3T4J/pros-and-cons-of-neural-networks.html:text/html}
}

@incollection{knill_hidden_1997,
	address = {Dordrecht},
	series = {Text, {Speech} and {Language} {Technology}},
	title = {Hidden {Markov} {Models} in {Speech} and {Language} {Processing}},
	isbn = {978-94-017-1183-8},
	url = {https://doi.org/10.1007/978-94-017-1183-8_2},
	abstract = {Speech is an acoustic representation of a word or sequence of words, characterized by a slowly changing spectral envelope. Humans perceive this spectral envelope, and convert it into the underlying word string and its associated meaning. The ultimate goal of speech and language processing is to mimic this process so that a machine can hold a natural conversation with a human. Speech and language processing has a far wider role to play, however, in performing less complex tasks such as transcription, language identification, or audio document retrieval, all of which are feasible to a certain extent now. The basic step in all of these systems is to perform the inverse mapping of the speech into the underlying sequence of symbols, usually words, that produced it, as shown in Fig. 2.1. This chapter describes a statistical approach to solving the automatic speech recognition problem, based on stochastic Markov process models.},
	language = {en},
	urldate = {2020-04-08},
	booktitle = {Corpus-{Based} {Methods} in {Language} and {Speech} {Processing}},
	publisher = {Springer Netherlands},
	author = {Knill, Kate and Young, Steve},
	editor = {Young, Steve and Bloothooft, Gerrit},
	year = {1997},
	doi = {10.1007/978-94-017-1183-8_2},
	keywords = {Acoustic Model, Hidden Markov Model, Language Model, Speech Recognition, State Sequence},
	pages = {27--68}
}

@article{imandoust_application_2013,
	title = {Application of {K}-{Nearest} {Neighbor} ({KNN}) {Approach} for {Predicting} {Economic} {Events}: {Theoretical} {Background}},
	shorttitle = {Application of {K}-{Nearest} {Neighbor} ({KNN}) {Approach} for {Predicting} {Economic} {Events}},
	abstract = {In the present study k-Nearest Neighbor classification method, have been studied for economic forecasting. Due to the effects of companies’ financial distress on stakeholders, financial distress prediction models have been one of the most attractive areas in financial research. In recent years, after the global financial crisis, the number of bankrupt companies has risen. Since companies' financial distress is the first stage of bankruptcy, using financial ratios for predicting financial distress have attracted too much attention of the academics as well as economic and financial institutions. Although in recent years studies on predicting companies’ financial distress in Iran have been increased, most efforts have exploited traditional statistical methods; and just a few studies have used nonparametric methods. Recent studies demonstrate this method is more capable than other methods.},
	journal = {International Journal of Engineering Research and Applications},
	volume = {3},
	number = {5},
	issn = {2248-9622},
	author = {Imandoust, Sadegh Bafandeh and Bolandraftar, Mohammad},
	year = {2013},
	pages = {605--610},
	file = {Full Text PDF:/home/petri/Zotero/storage/ZCKV466S/Imandoust and Bolandraftar - 2013 - Application of K-Nearest Neighbor (KNN) Approach f.pdf:application/pdf}
}

@misc{noauthor_pros_2019,
	key = {rmartinshort},
	title = {Pros and cons of classical supervised {ML} algorithms},
	howpublished = {rmartinshort},
	url = {http://rmartinshort.jimdofree.com/2019/02/24/pros-and-cons-of-classical-supervised-ml-algorithms/},
	abstract = {Here we explore the pros and cons of some the most popular classical machine learning algorithms for supervised learning. To recap, this is a learning situation where we are given some labelled data and the model must predict the value or class of a new datapoint using a hypothesis function that it has learned from studying the provided examples. By ‘classical’ machine leaning algorithms I mean anything that is not a neural network. We will cover the advantages and disadvantages of various neural network architectures in a future post. Also note that this post deals only with supervised learning. There will be another dealing with clustering algorithms for unsupervised tasks.},
	language = {en-US},
	urldate = {2020-04-08},
	month = feb,
	year = {2019},
	file = {Snapshot:/home/petri/Zotero/storage/2HH35V49/pros-and-cons-of-classical-supervised-ml-algorithms.html:text/html}
}

@misc{pant_introduction_2019,
	author = {Pant, Ayush},
	title = {Introduction to {Linear} {Regression} and {Polynomial} {Regression}},
	howpublished = {Medium},
	url = {https://towardsdatascience.com/introduction-to-linear-regression-and-polynomial-regression-f8adc96f31cb},
	abstract = {In this blog, we will discuss two important topics that will form a base for Machine Learning which is Linear and Polynomial Regression.},
	language = {en},
	urldate = {2020-04-08},
	month = jan,
	year = {2019},
	file = {Snapshot:/home/petri/Zotero/storage/GL5QIPKI/introduction-to-linear-regression-and-polynomial-regression-f8adc96f31cb.html:text/html}
}

@article{tipping_sparse_2001,
	title = {Sparse {Bayesian} learning and the relevance vector machine},
	volume = {1},
	url = {https://doi.org/10.1162/15324430152748236},
	doi = {10.1162/15324430152748236},
	abstract = {This paper introduces a general Bayesian framework for obtaining sparse solutions to regression and classification tasks utilising models linear in the parameters. Although this framework is fully general, we illustrate our approach with a particular specialisation that we denote the 'relevance vector machine' (RVM), a model of identical functional form to the popular and state-of-the-art 'support vector machine' (SVM). We demonstrate that by exploiting a probabilistic Bayesian learning framework, we can derive accurate prediction models which typically utilise dramatically fewer basis functions than a comparable SVM while offering a number of additional advantages. These include the benefits of probabilistic predictions, automatic estimation of 'nuisance' parameters, and the facility to utilise arbitrary basis functions (e.g. non-'Mercer' kernels). We detail the Bayesian framework and associated learning algorithm for the RVM, and give some illustrative examples of its application along with some comparative benchmarks. We offer some explanation for the exceptional degree of sparsity obtained, and discuss and demonstrate some of the advantageous features, and potential extensions, of Bayesian relevance learning.},
	urldate = {2020-04-08},
	journal = {Journal of Machine Learning Research},
	publisher = {JMLR.org},
	author = {Tipping, Michael E.},
	month = sep,
	year = {2001},
	pages = {211--244},
	file = {Full Text PDF:/home/petri/Zotero/storage/3LYG4A6N/Tipping - 2001 - Sparse bayesian learning and the relevance vector .pdf:application/pdf}
}

@article{hu_relevance_2013,
	author = {Hu, Jinfei and Tse, Peter},
	title = {A {Relevance} {Vector} {Machine}-{Based} {Approach} with {Application} to {Oil} {Sand} {Pump} {Prognostics}},
	journal = {Sensors},
	volume = {13},
	number = {9},
	pages = {12663--12686},
	month = sep,
	year = {2013},
	issn = {1424-8220},
	doi = {10.3390/s130912663},
	url = {http://www.mdpi.com/1424-8220/13/9/12663},
	urldate = {2020-04-08},
	language = {en},
	file = {Full Text:/home/petri/Zotero/storage/9A8CAUK6/Hu and Tse - 2013 - A Relevance Vector Machine-Based Approach with App.pdf:application/pdf}
}

@article{dong_accelerating_2012,
	author = {Dong, Chao and Tian, Lianfang},
	title = {Accelerating {Relevance}-{Vector}-{Machine}-{Based} {Classification} of {Hyperspectral} {Image} with {Parallel} {Computing}},
	journal = {Mathematical Problems in Engineering},
	volume = {2012},
	pages = {1--13},
	year = {2012},
	issn = {1024-123X, 1563-5147},
	doi = {10.1155/2012/252979},
	url = {http://www.hindawi.com/journals/mpe/2012/252979/},
	urldate = {2020-04-08},
	language = {en},
	abstract = {Benefiting from the kernel skill and the sparse property, the relevance vector machine (RVM) could acquire a sparse solution, with an equivalent generalization ability compared with the support vector machine. The sparse property requires much less time in the prediction, making RVM potential in classifying the large-scale hyperspectral image. However, RVM is not widespread influenced by its slow training procedure. To solve the problem, the classification of the hyperspectral image using RVM is accelerated by the parallel computing technique in this paper. The parallelization is revealed from the aspects of the multiclass strategy, the ensemble of multiple weak classifiers, and the matrix operations. The parallel RVMs are implemented using the C language plus the parallel functions of the linear algebra packages and the message passing interface library. The proposed methods are evaluated by the AVIRIS Indian Pines data set on the Beowulf cluster and the multicore platforms. It shows that the parallel RVMs accelerate the training procedure obviously.},
	file = {Full Text:/home/petri/Zotero/storage/EX8D8IJL/Dong and Tian - 2012 - Accelerating Relevance-Vector-Machine-Based Classi.pdf:application/pdf}
}

@article{friedman_additive_2000-1,
	author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert},
	title = {Additive logistic regression: a statistical view of boosting ({With} discussion and a rejoinder by the authors)},
	shorttitle = {Additive logistic regression},
	journal = {Annals of Statistics},
	volume = {28},
	number = {2},
	pages = {337--407},
	month = apr,
	year = {2000},
	issn = {0090-5364, 2168-8966},
	doi = {10.1214/aos/1016218223},
	url = {https://projecteuclid.org/euclid.aos/1016218223},
	urldate = {2020-08-08},
	mrnumber = {MR1790002},
	zmnumber = {1106.62323},
	language = {EN},
	note = {Publisher: Institute of Mathematical Statistics},
	keywords = {classification, machine learning, nonparametric estimation, stagewise fitting, tree},
	abstract = {Boosting is one of the most important recent developments in classification methodology. Boosting works by sequentially applying a classification algorithm to reweighted versions of the training data and then taking a weighted majority vote of the sequence of classifiers thus produced. For many classification algorithms, this simple strategy results in dramatic improvements in performance. We show that this seemingly mysterious phenomenon can be understood in terms of well-known statistical principles, namely additive modeling and maximum likelihood. For the two-class problem, boosting can be viewed as an approximation to additive modeling on the logistic scale using maximum Bernoulli likelihood as a criterion. We develop more direct approximations and show that they exhibit nearly identical results to boosting. Direct multiclass generalizations based on multinomial likelihood are derived that exhibit performance comparable to other recently proposed multiclass generalizations of boosting in most situations, and far superior in some. We suggest a minor modification to boosting that can reduce computation, often by factors of 10 to 50. Finally, we apply these insights to produce an alternative formulation of boosting decision trees. This approach, based on best-first truncated tree induction, often leads to better performance, and can provide interpretable descriptions of the aggregate decision rule. It is also much faster computationally, making it more suitable to large-scale data mining applications.},
	file = {Full Text PDF:/home/petri/Zotero/storage/T634QKJI/Friedman et al. - 2000 - Additive logistic regression a statistical view o.pdf:application/pdf;Snapshot:/home/petri/Zotero/storage/446NCMFL/1016218223.html:text/html}
}