Add notes for preprints in docs #338

Open: wants to merge 3 commits into base: dev

4 changes: 3 additions & 1 deletion docs/conf.py
@@ -54,6 +54,8 @@
"myst_parser",
]

mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@2/MathJax.js?config=TeX-AMS-MML_HTMLorMML"

bibtex_bibfiles = ["refs.bib"]
bibtex_reference_style = "super"
bibtex_default_style = "unsrt"
@@ -82,7 +84,7 @@
numpydoc_show_class_members = False

# -- sphinx.ext.autosummary
autosummary_generate = []
autosummary_generate = True

# Otherwise, the Return parameter list looks different from the Parameters list
napoleon_use_rtype = False
64 changes: 40 additions & 24 deletions docs/refs.bib
@@ -458,30 +458,23 @@ @article{shenChiSquareTestDistance2021
}

@article{chwialkowski2015fast,
title={Fast Two-Sample Testing with Analytic Representations of Probability Measures},
author={Kacper Chwialkowski and Aaditya Ramdas and Dino Sejdinovic and Arthur Gretton},
year={2015},
journal={arXiv:1506.04725 [math, stat]},
print={1506.04725},
eprinttype={arxiv},
abstract={We propose a class of nonparametric two-sample tests with a cost linear in the sample size. Two tests are given, both based on an ensemble of distances between analytic functions representing each of the distributions. The first test uses smoothed empirical characteristic functions to represent the distributions, the second uses distribution embeddings in a reproducing kernel Hilbert space. Analyticity implies that differences in the distributions may be detected almost surely at a finite number of randomly chosen locations/frequencies. The new tests are consistent against a larger class of alternatives than the previous linear-time tests based on the (non-smoothed) empirical characteristic functions, while being much faster than the current state-of-the-art quadratic-time kernel-based or energy distance-based tests. Experiments on artificial benchmarks and on challenging real-world testing problems demonstrate that our tests give a better power/time tradeoff than competing approaches, and in some cases, better outright power than even the most expensive quadratic-time tests. This performance advantage is retained even in high dimensions, and in cases where the difference in distributions is not observable with low order statistics.},
archivePrefix={arXiv},
primaryClass={stat.ML}
}

@article{grettonKernelJointIndependence2016,
title = {{Kernel-based Tests} for {Joint Independence}},
author = {Pfister, Nikolas and Buhlmann, Peter and Scholkopf, Bernhard and Peters, Jonas},
year = {2016},
month = nov,
journal = {arXiv:1603.00285 [math, stat]},
eprint = {1603.00285},
eprinttype = {arxiv},
primaryclass = {math, stat},
abstract = {We investigate the problem of testing whether d random variables, which may or may not be continuous, are jointly (or mutually) independent. Our method builds on ideas of the two variable Hilbert-Schmidt independence criterion (HSIC) but allows for an arbitrary number of variables. We embed the d-dimensional joint distribution and the product of the marginals into a reproducing kernel Hilbert space and define the d-variable Hilbert-Schmidt independence criterion (dHSIC) as the squared distance between the embeddings. In the population case, the value of dHSIC is zero if and only if the d variables are jointly independent, as long as the kernel is characteristic. Based on an empirical estimate of dHSIC, we define three different non-parametric hypothesis tests: a permutation test, a bootstrap test and a test based on a Gamma approximation. We prove that the permutation test achieves the significance level and that the bootstrap test achieves pointwise asymptotic significance level as well as pointwise asymptotic consistency (i.e., it is able to detect any type of fixed dependence in the large sample limit). The Gamma approximation does not come with these guarantees; however, it is computationally very fast and for small d, it performs well in practice. Finally, we apply the test to a problem in causal discovery.},
archiveprefix = {arXiv},
keywords = {Math - Statistics Theory, Statistics - Machine Learning},
}
title={Fast two-sample testing with analytic representations of probability measures},
author={Chwialkowski, Kacper P and Ramdas, Aaditya and Sejdinovic, Dino and Gretton, Arthur},
journal={Advances in Neural Information Processing Systems},
volume={28},
year={2015}
}

@article{pfister2018kernel,
title={Kernel-based tests for joint independence},
author={Pfister, Niklas and B{\"u}hlmann, Peter and Sch{\"o}lkopf, Bernhard and Peters, Jonas},
journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
volume={80},
number={1},
pages={5--31},
year={2018},
publisher={Wiley Online Library}
}

@article{friedmanMultivariateGeneralizationsoftheWaldWolfowitzandSmirnovTwoSampleTests1979,
title = {Multivariate Generalizations of the Wald-Wolfowitz and Smirnov Two-Sample Tests},
@@ -544,4 +537,27 @@ @Inbook{hotellingRelationsTwoSets1992
isbn="978-1-4612-4380-9",
doi="10.1007/978-1-4612-4380-9_14",
url="https://doi.org/10.1007/978-1-4612-4380-9_14"
}

@article{jitkrittum2017linear,
title={A linear-time kernel goodness-of-fit test},
author={Jitkrittum, Wittawat and Xu, Wenkai and Szab{\'o}, Zolt{\'a}n and Fukumizu, Kenji and Gretton, Arthur},
journal={Advances in Neural Information Processing Systems},
volume={30},
year={2017}
}

@inproceedings{10.5555/3020548.3020641,
author = {Zhang, Kun and Peters, Jonas and Janzing, Dominik and Sch\"{o}lkopf, Bernhard},
title = {Kernel-Based Conditional Independence Test and Application in Causal Discovery},
year = {2011},
isbn = {9780974903972},
publisher = {AUAI Press},
address = {Arlington, Virginia, USA},
abstract = {Conditional independence testing is an important problem, especially in Bayesian network learning and causal discovery. Due to the curse of dimensionality, testing for conditional independence of continuous variables is particularly challenging. We propose a Kernel-based Conditional Independence test (KCI-test), by constructing an appropriate test statistic and deriving its asymptotic distribution under the null hypothesis of conditional independence. The proposed method is computationally efficient and easy to implement. Experimental results show that it outperforms other methods, especially when the conditioning set is large or the sample size is not very large, in which case other methods encounter difficulties.},
booktitle = {Proceedings of the Twenty-Seventh Conference on Uncertainty in Artificial Intelligence},
pages = {804–813},
numpages = {10},
location = {Barcelona, Spain},
series = {UAI'11}
}
4 changes: 4 additions & 0 deletions hyppo/conditional/FCIT.py
@@ -32,8 +32,12 @@ class FCIT(ConditionalIndependenceTest):
Proportion of data to evaluate test stat on.
discrete: tuple of string
Whether :math:`X` or :math:`Y` are discrete

Notes
-----
.. note::
This algorithm is currently a preprint on arXiv.

The motivation for the test rests on the assumption that if :math:`X \not\!\perp\!\!\!\perp Y \mid Z`,
then :math:`Y` should be more accurately predicted by using both
:math:`X` and :math:`Z` as covariates as opposed to only using
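
The regression comparison motivating FCIT can be sketched in a few lines. This is an illustrative sketch only, assuming decision-tree regressors and a paired one-sided t-test on held-out squared errors; the function name and details are hypothetical and differ from hyppo's actual implementation.

import numpy as np
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

def fcit_sketch(x, y, z):
    # Compare held-out errors when predicting y from (x, z) versus z alone.
    xz = np.hstack([x, z])
    errs = []
    for feats in (xz, z):
        f_tr, f_te, y_tr, y_te = train_test_split(feats, y, random_state=0)
        model = DecisionTreeRegressor(random_state=0).fit(f_tr, y_tr)
        errs.append((model.predict(f_te) - y_te.ravel()) ** 2)
    # One-sided test of whether adding x reduces the prediction error.
    tstat, p_two = ttest_rel(errs[1], errs[0])
    return tstat, p_two / 2 if tstat > 0 else 1 - p_two / 2

A small p-value suggests x carries predictive information about y beyond z, i.e. evidence against conditional independence.
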
16 changes: 10 additions & 6 deletions hyppo/conditional/kci.py
@@ -10,24 +10,30 @@ class KCI(ConditionalIndependenceTest):
r"""
Kernel Conditional Independence Test Statistic and P-Value.

This is a conditional independence test utilizing a radial basis
function to calculate the kernels of two datasets. The trace
of the normalized matrix product is then calculated to extract the test
statistic. A Gaussian distribution is then utilized to calculate
the p-value given the statistic and approximate mean and variance
of the trace values of the independent kernel matrices.
Like similar kernel-based tests, this test is consistent.

Notes
-----
The statistic is computed as follows :footcite:p:`10.5555/3020548.3020641`:

Let :math:`x` be an :math:`(n, p)` sample
of random variables :math:`X` and let :math:`y` be an :math:`(n, 1)`
vector of labels of sample classes :math:`Y`. We can then generate
:math:`Kx` and :math:`Ky` kernel matrices for each of the respective
:math:`K^x` and :math:`K^y` kernel matrices for each of the respective
samples. Normalizing, multiplying, and taking the trace of these
kernel matrices gives the resulting test statistic.
The p-value and null distribution for the corrected statistic are calculated using a
gamma distribution approximation.

References
----------
.. footbibliography::
"""

def __init__(self, **kwargs):
@@ -110,9 +116,7 @@ def test(self, x, y):
stat = self.statistic(x, y)

mean_appr = (np.trace(Kx) * np.trace(Ky)) / T
var_appr = (
2 * np.trace(Kx @ Kx) * np.trace(Ky @ Ky) / T**2
)
var_appr = 2 * np.trace(Kx @ Kx) * np.trace(Ky @ Ky) / T**2
k_appr = mean_appr**2 / var_appr
theta_appr = var_appr / mean_appr
pvalue = 1 - np.mean(gamma.cdf(stat, k_appr, theta_appr))
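
For readers of the hunk above, the gamma approximation can be written out as a compact, self-contained sketch. The RBF kernel from scikit-learn, the explicit centering, and the name kci_gamma_pvalue are illustrative assumptions, not hyppo's exact internals.

import numpy as np
from scipy.stats import gamma
from sklearn.metrics.pairwise import rbf_kernel

def kci_gamma_pvalue(x, y):
    n = x.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n  # centering matrix
    Kx = H @ rbf_kernel(x) @ H           # centered kernel matrix of x
    Ky = H @ rbf_kernel(y) @ H           # centered kernel matrix of y
    stat = np.trace(Kx @ Ky)             # trace of the matrix product
    mean_appr = np.trace(Kx) * np.trace(Ky) / n
    var_appr = 2 * np.trace(Kx @ Kx) * np.trace(Ky @ Ky) / n**2
    k_appr = mean_appr**2 / var_appr     # gamma shape
    theta_appr = var_appr / mean_appr    # gamma scale (passed as scale=)
    return stat, 1 - gamma.cdf(stat, k_appr, scale=theta_appr)
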
6 changes: 3 additions & 3 deletions hyppo/d_variate/dhsic.py
@@ -13,11 +13,11 @@ class dHsic(DVariateTest):
dHsic is a non-parametric kernel-based independence test between an
arbitrary number of variables. The dHsic statistic is 0 if the variables
are jointly independent and positive if the variables are dependent
:footcite:p:`grettonKernelJointIndependence2016`.
:footcite:p:`pfister2018kernel`.
The default choice is the Gaussian kernel with the median distance
as its bandwidth, a characteristic kernel that guarantees that
dHsic is a consistent test
:footcite:p:`grettonKernelJointIndependence2016`
:footcite:p:`pfister2018kernel`
:footcite:p:`grettonKernelStatisticalTest2007`
:footcite:p:`grettonConsistentNonparametricTests2010`.

@@ -47,7 +47,7 @@ class dHsic(DVariateTest):
Notes
-----
The statistic can be derived as follows
:footcite:p:`grettonKernelJointIndependence2016`:
:footcite:p:`pfister2018kernel`:

dHsic builds on the two-variable Hilbert Schmidt Independence Criterion (Hsic),
implemented in :class:`hyppo.independence.Hsic`, but allows for an arbitrary
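
The statistic described above is short enough to sketch from scratch. The following illustrative implementation uses the median-distance Gaussian kernel; the function names are hypothetical, and the resampled null used for the p-value is omitted.

import numpy as np
from scipy.spatial.distance import pdist, squareform

def gaussian_kernel(x):
    # Gaussian kernel with the median pairwise distance as bandwidth.
    dists = squareform(pdist(x))
    med = np.median(dists[dists > 0])
    return np.exp(-dists**2 / (2 * med**2))

def dhsic_stat(*samples):
    # Empirical dHsic over d samples, each of shape (n, p_k).
    kernels = [gaussian_kernel(s) for s in samples]
    term1 = np.mean(np.prod(kernels, axis=0))     # mean of entrywise products
    term2 = np.prod([K.mean() for K in kernels])  # product of grand means
    term3 = 2 * np.mean(np.prod([K.mean(axis=1) for K in kernels], axis=0))
    return term1 + term2 - term3

The population value is zero under joint independence and positive otherwise, matching the description above.
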
1 change: 1 addition & 0 deletions hyppo/independence/friedman_rafsky.py
@@ -17,6 +17,7 @@ class FRTestOutput(NamedTuple):
class FriedmanRafsky(IndependenceTest):
r"""
Friedman-Rafsky (FR) test statistic and p-value.

This is a multivariate extension of the Wald-Wolfowitz
runs test for randomness. The normal concept of a 'run'
is replaced by a minimum spanning tree (MST) calculated between
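
Concretely, the MST-based run count can be computed with scipy. This is an illustrative sketch, not hyppo's implementation: removing every MST edge that joins points from different samples leaves cross + 1 subtrees, the multivariate analogue of the number of runs.

import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import cdist

def fr_runs(x, y):
    # MST over the pooled sample; count edges that join the two samples.
    pooled = np.vstack([x, y])
    labels = np.r_[np.zeros(len(x)), np.ones(len(y))]
    mst = minimum_spanning_tree(cdist(pooled, pooled)).tocoo()
    cross = sum(labels[i] != labels[j] for i, j in zip(mst.row, mst.col))
    return cross + 1  # removing the cross edges leaves this many subtrees

Few runs (few cross edges) indicate the samples are well separated, i.e. evidence against equal distributions.
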
3 changes: 3 additions & 0 deletions hyppo/independence/kmerf.py
@@ -43,6 +43,9 @@ class KMERF(IndependenceTest):

Notes
-----
.. note::
This algorithm is currently under review at a peer-reviewed journal.

A description of KMERF in greater detail can be found in
:footcite:p:`shenLearningInterpretableCharacteristic2020`. It is computed
using the following steps:
5 changes: 5 additions & 0 deletions hyppo/independence/max_margin.py
@@ -72,6 +72,11 @@ class MaxMargin(IndependenceTest):
**kwargs
Arbitrary keyword arguments for ``compute_distkern``.

Notes
-----
.. note::
This algorithm is currently under review at a peer-reviewed journal.

References
----------
.. footbibliography::
20 changes: 11 additions & 9 deletions hyppo/kgof/fssd.py
@@ -135,8 +135,10 @@ class FSSD(GofTest):
and a set of paired test locations. The statistic is n*FSSD^2.
The statistic can be negative because of the unbiased estimator.

:math:`H0` : the sample follows :math:`p`
:math:`H1` : the sample does not follow :math:`p`
.. math::

H_0 &: \text{ the sample follows } p \\
H_A &: \text{ the sample does not follow } p

:math:`p` is specified to the constructor in the form of an UnnormalizedDensity.

@@ -147,17 +149,17 @@
density, the GoF test tests whether or not the sample :math:`\{ \mathbf{x}_i \}_{i=1}^n`
is distributed according to a known :math:`p`.

The implemented test relies on a new test statistic called The Finite-Set Stein Discrepancy (FSSD)
The implemented test relies on a new test statistic called the Finite-Set Stein Discrepancy (FSSD) :footcite:p:`jitkrittum2017linear`,
which is a discrepancy measure between a density and a sample. Unique features of the new goodness-of-fit test are:

It makes only a few mild assumptions on the distributions :math:`p` and :math:`q`. The model :math:`p`
can take almost any form. The normalizer of :math:`p` is not assumed known. The test only assesses the goodness of
:math:`p` through :math:`\nabla_{\mathbf{x}} \log p(\mathbf{x})` i.e., the first derivative of the log density.
* It makes only a few mild assumptions on the distributions :math:`p` and :math:`q`. The model :math:`p`
can take almost any form. The normalizer of :math:`p` is not assumed known. The test only assesses the goodness of
:math:`p` through :math:`\nabla_{\mathbf{x}} \log p(\mathbf{x})` i.e., the first derivative of the log density.

The runtime complexity of the full test (both parameter tuning and the actual test) is
:math:`\mathcal{O}(n)` i.e., linear in the sample size.
* The runtime complexity of the full test (both parameter tuning and the actual test) is
:math:`\mathcal{O}(n)` i.e., linear in the sample size.

It returns a set of points (features) which indicate where :math:`p` fails to fit the data.
* It returns a set of points (features) which indicate where :math:`p` fails to fit the data.

The FSSD test requires that the derivative of :math:`\log p` exists.
The test requires a technical condition called the "vanishing boundary" condition for it to be consistent.
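
To make the role of the score function and test locations concrete, here is an illustrative (biased) estimator of FSSD^2 for a model given through its score function; hyppo uses the unbiased estimator and reports n*FSSD^2 as the statistic, and every name below is hypothetical.

import numpy as np

def fssd_squared(x, score_fn, V, sigma=1.0):
    # x: (n, d) sample; score_fn(x): (n, d) values of grad_x log p(x);
    # V: (J, d) test locations; Gaussian kernel with bandwidth sigma.
    n, d = x.shape
    J = V.shape[0]
    S = score_fn(x)
    xi = np.zeros((n, d, J))
    for j in range(J):
        diff = V[j] - x                                        # (n, d)
        k = np.exp(-np.sum(diff**2, axis=1) / (2 * sigma**2))  # k(x_i, v_j)
        grad_k = diff / sigma**2 * k[:, None]                  # grad_x k(x, v_j)
        xi[:, :, j] = S * k[:, None] + grad_k                  # Stein witness features
    tau = xi.reshape(n, d * J)
    return np.sum(tau.mean(axis=0) ** 2) / (d * J)

For a standard normal model, score_fn is simply lambda x: -x.
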
3 changes: 3 additions & 0 deletions hyppo/ksample/ksamp.py
@@ -56,6 +56,9 @@ class KSample(KSampleTest):

Notes
-----
.. note::
This algorithm is currently under review at a peer-reviewed journal.

The formulation for this implementation is as follows
:footcite:p:`pandaNonparMANOVAIndependence2021`:

3 changes: 3 additions & 0 deletions hyppo/time_series/dcorrx.py
@@ -55,6 +55,9 @@ class DcorrX(TimeSeriesTest):

Notes
-----
.. note::
This algorithm is currently a preprint on arXiv.

The statistic can be derived as follows
:footcite:p:`mehtaIndependenceTestingMultivariate2020`:

3 changes: 3 additions & 0 deletions hyppo/time_series/mgcx.py
@@ -55,6 +55,9 @@ class MGCX(TimeSeriesTest):

Notes
-----
.. note::
This algorithm is currently a preprint on arXiv.

The statistic can be derived as follows
:footcite:p:`mehtaIndependenceTestingMultivariate2020`:

8 changes: 8 additions & 0 deletions tutorials/independence.py
@@ -262,6 +262,10 @@
#
# .. note::
#
# This algorithm is currently under review at a peer-reviewed journal.
#
# .. note::
#
# :Pros: - Highly accurate, powerful independence test for multivariate and nonlinear
# data
# - Gives information about relative dimension (or feature) importance
@@ -325,6 +329,10 @@
#
# .. note::
#
# This algorithm is currently under review at a peer-reviewed journal.
#
# .. note::
#
# :Pros: - As powerful as some of the tests within this module
# - Minimal decrease in testing power as dimension increases
# :Cons: - Adds computational complexity, so can be slow
4 changes: 4 additions & 0 deletions tutorials/ksample.py
@@ -76,6 +76,10 @@
#
# .. note::
#
# This algorithm is currently under review at a peer-reviewed journal.
#
# .. note::
#
# If you want use 2-sample MGC, we have added that functionality to SciPy!
# Please see :func:`scipy.stats.multiscale_graphcorr`.
#
4 changes: 4 additions & 0 deletions tutorials/time_series.py
@@ -63,6 +63,10 @@
#
# .. note::
#
# This algorithm is currently a preprint on arXiv.
#
# .. note::
#
# :Pros: - Very accurate
# - Operates on multivariate data
# :Cons: - Slower than pairwise Pearson's correlation