Error when running Conditional DCorr #410

Open
adam2392 opened this issue Dec 27, 2023 · 0 comments
Labels: bug (Something isn't working)

@adam2392 (Collaborator)

Reproducing code example:

Download files here: https://www.dropbox.com/scl/fo/iwaer0ai8dnk27mb5obnu/h?rlkey=hcmpmc6mojb7zn5zodl1nmnt2&dl=0

import math
from collections import defaultdict
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyppo.conditional import ConditionalDcorr
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedShuffleSplit

from sktree.stats.utils import (
    METRIC_FUNCTIONS,
    POSITIVE_METRICS,
    POSTERIOR_FUNCTIONS,
    REGRESSOR_METRICS,
    _compute_null_distribution_coleman,
    _non_nan_samples,
)

seed = 12345
rng = np.random.default_rng(seed)

def _run_parallel_posterior_sim(
    idx,
    n_samples,
    n_features,
    class_probs,
    seed,
    n_features_2,
    test_size,
    max_fpr,
    sim_type,
):
    n_jobs = 1
    results = defaultdict(list)
    rng = np.random.default_rng(seed)
    n_features_ends = [100, None]

    if sim_type == "confounder":
        npy_data = np.load(
            f"/Users/adam2392/Desktop/cancer/confounder/confounder_{idx}.npz"
        )
    elif sim_type == "collider":
        npy_data = np.load(
            f"/Users/adam2392/Desktop/cancer/collider/collider_{idx}.npz"
        )
    elif sim_type == "mediator":
        npy_data = np.load(
            f"/Users/adam2392/Desktop/cancer/mediator/mediator_{idx}.npz"
        )
    elif sim_type == "direct-indirect":
        npy_data = np.load(
            f"/Users/adam2392/Desktop/cancer/direct-indirect/direct-indirect_{idx}.npz"
        )
    elif sim_type == "independent":
        npy_data = np.load(
            f"/Users/adam2392/Desktop/cancer/independent/independent_{idx}.npz"
        )

    X = npy_data["X"]
    y = npy_data["y"]
    # print(X.shape, y.shape)

    X = X[:, : 100 + n_features_2]
    if n_samples < X.shape[0]:
        cv = StratifiedShuffleSplit(n_splits=1, train_size=n_samples)
        for train_idx, _ in cv.split(X, y):
            continue
        X = X[train_idx, :]
        y = y[train_idx, ...].squeeze()
    assert len(X) == len(y)
    assert len(y) == n_samples
    n_features_ends[1] = X.shape[1]

    posteriors_dict = dict()

    # now compute the pvalue when shuffling X2
    covariate_index = np.arange(n_features_ends[0], n_features_ends[1])

    # split X into the conditioning set Z (second view) and the remaining features
    Z = X[:, covariate_index]
    mask_array = np.ones(X.shape[1])
    mask_array[covariate_index] = 0
    mask_array = mask_array.astype(bool)
    X_minus_Z = X[:, mask_array]

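    # conditional distance correlation test of the first view against y, conditioning on Z
    # (the second view), with the kernel bandwidth chosen by Silverman's rule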
    cdcorr = ConditionalDcorr(bandwidth="silverman")

    print(Z, np.var(Z))
    # print(X_minus_Z, np.var(X_minus_Z))
    print(Z.shape, X_minus_Z.shape, X.shape, y.shape)
    try:
        cdcorr_stat, cdcorr_pvalue = cdcorr.test(
            x=X_minus_Z.copy().astype(np.float64),
            y=y.copy().astype(np.float64),
            z=Z.copy().astype(np.float64),
        )
    except Exception as e:
        errmsg = f"{idx, n_samples, n_features, n_features_2, np.var(Z), X_minus_Z.shape, y.shape, Z.shape}"
        e.args += (errmsg,)
        raise (e)

    np.savez(
        f"./varying-samples/{sim_type}/conddcorr_{n_samples}_{n_features_2}_{idx}.npz",
        n_samples=n_samples,
        n_features_2=n_features_2,
        y_true=y,
        cdcorr_state=cdcorr_stat,
        cdcorr_pvalue=cdcorr_pvalue,
    )
    # results["cdcorr_pvalue_x2"].append(cdcorr_pvalue)
    # results["cdcorr_stat_x2"].append(cdcorr_stat)

    # results["mvrf_posteriors"].append(comight_posteriors_x2)
    # results["mvrf_null_posteriors"].append(comight_null_posteriors_x2)
    return results

# number of features in the first view
n_features = 10
noise_dims = 90

n_samples = 256
max_features = 0.3
n_jobs = -1
test_size = 0.2

max_fpr = 0.1

# number of features in the second view
pows = np.arange(2, 13, dtype=int)
n_features_2_list = [0] + [2**pow for pow in pows]

n_features_2_list = [2**pow for pow in pows]
print(n_features_2_list)

class_probs = [0.5, 0.5]
n_repeats = 10  # assumed value; set to the number of pre-generated simulation files per setting

_results_ind = Parallel(n_jobs=-1)(
    delayed(_run_parallel_posterior_sim)(
        idx_,
        n_samples,
        n_features,
        class_probs,
        seed,
        n_features_2_,
        test_size,
        max_fpr,
        "independent",
    )
    for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
)
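
For reference, here is a rough standalone sketch that exercises the same call path on synthetic data with the same shapes as the failing run (256 samples, a 100-column first view, a 512-column conditioning set), so the problem can hopefully be reproduced without downloading the Dropbox files. Only the shapes and the Silverman bandwidth are carried over from the run above; whether it raises the identical error may depend on the data, so treat it as an approximation of the repro rather than a verified one:

import numpy as np
from hyppo.conditional import ConditionalDcorr

rng = np.random.default_rng(12345)
n, d_x, d_z = 256, 100, 512  # shapes from the failing configuration above

x = rng.standard_normal((n, d_x))
y = rng.integers(0, 2, size=n).astype(np.float64)  # balanced binary labels, as in the simulations
z = rng.standard_normal((n, d_z))  # high-dimensional conditioning set

cdcorr = ConditionalDcorr(bandwidth="silverman")
stat, pvalue = cdcorr.test(x=x, y=y, z=z)
print(stat, pvalue)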

Error message

---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/Users/adam2392/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
  File "/Users/adam2392/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/Users/adam2392/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
  File "/Users/adam2392/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
  File "/var/folders/6_/sl83qtkd68x3_mvfys07_6qm0000gn/T/ipykernel_73186/1504525807.py", line 79, in _run_parallel_posterior_sim
  File "/var/folders/6_/sl83qtkd68x3_mvfys07_6qm0000gn/T/ipykernel_73186/1504525807.py", line 71, in _run_parallel_posterior_sim
  File "/Users/adam2392/Documents/hyppo/hyppo/conditional/cdcorr.py", line 181, in test
    stat, pvalue, null_dist = perm_test(
  File "/Users/adam2392/Documents/hyppo/hyppo/tools/common.py", line 613, in perm_test
    stat = calc_stat(x, y, z)
  File "/Users/adam2392/Documents/hyppo/hyppo/conditional/cdcorr.py", line 112, in statistic
    x, y, z = check_input()
  File "/Users/adam2392/Documents/hyppo/hyppo/conditional/_utils.py", line 24, in __call__
    self._check_variance()
  File "/Users/adam2392/Documents/hyppo/hyppo/conditional/_utils.py", line 88, in _check_variance
    raise ValueError(f"Test cannot be run, one of the inputs has 0 variance {np.var(self.x)}, {np.var(self.y)}, {np.var(self.z)}, {self.z}, {self.z.shape}")
ValueError: ('Test cannot be run, one of the inputs has 0 variance 3.309443477874101, 0.25, 0.0, [[5.77545356e-203 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000\n  0.00000000e+000 0.00000000e+000]\n [0.00000000e+000 5.77545356e-203 0.00000000e+000 ... 0.00000000e+000\n  0.00000000e+000 0.00000000e+000]\n [0.00000000e+000 0.00000000e+000 5.77545356e-203 ... 0.00000000e+000\n  0.00000000e+000 0.00000000e+000]\n ...\n [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 5.77545356e-203\n  0.00000000e+000 0.00000000e+000]\n [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000\n  5.77545356e-203 0.00000000e+000]\n [0.00000000e+000 0.00000000e+000 0.00000000e+000 ... 0.00000000e+000\n  0.00000000e+000 5.77545356e-203]], (256, 256)', '(0, 256, 10, 512, 1.0062427978699584, (256, 100), (256,), (256, 512))')
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In[34], line 1
----> 1 _results_ind = Parallel(n_jobs=-1)(
      2     delayed(_run_parallel_posterior_sim)(
      3         idx_,
      4         n_samples,
      5         n_features,
      6         class_probs,
      7         seed,
      8         n_features_2_,
      9         test_size,
     10         max_fpr,
     11         "independent",
     12     )
     13     for (idx_, n_features_2_) in product(range(n_repeats), n_features_2_list)
     14 )

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:1952, in Parallel.__call__(self, iterable)
   1946 # The first item from the output is blank, but it makes the interpreter
   1947 # progress until it enters the Try/Except block of the generator and
   1948 # reach the first `yield` statement. This starts the aynchronous
   1949 # dispatch of the tasks to the workers.
   1950 next(output)
-> 1952 return output if self.return_generator else list(output)

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:1595, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1592     yield
   1594     with self._backend.retrieval_context():
-> 1595         yield from self._retrieve()
   1597 except GeneratorExit:
   1598     # The generator has been garbage collected before being fully
   1599     # consumed. This aborts the remaining tasks if possible and warn
   1600     # the user if necessary.
   1601     self._exception = True

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:1699, in Parallel._retrieve(self)
   1692 while self._wait_retrieval():
   1693 
   1694     # If the callback thread of a worker has signaled that its task
   1695     # triggered an exception, or if the retrieval loop has raised an
   1696     # exception (e.g. `GeneratorExit`), exit the loop and surface the
   1697     # worker traceback.
   1698     if self._aborting:
-> 1699         self._raise_error_fast()
   1700         break
   1702     # If the next job is not ready for retrieval yet, we just wait for
   1703     # async callbacks to progress.

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:1734, in Parallel._raise_error_fast(self)
   1730 # If this error job exists, immediatly raise the error by
   1731 # calling get_result. This job might not exists if abort has been
   1732 # called directly or if the generator is gc'ed.
   1733 if error_job is not None:
-> 1734     error_job.get_result(self.timeout)

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:736, in BatchCompletionCallBack.get_result(self, timeout)
    730 backend = self.parallel._backend
    732 if backend.supports_retrieve_callback:
    733     # We assume that the result has already been retrieved by the
    734     # callback thread, and is stored internally. It's just waiting to
    735     # be returned.
--> 736     return self._return_or_raise()
    738 # For other backends, the main thread needs to run the retrieval step.
    739 try:

File ~/miniforge3/envs/sktree/lib/python3.9/site-packages/joblib/parallel.py:754, in BatchCompletionCallBack._return_or_raise(self)
    752 try:
    753     if self.status == TASK_ERROR:
--> 754         raise self._result
    755     return self._result
    756 finally:

ValueError: ('Test cannot be run, one of the inputs has 0 variance
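
A guess at the mechanism, based only on what the message prints: np.var(self.z) is 0.0, and self.z is a 256 x 256 matrix that is zero everywhere except for ~5.78e-203 on the diagonal. With a 512-dimensional z, the normalization constant of a product Gaussian kernel, (2 * pi * h**2) ** (-d / 2), is already around 1e-200 at distance zero, the off-diagonal kernel values underflow to exactly 0.0 in float64, and squaring the remaining ~1e-203 diagonal entries underflows as well, so the variance check sees exactly zero. A numpy-only sketch of that underflow (it does not touch any hyppo internals and only assumes the weights come from a Gaussian kernel with a Silverman-style bandwidth):

import numpy as np

n, d = 256, 512           # shapes from the failing run
h = 0.98                  # roughly a Silverman-rule bandwidth for standardized 512-dim data

# kernel value at distance zero: just the normalization constant of a d-dimensional Gaussian
k_diag = (2 * np.pi * h**2) ** (-d / 2)
print(k_diag)             # ~1e-200, the same order as the 5.775e-203 diagonal in the error

# a typical off-diagonal value (expected squared distance between two standard-normal points is 2*d)
k_off = k_diag * np.exp(-2.0 * d / (2 * h**2))
print(k_off)              # 0.0 -- underflows in float64

# the weight matrix is therefore numerically diagonal, and squaring its ~1e-203 entries
# underflows too, so the variance of the whole matrix comes out as exactly 0.0
K = np.eye(n) * k_diag
print(np.var(K))          # 0.0, which is what the _check_variance guard rejects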

Version information

adam2392 added the bug label on Dec 27, 2023