Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EOFError #700

Open
drbenvincent opened this issue Jul 27, 2023 · 6 comments
Open

EOFError #700

drbenvincent opened this issue Jul 27, 2023 · 6 comments

Comments

@drbenvincent
Copy link

drbenvincent commented Jul 27, 2023

I came across a strange bug. Here's a minimum (not) working example to replicate the problem.

This is with Bambi 0.12.0 and pandas 2.0.3. At first I thought it might be a pandas 2 problem, but the same error occurs under pandas 1.5.3.

Couldn't find anything about EOFError in any open or closed issues, so thought I'd submit this possible bug report.

EDIT: Error occurs when N=10_000, but not when N=1_000.

import numpy as np
import pandas as pd
import bambi as bmb

# generate data
N = 10_000
S = np.random.random(N)
Q = 0.2*S + 0.67*np.random.random(N)
X = 0.14*Q + 0.4*np.random.random(N)
Y = 0.7*X + 0.11*Q + 0.32*S + 0.24*np.random.random(N)
P = 0.43*X + 0.21*Y + 0.22*np.random.random(N)
df = pd.DataFrame(np.vstack([S, Q, X, Y, P]).T, columns=["S", "Q", "X", "Y", "P"])

# fit with Bambi
model = bmb.Model('Y ~ S + Q + X + P', df)
results = model.fit()

I get the following error:

---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
Cell In[14], line 10
      8 df = pd.DataFrame(np.vstack([S, Q, X, Y, P]).T, columns=["S", "Q", "X", "Y", "P"])
      9 model = bmb.Model('Y ~ S + Q + X + P', df)
---> 10 results = model.fit()

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/bambi/models.py:325, in Model.fit(self, draws, tune, discard_tuned_samples, omit_offsets, include_mean, inference_method, init, n_init, chains, cores, random_seed, **kwargs)
    318     response = self.components[self.response_name]
    319     _log.info(
    320         "Modeling the probability that %s==%s",
    321         response.response_term.name,
    322         str(response.response_term.success),
    323     )
--> 325 return self.backend.run(
    326     draws=draws,
    327     tune=tune,
    328     discard_tuned_samples=discard_tuned_samples,
    329     omit_offsets=omit_offsets,
    330     include_mean=include_mean,
    331     inference_method=inference_method,
    332     init=init,
    333     n_init=n_init,
    334     chains=chains,
    335     cores=cores,
    336     random_seed=random_seed,
    337     **kwargs,
    338 )

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/bambi/backend/pymc.py:96, in PyMCModel.run(self, draws, tune, discard_tuned_samples, omit_offsets, include_mean, inference_method, init, n_init, chains, cores, random_seed, **kwargs)
     94 # NOTE: Methods return different types of objects (idata, approximation, and dictionary)
     95 if inference_method in ["mcmc", "nuts_numpyro", "nuts_blackjax"]:
---> 96     result = self._run_mcmc(
     97         draws,
     98         tune,
     99         discard_tuned_samples,
    100         omit_offsets,
    101         include_mean,
    102         init,
    103         n_init,
    104         chains,
    105         cores,
    106         random_seed,
    107         inference_method,
    108         **kwargs,
    109     )
    110 elif inference_method == "vi":
    111     result = self._run_vi(**kwargs)

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/bambi/backend/pymc.py:172, in PyMCModel._run_mcmc(self, draws, tune, discard_tuned_samples, omit_offsets, include_mean, init, n_init, chains, cores, random_seed, sampler_backend, **kwargs)
    170 if sampler_backend == "mcmc":
    171     try:
--> 172         idata = pm.sample(
    173             draws=draws,
    174             tune=tune,
    175             discard_tuned_samples=discard_tuned_samples,
    176             init=init,
    177             n_init=n_init,
    178             chains=chains,
    179             cores=cores,
    180             random_seed=random_seed,
    181             **kwargs,
    182         )
    183     except (RuntimeError, ValueError):
    184         if (
    185             "ValueError: Mass matrix contains" in traceback.format_exc()
    186             and init == "auto"
    187         ):

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/pymc/sampling/mcmc.py:766, in sample(draws, tune, chains, cores, random_seed, progressbar, step, nuts_sampler, initvals, init, jitter_max_retries, n_init, trace, discard_tuned_samples, compute_convergence_checks, keep_warning_stat, return_inferencedata, idata_kwargs, nuts_sampler_kwargs, callback, mp_ctx, model, **kwargs)
    764 _print_step_hierarchy(step)
    765 try:
--> 766     _mp_sample(**sample_args, **parallel_args)
    767 except pickle.PickleError:
    768     _log.warning("Could not pickle model, sampling singlethreaded.")

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/pymc/sampling/mcmc.py:1155, in _mp_sample(draws, tune, step, chains, cores, random_seed, start, progressbar, traces, model, callback, mp_ctx, **kwargs)
   1153 try:
   1154     with sampler:
-> 1155         for draw in sampler:
   1156             strace = traces[draw.chain]
   1157             strace.record(draw.point, draw.stats)

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/pymc/sampling/parallel.py:448, in ParallelSampler.__iter__(self)
    445     self._progress.update(self._total_draws)
    447 while self._active:
--> 448     draw = ProcessAdapter.recv_draw(self._active)
    449     proc, is_last, draw, tuning, stats = draw
    450     self._total_draws += 1

File ~/mambaforge/envs/pymc_env/lib/python3.11/site-packages/pymc/sampling/parallel.py:320, in ProcessAdapter.recv_draw(processes, timeout)
    318 idxs = {id(proc._msg_pipe): proc for proc in processes}
    319 proc = idxs[id(ready[0])]
--> 320 msg = ready[0].recv()
    322 if msg[0] == "error":
    323     old_error = msg[1]

File ~/mambaforge/envs/pymc_env/lib/python3.11/multiprocessing/connection.py:249, in _ConnectionBase.recv(self)
    247 self._check_closed()
    248 self._check_readable()
--> 249 buf = self._recv_bytes()
    250 return _ForkingPickler.loads(buf.getbuffer())

File ~/mambaforge/envs/pymc_env/lib/python3.11/multiprocessing/connection.py:413, in Connection._recv_bytes(self, maxsize)
    412 def _recv_bytes(self, maxsize=None):
--> 413     buf = self._recv(4)
    414     size, = struct.unpack("!i", buf.getvalue())
    415     if size == -1:

File ~/mambaforge/envs/pymc_env/lib/python3.11/multiprocessing/connection.py:382, in Connection._recv(self, size, read)
    380 if n == 0:
    381     if remaining == size:
--> 382         raise EOFError
    383     else:
    384         raise OSError("got end of file during message")

EOFError:
@jvparidon
Copy link

There is a similar issue open in the PyMC repo (pymc-devs/pymc#6415) but I'm fairly certain that this is a Bambi issue, because I can create a fairly minimal reprex that fails in Bambi but works just fine if I write it out in PyMC:

import numpy as np
import pandas as pd
import pymc as pm
import bambi as bmb

# 3 columns of 100k random-normal numbers
df = pd.DataFrame(np.random.randn(300000).reshape((-1, 3)), columns=['x1', 'x2', 'y'])

# this PyMC model samples just fine
with pm.Model():
    a = pm.Normal('a', mu=0, sigma=1)
    b1 = pm.Normal('b1', mu=0, sigma=1)
    b2 = pm.Normal('b2', mu=0, sigma=1)
    e = pm.HalfStudentT('e', sigma=1, nu=3)
    yhat = a + b1 * df.x1 + b2 * df.x2
    likelihood = pm.Normal('likelihood', mu=yhat, sigma=e, observed=df.y)
    pm.sample()

# this very similar Bambi model also samples just fine with mp_ctx='forkserver'
bmb.Model('y ~ 1 + x1 + x2', df).fit(mp_ctx='forkserver')

# but this identical Bambi model fails before sampling starts, raising an EOFError
bmb.Model('y ~ 1 + x1 + x2', df).fit()

I've worked out that changing the multiprocessing context from fork to forkserver fixes the problem in Bambi. However I still don't fully understand what's going on here, because the multiprocessing context does not seem to affect the pure PyMC model; I can switch the mp_ctx in the pm.sample call to fork, forkserver, spawn, it all works just fine. It's only the Bambi model that won't work with fork and that seems to be the implicit default for pm.sample.

Having Bambi always pass mp_ctx='forkserver' when it calls pm.sample would fix the issue, so I can open a PR to suggest that change. It would be nice to figure out why this fixes the issue before implementing a fix though.

(@tomicapretto, FYI)

@jvparidon
Copy link

jvparidon commented Sep 23, 2023

As @drbenvincent notes, reducing the dataset size also fixes the problem, so it seems to have something to do with memory allocation for forked processes on M1 chipsets.

There also seems to be a general consensus that using fork on MacOS is unsafe (see https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) but none of that explains why this would affect Bambi models but not pure PyMC models.

@tomicapretto
Copy link
Collaborator

Thanks for the in-depth research! Bambi creates models in a slightly different way, first computing a design matrix, then a vector of coefficients, and finally the dot product between those. Maybe these operations incur some extra memory consumption or other overhead that causes this problem on M1 chipsets.

Are you aware of drawbacks associated with the forkserver option?

@jvparidon
Copy link

What's weird is that I can make the pure PyMC model much, much larger (more data, more random variables, etc.) and it will still work with fork, so it's not purely about the amount of memory used. Something that Bambi is doing is not fork-safe, but only when the model is larger than a certain size. I'll experiment a bit more with the model matrix functions to see if I can isolate it further.

RE: forkserver, I'm not aware of any drawbacks, but admittedly I'm not an expert at all. I think it was created to be a thread-safe alternative to fork, while still being much faster than spawn.

@jvparidon
Copy link

jvparidon commented Sep 26, 2023

Narrowed it down further:

import numpy as np
import pandas as pd
import pymc as pm
from formulae import design_matrices

# 3 columns of 100k random-normal numbers
df = pd.DataFrame(np.random.randn(300000).reshape((-1, 3)), columns=['x1', 'x2', 'y'])

# create design matrix
dm = design_matrices('y ~ 1 + x1 + x2', df)

# fit model with multiplication + summation (this works!)
with pm.Model():
    b = pm.Normal('b', mu=0, sigma=1, shape=3)
    e = pm.HalfStudentT('e', sigma=1, nu=3)
    yhat = (dm.common.design_matrix * b).sum(axis=1)
    likelihood = pm.Normal('likelihood', mu=yhat, sigma=e, observed=df.y)
    trace = pm.sample()

# fit model with pm.math.dot (this does not work!)
with pm.Model():
    b = pm.Normal('b', mu=0, sigma=1, shape=3)
    e = pm.HalfStudentT('e', sigma=1, nu=3)
    yhat = pm.math.dot(dm.common.design_matrix, b)
    likelihood = pm.Normal('likelihood', mu=yhat, sigma=e, observed=df.y)
    trace = pm.sample()

Creating a design matrix with formulae.design_matrices() isn't the thing that breaks under mp_ctx='fork', but computing the dot product using pm.math.dot() does. Computing the dot product using element-wise multiplication and summation works just fine, presumably because that compiles down differently than pm.math.dot().

I can still get the model to sample by reducing the size of the dataset, so it's not like pm.math.dot() doesn't ever run on M1 chipsets at all, but it does seem like it's not thread/fork-safe and should be used with the forkserver multiprocessing context instead.

Edited to add: The dot-product that pm.math.dot() has to compute here is dense, so I think it ends up calling pytensor.tensor.math.dense_dot() which I think calls an instance of pytensor.tensor.math.Dot. I'm not sure what it is in there that isn't thread/fork-safe, maybe memory allocation for calls to BLAS or something like that?

@tomicapretto
Copy link
Collaborator

Thanks for the detailed report! I just shared this with the PyTensor people.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants