Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Improved performance of the ConditionalMNLogit class #9036

Open
wants to merge 3 commits into
base: main
Choose a base branch
from

Conversation

quant12345
Copy link
Contributor

@quant12345 quant12345 commented Oct 22, 2023

  • tests added / passed.
  • code/documentation is well formatted.
  • properly formatted commit message. See NumPy's guide.

ENH - ConditionalMNLogit class: in the loglike method, the nested loop is replaced with NumPy vector operations. In the score method, the variable v is computed all at once for the selected values (exp_sum); the index 'p' is used to access them.
As a result, test_conditional_mnlogit_grad runs 2.7 times faster, and test_conditional_mnlogit_2d and test_conditional_mnlogit_3d run 1.7 times faster.

Below is a description of how to compare the old and new options.

I separated test_conditional_, conditional_models_ into separate files so that I could run the old and new options locally together for comparison (located them in a common directory). In conditional_models there are two classes ConditionalMNLogit and ConditionalMNLogit_new - in which I made improvements. In tests, the module is connected directly to use both options for tests. n = 90 is passed to gen_mnlogit(n)

Below is the test_conditional_ code. The conditional_models_ code is not posted due to line limitations, I am attaching it as an attachment.

test_conditional_ - code
import numpy as np
from conditional_models_ import ConditionalMNLogit, ConditionalMNLogit_new
from statsmodels.tools.numdiff import approx_fprime
from numpy.testing import assert_allclose
import pandas as pd


def gen_mnlogit(n):
    """Simulate grouped data with a three-category outcome.

    Returns a DataFrame with outcome ``y`` (coded 0/1/2), covariates
    ``x1``/``x2`` and group labels ``g``.  The draw order of the random
    variates is fixed so results are reproducible for a given ``n``.
    """
    np.random.seed(235)

    groups = np.kron(np.ones(5), np.arange(n // 5))
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    design = np.column_stack((x1, x2))
    coefs = np.array([[0, 1, -1], [0, 2, -1]])
    linpred = np.dot(design, coefs)
    probs = np.exp(linpred)
    probs /= probs.sum(1)[:, None]
    cum_probs = probs.cumsum(1)
    y = 2 * np.ones(n)
    draws = np.random.uniform(size=n)
    # Invert the CDF: assign the smallest category whose cumulative
    # probability exceeds the uniform draw.
    for level in (2, 1, 0):
        y[draws < cum_probs[:, level]] = level

    return pd.DataFrame({"y": y, "x1": x1,
                         "x2": x2, "g": groups})


def test_conditional_mnlogit_grad_new(n):
    """Check the analytic score of the new implementation against
    numeric gradients at several random parameter points."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)

    for _ in range(5):
        params = np.random.normal(size=4)
        analytic = model.score(params)
        numeric = approx_fprime(params, model.loglike)
        assert_allclose(analytic, numeric, rtol=1e-5, atol=1e-3)


def test_conditional_mnlogit_grad(n):
    """Check the analytic score of the original implementation against
    numeric gradients at several random parameter points."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)

    for _ in range(5):
        params = np.random.normal(size=4)
        analytic = model.score(params)
        numeric = approx_fprime(params, model.loglike)
        assert_allclose(analytic, numeric, rtol=1e-5, atol=1e-3)


def test_conditional_mnlogit_2d_new(n):
    """Regression test on fitted params/bse for the new implementation
    with two covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.75592035, -1.58565494],
                                  [1.82919869, -1.32594231]])
    expected_bse = np.asarray([[0.68099698, 0.70142727],
                               [0.65190315, 0.59653771]])
    assert_allclose(result.params, expected_params, rtol=1e-5, atol=1e-5)
    assert_allclose(result.bse, expected_bse, rtol=1e-5, atol=1e-5)


def test_conditional_mnlogit_2d(n):
    """Regression test on fitted params/bse for the original
    implementation with two covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.75592035, -1.58565494],
                                  [1.82919869, -1.32594231]])
    expected_bse = np.asarray([[0.68099698, 0.70142727],
                               [0.65190315, 0.59653771]])
    assert_allclose(result.params, expected_params, rtol=1e-5, atol=1e-5)
    assert_allclose(result.bse, expected_bse, rtol=1e-5, atol=1e-5)


def test_conditional_mnlogit_3d_new(n):
    """Regression and smoke test for the new implementation with three
    covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    df["x3"] = np.random.normal(size=df.shape[0])
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.729629, -1.633673],
                                  [1.879019, -1.327163],
                                  [-0.114124, -0.109378]])
    expected_bse = np.asarray([[0.682965, 0.60472],
                               [0.672947, 0.42401],
                               [0.722631, 0.33663]])
    assert_allclose(result.params, expected_params, atol=1e-5, rtol=1e-5)
    assert_allclose(result.bse, expected_bse, atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()


def test_conditional_mnlogit_3d(n):
    """Regression and smoke test for the original implementation with
    three covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    df["x3"] = np.random.normal(size=df.shape[0])
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.729629, -1.633673],
                                  [1.879019, -1.327163],
                                  [-0.114124, -0.109378]])
    expected_bse = np.asarray([[0.682965, 0.60472],
                               [0.672947, 0.42401],
                               [0.722631, 0.33663]])
    assert_allclose(result.params, expected_params, atol=1e-5, rtol=1e-5)
    assert_allclose(result.bse, expected_bse, atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()

import datetime

n = 90

# (new implementation, old implementation, label) triples to benchmark.
arr = [(test_conditional_mnlogit_grad_new,
        test_conditional_mnlogit_grad,
        'test_conditional_mnlogit_grad'),
       (test_conditional_mnlogit_2d_new,
        test_conditional_mnlogit_2d,
        'test_conditional_mnlogit_2d'),
       (test_conditional_mnlogit_3d_new,
        test_conditional_mnlogit_3d,
        'test_conditional_mnlogit_3d')]

for new_fn, old_fn, label in arr:
    print(label)  # name of the called function

    start = datetime.datetime.now()
    old_fn(n)
    time_old = datetime.datetime.now() - start

    start = datetime.datetime.now()
    new_fn(n)
    time_new = datetime.datetime.now() - start

    print(f'time_old = {time_old}, time_new = {time_new}, '
          f'time_old/time_new = {time_old/time_new}')

    print()

conditional_models_.py.tar.gz

Output:

n = 90

test_conditional_mnlogit_grad 
time_old = 0:00:00.680232, time_new = 0:00:00.236433, time_old/time_new = 2.8770603088401367
test_conditional_mnlogit_2d
time_old = 0:00:01.908332, time_new = 0:00:01.057139, time_old/time_new = 1.8051855054065737
test_conditional_mnlogit_3d
time_old = 0:00:02.635174, time_new = 0:00:01.468636, time_old/time_new = 1.7943002895203441

n = 2000 # with assert commented 

test_conditional_mnlogit_grad 
time_old = 0:00:16.267467, time_new = 0:00:06.169248, time_old/time_new = 2.636863844669561
test_conditional_mnlogit_2d
time_old = 0:00:45.773023, time_new = 0:00:25.878624, time_old/time_new = 1.768757991151307
test_conditional_mnlogit_3d
time_old = 0:00:57.176253, time_new = 0:00:33.577012, time_old/time_new = 1.7028392222631366
**Notes**
  • It is essential that you add a test when making code changes. Tests are not
    needed for doc changes.
  • When adding a new function, test values should usually be verified in another package (e.g., R/SAS/Stata).
  • When fixing a bug, you must add a test that would produce the bug in main and
    then show that it is fixed with the new code.
  • New code additions must be well formatted. Changes should pass flake8. If on Linux or OSX, you can
    verify your changes are well formatted by running
    git diff upstream/main -u -- "*.py" | flake8 --diff --isolated
    
    assuming flake8 is installed. This command is also available on Windows
    using the Windows System for Linux once flake8 is installed in the
    local Linux environment. While passing this test is not required, it is good practice and it helps
    improve code quality in statsmodels.
  • Docstring additions must render correctly, including escapes and LaTeX.

…p with numpy vector operations. In 'score' method, the score of the variable 'v' is calculated at a time for the selected values (exp_sum). The number 'p' is used to access them.
@quant12345
Copy link
Contributor Author

quant12345 commented Oct 23, 2023

@josef-pkt @bashtage could you tell me why this is happening to me: statsmodels.statsmodels - #20231022.1 failed?

logs:

2023-10-22T15:19:44.6287863Z =========================== short test summary info ============================
2023-10-22T15:19:44.6288600Z FAILED statsmodels/imputation/tests/test_mice.py::TestMICEData::test_default
2023-10-22T15:19:44.6289445Z FAILED statsmodels/imputation/tests/test_mice.py::TestMICEData::test_pertmeth
2023-10-22T15:19:44.6290306Z FAILED statsmodels/imputation/tests/test_mice.py::TestMICEData::test_set_imputer
2023-10-22T15:19:44.6291200Z = 3 failed, 17761 passed, 378 skipped, 142 xfailed, 445 warnings in 525.77s (0:08:45) =
2023-10-22T15:19:45.6372656Z ##[error]Bash exited with code '1'.
2023-10-22T15:19:45.6383999Z ##[section]Finishing: Run tests (editable)

I tried making changes to the installed library locally and running the test_conditional tests and they passed successfully.
I also tried to run the test_mice.py tests with the changes and they also passed successfully.

@quant12345 quant12345 changed the title Improved performance of the ConditionalMNLogit class ENH: Improved performance of the ConditionalMNLogit class Dec 1, 2023
Copy link
Member

@jseabold jseabold left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have recently been using these models. In general this looks good to me, if you could change to logsumexp that would be helpful.

denom = np.sum(
np.exp(
np.sum(x[jj, list(itertools.permutations(y))], axis=1)
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could remove the outer sum and exp and use scipy.special.logsumexp below for a little better stability.

            denom = np.sum(x[jj, list(itertools.permutations(y))], axis=1)
            ll += x[(jj, y)].sum() - logsumexp(denom)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jseabold updated the code. I posted the code for the tests in a separate message.

Updated for better stability(sum and exp replaced with scipy.special.logsumexp).
@pep8speaks
Copy link

pep8speaks commented May 14, 2024

Hello @quant12345! Thanks for updating this PR. We checked the lines you've touched for PEP 8 issues, and found:

There are currently no PEP 8 issues detected in this Pull Request. Cheers! 🍻

Comment last updated at 2024-05-14 13:50:47 UTC

@quant12345
Copy link
Contributor Author

quant12345 commented May 14, 2024

Below is the test_conditional_new code. The conditional_models_new code is not posted due to line limitations; I am attaching it as an attachment (using scipy.special.logsumexp).

test_conditional_new - code
import numpy as np
from conditional_models_new import ConditionalMNLogit, ConditionalMNLogit_new
from statsmodels.tools.numdiff import approx_fprime
from numpy.testing import assert_allclose
import pandas as pd


def gen_mnlogit(n):
    """Simulate grouped data with a three-category outcome.

    Returns a DataFrame with outcome ``y`` (coded 0/1/2), covariates
    ``x1``/``x2`` and group labels ``g``.  The draw order of the random
    variates is fixed so results are reproducible for a given ``n``.
    """
    np.random.seed(235)

    groups = np.kron(np.ones(5), np.arange(n // 5))
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    design = np.column_stack((x1, x2))
    coefs = np.array([[0, 1, -1], [0, 2, -1]])
    linpred = np.dot(design, coefs)
    probs = np.exp(linpred)
    probs /= probs.sum(1)[:, None]
    cum_probs = probs.cumsum(1)
    y = 2 * np.ones(n)
    draws = np.random.uniform(size=n)
    # Invert the CDF: assign the smallest category whose cumulative
    # probability exceeds the uniform draw.
    for level in (2, 1, 0):
        y[draws < cum_probs[:, level]] = level

    return pd.DataFrame({"y": y, "x1": x1,
                         "x2": x2, "g": groups})


def test_conditional_mnlogit_grad_new(n):
    """Check the analytic score of the new implementation against
    numeric gradients at several random parameter points."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)

    for _ in range(5):
        params = np.random.normal(size=4)
        analytic = model.score(params)
        numeric = approx_fprime(params, model.loglike)
        assert_allclose(analytic, numeric, rtol=1e-5, atol=1e-3)


def test_conditional_mnlogit_grad(n):
    """Check the analytic score of the original implementation against
    numeric gradients at several random parameter points."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)

    for _ in range(5):
        params = np.random.normal(size=4)
        analytic = model.score(params)
        numeric = approx_fprime(params, model.loglike)
        assert_allclose(analytic, numeric, rtol=1e-5, atol=1e-3)


def test_conditional_mnlogit_2d_new(n):
    """Regression test on fitted params/bse for the new implementation
    with two covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.75592035, -1.58565494],
                                  [1.82919869, -1.32594231]])
    expected_bse = np.asarray([[0.68099698, 0.70142727],
                               [0.65190315, 0.59653771]])
    assert_allclose(result.params, expected_params, rtol=1e-5, atol=1e-5)
    assert_allclose(result.bse, expected_bse, rtol=1e-5, atol=1e-5)


def test_conditional_mnlogit_2d(n):
    """Regression test on fitted params/bse for the original
    implementation with two covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.75592035, -1.58565494],
                                  [1.82919869, -1.32594231]])
    expected_bse = np.asarray([[0.68099698, 0.70142727],
                               [0.65190315, 0.59653771]])
    assert_allclose(result.params, expected_params, rtol=1e-5, atol=1e-5)
    assert_allclose(result.bse, expected_bse, rtol=1e-5, atol=1e-5)


def test_conditional_mnlogit_3d_new(n):
    """Regression and smoke test for the new implementation with three
    covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    df["x3"] = np.random.normal(size=df.shape[0])
    model = ConditionalMNLogit_new.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.729629, -1.633673],
                                  [1.879019, -1.327163],
                                  [-0.114124, -0.109378]])
    expected_bse = np.asarray([[0.682965, 0.60472],
                               [0.672947, 0.42401],
                               [0.722631, 0.33663]])
    assert_allclose(result.params, expected_params, atol=1e-5, rtol=1e-5)
    assert_allclose(result.bse, expected_bse, atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()


def test_conditional_mnlogit_3d(n):
    """Regression and smoke test for the original implementation with
    three covariates (expected values valid for n == 90)."""
    df = gen_mnlogit(n)
    df["x3"] = np.random.normal(size=df.shape[0])
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=df)
    result = model.fit()

    # Regression tests -- comment out when running with n != 90
    expected_params = np.asarray([[0.729629, -1.633673],
                                  [1.879019, -1.327163],
                                  [-0.114124, -0.109378]])
    expected_bse = np.asarray([[0.682965, 0.60472],
                               [0.672947, 0.42401],
                               [0.722631, 0.33663]])
    assert_allclose(result.params, expected_params, atol=1e-5, rtol=1e-5)
    assert_allclose(result.bse, expected_bse, atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()

import datetime

n = 90

# (new implementation, old implementation, label) triples to benchmark.
arr = [(test_conditional_mnlogit_grad_new,
        test_conditional_mnlogit_grad,
        'test_conditional_mnlogit_grad'),
       (test_conditional_mnlogit_2d_new,
        test_conditional_mnlogit_2d,
        'test_conditional_mnlogit_2d'),
       (test_conditional_mnlogit_3d_new,
        test_conditional_mnlogit_3d,
        'test_conditional_mnlogit_3d')]

for new_fn, old_fn, label in arr:
    print(label)  # name of the called function

    start = datetime.datetime.now()
    old_fn(n)
    time_old = datetime.datetime.now() - start

    start = datetime.datetime.now()
    new_fn(n)
    time_new = datetime.datetime.now() - start

    print(f'time_old = {time_old}, time_new = {time_new}, '
          f'time_old/time_new = {time_old/time_new}')

    print()

The performance is the same as in my previous version.
If you use a value of 'n' other than 90, comment out the regression asserts.

conditional_models_new.py.tar.gz

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

4 participants