Skip to content

Commit

Permalink
Show nvidia-pynvml compatibility warnings for #161
Browse files Browse the repository at this point in the history
NVIDIA 535.43, 535.86 can display process information correctly only
with nvidia-ml-py==12.535.77. Display a warning message when an
incompatible combination is detected.

See #161 for more details.
  • Loading branch information
wookayin committed Oct 30, 2023
1 parent ed69c2d commit 342cf10
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 1 deletion.
3 changes: 3 additions & 0 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

import gpustat.util as util
from gpustat.nvml import pynvml as N
from gpustat.nvml import check_driver_nvml_version

NOT_SUPPORTED = 'Not Supported'
MB = 1024 * 1024
Expand Down Expand Up @@ -613,8 +614,10 @@ def _wrapped(*args, **kwargs):
gpu_list.append(gpu_stat)

# 2. additional info (driver version, etc).
# TODO: check this only once, no need to call multiple times
try:
driver_version = _decode(N.nvmlSystemGetDriverVersion())
check_driver_nvml_version(driver_version)
except N.NVMLError as e:
log.add_exception("driver_version", e)
driver_version = None # N/A
Expand Down
47 changes: 46 additions & 1 deletion gpustat/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

# pylint: disable=protected-access

import warnings
from typing import Tuple
import functools
import os
import sys
import textwrap
import warnings

# If this environment variable is set, we will bypass pynvml version validation
# so that legacy pynvml (nvidia-ml-py3) can be used. This would be useful
Expand Down Expand Up @@ -61,6 +62,50 @@
""")) from e


class NvidiaCompatibilityWarning(UserWarning):
    """Warning category for incompatible NVIDIA driver / pynvml combinations."""


def check_driver_nvml_version(driver_version_str: str):
    """Warn when the NVIDIA driver and nvidia-ml-py form a known-bad pairing.

    The dotted driver version string is parsed into a tuple of integers
    (components that cannot be parsed count as 0) and compared against the
    driver range affected by issue #161. A ``NvidiaCompatibilityWarning``
    is issued in either of two cases:

    * the driver is inside the affected range, but the installed pynvml
      does not look like nvidia-ml-py 12.535.77;
    * the driver is outside the affected range, but the installed pynvml
      does look like nvidia-ml-py 12.535.77.

    Nothing is returned; the only effect is the (optional) warning.
    """

    def _as_int(component) -> int:
        # Components that are not plain integers are treated as 0.
        try:
            return int(component)
        except (ValueError, TypeError):
            return 0

    components = driver_version_str.strip().split(".")
    driver_version = tuple(map(_as_int, components))

    # #161: invalid process information on 535.xx.
    # nvidia-ml-py 12.535.77 is recognized by its process-info struct: it has
    # the 'usedGpuCcProtectedMemory' field and still carries the original
    # class name.
    # Note: __name__ changed to c_nvmlProcessInfo_v2_t since 12.535.108+
    process_info_t = pynvml.c_nvmlProcessInfo_t
    has_cc_memory_field = hasattr(process_info_t, 'usedGpuCcProtectedMemory')
    is_pynvml_535_77 = (
        has_cc_memory_field and
        process_info_t.__name__ == 'c_nvmlProcessInfo_t'
    )

    # See #161: drivers in this range are buggy and give wrong process
    # information, except with nvidia-ml-py == 12.535.77 (which is a buggy
    # version too).
    # Note: NVIDIA 535.86+ and nvidia-ml-py 12.535.108+ fix the bug.
    driver_is_affected = (535, 43) <= driver_version < (535, 86)

    if driver_is_affected == is_pynvml_535_77:
        # Either an affected driver paired with 12.535.77, or an unaffected
        # driver with any other pynvml: nothing to report.
        return

    if driver_is_affected:
        message = (
            f"This version of NVIDIA Driver {driver_version_str} is incompatible, "
            "process information will be inaccurate. "
            "Upgrade the NVIDIA driver to 535.104.05 or higher, "
            "or use nvidia-ml-py==12.535.77. For more details, see "
            "https://github.com/wookayin/gpustat/issues/161."
        )
    else:
        # pynvml 12.535.77 should not be used with any other driver.
        message = (
            "This version of nvidia-ml-py (possibly 12.535.77) is incompatible. "
            "Please upgrade nvidia-ml-py to the latest version. "
            "(pip install --upgrade --force-reinstall nvidia-ml-py)"
        )

    # stacklevel=2 attributes the warning to the caller of this function.
    warnings.warn(message, category=NvidiaCompatibilityWarning, stacklevel=2)


# Monkey-patch nvml due to breaking changes in pynvml.
# See #107, #141, and test_gpustat.py for more details.

Expand Down

0 comments on commit 342cf10

Please sign in to comment.