Unable to process Multi-GPU Training #4007

Open

Theivaprakasham opened this issue Feb 17, 2024 · 0 comments

Describe the bug
Training a fastai model fails on a distributed (multi-GPU) setup. The failure occurs both when launching from the CLI (accelerate launch train.py) and when using notebook_launcher.
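
For the CLI case, the train.py passed to accelerate launch follows the same docs example adapted to a plain script (a minimal sketch of what I run, not the exact file; rank0_first, as in the docs, lets only rank 0 download the dataset):

from fastai.vision.all import *
from fastai.distributed import *

# rank 0 downloads/extracts first, the other ranks then reuse the cached data
path = rank0_first(untar_data, URLs.PETS)/'images'
dls = ImageDataLoaders.from_name_func(
    path, get_image_files(path), valid_pct=0.2,
    label_func=lambda x: x[0].isupper(), item_tfms=Resize(224))

learn = vision_learner(dls, resnet34, metrics=error_rate).to_fp16()
# outside a notebook, distrib_ctx() is used without in_notebook=True
with learn.distrib_ctx():
    learn.fine_tune(1)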

To Reproduce
Example from https://docs.fast.ai/distributed.html
Environment: Python 3.10, 2 × RTX 6000 GPUs

from fastai.vision.all import *
from fastai.distributed import *

def train():
    set_seed(99, True)
    path = untar_data(URLs.PETS)/'images'
    dls = ImageDataLoaders.from_name_func(
        path, get_image_files(path), valid_pct=0.2,
        label_func=lambda x: x[0].isupper(), item_tfms=Resize(224))

    learn = vision_learner(dls, resnet34, metrics=error_rate).to_fp16()
    # run fine_tune inside the distributed context so each process trains on its own GPU
    with learn.distrib_ctx(in_notebook=True):
        learn.fine_tune(1)

# spawn one training process per GPU from inside the notebook
from accelerate import notebook_launcher
notebook_launcher(train, num_processes=2)

Requirements:

accelerate==0.27.2
fastai==2.7.14
fastbook==0.0.29
fastcore==1.5.29
fastdownload==0.0.7
fastjsonschema==2.19.1
fastprogress==1.0.3
torch==2.1.2
torchaudio==2.1.2
torchvision==0.16.2
transformers==4.37.2
triton==2.1.0


Expected behavior
Training should run to completion on both GPUs, as it does on a single GPU.

Error with full stack trace

---------------------------------------------------------------------------
ProcessRaisedException                    Traceback (most recent call last)
File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/launchers.py:200, in notebook_launcher(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)
    199 try:
--> 200     start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
    201 except ProcessRaisedException as e:

File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:202, in start_processes(fn, args, nprocs, join, daemon, start_method)
    201 # Loop on join until it returns True or raises an exception.
--> 202 while not context.join():
    203     pass

File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:163, in ProcessContext.join(self, timeout)
    162 msg += original_trace
--> 163 raise ProcessRaisedException(msg, error_index, failed_process.pid)

ProcessRaisedException: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 74, in _wrap
    fn(i, *args)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/utils/launch.py", line 570, in __call__
    self.launcher(*args)
  File "/tmp/ipykernel_94/4042077722.py", line 13, in train
    learn.fine_tune(1)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 165, in fine_tune
    self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 264, in fit
    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 253, in _do_fit
    self._with_events(self._do_epoch, 'epoch', CancelEpochException)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 247, in _do_epoch
    self._do_epoch_train()
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 239, in _do_epoch_train
    self._with_events(self.all_batches, 'train', CancelTrainException)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 199, in _with_events
    try: self(f'before_{event_type}');  f()
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/learner.py", line 205, in all_batches
    for o in enumerate(self.dl): self.one_batch(*o)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/load.py", line 131, in __iter__
    yield self.after_batch(b)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/distributed.py", line 120, in after_batch
    return self.dl.after_batch(b)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 208, in __call__
    def __call__(self, o): return compose_tfms(o, tfms=self.fs, split_idx=self.split_idx)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 158, in compose_tfms
    x = f(x, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 81, in __call__
    def __call__(self, x, **kwargs): return self._call('encodes', x, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 91, in _call
    return self._do_call(getattr(self, fn), x, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in _do_call
    res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 98, in <genexpr>
    res = tuple(self._do_call(f, x_, **kwargs) for x_ in x)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/transform.py", line 97, in _do_call
    return retain_type(f(x, **kwargs), x, ret)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastcore/dispatch.py", line 120, in __call__
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/data/transforms.py", line 377, in encodes
    def encodes(self, x:TensorImage): return (x-self.mean) / self.std
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/fastai/torch_core.py", line 382, in __torch_function__
    res = super().__torch_function__(func, types, args, ifnone(kwargs, {}))
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/_tensor.py", line 1386, in __torch_function__
    ret = func(*args, **kwargs)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!


The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[1], line 16
     13         learn.fine_tune(1)
     15 from accelerate import notebook_launcher
---> 16 notebook_launcher(train, num_processes=2)

File /root/miniconda3/envs/py3.10/lib/python3.10/site-packages/accelerate/launchers.py:210, in notebook_launcher(function, args, num_processes, mixed_precision, use_port, master_addr, node_rank, num_nodes)
    203                 raise RuntimeError(
    204                     "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
    205                     "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
    206                     "Please review your imports and test them when running the `notebook_launcher()` to identify "
    207                     "which one is problematic and causing CUDA to be initialized."
    208                 ) from e
    209             else:
--> 210                 raise RuntimeError(f"An issue was found when launching the training: {e}") from e
    212 else:
    213     # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
    214     if is_mps_available():

RuntimeError: An issue was found when launching the training: 

-- Process 1 terminated with the following error:
(child-process traceback identical to the one shown above)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
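
From the traceback, the failure happens inside the Normalize after_batch transform ((x - self.mean) / self.std in fastai/data/transforms.py): on process 1 the batch is on cuda:1 while the normalization stats are still on cuda:0, presumably because the DataLoaders get built on the default device in every process. One workaround I am considering (an unverified sketch; it assumes the launcher exports LOCAL_RANK for each spawned process) is to pin each process to its own GPU at the top of train(), before any CUDA tensors are created:

import os, torch

def train():
    # unverified workaround sketch: select this process's GPU first, so the
    # DataLoaders and the Normalize mean/std end up on the same device as the batches
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)

    set_seed(99, True)
    path = untar_data(URLs.PETS)/'images'
    # ... rest of train() unchanged ...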

cc @muellerzr
