dpctl has issues with multiprocessing #1169

Open
oleksandr-pavlyk opened this issue Apr 11, 2023 · 4 comments

@oleksandr-pavlyk (Collaborator)

import dpctl
import dpctl.tensor as dpt

import multiprocessing
import os

def _exec(di):
    print(os.getpid())

    x = dpt.ones(1, device="cpu")
    print(2)
    y = dpt.empty_like(x)
    y[...] = x
    di["hey"] = dpt.asnumpy(y)


def main():
    print("starting")
    with multiprocessing.Manager() as manager:
        di_ = manager.dict()
        p = multiprocessing.Process(
            target=_exec,
            args=(di_,)
        )
        p.start()
        p.join(200)

    print("Done")

if __name__ == "__main__":
    # dpt.ones(1)
    main()

Running this produces the following output:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ python run.py
starting
16819
2
Done
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$

Uncommenting the line # dpt.ones(1) above the call to main() causes the script to hang (it looks like a deadlock).

If, in addition, x = dpt.ones(1, device="cpu") is replaced with x = dpt.ones(1), the script errors out with

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ python run.py
starting
16969
Process Process-2:
Traceback (most recent call last):
  File "/home/opavlyk/miniconda3/envs/triage_dpbench/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/opavlyk/miniconda3/envs/triage_dpbench/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/opavlyk/tmp/run.py", line 10, in _exec
    x = dpt.ones(1)
  File "/home/opavlyk/repos/dpctl/dpctl/tensor/_ctors.py", line 1006, in ones
    hev, _ = ti._full_usm_ndarray(1, res, sycl_queue)
RuntimeError: Native API failed. Native API returns: -997 (The plugin has emitted a backend specific error) -997 (The plugin has emitted a backend specific error)
Done
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$

Thank you to @ZzEeKkAa for reporting a related issue, which was distilled into this reproducer.

@oleksandr-pavlyk (Collaborator, Author)

Actually, this reproducer does not need dpctl.tensor at all. The following script:

# run.py
import dpctl
import dpctl.memory as dpm

import multiprocessing
import os

def _exec(di):
    print(os.getpid())

    d = dpctl.select_default_device()
    c = d.sycl_platform.default_context
    q = dpctl.SyclQueue(c, d)
    x = dpm.MemoryUSMDevice(40, queue=q)
    x.memset(1)
    print(2)
    di["hey"] = [1,2,3]


def main():
    print("starting")
    with multiprocessing.Manager() as manager:
        di_ = manager.dict()
        p = multiprocessing.Process(
            target=_exec,
            args=(di_,)
        )
        p.start()
        p.join(200)

    print("Done")

if __name__ == "__main__":
    d = dpctl.select_default_device()
    c = d.sycl_platform.default_context
    q = dpctl.SyclQueue(c, d)
    x = dpm.MemoryUSMDevice(40, queue=q)
    x.memset(1)
    main()

fails just the same:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ python run.py
starting
22081
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating
Done

My hunch is that the DPC++ runtime's global state does not survive being forked.

Acting on that hunch, I whipped up a C++ snippet building on a tutorial from GeeksforGeeks:

// forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <iostream>

int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid == -1) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
      std::cout << "Parent PID after forking " << getpid() << std::endl;
      sycl::queue q1 { sycl::default_selector_v };

      int *data1 = sycl::malloc_device<int>(10, q1);
      q1.fill<int>(data1, int(1), 10).wait();
      sycl::free(data1, q1);

  } else { // child process
      std::cout << "Parent PID after forking " << getpid() << std::endl;
      sycl::queue q2 {sycl::default_selector_v};

      int *data2 = sycl::malloc_device<int>(10, q2);
      q2.fill<int>(data2, int(1), 10).wait();
      sycl::free(data2, q2);

  }
  return 0;
}

I compiled this code with icpx -fsycl forking.cpp -o forking. Lo and behold, the same behavior emerges:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp$ ./forking
PID before forking 22478
Parent PID after forking 22478
Parent PID after forking 22479
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating

@oleksandr-pavlyk (Collaborator, Author) commented Apr 12, 2023

Perhaps a better (proper forking) C++ reproducer:

// forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <sys/wait.h>
#include <iostream>

int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid < 0) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
    int status = 0;
    // just wait for the child process to do its thing
    (void) waitpid(c_pid, &status, 0);
    std::cout << "Status retuned by child process: " << status << std::endl;
  } else { // child process
      std::cout << "Child PID after forking " << getpid() << std::endl;
      sycl::queue q2 {sycl::default_selector_v};

      try {
        int *data2 = sycl::malloc_device<int>(10, q2);
        q2.fill<int>(data2, int(1), 10).wait();
        sycl::free(data2, q2);
      } catch (const std::exception &e) {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        return -1;
      }

      return 0; // finish the child process
  }

  std::cout << "From parent process after joining" << std::endl;
  return 0;
}

Running:

$ icpx -fsycl forking.cpp -o forking && ./forking
PID before forking 19329
Child PID after forking 19330
die: [L0] getZeQueue: failed to create queue
libc++abi: terminating
Status returned by child process: 6
From parent process after joining

Interestingly, if sycl::default_selector_v is replaced with sycl::cpu_selector_v in the child process branch, the executable runs fine.

Using sycl::ext::oneapi::filter_selector to select the "opencl:gpu" device also runs into problems:

PID before forking 20103
Child PID after forking 20104
Caught exception: The program was built for 1 devices
Build program log for 'Intel(R) Graphics [0x9a49]':
 -6 (PI_ERROR_OUT_OF_HOST_MEMORY)
Status retuned by child process: 65280
From parent process after joining

I suppose this finding hints that, at present, dpctl, like any other SYCL application, is not compatible with Unix fork(), and hence dpctl should be used in processes spawned (rather than forked) by the multiprocessing module.
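
For illustration, here is a minimal sketch of that spawn-based usage, reusing the reproducer from the top of this issue. It is untested and only indicates the intended pattern; the file name run_spawn.py is a placeholder:

# run_spawn.py -- sketch: request the "spawn" start method instead of fork
import multiprocessing

import dpctl.tensor as dpt


def _exec(di):
    # the spawned child re-imports dpctl and initializes its own DPC++ runtime
    x = dpt.ones(1)
    di["hey"] = dpt.asnumpy(x)


def main():
    ctx = multiprocessing.get_context("spawn")  # avoid fork()
    with ctx.Manager() as manager:
        di_ = manager.dict()
        p = ctx.Process(target=_exec, args=(di_,))
        p.start()
        p.join(200)

    print("Done")


if __name__ == "__main__":
    dpt.ones(1)  # the parent may touch SYCL before starting the child
    main()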

@oleksandr-pavlyk (Collaborator, Author)

Another useful variant, more closely related to the error seen in dpbench:

// $ cat forking.cpp
#include <sycl/sycl.hpp>
#include <unistd.h>
#include <sys/wait.h>
#include <iostream>

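// NB: this selector class is defined but not used in main() below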
class my_filter_selector
{
 public:
  static constexpr int REJECT_DEVICE = -1;
  my_filter_selector(const std::string &fs) : _impl(fs) {}
  int operator()(const sycl::device &d) const { return _impl(d); };

 private:
  sycl::ext::oneapi::filter_selector _impl;
};


int main(void) {
  sycl::queue q0 { sycl::default_selector_v };

  int *data_main = sycl::malloc_device<int>(10, q0);
  q0.fill<int>(data_main, int(1), 10).wait();
  sycl::free(data_main, q0);

  std::cout << "PID before forking " << getpid() << std::endl;

  pid_t c_pid = fork();

  if (c_pid < 0) {
    std::cout << "Fork failed" << std::endl;
    std::terminate();
  } else if (c_pid > 0) { // parent process
    int status = 0;
    // just wait for the child process to do its thing
    (void) waitpid(c_pid, &status, 0);
    std::cout << "Status retuned by child process: " << status << std::endl;
  } else { // child process
      std::cout << "Child PID after forking " << getpid() << std::endl;
      sycl::queue q2 { sycl::default_selector_v };

      try {
        int *data2 = sycl::malloc_host<int>(10, q2);
        if (data2 == nullptr) {
          throw std::runtime_error("Failed USM-host allocation");
        }
        q2.fill<int>(data2, int(1), 10).wait();
        sycl::free(data2, q2);
      } catch (const std::exception &e) {
        std::cerr << "Caught exception: " << e.what() << std::endl;
        return -1;
      }

      return 0; // finish the child process
  }

  std::cout << "From parent process after joining" << std::endl;
  return 0;
}

Compiling and running:

(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ icpx -fsycl forking.cpp -o forking
(triage_dpbench) opavlyk@opavlyk-mobl:~/tmp/zzeekkaa$ ./forking
PID before forking 20198
Child PID after forking 20199
Caught exception: Failed USM-host allocation
Status returned by child process: 65280
From parent process after joining

@adarshyoga (Contributor)

Likely related to this defect reported in dpbench.

@diptorupd removed their assignment Mar 1, 2024