/
grun.py
101 lines (79 loc) · 2.62 KB
/
grun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python3
#pylint: disable=C
"""
Wait for available GPUs to execute a command
"""
import argparse
import time
import os
import sys
import subprocess
import random
import GPUtil
import numpy as np
import glob
def check_pid(pid):
    """Check for the existence of a unix process.

    Probes the process with signal 0, which performs the kernel's existence
    and permission checks without delivering a signal.

    Args:
        pid: process id to probe (int).

    Returns:
        True if a process with this pid exists, including processes owned by
        another user; False otherwise.
    """
    # Non-positive pids address process groups / "all processes" in kill(2),
    # not a single process, so they can never name a live reservation owner.
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        # ESRCH: no such process.
        return False
    except PermissionError:
        # EPERM: the process exists but belongs to another user. It must
        # still count as alive, otherwise other users' jobs are ignored.
        return True
    except (OSError, OverflowError):
        # Any other failure (including a pid too large for the platform's
        # pid type) is treated as "not running", as the original code did.
        return False
    return True
def _parse_lock_name(path):
    """Parse a lock file path whose base name is ``<pid>_<id>,<id>,...``.

    Returns:
        A ``(pid, gpu_ids)`` tuple, or None when the name does not follow the
        expected pattern (stray files must not crash the scheduler).
    """
    pid_text, sep, ids_text = os.path.basename(path).partition('_')
    if not sep:
        return None
    try:
        return int(pid_text), [int(gpu_id) for gpu_id in ids_text.split(',')]
    except ValueError:
        return None


def _nvidia_smi_process_gpus():
    """Return one GPU index per row of nvidia-smi's process table.

    Rows are recognised by containing "iB" (e.g. a MiB memory figure) after
    the "Processes:" header; the second whitespace-separated field of each
    row is taken as the GPU index, as the caller compares it with ``gpu.id``.
    Rows that do not parse are skipped.

    Raises:
        FileNotFoundError: if the nvidia-smi executable is not available.
    """
    # run() waits for the child and closes the pipe, unlike a bare Popen.
    result = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, check=False)
    table = result.stdout.decode('UTF-8').split('Processes:')[-1]
    indices = []
    for line in table.split('\n'):
        if "iB" not in line:
            continue
        try:
            indices.append(int(line.split()[1]))
        except (IndexError, ValueError):
            # Not a process row (e.g. a memory-usage line when the
            # "Processes:" header is absent).
            continue
    return indices


def main():
    """
    Main function

    Usage: ``grun [-n N] CMD [ARG ...]``.

    Polls every 5 seconds until at least N GPUs have load below 0.5, memory
    utilisation below 0.5 and fewer than 3 processes, then runs CMD with
    CUDA_VISIBLE_DEVICES set to the N least-occupied of those GPUs. While CMD
    runs, an empty lock file ``~/.grun/<pid>_<ids>`` advertises the
    reservation to other grun instances; it is removed when CMD finishes.
    Prints "grun: no gpus" and returns if the GPU tools cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--n", type=int, default=1, help="number of GPUs needed")
    parser.add_argument("command", metavar='CMD', type=str, nargs='?')

    # Leading arguments starting with "-" are consumed in (flag, value) pairs
    # and handed to argparse; everything after them is the command, passed
    # through untouched so its own options are never interpreted here.
    argv = []
    i = 1
    while i < len(sys.argv) and sys.argv[i][:1] == "-":
        argv += sys.argv[i:i + 2]
        i += 2
    args = parser.parse_args(argv)
    args.command = sys.argv[i:]
    if not args.command:
        # Fail before waiting for GPUs instead of crashing in Popen later.
        parser.error("no command given")

    directory = os.path.join(os.path.expanduser('~'), '.grun')
    os.makedirs(directory, exist_ok=True)

    max_load = 0.5
    max_memory = 0.5
    max_proc = 3

    while True:
        try:
            gpus = GPUtil.getGPUs()
            smi_gpus = _nvidia_smi_process_gpus()
        except FileNotFoundError:
            print("grun: no gpus")
            return

        gpus = [gpu for gpu in gpus
                if gpu.load < max_load and gpu.memoryUtil < max_memory]

        # Collect reservations of every user under /home plus our own
        # directory (HOME may live elsewhere). Keying by pid de-duplicates
        # a lock file reached through both patterns.
        lock_paths = glob.glob("/home/*/.grun/*")
        lock_paths += glob.glob(os.path.join(directory, "*"))
        reservations = {}
        for path in lock_paths:
            parsed = _parse_lock_name(path)
            if parsed is not None and check_pid(parsed[0]):
                reservations[parsed[0]] = parsed[1]

        for gpu in gpus:
            reserved = sum(1 for ids in reservations.values() if gpu.id in ids)
            active = smi_gpus.count(gpu.id)
            # A job may be reserved but not yet visible in nvidia-smi, or
            # visible without a reservation; take the larger of the counts.
            gpu.nproc = max(reserved, active)

        gpus = [gpu for gpu in gpus if gpu.nproc < max_proc]
        if len(gpus) >= args.n:
            break
        print("grun: waiting for gpus...")
        time.sleep(5)

    chosen = sorted(gpus, key=lambda gpu: gpu.nproc)[:args.n]

    # Set CUDA_DEVICE_ORDER so the IDs assigned by CUDA match those from nvidia-smi
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    device_ids = ",".join(str(gpu.id) for gpu in chosen)
    os.environ["CUDA_VISIBLE_DEVICES"] = device_ids

    proc = subprocess.Popen(args.command)
    lock_path = os.path.join(directory, "{}_{}".format(proc.pid, device_ids))
    try:
        with open(lock_path, 'w'):
            pass
        proc.wait()
    except KeyboardInterrupt:
        print("grun: kill process")
        proc.kill()
        proc.wait()  # reap the child so it does not linger as a zombie
    finally:
        # Always drop the reservation, whatever ended the wait.
        try:
            os.remove(lock_path)
        except FileNotFoundError:
            pass
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()