-
Notifications
You must be signed in to change notification settings - Fork 12
/
eagle.py
758 lines (678 loc) · 30.1 KB
/
eagle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
# -*- coding: utf-8 -*-
"""
buildstockbatch.eagle
~~~~~~~~~~~~~~~
This class contains the object & methods that allow for usage of the library with Eagle
:author: Noel Merket
:copyright: (c) 2018 by The Alliance for Sustainable Energy
:license: BSD-3
"""
import argparse
import datetime as dt
import gzip
import itertools
import json
import logging
import logging.config
import math
import os
import pathlib
import random
import re
import shlex
import shutil
import subprocess
import sys
import time

from dask.distributed import Client, LocalCluster
from fsspec.implementations.local import LocalFileSystem
from joblib import delayed, Parallel
import pandas as pd
import requests
import yaml

from .base import BuildStockBatchBase, SimulationExists
from .sampler import ResidentialSingularitySampler, CommercialSobolSingularitySampler, PrecomputedSingularitySampler
from .utils import log_error_details, get_error_details
from . import postprocessing
logger = logging.getLogger(__name__)
def get_bool_env_var(varname):
    """
    Interpret an environment variable as a boolean flag.

    The value is lower-cased and compared against 'true', 't', '1', 'y' and
    'yes'. A variable that is not set is treated as '0', i.e. False.

    :param varname: name of the environment variable to read
    :returns: True if the variable holds one of the truthy spellings, else False
    """
    truthy_spellings = ('true', 't', '1', 'y', 'yes')
    raw_value = os.environ.get(varname, '0')
    return raw_value.lower() in truthy_spellings
class EagleBatch(BuildStockBatchBase):
    """BuildStockBatch implementation that runs simulations on Eagle."""

    # Directory searched for the OpenStudio singularity image when the project
    # file does not provide its own `sys_image_dir`.
    sys_image_dir = '/shared-projects/buildstock/singularity_images'

    # Name of the HPC-specific section of the project configuration.
    hpc_name = 'eagle'

    # Lower bound on the number of simulations put into one job.
    # NOTE(review): presumably two simulations for each of the 36 cores that
    # queue_jobs assumes per node - confirm.
    min_sims_per_job = 36 * 2

    # Scratch locations that hold copies of the inputs and the outputs of the
    # simulations while a job batch is running.
    local_scratch = pathlib.Path('/tmp/scratch')
    local_project_dir = local_scratch.joinpath('project')
    local_buildstock_dir = local_scratch.joinpath('buildstock')
    local_weather_dir = local_scratch.joinpath('weather')
    local_output_dir = local_scratch.joinpath('output')
    local_singularity_img = local_scratch.joinpath('openstudio.simg')
    local_housing_characteristics_dir = local_scratch.joinpath('housing_characteristics')
def __init__(self, project_filename):
    """
    Initialize the batch from a project file and select the sampler to use.

    Besides calling the base class constructor this makes sure the output
    directory exists, touches the weather directory and instantiates the
    sampler that matches `baseline.sampling_algorithm` and the stock type.

    :param project_filename: project yaml file, passed on to the base class constructor
    :raises KeyError: if `sampling_algorithm` is missing from the `baseline` section, or if the
        stock type is neither 'residential' nor 'commercial' (and the algorithm is not 'precomputed')
    :raises NotImplementedError: if the sampling algorithm is not available for the stock type
    """
    super().__init__(project_filename)
    output_dir = pathlib.Path(self.output_dir)
    # exist_ok=True instead of a separate exists() check: several processes may
    # construct an EagleBatch for the same project at the same time, and
    # check-then-create can fail with FileExistsError in that case.
    os.makedirs(output_dir, exist_ok=True)
    logger.debug('Output directory = {}'.format(output_dir))
    # The property is read for its side effect only: on first access it creates
    # the weather directory and fetches the weather files.
    weather_dir = self.weather_dir  # noqa E841

    sampling_algorithm = self.cfg['baseline'].get('sampling_algorithm', None)
    if sampling_algorithm is None:
        raise KeyError('The key `sampling_algorithm` is not specified in the `baseline` section of the project '
                       'configuration yaml. This key is required.')

    if sampling_algorithm == 'precomputed':
        # A precomputed sample is independent of the stock type
        logger.info('calling precomputed sampler')
        self.sampler = PrecomputedSingularitySampler(
            self.output_dir,
            self.cfg,
            self.buildstock_dir,
            self.project_dir
        )
    elif self.stock_type == 'residential':
        if sampling_algorithm == 'quota':
            self.sampler = ResidentialSingularitySampler(
                self.singularity_image,
                self.output_dir,
                self.cfg,
                self.buildstock_dir,
                self.project_dir
            )
        else:
            raise NotImplementedError('Sampling algorithm "{}" is not implemented for residential projects.'.
                                      format(sampling_algorithm))
    elif self.stock_type == 'commercial':
        if sampling_algorithm == 'sobol':
            self.sampler = CommercialSobolSingularitySampler(
                self.output_dir,
                self.cfg,
                self.buildstock_dir,
                self.project_dir
            )
        else:
            raise NotImplementedError('Sampling algorithm "{}" is not implemented for commercial projects.'.
                                      format(sampling_algorithm))
    else:
        raise KeyError('stock_type = "{}" is not valid'.format(self.stock_type))
@staticmethod
def validate_project(project_file):
    """
    Validate a project file for use on Eagle.

    At present this only runs the validation of the base class.

    :param project_file: project file handed to the base class `validate_project`
    """
    # A static method has no implicit class reference, hence the explicit
    # two-argument form of super() to reach the parent implementation.
    base_validate = super(EagleBatch, EagleBatch).validate_project
    base_validate(project_file)
    # Eagle specific validation goes here
@property
def output_dir(self):
    """
    Directory the outputs of this project are written to.

    The `output_directory` entry of the project configuration is used when
    present. Otherwise the directory is `/scratch/<USER>/<name>` where `<USER>`
    comes from the USER environment variable ('user' if unset) and `<name>` is
    the last path component of the project directory.
    """
    user_name = os.environ.get('USER', 'user')
    project_name = os.path.basename(self.project_dir)
    fallback_dir = os.path.join('/scratch/{}'.format(user_name), project_name)
    return self.cfg.get('output_directory', fallback_dir)
@property
def results_dir(self):
    """
    Path of the `results` directory inside the output directory.

    :raises AssertionError: if the directory does not exist
    """
    results_dir = os.path.join(self.output_dir, 'results')
    if not os.path.isdir(results_dir):
        # Raised explicitly rather than via an assert statement so that the
        # check is still performed under `python -O` and the message names the
        # missing path. The exception type is unchanged.
        raise AssertionError('The results directory {} does not exist'.format(results_dir))
    return results_dir
@staticmethod
def clear_and_copy_dir(src, dst):
    """
    Replace `dst` with a fresh copy of the directory tree `src`.

    :param src: directory to copy
    :param dst: destination; anything already present there is removed first
    """
    stale_copy_present = os.path.exists(dst)
    if stale_copy_present:
        # Removal is best effort: errors while deleting are ignored here and
        # would surface in the copy below if something is left behind.
        shutil.rmtree(dst, ignore_errors=True)
    shutil.copytree(src, dst)
@property
def singularity_image_url(self):
    """
    URL of the OpenStudio singularity image in the openstudio-builds S3 bucket.

    The URL is derived from the `os_version` and `os_sha` attributes.
    """
    version, sha = self.os_version, self.os_sha
    return f'https://s3.amazonaws.com/openstudio-builds/{version}/OpenStudio-{version}.{sha}-Singularity.simg'
@property
def singularity_image(self):
    """
    Path of the OpenStudio singularity image to use, downloading it if necessary.

    The image is looked up in this order:

    1. the `sys_image_dir` given in the project file - if that key is present
       the image must exist there;
    2. the class level default `sys_image_dir`;
    3. `openstudio.simg` in the output directory, which is downloaded from
       `singularity_image_url` when it does not exist yet.

    :raises RuntimeError: if the project file names a `sys_image_dir` that does not contain the image
    """
    simg_filename = 'OpenStudio-{ver}.{sha}-Singularity.simg'.format(
        ver=self.os_version,
        sha=self.os_sha
    )

    # Check the project yaml specification - if the file does not exist do not silently allow for non-specified simg
    if 'sys_image_dir' in self.cfg.keys():
        sys_image = os.path.join(self.cfg['sys_image_dir'], simg_filename)
        if os.path.isfile(sys_image):
            return sys_image
        raise RuntimeError('Unable to find singularity image specified in project file: `{}`'.format(sys_image))

    # Use the expected HPC environment default if not explicitly defined in the YAML
    sys_image = os.path.join(self.sys_image_dir, simg_filename)
    if os.path.isfile(sys_image):
        return sys_image

    # Otherwise attempt retrieval from AWS for the appropriate os_version and os_sha
    singularity_image_path = os.path.join(self.output_dir, 'openstudio.simg')
    if not os.path.isfile(singularity_image_path):
        logger.debug(f'Downloading singularity image: {self.singularity_image_url}')
        # Download into a process specific temporary file and rename it once it
        # is complete. Writing to the final path directly would leave a
        # truncated image behind after an interrupted download, and that file
        # would pass the isfile() check above on the next access.
        partial_path = '{}.{}.part'.format(singularity_image_path, os.getpid())
        r = requests.get(self.singularity_image_url, stream=True)
        try:
            if r.status_code != requests.codes.ok:
                logger.error('Unable to download simg file from OpenStudio releases S3 bucket.')
                r.raise_for_status()
            try:
                with open(partial_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                os.replace(partial_path, singularity_image_path)
            except BaseException:
                # Do not leave the incomplete file lying around
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
        finally:
            # Streamed responses keep their connection open until closed
            r.close()
        logger.debug('Downloaded singularity image to {}'.format(singularity_image_path))
    return singularity_image_path
@property
def weather_dir(self):
    """
    Path of the `weather` directory inside the output directory.

    The directory is created on first access, and only the access that
    actually creates it goes on to call `_get_weather_files`.
    """
    weather_dir = os.path.join(self.output_dir, 'weather')
    try:
        # Creating the directory and finding out whether it already existed is
        # done in one step. A separate exists() check can race with another
        # process and make makedirs() fail with FileExistsError.
        os.makedirs(weather_dir)
    except FileExistsError:
        pass
    else:
        self._get_weather_files()
    return weather_dir
def run_batch(self, sampling_only=False):
    """
    Run the sampling, split the simulations into job batches and queue them.

    The steps are: create the timeseries output directories, copy the housing
    characteristics next to the sampler csv, run the sampling (through
    `downselect` if the configuration has a `downselect` section), write one
    `jobNNN.json` file per batch of simulations into the output directory,
    queue the simulation jobs and, unless the MEASURESONLY environment
    variable is set, queue the post-processing to run after them.

    :param sampling_only: if True, return after the sampling; no job files are written and nothing is queued
    """
    # Create simulation_output dir
    sim_out_ts_dir = pathlib.Path(self.output_dir) / 'results' / 'simulation_output' / 'timeseries'
    os.makedirs(sim_out_ts_dir, exist_ok=True)
    for i in range(0, len(self.cfg.get('upgrades', [])) + 1):
        # exist_ok like for the parent directory, so running this again in an
        # existing output directory does not fail with FileExistsError
        os.makedirs(sim_out_ts_dir / f'up{i:02d}', exist_ok=True)

    # create destination_dir and copy housing_characteristics into it
    logger.debug("Copying housing characteristics")
    destination_dir = os.path.dirname(self.sampler.csv_path)
    if os.path.exists(destination_dir):
        shutil.rmtree(destination_dir)
    shutil.copytree(
        os.path.join(self.project_dir, 'housing_characteristics'),
        destination_dir
    )
    logger.debug("Housing characteristics copied.")

    # run sampling
    # NOTE: If a buildstock_csv is provided, the BuildStockBatch
    # constructor ensures that 'downselect' not in self.cfg and
    # run_sampling simply copies that .csv to the correct location if
    # necessary and returns the path
    if 'downselect' in self.cfg:
        # if there is a downselect section in the yml,
        # BuildStockBatchBase.downselect calls run_sampling and does
        # additional processing before and after
        buildstock_csv_filename = self.downselect()
    else:
        # otherwise just the plain sampling process needs to be run
        buildstock_csv_filename = self.run_sampling()

    # Hit the weather_dir API to make sure that creating the weather directory isn't a race condition in the array
    # jobs - this is rare but happens quasi-repeatably when lustre is really lagging
    _ = self.weather_dir

    if sampling_only:
        return

    # Determine the number of simulations expected to be executed
    df = pd.read_csv(buildstock_csv_filename, index_col=0)

    # find out how many buildings there are to simulate
    building_ids = df.index.tolist()
    n_datapoints = len(building_ids)
    n_upgrades = len(self.cfg.get('upgrades', []))
    # number of simulations is number of buildings * number of upgrades
    n_sims = n_datapoints * (n_upgrades + 1)

    # this is the number of simulations defined for this run as a "full job"
    #     number of simulations per job if we believe the .yml file n_jobs
    n_sims_per_job = math.ceil(n_sims / self.cfg[self.hpc_name]['n_jobs'])
    #     use more appropriate batch size in the case of n_jobs being much
    #     larger than we need, now that we know n_sims
    n_sims_per_job = max(n_sims_per_job, self.min_sims_per_job)

    # A simulation is a (building_id, upgrade_idx) pair; the baseline run of a
    # building has upgrade_idx None.
    upgrade_sims = itertools.product(building_ids, range(n_upgrades))
    if not self.skip_baseline_sims:
        # create batches of simulations
        baseline_sims = zip(building_ids, itertools.repeat(None))
        all_sims = list(itertools.chain(baseline_sims, upgrade_sims))
    else:
        all_sims = list(upgrade_sims)
    random.shuffle(all_sims)
    all_sims_iter = iter(all_sims)

    # Job numbers start at 1; queue_jobs derives the job array from the
    # jobNNN.json files written here.
    for i in itertools.count(1):
        batch = list(itertools.islice(all_sims_iter, n_sims_per_job))
        if not batch:
            break
        logger.info('Queueing job {} ({} simulations)'.format(i, len(batch)))
        job_json_filename = os.path.join(self.output_dir, 'job{:03d}.json'.format(i))
        with open(job_json_filename, 'w') as f:
            json.dump({
                'job_num': i,
                'batch': batch,
            }, f, indent=4)

    # now queue them
    jobids = self.queue_jobs()

    # queue up post-processing to run after all the simulation jobs are complete
    if not get_bool_env_var('MEASURESONLY'):
        self.queue_post_processing(jobids)
def run_job_batch(self, job_array_number):
    """
    Run all simulations of one job batch.

    The resources, measures, weather files, housing characteristics and the
    singularity image are copied to the local scratch locations first. Then
    the simulations listed in `jobNNN.json` are run in parallel. The collected
    results are written to `results_job<N>.json.gz`, the local simulation
    output is archived as `simulations_job<N>.tar.gz`, and a traceback file is
    copied along if any simulation raised.

    :param job_array_number: number of the job, selects the `jobNNN.json` file to process
    """
    self.clear_and_copy_dir(
        pathlib.Path(self.buildstock_dir) / 'resources',
        self.local_buildstock_dir / 'resources'
    )
    self.clear_and_copy_dir(
        pathlib.Path(self.buildstock_dir) / 'measures',
        self.local_buildstock_dir / 'measures'
    )
    self.clear_and_copy_dir(
        self.weather_dir,
        self.local_weather_dir
    )
    self.clear_and_copy_dir(
        pathlib.Path(self.output_dir) / 'housing_characteristics',
        self.local_housing_characteristics_dir
    )
    if os.path.exists(self.local_singularity_img):
        os.remove(self.local_singularity_img)
    shutil.copy2(self.singularity_image, self.local_singularity_img)

    # Run the job batch as normal
    job_json_filename = os.path.join(self.output_dir, 'job{:03d}.json'.format(job_array_number))
    with open(job_json_filename, 'r') as f:
        args = json.load(f)

    traceback_file_path = self.local_output_dir / 'simulation_output' / f'traceback{job_array_number}.out'

    @delayed
    def run_building_d(i, upgrade_idx):
        # A failing building must not abort the whole batch: its traceback is
        # appended to the traceback file and None is returned in place of the
        # simulation output.
        try:
            return self.run_building(self.output_dir, self.cfg, i, upgrade_idx)
        except Exception:
            txt = get_error_details()
            # The failure may have happened before the simulation output
            # directory was created; make sure the traceback can be written
            # instead of masking the error with a FileNotFoundError.
            os.makedirs(traceback_file_path.parent, exist_ok=True)
            with open(traceback_file_path, 'a') as f:
                f.write("\n" + "#" * 20 + "\n" + f"Traceback for building{i}\n" + txt)

    # Run the simulations, get the data_point_out.json info from each
    start_time = time.time()
    with Parallel(n_jobs=-1, verbose=9) as parallel:
        dpouts = parallel(itertools.starmap(run_building_d, args['batch']))
    elapsed = time.time() - start_time
    logger.info('Simulation time: {:.2f} minutes'.format(elapsed / 60.))

    # Save the aggregated dpouts as a json file
    lustre_sim_out_dir = pathlib.Path(self.results_dir) / 'simulation_output'
    results_json = lustre_sim_out_dir / f'results_job{job_array_number}.json.gz'
    logger.info(f'Writing results to {results_json}')
    with gzip.open(results_json, 'wt', encoding='utf-8') as f:
        json.dump(dpouts, f)

    # Compress simulation results
    simout_filename = lustre_sim_out_dir / f'simulations_job{job_array_number}.tar.gz'
    logger.info(f'Compressing simulation outputs to {simout_filename}')
    local_sim_out_dir = self.local_output_dir / 'simulation_output'
    subprocess.run(
        [
            'tar',
            'cf', str(simout_filename),
            '-I', 'pigz',
            '-C', str(local_sim_out_dir),
            '.'
        ],
        check=True
    )

    # copy the tracebacks if it exists
    if os.path.exists(traceback_file_path):
        shutil.copy2(traceback_file_path, lustre_sim_out_dir)
@classmethod
def run_building(cls, output_dir, cfg, i, upgrade_idx=None):
    """
    Run the simulation of one building inside the singularity container.

    A simulation directory is created below the local output directory. If
    that succeeds the OpenStudio workflow is written to `in.osw`, run in the
    container with the measures and weather directories mounted read only,
    and the simulation directory is cleaned up afterwards. If the simulation
    directory already exists the run is skipped. In both cases the simulation
    outputs are read from the simulation directory and returned.

    :param output_dir: output directory of the project; the timeseries location handed to the
        cleanup step is derived from it
    :param cfg: project configuration
    :param i: id of the building to simulate
    :param upgrade_idx: zero based index of the upgrade, None for the baseline
    :returns: the value returned by `postprocessing.read_simulation_outputs`
    """
    fs = LocalFileSystem()
    # upgrade_id 0 is the baseline, upgrades are numbered from 1
    upgrade_id = 0 if upgrade_idx is None else upgrade_idx + 1

    try:
        sim_id, sim_dir = cls.make_sim_dir(i, upgrade_idx, os.path.join(cls.local_output_dir, 'simulation_output'))
    except SimulationExists as ex:
        sim_dir = ex.sim_dir
    else:
        # Generate the osw for this simulation
        osw = cls.create_osw(cfg, sim_id, building_id=i, upgrade_idx=upgrade_idx)
        with open(os.path.join(sim_dir, 'in.osw'), 'w') as f:
            json.dump(osw, f, indent=4)

        # Copy other necessary stuff into the simulation directory
        dirs_to_mount = [
            os.path.join(cls.local_buildstock_dir, 'measures'),
            cls.local_weather_dir,
        ]

        # Build the command to instantiate and configure the singularity container the simulation is run inside
        local_resources_dir = cls.local_buildstock_dir / 'resources'
        args = [
            'singularity', 'exec',
            '--contain',
            '-e',
            '--pwd', '/var/simdata/openstudio',
            '-B', f'{sim_dir}:/var/simdata/openstudio',
            '-B', f'{local_resources_dir}:/lib/resources',
            '-B', f'{cls.local_housing_characteristics_dir}:/lib/housing_characteristics'
        ]
        # The runscript is fed to bash in the container through stdin
        runscript = [
            'ln -s /lib /var/simdata/openstudio/lib'
        ]
        for src in dirs_to_mount:
            container_mount = '/' + os.path.basename(src)
            args.extend(['-B', '{}:{}:ro'.format(src, container_mount)])
            container_symlink = os.path.join('/var/simdata/openstudio', os.path.basename(src))
            runscript.append('ln -s {} {}'.format(*map(shlex.quote, (container_mount, container_symlink))))

        # Build the openstudio command that will be issued within the singularity container
        # If custom gems are to be used in the singularity container add extra bundle arguments to the cli command
        cli_cmd = 'openstudio run -w in.osw'
        if cfg.get('baseline', dict()).get('custom_gems', False):
            cli_cmd = 'openstudio --bundle /var/oscli/Gemfile --bundle_path /var/oscli/gems run -w in.osw --debug'
        if get_bool_env_var('MEASURESONLY'):
            cli_cmd += ' --measures_only'
        runscript.append(cli_cmd)
        args.extend([
            str(cls.local_singularity_img),
            'bash', '-x'
        ])
        logger.debug('\n'.join(map(str, args)))
        with open(os.path.join(sim_dir, 'singularity_output.log'), 'w') as f_out:
            try:
                subprocess.run(
                    args,
                    check=True,
                    input='\n'.join(runscript).encode('utf-8'),
                    stdout=f_out,
                    stderr=subprocess.STDOUT,
                    cwd=cls.local_output_dir
                )
            except subprocess.CalledProcessError as ex:
                # A failed simulation is deliberately not raised: the clean up
                # and the reading of the outputs below still have to happen.
                # It is logged so the failure is not completely silent.
                logger.warning(
                    'Simulation of building {} (upgrade id {}) exited with return code {}'.format(
                        i, upgrade_id, ex.returncode
                    )
                )
            finally:
                # Clean up the symbolic links we created in the container
                for mount_dir in dirs_to_mount + [os.path.join(sim_dir, 'lib')]:
                    try:
                        os.unlink(os.path.join(sim_dir, os.path.basename(mount_dir)))
                    except FileNotFoundError:
                        pass

                # Clean up simulation directory
                cls.cleanup_sim_dir(
                    sim_dir,
                    fs,
                    f'{output_dir}/results/simulation_output/timeseries',
                    upgrade_id,
                    i
                )

    reporting_measures = cfg.get('reporting_measures', [])
    dpout = postprocessing.read_simulation_outputs(fs, reporting_measures, sim_dir, upgrade_id, i)
    return dpout
def queue_jobs(self, array_ids=None):
    """
    Submit the simulation jobs as a slurm job array.

    The requested wall time is estimated from the number of simulations in
    `job001.json` and `minutes_per_sim` of the `eagle` section of the
    configuration (default 3).

    :param array_ids: job numbers to submit; if empty or None all jobs that have a `jobNNN.json`
        file in the output directory are submitted as `1-<highest job number>`
    :returns: list holding the id of the submitted slurm job
    :raises subprocess.CalledProcessError: if sbatch exits with a non-zero return code
    :raises RuntimeError: if no job id can be found in the output of sbatch
    """
    eagle_cfg = self.cfg['eagle']
    minutes_per_sim = eagle_cfg.get('minutes_per_sim', 3)
    with open(os.path.join(self.output_dir, 'job001.json'), 'r') as f:
        n_sims_per_job = len(json.load(f)['batch'])

    if array_ids:
        array_spec = ','.join(map(str, array_ids))
    else:
        # The dot is escaped and the whole name has to match, so that only
        # real job files (and not e.g. `job001.json.bak`) are counted.
        jobjson_re = re.compile(r'job(\d+)\.json')
        job_matches = map(jobjson_re.fullmatch, os.listdir(self.output_dir))
        array_max = max(int(m.group(1)) for m in job_matches if m is not None)
        array_spec = '1-{}'.format(array_max)
    account = eagle_cfg['account']

    # Estimate the wall time in minutes
    cores_per_node = 36
    walltime = math.ceil(math.ceil(n_sims_per_job / cores_per_node) * minutes_per_sim)

    # Queue up simulations
    here = os.path.dirname(os.path.abspath(__file__))
    eagle_sh = os.path.join(here, 'eagle.sh')
    env = dict(os.environ)
    env['PROJECTFILE'] = self.project_filename
    env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX']
    args = [
        'sbatch',
        '--account={}'.format(account),
        '--time={}'.format(walltime),
        '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY',
        '--array={}'.format(array_spec),
        '--output=job.out-%a',
        '--job-name=bstk',
        eagle_sh
    ]
    if os.environ.get('SLURM_JOB_QOS'):
        # insert(-1, ...) puts the option in front of the script name
        args.insert(-1, '--qos={}'.format(os.environ.get('SLURM_JOB_QOS')))
    logger.debug(' '.join(args))
    resp = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
        encoding='utf-8',
        cwd=self.output_dir
    )
    try:
        resp.check_returncode()
    except subprocess.CalledProcessError as ex:
        logger.error(ex.stderr)
        raise
    for line in resp.stdout.split('\n'):
        logger.debug('sbatch:' + line)
    m = re.search(r'Submitted batch job (\d+)', resp.stdout)
    if not m:
        logger.error('Did not receive job id back from sbatch: {}'.format(resp.stdout))
        raise RuntimeError('Didn\'t receive job id back from sbatch')
    job_id = m.group(1)
    return [job_id]
def queue_post_processing(self, after_jobids=None, upload_only=False, hipri=False):
    """
    Submit the post-processing job to slurm.

    :param after_jobids: ids (strings) of slurm jobs that have to finish before the post-processing
        starts; no dependency is added if empty or None
    :param upload_only: passed to the job through the UPLOADONLY environment variable; if True the
        check for existing post-processing results is skipped
    :param hipri: request the `high` qos, unless SLURM_JOB_QOS is set in the environment
    :raises FileExistsError: if post-processing results already exist and `upload_only` is False
    :raises subprocess.CalledProcessError: if sbatch exits with a non-zero return code
    """
    # None instead of a mutable list as the default value
    if after_jobids is None:
        after_jobids = []

    # Configuration values
    account = self.cfg['eagle']['account']
    walltime = self.cfg['eagle'].get('postprocessing', {}).get('time', '1:30:00')
    memory = self.cfg['eagle'].get('postprocessing', {}).get('node_memory_mb', 85248)
    print(f"Submitting job to {memory}MB memory nodes.")
    # Throw an error if the files already exist.
    if not upload_only:
        for subdir in ('parquet', 'results_csvs'):
            subdirpath = pathlib.Path(self.output_dir, 'results', subdir)
            if subdirpath.exists():
                raise FileExistsError(f'{subdirpath} already exists. This means you may have run postprocessing already. If you are sure you want to rerun, delete that directory and try again.')  # noqa E501

    # Move old output logs and config to make way for new ones
    for filename in ('dask_scheduler.json', 'dask_scheduler.out', 'dask_workers.out', 'postprocessing.out'):
        filepath = pathlib.Path(self.output_dir, filename)
        if filepath.exists():
            # The old file is kept, with its modification time added to the name
            last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath))
            shutil.move(
                filepath,
                filepath.parent / f'{filepath.stem}_{last_mod_date:%Y%m%d%H%M}{filepath.suffix}'
            )

    env = dict(os.environ)
    env['PROJECTFILE'] = self.project_filename
    env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX']
    env['OUT_DIR'] = self.output_dir
    env['UPLOADONLY'] = str(upload_only)
    env['MEMORY'] = str(memory)

    here = os.path.dirname(os.path.abspath(__file__))
    eagle_post_sh = os.path.join(here, 'eagle_postprocessing.sh')

    # The ':' separates two groups of options in a single sbatch call: one node
    # writing to postprocessing.out, then the nodes writing to dask_workers.out.
    args = [
        'sbatch',
        '--account={}'.format(account),
        '--time={}'.format(walltime),
        '--export=PROJECTFILE,MY_CONDA_ENV,OUT_DIR,UPLOADONLY,MEMORY',
        '--job-name=bstkpost',
        '--output=postprocessing.out',
        '--nodes=1',
        ':',
        '--mem={}'.format(memory),
        '--output=dask_workers.out',
        '--nodes={}'.format(self.cfg['eagle'].get('postprocessing', {}).get('n_workers', 2)),
        eagle_post_sh
    ]

    if after_jobids:
        # index 4 is right after the --export option, i.e. in the first group
        args.insert(4, '--dependency=afterany:{}'.format(':'.join(after_jobids)))

    # insert(-1, ...) puts the option in front of the script name
    if os.environ.get('SLURM_JOB_QOS'):
        args.insert(-1, '--qos={}'.format(os.environ.get('SLURM_JOB_QOS')))
    elif hipri:
        args.insert(-1, '--qos=high')

    resp = subprocess.run(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=env,
        encoding='utf-8',
        cwd=self.output_dir
    )
    for line in resp.stdout.split('\n'):
        logger.debug('sbatch: {}'.format(line))
    # A failed submission must not go unnoticed; same handling as in queue_jobs.
    # stderr is merged into stdout above, so stdout holds the error message.
    try:
        resp.check_returncode()
    except subprocess.CalledProcessError:
        logger.error(resp.stdout)
        raise
def get_dask_client(self):
    """
    Create the dask client used for post-processing.

    If the DASKLOCALCLUSTER environment variable is set to a truthy value a
    local cluster with `/data/dask-tmp` as its local directory is started.
    Otherwise the client connects through the scheduler file
    `dask_scheduler.json` in the output directory.

    :returns: a `dask.distributed.Client`
    """
    if not get_bool_env_var('DASKLOCALCLUSTER'):
        scheduler_file = os.path.join(self.output_dir, 'dask_scheduler.json')
        return Client(scheduler_file=scheduler_file)
    local_cluster = LocalCluster(local_directory='/data/dask-tmp')
    return Client(local_cluster)
# Configuration handed to logging.config.dictConfig by user_cli() and main().
# Everything logged by `__main__` and by the `buildstockbatch` package goes to
# stdout at DEBUG level through a single console handler.
logging_config = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'defaultfmt': {
            'format': '%(levelname)s:%(asctime)s:%(name)s:%(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S'
        }
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'defaultfmt',
            'level': 'DEBUG',
            'stream': 'ext://sys.stdout',
        }
    },
    # Both loggers are configured identically; each one gets its own dict
    'loggers': {
        logger_name: {
            'level': 'DEBUG',
            'propagate': True,
            'handlers': ['console']
        }
        for logger_name in ('__main__', 'buildstockbatch')
    },
}
def user_cli(argv=None):
    """
    This is the user entry point for running buildstockbatch on Eagle

    Depending on the command line flags the project file is only validated,
    the post-processing is queued for a project that has already been run, or
    the output directory is created and the sampling job, which starts the
    whole process, is submitted.

    :param argv: command line arguments without the program name; defaults to `sys.argv[1:]`
        as found at the time of the call
    :returns: True after `--validateonly`, `--postprocessonly` or `--uploadonly`; None after
        submitting the sampling job
    :raises FileNotFoundError: if the project file does not exist
    :raises FileExistsError: if the output directory already exists
    """
    # Resolved at call time: a `sys.argv[1:]` default in the signature would be
    # evaluated only once, when the module is imported.
    if argv is None:
        argv = sys.argv[1:]

    # set up logging, currently based on within-this-file hard-coded config
    logging.config.dictConfig(logging_config)

    # print BuildStockBatch logo
    print(BuildStockBatchBase.LOGO)

    # CLI arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('project_filename')
    parser.add_argument(
        '--hipri',
        action='store_true',
        help='Submit this job to the high priority queue. Uses 2x node hours.'
    )
    parser.add_argument(
        '-m', '--measures_only',
        action='store_true',
        help='Only apply the measures, but don\'t run simulations. Useful for debugging.'
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--postprocessonly',
        help='Only do postprocessing, useful for when the simulations are already done',
        action='store_true'
    )
    group.add_argument(
        '--uploadonly',
        help='Only upload to S3, useful when postprocessing is already done. Ignores the upload flag in yaml',
        action='store_true'
    )
    group.add_argument(
        '--validateonly',
        help='Only validate the project YAML file and references. Nothing is executed',
        action='store_true'
    )
    group.add_argument(
        '--samplingonly',
        help='Run the sampling only.',
        action='store_true'
    )

    # parse CLI arguments
    args = parser.parse_args(argv)

    # load the yaml project file
    if not os.path.isfile(args.project_filename):
        raise FileNotFoundError(
            'The project file {} doesn\'t exist'.format(args.project_filename)
        )
    project_filename = os.path.abspath(args.project_filename)
    with open(project_filename, 'r') as f:
        cfg = yaml.load(f, Loader=yaml.SafeLoader)

    # validate the project, and in case of the --validateonly flag return True if validation passes
    EagleBatch.validate_project(project_filename)
    if args.validateonly:
        return True

    # if the project has already been run, simply queue the correct post-processing step
    if args.postprocessonly or args.uploadonly:
        eagle_batch = EagleBatch(project_filename)
        eagle_batch.queue_post_processing(upload_only=args.uploadonly, hipri=args.hipri)
        return True

    # otherwise, queue up the whole eagle buildstockbatch process
    # the main work of the first Eagle job is to run the sampling script ...
    eagle_sh = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'eagle.sh')
    assert(os.path.exists(eagle_sh))

    # NOTE(review): `output_directory` is read straight from the yaml here, so
    # it has to be present in the project file for a full run.
    out_dir = cfg['output_directory']
    if os.path.exists(out_dir):
        raise FileExistsError(
            'The output directory {} already exists. Please delete it or choose another.'.format(out_dir)
        )
    logger.info('Creating output directory {}'.format(out_dir))
    os.makedirs(out_dir)

    # The flags reach the sampling job through exported environment variables
    env = dict(os.environ)
    env['PROJECTFILE'] = project_filename
    env['MY_CONDA_ENV'] = os.environ['CONDA_PREFIX']
    env['MEASURESONLY'] = str(int(args.measures_only))
    env['SAMPLINGONLY'] = str(int(args.samplingonly))
    subargs = [
        'sbatch',
        '--time={}'.format(cfg['eagle'].get('sampling', {}).get('time', 60)),
        '--account={}'.format(cfg['eagle']['account']),
        '--nodes=1',
        '--export=PROJECTFILE,MY_CONDA_ENV,MEASURESONLY,SAMPLINGONLY',
        '--output=sampling.out',
        eagle_sh
    ]
    if args.hipri:
        # insert(-1, ...) puts the option in front of the script name
        subargs.insert(-1, '--qos=high')
    logger.info('Submitting sampling job to task scheduler')
    subprocess.run(subargs, env=env, cwd=out_dir, check=True)
    logger.info('Run squeue -u $USER to monitor the progress of your jobs')
# eagle.sh calls main()
@log_error_details()
def main():
    """
    Determines which piece of work is to be run right now, on this process, on
    this node. There are four types of work that may need to be done:

    - initialization, sampling, and queuing other work (job_array_number == 0)
    - run a batch of simulations (job_array_number > 0)
    - post-process results (job_array_number == 0 and POSTPROCESS)
    - upload results to Athena (job_array_number == 0 and POSTPROCESS and UPLOADONLY)

    The context for the work is defined by the project_filename (project .yml file),
    which is used to initialize an EagleBatch object.
    """
    # logging is configured from the hard-coded dictionary in this file
    logging.config.dictConfig(logging_config)

    # the project .yml file is the only command line argument
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument('project_filename')
    cli_args = cli_parser.parse_args()

    batch = EagleBatch(cli_args.project_filename)

    # everything else about the current step of the process is passed in
    # through environment variables of the slurm job
    job_array_number = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0))
    post_process = get_bool_env_var('POSTPROCESS')
    upload_only = get_bool_env_var('UPLOADONLY')
    measures_only = get_bool_env_var('MEASURESONLY')
    sampling_only = get_bool_env_var('SAMPLINGONLY')

    if job_array_number:
        # a non-zero job array number means: run that batch of simulations;
        # simulations must not have been scheduled for a sampling only run
        assert not sampling_only
        batch.run_job_batch(job_array_number)
        return

    if post_process:
        logger.debug("Starting postprocessing")
        # post-processing must not have been scheduled for a measures only or
        # a sampling only run
        assert not measures_only
        assert not sampling_only
        if upload_only:
            batch.process_results(skip_combine=True, force_upload=True)
        else:
            batch.process_results()
        return

    # job array number 0 without POSTPROCESS: kick the whole BuildStock process
    # off, i.e. create the samples and then the batch jobs that run them
    logger.debug("Kicking off batch")
    batch.run_batch(sampling_only)


if __name__ == '__main__':
    main()