Skip to content

Commit

Permalink
Merge pull request #306 from NREL/rerun_failed2
Browse files Browse the repository at this point in the history
Rerun if job.out is missing
  • Loading branch information
nmerket committed Sep 9, 2022
2 parents 1e4a0b4 + b5de427 commit cb549dc
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 18 deletions.
31 changes: 23 additions & 8 deletions buildstockbatch/eagle.py
Expand Up @@ -575,18 +575,32 @@ def process_results(self, *args, **kwargs):

super().process_results(*args, **kwargs)

def _get_job_ids_for_file_pattern(self, pat):
    """Collect the job IDs encoded in the entry names of ``self.output_dir``.

    :param pat: regular expression whose first capture group is the integer
        job ID, e.g. ``r"job\\.out-(\\d+)"``
    :returns: set of integer job IDs, one per directory entry whose name
        ``pat`` matches; entries that do not match are ignored
    """
    # Compile once up front instead of handing the raw pattern to ``re``
    # again for every directory entry.
    pattern = re.compile(pat)
    job_ids = set()
    for filename in os.listdir(self.output_dir):
        m = pattern.search(filename)
        if m:
            job_ids.add(int(m.group(1)))
    return job_ids

def get_failed_job_array_ids(self):
    """Return the sorted array job IDs that need to be rerun.

    A job is reported when either:

    * its ``job.out-<id>`` file in ``self.output_dir`` does not contain the
      text ``batch complete``, or
    * a ``job<id>.json`` file exists in ``self.output_dir`` but no
      ``job.out-<id>`` file with the same ID is present.

    :returns: sorted list of integer array job IDs
    """
    job_out_files = sorted(pathlib.Path(self.output_dir).glob('job.out-*'))

    # Kept as a set so the IDs of jobs with no job.out file can be merged in
    # with ``update`` below; it is turned into a sorted list on return.
    failed_job_ids = set()
    for filename in job_out_files:
        with open(filename, 'r') as f:
            if not re.search(r"batch complete", f.read()):
                job_id = int(re.match(r"job\.out-(\d+)", filename.name).group(1))
                logger.debug(f"Array Job ID {job_id} had a failure.")
                failed_job_ids.add(job_id)

    # Jobs that have a job json file but never produced a job.out file.
    job_out_ids = self._get_job_ids_for_file_pattern(r"job\.out-(\d+)")
    job_json_ids = self._get_job_ids_for_file_pattern(r"job(\d+)\.json")
    missing_job_ids = job_json_ids - job_out_ids
    failed_job_ids.update(missing_job_ids)

    return sorted(failed_job_ids)

def rerun_failed_jobs(self, hipri=False):
# Find the jobs that failed
Expand All @@ -603,11 +617,12 @@ def rerun_failed_jobs(self, hipri=False):
for job_array_id in failed_job_array_ids:
# Move the failed job.out file so it doesn't get overwritten
filepath = output_path / f'job.out-{job_array_id}'
last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath))
shutil.move(
filepath,
prev_failed_job_out_dir / f'{filepath.name}_{last_mod_date:%Y%m%d%H%M}'
)
if filepath.exists():
last_mod_date = dt.datetime.fromtimestamp(os.path.getmtime(filepath))
shutil.move(
filepath,
prev_failed_job_out_dir / f'{filepath.name}_{last_mod_date:%Y%m%d%H%M}'
)

# Delete simulation results for jobs we're about to rerun
files_to_delete = [f'simulations_job{job_array_id}.tar.gz', f'results_job{job_array_id}.json.gz']
Expand Down
28 changes: 18 additions & 10 deletions buildstockbatch/test/test_eagle.py
Expand Up @@ -335,34 +335,42 @@ def test_rerun_failed_jobs(mocker, basic_residential_project_file):

b = EagleBatch(project_filename)

for job_id in range(1, 5):
filename = os.path.join(b.output_dir, f"job.out-{job_id}")
with open(filename, "w") as f:
for job_id in range(1, 6):
json_filename = os.path.join(b.output_dir, f"job{job_id:03d}.json")
with open(json_filename, 'w') as f:
json.dump({}, f)
if job_id == 5:
continue
out_filename = os.path.join(b.output_dir, f"job.out-{job_id}")
with open(out_filename, "w") as f:
f.write('lots of output\ngoes\nhere\n')
if job_id % 2 == 0:
f.write("Traceback")
else:
f.write("batch complete")
f.write('\n')

failed_array_ids = b.get_failed_job_array_ids()
assert sorted(failed_array_ids) == [2, 4, 5]

assert not b.process_results()
process_results_mocker.assert_not_called()
process_results_mocker.reset_mock()

failed_array_ids = b.get_failed_job_array_ids()
assert sorted(failed_array_ids) == [2, 4]

b.rerun_failed_jobs()
queue_jobs_mocker.assert_called_once_with([2, 4], hipri=False)
queue_jobs_mocker.assert_called_once_with([2, 4, 5], hipri=False)
queue_jobs_mocker.reset_mock()
queue_post_processing_mocker.assert_called_once_with([42], hipri=False)
queue_post_processing_mocker.reset_mock()
assert not os.path.exists(os.path.join(results_dir, 'results_csvs'))
assert not os.path.exists(os.path.join(results_dir, 'parquet'))

for job_id in range(1, 5):
filename = os.path.join(b.output_dir, f"job.out-{job_id}")
with open(filename, "w") as f:
for job_id in range(1, 6):
json_filename = os.path.join(b.output_dir, f"job{job_id:03d}.json")
with open(json_filename, 'w') as f:
json.dump({}, f)
out_filename = os.path.join(b.output_dir, f"job.out-{job_id}")
with open(out_filename, "w") as f:
f.write('lots of output\ngoes\nhere\n')
f.write("batch complete\n")

Expand Down
7 changes: 7 additions & 0 deletions docs/changelog/changelog_dev.rst
Expand Up @@ -86,3 +86,10 @@ Development Changelog
:pullreq: 303

Add ability to calculate simple utility bills using the ``residential_hpxml`` workflow.

.. change::
:tags: general, feature, eagle
:pullreq: 306
:tickets: 305

Now reruns jobs where the ``job.out-x`` file is missing entirely.

0 comments on commit cb549dc

Please sign in to comment.