From 6198e5bfc6e64cbce688d1c1e733d3ea3a8b3480 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 12:08:02 -0600
Subject: [PATCH 1/6] removing deprecated aggregate_timeseries

---
 buildstockbatch/base.py                     |  3 ---
 buildstockbatch/schemas/v0.3.yaml           |  3 +--
 buildstockbatch/test/test_base.py           | 14 ++------------
 buildstockbatch/test/test_postprocessing.py |  7 +------
 project_resstock_national.yml               |  1 -
 5 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py
index 125e4e15..288ceeed 100644
--- a/buildstockbatch/base.py
+++ b/buildstockbatch/base.py
@@ -270,9 +270,6 @@ def validate_misc_constraints(project_file):
         # validate other miscellaneous constraints
         cfg = get_project_configuration(project_file)

-        if cfg.get('postprocessing', {}).get('aggregate_timeseries', False):
-            logger.warning('aggregate_timeseries has been deprecated and will be removed in a future version.')
-
         return True

     @staticmethod
diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml
index 1286d948..d0a2900e 100644
--- a/buildstockbatch/schemas/v0.3.yaml
+++ b/buildstockbatch/schemas/v0.3.yaml
@@ -48,7 +48,6 @@ hpc-postprocessing-spec:
   n_workers: int(required=False)
   node_memory_mb: enum(85248, 180224, 751616, required=False)
   parquet_memory_mb: int(required=False)
-  keep_intermediate_files: bool(required=False)

 sampler-spec:
   type: str(required=True)
@@ -106,7 +105,7 @@ cost-spec:

 postprocessing-spec:
   aws: include('aws-postprocessing-spec', required=False)
-  aggregate_timeseries: bool(required=False)
+  keep_intermediate_files: bool(required=False)

 aws-postprocessing-spec:
   region_name: str(required=False)
diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py
index 9b805d61..e8e38095 100644
--- a/buildstockbatch/test/test_base.py
+++ b/buildstockbatch/test/test_base.py
@@ -62,12 +62,7 @@ def test_combine_files_flexible(basic_residential_project_file, mocker):
     # test_results/results_csvs need to be updated with new data *if* columns were indeed supposed to be added/
     # removed/renamed.

-    post_process_config = {
-        'postprocessing': {
-            'aggregate_timeseries': True
-        }
-    }
-    project_filename, results_dir = basic_residential_project_file(post_process_config)
+    project_filename, results_dir = basic_residential_project_file()

     mocker.patch.object(BuildStockBatchBase, 'weather_dir', None)
     get_dask_client_mock = mocker.patch.object(BuildStockBatchBase, 'get_dask_client')
@@ -178,12 +173,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker):


 def test_combine_files(basic_residential_project_file):
-    post_process_config = {
-        'postprocessing': {
-            'aggregate_timeseries': True
-        }
-    }
-    project_filename, results_dir = basic_residential_project_file(post_process_config)
+    project_filename, results_dir = basic_residential_project_file()

     with patch.object(BuildStockBatchBase, 'weather_dir', None), \
             patch.object(BuildStockBatchBase, 'get_dask_client') as get_dask_client_mock, \
diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py
index 766163c3..dfc960cb 100644
--- a/buildstockbatch/test/test_postprocessing.py
+++ b/buildstockbatch/test/test_postprocessing.py
@@ -80,12 +80,7 @@ def test_large_parquet_combine(basic_residential_project_file):
     # Test a simulated scenario where the individual timeseries parquet are larger than the max memory per partition
     # allocated for the parquet file combining.

-    post_process_config = {
-        'postprocessing': {
-            'aggregate_timeseries': True
-        }
-    }
-    project_filename, results_dir = basic_residential_project_file(post_process_config)
+    project_filename, results_dir = basic_residential_project_file()

     with patch.object(BuildStockBatchBase, 'weather_dir', None), \
             patch.object(BuildStockBatchBase, 'get_dask_client'), \
diff --git a/project_resstock_national.yml b/project_resstock_national.yml
index 67b69115..5afd7150 100644
--- a/project_resstock_national.yml
+++ b/project_resstock_national.yml
@@ -59,7 +59,6 @@ aws:
   notifications_email: user@nrel.gov

 postprocessing:
-  aggregate_timeseries: true
   aws:
     region_name: 'us-west-2'
     s3:

From 543339dc893952bacba9316847ea021dc1e7884f Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 13:42:14 -0600
Subject: [PATCH 2/6] changed delete functionality and added test

---
 buildstockbatch/aws/s3_assets/bsb_post.py   |  3 ++-
 buildstockbatch/base.py                     |  9 +++------
 buildstockbatch/postprocessing.py           |  9 +++++----
 buildstockbatch/test/test_postprocessing.py | 22 +++++++++++++++++++++
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/buildstockbatch/aws/s3_assets/bsb_post.py b/buildstockbatch/aws/s3_assets/bsb_post.py
index f2c4fbe7..cd7e3ef9 100644
--- a/buildstockbatch/aws/s3_assets/bsb_post.py
+++ b/buildstockbatch/aws/s3_assets/bsb_post.py
@@ -57,7 +57,8 @@ def do_postprocessing(s3_bucket, s3_bucket_prefix):
         f'{s3_bucket_prefix}/results/parquet'
     )

-    remove_intermediate_files(fs, results_s3_loc)
+    keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_intermediate_files', False)
+    remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries)


 if __name__ == '__main__':
diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py
index 288ceeed..14078216 100644
--- a/buildstockbatch/base.py
+++ b/buildstockbatch/base.py
@@ -268,7 +268,7 @@ def validate_project_schema(project_file):
     @staticmethod
     def validate_misc_constraints(project_file):
         # validate other miscellaneous constraints
-        cfg = get_project_configuration(project_file)
+        cfg = get_project_configuration(project_file)  # noqa F841

         return True

@@ -554,8 +554,5 @@ def process_results(self, skip_combine=False, force_upload=False):
             if 'athena' in aws_conf:
                 postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix)

-        if not self.cfg.get('eagle', {}).get('postprocessing', {}).get('keep_intermediate_files', False):
-            logger.info("Removing intermediate files.")
-            postprocessing.remove_intermediate_files(fs, self.results_dir)
-        else:
-            logger.info("Skipped removing intermediate files.")
+        keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_intermediate_files', False)
+        postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries)
diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index 0353cfdf..a02be786 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -365,15 +365,16 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True):
     logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.")


-def remove_intermediate_files(fs, results_dir):
+def remove_intermediate_files(fs, results_dir, keep_individual_timeseries=False):
     # Remove aggregated files to save space
     sim_output_dir = f'{results_dir}/simulation_output'
-    ts_in_dir = f'{sim_output_dir}/timeseries'
     results_job_json_glob = f'{sim_output_dir}/results_job*.json.gz'
-    logger.info('Removing temporary files')
-    fs.rm(ts_in_dir, recursive=True)
+    logger.info('Removing results_job*.json.gz')
     for filename in fs.glob(results_job_json_glob):
         fs.rm(filename)
+    if not keep_individual_timeseries:
+        ts_in_dir = f'{sim_output_dir}/timeseries'
+        fs.rm(ts_in_dir, recursive=True)


 def upload_results(aws_conf, output_dir, results_dir):
diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py
index dfc960cb..6bdff845 100644
--- a/buildstockbatch/test/test_postprocessing.py
+++ b/buildstockbatch/test/test_postprocessing.py
@@ -88,3 +88,25 @@ def test_large_parquet_combine(basic_residential_project_file):
             patch.object(postprocessing, 'MAX_PARQUET_MEMORY', 1e6):  # set the max memory to just 1MB
         bsb = BuildStockBatchBase(project_filename)
         bsb.process_results()  # this would raise exception if the postprocessing could not handle the situation
+
+
+@pytest.mark.parametrize('keep_individual_timeseries', [True, False])
+def test_keep_intermediate_files(keep_individual_timeseries, basic_residential_project_file, mocker):
+    project_filename, results_dir = basic_residential_project_file({
+        'postprocessing': {
+            'keep_intermediate_files': keep_individual_timeseries
+        }
+    })
+
+    mocker.patch.object(BuildStockBatchBase, 'weather_dir', None)
+    mocker.patch.object(BuildStockBatchBase, 'get_dask_client')
+    mocker.patch.object(BuildStockBatchBase, 'results_dir', results_dir)
+    bsb = BuildStockBatchBase(project_filename)
+    bsb.process_results()
+
+    results_path = pathlib.Path(results_dir)
+    simout_path = results_path / 'simulation_output'
+    assert len(list(simout_path.glob('results_job*.json.gz'))) == 0
+
+    ts_path = simout_path / 'timeseries'
+    assert ts_path.exists() == keep_individual_timeseries

From 94fd99b9f4cf9dacd97428d3c972de85df2fbedc Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 13:47:12 -0600
Subject: [PATCH 3/6] changing key name to keep_individual_timeseries

---
 buildstockbatch/aws/s3_assets/bsb_post.py   | 2 +-
 buildstockbatch/base.py                     | 2 +-
 buildstockbatch/schemas/v0.3.yaml           | 2 +-
 buildstockbatch/test/test_postprocessing.py | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/buildstockbatch/aws/s3_assets/bsb_post.py b/buildstockbatch/aws/s3_assets/bsb_post.py
index cd7e3ef9..9d2c8b3d 100644
--- a/buildstockbatch/aws/s3_assets/bsb_post.py
+++ b/buildstockbatch/aws/s3_assets/bsb_post.py
@@ -57,7 +57,7 @@ def do_postprocessing(s3_bucket, s3_bucket_prefix):
         f'{s3_bucket_prefix}/results/parquet'
     )

-    keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_intermediate_files', False)
+    keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_individual_timeseries', False)
     remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries)


 if __name__ == '__main__':
diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py
index 14078216..92075bae 100644
--- a/buildstockbatch/base.py
+++ b/buildstockbatch/base.py
@@ -554,5 +554,5 @@ def process_results(self, skip_combine=False, force_upload=False):
             if 'athena' in aws_conf:
                 postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix)

-        keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_intermediate_files', False)
+        keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_individual_timeseries', False)
         postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries)
diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml
index d0a2900e..57f825f3 100644
--- a/buildstockbatch/schemas/v0.3.yaml
+++ b/buildstockbatch/schemas/v0.3.yaml
@@ -105,7 +105,7 @@ cost-spec:

 postprocessing-spec:
   aws: include('aws-postprocessing-spec', required=False)
-  keep_intermediate_files: bool(required=False)
+  keep_individual_timeseries: bool(required=False)

 aws-postprocessing-spec:
   region_name: str(required=False)
diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py
index 6bdff845..183de211 100644
--- a/buildstockbatch/test/test_postprocessing.py
+++ b/buildstockbatch/test/test_postprocessing.py
@@ -91,10 +91,10 @@
 @pytest.mark.parametrize('keep_individual_timeseries', [True, False])
-def test_keep_intermediate_files(keep_individual_timeseries, basic_residential_project_file, mocker):
+def test_keep_individual_timeseries(keep_individual_timeseries, basic_residential_project_file, mocker):
     project_filename, results_dir = basic_residential_project_file({
         'postprocessing': {
-            'keep_intermediate_files': keep_individual_timeseries
+            'keep_individual_timeseries': keep_individual_timeseries
         }
     })

From 42a8e17c0f69bef7aa4d17d6033c9085d7314543 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 13:53:17 -0600
Subject: [PATCH 4/6] adding item to changelog

---
 docs/changelog/changelog_dev.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst
index 88d7d105..3119f212 100644
--- a/docs/changelog/changelog_dev.rst
+++ b/docs/changelog/changelog_dev.rst
@@ -82,3 +82,13 @@ Development Changelog
         :tickets:

         Fix for create_eagle_env.sh not creating environment.
+
+    .. change::
+        :tags: postprocessing
+        :pullreq: 227
+        :tickets: 182
+
+        Moves the ``eagle.postprocessing.keep_intermediate_files`` to
+        ``postprocessing.keep_individual_timeseries`` and changes behavior to
+        keep only the timeseries parquet files. Also, removes the deprecated
+        ``aggregate_timeseries`` key as that aggregation always happens.

From 2264a0276e88a3c5419d9fd7ba3b6ab22641325b Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 14:07:56 -0600
Subject: [PATCH 5/6] updating docs

---
 docs/changelog/migration_0_20.rst | 88 ++++++++++++++++++++-----------
 docs/project_defn.rst             | 16 +++---
 2 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/docs/changelog/migration_0_20.rst b/docs/changelog/migration_0_20.rst
index fb98fc5d..b49dd092 100644
--- a/docs/changelog/migration_0_20.rst
+++ b/docs/changelog/migration_0_20.rst
@@ -169,37 +169,6 @@ New Spec:
       reporting_frequency: Hourly
       include_enduse_subcategories: true

-Commercial Workflw Generator Hard-Coded Measures
-------------------------------------------------
-
-The commercial workflow generator has changed to remove most of the hard-coded
-reporting measures, allowing them to be added to the config file as-needed.
-This should avoid the need to create custom BuildStockBatch environments
-for each project that needs to add/remove/modify reporting measures.
-
-Old hard-coded reporting measures:
-
-- SimulationOutputReport
-- OpenStudio Results (measure_dir_name: f8e23017-894d-4bdf-977f-37e3961e6f42)
-- TimeseriesCSVExport
-- comstock_sensitivity_reports
-- qoi_report
-- la_100_qaqc (if include_qaqc = true in config)
-- simulation_settings_check (if include_qaqc = true in config)
-
-New hard-coded reporting measures:
-
-- SimulationOutputReport (reports annual totals in results.csv)
-- TimeseriesCSVExport (generates timeseries results at Timestep frequency)
-
-Two other hard-coded model measures were removed from the workflow. These will
-be added to the workflow via the options-lookup.tsv in ComStock instead.
-
-Removed hard-coded model measures:
-
-- add_blinds_to_selected_windows
-- set_space_type_load_subcategories
-
 Reporting Measures in Workflows
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -231,6 +200,37 @@ New Spec:

       - measure_dir_name: ReportingMeasure2

+Commercial Workflow Generator Hard-Coded Measures
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The commercial workflow generator has changed to remove most of the hard-coded
+reporting measures, allowing them to be added to the config file as-needed.
+This should avoid the need to create custom BuildStockBatch environments
+for each project that needs to add/remove/modify reporting measures.
+
+Old hard-coded reporting measures:
+
+- ``SimulationOutputReport``
+- OpenStudio Results (measure_dir_name: ``f8e23017-894d-4bdf-977f-37e3961e6f42``)
+- ``TimeseriesCSVExport``
+- ``comstock_sensitivity_reports``
+- ``qoi_report``
+- ``la_100_qaqc`` (if include_qaqc = true in config)
+- ``simulation_settings_check`` (if include_qaqc = true in config)
+
+New hard-coded reporting measures:
+
+- ``SimulationOutputReport`` (reports annual totals in results.csv)
+- ``TimeseriesCSVExport`` (generates timeseries results at Timestep frequency)
+
+Two other hard-coded model measures were removed from the workflow. These will
+be added to the workflow via the options-lookup.tsv in ComStock instead.
+
+Removed hard-coded model measures:
+
+- ``add_blinds_to_selected_windows``
+- ``set_space_type_load_subcategories``
+
 AWS EMR Configuration Name Changes
 ----------------------------------
@@ -246,3 +246,29 @@ renamed the following keys under ``aws.emr``:
 +----------------------+-----------------------+
 | slave_instance_count | worker_instance_count |
 +----------------------+-----------------------+
+
+
+Keep Individual Timeseries
+--------------------------
+
+For some applications it is helpful to keep the timeseries parquet files for
+each simulation. Normally, they are aggregated into fewer, larger files. There
+was a key introduced in v0.19.1 that enabled this. We moved it to a new home
+in the config file.
+
+Old Spec:
+
+.. code-block:: yaml
+
+    schema_version: 0.2
+    eagle:
+      postprocessing:
+        keep_intermediate_files: true  # default false if omitted
+
+New Spec:
+
+.. code-block:: yaml
+
+    schema_version: '0.3'
+    postprocessing:
+      keep_individual_timeseries: true  # default false if omitted
diff --git a/docs/project_defn.rst b/docs/project_defn.rst
index 2e0f49f7..7522509b 100644
--- a/docs/project_defn.rst
+++ b/docs/project_defn.rst
@@ -148,9 +148,6 @@ the Eagle supercomputer.
   * ``node_memory_mb``: The memory (in MB) to request for eagle node for postprocessing. The valid values are
     85248, 180224 and 751616. Default is 85248.
   * ``parquet_memory_mb``: The size (in MB) of the combined parquet file in memory. Default is 40000.
-  * ``keep_intermediate_files``: Set this to true if you want to keep postprocessing intermediate files (for debugging
-    or other explorative purpose). The intermediate files contain results_job*.json.gz
-    files and individual building's timeseries parquet files. Default is false.


 .. _aws-config:
@@ -222,10 +219,11 @@ follows:

 fewer larger parquet files that are better suited for querying using big data analysis tools.

-  For ResStock runs with the ResidentialScheduleGenerator, the generated schedules
-  are horizontally concatenated with the time series files before aggregation,
-  making sure the schedule values are properly lined up with the timestamps in the
-  `same way that Energeyplus handles ScheduleFiles `_.
+For ResStock runs with the ResidentialScheduleGenerator, the generated schedules
+are horizontally concatenated with the time series files before aggregation,
+making sure the schedule values are properly lined up with the timestamps in the
+`same way that EnergyPlus handles ScheduleFiles
+`_.


 Uploading to AWS Athena
@@ -255,6 +253,10 @@ The configuration options for postprocessing and AWS upload are:
 * ``postprocessing``: postprocessing configuration

+  * ``keep_individual_timeseries``: For some use cases it is useful to keep
+    the timeseries output for each simulation as a separate parquet file.
+    Setting this option to ``true`` allows that. Default is ``false``.
+
   * ``aws``: configuration related to uploading to and managing data in amazon web services.
     For this to work, please `configure aws. `_ Including this key will cause your datasets
     to be uploaded to AWS, omitting it will cause them not to be uploaded.

From eeed61c40a2c18358f79a3ac3cba954fe5beded3 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Mon, 3 May 2021 14:10:41 -0600
Subject: [PATCH 6/6] changing to correct PR number [skip ci]

---
 docs/changelog/changelog_dev.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst
index 3119f212..59abd403 100644
--- a/docs/changelog/changelog_dev.rst
+++ b/docs/changelog/changelog_dev.rst
@@ -85,7 +85,7 @@ Development Changelog

     .. change::
         :tags: postprocessing
-        :pullreq: 227
+        :pullreq: 228
         :tickets: 182

         Moves the ``eagle.postprocessing.keep_intermediate_files`` to