diff --git a/buildstockbatch/aws/s3_assets/bsb_post.py b/buildstockbatch/aws/s3_assets/bsb_post.py index f2c4fbe7..9d2c8b3d 100644 --- a/buildstockbatch/aws/s3_assets/bsb_post.py +++ b/buildstockbatch/aws/s3_assets/bsb_post.py @@ -57,7 +57,8 @@ def do_postprocessing(s3_bucket, s3_bucket_prefix): f'{s3_bucket_prefix}/results/parquet' ) - remove_intermediate_files(fs, results_s3_loc) + keep_individual_timeseries = cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) + remove_intermediate_files(fs, results_s3_loc, keep_individual_timeseries) if __name__ == '__main__': diff --git a/buildstockbatch/base.py b/buildstockbatch/base.py index 125e4e15..92075bae 100644 --- a/buildstockbatch/base.py +++ b/buildstockbatch/base.py @@ -268,10 +268,7 @@ def validate_project_schema(project_file): @staticmethod def validate_misc_constraints(project_file): # validate other miscellaneous constraints - cfg = get_project_configuration(project_file) - - if cfg.get('postprocessing', {}).get('aggregate_timeseries', False): - logger.warning('aggregate_timeseries has been deprecated and will be removed in a future version.') + cfg = get_project_configuration(project_file) # noqa F841 return True @@ -557,8 +554,5 @@ def process_results(self, skip_combine=False, force_upload=False): if 'athena' in aws_conf: postprocessing.create_athena_tables(aws_conf, os.path.basename(self.output_dir), s3_bucket, s3_prefix) - if not self.cfg.get('eagle', {}).get('postprocessing', {}).get('keep_intermediate_files', False): - logger.info("Removing intermediate files.") - postprocessing.remove_intermediate_files(fs, self.results_dir) - else: - logger.info("Skipped removing intermediate files.") + keep_individual_timeseries = self.cfg.get('postprocessing', {}).get('keep_individual_timeseries', False) + postprocessing.remove_intermediate_files(fs, self.results_dir, keep_individual_timeseries) diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py index 
0353cfdf..a02be786 100644 --- a/buildstockbatch/postprocessing.py +++ b/buildstockbatch/postprocessing.py @@ -365,15 +365,16 @@ def combine_results(fs, results_dir, cfg, do_timeseries=True): logger.info(f"Finished combining and saving timeseries for upgrade{upgrade_id}.") -def remove_intermediate_files(fs, results_dir): +def remove_intermediate_files(fs, results_dir, keep_individual_timeseries=False): # Remove aggregated files to save space sim_output_dir = f'{results_dir}/simulation_output' - ts_in_dir = f'{sim_output_dir}/timeseries' results_job_json_glob = f'{sim_output_dir}/results_job*.json.gz' - logger.info('Removing temporary files') - fs.rm(ts_in_dir, recursive=True) + logger.info('Removing results_job*.json.gz') for filename in fs.glob(results_job_json_glob): fs.rm(filename) + if not keep_individual_timeseries: + ts_in_dir = f'{sim_output_dir}/timeseries' + fs.rm(ts_in_dir, recursive=True) def upload_results(aws_conf, output_dir, results_dir): diff --git a/buildstockbatch/schemas/v0.3.yaml b/buildstockbatch/schemas/v0.3.yaml index 1286d948..57f825f3 100644 --- a/buildstockbatch/schemas/v0.3.yaml +++ b/buildstockbatch/schemas/v0.3.yaml @@ -48,7 +48,6 @@ hpc-postprocessing-spec: n_workers: int(required=False) node_memory_mb: enum(85248, 180224, 751616, required=False) parquet_memory_mb: int(required=False) - keep_intermediate_files: bool(required=False) sampler-spec: type: str(required=True) @@ -106,7 +105,7 @@ cost-spec: postprocessing-spec: aws: include('aws-postprocessing-spec', required=False) - aggregate_timeseries: bool(required=False) + keep_individual_timeseries: bool(required=False) aws-postprocessing-spec: region_name: str(required=False) diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py index 9b805d61..e8e38095 100644 --- a/buildstockbatch/test/test_base.py +++ b/buildstockbatch/test/test_base.py @@ -62,12 +62,7 @@ def test_combine_files_flexible(basic_residential_project_file, mocker): # test_results/results_csvs 
need to be updated with new data *if* columns were indeed supposed to be added/ # removed/renamed. - post_process_config = { - 'postprocessing': { - 'aggregate_timeseries': True - } - } - project_filename, results_dir = basic_residential_project_file(post_process_config) + project_filename, results_dir = basic_residential_project_file() mocker.patch.object(BuildStockBatchBase, 'weather_dir', None) get_dask_client_mock = mocker.patch.object(BuildStockBatchBase, 'get_dask_client') @@ -178,12 +173,7 @@ def test_downselect_integer_options(basic_residential_project_file, mocker): def test_combine_files(basic_residential_project_file): - post_process_config = { - 'postprocessing': { - 'aggregate_timeseries': True - } - } - project_filename, results_dir = basic_residential_project_file(post_process_config) + project_filename, results_dir = basic_residential_project_file() with patch.object(BuildStockBatchBase, 'weather_dir', None), \ patch.object(BuildStockBatchBase, 'get_dask_client') as get_dask_client_mock, \ diff --git a/buildstockbatch/test/test_postprocessing.py b/buildstockbatch/test/test_postprocessing.py index 766163c3..183de211 100644 --- a/buildstockbatch/test/test_postprocessing.py +++ b/buildstockbatch/test/test_postprocessing.py @@ -80,12 +80,7 @@ def test_large_parquet_combine(basic_residential_project_file): # Test a simulated scenario where the individual timeseries parquet are larger than the max memory per partition # allocated for the parquet file combining. 
- post_process_config = { - 'postprocessing': { - 'aggregate_timeseries': True - } - } - project_filename, results_dir = basic_residential_project_file(post_process_config) + project_filename, results_dir = basic_residential_project_file() with patch.object(BuildStockBatchBase, 'weather_dir', None), \ patch.object(BuildStockBatchBase, 'get_dask_client'), \ @@ -93,3 +88,25 @@ def test_large_parquet_combine(basic_residential_project_file): patch.object(postprocessing, 'MAX_PARQUET_MEMORY', 1e6): # set the max memory to just 1MB bsb = BuildStockBatchBase(project_filename) bsb.process_results() # this would raise exception if the postprocessing could not handle the situation + + +@pytest.mark.parametrize('keep_individual_timeseries', [True, False]) +def test_keep_individual_timeseries(keep_individual_timeseries, basic_residential_project_file, mocker): + project_filename, results_dir = basic_residential_project_file({ + 'postprocessing': { + 'keep_individual_timeseries': keep_individual_timeseries + } + }) + + mocker.patch.object(BuildStockBatchBase, 'weather_dir', None) + mocker.patch.object(BuildStockBatchBase, 'get_dask_client') + mocker.patch.object(BuildStockBatchBase, 'results_dir', results_dir) + bsb = BuildStockBatchBase(project_filename) + bsb.process_results() + + results_path = pathlib.Path(results_dir) + simout_path = results_path / 'simulation_output' + assert len(list(simout_path.glob('results_job*.json.gz'))) == 0 + + ts_path = simout_path / 'timeseries' + assert ts_path.exists() == keep_individual_timeseries diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst index ee76791e..892de955 100644 --- a/docs/changelog/changelog_dev.rst +++ b/docs/changelog/changelog_dev.rst @@ -83,10 +83,20 @@ Development Changelog Fix for create_eagle_env.sh not creating environment. - .. change: + .. 
change:: + :tags: postprocessing + :pullreq: 228 + :tickets: 182 + + Moves the ``eagle.postprocessing.keep_intermediate_files`` to + ``postprocessing.keep_individual_timeseries`` and changes behavior to + keep only the timeseries parquet files. Also, removes the deprecated + ``aggregate_timeseries`` key as that aggregation always happens. + + .. change:: :tags: documentation :pullreq: 229 :tickets: 225 Modifies docs to specify that the ``eagle.postprocessing.n_workers`` key - is for how many Eagle nodes are used and indicates the default of 2. \ No newline at end of file + is for how many Eagle nodes are used and indicates the default of 2. diff --git a/docs/changelog/migration_0_20.rst b/docs/changelog/migration_0_20.rst index fb98fc5d..b49dd092 100644 --- a/docs/changelog/migration_0_20.rst +++ b/docs/changelog/migration_0_20.rst @@ -169,37 +169,6 @@ New Spec: reporting_frequency: Hourly include_enduse_subcategories: true -Commercial Workflw Generator Hard-Coded Measures ------------------------------------------------- - -The commercial workflow generator has changed to remove most of the hard-coded -reporting measures, allowing them to be added to the config file as-needed. -This should avoid the need to create custom BuildStockBatch environments -for each project that needs to add/remove/modify reporting measures. - -Old hard-coded reporting measures: - -- SimulationOutputReport -- OpenStudio Results (measure_dir_name: f8e23017-894d-4bdf-977f-37e3961e6f42) -- TimeseriesCSVExport -- comstock_sensitivity_reports -- qoi_report -- la_100_qaqc (if include_qaqc = true in config) -- simulation_settings_check (if include_qaqc = true in config) - -New hard-coded reporting measures: - -- SimulationOutputReport (reports annual totals in results.csv) -- TimeseriesCSVExport (generates timeseries results at Timestep frequency) - -Two other hard-coded model measures were removed from the workflow. 
These will -be added to the workflow via the options-lookup.tsv in ComStock instead. - -Removed hard-coded model measures: - -- add_blinds_to_selected_windows -- set_space_type_load_subcategories - Reporting Measures in Workflows ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -231,6 +200,37 @@ New Spec: - measure_dir_name: ReportingMeasure2 +Commercial Workflow Generator Hard-Coded Measures +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The commercial workflow generator has changed to remove most of the hard-coded +reporting measures, allowing them to be added to the config file as-needed. +This should avoid the need to create custom BuildStockBatch environments +for each project that needs to add/remove/modify reporting measures. + +Old hard-coded reporting measures: + +- ``SimulationOutputReport`` +- OpenStudio Results (measure_dir_name: ``f8e23017-894d-4bdf-977f-37e3961e6f42``) +- ``TimeseriesCSVExport`` +- ``comstock_sensitivity_reports`` +- ``qoi_report`` +- ``la_100_qaqc`` (if include_qaqc = true in config) +- ``simulation_settings_check`` (if include_qaqc = true in config) + +New hard-coded reporting measures: + +- ``SimulationOutputReport`` (reports annual totals in results.csv) +- ``TimeseriesCSVExport`` (generates timeseries results at Timestep frequency) + +Two other hard-coded model measures were removed from the workflow. These will +be added to the workflow via the options-lookup.tsv in ComStock instead. + +Removed hard-coded model measures: + +- ``add_blinds_to_selected_windows`` +- ``set_space_type_load_subcategories`` + AWS EMR Configuration Name Changes ---------------------------------- @@ -246,3 +246,29 @@ renamed the following keys under ``aws.emr``: +----------------------+-----------------------+ | slave_instance_count | worker_instance_count | +----------------------+-----------------------+ + + +Keep Individual Timeseries +-------------------------- + +For some applications it is helpful to keep the timeseries parquet files for +each simulation. 
Normally, they are aggregated into fewer, larger files. There +was a key introduced in v0.19.1 that enabled this. We moved it to a new +place in the config file. + +Old Spec: + +.. code-block:: yaml + + schema_version: 0.2 + eagle: + postprocessing: + keep_intermediate_files: true # default false if omitted + +New Spec: + +.. code-block:: yaml + + schema_version: '0.3' + postprocessing: + keep_individual_timeseries: true # default false if omitted diff --git a/docs/project_defn.rst b/docs/project_defn.rst index 8e1b05c7..2b5376ea 100644 --- a/docs/project_defn.rst +++ b/docs/project_defn.rst @@ -148,9 +148,6 @@ the Eagle supercomputer. * ``node_memory_mb``: The memory (in MB) to request for eagle node for postprocessing. The valid values are 85248, 180224 and 751616. Default is 85248. * ``parquet_memory_mb``: The size (in MB) of the combined parquet file in memory. Default is 40000. - * ``keep_intermediate_files``: Set this to true if you want to keep postprocessing intermediate files (for debugging - or other explorative purpose). The intermediate files contain results_job*.json.gz - files and individual building's timeseries parquet files. Default is false. .. _aws-config: @@ -222,10 +219,11 @@ follows: fewer larger parquet files that are better suited for querying using big data analysis tools. - For ResStock runs with the ResidentialScheduleGenerator, the generated schedules - are horizontally concatenated with the time series files before aggregation, - making sure the schedule values are properly lined up with the timestamps in the - `same way that Energeyplus handles ScheduleFiles `_. +For ResStock runs with the ResidentialScheduleGenerator, the generated schedules +are horizontally concatenated with the time series files before aggregation, +making sure the schedule values are properly lined up with the timestamps in the +`same way that EnergyPlus handles ScheduleFiles +`_. 
Uploading to AWS Athena @@ -255,6 +253,10 @@ The configuration options for postprocessing and AWS upload are: * ``postprocessing``: postprocessing configuration + * ``keep_individual_timeseries``: For some use cases it is useful to keep + the timeseries output for each simulation as a separate parquet file. + Setting this option to ``true`` allows that. Default is ``false``. + * ``aws``: configuration related to uploading to and managing data in amazon web services. For this to work, please `configure aws. `_ Including this key will cause your datasets to be uploaded to AWS, omitting it will cause them not to be uploaded. diff --git a/project_resstock_national.yml b/project_resstock_national.yml index 67b69115..5afd7150 100644 --- a/project_resstock_national.yml +++ b/project_resstock_national.yml @@ -59,7 +59,6 @@ aws: notifications_email: user@nrel.gov postprocessing: - aggregate_timeseries: true aws: region_name: 'us-west-2' s3: