From 15701e27f59be2458592cfe82efa3fefeb51615e Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Wed, 27 May 2020 11:40:11 -0600
Subject: [PATCH 1/7] Check for presence of files on s3 before initializing glue crawler

---
 buildstockbatch/postprocessing.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index b1374787..b67a9127 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -417,10 +417,17 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix):
     max_crawling_time = aws_conf.get('athena', {}).get('max_crawling_time', 600)
     assert db_name, "athena:database_name not supplied"
 
+    # Check that there are files in the s3 bucket before creating and running glue crawler
+    s3 = boto3.resource('s3')
+    bucket = s3.Bucket(s3_bucket)
+    s3_path = f's3://{s3_bucket}/{s3_prefix}'
+    n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix)))
+    assert n_existing_files > 0, f"There are no files in {s3_path}, cannot create Athena tables using glue crawler"
+
     glueClient = boto3.client('glue', region_name=region_name)
     crawlTarget = {
         'S3Targets': [{
-            'Path': f's3://{s3_bucket}/{s3_prefix}',
+            'Path': s3_path,
             'Exclusions': []
         }]
     }

From 3d8825a9447f5d87adcb18b22349df4a5007ab8a Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Wed, 27 May 2020 11:41:15 -0600
Subject: [PATCH 2/7] Strip trailing / from aws:s3:prefix. Fixes #159

---
 buildstockbatch/postprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index b67a9127..0cdee410 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -382,7 +382,7 @@ def upload_results(aws_conf, output_dir, results_dir):
     for files in parquet_dir.rglob('*.parquet'):
         all_files.append(files.relative_to(parquet_dir))
 
-    s3_prefix = aws_conf.get('s3', {}).get('prefix', None)
+    s3_prefix = aws_conf.get('s3', {}).get('prefix', None).rstrip('/')
     s3_bucket = aws_conf.get('s3', {}).get('bucket', None)
     if not (s3_prefix and s3_bucket):
         logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.")

From 22b9c31bf7c396cc1dbecaeee4c5e22dfc6f873b Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Fri, 18 Dec 2020 11:31:06 -0700
Subject: [PATCH 3/7] Correction suggested by @rajeee

---
 buildstockbatch/postprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index 5ee746d8..c851ffbf 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -383,7 +383,7 @@ def upload_results(aws_conf, output_dir, results_dir):
     for files in parquet_dir.rglob('*.parquet'):
         all_files.append(files.relative_to(parquet_dir))
 
-    s3_prefix = aws_conf.get('s3', {}).get('prefix', None).rstrip('/')
+    s3_prefix = aws_conf.get('s3', {}).get('prefix', '').rstrip('/')
    s3_bucket = aws_conf.get('s3', {}).get('bucket', None)
     if not (s3_prefix and s3_bucket):
         logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.")

From 6bb91549a1eb3fc6d035b3d5ca2b4b4dc93e7e88 Mon Sep 17 00:00:00 2001
From: Andrew Parker
Date: Fri, 18 Dec 2020 11:54:30 -0700
Subject: [PATCH 4/7] Changes glue crawler file existence check from assertion to warning

---
 buildstockbatch/postprocessing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index c851ffbf..17774424 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -423,7 +423,8 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix):
     bucket = s3.Bucket(s3_bucket)
     s3_path = f's3://{s3_bucket}/{s3_prefix}'
     n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix)))
-    assert n_existing_files > 0, f"There are no files in {s3_path}, cannot create Athena tables using glue crawler"
+    if n_existing_files == 0:
+        logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended")
 
     glueClient = boto3.client('glue', region_name=region_name)
     crawlTarget = {
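Patches 1 and 4 together turn the pre-crawl file check from a hard assertion into a warning. The following standalone sketch shows the shape of that guard; it is not code from the repository, the helper names and logger wiring are illustrative, and any() replaces the patch's len(list(...)) so the listing stops at the first object found.

import logging

import boto3

logger = logging.getLogger(__name__)


def s3_prefix_has_files(s3_bucket, s3_prefix):
    """Return True if at least one object exists under s3://{bucket}/{prefix}."""
    bucket = boto3.resource('s3').Bucket(s3_bucket)
    # objects.filter returns a lazy collection, so any() stops after the
    # first object instead of listing everything under the prefix.
    return any(True for _ in bucket.objects.filter(Prefix=s3_prefix))


def create_athena_tables_guarded(s3_bucket, s3_prefix):
    s3_path = f's3://{s3_bucket}/{s3_prefix}'
    if not s3_prefix_has_files(s3_bucket, s3_prefix):
        # Patch 4 downgrades the original assertion to a warning so an empty
        # upload no longer crashes the whole postprocessing step; patch 5
        # below additionally returns early instead of starting the crawler.
        logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended")
        return
    # ... create and start the glue crawler here, as in create_athena_tables ...

For a guard that only needs to know whether anything exists, short-circuiting on the first object is cheaper than len(list(...)), which pages through every object under the prefix.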
From c3d95350b04e2d63e8c7b57157cab1d02889a152 Mon Sep 17 00:00:00 2001
From: asparke2
Date: Mon, 4 Jan 2021 06:02:42 -0700
Subject: [PATCH 5/7] Return from the function early if there are no files to be crawled per @rajeee

Co-authored-by: Rajendra Adhikari
---
 buildstockbatch/postprocessing.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/buildstockbatch/postprocessing.py b/buildstockbatch/postprocessing.py
index 17774424..db5296af 100644
--- a/buildstockbatch/postprocessing.py
+++ b/buildstockbatch/postprocessing.py
@@ -425,6 +425,7 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix):
     n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix)))
     if n_existing_files == 0:
         logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended")
+        return
 
     glueClient = boto3.client('glue', region_name=region_name)
     crawlTarget = {

From 53c004b47f2cc2a5e4db3afa9d44fd964c81356e Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Wed, 7 Apr 2021 11:56:04 -0600
Subject: [PATCH 6/7] changelog edit

---
 docs/changelog/changelog_dev.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/changelog/changelog_dev.rst b/docs/changelog/changelog_dev.rst
index 3e6d72f0..108d3970 100644
--- a/docs/changelog/changelog_dev.rst
+++ b/docs/changelog/changelog_dev.rst
@@ -51,3 +51,12 @@ Development Changelog
         Use a map of dask delayed function to combine parquets instead of a
         giant dask df to avoid memory issues. Default to 85GB memory nodes in
         eagle with single process and single thread in each node to avoid memory issues.
+
+    .. change::
+        :tags: postprocessing
+        :pullreq: 202
+        :tickets: 159
+
+        The glue crawler was failing when there was a trailing ``/`` character.
+        This fixes that as well as checks to make sure files were uploaded
+        before running the crawler.
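Patches 2 and 3 normalize the configured S3 prefix before it is interpolated into the crawler path. Here is a minimal, runnable illustration of why both steps matter, using a stand-in config dict instead of the real buildstockbatch YAML and a plain raise where the real function logs the error:

# A trailing slash in the YAML produced a double slash in the crawler path,
# which is what issue #159 reported.
aws_conf = {'s3': {'bucket': 'mybucket', 'prefix': 'run42/results/'}}

# Patch 2 used .get('prefix', None).rstrip('/'), which raises AttributeError
# when the key is absent. Patch 3 defaults to '' so rstrip() is always safe,
# and the empty string is still falsy, so the missing-config check below fires.
s3_prefix = aws_conf.get('s3', {}).get('prefix', '').rstrip('/')
s3_bucket = aws_conf.get('s3', {}).get('bucket', None)
if not (s3_prefix and s3_bucket):
    raise ValueError("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.")

# Without rstrip('/') this would be 's3://mybucket/run42/results//'.
s3_path = f's3://{s3_bucket}/{s3_prefix}'
assert s3_path == 's3://mybucket/run42/results'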
From 7d1a03745c66da77eacd44029b428b1fd1847ce0 Mon Sep 17 00:00:00 2001
From: Noel Merket
Date: Wed, 7 Apr 2021 17:32:05 -0600
Subject: [PATCH 7/7] fixing upload_files test

---
 buildstockbatch/test/test_base.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/buildstockbatch/test/test_base.py b/buildstockbatch/test/test_base.py
index abfd7e95..9b805d61 100644
--- a/buildstockbatch/test/test_base.py
+++ b/buildstockbatch/test/test_base.py
@@ -240,7 +240,7 @@ def test_combine_files(basic_residential_project_file):
 
 
 @patch('buildstockbatch.postprocessing.boto3')
-def test_upload_files(mocked_s3, basic_residential_project_file):
+def test_upload_files(mocked_boto3, basic_residential_project_file):
     s3_bucket = 'test_bucket'
     s3_prefix = 'test_prefix'
     db_name = 'test_db_name'
@@ -262,7 +262,8 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
     }
     mocked_glueclient = MagicMock()
     mocked_glueclient.get_crawler = MagicMock(return_value={'Crawler': {'State': 'READY'}})
-    mocked_s3.client = MagicMock(return_value=mocked_glueclient)
+    mocked_boto3.client = MagicMock(return_value=mocked_glueclient)
+    mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ['a', 'b', 'c']]
     project_filename, results_dir = basic_residential_project_file(upload_config)
     with patch.object(BuildStockBatchBase, 'weather_dir', None), \
             patch.object(BuildStockBatchBase, 'output_dir', results_dir), \
@@ -275,7 +276,7 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
     files_uploaded = []
     crawler_created = False
     crawler_started = False
-    for call in mocked_s3.mock_calls + mocked_s3.client().mock_calls:
+    for call in mocked_boto3.mock_calls[2:] + mocked_boto3.client().mock_calls:
         call_function = call[0].split('.')[-1]  # 0 is for the function name
         if call_function == 'resource':
             assert call[1][0] in ['s3']  # call[1] is for the positional arguments
@@ -286,7 +287,7 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
             destination_path = call[1][1]
             files_uploaded.append((source_file_path, destination_path))
         if call_function == 'create_crawler':
-            crawler_para = call[2]  # 2 is for the keyboard arguments
+            crawler_para = call[2]  # 2 is for the keyword arguments
             crawler_created = True
             assert crawler_para['DatabaseName'] == upload_config['postprocessing']['aws']['athena']['database_name']
             assert crawler_para['Role'] == upload_config['postprocessing']['aws']['athena']['glue_service_role']
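For readers following the test fix in patch 7, this self-contained sketch (an illustration of unittest.mock behavior, not repository code) shows why the rewritten test slices mock_calls[2:]:

from unittest.mock import MagicMock

mocked_boto3 = MagicMock()

# Wiring up the side_effect calls resource() and Bucket() once each, so two
# setup calls land in mocked_boto3.mock_calls before the code under test runs.
# That is why the assertion loop in the patch iterates mock_calls[2:].
mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ['a', 'b', 'c']]

bucket = mocked_boto3.resource('s3').Bucket('test_bucket')
# side_effect hands out one value per call: first an empty listing (no files
# uploaded yet), then a non-empty one after the upload has happened.
assert list(bucket.objects.filter(Prefix='test_prefix')) == []
assert list(bucket.objects.filter(Prefix='test_prefix')) == ['a', 'b', 'c']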