Merge pull request #202 from NREL/fix/glue_crawler_trailing_space_failure

Fix/glue crawler trailing space failure
nmerket committed Apr 14, 2021
2 parents 4048967 + 7d1a037 commit 0d5feb9
Showing 3 changed files with 25 additions and 6 deletions.
13 changes: 11 additions & 2 deletions buildstockbatch/postprocessing.py
```diff
@@ -396,7 +396,7 @@ def upload_results(aws_conf, output_dir, results_dir):
     for files in parquet_dir.rglob('*.parquet'):
         all_files.append(files.relative_to(parquet_dir))
 
-    s3_prefix = aws_conf.get('s3', {}).get('prefix', None)
+    s3_prefix = aws_conf.get('s3', {}).get('prefix', '').rstrip('/')
     s3_bucket = aws_conf.get('s3', {}).get('bucket', None)
     if not (s3_prefix and s3_bucket):
         logger.error("YAML file missing postprocessing:aws:s3:prefix and/or bucket entry.")
```
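The one-line change above is the heart of the fix: a trailing `/` typed into the YAML prefix no longer leaks into the S3 paths, and the default moves from `None` to `''`, which is still falsy for the missing-config check that follows. A minimal sketch (hypothetical names, not from the PR) of the before/after behavior, assuming keys are later joined as `f'{prefix}/{name}'`:

```python
# Hypothetical sketch of what .rstrip('/') changes when the YAML prefix
# was entered with a trailing slash.
raw_prefix = 'test_prefix/'

old_prefix = raw_prefix              # old code: config value used as-is
new_prefix = raw_prefix.rstrip('/')  # new code: trailing slashes stripped

# Crawler path: the trailing '/' that made the Glue crawler fail is gone.
print(f's3://test_bucket/{old_prefix}')  # s3://test_bucket/test_prefix/
print(f's3://test_bucket/{new_prefix}')  # s3://test_bucket/test_prefix

# Object keys: no accidental double slash when joining with a filename.
print(f'{old_prefix}/file.parquet')      # test_prefix//file.parquet
print(f'{new_prefix}/file.parquet')      # test_prefix/file.parquet
```

The second hunk below adds a guard so the crawler only runs once files actually exist under that prefix.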
```diff
@@ -431,10 +431,19 @@ def create_athena_tables(aws_conf, tbl_prefix, s3_bucket, s3_prefix):
     max_crawling_time = aws_conf.get('athena', {}).get('max_crawling_time', 600)
     assert db_name, "athena:database_name not supplied"
 
+    # Check that there are files in the s3 bucket before creating and running glue crawler
+    s3 = boto3.resource('s3')
+    bucket = s3.Bucket(s3_bucket)
+    s3_path = f's3://{s3_bucket}/{s3_prefix}'
+    n_existing_files = len(list(bucket.objects.filter(Prefix=s3_prefix)))
+    if n_existing_files == 0:
+        logger.warning(f"There are no files in {s3_path}, Athena tables will not be created as intended")
+        return
+
     glueClient = boto3.client('glue', region_name=region_name)
     crawlTarget = {
         'S3Targets': [{
-            'Path': f's3://{s3_bucket}/{s3_prefix}',
+            'Path': s3_path,
             'Exclusions': []
         }]
     }
```
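The new guard above stops `create_athena_tables` from creating and starting a Glue crawler over an empty prefix. A standalone sketch of the same check (placeholder bucket and prefix names; assumes AWS credentials are configured), using an early exit instead of counting, since `objects.filter` returns a lazy collection and `len(list(...))` has to page through every key just to compare against zero:

```python
import boto3

def prefix_has_files(bucket_name: str, prefix: str) -> bool:
    """Return True as soon as any object exists under the prefix."""
    bucket = boto3.resource('s3').Bucket(bucket_name)
    for _ in bucket.objects.filter(Prefix=prefix):
        return True  # stop at the first key; no full listing needed
    return False

# Placeholder names, mirroring the guard in create_athena_tables:
if not prefix_has_files('my-bucket', 'my/prefix'):
    print("No files uploaded; skipping Glue crawler.")
```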
9 changes: 5 additions & 4 deletions buildstockbatch/test/test_base.py
```diff
@@ -240,7 +240,7 @@ def test_combine_files(basic_residential_project_file):
 
 
 @patch('buildstockbatch.postprocessing.boto3')
-def test_upload_files(mocked_s3, basic_residential_project_file):
+def test_upload_files(mocked_boto3, basic_residential_project_file):
     s3_bucket = 'test_bucket'
     s3_prefix = 'test_prefix'
     db_name = 'test_db_name'
```
```diff
@@ -265,7 +265,8 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
     }
     mocked_glueclient = MagicMock()
     mocked_glueclient.get_crawler = MagicMock(return_value={'Crawler': {'State': 'READY'}})
-    mocked_s3.client = MagicMock(return_value=mocked_glueclient)
+    mocked_boto3.client = MagicMock(return_value=mocked_glueclient)
+    mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ['a', 'b', 'c']]
     project_filename, results_dir = basic_residential_project_file(upload_config)
     with patch.object(BuildStockBatchBase, 'weather_dir', None), \
             patch.object(BuildStockBatchBase, 'output_dir', results_dir), \
```
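The added `side_effect` line is what lets a single test drive both branches of the new guard: a list `side_effect` makes successive calls return successive items. A self-contained sketch (pure `unittest.mock`, no AWS) of that behavior, which also shows where the `mock_calls[2:]` slice in the next hunk comes from:

```python
from unittest.mock import MagicMock

mocked_boto3 = MagicMock()

# First filter() call simulates an empty bucket, the second a populated one.
mocked_boto3.resource().Bucket().objects.filter.side_effect = [[], ['a', 'b', 'c']]

bucket = mocked_boto3.resource('s3').Bucket('test_bucket')
print(list(bucket.objects.filter(Prefix='p')))  # []
print(list(bucket.objects.filter(Prefix='p')))  # ['a', 'b', 'c']

# Configuring the side_effect itself called .resource() and .Bucket(), so the
# first two entries of mock_calls are setup noise -- hence mock_calls[2:] below.
print(mocked_boto3.mock_calls[:2])  # [call.resource(), call.resource().Bucket()]
```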
```diff
@@ -278,7 +279,7 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
     files_uploaded = []
     crawler_created = False
     crawler_started = False
-    for call in mocked_s3.mock_calls + mocked_s3.client().mock_calls:
+    for call in mocked_boto3.mock_calls[2:] + mocked_boto3.client().mock_calls:
         call_function = call[0].split('.')[-1]  # 0 is for the function name
         if call_function == 'resource':
             assert call[1][0] in ['s3']  # call[1] is for the positional arguments
```
```diff
@@ -289,7 +290,7 @@ def test_upload_files(mocked_s3, basic_residential_project_file):
             destination_path = call[1][1]
             files_uploaded.append((source_file_path, destination_path))
         if call_function == 'create_crawler':
-            crawler_para = call[2]  # 2 is for the keyboard arguments
+            crawler_para = call[2]  # 2 is for the keyword arguments
             crawler_created = True
             assert crawler_para['DatabaseName'] == upload_config['postprocessing']['aws']['athena']['database_name']
             assert crawler_para['Role'] == upload_config['postprocessing']['aws']['athena']['glue_service_role']
```
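For reference, the loop in the last two hunks relies on `unittest.mock` call records: each entry of `mock_calls` unpacks to `(name, args, kwargs)`, which is what the `call[0]`/`call[1]`/`call[2]` comments refer to. A minimal sketch:

```python
from unittest.mock import MagicMock

m = MagicMock()
m.client('glue', region_name='us-west-2')

name, args, kwargs = m.mock_calls[0]
print(name)                 # client
print(args)                 # ('glue',)
print(kwargs)               # {'region_name': 'us-west-2'}
print(name.split('.')[-1])  # client -- same trick as call[0].split('.')[-1]
```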
9 changes: 9 additions & 0 deletions docs/changelog/changelog_dev.rst
```diff
@@ -51,3 +51,12 @@ Development Changelog
 
     Use a map of dask delayed function to combine parquets instead of a giant dask df to avoid memory issues.
     Default to 85GB memory nodes in eagle with single process and single thread in each node to avoid memory issues.
+
+    .. change::
+        :tags: postprocessing
+        :pullreq: 202
+        :tickets: 159
+
+        The glue crawler was failing when there was a trailing ``/`` character.
+        This fixes that as well as checks to make sure files were uploaded
+        before running the crawler.
```
