Merge pull request #374 from NREL/csv_reading_fix
Exclude None from default_na_values list when reading csv files
nmerket committed Jun 12, 2023
2 parents bd389bf + 4b12209 commit 61da0dc
Showing 18 changed files with 145 additions and 47 deletions.
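Note on context: the new read_csv helper in buildstockbatch/utils.py is imported throughout the hunks below, but its definition is not among the hunks captured on this page. A minimal sketch of the idea, with illustrative details (by default pandas parses the literal string "None" as NaN):

import pandas as pd
from pandas._libs.parsers import STR_NA_VALUES  # pandas' default NA strings; includes "None"

def read_csv(filepath_or_buffer, **kwargs):
    # Keep every default NA marker except "None" so that option names like
    # "None" in buildstock.csv survive as strings rather than becoming NaN.
    na_values = [v for v in STR_NA_VALUES if v != "None"]
    return pd.read_csv(filepath_or_buffer, na_values=na_values, keep_default_na=False, **kwargs)
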
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -34,8 +34,8 @@ jobs:
wget --quiet https://data.nrel.gov/system/files/156/BuildStock_TMY3_FIPS.zip
- name: Download and Install OpenStudio
run: |
wget -q https://github.com/NREL/OpenStudio/releases/download/v3.5.1/OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
sudo apt install -y ./OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
wget -q https://github.com/NREL/OpenStudio/releases/download/v3.6.1/OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
sudo apt install -y ./OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
openstudio openstudio_version
which openstudio
- name: Install buildstockbatch
1 change: 1 addition & 0 deletions .gitignore
@@ -15,3 +15,4 @@ coverage/
.coverage
build/
.env
.history
6 changes: 3 additions & 3 deletions buildstockbatch/aws/aws.py
@@ -26,7 +26,6 @@
import logging
import math
import os
import pandas as pd
import pathlib
import random
from s3fs import S3FileSystem
@@ -42,7 +41,7 @@
from buildstockbatch.base import ValidationError, BuildStockBatchBase
from buildstockbatch.aws.awsbase import AwsJobBase
from buildstockbatch import postprocessing
from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration
from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration, read_csv

logger = logging.getLogger(__name__)

@@ -1858,7 +1857,8 @@ def run_batch(self):
json.dump(self.cfg, f)

# Collect simulations to queue
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)
building_ids = df.index.tolist()
n_datapoints = len(building_ids)
n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1)
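The dtype=str read, together with the NA-value fix in read_csv, is what keeps option names like "None" intact and comparable against the strings in options_lookup.tsv. A standalone illustration of the failure mode, using a hypothetical two-row sample:

import io
import pandas as pd

raw = "Building,Insulation Slab\n1,None\n2,Good Option\n"

# Default pandas behavior: the option name "None" is swallowed into NaN.
before = pd.read_csv(io.StringIO(raw), index_col=0)
print(before['Insulation Slab'].tolist())  # [nan, 'Good Option']

# An all-string read with default NA conversion disabled preserves it.
after = pd.read_csv(io.StringIO(raw), index_col=0, dtype=str, keep_default_na=False, na_values=[])
print(after['Insulation Slab'].tolist())   # ['None', 'Good Option']
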
46 changes: 38 additions & 8 deletions buildstockbatch/base.py
@@ -16,7 +16,6 @@
import logging
from lxml import objectify
import os
import pandas as pd
import numpy as np
import re
import requests
@@ -36,7 +35,7 @@
postprocessing
)
from buildstockbatch.exc import SimulationExists, ValidationError
from buildstockbatch.utils import path_rel_to_file, get_project_configuration
from buildstockbatch.utils import path_rel_to_file, get_project_configuration, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -192,7 +191,7 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
timeseries_filepath = os.path.join(sim_dir, 'run', 'results_timeseries.csv')
# FIXME: Allowing both names here for compatibility. Should consolidate on one timeseries filename.
if os.path.isfile(timeseries_filepath):
units_dict = pd.read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
units_dict = read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
skiprows = [1]
else:
timeseries_filepath = os.path.join(sim_dir, 'run', 'enduse_timeseries.csv')
@@ -208,15 +207,15 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
if os.path.isfile(timeseries_filepath):
# Find the time columns present in the enduse_timeseries file
possible_time_cols = ['time', 'Time', 'TimeDST', 'TimeUTC']
cols = pd.read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
actual_time_cols = [c for c in cols if c in possible_time_cols]
if not actual_time_cols:
logger.error(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')
raise RuntimeError(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')

tsdf = pd.read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
if os.path.isfile(schedules_filepath):
schedules = pd.read_csv(schedules_filepath, dtype=np.float64)
schedules = read_csv(schedules_filepath, dtype=np.float64)
schedules.rename(columns=lambda x: f'schedules_{x}', inplace=True)
schedules['TimeDST'] = tsdf['Time']
tsdf = tsdf.merge(schedules, how='left', on='TimeDST')
@@ -302,7 +301,7 @@ def validate_openstudio_path(cls, project_file):
if os_sha != actual_os_sha:
raise ValidationError(
f"OpenStudio version is correct at {os_version}, but the shas don't match. "
"Got {actual_os_sha}, expected {os_sha}"
f"Got {actual_os_sha}, expected {os_sha}"
)
return True

@@ -315,7 +314,38 @@ def validate_sampler(project_file):
except AttributeError:
raise ValidationError(f'Sampler class `{sampler_name}` is not available.')
args = cfg['sampler']['args']
return Sampler.validate_args(project_file, **args)
Sampler.validate_args(project_file, **args)
if issubclass(Sampler, sampler.PrecomputedSampler):
sample_file = cfg['sampler']['args']['sample_file']
if not os.path.isabs(sample_file):
sample_file = os.path.join(os.path.dirname(project_file), sample_file)
else:
sample_file = os.path.abspath(sample_file)
buildstock_df = read_csv(sample_file, dtype=str)
BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df)
return True


@staticmethod
def validate_buildstock_csv(project_file, buildstock_df):
param_option_dict, _ = BuildStockBatchBase.get_param_option_dict(project_file)
# verify that all the columns in buildstock_df only have values available in param_option_dict
# param_option_dict has format: {column_name: [valid_option1, valid_option2, ...], ...}
errors = []
for column in buildstock_df.columns:
if column in {'Building'}:
continue
if column not in param_option_dict:
errors.append(f'Column {column} in buildstock_csv is not available in options_lookup.tsv')
continue
for option in buildstock_df[column].unique():
if option not in param_option_dict[column]:
errors.append(f'Option {option} in column {column} of buildstock_csv is not available '
f'in options_lookup.tsv')
if errors:
raise ValidationError('\n'.join(errors))

return True

@classmethod
def validate_workflow_generator(cls, project_file):
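The new validate_buildstock_csv rejects any column or option value missing from options_lookup.tsv. A hypothetical failing call (the project file and the bad option value are invented for illustration):

import pandas as pd
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.exc import ValidationError

df = pd.DataFrame({
    'Building': ['1', '2'],
    'Vintage': ['1940s', '1850s'],  # suppose '1850s' is not in options_lookup.tsv
})
try:
    BuildStockBatchBase.validate_buildstock_csv('project.yml', df)
except ValidationError as err:
    print(err)  # Option 1850s in column Vintage of buildstock_csv is not available in options_lookup.tsv
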
9 changes: 6 additions & 3 deletions buildstockbatch/eagle.py
@@ -41,7 +41,8 @@
get_error_details,
ContainerRuntime,
path_rel_to_file,
get_project_configuration
get_project_configuration,
read_csv
)
from buildstockbatch import postprocessing
from buildstockbatch.__version__ import __version__ as bsb_version
@@ -91,7 +92,8 @@ def validate_output_directory_eagle(cls, project_file):
cfg = get_project_configuration(project_file)
output_dir = path_rel_to_file(project_file, cfg['output_directory'])
if not (output_dir.startswith('/scratch') or output_dir.startswith('/projects')):
raise ValidationError(f"`output_directory` must be in /scratch or /projects, `output_directory` = {output_dir}")
raise ValidationError(f"`output_directory` must be in /scratch or /projects,"
f" `output_directory` = {output_dir}")

@property
def output_dir(self):
@@ -196,7 +198,8 @@ def run_batch(self, sampling_only=False):
return

# Determine the number of simulations expected to be executed
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)

# find out how many buildings there are to simulate
building_ids = df.index.tolist()
7 changes: 4 additions & 3 deletions buildstockbatch/local.py
@@ -21,7 +21,6 @@
import json
import logging
import os
import pandas as pd
import pathlib
import re
import shutil
@@ -30,7 +29,7 @@

from buildstockbatch.base import BuildStockBatchBase, SimulationExists
from buildstockbatch import postprocessing
from buildstockbatch.utils import log_error_details, ContainerRuntime
from buildstockbatch.utils import log_error_details, ContainerRuntime, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -232,7 +231,7 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False):
shutil.copytree(buildstock_path / "resources", lib_path / "resources")
shutil.copytree(project_path / "housing_characteristics", lib_path / "housing_characteristics")

df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)

building_ids = df.index.tolist()
n_datapoints = len(building_ids)
run_building_d = functools.partial(
4 changes: 2 additions & 2 deletions buildstockbatch/sampler/commercial_sobol.py
@@ -21,7 +21,7 @@

from .sobol_lib import i4_sobol_generate
from .base import BuildStockSampler
from buildstockbatch.utils import ContainerRuntime
from buildstockbatch.utils import ContainerRuntime, read_csv
from buildstockbatch.exc import ValidationError

logger = logging.getLogger(__name__)
@@ -81,7 +81,7 @@ def run_sampling(self):
tsv_hash = {}
for tsv_file in os.listdir(self.buildstock_dir):
if '.tsv' in tsv_file:
tsv_df = pd.read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
dependency_columns = [item for item in list(tsv_df) if 'Dependency=' in item]
tsv_df[dependency_columns] = tsv_df[dependency_columns].astype('str')
tsv_hash[tsv_file.replace('.tsv', '')] = tsv_df
6 changes: 3 additions & 3 deletions buildstockbatch/sampler/downselect.py
@@ -12,11 +12,11 @@
import math
import numpy as np
import os
import pandas as pd
import shutil

from .base import BuildStockSampler
from buildstockbatch.exc import ValidationError
from buildstockbatch.utils import read_csv

logger = logging.getLogger(__name__)

@@ -107,7 +107,7 @@ def run_sampling(self):
n_samples_init = 350000
init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw)
buildstock_csv_filename = init_sampler.run_sampling()
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
df_new = df[self.downselect_logic(df, self.logic)]
downselected_n_samples_init = df_new.shape[0]
n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init)
@@ -120,7 +120,7 @@
with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + '_orig.csv.gz', 'wb') as f_out:
with open(buildstock_csv_filename, 'rb') as f_in:
shutil.copyfileobj(f_in, f_out)
df = pd.read_csv(buildstock_csv_filename, index_col=0, dtype='str')
df = read_csv(buildstock_csv_filename, index_col=0, dtype='str')
df_new = df[self.downselect_logic(df, self.logic)]
if len(df_new.index) == 0:
raise RuntimeError('There are no buildings left after the down select!')
25 changes: 13 additions & 12 deletions buildstockbatch/test/test_base.py
@@ -20,6 +20,7 @@
from buildstockbatch.local import LocalBatch
from buildstockbatch.exc import ValidationError
from buildstockbatch.postprocessing import write_dataframe_as_parquet
from buildstockbatch.utils import read_csv

dask.config.set(scheduler='synchronous')
here = os.path.dirname(os.path.abspath(__file__))
@@ -51,7 +52,7 @@ def test_reference_scenario(basic_residential_project_file):

# test results.csv files
test_path = os.path.join(results_dir, 'results_csvs')
test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
assert len(test_csv['apply_upgrade.reference_scenario'].unique()) == 1
assert test_csv['apply_upgrade.reference_scenario'].iloc[0] == 'example_reference_scenario'

@@ -79,16 +80,16 @@ def simplify_columns(colname):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutual_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutual_cols], reference_csv[mutual_cols])

test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutual_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutual_cols], reference_csv[mutual_cols])
@@ -186,15 +187,15 @@ def test_combine_files(basic_residential_project_file):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

@@ -398,7 +399,7 @@ def test_skipping_baseline(basic_residential_project_file):

def test_provide_buildstock_csv(basic_residential_project_file, mocker):
buildstock_csv = os.path.join(here, 'buildstock.csv')
df = pd.read_csv(buildstock_csv)
df = read_csv(buildstock_csv, dtype=str)
project_filename, results_dir = basic_residential_project_file({
'sampler': {
'type': 'precomputed',
@@ -412,9 +413,9 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker):

bsb = LocalBatch(project_filename)
sampling_output_csv = bsb.sampler.run_sampling()
df2 = pd.read_csv(sampling_output_csv)
df2 = read_csv(sampling_output_csv, dtype=str)
pd.testing.assert_frame_equal(df, df2)

assert (df['Geometry Shared Walls'] == "None").all() # Verify None is being read properly
# Test file missing
with open(project_filename, 'r') as f:
cfg = yaml.safe_load(f)
6 changes: 3 additions & 3 deletions buildstockbatch/test/test_eagle.py
@@ -11,7 +11,7 @@

from buildstockbatch.eagle import user_cli, EagleBatch
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.utils import get_project_configuration
from buildstockbatch.utils import get_project_configuration, read_csv

here = os.path.dirname(os.path.abspath(__file__))

@@ -281,8 +281,8 @@ def compare_ts_parquets(source, dst):
compare_ts_parquets(file, results_file)

# Check that buildstock.csv was trimmed properly
local_buildstock_df = pd.read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv')
unique_buildings = {x[0] for x in job_json['batch']}
local_buildstock_df = read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv', dtype=str)
unique_buildings = {str(x[0]) for x in job_json['batch']}
assert len(unique_buildings) == len(local_buildstock_df)
assert unique_buildings == set(local_buildstock_df['Building'])

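Since buildstock.csv is now read all-string, the integer building IDs coming out of job.json have to be cast before comparing against the CSV's Building column. In isolation (illustrative values; the layout of the batch tuples is assumed):

buildings_from_csv = {'1', '2', '3'}    # Building column from a dtype=str read
job_batch = [(1, 0), (2, 0), (3, 1)]    # assumed (building_id, upgrade) ints from job.json
unique_buildings = {str(x[0]) for x in job_batch}
assert unique_buildings == buildings_from_csv
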
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation
1,1,AL_Mobile-Rgnl.AP.722230,1940-1950,CO,Good Option,None
2,2,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,2,AL_Mobile-Rgnl.AP.722230,2000s,TX,,None
5,3,AL_Mobile-Rgnl.AP.722230,1970s,VA,,None
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation Slab
1,1,AL_Mobile-Rgnl.AP.722230,<1950,CO,Good Option,None
2,3,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,1,AL_Mobile-Rgnl.AP.722230,2000s,VA,Good Option,None
5,2,AL_Mobile-Rgnl.AP.722230,1970s,VA,Good Option,None
@@ -16,6 +16,9 @@ State VA
State CO
County County1
County County2
Bedroom 1
Bedroom 2
Bedroom 3
Insulation Slab None
Insulation Slab Good Option ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0
Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0
5 changes: 2 additions & 3 deletions buildstockbatch/test/test_postprocessing.py
@@ -3,7 +3,6 @@
import json
import logging
import os
import pandas as pd
import pathlib
import re
import tarfile
@@ -13,7 +12,7 @@

from buildstockbatch import postprocessing
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.utils import get_project_configuration
from buildstockbatch.utils import get_project_configuration, read_csv

postprocessing.performance_report = MagicMock()

@@ -58,7 +57,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file):
postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False)

for upgrade_id in (0, 1):
df = pd.read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
df = read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
assert (df['reporting_measure1.column_1'] == 1).all()
assert (df['reporting_measure1.column_2'] == 2).all()
assert (df['reporting_measure2.column_3'] == 3).all()
