Merge pull request #374 from NREL/csv_reading_fix
Exclude None from default_na_values list when reading csv files
nmerket committed Jun 12, 2023
2 parents bd389bf + 4b12209 commit 61da0dc
Showing 18 changed files with 145 additions and 47 deletions.
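Note on context: the new read_csv helper in buildstockbatch/utils.py is imported throughout the hunks below, but its definition is not among the hunks captured on this page. A minimal sketch of the idea, with illustrative details (by default pandas parses the literal string "None" as NaN):

import pandas as pd
from pandas._libs.parsers import STR_NA_VALUES  # pandas' default NA strings; includes "None"

def read_csv(filepath_or_buffer, **kwargs):
    # Keep every default NA marker except "None" so that option names like
    # "None" in buildstock.csv survive as strings rather than becoming NaN.
    na_values = [v for v in STR_NA_VALUES if v != "None"]
    return pd.read_csv(filepath_or_buffer, na_values=na_values, keep_default_na=False, **kwargs)
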
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -34,8 +34,8 @@ jobs:
wget --quiet https://data.nrel.gov/system/files/156/BuildStock_TMY3_FIPS.zip
- name: Download and Install OpenStudio
run: |
wget -q https://github.com/NREL/OpenStudio/releases/download/v3.5.1/OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
sudo apt install -y ./OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
wget -q https://github.com/NREL/OpenStudio/releases/download/v3.6.1/OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
sudo apt install -y ./OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
openstudio openstudio_version
which openstudio
- name: Install buildstockbatch
1 change: 1 addition & 0 deletions .gitignore
@@ -15,3 +15,4 @@ coverage/
.coverage
build/
.env
.history
6 changes: 3 additions & 3 deletions buildstockbatch/aws/aws.py
@@ -26,7 +26,6 @@
import logging
import math
import os
import pandas as pd
import pathlib
import random
from s3fs import S3FileSystem
@@ -42,7 +41,7 @@
from buildstockbatch.base import ValidationError, BuildStockBatchBase
from buildstockbatch.aws.awsbase import AwsJobBase
from buildstockbatch import postprocessing
from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration
from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration, read_csv

logger = logging.getLogger(__name__)

@@ -1858,7 +1857,8 @@ def run_batch(self):
json.dump(self.cfg, f)

# Collect simulations to queue
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)
building_ids = df.index.tolist()
n_datapoints = len(building_ids)
n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1)
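The dtype=str read, together with the NA-value fix in read_csv, is what keeps option names like "None" intact and comparable against the strings in options_lookup.tsv. A standalone illustration of the failure mode, using a hypothetical two-row sample:

import io
import pandas as pd

raw = "Building,Insulation Slab\n1,None\n2,Good Option\n"

# Default pandas behavior: the option name "None" is swallowed into NaN.
before = pd.read_csv(io.StringIO(raw), index_col=0)
print(before['Insulation Slab'].tolist())  # [nan, 'Good Option']

# An all-string read with default NA conversion disabled preserves it.
after = pd.read_csv(io.StringIO(raw), index_col=0, dtype=str, keep_default_na=False, na_values=[])
print(after['Insulation Slab'].tolist())   # ['None', 'Good Option']
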
46 changes: 38 additions & 8 deletions buildstockbatch/base.py
@@ -16,7 +16,6 @@
import logging
from lxml import objectify
import os
import pandas as pd
import numpy as np
import re
import requests
@@ -36,7 +35,7 @@
postprocessing
)
from buildstockbatch.exc import SimulationExists, ValidationError
from buildstockbatch.utils import path_rel_to_file, get_project_configuration
from buildstockbatch.utils import path_rel_to_file, get_project_configuration, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -192,7 +191,7 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
timeseries_filepath = os.path.join(sim_dir, 'run', 'results_timeseries.csv')
# FIXME: Allowing both names here for compatibility. Should consolidate on one timeseries filename.
if os.path.isfile(timeseries_filepath):
units_dict = pd.read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
units_dict = read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
skiprows = [1]
else:
timeseries_filepath = os.path.join(sim_dir, 'run', 'enduse_timeseries.csv')
@@ -208,15 +207,15 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
if os.path.isfile(timeseries_filepath):
# Find the time columns present in the enduse_timeseries file
possible_time_cols = ['time', 'Time', 'TimeDST', 'TimeUTC']
cols = pd.read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
actual_time_cols = [c for c in cols if c in possible_time_cols]
if not actual_time_cols:
logger.error(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')
raise RuntimeError(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')

tsdf = pd.read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
if os.path.isfile(schedules_filepath):
schedules = pd.read_csv(schedules_filepath, dtype=np.float64)
schedules = read_csv(schedules_filepath, dtype=np.float64)
schedules.rename(columns=lambda x: f'schedules_{x}', inplace=True)
schedules['TimeDST'] = tsdf['Time']
tsdf = tsdf.merge(schedules, how='left', on='TimeDST')
@@ -302,7 +301,7 @@ def validate_openstudio_path(cls, project_file):
if os_sha != actual_os_sha:
raise ValidationError(
f"OpenStudio version is correct at {os_version}, but the shas don't match. "
"Got {actual_os_sha}, expected {os_sha}"
f"Got {actual_os_sha}, expected {os_sha}"
)
return True

@@ -315,7 +314,38 @@ def validate_sampler(project_file):
except AttributeError:
raise ValidationError(f'Sampler class `{sampler_name}` is not available.')
args = cfg['sampler']['args']
return Sampler.validate_args(project_file, **args)
Sampler.validate_args(project_file, **args)
if issubclass(Sampler, sampler.PrecomputedSampler):
sample_file = cfg['sampler']['args']['sample_file']
if not os.path.isabs(sample_file):
sample_file = os.path.join(os.path.dirname(project_file), sample_file)
else:
sample_file = os.path.abspath(sample_file)
buildstock_df = read_csv(sample_file, dtype=str)
BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df)
return True


@staticmethod
def validate_buildstock_csv(project_file, buildstock_df):
param_option_dict, _ = BuildStockBatchBase.get_param_option_dict(project_file)
# verify that all the columns in buildstock_df only have values available in param_option_dict
# param_option_dict has format: {column_name: [valid_option1, valid_option2, ...], ...}
errors = []
for column in buildstock_df.columns:
if column in {'Building'}:
continue
if column not in param_option_dict:
errors.append(f'Column {column} in buildstock_csv is not available in options_lookup.tsv')
continue
for option in buildstock_df[column].unique():
if option not in param_option_dict[column]:
errors.append(f'Option {option} in column {column} of buildstock_csv is not available '
f'in options_lookup.tsv')
if errors:
raise ValidationError('\n'.join(errors))

return True

@classmethod
def validate_workflow_generator(cls, project_file):
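The new validate_buildstock_csv rejects any column or option value missing from options_lookup.tsv. A hypothetical failing call (the project file and the bad option value are invented for illustration):

import pandas as pd
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.exc import ValidationError

df = pd.DataFrame({
    'Building': ['1', '2'],
    'Vintage': ['1940s', '1850s'],  # suppose '1850s' is not in options_lookup.tsv
})
try:
    BuildStockBatchBase.validate_buildstock_csv('project.yml', df)
except ValidationError as err:
    print(err)  # Option 1850s in column Vintage of buildstock_csv is not available in options_lookup.tsv
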
9 changes: 6 additions & 3 deletions buildstockbatch/eagle.py
@@ -41,7 +41,8 @@
get_error_details,
ContainerRuntime,
path_rel_to_file,
get_project_configuration
get_project_configuration,
read_csv
)
from buildstockbatch import postprocessing
from buildstockbatch.__version__ import __version__ as bsb_version
@@ -91,7 +92,8 @@ def validate_output_directory_eagle(cls, project_file):
cfg = get_project_configuration(project_file)
output_dir = path_rel_to_file(project_file, cfg['output_directory'])
if not (output_dir.startswith('/scratch') or output_dir.startswith('/projects')):
raise ValidationError(f"`output_directory` must be in /scratch or /projects, `output_directory` = {output_dir}")
raise ValidationError(f"`output_directory` must be in /scratch or /projects,"
f" `output_directory` = {output_dir}")

@property
def output_dir(self):
@@ -196,7 +198,8 @@ def run_batch(self, sampling_only=False):
return

# Determine the number of simulations expected to be executed
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)

# find out how many buildings there are to simulate
building_ids = df.index.tolist()
7 changes: 4 additions & 3 deletions buildstockbatch/local.py
@@ -21,7 +21,6 @@
import json
import logging
import os
import pandas as pd
import pathlib
import re
import shutil
@@ -30,7 +29,7 @@

from buildstockbatch.base import BuildStockBatchBase, SimulationExists
from buildstockbatch import postprocessing
from buildstockbatch.utils import log_error_details, ContainerRuntime
from buildstockbatch.utils import log_error_details, ContainerRuntime, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -232,7 +231,7 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False):
shutil.copytree(buildstock_path / "resources", lib_path / "resources")
shutil.copytree(project_path / "housing_characteristics", lib_path / "housing_characteristics")

df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
self.validate_buildstock_csv(self.project_filename, df)

building_ids = df.index.tolist()
n_datapoints = len(building_ids)
run_building_d = functools.partial(
4 changes: 2 additions & 2 deletions buildstockbatch/sampler/commercial_sobol.py
@@ -21,7 +21,7 @@

from .sobol_lib import i4_sobol_generate
from .base import BuildStockSampler
from buildstockbatch.utils import ContainerRuntime
from buildstockbatch.utils import ContainerRuntime, read_csv
from buildstockbatch.exc import ValidationError

logger = logging.getLogger(__name__)
@@ -81,7 +81,7 @@ def run_sampling(self):
tsv_hash = {}
for tsv_file in os.listdir(self.buildstock_dir):
if '.tsv' in tsv_file:
tsv_df = pd.read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
dependency_columns = [item for item in list(tsv_df) if 'Dependency=' in item]
tsv_df[dependency_columns] = tsv_df[dependency_columns].astype('str')
tsv_hash[tsv_file.replace('.tsv', '')] = tsv_df
6 changes: 3 additions & 3 deletions buildstockbatch/sampler/downselect.py
@@ -12,11 +12,11 @@
import math
import numpy as np
import os
import pandas as pd
import shutil

from .base import BuildStockSampler
from buildstockbatch.exc import ValidationError
from buildstockbatch.utils import read_csv

logger = logging.getLogger(__name__)

@@ -107,7 +107,7 @@ def run_sampling(self):
n_samples_init = 350000
init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw)
buildstock_csv_filename = init_sampler.run_sampling()
df = pd.read_csv(buildstock_csv_filename, index_col=0)
df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
df_new = df[self.downselect_logic(df, self.logic)]
downselected_n_samples_init = df_new.shape[0]
n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init)
@@ -120,7 +120,7 @@
with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + '_orig.csv.gz', 'wb') as f_out:
with open(buildstock_csv_filename, 'rb') as f_in:
shutil.copyfileobj(f_in, f_out)
df = pd.read_csv(buildstock_csv_filename, index_col=0, dtype='str')
df = read_csv(buildstock_csv_filename, index_col=0, dtype='str')
df_new = df[self.downselect_logic(df, self.logic)]
if len(df_new.index) == 0:
raise RuntimeError('There are no buildings left after the down select!')
25 changes: 13 additions & 12 deletions buildstockbatch/test/test_base.py
@@ -20,6 +20,7 @@
from buildstockbatch.local import LocalBatch
from buildstockbatch.exc import ValidationError
from buildstockbatch.postprocessing import write_dataframe_as_parquet
from buildstockbatch.utils import read_csv

dask.config.set(scheduler='synchronous')
here = os.path.dirname(os.path.abspath(__file__))
@@ -51,7 +52,7 @@ def test_reference_scenario(basic_residential_project_file):

# test results.csv files
test_path = os.path.join(results_dir, 'results_csvs')
test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
assert len(test_csv['apply_upgrade.reference_scenario'].unique()) == 1
assert test_csv['apply_upgrade.reference_scenario'].iloc[0] == 'example_reference_scenario'

@@ -79,16 +80,16 @@ def simplify_columns(colname):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutual_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutual_cols], reference_csv[mutual_cols])

test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutual_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutual_cols], reference_csv[mutual_cols])
@@ -186,15 +187,15 @@ def test_combine_files(basic_residential_project_file):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

@@ -398,7 +399,7 @@ def test_skipping_baseline(basic_residential_project_file):

def test_provide_buildstock_csv(basic_residential_project_file, mocker):
buildstock_csv = os.path.join(here, 'buildstock.csv')
df = pd.read_csv(buildstock_csv)
df = read_csv(buildstock_csv, dtype=str)
project_filename, results_dir = basic_residential_project_file({
'sampler': {
'type': 'precomputed',
@@ -412,9 +413,9 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker):

bsb = LocalBatch(project_filename)
sampling_output_csv = bsb.sampler.run_sampling()
df2 = pd.read_csv(sampling_output_csv)
df2 = read_csv(sampling_output_csv, dtype=str)
pd.testing.assert_frame_equal(df, df2)

assert (df['Geometry Shared Walls'] == "None").all() # Verify None is being read properly
# Test file missing
with open(project_filename, 'r') as f:
cfg = yaml.safe_load(f)
6 changes: 3 additions & 3 deletions buildstockbatch/test/test_eagle.py
@@ -11,7 +11,7 @@

from buildstockbatch.eagle import user_cli, EagleBatch
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.utils import get_project_configuration
from buildstockbatch.utils import get_project_configuration, read_csv

here = os.path.dirname(os.path.abspath(__file__))

@@ -281,8 +281,8 @@ def compare_ts_parquets(source, dst):
compare_ts_parquets(file, results_file)

# Check that buildstock.csv was trimmed properly
local_buildstock_df = pd.read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv')
unique_buildings = {x[0] for x in job_json['batch']}
local_buildstock_df = read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv', dtype=str)
unique_buildings = {str(x[0]) for x in job_json['batch']}
assert len(unique_buildings) == len(local_buildstock_df)
assert unique_buildings == set(local_buildstock_df['Building'])

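Since buildstock.csv is now read all-string, the integer building IDs coming out of job.json have to be cast before comparing against the CSV's Building column. In isolation (illustrative values; the layout of the batch tuples is assumed):

buildings_from_csv = {'1', '2', '3'}    # Building column from a dtype=str read
job_batch = [(1, 0), (2, 0), (3, 1)]    # assumed (building_id, upgrade) ints from job.json
unique_buildings = {str(x[0]) for x in job_batch}
assert unique_buildings == buildings_from_csv
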
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation
1,1,AL_Mobile-Rgnl.AP.722230,1940-1950,CO,Good Option,None
2,2,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,2,AL_Mobile-Rgnl.AP.722230,2000s,TX,,None
5,3,AL_Mobile-Rgnl.AP.722230,1970s,VA,,None
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation Slab
1,1,AL_Mobile-Rgnl.AP.722230,<1950,CO,Good Option,None
2,3,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,1,AL_Mobile-Rgnl.AP.722230,2000s,VA,Good Option,None
5,2,AL_Mobile-Rgnl.AP.722230,1970s,VA,Good Option,None
@@ -16,6 +16,9 @@ State VA
State CO
County County1
County County2
Bedroom 1
Bedroom 2
Bedroom 3
Insulation Slab None
Insulation Slab Good Option ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0
Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0
5 changes: 2 additions & 3 deletions buildstockbatch/test/test_postprocessing.py
@@ -3,7 +3,6 @@
import json
import logging
import os
import pandas as pd
import pathlib
import re
import tarfile
@@ -13,7 +12,7 @@

from buildstockbatch import postprocessing
from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.utils import get_project_configuration
from buildstockbatch.utils import get_project_configuration, read_csv

postprocessing.performance_report = MagicMock()

@@ -58,7 +57,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file):
postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False)

for upgrade_id in (0, 1):
df = pd.read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
df = read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
assert (df['reporting_measure1.column_1'] == 1).all()
assert (df['reporting_measure1.column_2'] == 2).all()
assert (df['reporting_measure2.column_3'] == 3).all()
