Exclude None from default_na_values list when reading csv files #374

Merged: 8 commits, merged on Jun 12, 2023
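Every diff below swaps a direct pd.read_csv call for a shared read_csv helper imported from buildstockbatch.utils. The utils.py hunk itself is not shown in this view; the following is a minimal sketch of what such a wrapper looks like, assuming pandas' documented default NA strings (the merged implementation may differ in detail):

import pandas as pd

# Strings pandas converts to NaN by default (per the pandas read_csv docs).
PANDAS_DEFAULT_NA = {
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None",
    "n/a", "nan", "null",
}

def read_csv(csv_file_path, **kwargs):
    """Like pd.read_csv, but keeps the literal string 'None' as a string.

    ResStock housing characteristics use 'None' as a real option name
    (e.g. Geometry Shared Walls = 'None'), so it must not be read as NaN.
    """
    na_values = PANDAS_DEFAULT_NA - {"None"}
    return pd.read_csv(
        csv_file_path,
        na_values=list(na_values),
        keep_default_na=False,  # otherwise pandas re-adds 'None' to the NA list
        **kwargs,
    )

keep_default_na=False is the important part: left at its default, pandas unions na_values with its built-in list and 'None' would still come back as NaN.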
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -34,8 +34,8 @@ jobs:
wget --quiet https://data.nrel.gov/system/files/156/BuildStock_TMY3_FIPS.zip
- name: Download and Install OpenStudio
run: |
- wget -q https://github.com/NREL/OpenStudio/releases/download/v3.5.1/OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
- sudo apt install -y ./OpenStudio-3.5.1+22e1db7be5-Ubuntu-20.04.deb
+ wget -q https://github.com/NREL/OpenStudio/releases/download/v3.6.1/OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
+ sudo apt install -y ./OpenStudio-3.6.1+bb9481519e-Ubuntu-20.04-x86_64.deb
Comment on lines -37 to +38
Member:

Is the new version of OpenStudio required for this? It looks like @joseph-robertson has updated it in #351 as well.

@rajeee (Contributor Author) commented Jun 8, 2023:

Yeah, develop was missing this CI update, so I did it as part of this PR. Between Joe's PR and this one, whichever gets merged last will have to update from develop (or it might be fine, since they're the exact same changes).

openstudio openstudio_version
which openstudio
- name: Install buildstockbatch
1 change: 1 addition & 0 deletions .gitignore
@@ -15,3 +15,4 @@ coverage/
.coverage
build/
.env
+ .history
6 changes: 3 additions & 3 deletions buildstockbatch/aws/aws.py
@@ -26,7 +26,6 @@
import logging
import math
import os
- import pandas as pd
import pathlib
import random
from s3fs import S3FileSystem
@@ -42,7 +41,7 @@
from buildstockbatch.base import ValidationError, BuildStockBatchBase
from buildstockbatch.aws.awsbase import AwsJobBase
from buildstockbatch import postprocessing
- from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration
+ from buildstockbatch.utils import ContainerRuntime, log_error_details, get_project_configuration, read_csv

logger = logging.getLogger(__name__)

@@ -1858,7 +1857,8 @@ def run_batch(self):
json.dump(self.cfg, f)

# Collect simulations to queue
- df = pd.read_csv(buildstock_csv_filename, index_col=0)
+ df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
+ self.validate_buildstock_csv(self.project_filename, df)
building_ids = df.index.tolist()
n_datapoints = len(building_ids)
n_sims = n_datapoints * (len(self.cfg.get('upgrades', [])) + 1)
46 changes: 38 additions & 8 deletions buildstockbatch/base.py
@@ -16,7 +16,6 @@
import logging
from lxml import objectify
import os
- import pandas as pd
import numpy as np
import re
import requests
@@ -36,7 +35,7 @@
postprocessing
)
from buildstockbatch.exc import SimulationExists, ValidationError
- from buildstockbatch.utils import path_rel_to_file, get_project_configuration
+ from buildstockbatch.utils import path_rel_to_file, get_project_configuration, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -192,7 +191,7 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
timeseries_filepath = os.path.join(sim_dir, 'run', 'results_timeseries.csv')
# FIXME: Allowing both names here for compatibility. Should consolidate on one timeseries filename.
if os.path.isfile(timeseries_filepath):
- units_dict = pd.read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
+ units_dict = read_csv(timeseries_filepath, nrows=1).transpose().to_dict()[0]
skiprows = [1]
else:
timeseries_filepath = os.path.join(sim_dir, 'run', 'enduse_timeseries.csv')
@@ -208,15 +207,15 @@ def cleanup_sim_dir(sim_dir, dest_fs, simout_ts_dir, upgrade_id, building_id):
if os.path.isfile(timeseries_filepath):
# Find the time columns present in the enduse_timeseries file
possible_time_cols = ['time', 'Time', 'TimeDST', 'TimeUTC']
- cols = pd.read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
+ cols = read_csv(timeseries_filepath, index_col=False, nrows=0).columns.tolist()
actual_time_cols = [c for c in cols if c in possible_time_cols]
if not actual_time_cols:
logger.error(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')
raise RuntimeError(f'Did not find any time column ({possible_time_cols}) in {timeseries_filepath}.')

- tsdf = pd.read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
+ tsdf = read_csv(timeseries_filepath, parse_dates=actual_time_cols, skiprows=skiprows)
if os.path.isfile(schedules_filepath):
- schedules = pd.read_csv(schedules_filepath, dtype=np.float64)
+ schedules = read_csv(schedules_filepath, dtype=np.float64)
schedules.rename(columns=lambda x: f'schedules_{x}', inplace=True)
schedules['TimeDST'] = tsdf['Time']
tsdf = tsdf.merge(schedules, how='left', on='TimeDST')
@@ -302,7 +301,7 @@ def validate_openstudio_path(cls, project_file):
if os_sha != actual_os_sha:
raise ValidationError(
f"OpenStudio version is correct at {os_version}, but the shas don't match. "
"Got {actual_os_sha}, expected {os_sha}"
f"Got {actual_os_sha}, expected {os_sha}"
)
return True

@@ -315,7 +314,38 @@ def validate_sampler(project_file):
except AttributeError:
raise ValidationError(f'Sampler class `{sampler_name}` is not available.')
args = cfg['sampler']['args']
- return Sampler.validate_args(project_file, **args)
+ Sampler.validate_args(project_file, **args)
+ if issubclass(Sampler, sampler.PrecomputedSampler):
+     sample_file = cfg['sampler']['args']['sample_file']
+     if not os.path.isabs(sample_file):
+         sample_file = os.path.join(os.path.dirname(project_file), sample_file)
+     else:
+         sample_file = os.path.abspath(sample_file)
+     buildstock_df = read_csv(sample_file, dtype=str)
+     BuildStockBatchBase.validate_buildstock_csv(project_file, buildstock_df)
+ return True


+ @staticmethod
+ def validate_buildstock_csv(project_file, buildstock_df):
Member:

What's this doing? Tell me in the changelog.

Contributor Author:

Done.

+     param_option_dict, _ = BuildStockBatchBase.get_param_option_dict(project_file)
+     # verify that all the Columns in buildstock_df only have values available in param_option_dict
+     # param_option_dict has format: {column_name: [valid_option1, valid_option2, ...], ...}
+     errors = []
+     for column in buildstock_df.columns:
+         if column in {'Building'}:
+             continue
+         if column not in param_option_dict:
+             errors.append(f'Column {column} in buildstock_csv is not available in options_lookup.tsv')
+             continue
+         for option in buildstock_df[column].unique():
+             if option not in param_option_dict[column]:
+                 errors.append(f'Option {option} in column {column} of buildstock_csv is not available '
+                               f'in options_lookup.tsv')
+     if errors:
+         raise ValidationError('\n'.join(errors))
+
+     return True

@classmethod
def validate_workflow_generator(cls, project_file):
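For illustration, a hypothetical invocation of the new validator on a precomputed sample; the file paths are placeholders and the error text is illustrative, not captured output:

from buildstockbatch.base import BuildStockBatchBase
from buildstockbatch.exc import ValidationError
from buildstockbatch.utils import read_csv

df = read_csv("buildstock.csv", dtype=str)  # placeholder path
try:
    BuildStockBatchBase.validate_buildstock_csv("project.yml", df)  # placeholder path
except ValidationError as err:
    # e.g. "Option 1940-1950 in column Vintage of buildstock_csv is not
    # available in options_lookup.tsv"
    print(err)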
9 changes: 6 additions & 3 deletions buildstockbatch/eagle.py
@@ -41,7 +41,8 @@
get_error_details,
ContainerRuntime,
path_rel_to_file,
- get_project_configuration
+ get_project_configuration,
+ read_csv
)
from buildstockbatch import postprocessing
from buildstockbatch.__version__ import __version__ as bsb_version
@@ -91,7 +92,8 @@ def validate_output_directory_eagle(cls, project_file):
cfg = get_project_configuration(project_file)
output_dir = path_rel_to_file(project_file, cfg['output_directory'])
if not (output_dir.startswith('/scratch') or output_dir.startswith('/projects')):
raise ValidationError(f"`output_directory` must be in /scratch or /projects, `output_directory` = {output_dir}")
raise ValidationError(f"`output_directory` must be in /scratch or /projects,"
f" `output_directory` = {output_dir}")

@property
def output_dir(self):
@@ -196,7 +198,8 @@ def run_batch(self, sampling_only=False):
return

# Determine the number of simulations expected to be executed
- df = pd.read_csv(buildstock_csv_filename, index_col=0)
+ df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
+ self.validate_buildstock_csv(self.project_filename, df)

# find out how many buildings there are to simulate
building_ids = df.index.tolist()
7 changes: 4 additions & 3 deletions buildstockbatch/local.py
@@ -21,7 +21,6 @@
import json
import logging
import os
- import pandas as pd
import pathlib
import re
import shutil
@@ -30,7 +29,7 @@

from buildstockbatch.base import BuildStockBatchBase, SimulationExists
from buildstockbatch import postprocessing
- from buildstockbatch.utils import log_error_details, ContainerRuntime
+ from buildstockbatch.utils import log_error_details, ContainerRuntime, read_csv
from buildstockbatch.__version__ import __version__ as bsb_version

logger = logging.getLogger(__name__)
@@ -232,7 +231,9 @@ def run_batch(self, n_jobs=None, measures_only=False, sampling_only=False):
shutil.copytree(buildstock_path / "resources", lib_path / "resources")
shutil.copytree(project_path / "housing_characteristics", lib_path / "housing_characteristics")

- df = pd.read_csv(buildstock_csv_filename, index_col=0)
+ df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
+ self.validate_buildstock_csv(self.project_filename, df)

building_ids = df.index.tolist()
n_datapoints = len(building_ids)
run_building_d = functools.partial(
4 changes: 2 additions & 2 deletions buildstockbatch/sampler/commercial_sobol.py
@@ -21,7 +21,7 @@

from .sobol_lib import i4_sobol_generate
from .base import BuildStockSampler
- from buildstockbatch.utils import ContainerRuntime
+ from buildstockbatch.utils import ContainerRuntime, read_csv
from buildstockbatch.exc import ValidationError

logger = logging.getLogger(__name__)
@@ -81,7 +81,7 @@ def run_sampling(self):
tsv_hash = {}
for tsv_file in os.listdir(self.buildstock_dir):
if '.tsv' in tsv_file:
- tsv_df = pd.read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
+ tsv_df = read_csv(os.path.join(self.buildstock_dir, tsv_file), sep='\t')
dependency_columns = [item for item in list(tsv_df) if 'Dependency=' in item]
tsv_df[dependency_columns] = tsv_df[dependency_columns].astype('str')
tsv_hash[tsv_file.replace('.tsv', '')] = tsv_df
6 changes: 3 additions & 3 deletions buildstockbatch/sampler/downselect.py
@@ -12,11 +12,11 @@
import math
import numpy as np
import os
- import pandas as pd
import shutil

from .base import BuildStockSampler
from buildstockbatch.exc import ValidationError
+ from buildstockbatch.utils import read_csv

logger = logging.getLogger(__name__)

@@ -107,7 +107,7 @@ def run_sampling(self):
n_samples_init = 350000
init_sampler = self.SUB_SAMPLER_CLASS(self.parent(), n_datapoints=n_samples_init, **self.sub_kw)
buildstock_csv_filename = init_sampler.run_sampling()
- df = pd.read_csv(buildstock_csv_filename, index_col=0)
+ df = read_csv(buildstock_csv_filename, index_col=0, dtype=str)
df_new = df[self.downselect_logic(df, self.logic)]
downselected_n_samples_init = df_new.shape[0]
n_samples = math.ceil(self.n_datapoints * n_samples_init / downselected_n_samples_init)
@@ -120,7 +120,7 @@
with gzip.open(os.path.splitext(buildstock_csv_filename)[0] + '_orig.csv.gz', 'wb') as f_out:
with open(buildstock_csv_filename, 'rb') as f_in:
shutil.copyfileobj(f_in, f_out)
- df = pd.read_csv(buildstock_csv_filename, index_col=0, dtype='str')
+ df = read_csv(buildstock_csv_filename, index_col=0, dtype='str')
df_new = df[self.downselect_logic(df, self.logic)]
if len(df_new.index) == 0:
raise RuntimeError('There are no buildings left after the down select!')
25 changes: 13 additions & 12 deletions buildstockbatch/test/test_base.py
@@ -20,6 +20,7 @@
from buildstockbatch.local import LocalBatch
from buildstockbatch.exc import ValidationError
from buildstockbatch.postprocessing import write_dataframe_as_parquet
+ from buildstockbatch.utils import read_csv

dask.config.set(scheduler='synchronous')
here = os.path.dirname(os.path.abspath(__file__))
@@ -51,7 +52,7 @@ def test_reference_scenario(basic_residential_project_file):

# test results.csv files
test_path = os.path.join(results_dir, 'results_csvs')
- test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
+ test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).set_index('building_id').sort_index()
assert len(test_csv['apply_upgrade.reference_scenario'].unique()) == 1
assert test_csv['apply_upgrade.reference_scenario'].iloc[0] == 'example_reference_scenario'

@@ -79,16 +80,16 @@ def simplify_columns(colname):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

- test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
+ test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
- reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
+ reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutul_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutul_cols], reference_csv[mutul_cols])

- test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
+ test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
- reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
+ reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).rename(columns=simplify_columns).\
sort_values('buildingid').reset_index().drop(columns=['index'])
mutul_cols = list(set(test_csv.columns).intersection(set(reference_csv)))
pd.testing.assert_frame_equal(test_csv[mutul_cols], reference_csv[mutul_cols])
@@ -186,15 +187,15 @@ def test_combine_files(basic_residential_project_file):
reference_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_results', 'results_csvs')
test_path = os.path.join(results_dir, 'results_csvs')

- test_csv = pd.read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
+ test_csv = read_csv(os.path.join(test_path, 'results_up00.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
- reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
+ reference_csv = read_csv(os.path.join(reference_path, 'results_up00.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

- test_csv = pd.read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
+ test_csv = read_csv(os.path.join(test_path, 'results_up01.csv.gz')).sort_values('building_id').reset_index()\
.drop(columns=['index'])
- reference_csv = pd.read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
+ reference_csv = read_csv(os.path.join(reference_path, 'results_up01.csv.gz')).sort_values('building_id')\
.reset_index().drop(columns=['index'])
pd.testing.assert_frame_equal(test_csv, reference_csv)

@@ -398,7 +399,7 @@ def test_skipping_baseline(basic_residential_project_file):

def test_provide_buildstock_csv(basic_residential_project_file, mocker):
buildstock_csv = os.path.join(here, 'buildstock.csv')
- df = pd.read_csv(buildstock_csv)
+ df = read_csv(buildstock_csv, dtype=str)
project_filename, results_dir = basic_residential_project_file({
'sampler': {
'type': 'precomputed',
@@ -412,9 +413,9 @@ def test_provide_buildstock_csv(basic_residential_project_file, mocker):

bsb = LocalBatch(project_filename)
sampling_output_csv = bsb.sampler.run_sampling()
- df2 = pd.read_csv(sampling_output_csv)
+ df2 = read_csv(sampling_output_csv, dtype=str)
pd.testing.assert_frame_equal(df, df2)

+ assert (df['Geometry Shared Walls'] == "None").all()  # Verify None is being read properly
# Test file missing
with open(project_filename, 'r') as f:
cfg = yaml.safe_load(f)
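The Geometry Shared Walls assertion above is the crux of the test: stock pandas treats the string 'None' as a missing value. A standalone illustration of the behavior (not part of the PR):

import io
import pandas as pd

csv_text = "Building,Geometry Shared Walls\n1,None\n"

# Default behavior: 'None' is on pandas' NA list, so it comes back as NaN.
pd.read_csv(io.StringIO(csv_text))["Geometry Shared Walls"].iloc[0]  # nan

# With a trimmed NA list, the literal string survives.
pd.read_csv(io.StringIO(csv_text), keep_default_na=False,
            na_values=["", "NaN"])["Geometry Shared Walls"].iloc[0]  # 'None'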
6 changes: 3 additions & 3 deletions buildstockbatch/test/test_eagle.py
@@ -11,7 +11,7 @@

from buildstockbatch.eagle import user_cli, EagleBatch
from buildstockbatch.base import BuildStockBatchBase
- from buildstockbatch.utils import get_project_configuration
+ from buildstockbatch.utils import get_project_configuration, read_csv

here = os.path.dirname(os.path.abspath(__file__))

@@ -281,8 +281,8 @@ def compare_ts_parquets(source, dst):
compare_ts_parquets(file, results_file)

# Check that buildstock.csv was trimmed properly
- local_buildstock_df = pd.read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv')
- unique_buildings = {x[0] for x in job_json['batch']}
+ local_buildstock_df = read_csv(results_dir / 'local_housing_characteristics_dir' / 'buildstock.csv', dtype=str)
+ unique_buildings = {str(x[0]) for x in job_json['batch']}
assert len(unique_buildings) == len(local_buildstock_df)
assert unique_buildings == set(local_buildstock_df['Building'])

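The two new test fixtures below (filenames not preserved in this view) appear to be a buildstock.csv intended to fail validation (note the bare 'Insulation' column header and the '1940-1950' vintage spelling) and a counterpart intended to pass.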
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation
1,1,AL_Mobile-Rgnl.AP.722230,1940-1950,CO,Good Option,None
2,2,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,2,AL_Mobile-Rgnl.AP.722230,2000s,TX,,None
5,3,AL_Mobile-Rgnl.AP.722230,1970s,VA,,None
@@ -0,0 +1,6 @@
Building,Bedroom,Location,Vintage,State,Insulation Wall,Insulation Slab
1,1,AL_Mobile-Rgnl.AP.722230,<1950,CO,Good Option,None
2,3,AL_Mobile-Rgnl.AP.722230,1940s,CO,Good Option,None
3,2,AL_Mobile-Rgnl.AP.722230,2010s,VA,Good Option,None
4,1,AL_Mobile-Rgnl.AP.722230,2000s,VA,Good Option,None
5,2,AL_Mobile-Rgnl.AP.722230,1970s,VA,Good Option,None
@@ -16,6 +16,9 @@ State VA
State CO
County County1
County County2
+ Bedroom 1
+ Bedroom 2
+ Bedroom 3
Insulation Slab None
Insulation Slab Good Option ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=0 gap_r=0 exterior_r=0 exterior_depth=0
Insulation Slab Missing Argument ResidentialConstructionsSlab perimeter_r=0 perimeter_width=0 whole_r=10 gap_r=5 exterior_r=0
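validate_buildstock_csv consumes a mapping from each parameter to its valid options. A rough sketch of how such a mapping could be derived from options_lookup.tsv; the real get_param_option_dict in base.py may differ:

import csv
from collections import defaultdict

def build_param_option_dict(options_lookup_path):
    """Map each parameter in options_lookup.tsv to the set of its option names.

    Assumes the first two tab-separated columns are parameter and option,
    as in the rows shown above (e.g. 'Insulation Slab' -> {'None', 'Good Option', ...}).
    """
    param_option_dict = defaultdict(set)
    with open(options_lookup_path, newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) >= 2 and row[0]:
                param_option_dict[row[0]].add(row[1])
    return dict(param_option_dict)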
5 changes: 2 additions & 3 deletions buildstockbatch/test/test_postprocessing.py
@@ -3,7 +3,6 @@
import json
import logging
import os
- import pandas as pd
import pathlib
import re
import tarfile
@@ -13,7 +12,7 @@

from buildstockbatch import postprocessing
from buildstockbatch.base import BuildStockBatchBase
- from buildstockbatch.utils import get_project_configuration
+ from buildstockbatch.utils import get_project_configuration, read_csv

postprocessing.performance_report = MagicMock()

@@ -58,7 +57,7 @@ def test_report_additional_results_csv_columns(basic_residential_project_file):
postprocessing.combine_results(fs, results_dir, cfg, do_timeseries=False)

for upgrade_id in (0, 1):
- df = pd.read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
+ df = read_csv(str(results_dir / 'results_csvs' / f'results_up{upgrade_id:02d}.csv.gz'))
assert (df['reporting_measure1.column_1'] == 1).all()
assert (df['reporting_measure1.column_2'] == 2).all()
assert (df['reporting_measure2.column_3'] == 3).all()