Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: racially insensitive language #205

Merged
merged 12 commits into from Mar 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 1 addition & 3 deletions .circleci/config.yml
Expand Up @@ -2,14 +2,13 @@ version: 2
jobs:
build:
docker:
- image: continuumio/miniconda3
- image: cimg/python:3.8
steps:
- setup_remote_docker
- checkout
- run:
name: Install buildstock
command: |
conda install -c conda-forge -y -q scipy numpy "pandas>=1.0.0,!=1.0.4" "pyarrow>=0.14.1" dask joblib pyyaml "ruamel.yaml>=0.15.0"
pip install .[dev] --progress-bar off
- run:
name: Run PyTest
Expand All @@ -35,7 +34,6 @@ jobs:
name: Build documentation
when: always
command: |
apt-get install make
cd docs
make html
mkdir /tmp/docs
Expand Down
43 changes: 43 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,43 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at:
// https://github.com/microsoft/vscode-dev-containers/tree/v0.158.0/containers/docker-existing-dockerfile
{
"name": "Existing Dockerfile",

// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",

// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerFile": "../Dockerfile",

// Set *default* container specific settings.json values on container create.
"settings": {
"terminal.integrated.shell.linux": null
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": [
"ms-python.python"
],

"workspaceMount": "source=${localWorkspaceFolder},target=/buildstock-batch,type=bind",
"workspaceFolder": "/buildstock-batch",

"mounts": [
"source=${localEnv:HOME}/.aws,target=/root/.aws,type=bind"
]

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Uncomment the next line to run commands after the container is created - for example installing curl.
// "postCreateCommand": "apt-get update && apt-get install -y curl",

// Uncomment when using a ptrace-based debugger like C++, Go, and Rust
// "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],

// Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker.
// "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],

// Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root.
// "remoteUser": "vscode"
}
34 changes: 16 additions & 18 deletions Dockerfile
@@ -1,26 +1,24 @@
FROM nrel/openstudio:2.9.1

ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH /opt/conda/bin:$PATH

RUN apt-get update --fix-missing && \
apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 git mercurial subversion && \
apt-get clean
RUN sudo apt update && \
sudo apt install -y wget build-essential checkinstall libreadline-gplv2-dev libncursesw5-dev libssl-dev \
libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev

RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.8.3-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda clean -tipsy && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate base" >> ~/.bashrc && \
find /opt/conda/ -follow -type f -name '*.a' -delete && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
/opt/conda/bin/conda clean -afy
RUN wget https://www.python.org/ftp/python/3.8.8/Python-3.8.8.tgz && \
tar xzf Python-3.8.8.tgz && \
cd Python-3.8.8 && \
./configure --enable-optimizations && \
make altinstall && \
rm -rf Python-3.8.8 && \
rm -rf Python-3.8.8.tgz


RUN conda install -c conda-forge -y pandas dask pyarrow python-dateutil joblib && \
python -m pip install --upgrade pip
RUN sudo apt install -y -V ca-certificates lsb-release && \
wget https://apache.bintray.com/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb && \
sudo apt install -y -V ./apache-arrow-archive-keyring-latest-$(lsb_release --codename --short).deb && \
sudo apt update && \
sudo apt install -y -V libarrow-dev libarrow-glib-dev libarrow-dataset-dev libparquet-dev libparquet-glib-dev

COPY . /buildstock-batch/
RUN python -m pip install /buildstock-batch
RUN python3.8 -m pip install /buildstock-batch
9 changes: 6 additions & 3 deletions aws_demo_project.yml
Expand Up @@ -3,14 +3,15 @@ buildstock_directory: ../resstock # Relative to this file or absolute
project_directory: project_national # Relative to buildstock_directory
output_directory: ../demo_test_outputs
weather_files_url: https://data.nrel.gov/system/files/156/BuildStock_TMY3_FIPS.zip
# weather_files_path: ../weather/BuildStock_TMY3_FIPS.zip

baseline:
  n_buildings_represented: 133172057 # Total number of residential dwelling units in contiguous United States, including unoccupied units, resulting from a census tract level query of ACS 5-yr 2016 (i.e. 2012-2016), using this script: https://github.com/NREL/resstock-estimation/blob/master/sources/spatial/tsv_maker.py.

sampler:
type: residential_quota # change to residential_quota_downselect to do downselect
args:
n_datapoints: 4
n_datapoints: 100
# logic:
# - Geometry Building Type RECS|Single-Family Detached
# - Vacancy Status|Occupied
Expand Down Expand Up @@ -39,10 +40,12 @@ aws:
bucket: resbldg-datasets
prefix: testing/external_demo_project
emr:
slave_instance_count: 1
manager_instance_type: m5.xlarge
worker_instance_type: r5.4xlarge
worker_instance_count: 1
region: us-west-2
use_spot: true
batch_array_size: 4
batch_array_size: 100
# To receive email updates on job progress accept the request to receive emails that will be sent from Amazon
notifications_email: user@example.com

Expand Down
102 changes: 82 additions & 20 deletions buildstockbatch/aws/aws.py
Expand Up @@ -14,9 +14,11 @@
import base64
import boto3
from botocore.exceptions import ClientError
import collections
import csv
from fsspec.implementations.local import LocalFileSystem
import gzip
import hashlib
import itertools
from joblib import Parallel, delayed
import json
Expand Down Expand Up @@ -80,6 +82,20 @@ def compress_file(in_filename, out_filename):
shutil.copyfileobj(f_in, f_out)


def calc_hash_for_file(filename):
    """Compute and return the SHA-256 hex digest of a file's full contents.

    :param filename: path of the file to hash
    :returns: hexadecimal digest string
    """
    with open(filename, 'rb') as file_obj:
        contents = file_obj.read()
    return hashlib.sha256(contents).hexdigest()


def copy_s3_file(src_bucket, src_key, dest_bucket, dest_key):
    """Copy a single object from one S3 location to another.

    :param src_bucket: bucket name of the source object
    :param src_key: key of the source object
    :param dest_bucket: bucket name to copy the object into
    :param dest_key: key to copy the object to
    """
    copy_source = {'Bucket': src_bucket, 'Key': src_key}
    client = boto3.client('s3')
    client.copy(copy_source, Bucket=dest_bucket, Key=dest_key)


class AwsBatchEnv(AwsJobBase):
"""
Class to manage the AWS Batch environment and Step Function controller.
Expand Down Expand Up @@ -1486,14 +1502,14 @@ def create_emr_cluster_function(self):
{
'Market': 'SPOT' if self.batch_use_spot else 'ON_DEMAND',
'InstanceRole': 'MASTER',
'InstanceType': self.emr_master_instance_type,
'InstanceType': self.emr_manager_instance_type,
'InstanceCount': 1
},
{
'Market': 'SPOT' if self.batch_use_spot else 'ON_DEMAND',
'InstanceRole': 'CORE',
'InstanceType': self.emr_slave_instance_type,
'InstanceCount': self.emr_slave_instance_count
'InstanceType': self.emr_worker_instance_type,
'InstanceCount': self.emr_worker_instance_count
},
],
'Ec2SubnetId': self.priv_vpc_subnet_id_1,
Expand Down Expand Up @@ -1667,8 +1683,8 @@ def validate_instance_types(project_file):
ec2 = boto3_session.client('ec2')
job_base = AwsJobBase('genericjobid', aws_config, boto3_session)
instance_types_requested = set()
instance_types_requested.add(job_base.emr_master_instance_type)
instance_types_requested.add(job_base.emr_slave_instance_type)
instance_types_requested.add(job_base.emr_manager_instance_type)
instance_types_requested.add(job_base.emr_worker_instance_type)
inst_type_resp = ec2.describe_instance_type_offerings(Filters=[{
'Name': 'instance-type',
'Values': list(instance_types_requested)
Expand Down Expand Up @@ -1775,7 +1791,7 @@ def run_batch(self):
buildstock_csv_filename = self.sampler.run_sampling()

# Compress and upload assets to S3
with tempfile.TemporaryDirectory() as tmpdir, tempfile.TemporaryDirectory() as tmp_weather_dir:
with tempfile.TemporaryDirectory(prefix='bsb_') as tmpdir, tempfile.TemporaryDirectory(prefix='bsb_') as tmp_weather_dir: # noqa: E501
self._weather_dir = tmp_weather_dir
self._get_weather_files()
tmppath = pathlib.Path(tmpdir)
Expand All @@ -1786,17 +1802,32 @@ def run_batch(self):
tar_f.add(buildstock_path / 'measures', 'measures')
tar_f.add(buildstock_path / 'resources', 'lib/resources')
tar_f.add(project_path / 'housing_characteristics', 'lib/housing_characteristics')
logger.debug('Compressing weather files')

# Weather files
weather_path = tmppath / 'weather'
os.makedirs(weather_path)

# Determine the unique weather files
epw_filenames = list(filter(lambda x: x.endswith('.epw'), os.listdir(self.weather_dir)))
logger.debug('Calculating hashes for weather files')
epw_hashes = Parallel(n_jobs=-1, verbose=9)(
delayed(calc_hash_for_file)(pathlib.Path(self.weather_dir) / epw_filename)
for epw_filename in epw_filenames
)
unique_epws = collections.defaultdict(list)
for epw_filename, epw_hash in zip(epw_filenames, epw_hashes):
unique_epws[epw_hash].append(epw_filename)

# Compress unique weather files
logger.debug('Compressing weather files')
Parallel(n_jobs=-1, verbose=9)(
delayed(compress_file)(
pathlib.Path(self.weather_dir) / epw_filename,
str(weather_path / epw_filename) + '.gz'
pathlib.Path(self.weather_dir) / x[0],
str(weather_path / x[0]) + '.gz'
)
for epw_filename
in filter(lambda x: x.endswith('.epw'), os.listdir(self.weather_dir))
for x in unique_epws.values()
)

logger.debug('Writing project configuration for upload')
with open(tmppath / 'config.json', 'wt', encoding='utf-8') as f:
json.dump(self.cfg, f)
Expand Down Expand Up @@ -1855,6 +1886,22 @@ def run_batch(self):
logger.debug('Uploading files to S3')
upload_directory_to_s3(tmppath, self.cfg['aws']['s3']['bucket'], self.cfg['aws']['s3']['prefix'])

# Copy the non-unique weather files on S3
epws_to_copy = []
for epws in unique_epws.values():
# The first in the list is already up there, copy the rest
for filename in epws[1:]:
epws_to_copy.append((
f"{self.cfg['aws']['s3']['prefix']}/weather/{epws[0]}.gz",
f"{self.cfg['aws']['s3']['prefix']}/weather/{filename}.gz"
))

logger.debug('Copying weather files on S3')
bucket = self.cfg['aws']['s3']['bucket']
Parallel(n_jobs=-1, verbose=9)(
delayed(copy_s3_file)(bucket, src, bucket, dest) for src, dest in epws_to_copy
)

# Create the output directories
fs = S3FileSystem()
for upgrade_id in range(len(self.cfg.get('upgrades', [])) + 1):
Expand Down Expand Up @@ -1883,7 +1930,7 @@ def run_batch(self):
job_env_cfg = self.cfg['aws'].get('job_environment', {})
batch_env.create_job_definition(
image_url,
command=['python3', '-m', 'buildstockbatch.aws.aws'],
command=['python3.8', '-m', 'buildstockbatch.aws.aws'],
vcpus=job_env_cfg.get('vcpus', 1),
memory=job_env_cfg.get('memory', 1024),
env_vars=env_vars
Expand Down Expand Up @@ -1946,19 +1993,34 @@ def run_job(cls, job_id, bucket, prefix, job_name, region):

logger.debug('Getting weather files')
weather_dir = sim_dir / 'weather'
os.makedirs(weather_dir)
building_ids = [x[0] for x in jobs_d['batch']]
os.makedirs(weather_dir, exist_ok=True)

# Make a lookup of which parameter points to the weather file from options_lookup.tsv
with open(sim_dir / 'lib' / 'resources' / 'options_lookup.tsv', 'r', encoding='utf-8') as f:
tsv_reader = csv.reader(f, delimiter='\t')
next(tsv_reader) # skip headers
param_name = None
epws_by_option = {}
for row in tsv_reader:
row_has_epw = [x.endswith('.epw') for x in row[2:]]
if sum(row_has_epw):
if row[0] != param_name and param_name is not None:
raise RuntimeError(f'The epw files are specified in options_lookup.tsv under more than one parameter type: {param_name}, {row[0]}') # noqa: E501
epw_filename = row[row_has_epw.index(True) + 2].split('=')[1]
param_name = row[0]
option_name = row[1]
epws_by_option[option_name] = epw_filename

# Look through the buildstock.csv to find the appropriate location and epw
epws_to_download = set()
building_ids = [x[0] for x in jobs_d['batch']]
with open(sim_dir / 'lib' / 'housing_characteristics' / 'buildstock.csv', 'r', encoding='utf-8') as f:
csv_reader = csv.DictReader(f)
loc_col = 'Location Weather Filename'
if loc_col not in csv_reader.fieldnames:
loc_col = 'Location'
for row in csv_reader:
if int(row['Building']) in building_ids:
epws_to_download.add(row[loc_col])
if loc_col == 'Location':
epws_to_download = map('USA_{}.epw'.format, epws_to_download)
epws_to_download.add(epws_by_option[row[param_name]])

# Download the epws needed for these simulations
for epw_filename in epws_to_download:
with io.BytesIO() as f_gz:
logger.debug('Downloading {}.gz'.format(epw_filename))
Expand Down
6 changes: 3 additions & 3 deletions buildstockbatch/aws/awsbase.py
Expand Up @@ -177,9 +177,9 @@ def __init__(self, job_identifier, aws_config, boto3_session):

# EMR
emr_config = aws_config.get('emr', {})
self.emr_master_instance_type = emr_config.get('master_instance_type', 'm5.4xlarge')
self.emr_slave_instance_type = emr_config.get('slave_instance_type', 'r5.4xlarge')
self.emr_slave_instance_count = emr_config.get('slave_instance_count', 4)
self.emr_manager_instance_type = emr_config.get('manager_instance_type', 'm5.4xlarge')
self.emr_worker_instance_type = emr_config.get('worker_instance_type', 'r5.4xlarge')
self.emr_worker_instance_count = emr_config.get('worker_instance_count', 4)
self.emr_cluster_security_group_name = f'{self.job_identifier}_emr_security_group'
self.emr_cluster_name = f'{self.job_identifier}_emr_dask_cluster'
self.emr_job_flow_role_name = f'{self.job_identifier}_emr_job_flow_role'
Expand Down
3 changes: 2 additions & 1 deletion buildstockbatch/aws/s3_assets/bootstrap-dask-custom
Expand Up @@ -41,8 +41,9 @@ conda install \
python=3.7 \
"dask-yarn>=0.7.0" \
"pandas>=1.0.0,!=1.0.4" \
"pyarrow>=0.14.1" \
"pyarrow>=3.0.0" \
"s3fs>=0.4.2,<0.5.0" \
"numpy>=1.20.0" \
conda-pack \
tornado=5

Expand Down
3 changes: 2 additions & 1 deletion buildstockbatch/aws/s3_assets/setup_postprocessing.py
Expand Up @@ -10,6 +10,7 @@
's3fs>=0.4.2,<0.5.0',
'boto3',
'pandas>=1.0.0,!=1.0.4',
'pyarrow>=0.14.1',
'pyarrow>=3.0.0',
'numpy>=1.20.0'
]
)
6 changes: 3 additions & 3 deletions buildstockbatch/schemas/v0.3.yaml
Expand Up @@ -31,9 +31,9 @@ aws-job-environment:
memory: int(min=1024, required=False)

aws-emr-spec:
master_instance_type: str(required=False)
slave_instance_type: str(required=False)
slave_instance_count: int(min=1, required=False)
manager_instance_type: str(required=False)
worker_instance_type: str(required=False)
worker_instance_count: int(min=1, required=False)
dask_worker_vcores: int(min=1, required=False)

hpc-spec:
Expand Down
1 change: 0 additions & 1 deletion buildstockbatch/workflow_generator/base.py
Expand Up @@ -20,7 +20,6 @@ class WorkflowGeneratorBase(object):
def __init__(self, cfg, n_datapoints):
self.cfg = cfg
self.n_datapoints = n_datapoints
self.validate(self.cfg)

def create_osw(self, sim_id, building_id, upgrade_idx):
"""
Expand Down
2 changes: 1 addition & 1 deletion create_eagle_env.sh
Expand Up @@ -26,7 +26,7 @@ MY_CONDA_PREFIX="$CONDA_ENVS_DIR/$MY_CONDA_ENV_NAME"
echo "Creating $MY_CONDA_PREFIX"
module load conda
conda remove -y --prefix "$MY_CONDA_PREFIX" --all
conda create -y --prefix "$MY_CONDA_PREFIX" -c conda-forge "pyarrow>=0.14" python=3.7.8 "pandas>=1.0.0,!=1.0.4" dask distributed ruby
conda create -y --prefix "$MY_CONDA_PREFIX" -c conda-forge "pyarrow>=3.0.0" python=3.7.8 "numpy>=1.20.0" "pandas>=1.0.0,!=1.0.4" dask distributed ruby
source activate "$MY_CONDA_PREFIX"
pip install --upgrade pip
if [ $DEV -eq 1 ]
Expand Down