
Merge pull request #247 from NREL/smaller_partitions
Smaller timeseries partitions
nmerket committed Sep 13, 2021
2 parents fcd7131 + 03f6ee0 commit 1f03a84
Showing 2 changed files with 11 additions and 2 deletions.
2 changes: 1 addition & 1 deletion buildstockbatch/postprocessing.py
@@ -34,7 +34,7 @@

logger = logging.getLogger(__name__)

-MAX_PARQUET_MEMORY = 4000  # maximum size (MB) of the parquet file in memory when combining multiple parquets
+MAX_PARQUET_MEMORY = 1000  # maximum size (MB) of the parquet file in memory when combining multiple parquets


def read_data_point_out_json(fs, reporting_measures, filename):
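
For context, here is a minimal sketch of how a memory cap like MAX_PARQUET_MEMORY can drive combining many small parquet files into outputs of bounded in-memory size. This is not the repository's actual implementation; the function name and inputs are hypothetical.

def group_files_by_memory(files_with_sizes, max_mb=1000):
    """Greedily pack (filename, size_mb) pairs into groups whose
    estimated combined in-memory size stays under max_mb."""
    groups, current, total = [], [], 0.0
    for name, size_mb in files_with_sizes:
        # Close out the current group before it would exceed the cap.
        if current and total + size_mb > max_mb:
            groups.append(current)
            current, total = [], 0.0
        current.append(name)
        total += size_mb
    if current:
        groups.append(current)
    return groups

# With max_mb=1000 instead of 4000, each group (and therefore each
# combined parquet file) is smaller, but there are more of them.
groups = group_files_by_memory([("a.parquet", 600), ("b.parquet", 500), ("c.parquet", 300)])
# -> [["a.parquet"], ["b.parquet", "c.parquet"]]
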
11 changes: 10 additions & 1 deletion docs/changelog/changelog_dev.rst
@@ -48,4 +48,13 @@ Development Changelog
    :tickets:

    The buildstock.csv is trimmed for each batch job to hold only the rows corresponding to buildings in the batch.
    This improves speed and memory consumption when the file is loaded in ResStock.

.. change::
    :tags: general, postprocessing
    :pullreq: 247

    The output partition size of 4GB was making downstream data processing
    difficult: both Spark and Dask clusters were failing with out-of-memory
    errors. The limit is changed back to 1GB, which produces more files,
    but each is more manageable.
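
To illustrate the downstream effect, here is a hypothetical Dask usage sketch (the S3 path and column name are invented, not from the source). Each combined parquet file typically maps to one partition, and a worker must hold a whole partition in memory while processing it, so ~1GB files leave headroom where ~4GB files could tip workers into out-of-memory failures.

import dask.dataframe as dd

# Read the combined timeseries output; each ~1GB file becomes roughly
# one partition that a worker can comfortably hold in memory.
ts = dd.read_parquet("s3://bucket/timeseries/")  # hypothetical location
result = ts.groupby("building_id").sum().compute()  # hypothetical column
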
