Add method to compute (weighted) quantiles (#686)

IAMconsortium · Dec 15, 2022 · 4cd4c7d · 4cd4c7d
1 parent e0534d9
commit 4cd4c7d
Show file tree

Hide file tree

Showing 6 changed files with 717 additions and 2 deletions.
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -11,6 +11,7 @@ Bump minimum version of **pandas** to v1.2.0 to support automatic engine selecti
 - [#713](https://github.com/IAMconsortium/pyam/pull/713) Informative error when using lists for filter by level, `level` now a forbidden column.
 - [#709](https://github.com/IAMconsortium/pyam/pull/709) Hotfix ops to support `fillna=0`
 - [#708](https://github.com/IAMconsortium/pyam/pull/708) Remove 'xls' as by-default-supported file format
+- [#686](https://github.com/IAMconsortium/pyam/pull/686) Add support for (weighted) quantile timeseries as `df.compute.quantiles()` with a [tutorial](https://pyam-iamc.readthedocs.io/en/stable/tutorials/quantiles.html)
 
 # Release v1.6.0
 

diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst
@@ -21,12 +21,13 @@ The source code is available in the folder
 
    tutorials/pyam_first_steps.ipynb
    tutorials/data_table_formats.ipynb
+   tutorials/unit_conversion.ipynb
+   tutorials/algebraic_operations.ipynb
+   tutorials/quantiles.ipynb
    tutorials/iiasa_dbs.ipynb
    tutorials/unfccc.ipynb
    tutorials/GAMS_to_pyam.ipynb
-   tutorials/unit_conversion.ipynb
    tutorials/aggregating_downscaling_consistency.ipynb
-   tutorials/algebraic_operations.ipynb
    tutorials/subannual_time_resolution.ipynb
    tutorials/ipcc_colors.ipynb
    tutorials/legends.ipynb

diff --git a/doc/source/tutorials/quantiles.ipynb b/doc/source/tutorials/quantiles.ipynb
diff --git a/pyam/compute.py b/pyam/compute.py
@@ -1,4 +1,6 @@
+import itertools
 import math
+import wquantiles
 import pandas as pd
 from pyam.index import replace_index_values
 from pyam.timeseries import growth_rate
@@ -22,6 +24,91 @@ class IamComputeAccessor:
     def __init__(self, df):
+        """Store the object this accessor operates on as `self._df`."""
         self._df = df
 
+    def quantiles(
+        self, quantiles, weights=None, level=["model", "scenario"], append=False
+    ):
+        """Compute the optionally weighted quantiles of data grouped by `level`.
+
+        For example, the following will provide the interquartile range and median value
+        of CO2 emissions across all models and scenarios in a given dataset:
+
+        .. code-block:: python
+
+            df.filter(variable='Emissions|CO2').compute.quantiles([0.25, 0.5, 0.75])
+
+        Quantiles are computed separately for every combination of the index
+        columns that are not part of `level`. Each quantile is returned as a
+        timeseries with the model name "Quantiles" (or "Weighted Quantiles" if
+        `weights` is given) and the quantile value as scenario name.
+
+        Parameters
+        ----------
+        quantiles : collection
+            Group of quantile values to compute
+        weights : pd.Series, optional
+            Series named "weight" and indexed by `level`; timeseries without a
+            matching weight are dropped. If omitted, all timeseries have weight 1.
+        level : str or collection of str, optional
+            The index columns to compute quantiles over
+        append : bool, optional
+            Whether to append computed timeseries data to this instance.
+
+        Returns
+        -------
+        :class:`IamDataFrame` or **None**
+            Computed data or None if `append=True`.
+
+        Raises
+        ------
+        ValueError
+            If more than one variable provided or if argument `weights` is malformed.
+        """
+        from pyam.core import (
+            IamDataFrame,
+            concat,
+        )  # here because of circular import issue
+
+        self_df = self._df
+        if len(self_df.variable) > 1:
+            raise ValueError(
+                "quantiles() currently supports only 1 variable, and this "
+                f"dataframe has {len(self_df.variable)}"
+            )
+        if weights is not None and weights.name != "weight":
+            raise ValueError("weights pd.Series must have name 'weight'")
+
+        # use a fresh list: pandas would treat a tuple passed to `drop(columns=...)`
+        # as one single label, and the shared default argument stays untouched
+        level = [level] if isinstance(level, str) else list(level)
+        # materialize once because the values are iterated for every group below
+        quantiles = list(quantiles)
+
+        df = self_df.timeseries()
+        model = (
+            "Quantiles" if weights is None else "Weighted Quantiles"
+        )  # can make this a kwarg
+
+        # get weights aligned with model/scenario in data
+        if weights is None:
+            df["weight"] = 1.0
+        else:
+            df = df.join(weights, how="inner")
+
+        # prep data for processing; the weights are split off only after the index
+        # was reduced so that `w` and `df` share the same (remaining) index
+        df = df.reset_index(level=level).drop(columns=level)
+        w = df.pop("weight")
+
+        dfs = []
+        # indexed over region, variable, and unit (for the default `level`)
+        idxs = df.index.drop_duplicates()
+        for idx in idxs:
+            # a list selector always returns a (rows x time) frame, even if only
+            # one timeseries matches, and picks the weights of exactly these rows
+            values = df.loc[[idx]].values.T
+            group_weights = w.loc[[idx]].values
+            # a single remaining index level yields scalar keys instead of tuples
+            key = idx if isinstance(idx, tuple) else (idx,)
+            kwargs = dict(zip(idxs.names, key))
+            for q in quantiles:
+                data = pd.Series(
+                    wquantiles.quantile(values, group_weights, q),
+                    index=pd.Series(df.columns, name="year"),
+                    name="value",
+                )
+                dfs.append(
+                    IamDataFrame(
+                        data,
+                        model=model,
+                        scenario=str(q),  # can make this a kwarg
+                        **kwargs,
+                    )
+                )
+
+        # append to `self` or return as `IamDataFrame`
+        return self_df._finalize(concat(dfs), append=append)
+
     def growth_rate(self, mapping, append=False):
         """Compute the annualized growth rate of a timeseries along the time dimension
 

diff --git a/setup.cfg b/setup.cfg
@@ -41,6 +41,7 @@ install_requires =
     six
     setuptools >= 41
     setuptools_scm
+    wquantiles
     # required explicitly for Python 3.7
     importlib_metadata
     xlsxwriter

diff --git a/tests/test_feature_quantiles.py b/tests/test_feature_quantiles.py
@@ -0,0 +1,35 @@
+from pyam import IamDataFrame
+import pytest
+from pyam.testing import assert_iamframe_equal
+import pandas as pd
+
+
+def test_quantile_one_variable(test_pd_df):
+    """Check quartiles and median of the standard test data for one variable.
+
+    The expected lower and upper quartiles are the two values listed per year
+    and the median is their midpoint, so the median is the only 'new' number.
+    """
+    levels = (0.25, 0.5, 0.75)
+
+    # expected result: one timeseries per quantile, named after the quantile value
+    expected_data = pd.DataFrame(
+        {
+            "scenario": list(map(str, levels)),
+            "2005": [1, 1.5, 2],
+            "2010": [6, 6.5, 7],
+        }
+    )
+    exp = IamDataFrame(
+        expected_data,
+        model="Quantiles",
+        region="World",
+        variable="Primary Energy",
+        unit="EJ/yr",
+    )
+
+    primary_energy = IamDataFrame(test_pd_df).filter(variable="Primary Energy")
+    obs = primary_energy.compute.quantiles(levels)
+
+    assert_iamframe_equal(exp, obs)
+
+
+def test_quantile_missing_variable(test_pd_df):
+    """Computing quantiles without selecting a single variable must fail.
+
+    The data is passed on unfiltered; this presumes that the `test_pd_df`
+    fixture (defined elsewhere) holds more than one variable.
+    """
+    df = IamDataFrame(test_pd_df)
+    # match on the message so that an unrelated ValueError cannot pass the test
+    with pytest.raises(ValueError, match="supports only 1 variable"):
+        df.compute.quantiles((0.25, 0.5))