Add AICc for bias correction with small sample sizes (#727)
martinvonk committed Apr 11, 2024
1 parent bce0bac commit eedba33
Showing 5 changed files with 116 additions and 26 deletions.
2 changes: 1 addition & 1 deletion pastas/model.py
@@ -1806,7 +1806,7 @@ def fit_report(
"EVP": f"{self.stats.evp():.2f}",
"R2": f"{self.stats.rsq():.2f}",
"RMSE": f"{self.stats.rmse():.2f}",
"AIC": f"{self.stats.aic():.2f}",
"AICc": f"{self.stats.aicc():.2f}",
"BIC": f"{self.stats.bic():.2f}",
"Obj": f"{self.solver.obj_func:.2f}",
"___": "",
27 changes: 26 additions & 1 deletion pastas/modelstats.py
@@ -47,6 +47,7 @@ class Statistics:
"kge",
"bic",
"aic",
"aicc",
]

def __init__(self, ml: Model):
@@ -374,7 +375,7 @@ def aic(
See Also
--------
pastas.stats.bic
pastas.stats.aic
"""
nparam = self.ml.parameters["vary"].sum()
if self.ml.settings["noise"]:
@@ -385,6 +386,30 @@ def aic(
res = self.ml.residuals(tmin=tmin, tmax=tmax)
return metrics.aic(res=res, nparam=nparam)

@model_tmin_tmax
def aicc(
self, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None
) -> float:
"""Akaike Information Criterium with second order bias correction (AICc).
Parameters
----------
tmin: str or pandas.Timestamp, optional
tmax: str or pandas.Timestamp, optional
See Also
--------
pastas.stats.aicc
"""
nparam = self.ml.parameters["vary"].sum()
if self.ml.settings["noise"]:
res = self.ml.noise(tmin=tmin, tmax=tmax) * self.ml.noise_weights(
tmin=tmin, tmax=tmax
)
else:
res = self.ml.residuals(tmin=tmin, tmax=tmax)
return metrics.aicc(res=res, nparam=nparam)

@model_tmin_tmax
def summary(
self,
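For context (not part of the diff): a minimal usage sketch of the new model-level statistic. It assumes the usual pastas model-building API (ps.Model, ps.StressModel with an rfunc instance, ml.solve) and uses entirely synthetic, hypothetical data, so treat it as an illustration rather than a tested recipe.

import numpy as np
import pandas as pd
import pastas as ps

# Hypothetical synthetic data: a daily precipitation stress and a head series
# loosely derived from it, just to have something to calibrate on.
idx = pd.date_range("2000-01-01", "2004-12-31", freq="D")
rng = np.random.default_rng(0)
prec = pd.Series(rng.gamma(0.5, 4.0, idx.size), index=idx, name="prec")
head = pd.Series(
    1.0 + 0.002 * prec.rolling(90, min_periods=1).sum()
    + rng.normal(0.0, 0.05, idx.size),
    index=idx,
    name="head",
)

ml = ps.Model(head)
ml.add_stressmodel(ps.StressModel(prec, rfunc=ps.Gamma(), name="recharge", settings="prec"))
ml.solve(report=False)

# The statistic added in this commit, next to the existing criteria. As in the
# method above, it is computed on the weighted noise when a noise model is
# used and on the residuals otherwise.
print(ml.stats.aic(), ml.stats.aicc(), ml.stats.bic())

With only a handful of observations the AICc penalty on extra parameters is noticeably larger than the plain AIC penalty; for a long daily series like this one the two are nearly identical.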
9 changes: 5 additions & 4 deletions pastas/plotting/modelcompare.py
@@ -842,17 +842,18 @@ def plot_table_metrics(
name of labeled axes to plot table on, by default "met"
metric_selection : list, optional
list of str describing which metrics to include, by default None which
uses ["rsq", "aic"].
uses ["rsq", "aicc"].
"""
if metric_selection is None:
metric_selection = ["rsq", "aic"]
metric_selection = ["rsq", "aicc"]

metrics = self.get_metrics(self.models, metric_selection=metric_selection)
for met in ["aic", "bic"]:
for met in ["aic", "aicc", "bic"]:
if met in metrics.index:
metrics.loc[met] -= metrics.loc[met].min()
metname = "AICc" if met == "aicc" else met.upper()
metrics = metrics.rename(
index={met: f"\N{GREEK CAPITAL LETTER DELTA}{met.upper()}"}
index={met: f"\N{GREEK CAPITAL LETTER DELTA}{metname}"}
)
if "rsq" in metrics.index:
metrics = metrics.rename(index={"rsq": "R\N{SUPERSCRIPT TWO}"})
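A side note on the table logic above (not from the commit): information criteria are only meaningful relative to one another, which is why plot_table_metrics subtracts the column-wise minimum so the best model shows a delta of 0. The same normalization on a plain DataFrame, with made-up metric values and hypothetical model names:

import pandas as pd

# Hypothetical R^2 and AICc values for three candidate models.
metrics = pd.DataFrame(
    {"ml_a": [0.95, -120.3], "ml_b": [0.93, -118.9], "ml_c": [0.96, -112.1]},
    index=["rsq", "aicc"],
)

# Mirror of the loop in plot_table_metrics: shift each criterion so the best
# (lowest) model reads 0 and the others show their distance from it.
for met in ["aic", "aicc", "bic"]:
    if met in metrics.index:
        metrics.loc[met] -= metrics.loc[met].min()

print(metrics)  # ml_a shows 0.0 for aicc; ml_b and ml_c show their deltas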
72 changes: 65 additions & 7 deletions pastas/stats/metrics.py
@@ -31,14 +31,15 @@
"rsq",
"bic",
"aic",
"aicc",
"pearsonr",
"kge",
]

captureWarnings(True)
logger = getLogger(__name__)
warnings_logger = getLogger("py.warnings")
warnings_logger.addHandler(logger)
logger.addHandler(warnings_logger)


# Absolute Error Metrics
@@ -483,13 +484,14 @@ def aic(
sim: pandas.Series, optional
The Series with the simulated values.
res: pandas.Series, optional
The Series with the residual values. If time series for the residuals are
provided, the sim and obs arguments are ignored. Note that the residuals
must be computed as `obs - sim` here.
The Series with the residual values. If time series for the residuals
are provided, the sim and obs arguments are ignored. Note that the
residuals must be computed as `obs - sim` here.
nparam: int, optional
number of calibrated parameters.
missing: str, optional
string with the rule to deal with missing values. Only "drop" is supported now.
string with the rule to deal with missing values. Only "drop" is
supported now.
Notes
-----
@@ -498,8 +500,14 @@ def aic(
.. math:: \\text{AIC} = -2 log(L) + 2 nparam
where :math:`n_{param}` is the number of calibration parameters and L is the
likelihood function for the model.
where :math:`n_{param}` is the number of calibration parameters and L is
the likelihood function for the model. In the case of ordinary least
squares:

.. math:: log(L) = - (nobs / 2) * log(RSS / nobs)

where RSS denotes the residual sum of squares and nobs the number of
observations.
"""
err = _compute_err(obs=obs, sim=sim, res=res, missing=missing)

@@ -513,6 +521,56 @@ def aic(
return n * log((err.to_numpy() ** 2.0).sum() / n) + 2.0 * nparam


def aicc(
obs: Optional[Series] = None,
sim: Optional[Series] = None,
res: Optional[Series] = None,
missing: str = "drop",
nparam: int = 1,
) -> float:
"""Compute the Akaike Information Criterium with second order
bias correction for the number of observations (AICc)
Parameters
----------
obs: pandas.Series, optional
Series with the observed values.
sim: pandas.Series, optional
The Series with the simulated values.
res: pandas.Series, optional
The Series with the residual values. If time series for the residuals
are provided, the sim and obs arguments are ignored. Note that the
residuals must be computed as `obs - sim` here.
nparam: int, optional
number of calibrated parameters.
missing: str, optional
string with the rule to deal with missing values. Only "drop" is
supported now.

Notes
-----
The corrected Akaike Information Criterion (AICc)
:cite:p:`suguria_aicc_1978` is computed as follows:

.. math:: \\text{AICc} = -2 log(L) + 2 nparam + (2 nparam (nparam + 1) / (nobs - nparam - 1))

where :math:`n_{param}` is the number of calibration parameters, nobs is
the number of observations, and L is the likelihood function for the model.
In the case of ordinary least squares:

.. math:: log(L) = - (nobs / 2) * log(RSS / nobs)

where RSS denotes the residual sum of squares.
"""
err = _compute_err(obs=obs, sim=sim, res=res, missing=missing)

n = err.index.size

c_term = (2 * nparam * (nparam + 1)) / (n - nparam - 1)
return aic(res=-err, nparam=nparam) + c_term


# Forecast Error Metrics


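To make the relationship between the two functions concrete, here is a short check (not part of the commit) that only uses the public ps.stats.metrics calls shown in this diff and random, hypothetical series; it verifies that aicc equals aic plus the documented correction term, which shrinks as the number of observations grows.

import numpy as np
import pandas as pd
import pastas as ps

idx = pd.date_range("2010-01-01", periods=25, freq="D")
rng = np.random.default_rng(1)
obs = pd.Series(rng.normal(10.0, 1.0, idx.size), index=idx)
sim = obs + rng.normal(0.0, 0.1, idx.size)

nparam = 3
aic = ps.stats.metrics.aic(obs=obs, sim=sim, nparam=nparam)
aicc = ps.stats.metrics.aicc(obs=obs, sim=sim, nparam=nparam)

# AICc = AIC + 2 k (k + 1) / (n - k - 1), so the two agree for large n.
n = obs.size
c_term = 2 * nparam * (nparam + 1) / (n - nparam - 1)
assert np.isclose(aicc, aic + c_term)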
32 changes: 19 additions & 13 deletions tests/test_metrics.py
@@ -3,61 +3,67 @@

import pastas as ps

obs = pd.Series([10.0, 15.0, 20.0, 25.0])
sim = pd.Series([12.0, 18.0, 22.0, 28.0])
index = pd.date_range("2010-01-01", "2010-01-04", freq="D")
obs = pd.Series([10.0, 15.21, 20.0, 25.2], index=index)
sim = pd.Series([10.1, 15.0, 20.3, 25.0], index=index)
tol = 1e-6


def test_mae():
mae = ps.stats.metrics.mae(obs=obs, sim=sim)
assert pytest.approx(mae, tol) == 2.5
assert pytest.approx(mae, tol) == 0.2025


def test_rmse():
rmse = ps.stats.metrics.rmse(obs=obs, sim=sim)
assert pytest.approx(rmse, tol) == 2.549509
assert pytest.approx(rmse, tol) == 0.2145343


def test_sse():
sse = ps.stats.metrics.sse(obs=obs, sim=sim)
assert pytest.approx(sse, tol) == 26.0
assert pytest.approx(sse, tol) == 0.1841


def test_pearsonr():
r = ps.stats.metrics.pearsonr(obs=obs, sim=sim)
assert pytest.approx(r, tol) == 0.997054
assert pytest.approx(r, tol) == 0.999299


def test_evp():
evp = ps.stats.metrics.evp(obs=obs, sim=sim)
assert pytest.approx(evp, tol) == 99.2
assert pytest.approx(evp, tol) == 99.85505


def test_nse():
nse = ps.stats.metrics.nse(obs=obs, sim=sim)
assert pytest.approx(nse, tol) == 0.792
assert pytest.approx(nse, tol) == 0.99855


def test_rsq():
rsq = ps.stats.metrics.rsq(obs=obs, sim=sim)
assert pytest.approx(rsq, tol) == 0.792
assert pytest.approx(rsq, tol) == 0.99855


def test_bic():
bic = ps.stats.metrics.bic(obs=obs, sim=sim)
assert pytest.approx(bic, tol) == 8.873503
assert pytest.approx(bic, tol) == -10.9279878


def test_aic():
aic = ps.stats.metrics.aic(obs=obs, sim=sim)
assert pytest.approx(aic, tol) == 9.487208
assert pytest.approx(aic, tol) == -10.314282


def test_aicc():
aicc = ps.stats.metrics.aicc(obs=obs, sim=sim)
assert pytest.approx(aicc, tol) == -8.314282


def test_kge():
kge = ps.stats.metrics.kge(obs=obs, sim=sim)
assert pytest.approx(kge, tol) == 0.850761
assert pytest.approx(kge, tol) == 0.9923303


def test_kge_modified():
kgem = ps.stats.metrics.kge(obs=obs, sim=sim, modified=True)
assert pytest.approx(kgem, tol) == 0.832548
assert pytest.approx(kgem, tol) == 0.99247
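
As a sanity check on the expected values above (not part of the test file): the gap between test_aicc and test_aic is exactly 2.0, which is just the correction term for the four-observation series and the default nparam = 1:

n, k = 4, 1                          # observations in obs/sim, default nparam
c_term = 2 * k * (k + 1) / (n - k - 1)
print(c_term)                        # 2.0 == -8.314282 - (-10.314282)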
