Implement the nearby_wetdry approach from rainlink (#113)

* implementing nearby_wetdry approach * add example notebook * add tests
pycomlink · Feb 14, 2023 · c5e516c · c5e516c
1 parent 94ee2ef
commit c5e516c
Show file tree

Hide file tree

Showing 7 changed files with 968 additions and 133 deletions.
diff --git a/notebooks/Basic CML processing workflow.ipynb b/notebooks/Basic CML processing workflow.ipynb
diff --git a/notebooks/Nearby wetdry example.ipynb b/notebooks/Nearby wetdry example.ipynb
diff --git a/pycomlink/io/example_data/example_cml_data.nc b/pycomlink/io/example_data/example_cml_data.nc
diff --git a/pycomlink/io/example_data/example_path_averaged_reference_data.nc b/pycomlink/io/example_data/example_path_averaged_reference_data.nc
diff --git a/pycomlink/processing/wet_dry/nearby_wetdry.py b/pycomlink/processing/wet_dry/nearby_wetdry.py
@@ -0,0 +1,217 @@
+import xarray as xr
+import numpy as np
+from tqdm import tqdm
+import pycomlink.spatial.helper as spatial
+
+
+def calc_distance_between_cml_endpoints(
+    cml_ids,
+    site_a_latitude,
+    site_a_longitude,
+    site_b_latitude,
+    site_b_longitude,
+):
+    """
+    Calculating the distance from and to all start and endpoints of a network
+    of CMLs using the Haversine distance formula. This includes the start and
+    endpoint of each CML (which equals its length). The distance between start
+    and enpoint of a CML will be set to 0 for the case when r (the radius for
+    which CMLs are considered to be "nearby") is smaller than the actual length
+    of the CML
+    ----------
+    cml_ids : list of str or int
+         ids of CMLs
+    site_a_latitude : list or array
+        latitude values of site a
+    site_a_longitude : list or array
+        longitude values of site a
+    site_b_latitude : list or array
+        latitude values of site b
+    site_b_longitude : list or array
+        longitude values of site b
+    Returns
+    -------
+    xarray.Dataset
+        distances between all start and endpoints in four variables, a_to_all_a,
+        a_to_all_b, b_to_all_a, b_to_all_b
+    """
+
+    n_cmls = len(cml_ids)
+    ds = xr.Dataset(
+        data_vars=dict(
+            a_to_all_a=(["cml_id1", "cml_id2"], np.full([n_cmls, n_cmls], np.nan)),
+            a_to_all_b=(["cml_id1", "cml_id2"], np.full([n_cmls, n_cmls], np.nan)),
+            b_to_all_a=(["cml_id1", "cml_id2"], np.full([n_cmls, n_cmls], np.nan)),
+            b_to_all_b=(["cml_id1", "cml_id2"], np.full([n_cmls, n_cmls], np.nan)),
+        ),
+        coords=dict(cml_id1=cml_ids, cml_id2=cml_ids),
+    )
+    for i, cmlid in tqdm(enumerate(cml_ids)):
+        ds.loc[dict(cml_id1=cmlid)]["a_to_all_a"][:] = spatial.haversine(
+            np.array(site_a_longitude[i]),
+            np.array(site_a_latitude[i]),
+            np.array(site_a_longitude),
+            np.array(site_a_latitude),
+        )
+        ds.loc[dict(cml_id1=cmlid)]["a_to_all_b"][:] = spatial.haversine(
+            np.array(site_a_longitude[i]),
+            np.array(site_a_latitude[i]),
+            np.array(site_b_longitude),
+            np.array(site_b_latitude),
+        )
+        ds.loc[dict(cml_id1=cmlid)]["b_to_all_a"][:] = spatial.haversine(
+            np.array(site_b_longitude[i]),
+            np.array(site_b_latitude[i]),
+            np.array(site_a_longitude),
+            np.array(site_a_latitude),
+        )
+        ds.loc[dict(cml_id1=cmlid)]["b_to_all_b"][:] = spatial.haversine(
+            np.array(site_b_longitude[i]),
+            np.array(site_b_latitude[i]),
+            np.array(site_b_longitude),
+            np.array(site_b_latitude),
+        )
+    # set distance between start and endpoint of each CML to 0km
+    # this has to be done so that CMLs are not rejected when the radius r
+    # which defines nearby CMLs is larger than the distance between endpoints
+    # of a CML (aka its length). Otherwise, the signal levels of CMLs with
+    # length > r will not be considered in the nearby approach
+    ds = xr.where(
+        ds.cml_id1 == ds.cml_id2,
+        0,
+        ds)
+    return ds
+
+
+def instanteanous_to_minmax_data(ds_cml, interval=15, timeperiod=24, min_hours=6):
+    """
+    calculating pmin from instanteanousy measured rsl and tsl values
+    ----------
+    ds_cml : xarray.Dataset
+         Time series of rsl and tsl
+    interval : int
+        Interval of pmin in minutes
+    timeperiod : int
+        Number of previous hours over which max(Pmin) is to be computed
+    min_hours : int
+        Minimum number of hours needed to compute max(Pmin)
+    Returns
+    -------
+    xarray.Dataset
+        Time series of pmin, max_pmin, deltaP and deltaPL
+    References
+    ----------
+    .. [1] Overeem, A., Leijnse, H., and Uijlenhoet, R.: Retrieval algorithm
+    for rainfall mapping from microwave links in a cellular communication network,
+    Atmos. Meas. Tech., 9, 2425–2444, https://doi.org/10.5194/amt-9-2425-2016, 2016.
+    """
+    ds_cml["pmin"] = ds_cml.rsl - ds_cml.tsl
+    ds_cml_minmax = ds_cml.pmin.resample(time=str(interval) + "min").min().to_dataset()
+
+    # rolling window * 60min/interval(in minutes)
+    period = int(timeperiod * 60 / interval)
+    # min hours for calculation * 60min/interval(in minutes)
+    hours_needed = int(min_hours * 60 / interval)
+    ds_cml_minmax["max_pmin"] = ds_cml_minmax.pmin.rolling(
+        time=period,
+        min_periods=hours_needed,
+    ).max(skipna=False)
+
+    ds_cml_minmax["deltaP"] = ds_cml_minmax.pmin - ds_cml_minmax.max_pmin
+    ds_cml_minmax["deltaPL"] = ds_cml_minmax["deltaP"] / ds_cml_minmax.length
+
+    return ds_cml_minmax
+
+
+def nearby_wetdry(
+        ds_cml,
+        ds_dist,
+        r=15,
+        thresh_median_P=-1.4,
+        thresh_median_PL=-0.7,
+        min_links=3,
+
+):
+    """
+    Classification of rainy and dry periods from diagnostic (min-max) CML signal
+    levels following the nearby link approach.
+    ----------
+    ds_cml : xarray.Dataset
+         Dataset consisting minmax values (pmin, max_pmin, deltaP and deltaPL
+         from CML data.
+    ds_dist : xarray.Dataset
+         Distance matrix between all CML endpoints calculated with
+         `calc_distance_between_cml_endpoints()`
+    r : float
+        Radius for which surrounding CMLs (both end points are within a chosen
+        radius r from either end of the selected link)
+    thresh_median_P : float
+        Threshold for median_P. Is dependent on the spatial correlation of rainfall.
+    thresh_median_PL : float
+        Threshold for median_PL. Is dependent on the spatial correlation of rainfall.
+    min_links : int
+        minimum number of CMLs within r needed to perform wet-dry classification
+    Returns
+    -------
+    xarray.Dataset
+        Time series wet-dry classification
+    References
+    ----------
+    .. [1] Overeem, A., Leijnse, H., and Uijlenhoet, R.: Retrieval algorithm
+    for rainfall mapping from microwave links in a cellular communication network,
+    Atmos. Meas. Tech., 9, 2425–2444, https://doi.org/10.5194/amt-9-2425-2016, 2016.
+    """
+
+    # get number of CMLs within r for each CML
+    ds_dist["within_r"] = (
+            (ds_dist.a_to_all_a < r)
+            & (ds_dist.a_to_all_b < r)
+            & (ds_dist.b_to_all_a < r)
+            & (ds_dist.b_to_all_b < r)
+    )
+
+    wet = xr.full_like(ds_cml.pmin, np.nan)
+
+    for cmlid in tqdm(ds_cml.cml_id):
+        # only make wet dry detection if min_links is reached within r
+        if sum(ds_dist.within_r.sel(cml_id1=cmlid).values) > min_links:
+            # select all CMLs within r
+            ds_nearby_cmls = ds_cml.isel(
+                cml_id=ds_dist.within_r.sel(cml_id1=cmlid).values
+            )
+
+            # calculate median delta P and delta PL for each timestep
+            # .where checks if the minimal require number of CMLs has data
+            medianP = (
+                ds_nearby_cmls.deltaP.where(
+                    ds_nearby_cmls.deltaP.count("cml_id") > min_links)
+                    .median(dim="cml_id", skipna=False)
+                    .values
+            )
+            medianPL = (
+                ds_nearby_cmls.where(
+                    ds_nearby_cmls.deltaP.count("cml_id") > min_links)
+                    .deltaPL.median(dim="cml_id", skipna=False)
+                    .values
+            )
+
+            ## actual wet dry classification
+            wet.loc[dict(cml_id=cmlid)] = xr.where(
+                cond=(np.isnan(medianP)) | (np.isnan(medianPL)),
+                x=np.nan,
+                y=((medianP < thresh_median_P) & (medianPL < thresh_median_PL)),
+            )
+
+            # if maxpim - pmin > 2db set two timesteps before and one aftere to wet
+            diff2db_rule = (
+                    ds_nearby_cmls.sel(cml_id=cmlid).max_pmin
+                    - ds_nearby_cmls.sel(cml_id=cmlid).pmin
+                    > 2
+            )
+            for shift in [-2, -1, 1]:
+                wet.loc[dict(cml_id=cmlid.values)] = xr.where(
+                    diff2db_rule.shift(time=shift) > 2,
+                    x=1,
+                    y=wet.loc[dict(cml_id=cmlid)],
+                )
+    return wet
diff --git a/pycomlink/spatial/helper.py b/pycomlink/spatial/helper.py
@@ -11,19 +11,19 @@
 
 from __future__ import division
 from builtins import map
-from math import radians, cos, sin, asin, sqrt
+import numpy as np
 
 
 def haversine(lon1, lat1, lon2, lat2):
     """Calculate the great circle distance between two points
     on the earth (specified in decimal degrees)
     """
     # convert decimal degrees to radians
-    lon1, lat1, lon2, lat2 = list(map(radians, [lon1, lat1, lon2, lat2]))
+    lon1, lat1, lon2, lat2 = list(map(np.radians, [lon1, lat1, lon2, lat2]))
     # haversine formula
     dlon = lon2 - lon1
     dlat = lat2 - lat1
-    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
-    c = 2 * asin(sqrt(a))
+    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
+    c = 2 * np.arctan2(a ** 0.5, (1 - a) ** 0.5)
     km = 6367 * c
     return km