Commit fd72fb7

Adcp clean (#408)
* cleaning - add filter for dataframe
* Update adcp_parser.py
* Update packagetest.yaml
* Create test_adcp_parser.py
* Update test_adcp_parser.py (remove mocked tests)
* cleanup
* more cleanup
* Update test_adcp_parser.py
* Update test_adcp_parser.py
* Update test_adcp_parser.py
* Update test_adcp_parser.py
* Update test_adcp_parser.py (remove depth calc tests)
* Update test_adcp_parser.py
* Update test_adcp_parser.py (remove - try again later)
1 parent e743e32 commit fd72fb7

File tree

.github/workflows/packagetest.yaml
src/EcoFOCIpy/io/adcp_parser.py
src/EcoFOCIpy/io/wpak_parser.py
src/EcoFOCIpy/math/cleaning.py
tests/io/test_adcp_parser.py
tests/io/test_wetlabs_parser.py
tests/math/test_cleaning.py

7 files changed: +153 -67 lines

.github/workflows/packagetest.yaml

Lines changed: 1 addition & 1 deletion
@@ -25,5 +25,5 @@ jobs:
         python -m pip install -e . --no-deps --force-reinstall
     - name: Test with pytest
       run: |
-        pip install flake8 pytest pytest-cov
+        pip install flake8 pytest pytest-cov pytest-mock
         pytest -v -s --junitxml=junit/test-results.xml --cov --cov-report=xml --cov-report=html
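
Note: pytest-mock is added to the CI install line; it contributes the `mocker` fixture (a thin wrapper around unittest.mock). A minimal sketch of how a test can use it, with a purely illustrative test name and patch target (not part of this commit):

    # Minimal sketch of the pytest-mock `mocker` fixture; names are illustrative only.
    import pathlib

    def test_mocker_fixture_example(mocker):
        # Replace Path.exists on the class so any Path reports it exists.
        mocker.patch.object(pathlib.Path, "exists", return_value=True)
        assert pathlib.Path("/no/such/file").exists()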

src/EcoFOCIpy/io/adcp_parser.py

Lines changed: 79 additions & 28 deletions
@@ -17,6 +17,7 @@
 try:
     import EcoFOCIpy.math.geomag.geomag.geomag as geomag
     import EcoFOCIpy.math.geotools as geotools
+
     ECOFOCIPY_AVAILABLE = True
 except ImportError:
     ECOFOCIPY_AVAILABLE = False
@@ -30,7 +31,9 @@ class adcp(object):
     apply magnetic declination corrections, and calculate depth information for bins.
     """

-    def __init__(self, serial_no: str, deployment_dir: Optional[Union[str, Path]] = None):
+    def __init__(
+        self, serial_no: str, deployment_dir: Optional[Union[str, Path]] = None
+    ):
         """
         Initializes the ADCP parser.

@@ -48,14 +51,18 @@ def __init__(self, serial_no: str, deployment_dir: Optional[Union[str, Path]] =
         self.ein_df: Optional[pd.DataFrame] = None
         self.scal_df: Optional[pd.DataFrame] = None

-    def _get_filepath(self, extension: str, file_path: Optional[Union[str, Path]]) -> Path:
+    def _get_filepath(
+        self, extension: str, file_path: Optional[Union[str, Path]]
+    ) -> Path:
         """Constructs the full file path or validates an existing one."""
         if file_path:
             p = Path(file_path)
         elif self.deployment_dir:
             p = self.deployment_dir / f"{self.serial_no}{extension}"
         else:
-            raise ValueError("Must provide either a deployment directory or a direct file path.")
+            raise ValueError(
+                "Must provide either a deployment directory or a direct file path."
+            )

         if not p.exists():
             raise FileNotFoundError(f"The specified ADCP file does not exist: {p}")
@@ -88,20 +95,34 @@ def _load_data_file(
             header=None,
             names=column_names,
         )
-        df["date_time"] = pd.to_datetime(df["date"] + " " + df["time"], format="%y/%m/%d %H:%M:%S")
+        df["date_time"] = pd.to_datetime(
+            df["date"] + " " + df["time"], format="%y/%m/%d %H:%M:%S"
+        )

         if datetime_index:
             df = df.set_index("date_time").drop(columns=["date", "time"])

         return df

-    def load_vel_file(self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True) -> pd.DataFrame:
+    def load_vel_file(
+        self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True
+    ) -> pd.DataFrame:
         """Loads a .VEL (velocity) file."""
-        cols = ["date", "time", "bin", "u_curr_comp", "v_curr_comp", "w_curr_comp", "w_curr_comp_err"]
+        cols = [
+            "date",
+            "time",
+            "bin",
+            "u_curr_comp",
+            "v_curr_comp",
+            "w_curr_comp",
+            "w_curr_comp_err",
+        ]
         self.vel_df = self._load_data_file(".VEL", cols, file_path, datetime_index)
         return self.vel_df

-    def load_pg_file(self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True) -> pd.DataFrame:
+    def load_pg_file(
+        self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True
+    ) -> pd.DataFrame:
         """
         Loads a .PG (Percent Good) file.

@@ -111,23 +132,48 @@ def load_pg_file(self, file_path: Optional[Union[str, Path]] = None, datetime_in
         3) Percentage of measurements where more than one beam was bad.
         4) Percentage of measurements with four-beam solutions (useful for QC).
         """
-        cols = ["date", "time", "bin", "pg3beam-good", "pgtransf-good", "pg1beam-bad", "pg4beam-good"]
+        cols = [
+            "date",
+            "time",
+            "bin",
+            "pg3beam-good",
+            "pgtransf-good",
+            "pg1beam-bad",
+            "pg4beam-good",
+        ]
         self.pg_df = self._load_data_file(".PG", cols, file_path, datetime_index)
         return self.pg_df

-    def load_ein_file(self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True) -> pd.DataFrame:
+    def load_ein_file(
+        self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True
+    ) -> pd.DataFrame:
         """Loads an .EIN (Echo Intensity) file."""
         cols = ["date", "time", "bin", "agc1", "agc2", "agc3", "agc4"]
         self.ein_df = self._load_data_file(".EIN", cols, file_path, datetime_index)
         return self.ein_df

-    def load_scal_file(self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True) -> pd.DataFrame:
+    def load_scal_file(
+        self, file_path: Optional[Union[str, Path]] = None, datetime_index: bool = True
+    ) -> pd.DataFrame:
         """Loads a .SCA (Scalar) file."""
-        cols = ["date", "time", "unknown", "temperature", "heading", "pitch", "roll", "heading_stdev", "pitch_stdev", "roll_stdev"]
+        cols = [
+            "date",
+            "time",
+            "unknown",
+            "temperature",
+            "heading",
+            "pitch",
+            "roll",
+            "heading_stdev",
+            "pitch_stdev",
+            "roll_stdev",
+        ]
         self.scal_df = self._load_data_file(".SCA", cols, file_path, datetime_index)
         return self.scal_df

-    def load_rpt_file(self, file_path: Optional[Union[str, Path]] = None) -> Tuple[List[str], Dict[str, float]]:
+    def load_rpt_file(
+        self, file_path: Optional[Union[str, Path]] = None
+    ) -> Tuple[List[str], Dict[str, float]]:
         """
         Loads a .RPT (Report) file to extract instrument setup parameters.

@@ -138,7 +184,7 @@ def load_rpt_file(self, file_path: Optional[Union[str, Path]] = None) -> Tuple[L
             Tuple[List[str], Dict[str, float]]: A tuple containing the raw lines of the
             report file and a dictionary of extracted setup parameters.
         """
-        full_path = self._get_filepath('.RPT', file_path)
+        full_path = self._get_filepath(".RPT", file_path)

         lines = full_path.read_text().splitlines()

@@ -147,15 +193,17 @@ def load_rpt_file(self, file_path: Optional[Union[str, Path]] = None) -> Tuple[L
             if not parts:
                 continue
             if "Bin length" in line:
-                self.setup['bin_length'] = float(parts[2])
+                self.setup["bin_length"] = float(parts[2])
             elif "Distance" in line:
-                self.setup['distance_to_first_bin'] = float(parts[4])
+                self.setup["distance_to_first_bin"] = float(parts[4])
             elif "Number of bins" in line:
-                self.setup['num_of_bins'] = int(parts[3])
+                self.setup["num_of_bins"] = int(parts[3])

         return lines, self.setup

-    def mag_dec_corr(self, lat: float, lon_w: float, deployment_date: pd.Timestamp) -> float:
+    def mag_dec_corr(
+        self, lat: float, lon_w: float, deployment_date: pd.Timestamp
+    ) -> float:
         """
         Calculates and applies magnetic declination correction to velocity data.

@@ -175,20 +223,23 @@ def mag_dec_corr(self, lat: float, lon_w: float, deployment_date: pd.Timestamp)
             ValueError: If the velocity data (`vel_df`) has not been loaded.
         """
         if not ECOFOCIPY_AVAILABLE:
-            raise ImportError("EcoFOCIpy is required for magnetic declination correction but is not installed.")
+            raise ImportError(
+                "EcoFOCIpy is required for magnetic declination correction but is not installed."
+            )
         if self.vel_df is None:
-            raise ValueError("Velocity data must be loaded before applying magnetic correction.")
+            raise ValueError(
+                "Velocity data must be loaded before applying magnetic correction."
+            )

         t = geomag.GeoMag()
-        declination = t.GeoMag(lat, -1 * lon_w, time=deployment_date).declination
+        declination = t.GeoMag(lat, lon_w, time=deployment_date).dec

         u_rotated, v_rotated = geotools.rotate_coord(
-            self.vel_df['u_curr_comp'],
-            self.vel_df['v_curr_comp'],
-            declination)
+            self.vel_df["u_curr_comp"], self.vel_df["v_curr_comp"], declination
+        )

-        self.vel_df['u_curr_comp'] = u_rotated
-        self.vel_df['v_curr_comp'] = v_rotated
+        self.vel_df["u_curr_comp"] = u_rotated
+        self.vel_df["v_curr_comp"] = v_rotated

         return declination

@@ -199,7 +250,7 @@ def bins2depth(self, inst_depth: float = None):
         Args:
             inst_depth (float, optional): Deployment Depth of Instrument.
         """
-        start = inst_depth - self.setup['distance']
-        stop = start - self.setup['numofbins']*self.setup['bin_length']
+        start = inst_depth - self.setup["distance_to_first_bin"]
+        stop = start - self.setup["num_of_bins"] * self.setup["bin_length"]

-        return np.arange(start, stop, -1*self.setup['bin_length'])
+        return np.arange(start, stop, -1 * self.setup["bin_length"])
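
For orientation, a rough usage sketch of the refactored parser; the serial number, directory, coordinates, deployment date, and instrument depth below are invented placeholders:

    # Hypothetical values throughout; requires the deployment's .VEL/.RPT files on disk.
    import pandas as pd
    from EcoFOCIpy.io.adcp_parser import adcp

    parser = adcp(serial_no="12345", deployment_dir="/path/to/deployment")
    vel_df = parser.load_vel_file()    # .VEL -> DataFrame indexed by date_time
    _, setup = parser.load_rpt_file()  # fills bin_length, distance_to_first_bin, num_of_bins

    # Rotate u/v components by the site's magnetic declination (velocity must be loaded first)
    declination = parser.mag_dec_corr(
        lat=57.5, lon_w=164.0, deployment_date=pd.Timestamp("2023-09-15")
    )

    # Depth (m) of each bin, given the instrument's deployment depth
    bin_depths = parser.bins2depth(inst_depth=70.0)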

src/EcoFOCIpy/io/wpak_parser.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
-import pandas as pd
 from typing import Optional

+import pandas as pd
+

 class wpak(object):
     r"""

src/EcoFOCIpy/math/cleaning.py

Lines changed: 70 additions & 36 deletions
@@ -1,4 +1,7 @@
-#modified from GliderTools.py
+import numpy as np
+import pandas as pd
+import xarray as xr
+

 def outlier_bounds_std(arr, multiplier=3):
     r"""
@@ -19,25 +22,23 @@ def outlier_bounds_std(arr, multiplier=3):

     Returns
     -------
-    arr : array | xarray.DataArray
-        A data object where values outside the limits are masked.
-        Metdata will be preserved if the original input array is xr.DataArray
-
+    xr.Dataset
+        A new xarray Dataset with the filtered variable and a corresponding
+        quality control (QC) variable.
+        QC flags:
+        0: Good data
+        1: Data at beginning/end of series (not processed by rolling filter)
+        4: Data flagged as outlier and replaced with NaN (before interpolation)
+        8: Data that was originally an outlier (flag 4) and subsequently interpolated
     """

-    from numpy import array, nan, nanmean, nanstd
-
-    arr = array(arr)
-
-    mean = nanmean(arr)
-    std = nanstd(arr)
-
+    arr = np.array(arr)
+    mean = np.nanmean(arr)
+    std = np.nanstd(arr)
     ll = mean - std * multiplier
     ul = mean + std * multiplier
-
     mask = (arr < ll) | (arr > ul)
-    arr[mask] = nan
-
+    arr[mask] = np.nan
     return arr


@@ -68,19 +69,14 @@ def outlier_bounds_iqr(arr, multiplier=1.5):


     """
-    from numpy import array, nan, nanpercentile
-
-    arr = array(arr)

-    q1, q3 = nanpercentile(arr, [25, 75])
+    arr = np.array(arr)
+    q1, q3 = np.nanpercentile(arr, [25, 75])
     iqr = q3 - q1
-
     ll = q1 - iqr * multiplier
     ul = q3 + iqr * multiplier
-
     mask = (arr < ll) | (arr > ul)
-    arr[mask] = nan
-
+    arr[mask] = np.nan
     return arr

 def rolling_outlier_std(tdf, var_choice=None, timebase=1, stddev=1, interp_fill_timebase='1h'):
@@ -92,28 +88,66 @@ def rolling_outlier_std(tdf, var_choice=None, timebase=1, stddev=1, interp_fill_
         timebase (int, optional): [time window value to be applied to rolling]. Defaults to 1.
         stddev (int, optionl): [standard deviation threshold to use to filter data out]. Defaults to 1.
     """
-    from numpy import isnan
-    from xarray import zeros_like

     xdf = tdf.copy()
     r = (xdf[var_choice] - xdf.rolling(time=timebase, center=True).median()[var_choice])
-    rc = outlier_bounds_std(r,stddev)
+    rc = outlier_bounds_std(r, stddev)

-    xdf[var_choice] = (xdf[var_choice].where(~isnan(rc))).interpolate_na(dim='time',max_gap=interp_fill_timebase)
+    xdf[var_choice] = (xdf[var_choice].where(~np.isnan(rc))).interpolate_na(dim='time', max_gap=interp_fill_timebase)

-    #mask values that are filtered out by algorithm
-    xdf[var_choice+'_QC'] = zeros_like(xdf[var_choice])
-    xdf[var_choice+'_QC'] = xdf[var_choice].where((~isnan(rc)),4)
-    xdf[var_choice+'_QC'].values[xdf[var_choice+'_QC'].values !=4] = 0
+    # QC flag: 4 for outlier, 0 for good, 8 for interpolated, 1 for edge
+    xdf[var_choice + '_QC'] = xr.zeros_like(xdf[var_choice])
+    xdf[var_choice + '_QC'] = xdf[var_choice].where((~np.isnan(rc)), 4)
+    xdf[var_choice + '_QC'].values[xdf[var_choice + '_QC'].values != 4] = 0

-    #mask values that are interpolated back in
-    mask = (tdf[var_choice].where(~isnan(rc)))
-    mask_QC = xdf[var_choice].where(((xdf[var_choice] == mask) | isnan(xdf[var_choice])),8)
-    xdf[var_choice+'_QC'].values[(mask_QC.values) == 8] = 8
+    mask = (tdf[var_choice].where(~np.isnan(rc)))
+    mask_QC = xdf[var_choice].where(((xdf[var_choice] == mask) | np.isnan(xdf[var_choice])), 8)
+    xdf[var_choice + '_QC'].values[(mask_QC.values) == 8] = 8

     xdf[var_choice][:round(timebase/2)+1] = tdf[var_choice][:round(timebase/2)+1]
     xdf[var_choice+'_QC'][:round(timebase/2)+1] = tdf[var_choice][:round(timebase/2)+1]*0 +1
     xdf[var_choice][-1*round(timebase/2)-1:] = tdf[var_choice][-1*round(timebase/2)-1:]
     xdf[var_choice+'_QC'][-1*round(timebase/2)-1:] = tdf[var_choice][-1*round(timebase/2)-1:]*0 +1

-    return xdf
+    return xdf
+
+
+def rolling_outlier_pd(
+    df: pd.DataFrame, column: str = 'salinity', window_size: str = '7D', num_std_dev: int = 5
+) -> pd.DataFrame:
+    """
+    Apply a rolling standard deviation filter to a column of a pandas DataFrame.
+    Outliers are set to NaN.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input DataFrame containing the column to filter.
+    column : str, default='salinity'
+        Name of the column to filter.
+    window_size : str, default='7D'
+        Size of the rolling window (e.g., '7D' for 7 days).
+    num_std_dev : int, default=5
+        Number of standard deviations for outlier threshold.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with outliers replaced by NaN in the specified column.
+    """
+    df_filtered = df.copy()
+    mean_col = f'{column}_rolling_mean'
+    std_col = f'{column}_rolling_std'
+    upper_col = f'{column}_upper_bound'
+    lower_col = f'{column}_lower_bound'
+
+    df_filtered[mean_col] = df_filtered[column].rolling(window=window_size).mean()
+    df_filtered[std_col] = df_filtered[column].rolling(window=window_size).std()
+    df_filtered[upper_col] = df_filtered[mean_col] + num_std_dev * df_filtered[std_col]
+    df_filtered[lower_col] = df_filtered[mean_col] - num_std_dev * df_filtered[std_col]
+
+    outlier_mask = (df_filtered[column] > df_filtered[upper_col]) | (df_filtered[column] < df_filtered[lower_col])
+    df_filtered.loc[outlier_mask, column] = np.nan
+
+    df_filtered = df_filtered.drop(columns=[mean_col, std_col, upper_col, lower_col])
+    return df_filtered
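
A short sketch of the new pandas-based filter on synthetic data; the series, spike, window, and threshold are invented for illustration:

    # Synthetic example for rolling_outlier_pd; all values are made up.
    import numpy as np
    import pandas as pd
    from EcoFOCIpy.math.cleaning import rolling_outlier_pd

    idx = pd.date_range("2024-01-01", periods=500, freq="h")
    df = pd.DataFrame(
        {"salinity": np.random.default_rng(0).normal(32.0, 0.05, 500)}, index=idx
    )
    df.iloc[100, 0] = 45.0  # plant an obvious spike

    # A DatetimeIndex is required for the time-based '7D' rolling window.
    clean = rolling_outlier_pd(df, column="salinity", window_size="7D", num_std_dev=5)
    print(clean["salinity"].isna().sum())  # the planted spike is now NaN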

tests/io/test_adcp_parser.py

Whitespace-only changes.

tests/io/test_wetlabs_parser.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-from EcoFOCIpy.io.wetlabs_parser import wetlabs # Import your class
+from EcoFOCIpy.io.wetlabs_parser import wetlabs  # Import your class


 # Define a fixture to create mock data files for tests

tests/math/test_cleaning.py

Whitespace-only changes.
