From b1bd21dbc2583bd58645c337c9acf09b5ee33449 Mon Sep 17 00:00:00 2001
From: Felix Claessen <30658763+Flix6x@users.noreply.github.com>
Date: Thu, 10 Nov 2022 20:12:38 +0100
Subject: [PATCH] Read csv with naive data, column filters and
 datetime/timedelta units (#521)

Improved import of time series data from CSV file: localize timezone naive data, support reading in datetime and timedelta values, remove rows with NaN values, and filter by values in specific columns.


* Support unit conversion for python datetime and timedelta objects

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Allow to filter by column when reading in beliefs from CSV

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Allow to set a timezone for reading in timezone naive data

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Allow throwing out NaN values when reading in beliefs

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Support datetime unit conversion for aware datetimes with mixed offset

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Raise instead of assume UTC when reading in timezone naive data without a timezone set explicitly

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Bump timely-beliefs dependency for read_csv

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Refactor: flake8

Signed-off-by: F.N. Claessen <felix@seita.nl>

* CLI changelog entry

Signed-off-by: F.N. Claessen <felix@seita.nl>

* changelog entry

Signed-off-by: F.N. Claessen <felix@seita.nl>

* mypy

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Use sensor id field validation

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Querying for a source with a given id no longer requires knowing the source type

Signed-off-by: F.N. Claessen <felix@seita.nl>

* make freeze-deps

Signed-off-by: F.N. Claessen <felix@seita.nl>

* add optional dependency: timely-beliefs[forecast]

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Mention data conversion from 'datetime' or 'timedelta' units

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Allow converting 'datetime' values to a duration other than seconds (since UNIX epoch)

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Refactor and make convert_time_units a private function

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Refactor and add inline comment explaining why we check to_unit for a digit

Signed-off-by: F.N. Claessen <felix@seita.nl>

* mypy: PEP 484 prohibits implicit Optional

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Attempt to revert bugs introduced in merge with main

Signed-off-by: F.N. Claessen <felix@seita.nl>

* black and flake8

Signed-off-by: F.N. Claessen <felix@seita.nl>

* A few more reverts

Signed-off-by: F.N. Claessen <felix@seita.nl>

* Fix typos

Signed-off-by: F.N. Claessen <felix@seita.nl>

Signed-off-by: F.N. Claessen <felix@seita.nl>
---
 documentation/changelog.rst                   |  2 +-
 documentation/cli/change_log.rst              |  1 +
 flexmeasures/api/v1_3/tests/test_api_v1_3.py  |  4 +-
 .../api/v3_0/tests/test_sensor_schedules.py   |  4 +-
 flexmeasures/cli/data_add.py                  | 69 ++++++++++++++----
 flexmeasures/data/queries/data_sources.py     | 12 +++-
 flexmeasures/utils/unit_utils.py              | 23 ++++++
 requirements/app.in                           |  2 +-
 requirements/app.txt                          | 71 ++++++++-----------
 requirements/docs.txt                         |  6 +-
 requirements/test.txt                         | 10 +--
 11 files changed, 130 insertions(+), 74 deletions(-)

diff --git a/documentation/changelog.rst b/documentation/changelog.rst
index 534a32eb9..3b6fcf952 100644
--- a/documentation/changelog.rst
+++ b/documentation/changelog.rst
@@ -12,8 +12,8 @@ New features
 * Ability to provide your own custom scheduling function [see `PR #505 <http://www.github.com/FlexMeasures/flexmeasures/pull/505>`_]
 * Visually distinguish forecasts/schedules (dashed lines) from measurements (solid lines), and expand the tooltip with timing info regarding the forecast/schedule horizon or measurement lag [see `PR #503 <http://www.github.com/FlexMeasures/flexmeasures/pull/503>`_]
 * The asset page also allows to show sensor data from other assets that belong to the same account [see `PR #500 <http://www.github.com/FlexMeasures/flexmeasures/pull/500>`_]
-* Improved import of time series data from CSV file: 1) drop duplicate records with warning, and 2) allow configuring which column contains explicit recording times for each data point (use case: import forecasts) [see `PR #501 <http://www.github.com/FlexMeasures/flexmeasures/pull/501>`_]
 * The CLI command ``flexmeasures show beliefs`` supports showing beliefs data in a custom resolution and/or timezone, and also saving the shown beliefs data to a CSV file [see `PR #519 <http://www.github.com/FlexMeasures/flexmeasures/pull/519>`_]
+* Improved import of time series data from CSV file: 1) drop duplicate records with warning, 2) allow configuring which column contains explicit recording times for each data point (use case: import forecasts) [see `PR #501 <http://www.github.com/FlexMeasures/flexmeasures/pull/501>`_], 3) localize timezone naive data, 4) support reading in datetime and timedelta values, 5) remove rows with NaN values, and 6) filter by values in specific columns [see `PR #521 <http://www.github.com/FlexMeasures/flexmeasures/pull/521>`_]
 
 Bugfixes
 -----------
diff --git a/documentation/cli/change_log.rst b/documentation/cli/change_log.rst
index c6d511c40..7c7dab958 100644
--- a/documentation/cli/change_log.rst
+++ b/documentation/cli/change_log.rst
@@ -8,6 +8,7 @@ since v0.12.0 | November XX, 2022
 =================================
 
 * Add ``--resolution``, ``--timezone`` and ``--to-file`` options to ``flexmeasures show beliefs``, to show beliefs data in a custom resolution and/or timezone, and also to save shown beliefs data to a CSV file.
+* Add options to ``flexmeasures add beliefs`` to 1) read CSV data with timezone naive datetimes (use ``--timezone`` to localize the data), 2) read CSV data with datetime/timedelta units (use ``--unit datetime`` or ``--unit timedelta``, 3) remove rows with NaN values, and 4) add filter to read-in data by matching values in specific columns (use ``--filter-column`` and ``--filter-value`` together).
 * Fix ``flexmeasures db-ops dump`` and ``flexmeasures db-ops restore`` incorrectly reporting a success when `pg_dump` and `pg_restore` are not installed.
 
 since v0.11.0 | August 28, 2022
diff --git a/flexmeasures/api/v1_3/tests/test_api_v1_3.py b/flexmeasures/api/v1_3/tests/test_api_v1_3.py
index 0ab3221e9..1835e3f84 100644
--- a/flexmeasures/api/v1_3/tests/test_api_v1_3.py
+++ b/flexmeasures/api/v1_3/tests/test_api_v1_3.py
@@ -113,7 +113,9 @@ def test_post_udi_event_and_get_device_message(
     if "targets" in message:
         start_soc = message["value"] / 1000  # in MWh
         soc_schedule = integrate_time_series(
-            consumption_schedule, start_soc, decimal_precision=6
+            consumption_schedule,
+            start_soc,
+            decimal_precision=6,
         )
         print(consumption_schedule)
         print(soc_schedule)
diff --git a/flexmeasures/api/v3_0/tests/test_sensor_schedules.py b/flexmeasures/api/v3_0/tests/test_sensor_schedules.py
index 6d9a4516d..d133671c3 100644
--- a/flexmeasures/api/v3_0/tests/test_sensor_schedules.py
+++ b/flexmeasures/api/v3_0/tests/test_sensor_schedules.py
@@ -91,7 +91,9 @@ def test_trigger_and_get_schedule(
     if "targets" in message:
         start_soc = message["soc-at-start"] / 1000  # in MWh
         soc_schedule = integrate_time_series(
-            consumption_schedule, start_soc, decimal_precision=6
+            consumption_schedule,
+            start_soc,
+            decimal_precision=6,
         )
         print(consumption_schedule)
         print(soc_schedule)
diff --git a/flexmeasures/cli/data_add.py b/flexmeasures/cli/data_add.py
index 2c21b9d05..15d00dfe5 100755
--- a/flexmeasures/cli/data_add.py
+++ b/flexmeasures/cli/data_add.py
@@ -286,8 +286,9 @@ def add_initial_structure():
 @click.argument("file", type=click.Path(exists=True))
 @click.option(
     "--sensor-id",
+    "sensor",
     required=True,
-    type=click.IntRange(min=1),
+    type=SensorIdField(),
     help="Sensor to which the beliefs pertain.",
 )
 @click.option(
@@ -301,6 +302,8 @@ def add_initial_structure():
     required=False,
     type=str,
     help="Unit of the data, for conversion to the sensor unit, if possible (a string unit such as 'kW' or 'm³/h').\n"
+    "Measurements of time itself that are formatted as a 'datetime' or 'timedelta' can be converted to a sensor unit representing time (such as 's' or 'h'),\n"
+    "where datetimes are represented as a duration with respect to the UNIX epoch."
     "Hint: to switch the sign of the data, prepend a minus sign.\n"
     "For example, when assigning kW consumption data to a kW production sensor, use '-kW'.",
 )
@@ -341,6 +344,12 @@ def add_initial_structure():
     multiple=True,
     help="Additional strings to recognize as NaN values. This argument can be given multiple times.",
 )
+@click.option(
+    "--keep-default-na",
+    default=False,
+    type=bool,
+    help="Whether or not to keep NaN values in the data.",
+)
 @click.option(
     "--nrows",
     required=False,
@@ -367,6 +376,24 @@ def add_initial_structure():
     type=int,
     help="Column number with datetimes",
 )
+@click.option(
+    "--timezone",
+    required=False,
+    default=None,
+    help="timezone as string, e.g. 'UTC' or 'Europe/Amsterdam'",
+)
+@click.option(
+    "--filter-column",
+    "filter_columns",
+    multiple=True,
+    help="Set a column number to filter data. Use together with --filter-value.",
+)
+@click.option(
+    "--filter-value",
+    "filter_values",
+    multiple=True,
+    help="Set a column value to filter data. Only rows with this value will be added. Use together with --filter-column.",
+)
 @click.option(
     "--delimiter",
     required=True,
@@ -396,8 +423,10 @@ def add_initial_structure():
 )
 def add_beliefs(
     file: str,
-    sensor_id: int,
+    sensor: Sensor,
     source: str,
+    filter_columns: List[int],
+    filter_values: List[int],
     unit: Optional[str] = None,
     horizon: Optional[int] = None,
     cp: Optional[float] = None,
@@ -405,10 +434,12 @@ def add_beliefs(
     allow_overwrite: bool = False,
     skiprows: int = 1,
     na_values: list[str] | None = None,
+    keep_default_na: bool = False,
     nrows: Optional[int] = None,
     datecol: int = 0,
     valuecol: int = 1,
     beliefcol: Optional[int] = None,
+    timezone: Optional[str] = None,
     delimiter: str = ",",
     decimal: str = ".",
     thousands: Optional[str] = None,
@@ -433,17 +464,7 @@ def add_beliefs(
     In case no --horizon is specified and no beliefcol is specified,
     the moment of executing this CLI command is taken as the time at which the beliefs were recorded.
     """
-    sensor = Sensor.query.filter(Sensor.id == sensor_id).one_or_none()
-    if sensor is None:
-        print(f"Failed to create beliefs: no sensor found with ID {sensor_id}.")
-        return
-    if source.isdigit():
-        _source = get_source_or_none(int(source), source_type="CLI script")
-        if not _source:
-            print(f"Failed to find source {source}.")
-            return
-    else:
-        _source = get_or_create_source(source, source_type="CLI script")
+    _source = parse_source(source)
 
     # Set up optional parameters for read_csv
     if file.split(".")[-1].lower() == "csv":
@@ -458,6 +479,14 @@ def add_beliefs(
     elif beliefcol is None:
         kwargs["belief_time"] = server_now().astimezone(pytz.timezone(sensor.timezone))
 
+    # Set up optional filters:
+    if len(filter_columns) != len(filter_values):
+        raise ValueError(
+            "The number of filter columns and filter values should be the same."
+        )
+    filter_by_column = (
+        dict(zip(filter_columns, filter_values)) if filter_columns else None
+    )
     bdf = tb.read_csv(
         file,
         sensor,
@@ -472,6 +501,9 @@ def add_beliefs(
         else [datecol, beliefcol, valuecol],
         parse_dates=True,
         na_values=na_values,
+        keep_default_na=keep_default_na,
+        timezone=timezone,
+        filter_by_column=filter_by_column,
         **kwargs,
     )
     duplicate_rows = bdf.index.duplicated(keep="first")
@@ -1099,3 +1131,14 @@ def check_errors(errors: Dict[str, List[str]]):
             f"Please correct the following errors:\n{errors}.\n Use the --help flag to learn more."
         )
         raise click.Abort
+
+
+def parse_source(source):
+    if source.isdigit():
+        _source = get_source_or_none(int(source))
+        if not _source:
+            print(f"Failed to find source {source}.")
+            return
+    else:
+        _source = get_or_create_source(source, source_type="CLI script")
+    return _source
diff --git a/flexmeasures/data/queries/data_sources.py b/flexmeasures/data/queries/data_sources.py
index d3daded86..178891a34 100644
--- a/flexmeasures/data/queries/data_sources.py
+++ b/flexmeasures/data/queries/data_sources.py
@@ -42,7 +42,15 @@ def get_or_create_source(
     return _source
 
 
-def get_source_or_none(source: int, source_type: str) -> Optional[DataSource]:
-    query = DataSource.query.filter(DataSource.type == source_type)
+def get_source_or_none(
+    source: int | str, source_type: str | None = None
+) -> DataSource | None:
+    """
+    :param source:      source id
+    :param source_type: optionally, filter by source type
+    """
+    query = DataSource.query
+    if source_type is not None:
+        query = query.filter(DataSource.type == source_type)
     query = query.filter(DataSource.id == int(source))
     return query.one_or_none()
diff --git a/flexmeasures/utils/unit_utils.py b/flexmeasures/utils/unit_utils.py
index 360bad072..23bbd8879 100644
--- a/flexmeasures/utils/unit_utils.py
+++ b/flexmeasures/utils/unit_utils.py
@@ -8,6 +8,7 @@
 Time series with fixed resolution can be converted from units of flow to units of stock (such as 'kW' to 'kWh'), and vice versa.
 Percentages can be converted to units of some physical capacity if a capacity is known (such as '%' to 'kWh').
 """
+from __future__ import annotations
 
 from datetime import timedelta
 from typing import List, Optional, Union
@@ -207,6 +208,26 @@ def is_energy_price_unit(unit: str) -> bool:
     return False
 
 
+def _convert_time_units(
+    data: Union[tb.BeliefsSeries, pd.Series, List[Union[int, float]], int, float],
+    from_unit: str,
+    to_unit: str,
+):
+    """Convert data with datetime or timedelta dtypes to float values.
+
+    Use Unix epoch or the requested time unit, respectively.
+    """
+    if not to_unit[0].isdigit():
+        # unit abbreviations passed to pd.Timedelta need a number (so, for example, h becomes 1h)
+        to_unit = f"1{to_unit}"
+    if from_unit == "datetime":
+        return (
+            pd.to_datetime(data, utc=True) - pd.Timestamp("1970-01-01", tz="utc")
+        ) // pd.Timedelta(to_unit)
+    else:
+        return data / pd.Timedelta(to_unit)
+
+
 def convert_units(
     data: Union[tb.BeliefsSeries, pd.Series, List[Union[int, float]], int, float],
     from_unit: str,
@@ -215,6 +236,8 @@ def convert_units(
     capacity: Optional[str] = None,
 ) -> Union[pd.Series, List[Union[int, float]], int, float]:
     """Updates data values to reflect the given unit conversion."""
+    if from_unit in ("datetime", "timedelta"):
+        return _convert_time_units(data, from_unit, to_unit)
 
     if from_unit != to_unit:
         from_magnitudes = (
diff --git a/requirements/app.in b/requirements/app.in
index 9c4649540..82e15f9ec 100644
--- a/requirements/app.in
+++ b/requirements/app.in
@@ -28,7 +28,7 @@ tldextract
 pyomo>=5.6
 tabulate
 timetomodel>=0.7.1
-timely-beliefs>=1.12
+timely-beliefs[forecast]>=1.13
 python-dotenv
 # a backport, not needed in Python3.8
 importlib_metadata
diff --git a/requirements/app.txt b/requirements/app.txt
index 94f3bd22e..71df651f4 100644
--- a/requirements/app.txt
+++ b/requirements/app.txt
@@ -7,20 +7,11 @@
 alembic==1.8.1
     # via flask-migrate
 altair==4.2.0
-    # via
-    #   -r requirements/app.in
-    #   timely-beliefs
+    # via -r requirements/app.in
 arrow==1.2.2
     # via rq-dashboard
-async-generator==1.10
-    # via
-    #   trio
-    #   trio-websocket
 attrs==22.1.0
-    # via
-    #   jsonschema
-    #   outcome
-    #   trio
+    # via jsonschema
 babel==2.10.3
     # via py-moneyed
 bcrypt==4.0.0
@@ -34,7 +25,6 @@ blinker==1.5
 certifi==2022.6.15
     # via
     #   requests
-    #   selenium
     #   sentry-sdk
 charset-normalizer==2.1.1
     # via requests
@@ -50,7 +40,9 @@ convertdate==2.4.0
 cycler==0.11.0
     # via matplotlib
 deprecated==1.2.13
-    # via redis
+    # via
+    #   redis
+    #   sktime
 dill==0.3.5.1
     # via openturns
 dnspython==2.2.1
@@ -114,8 +106,6 @@ fonttools==4.37.1
     # via matplotlib
 greenlet==1.1.3
     # via sqlalchemy
-h11==0.13.0
-    # via wsproto
 humanize==4.3.0
     # via -r requirements/app.in
 idna==3.3
@@ -123,7 +113,6 @@ idna==3.3
     #   email-validator
     #   requests
     #   tldextract
-    #   trio
 importlib-metadata==4.12.0
     # via
     #   -r requirements/app.in
@@ -154,6 +143,8 @@ jsonschema==4.15.0
     # via altair
 kiwisolver==1.4.4
     # via matplotlib
+llvmlite==0.39.1
+    # via numba
 lunardate==0.2.0
     # via workalendar
 mako==1.2.2
@@ -176,24 +167,26 @@ marshmallow-sqlalchemy==0.28.1
     # via -r requirements/app.in
 matplotlib==3.5.3
     # via timetomodel
-numpy==1.23.2
+numba==0.56.3
+    # via sktime
+numpy==1.22.4
     # via
     #   -r requirements/app.in
     #   altair
     #   matplotlib
+    #   numba
     #   pandas
     #   patsy
     #   properscoring
     #   scikit-learn
     #   scipy
+    #   sktime
     #   statsmodels
     #   timely-beliefs
     #   timetomodel
     #   uniplot
 openturns==1.19.post1
     # via timely-beliefs
-outcome==1.2.0
-    # via trio
 packaging==21.3
     # via
     #   marshmallow
@@ -202,10 +195,11 @@ packaging==21.3
     #   redis
     #   statsmodels
     #   webargs
-pandas==1.2.5
+pandas==1.5.1
     # via
     #   -r requirements/app.in
     #   altair
+    #   sktime
     #   statsmodels
     #   timely-beliefs
     #   timetomodel
@@ -245,8 +239,6 @@ pyparsing==3.0.9
     #   packaging
 pyrsistent==0.18.1
     # via jsonschema
-pysocks==1.7.1
-    # via urllib3
 python-dateutil==2.8.2
     # via
     #   arrow
@@ -281,16 +273,17 @@ rq==1.11.0
 rq-dashboard==0.6.1
     # via -r requirements/app.in
 scikit-learn==1.1.2
-    # via sklearn
-scipy==1.9.1
+    # via
+    #   sklearn
+    #   sktime
+scipy==1.8.1
     # via
     #   properscoring
     #   scikit-learn
+    #   sktime
     #   statsmodels
     #   timely-beliefs
     #   timetomodel
-selenium==4.4.3
-    # via timely-beliefs
 sentry-sdk[flask]==1.9.5
     # via -r requirements/app.in
 six==1.16.0
@@ -303,10 +296,8 @@ six==1.16.0
     #   requests-file
 sklearn==0.0
     # via timetomodel
-sniffio==1.2.0
-    # via trio
-sortedcontainers==2.4.0
-    # via trio
+sktime==0.13.4
+    # via timely-beliefs
 sqlalchemy==1.4.40
     # via
     #   -r requirements/app.in
@@ -316,12 +307,14 @@ sqlalchemy==1.4.40
     #   timely-beliefs
     #   timetomodel
 statsmodels==0.13.2
-    # via timetomodel
+    # via
+    #   sktime
+    #   timetomodel
 tabulate==0.8.10
     # via -r requirements/app.in
 threadpoolctl==3.1.0
     # via scikit-learn
-timely-beliefs==1.11.5
+timely-beliefs[forecast]==1.13.0
     # via -r requirements/app.in
 timetomodel==0.7.1
     # via -r requirements/app.in
@@ -329,22 +322,15 @@ tldextract==3.3.1
     # via -r requirements/app.in
 toolz==0.12.0
     # via altair
-trio==0.21.0
-    # via
-    #   selenium
-    #   trio-websocket
-trio-websocket==0.9.2
-    # via selenium
 typing-extensions==4.3.0
     # via
     #   py-moneyed
     #   pydantic
 uniplot==0.7.0
     # via -r requirements/app.in
-urllib3[socks]==1.26.12
+urllib3==1.26.12
     # via
     #   requests
-    #   selenium
     #   sentry-sdk
 webargs==8.2.0
     # via -r requirements/app.in
@@ -356,11 +342,12 @@ workalendar==16.3.0
     # via -r requirements/app.in
 wrapt==1.14.1
     # via deprecated
-wsproto==1.2.0
-    # via trio-websocket
 wtforms==3.0.1
     # via flask-wtf
 xlrd==2.0.1
     # via -r requirements/app.in
 zipp==3.8.1
     # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/requirements/docs.txt b/requirements/docs.txt
index f5266a513..4d7187434 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -53,10 +53,6 @@ pyparsing==3.0.9
     # via
     #   -c requirements/app.txt
     #   packaging
-pysocks==1.7.1
-    # via
-    #   -c requirements/app.txt
-    #   urllib3
 pytz==2022.2.1
     # via
     #   -c requirements/app.txt
@@ -101,7 +97,7 @@ sphinxcontrib-qthelp==1.0.3
     # via sphinx
 sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
-urllib3[socks]==1.26.12
+urllib3==1.26.12
     # via
     #   -c requirements/app.txt
     #   requests
diff --git a/requirements/test.txt b/requirements/test.txt
index 0325d220e..64f5a373a 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -70,10 +70,6 @@ pyparsing==3.0.9
     # via
     #   -c requirements/app.txt
     #   packaging
-pysocks==1.7.1
-    # via
-    #   -c requirements/app.txt
-    #   urllib3
 pytest==7.1.2
     # via
     #   -r requirements/test.in
@@ -103,16 +99,14 @@ six==1.16.0
     #   fakeredis
     #   requests-mock
 sortedcontainers==2.4.0
-    # via
-    #   -c requirements/app.txt
-    #   fakeredis
+    # via fakeredis
 termcolor==1.1.0
     # via pytest-sugar
 tomli==2.0.1
     # via
     #   coverage
     #   pytest
-urllib3[socks]==1.26.12
+urllib3==1.26.12
     # via
     #   -c requirements/app.txt
     #   requests