diff --git a/documentation/changelog.rst b/documentation/changelog.rst index e64a74a3e..725b5ef5c 100644 --- a/documentation/changelog.rst +++ b/documentation/changelog.rst @@ -15,6 +15,7 @@ Infrastructure / Support ---------------------- * Add possibility to send errors to Sentry [see `PR #143 `_] +* Add CLI task to monitor if tasks ran successfully and recently enough [see `PR #146 `_] v0.5.0 | June 7, 2021 diff --git a/documentation/configuration.rst b/documentation/configuration.rst index f000e8549..ba2af79a8 100644 --- a/documentation/configuration.rst +++ b/documentation/configuration.rst @@ -153,6 +153,15 @@ Token for accessing the MapBox API (for displaying maps on the dashboard and ass Default: ``None`` +SENTRY_DSN +^^^^^^^^^^^^ + +Set tokenized URL, so errors will be sent to Sentry when ``app.env`` is not in ``debug`` or ``testing`` mode. +E.g.: ``https://<examplePublicKey>@o<something>.ingest.sentry.io/<project-Id>`` + +Default: ``None`` + + SQLAlchemy ---------- @@ -356,6 +365,13 @@ Token which external services can use to check on the status of recurring tasks Default: ``None`` +FLEXMEASURES_MONITORING_MAIL_RECIPIENTS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +E-mail addresses to which the CLI task ``flexmeasures monitor tasks`` sends monitoring alerts. For example: ``["fred@one.com", "wilma@two.com"]``. + +Default: ``[]`` + .. 
_redis-config:
diff --git a/flexmeasures/data/__init__.py b/flexmeasures/data/__init__.py
index 4ddf8e80c..c48e7c22e 100644
--- a/flexmeasures/data/__init__.py
+++ b/flexmeasures/data/__init__.py
@@ -26,6 +26,7 @@ def register_at(app: Flask):
     # Register some useful custom scripts with the flask cli
     with app.app_context():
         import flexmeasures.data.scripts.cli_tasks.jobs
+        import flexmeasures.data.scripts.cli_tasks.monitor
         import flexmeasures.data.scripts.cli_tasks.data_add
         import flexmeasures.data.scripts.cli_tasks.data_delete
         import flexmeasures.data.scripts.cli_tasks.db_ops
diff --git a/flexmeasures/data/scripts/cli_tasks/monitor.py b/flexmeasures/data/scripts/cli_tasks/monitor.py
new file mode 100644
index 000000000..896af9e1a
--- /dev/null
+++ b/flexmeasures/data/scripts/cli_tasks/monitor.py
@@ -0,0 +1,98 @@
+from datetime import timedelta
+from typing import Optional
+
+import click
+from flask import current_app as app
+from flask.cli import with_appcontext
+from flask_mail import Message
+from sentry_sdk import (
+    capture_message as capture_message_for_sentry,
+    set_context as set_sentry_context,
+)
+
+from flexmeasures.data.models.task_runs import LatestTaskRun
+from flexmeasures.utils.time_utils import server_now
+
+
+@click.group("monitor")
+def fm_monitor():
+    """FlexMeasures: Monitor tasks."""
+
+
+def send_monitoring_alert(
+    task_name: str, msg: str, latest_run: Optional[LatestTaskRun] = None
+):
+    """
+    Send any monitoring message per Sentry and per email. Also log an error.
+
+    :param task_name: name of the monitored task, used in the e-mail subject.
+    :param msg: description of the problem. Sent to Sentry as-is, and used to
+                start the e-mail body and the log entry.
+    :param latest_run: the latest known run of the task, if any. Its time and
+                       status are attached to the Sentry context and appended
+                       to the e-mail body and the log entry.
+    """
+    latest_run_txt = ""
+    if latest_run:
+        set_sentry_context(
+            "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
+        )
+        latest_run_txt = (
+            f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
+        )
+
+    capture_message_for_sentry(msg)
+
+    email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS", [])
+    if email_recipients:
+        email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
+        email.body = f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs."
+        app.mail.send(email)
+
+    app.logger.error(f"{msg} {latest_run_txt}")
+
+
+@fm_monitor.command("tasks")
+@with_appcontext
+@click.option(
+    "--task",
+    type=(str, int),
+    multiple=True,
+    required=True,
+    help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
+)
+def monitor_tasks(task):
+    """
+    Check if the given task's last successful execution happened less than the allowed time ago.
+    If not, alert someone, via email or sentry.
+
+    :param task: tuples of (task name, maximal allowed minutes since the latest run),
+                 one per ``--task`` option given.
+    """
+    for task_name, max_minutes in task:
+        app.logger.info(f"Checking latest run of task {task_name} ...")
+        latest_run: LatestTaskRun = LatestTaskRun.query.get(task_name)
+        if latest_run is None:
+            msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
+            send_monitoring_alert(task_name, msg)
+            # keep checking the remaining tasks instead of aborting the command
+            continue
+        now = server_now()
+        acceptable_interval = timedelta(minutes=max_minutes)
+        # check if latest run was recently enough
+        if latest_run.datetime >= now - acceptable_interval:
+            # latest run time is okay, let's check the status
+            if latest_run.status is False:
+                msg = f"A failure has been reported on task {task_name}."
+                send_monitoring_alert(task_name, msg, latest_run)
+        else:
+            msg = (
+                f"Task {task_name}'s latest run time is outside of the acceptable range "
+                f"({acceptable_interval})."
+            )
+            # send_monitoring_alert logs the error itself, so no extra log call here
+            send_monitoring_alert(task_name, msg, latest_run)
+    app.logger.info("Done checking task runs ...")
+
+
+app.cli.add_command(fm_monitor)
diff --git a/flexmeasures/utils/config_defaults.py b/flexmeasures/utils/config_defaults.py
index 70de7332a..199a42785 100644
--- a/flexmeasures/utils/config_defaults.py
+++ b/flexmeasures/utils/config_defaults.py
@@ -80,6 +80,7 @@ class Config(object):
     # traces_sample_rate is for performance monitoring across all transactions,
     # you probably want to adjust this.
     FLEXMEASURES_SENTRY_CONFIG: dict = dict(traces_sample_rate=0.33)
+    FLEXMEASURES_MONITORING_MAIL_RECIPIENTS: List[str] = []
 
     FLEXMEASURES_PLATFORM_NAME: str = "FlexMeasures"
     FLEXMEASURES_MODE: str = ""