Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor latest task runs per CLI #146

Merged
merged 10 commits into from Jun 19, 2021
1 change: 1 addition & 0 deletions documentation/changelog.rst
Expand Up @@ -15,6 +15,7 @@ Infrastructure / Support
----------------------

* Add possibility to send errors to Sentry [see `PR #143 <http://www.github.com/SeitaBV/flexmeasures/pull/143>`_]
* Add CLI task to monitor if tasks ran successfully and recently enough [see `PR #146 <http://www.github.com/SeitaBV/flexmeasures/pull/146>`_]


v0.5.0 | June 7, 2021
Expand Down
16 changes: 16 additions & 0 deletions documentation/configuration.rst
Expand Up @@ -153,6 +153,15 @@ Token for accessing the MapBox API (for displaying maps on the dashboard and ass
Default: ``None``


SENTRY_DSN
^^^^^^^^^^^^

Set the tokenized URL, so that errors are sent to Sentry when ``app.env`` is neither ``debug`` nor ``testing``.
E.g.: ``https://<examplePublicKey>@o<something>.ingest.sentry.io/<project-Id>``

Default: ``None``


SQLAlchemy
----------

Expand Down Expand Up @@ -356,6 +365,13 @@ Token which external services can use to check on the status of recurring tasks
Default: ``None``


FLEXMEASURES_MONITORING_MAIL_RECIPIENTS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

E-mail addresses to which the CLI task ``flexmeasures monitor tasks`` sends monitoring alerts. For example: ``["fred@one.com", "wilma@two.com"]``.

Default: ``[]``


.. _redis-config:

Expand Down
1 change: 1 addition & 0 deletions flexmeasures/data/__init__.py
Expand Up @@ -26,6 +26,7 @@ def register_at(app: Flask):
# Register some useful custom scripts with the flask cli
with app.app_context():
import flexmeasures.data.scripts.cli_tasks.jobs
import flexmeasures.data.scripts.cli_tasks.monitor
import flexmeasures.data.scripts.cli_tasks.data_add
import flexmeasures.data.scripts.cli_tasks.data_delete
import flexmeasures.data.scripts.cli_tasks.db_ops
Expand Down
88 changes: 88 additions & 0 deletions flexmeasures/data/scripts/cli_tasks/monitor.py
@@ -0,0 +1,88 @@
from datetime import timedelta
from typing import Optional

import click
from flask import current_app as app
from flask.cli import with_appcontext
from flask_mail import Message
from sentry_sdk import (
capture_message as capture_message_for_sentry,
set_context as set_sentry_context,
)

from flexmeasures.data.models.task_runs import LatestTaskRun
from flexmeasures.utils.time_utils import server_now


# Click command group named "monitor". Sub-commands are attached to it with
# @fm_monitor.command(...). The docstring below doubles as the group's help
# text on the command line, so keep it short and user-facing.
@click.group("monitor")
def fm_monitor():
    """FlexMeasures: Monitor tasks."""


def send_monitoring_alert(
    task_name: str, msg: str, latest_run: Optional[LatestTaskRun] = None
):
    """
    Send any monitoring message per Sentry and per email. Also log an error.

    :param task_name: name of the monitored task, used in the e-mail subject.
    :param msg: description of the problem. It is sent to Sentry, used as the
                start of the e-mail body and written to the error log.
    :param latest_run: the latest recorded run of the task, if there is one.
                       Its time and status are attached to the Sentry context
                       and appended to both the e-mail body and the log entry.
    """
    latest_run_txt = ""
    if latest_run is not None:
        set_sentry_context(
            "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
        )
        latest_run_txt = (
            f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
        )

    capture_message_for_sentry(msg)

    # `or []` also covers the setting being explicitly configured as None,
    # in which case .get() would not fall back to its default.
    email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS") or []
    if email_recipients:
        # Recipients go in bcc, so they do not see each other's addresses.
        email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
        email.body = f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs."
        app.mail.send(email)

    app.logger.error(f"{msg} {latest_run_txt}")


@fm_monitor.command("tasks")
@with_appcontext
@click.option(
    "--task",
    type=(str, int),
    multiple=True,
    required=True,
    help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
)
def monitor_tasks(task):
    """
    Check if the given task's last successful execution happened less than the allowed time ago.
    If not, alert someone, via email or sentry.
    """
    # NOTE: the docstring above is what the CLI shows as help text, so
    # implementation notes live in comments instead.
    # `task` holds one (task name, maximal allowed minutes) pair per --task option.
    for task_name, max_minutes in task:
        app.logger.info(f"Checking latest run of task {task_name} ...")
        latest_run: LatestTaskRun = LatestTaskRun.query.get(task_name)
        if latest_run is None:
            msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
            send_monitoring_alert(task_name, msg)
            # Move on to the next task: one unknown task must not prevent
            # the remaining ones from being checked.
            continue

        acceptable_interval = timedelta(minutes=max_minutes)
        if latest_run.datetime < server_now() - acceptable_interval:
            # The latest run is too long ago.
            msg = (
                f"Task {task_name}'s latest run time is outside of the acceptable range "
                f"({acceptable_interval})."
            )
            # No separate logger call here: send_monitoring_alert already
            # logs the message as an error.
            send_monitoring_alert(task_name, msg, latest_run)
        elif latest_run.status is False:
            # The latest run was recent enough, but it reported a failure.
            msg = f"A failure has been reported on task {task_name}."
            send_monitoring_alert(task_name, msg, latest_run)
    app.logger.info("Done checking task runs ...")


# Register the "monitor" command group with the Flask CLI of the current app.
# This runs at import time, so the module has to be imported while an
# application context is active (`app` is flask's `current_app` proxy).
app.cli.add_command(fm_monitor)
1 change: 1 addition & 0 deletions flexmeasures/utils/config_defaults.py
Expand Up @@ -80,6 +80,7 @@ class Config(object):
# traces_sample_rate is for performance monitoring across all transactions,
# you probably want to adjust this.
FLEXMEASURES_SENTRY_CONFIG: dict = dict(traces_sample_rate=0.33)
FLEXMEASURES_MONITORING_MAIL_RECIPIENTS: List[str] = []

FLEXMEASURES_PLATFORM_NAME: str = "FlexMeasures"
FLEXMEASURES_MODE: str = ""
Expand Down