Skip to content

Commit

Permalink
Monitor latest task runs per CLI (#146)
Browse files Browse the repository at this point in the history
* monitor latest task runs per CLI and send any alerts to Sentry and email
* make monitoring setting a list and give it a better name
* add changelog entry
* simplify recency check

Co-authored-by: nhoening <nhoening@users.noreply.github.com>
  • Loading branch information
nhoening and nhoening committed Jun 19, 2021
1 parent 24cd1ef commit e4a26dc
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 0 deletions.
1 change: 1 addition & 0 deletions documentation/changelog.rst
Expand Up @@ -15,6 +15,7 @@ Infrastructure / Support
----------------------

* Add possibility to send errors to Sentry [see `PR #143 <http://www.github.com/SeitaBV/flexmeasures/pull/143>`_]
* Add CLI task to monitor if tasks ran successfully and recently enough [see `PR #146 <http://www.github.com/SeitaBV/flexmeasures/pull/146>`_]


v0.5.0 | June 7, 2021
Expand Down
16 changes: 16 additions & 0 deletions documentation/configuration.rst
Expand Up @@ -153,6 +153,15 @@ Token for accessing the MapBox API (for displaying maps on the dashboard and ass
Default: ``None``


SENTRY_DSN
^^^^^^^^^^^^

Set the tokenized URL (the Sentry DSN), so that errors are sent to Sentry when ``app.env`` is neither ``debug`` nor ``testing``.
E.g.: ``https://<examplePublicKey>@o<something>.ingest.sentry.io/<project-Id>``

Default: ``None``


SQLAlchemy
----------

Expand Down Expand Up @@ -356,6 +365,13 @@ Token which external services can use to check on the status of recurring tasks
Default: ``None``


FLEXMEASURES_MONITORING_MAIL_RECIPIENTS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

E-mail addresses to which the CLI task ``flexmeasures monitor tasks`` sends monitoring alerts. For example: ``["fred@one.com", "wilma@two.com"]``.

Default: ``[]``


.. _redis-config:

Expand Down
1 change: 1 addition & 0 deletions flexmeasures/data/__init__.py
Expand Up @@ -26,6 +26,7 @@ def register_at(app: Flask):
# Register some useful custom scripts with the flask cli
with app.app_context():
import flexmeasures.data.scripts.cli_tasks.jobs
import flexmeasures.data.scripts.cli_tasks.monitor
import flexmeasures.data.scripts.cli_tasks.data_add
import flexmeasures.data.scripts.cli_tasks.data_delete
import flexmeasures.data.scripts.cli_tasks.db_ops
Expand Down
88 changes: 88 additions & 0 deletions flexmeasures/data/scripts/cli_tasks/monitor.py
@@ -0,0 +1,88 @@
from datetime import timedelta
from typing import Optional

import click
from flask import current_app as app
from flask.cli import with_appcontext
from flask_mail import Message
from sentry_sdk import (
capture_message as capture_message_for_sentry,
set_context as set_sentry_context,
)

from flexmeasures.data.models.task_runs import LatestTaskRun
from flexmeasures.utils.time_utils import server_now


# Click command group "monitor"; sub-commands are attached to it via
# ``@fm_monitor.command(...)``. The function body is intentionally empty.
# NOTE: click shows the docstring below as the group's help text, so treat it
# as user-facing output.
@click.group("monitor")
def fm_monitor():
    """FlexMeasures: Monitor tasks."""


def send_monitoring_alert(
    task_name: str, msg: str, latest_run: Optional[LatestTaskRun] = None
):
    """
    Send a monitoring alert via Sentry and via email. Also log an error.

    :param task_name: name of the monitored task (used in the email subject).
    :param msg: the alert message; sent to Sentry, put in the email body and logged.
    :param latest_run: the task's latest recorded run, if there is one. Its time and
                       status are added to the Sentry context, the email body and
                       the log line.

    Emails are only sent if the FLEXMEASURES_MONITORING_MAIL_RECIPIENTS setting
    contains at least one address.
    """
    latest_run_txt = ""
    if latest_run is not None:
        set_sentry_context(
            "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
        )
        latest_run_txt = (
            f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
        )

    capture_message_for_sentry(msg)

    # The setting can be explicitly None, so fall back to an empty list
    # instead of calling len() on None.
    email_recipients = app.config.get("FLEXMEASURES_MONITORING_MAIL_RECIPIENTS") or []
    if isinstance(email_recipients, str):
        # A single address given as a plain string would otherwise be treated
        # as a sequence of characters by Flask-Mail.
        email_recipients = [email_recipients]
    if email_recipients:
        # bcc, so recipients do not see each other's addresses
        email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
        email.body = f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs."
        app.mail.send(email)

    app.logger.error(f"{msg} {latest_run_txt}")


@fm_monitor.command("tasks")
@with_appcontext
@click.option(
    "--task",
    type=(str, int),
    multiple=True,
    required=True,
    help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
)
def monitor_tasks(task):
    """
    Check if the given task's last successful execution happened less than the allowed time ago.
    If not, alert someone, via email or sentry.
    """
    # NOTE: click uses the docstring above as the command's help text.
    # `task` is a tuple of (task name, allowed minutes) pairs, one per --task option.
    for task_name, max_minutes in task:
        app.logger.info(f"Checking latest run of task {task_name} ...")
        latest_run: LatestTaskRun = LatestTaskRun.query.get(task_name)
        if latest_run is None:
            msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
            send_monitoring_alert(task_name, msg)
            # Move on to the next task: one unmonitorable task must not
            # prevent the remaining tasks from being checked.
            continue
        now = server_now()
        acceptable_interval = timedelta(minutes=max_minutes)
        # check if latest run was recently enough
        if latest_run.datetime >= now - acceptable_interval:
            # latest run time is okay, let's check the status
            if latest_run.status is False:
                msg = f"A failure has been reported on task {task_name}."
                send_monitoring_alert(task_name, msg, latest_run)
        else:
            msg = (
                f"Task {task_name}'s latest run time is outside of the acceptable range "
                f"({acceptable_interval})."
            )
            # send_monitoring_alert already logs the message as an error,
            # so it is not logged separately here.
            send_monitoring_alert(task_name, msg, latest_run)
    app.logger.info("Done checking task runs ...")


# Register the "monitor" command group with the CLI of the current Flask app.
# This runs at import time; `app` is flask.current_app, so the module has to be
# imported inside an application context.
app.cli.add_command(fm_monitor)
1 change: 1 addition & 0 deletions flexmeasures/utils/config_defaults.py
Expand Up @@ -80,6 +80,7 @@ class Config(object):
# traces_sample_rate is for performance monitoring across all transactions,
# you probably want to adjust this.
FLEXMEASURES_SENTRY_CONFIG: dict = dict(traces_sample_rate=0.33)
FLEXMEASURES_MONITORING_MAIL_RECIPIENTS: List[str] = []

FLEXMEASURES_PLATFORM_NAME: str = "FlexMeasures"
FLEXMEASURES_MODE: str = ""
Expand Down

0 comments on commit e4a26dc

Please sign in to comment.