Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor latest task runs per CLI #146

Merged
merged 10 commits into from Jun 19, 2021
2 changes: 2 additions & 0 deletions documentation/changelog.rst
Expand Up @@ -14,6 +14,8 @@ Bugfixes
Infrastructure / Support
----------------------

* Add possibility to send errors to Sentry [see `PR #143 <http://www.github.com/SeitaBV/flexmeasures/pull/143>`_]


v0.5.0 | June 7, 2021
===========================
Expand Down
17 changes: 17 additions & 0 deletions documentation/configuration.rst
Expand Up @@ -160,6 +160,15 @@ Token which external services can use to check on the status of recurring tasks
Default: ``None``


SENTRY_DSN
^^^^^^^^^^^^

Set the tokenized URL (the Sentry DSN), so errors will be sent to Sentry when ``app.env`` is not ``development`` and the app is not in ``testing`` mode.
E.g.: ``https://<examplePublicKey>@o<something>.ingest.sentry.io/<project-Id>``

Default: ``None``


SQLAlchemy
----------

Expand Down Expand Up @@ -329,6 +338,14 @@ Password of mail system user.
Default: ``None``


MAIL_MONITORING_RECIPIENTS
^^^^^^^^^^^^^^^^^^^^^^^^^^

Mail addresses (comma-separated) to send monitoring alerts to. For example: ``"fred@one.com,wilma@two.com"``.

Default: ``None``


.. _redis-config:

Redis
Expand Down
15 changes: 9 additions & 6 deletions flexmeasures/app.py
Expand Up @@ -26,7 +26,7 @@ def create(env: Optional[str] = None, path_to_config: Optional[str] = None) -> F

from flexmeasures.utils import config_defaults
from flexmeasures.utils.config_utils import read_config, configure_logging
from flexmeasures.utils.app_utils import set_secret_key
from flexmeasures.utils.app_utils import set_secret_key, init_sentry
from flexmeasures.utils.error_utils import add_basic_error_handlers

# Create app
Expand All @@ -36,17 +36,20 @@ def create(env: Optional[str] = None, path_to_config: Optional[str] = None) -> F
# as we need to know the ENV now (for it to be recognised by Flask()).
load_dotenv()
app = Flask("flexmeasures")

if env is not None: # overwrite
app.env = env
if env == "testing":
app.testing = True
if env == "development":
app.debug = config_defaults.DevelopmentConfig.DEBUG
if app.env == "testing":
app.testing = True
if app.env == "development":
app.debug = config_defaults.DevelopmentConfig.DEBUG

# App configuration

read_config(app, path_to_config=path_to_config)
read_config(app, custom_path_to_config=path_to_config)
add_basic_error_handlers(app)
if not app.env == "development" and not app.testing:
init_sentry(app)

app.mail = Mail(app)
FlaskJSON(app)
Expand Down
1 change: 1 addition & 0 deletions flexmeasures/data/__init__.py
Expand Up @@ -26,6 +26,7 @@ def register_at(app: Flask):
# Register some useful custom scripts with the flask cli
with app.app_context():
import flexmeasures.data.scripts.cli_tasks.jobs
import flexmeasures.data.scripts.cli_tasks.monitor
import flexmeasures.data.scripts.cli_tasks.data_add
import flexmeasures.data.scripts.cli_tasks.data_delete
import flexmeasures.data.scripts.cli_tasks.db_ops
Expand Down
4 changes: 2 additions & 2 deletions flexmeasures/data/scripts/cli_tasks/data_add.py
Expand Up @@ -481,8 +481,8 @@ def create_forecasts(
asset_id=asset_id,
timed_value_type=value_type,
horizons=[horizon],
start_of_roll=from_date - timedelta(hours=horizon),
end_of_roll=to_date - timedelta(hours=horizon),
start_of_roll=from_date - horizon,
end_of_roll=to_date - horizon,
)
else:
from flexmeasures.data.scripts.data_gen import populate_time_series_forecasts
Expand Down
91 changes: 91 additions & 0 deletions flexmeasures/data/scripts/cli_tasks/monitor.py
@@ -0,0 +1,91 @@
from datetime import timedelta
from typing import Optional

import click
from flask import current_app as app
from flask.cli import with_appcontext
from flask_mail import Message
from sentry_sdk import (
capture_message as capture_message_for_sentry,
set_context as set_sentry_context,
)

from flexmeasures.data.models.task_runs import LatestTaskRun
from flexmeasures.utils.time_utils import server_now


# Click command group named "monitor". Sub-commands are attached to it with
# @fm_monitor.command(...), and the group itself is added to the app's CLI
# at the end of this module.
@click.group("monitor")
def fm_monitor():
    """FlexMeasures: Monitor tasks."""


def send_monitoring_alert(
    task_name: str, msg: str, latest_run: Optional[LatestTaskRun] = None
):
    """
    Send any monitoring message per Sentry and per email. Also log an error.

    :param task_name: name of the monitored task, used in the email subject.
    :param msg: description of the problem; sent to Sentry, mailed and logged.
    :param latest_run: the task's latest recorded run, if there is one. Its time
                       and status are attached to the Sentry event and appended
                       to the email body and the log entry.
    """
    latest_run_txt = ""
    if latest_run:
        set_sentry_context(
            "latest_run", {"time": latest_run.datetime, "status": latest_run.status}
        )
        latest_run_txt = (
            f"Last run was at {latest_run.datetime}, status was: {latest_run.status}"
        )

    capture_message_for_sentry(msg)

    # The setting defaults to None, and splitting an empty string yields [""],
    # so normalise first and only mail if at least one real address remains.
    recipients_setting = app.config.get("MAIL_MONITORING_RECIPIENTS") or ""
    email_recipients = [
        address.strip() for address in recipients_setting.split(",") if address.strip()
    ]
    if email_recipients:
        email = Message(subject=f"Problem with task {task_name}", bcc=email_recipients)
        email.body = f"{msg}\n\n{latest_run_txt}\nWe suggest to check the logs."
        app.mail.send(email)

    app.logger.error(f"{msg} {latest_run_txt}")


@fm_monitor.command("tasks")
@with_appcontext
@click.option(
    "--task",
    type=(str, int),
    multiple=True,
    required=True,
    help="The name of the task and the maximal allowed minutes between successful runs. Use multiple times if needed.",
)
def monitor_tasks(task):
    """
    Check if the given task's last successful execution happened less than the allowed time ago.
    If not, alert someone, via email or sentry.
    """
    # Each --task occurrence arrives as a (name, allowed minutes) tuple.
    for task_name, max_minutes in task:
        app.logger.info(f"Checking latest run of task {task_name} ...")
        latest_run: LatestTaskRun = LatestTaskRun.query.get(task_name)
        if latest_run is None:
            msg = f"Task {task_name} has no last run and thus cannot be monitored. Is it configured properly?"
            send_monitoring_alert(task_name, msg)
            # One unknown task must not prevent the remaining tasks from being checked.
            continue
        now = server_now()
        acceptable_interval = timedelta(minutes=max_minutes)
        # Only the lower bound is checked. A run time in the future would point to
        # out-of-sync clocks between the executing and the monitoring server, which
        # is a different problem than a stale task, and the allowed interval would
        # not be a meaningful measure for it.
        if now - acceptable_interval <= latest_run.datetime:
            # last time is okay, let's check the status
            if latest_run.status is False:
                msg = f"A failure has been reported on task {task_name}."
                send_monitoring_alert(task_name, msg, latest_run)
        else:
            msg = (
                f"Task {task_name}'s latest run time is outside of the acceptable range "
                f"({acceptable_interval})."
            )
            app.logger.error(msg)
            send_monitoring_alert(task_name, msg, latest_run)
    app.logger.info("Done checking task runs ...")


app.cli.add_command(fm_monitor)
44 changes: 42 additions & 2 deletions flexmeasures/utils/app_utils.py
Expand Up @@ -4,17 +4,56 @@

import click
from flask import Flask
from flask.cli import FlaskGroup
from flask.cli import FlaskGroup, with_appcontext
import sentry_sdk
from sentry_sdk.integrations.flask import FlaskIntegration
from sentry_sdk.integrations.rq import RqIntegration
from pkg_resources import get_distribution

from flexmeasures.app import create as create_app


@click.group(cls=FlaskGroup, create_app=create_app)
@with_appcontext
def flexmeasures_cli():
    """
    Management scripts for the FlexMeasures platform.
    We use @with_appcontext here so things from the app setup are initialised
    only once. This is crucial for Sentry, for example.
    """


def init_sentry(app: Flask):
    """
    Set up the Sentry SDK for the given app.

    The DSN is taken from the app's SENTRY_DSN setting; if that is empty,
    this is only logged and Sentry stays uninitialised. Otherwise the SDK is
    initialised with the Flask and RQ integrations, and the app's debug flag,
    environment, installed flexmeasures version, mode and platform name are
    passed along as meta information.
    """
    dsn = app.config.get("SENTRY_DSN")
    if dsn:
        app.logger.info("[FLEXMEASURES] Initialising Sentry ...")
        sentry_options = dict(
            dsn=dsn,
            integrations=[FlaskIntegration(), RqIntegration()],
            debug=app.debug,
            release=f"flexmeasures@{get_distribution('flexmeasures').version}",
            # user data (current user id, email address, username) is attached to the event.
            send_default_pii=True,
            environment=app.env,
            # A traces_sample_rate of 1.0 would capture 100%
            # of transactions for performance monitoring.
            # Adjusting this value in production is recommended.
            # TODO: Decide if we need this and if to configure it.
            traces_sample_rate=0.33,
        )
        sentry_sdk.init(**sentry_options)
        tagged_settings = (
            ("mode", "FLEXMEASURES_MODE"),
            ("platform-name", "FLEXMEASURES_PLATFORM_NAME"),
        )
        for tag_name, setting_name in tagged_settings:
            sentry_sdk.set_tag(tag_name, app.config.get(setting_name))
    else:
        app.logger.info(
            "[FLEXMEASURES] No SENTRY_DSN setting found, so initialising Sentry cannot happen ..."
        )


def set_secret_key(app, filename="secret_key"):
"""Set the SECRET_KEY or exit.

Expand Down Expand Up @@ -104,3 +143,4 @@ def register_plugins(app: Flask):
plugin_version = getattr(plugin_blueprint, "__version__", "0.1")
app.config["LOADED_PLUGINS"][plugin_name] = plugin_version
app.logger.info(f"Loaded plugins: {app.config['LOADED_PLUGINS']}")
sentry_sdk.set_context("plugins", app.config.get("LOADED_PLUGINS", {}))
3 changes: 3 additions & 0 deletions flexmeasures/utils/config_defaults.py
Expand Up @@ -41,6 +41,7 @@ class Config(object):
"no-reply@example.com",
) # tuple of name and email address
MAIL_PASSWORD: Optional[str] = None
MAIL_MONITORING_RECIPIENTS = None

SECURITY_REGISTERABLE = False
SECURITY_LOGIN_USER_TEMPLATE = "admin/login_user.html"
Expand Down Expand Up @@ -75,6 +76,8 @@ class Config(object):
3000 # Web interface poll period for updates in ms
)

SENTRY_DSN: Optional[str] = None

FLEXMEASURES_PLATFORM_NAME: str = "FlexMeasures"
FLEXMEASURES_MODE: str = ""
FLEXMEASURES_TIMEZONE: str = "Asia/Seoul"
Expand Down
15 changes: 8 additions & 7 deletions flexmeasures/utils/config_utils.py
Expand Up @@ -50,7 +50,7 @@ def configure_logging():
loggingDictConfig(flexmeasures_logging_config)


def read_config(app: Flask, path_to_config: Optional[str]):
def read_config(app: Flask, custom_path_to_config: Optional[str]):
"""Read configuration from various expected sources, complain if not setup correctly. """

if app.env not in (
Expand All @@ -65,21 +65,22 @@ def read_config(app: Flask, path_to_config: Optional[str]):
)
sys.exit(2)

# Load default config settings
# First, load default config settings
app.config.from_object(
"flexmeasures.utils.config_defaults.%sConfig" % camelize(app.env)
)

# Now read user config, if possible. If no explicit path is given, try home dir first, then instance dir
# Now, potentially overwrite those from config file
# These two locations are possible (besides the custom path)
path_to_config_home = str(Path.home().joinpath(".flexmeasures.cfg"))
path_to_config_instance = os.path.join(app.instance_path, "flexmeasures.cfg")
if not app.testing:
if not app.testing: # testing runs completely on defaults
# If no custom path is given, this will try home dir first, then instance dir
used_path_to_config = read_custom_config(
app, path_to_config, path_to_config_home, path_to_config_instance
app, custom_path_to_config, path_to_config_home, path_to_config_instance
)

# Check for missing values.
# Testing might affect only specific functionality (-> dev's responsibility)
# Documentation runs fine without them.
if not app.testing and app.env != "documentation":
if not are_required_settings_complete(app):
Expand Down Expand Up @@ -126,7 +127,7 @@ def read_custom_config(
app.config.from_pyfile(path_to_config)
except FileNotFoundError:
pass
# Finally, all required varaiables can be set as env var:
# Finally, all required variables can be set as env var:
for req_var in required:
app.config[req_var] = os.getenv(req_var, app.config.get(req_var, None))
return path_to_config
Expand Down
2 changes: 1 addition & 1 deletion flexmeasures/utils/error_utils.py
Expand Up @@ -23,7 +23,7 @@ def log_error(exc: Exception, error_msg: str):
extra = dict(url=request.path, **get_err_source_info(last_traceback))

msg = (
'{error_name}:"{message}" [occured at {src_module}({src_func}):{src_linenr},'
'{error_name}:"{message}" [occurred at {src_module}({src_func}):{src_linenr},'
"URL was: {url}]".format(
error_name=exc.__class__.__name__, message=error_msg, **extra
)
Expand Down
1 change: 1 addition & 0 deletions requirements/app.in
Expand Up @@ -48,6 +48,7 @@ Flask-Security-Too>=4.0
Flask-Classful
Flask-Marshmallow
Flask-Cors
sentry-sdk[flask]
marshmallow-sqlalchemy>=0.23.1
webargs
# flask should be after all the flask plugins, because setup might find they ARE flask
Expand Down
9 changes: 8 additions & 1 deletion requirements/app.txt
Expand Up @@ -22,12 +22,15 @@ blinker==1.4
# via
# flask-mail
# flask-principal
# sentry-sdk
bokeh==1.0.4
# via
# -r requirements/app.in
# pandas-bokeh
certifi==2020.12.5
# via requests
# via
# requests
# sentry-sdk
cffi==1.14.5
# via bcrypt
cftime==1.4.1
Expand Down Expand Up @@ -99,6 +102,7 @@ flask==1.1.2
# flask-sslify
# flask-wtf
# rq-dashboard
# sentry-sdk
greenlet==1.0.0
# via sqlalchemy
humanize==3.3.0
Expand Down Expand Up @@ -280,6 +284,8 @@ scipy==1.6.2
# timetomodel
selenium==3.141.0
# via timely-beliefs
sentry-sdk[flask]==1.1.0
# via -r requirements/app.in
siphon==0.9
# via -r requirements/app.in
six==1.15.0
Expand Down Expand Up @@ -331,6 +337,7 @@ urllib3==1.26.4
# via
# requests
# selenium
# sentry-sdk
webargs==7.0.1
# via -r requirements/app.in
werkzeug==1.0.1
Expand Down