diff --git a/HISTORY.rst b/HISTORY.rst index e914d79d..06993ec9 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,6 +5,14 @@ Change Log This document records all notable changes to `django-sql-explorer `_. This project adheres to `Semantic Versioning `_. +`4.1.0b1`_ (2024-04-25) +=========================== +* `#609`_: Tracking should be opt-in and not use the SECRET_KEY +* `#610`_: Import error (sql_metadata) with 4.1 version +* `#612`_: Accessing the database during app initialization +* Regex-injection vulnerability +* Better anonymization for telemetry + `4.1.0`_ (2024-04-23) =========================== * SQL Assistant: Built in query help via OpenAI (or LLM of choice), with relevant schema diff --git a/docs/settings.rst b/docs/settings.rst index 183dd3d8..fa164e2a 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -338,11 +338,11 @@ but a dotted path to a python view can be used EXPLORER_NO_PERMISSION_VIEW = 'explorer.views.auth.safe_login_view_wrapper' -Anonymous Usage Stat Collection -******************************* +Anonymous Telemetry Collection +****************************** By default, anonymous usage statistics are collected. To disable this, set the following setting to False. -You can see what is being collected in tracker.py. +You can see what is being collected in telemetry.py. .. code-block:: python diff --git a/explorer/__init__.py b/explorer/__init__.py index 2848706f..d760f262 100644 --- a/explorer/__init__.py +++ b/explorer/__init__.py @@ -1,9 +1,9 @@ __version_info__ = { "major": 4, - "minor": 1, + "minor": 2, "patch": 0, - "releaselevel": "final", - "serial": 0 + "releaselevel": "beta", + "serial": 1 } diff --git a/explorer/actions.py b/explorer/actions.py index 7ceab70b..5b90a289 100644 --- a/explorer/actions.py +++ b/explorer/actions.py @@ -33,10 +33,11 @@ def _package(queries): is_one = len(queries) == 1 name_root = lambda n: f"attachment; filename={n}" # noqa ret["content_type"] = (is_one and "text/csv") or "application/zip" - + formatted = queries[0].title.replace(",", "") + day = date.today() ret["filename"] = ( - is_one and name_root("%s.csv" % queries[0].title.replace(",", "")) - ) or name_root("Report_%s.zip" % date.today()) + is_one and name_root(f"{formatted}.csv") + ) or name_root(f"Report_{day}.zip") ret["data"] = ( is_one and CSVExporter(queries[0]).get_output() diff --git a/explorer/apps.py b/explorer/apps.py index 4775543d..6797abfe 100644 --- a/explorer/apps.py +++ b/explorer/apps.py @@ -1,7 +1,6 @@ from django.apps import AppConfig from django.core.exceptions import ImproperlyConfigured from django.db import connections as djcs -from django.db.utils import DatabaseError from django.utils.translation import gettext_lazy as _ @@ -15,7 +14,6 @@ def ready(self): from explorer.schema import build_async_schemas _validate_connections() build_async_schemas() - track_summary_stats() def _get_default(): @@ -44,25 +42,3 @@ def _validate_connections(): f"EXPLORER_CONNECTIONS contains ({name}, {conn_name}), " f"but {conn_name} is not a valid Django DB connection." ) - - -def track_summary_stats(): - from explorer.tracker import Stat, StatNames - from explorer.tracker import gather_summary_stats - from explorer.models import Query - - # Django doesn't actually have a way of running code on application initialization, so we have come up with this. - # The app.ready() method (the call site for this function) is invoked *before* any migrations are run. So if were - # to just call this function in ready(), without the try: block, then it would always fail the very first time - # Django runs (and e.g. in test runs) because no tables have yet been created. The intuitive way to handle this with - # Django would be to tie into the post_migrate signal in ready() and run this function on post_migrate. But that - # doesn't work because that signal is only called if indeed a migrations has been applied. If the app restarts and - # there are no new migrations, the signal never fires. So instead we check if the Query table exists, and if it - # does, we're good to gather stats. - try: - Query.objects.first() - except DatabaseError: - return - else: - payload = gather_summary_stats() - Stat(StatNames.STARTUP_STATS, payload).track() diff --git a/explorer/assistant/utils.py b/explorer/assistant/utils.py index fb739d32..7f444219 100644 --- a/explorer/assistant/utils.py +++ b/explorer/assistant/utils.py @@ -1,18 +1,15 @@ from explorer import app_settings from explorer.schema import schema_info from explorer.utils import get_valid_connection -from sql_metadata import Parser from django.db.utils import OperationalError -if app_settings.EXPLORER_AI_API_KEY: - import tiktoken - from openai import OpenAI OPENAI_MODEL = app_settings.EXPLORER_ASSISTANT_MODEL["name"] ROW_SAMPLE_SIZE = 2 def openai_client(): + from openai import OpenAI return OpenAI( api_key=app_settings.EXPLORER_AI_API_KEY, base_url=app_settings.EXPLORER_ASSISTANT_BASE_URL @@ -73,6 +70,7 @@ def format_rows_from_table(rows): def get_table_names_from_query(sql): + from sql_metadata import Parser if sql: try: parsed = Parser(sql) @@ -84,6 +82,7 @@ def get_table_names_from_query(sql): def num_tokens_from_string(string: str) -> int: """Returns the number of tokens in a text string.""" + import tiktoken encoding = tiktoken.encoding_for_model(OPENAI_MODEL) num_tokens = len(encoding.encode(string)) return num_tokens diff --git a/explorer/assistant/views.py b/explorer/assistant/views.py index 4aa46c46..ca2d68b2 100644 --- a/explorer/assistant/views.py +++ b/explorer/assistant/views.py @@ -3,7 +3,7 @@ from django.views.decorators.http import require_POST import json -from explorer.tracker import Stat, StatNames +from explorer.telemetry import Stat, StatNames from explorer.utils import get_valid_connection from explorer.assistant.models import PromptLog from explorer.assistant.prompts import primary_prompt diff --git a/explorer/migrations/0015_explorervalue.py b/explorer/migrations/0015_explorervalue.py new file mode 100644 index 00000000..07ec7845 --- /dev/null +++ b/explorer/migrations/0015_explorervalue.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.8 on 2024-04-25 13:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('explorer', '0014_promptlog'), + ] + + operations = [ + migrations.CreateModel( + name='ExplorerValue', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('key', models.CharField(choices=[('UUID', 'Install Unique ID'), ('SMLS', 'Startup metric last send')], max_length=5)), + ('value', models.TextField(blank=True, null=True)), + ], + ), + ] diff --git a/explorer/models.py b/explorer/models.py index 3ac922ae..a512f19f 100644 --- a/explorer/models.py +++ b/explorer/models.py @@ -1,5 +1,6 @@ import logging from time import time +import uuid from django.conf import settings from django.core.exceptions import ValidationError @@ -8,7 +9,7 @@ from django.utils.translation import gettext_lazy as _ from explorer import app_settings -from explorer.tracker import Stat, StatNames +from explorer.telemetry import Stat, StatNames from explorer.utils import ( extract_params, get_params_for_url, get_s3_bucket, get_valid_connection, passes_blacklist, s3_url, shared_dict_update, swap_params, @@ -393,3 +394,50 @@ def stats(self): def __str__(self): return str(self._header) + + +class ExplorerValueManager(models.Manager): + + def get_uuid(self): + # If blank or non-existing, generates a new UUID + uuid_obj, created = self.get_or_create( + key=ExplorerValue.INSTALL_UUID, + defaults={"value": str(uuid.uuid4())} + ) + if created or uuid_obj.value is None: + uuid_obj.value = str(uuid.uuid4()) + uuid_obj.save() + return uuid_obj.value + + def get_startup_last_send(self): + # Stored as a Unix timestamp + try: + timestamp = self.get(key=ExplorerValue.STARTUP_METRIC_LAST_SEND).value + if timestamp: + return float(timestamp) + return None + except ExplorerValue.DoesNotExist: + return None + + def set_startup_last_send(self, ts): + obj, created = self.get_or_create( + key=ExplorerValue.STARTUP_METRIC_LAST_SEND, + defaults={"value": str(ts)} + ) + if not created: + obj.value = str(ts) + obj.save() + + +class ExplorerValue(models.Model): + INSTALL_UUID = "UUID" + STARTUP_METRIC_LAST_SEND = "SMLS" + EXPLORER_SETTINGS_CHOICES = [ + (INSTALL_UUID, "Install Unique ID"), + (STARTUP_METRIC_LAST_SEND, "Startup metric last send"), + ] + + key = models.CharField(max_length=5, choices=EXPLORER_SETTINGS_CHOICES) + value = models.TextField(null=True, blank=True) + + objects = ExplorerValueManager() diff --git a/explorer/telemetry.py b/explorer/telemetry.py new file mode 100644 index 00000000..d4e8108c --- /dev/null +++ b/explorer/telemetry.py @@ -0,0 +1,152 @@ +# Anonymous usage stats +# Opt-out by setting EXPLORER_ENABLE_ANONYMOUS_STATS = False in settings + +import logging +import time +import requests +import json +import threading +from enum import Enum, auto +from django.core.cache import cache +from django.db import connection +from django.db.models import Count +from django.db.migrations.recorder import MigrationRecorder +from django.conf import settings + +logger = logging.getLogger(__name__) + + +def instance_identifier(): + from explorer.models import ExplorerValue + key = "explorer_instance_identifier" + r = cache.get(key) + if not r: + r = ExplorerValue.objects.get_uuid() + cache.set(key, r, 60 * 60 * 24) + return r + + +class SelfNamedEnum(Enum): + + @staticmethod + def _generate_next_value_(name, start, count, last_values): + return name + + +class StatNames(SelfNamedEnum): + + QUERY_RUN = auto() + QUERY_STREAM = auto() + STARTUP_STATS = auto() + ASSISTANT_RUN = auto() + + +class Stat: + + STAT_COLLECTION_INTERVAL = 60 * 10 # Ten minutes + STARTUP_STAT_COLLECTION_INTERVAL = 60 * 60 * 24 * 7 # A week + + def __init__(self, name: StatNames, value): + self.instanceId = instance_identifier() + self.time = time.time() + self.value = value + self.name = name.value + + @property + def is_summary(self): + return self.name == StatNames.STARTUP_STATS.value + + def should_send_summary_stats(self): + from explorer.models import ExplorerValue + last_send = ExplorerValue.objects.get_startup_last_send() + if not last_send: + return True + else: + return self.time - last_send >= self.STARTUP_STAT_COLLECTION_INTERVAL + + def send_summary_stats(self): + from explorer.models import ExplorerValue + payload = _gather_summary_stats() + Stat(StatNames.STARTUP_STATS, payload).track() + ExplorerValue.objects.set_startup_last_send(self.time) + + def track(self): + from explorer import app_settings + if not app_settings.EXPLORER_ENABLE_ANONYMOUS_STATS: + return + + cache_key = "last_stat_sent_time" + last_sent_time = cache.get(cache_key, 0) + # Summary stats are tracked with a different time interval + if self.is_summary or self.time - last_sent_time >= self.STAT_COLLECTION_INTERVAL: + data = json.dumps(self.__dict__) + thread = threading.Thread(target=_send, args=(data,)) + thread.start() + cache.set(cache_key, self.time) + + # Every time we send any tracking, see if we have recently sent overall summary stats + # Of course, sending the summary stats calls .track(), so we need to NOT call track() + # again if we are in fact already in the process of sending summary stats. Otherwise, + # we will end up in infinite recursion of track() calls. + if not self.is_summary and self.should_send_summary_stats(): + self.send_summary_stats() + + +def _send(data): + from explorer import app_settings + try: + requests.post(app_settings.EXPLORER_COLLECT_ENDPOINT_URL, + data=data, + headers={"Content-Type": "application/json"}) + except Exception as e: + logger.warning(f"Failed to send stats: {e}") + + +def _get_install_quarter(): + first_migration = MigrationRecorder.Migration.objects. \ + filter(app="explorer").order_by("applied").first() + + if first_migration is not None: + quarter = (first_migration.applied.month - 1) // 3 + 1 # Calculate the quarter + year = first_migration.applied.year + quarter_str = f"Q{quarter}-{year}" + else: + quarter_str = None + return quarter_str + + +def _gather_summary_stats(): + + from explorer import app_settings + from explorer.models import Query, QueryLog + import explorer + + try: + ql_stats = QueryLog.objects.aggregate( + total_count=Count("*"), + unique_run_by_user_count=Count("run_by_user_id", distinct=True) + ) + + q_stats = Query.objects.aggregate( + total_count=Count("*"), + unique_connection_count=Count("connection", distinct=True) + ) + + # Round the counts to provide additional anonymity + return { + "total_log_count": round(ql_stats["total_count"] * 0.1) * 10, + "unique_run_by_user_count": round(ql_stats["unique_run_by_user_count"] * 0.2) * 5, + "total_query_count": round(q_stats["total_count"] * 0.1) * 10, + "unique_connection_count": round(q_stats["unique_connection_count"] * 0.2) * 5, + "default_database": connection.vendor, + "explorer_install_quarter": _get_install_quarter(), + "debug": settings.DEBUG, + "tasks_enabled": app_settings.ENABLE_TASKS, + "unsafe_rendering": app_settings.UNSAFE_RENDERING, + "transform_count": len(app_settings.EXPLORER_TRANSFORMS), + "assistant_enabled": app_settings.EXPLORER_AI_API_KEY is not None, + "version": explorer.get_version(), + "charts_enabled": app_settings.EXPLORER_CHARTS_ENABLED + } + except Exception as e: + return {"error": f"error gathering stats: {e}"} diff --git a/explorer/tests/test_telemetry.py b/explorer/tests/test_telemetry.py new file mode 100644 index 00000000..51888213 --- /dev/null +++ b/explorer/tests/test_telemetry.py @@ -0,0 +1,66 @@ +from django.test import TestCase +from explorer.telemetry import instance_identifier, _gather_summary_stats, Stat, StatNames, _get_install_quarter +from unittest.mock import patch, MagicMock +from django.core.cache import cache +from datetime import datetime + + +class TestTelemetry(TestCase): + + def test_instance_identifier(self): + v = instance_identifier() + self.assertEqual(len(v), 36) + + # Doesn't change after calling it again + v = instance_identifier() + self.assertEqual(len(v), 36) + + def test_gather_summary_stats(self): + res = _gather_summary_stats() + self.assertEqual(res["total_query_count"], 0) + self.assertEqual(res["default_database"], "sqlite") + + @patch("explorer.telemetry.threading.Thread") + @patch("explorer.app_settings") + def test_stats_not_sent_too_frequently(self, mocked_app_settings, mocked_thread): + mocked_app_settings.EXPLORER_ENABLE_ANONYMOUS_STATS = True + mocked_app_settings.UNSAFE_RENDERING = True + mocked_app_settings.EXPLORER_CHARTS_ENABLED = True + mocked_app_settings.ENABLE_TASKS = True + s1 = Stat(StatNames.QUERY_RUN, {"foo": "bar"}) + s2 = Stat(StatNames.QUERY_RUN, {"mux": "qux"}) + s3 = Stat(StatNames.QUERY_RUN, {"bar": "baz"}) + + # once for s1 and once for summary stats + s1.track() + self.assertEqual(mocked_thread.call_count, 2) + + # both the s2 track call is suppressed, and the summary stat call + s2.track() + self.assertEqual(mocked_thread.call_count, 2) + + # clear the cache, which should cause track() for the stat to work, but not send summary stats + cache.clear() + s3.track() + self.assertEqual(mocked_thread.call_count, 3) + + @patch("explorer.telemetry.MigrationRecorder.Migration.objects.filter") + def test_get_install_quarter_with_no_migrations(self, mock_filter): + mock_filter.return_value.order_by.return_value.first.return_value = None + result = _get_install_quarter() + self.assertIsNone(result) + + @patch("explorer.telemetry.MigrationRecorder.Migration.objects.filter") + def test_get_install_quarter_edge_cases(self, mock_filter): + # Test edge cases like end of year and start of year + dates = [datetime(2022, 12, 31), datetime(2023, 1, 1), datetime(2023, 3, 31), datetime(2023, 4, 1)] + results = ["Q4-2022", "Q1-2023", "Q1-2023", "Q2-2023"] + + for date, expected in zip(dates, results): + with self.subTest(date=date): + mock_migration = MagicMock() + mock_migration.applied = date + mock_filter.return_value.order_by.return_value.first.return_value = mock_migration + + result = _get_install_quarter() + self.assertEqual(result, expected) diff --git a/explorer/tests/test_tracker.py b/explorer/tests/test_tracker.py deleted file mode 100644 index 0098656c..00000000 --- a/explorer/tests/test_tracker.py +++ /dev/null @@ -1,18 +0,0 @@ -from django.test import TestCase -from explorer.tracker import instance_identifier, gather_summary_stats - - -class TestTracker(TestCase): - - def test_instance_identifier(self): - v = instance_identifier() - - # The SHA-256 hash produces a fixed-length output of 256 bits. - # When represented as a hexadecimal string, each byte (8 bits) is - # represented by 2 hex chars. 256/8*2 = 64 - self.assertEqual(len(v), 64) - - def test_gather_summary_stats(self): - res = gather_summary_stats() - self.assertEqual(res["total_query_count"], 0) - self.assertEqual(res["default_database"], "sqlite") diff --git a/explorer/tests/test_views.py b/explorer/tests/test_views.py index 6940fba4..1b184413 100644 --- a/explorer/tests/test_views.py +++ b/explorer/tests/test_views.py @@ -548,7 +548,7 @@ def test_downloading_from_playground(self): self.assertEqual("text/csv", resp["content-type"]) ql = QueryLog.objects.first() self.assertIn( - 'filename="Playground-%s.csv"' % ql.id, + f'filename="Playground-{ql.id}.csv"', resp["Content-Disposition"] ) diff --git a/explorer/tracker.py b/explorer/tracker.py deleted file mode 100644 index c7d591ae..00000000 --- a/explorer/tracker.py +++ /dev/null @@ -1,125 +0,0 @@ -# Anonymous usage stats -# Opt-out by setting EXPLORER_ENABLE_ANONYMOUS_STATS = False in settings - -import logging -import time -import requests -import json -import threading -import hashlib -from enum import Enum, auto -from django.core.cache import cache -from django.db import connection -from django.db.models import Count -from django.db.migrations.recorder import MigrationRecorder -from django.conf import settings - -import explorer - -logger = logging.getLogger(__name__) - - -def _instance_identifier(): - """ - We need a way of identifying unique instances of explorer to track usage. - There isn"t an established approach I"m aware of, so have come up with - this. We take the timestamp of the first applied migration, and hash it - with the secret key. - """ - - try: - migration = MigrationRecorder.Migration.objects.all().order_by("applied").first() - ts = migration.applied.timestamp() - ts_bytes = str(ts).encode("utf-8") - s_bytes = settings.SECRET_KEY.encode("utf-8") - hashed_value = hashlib.sha256(ts_bytes + s_bytes).hexdigest() - - return hashed_value - except Exception as e: - return "unknown: %s" % e - - -def instance_identifier(): - key = "explorer_instance_identifier" - r = cache.get(key) - if not r: - r = _instance_identifier() - cache.set(key, r, 60 * 60 * 24) - return r - - -class SelfNamedEnum(Enum): - - @staticmethod - def _generate_next_value_(name, start, count, last_values): - return name - - -class StatNames(SelfNamedEnum): - - QUERY_RUN = auto() - QUERY_STREAM = auto() - STARTUP_STATS = auto() - ASSISTANT_RUN = auto() - - -class Stat: - - def __init__(self, name: StatNames, value): - self.instanceId = instance_identifier() - self.time = time.time() - self.value = value - self.name = name.value - - def track(self): - from explorer import app_settings - if app_settings.EXPLORER_ENABLE_ANONYMOUS_STATS: - data = json.dumps(self.__dict__) - thread = threading.Thread(target=_send, args=(data,)) - thread.start() - - -def _send(data): - from explorer import app_settings - try: - requests.post(app_settings.EXPLORER_COLLECT_ENDPOINT_URL, - data=data, - headers={"Content-Type": "application/json"}) - except Exception as e: - logger.warning("Failed to send stats: %s" % e) - - -def gather_summary_stats(): - - from explorer import app_settings - from explorer.models import Query, QueryLog - - try: - ql_stats = QueryLog.objects.aggregate( - total_count=Count("*"), - unique_run_by_user_count=Count("run_by_user_id", distinct=True) - ) - - q_stats = Query.objects.aggregate( - total_count=Count("*"), - unique_connection_count=Count("connection", distinct=True) - ) - - install_date = MigrationRecorder.Migration.objects.all().order_by("applied").first().applied.timestamp() - - return { - "total_log_count": ql_stats["total_count"], - "unique_run_by_user_count": ql_stats["unique_run_by_user_count"], - "total_query_count": q_stats["total_count"], - "unique_connection_count": q_stats["unique_connection_count"], - "default_database": connection.vendor, - "django_install_date": install_date, - "debug": settings.DEBUG, - "tasks_enabled": app_settings.ENABLE_TASKS, - "unsafe_rendering": app_settings.UNSAFE_RENDERING, - "transform_count": len(app_settings.EXPLORER_TRANSFORMS), - "assistant_enabled": app_settings.EXPLORER_AI_API_KEY is not None, - "version": explorer.get_version() - } - except Exception as e: - return {"error": "error gathering stats: %s" % e} diff --git a/explorer/utils.py b/explorer/utils.py index bceed7bc..88dcf0f7 100644 --- a/explorer/utils.py +++ b/explorer/utils.py @@ -62,7 +62,8 @@ def param(name): def swap_params(sql, params): p = params.items() if params else {} for k, v in p: - regex = re.compile(r"\$\$%s(?:\|([^\$\:]+))?(?:\:([^\$]+))?\$\$" % str(k).lower(), re.I) + fmt_k = re.escape(str(k).lower()) + regex = re.compile(rf"\$\${fmt_k}(?:\|([^\$\:]+))?(?:\:([^\$]+))?\$\$", re.I) sql = regex.sub(str(v), sql) return sql diff --git a/explorer/views/stream.py b/explorer/views/stream.py index 17e14de4..2c20196a 100644 --- a/explorer/views/stream.py +++ b/explorer/views/stream.py @@ -4,7 +4,7 @@ from explorer.models import Query from explorer.views.auth import PermissionRequiredMixin from explorer.views.export import _export -from explorer.tracker import Stat, StatNames +from explorer.telemetry import Stat, StatNames class StreamQueryView(PermissionRequiredMixin, View):