history

chrisclark · Apr 25, 2024 · 9a81ca3 · 9a81ca3
1 parent 891310d
commit 9a81ca3
Show file tree

Hide file tree

Showing 16 changed files with 314 additions and 185 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -5,6 +5,14 @@ Change Log
 This document records all notable changes to `django-sql-explorer <https://github.com/chrisclark/django-sql-explorer>`_.
 This project adheres to `Semantic Versioning <https://semver.org/>`_.
 
+`4.1.0b1`_ (2024-04-25)
+===========================
+* `#609`_: Tracking should be opt-in and not use the SECRET_KEY
+* `#610`_: Import error (sql_metadata) with 4.1 version
+* `#612`_: Accessing the database during app initialization
+* Regex-injection vulnerability
+* Better anonymization for telemetry
+
 `4.1.0`_ (2024-04-23)
 ===========================
 * SQL Assistant: Built in query help via OpenAI (or LLM of choice), with relevant schema

diff --git a/docs/settings.rst b/docs/settings.rst
@@ -338,11 +338,11 @@ but a dotted path to a python view can be used
 
    EXPLORER_NO_PERMISSION_VIEW = 'explorer.views.auth.safe_login_view_wrapper'
 
-Anonymous Usage Stat Collection
-*******************************
+Anonymous Telemetry Collection
+******************************
 
 By default, anonymous usage statistics are collected. To disable this, set the following setting to False.
-You can see what is being collected in tracker.py.
+You can see what is being collected in telemetry.py.
 
 .. code-block:: python
 

diff --git a/explorer/__init__.py b/explorer/__init__.py
@@ -1,9 +1,9 @@
 __version_info__ = {
     "major": 4,
-    "minor": 1,
+    "minor": 2,
     "patch": 0,
-    "releaselevel": "final",
-    "serial": 0
+    "releaselevel": "beta",
+    "serial": 1
 }
 
 

diff --git a/explorer/actions.py b/explorer/actions.py
@@ -33,10 +33,11 @@ def _package(queries):
     is_one = len(queries) == 1
     name_root = lambda n: f"attachment; filename={n}"  # noqa
     ret["content_type"] = (is_one and "text/csv") or "application/zip"
-
+    formatted = queries[0].title.replace(",", "")
+    day = date.today()
     ret["filename"] = (
-        is_one and name_root("%s.csv" % queries[0].title.replace(",", ""))
-    ) or name_root("Report_%s.zip" % date.today())
+        is_one and name_root(f"{formatted}.csv")
+    ) or name_root(f"Report_{day}.zip")
 
     ret["data"] = (
         is_one and CSVExporter(queries[0]).get_output()

diff --git a/explorer/apps.py b/explorer/apps.py
@@ -1,7 +1,6 @@
 from django.apps import AppConfig
 from django.core.exceptions import ImproperlyConfigured
 from django.db import connections as djcs
-from django.db.utils import DatabaseError
 from django.utils.translation import gettext_lazy as _
 
 
@@ -15,7 +14,6 @@ def ready(self):
         from explorer.schema import build_async_schemas
         _validate_connections()
         build_async_schemas()
-        track_summary_stats()
 
 
 def _get_default():
@@ -44,25 +42,3 @@ def _validate_connections():
                 f"EXPLORER_CONNECTIONS contains ({name}, {conn_name}), "
                 f"but {conn_name} is not a valid Django DB connection."
             )
-
-
-def track_summary_stats():
-    from explorer.tracker import Stat, StatNames
-    from explorer.tracker import gather_summary_stats
-    from explorer.models import Query
-
-    # Django doesn't actually have a way of running code on application initialization, so we have come up with this.
-    # The app.ready() method (the call site for this function) is invoked *before* any migrations are run. So if were
-    # to just call this function in ready(), without the try: block, then it would always fail the very first time
-    # Django runs (and e.g. in test runs) because no tables have yet been created. The intuitive way to handle this with
-    # Django would be to tie into the post_migrate signal in ready() and run this function on post_migrate. But that
-    # doesn't work because that signal is only called if indeed a migrations has been applied. If the app restarts and
-    # there are no new migrations, the signal never fires. So instead we check if the Query table exists, and if it
-    # does, we're good to gather stats.
-    try:
-        Query.objects.first()
-    except DatabaseError:
-        return
-    else:
-        payload = gather_summary_stats()
-        Stat(StatNames.STARTUP_STATS, payload).track()
diff --git a/explorer/assistant/utils.py b/explorer/assistant/utils.py
@@ -1,18 +1,15 @@
 from explorer import app_settings
 from explorer.schema import schema_info
 from explorer.utils import get_valid_connection
-from sql_metadata import Parser
 from django.db.utils import OperationalError
 
-if app_settings.EXPLORER_AI_API_KEY:
-    import tiktoken
-    from openai import OpenAI
 
 OPENAI_MODEL = app_settings.EXPLORER_ASSISTANT_MODEL["name"]
 ROW_SAMPLE_SIZE = 2
 
 
 def openai_client():
+    from openai import OpenAI
     return OpenAI(
         api_key=app_settings.EXPLORER_AI_API_KEY,
         base_url=app_settings.EXPLORER_ASSISTANT_BASE_URL
@@ -73,6 +70,7 @@ def format_rows_from_table(rows):
 
 
 def get_table_names_from_query(sql):
+    from sql_metadata import Parser
     if sql:
         try:
             parsed = Parser(sql)
@@ -84,6 +82,7 @@ def get_table_names_from_query(sql):
 
 def num_tokens_from_string(string: str) -> int:
     """Returns the number of tokens in a text string."""
+    import tiktoken
     encoding = tiktoken.encoding_for_model(OPENAI_MODEL)
     num_tokens = len(encoding.encode(string))
     return num_tokens

diff --git a/explorer/assistant/views.py b/explorer/assistant/views.py
@@ -3,7 +3,7 @@
 from django.views.decorators.http import require_POST
 import json
 
-from explorer.tracker import Stat, StatNames
+from explorer.telemetry import Stat, StatNames
 from explorer.utils import get_valid_connection
 from explorer.assistant.models import PromptLog
 from explorer.assistant.prompts import primary_prompt

diff --git a/explorer/migrations/0015_explorervalue.py b/explorer/migrations/0015_explorervalue.py
@@ -0,0 +1,21 @@
+# Generated by Django 4.2.8 on 2024-04-25 13:34
+
+from django.db import migrations, models
+
+
+# Schema migration that adds the ExplorerValue model: an auto-created ``id``,
+# a choice-constrained ``key`` (max 5 characters) and a nullable text ``value``.
+class Migration(migrations.Migration):
+
+    # Must be applied after the explorer migration that precedes it.
+    dependencies = [
+        ('explorer', '0014_promptlog'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='ExplorerValue',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('key', models.CharField(choices=[('UUID', 'Install Unique ID'), ('SMLS', 'Startup metric last send')], max_length=5)),
+                ('value', models.TextField(blank=True, null=True)),
+            ],
+        ),
+    ]
diff --git a/explorer/models.py b/explorer/models.py
@@ -1,5 +1,6 @@
 import logging
 from time import time
+import uuid
 
 from django.conf import settings
 from django.core.exceptions import ValidationError
@@ -8,7 +9,7 @@
 from django.utils.translation import gettext_lazy as _
 
 from explorer import app_settings
-from explorer.tracker import Stat, StatNames
+from explorer.telemetry import Stat, StatNames
 from explorer.utils import (
     extract_params, get_params_for_url, get_s3_bucket, get_valid_connection, passes_blacklist, s3_url,
     shared_dict_update, swap_params,
@@ -393,3 +394,50 @@ def stats(self):
 
     def __str__(self):
         return str(self._header)
+
+
+class ExplorerValueManager(models.Manager):
+    """Manager with helpers for the well-known ``ExplorerValue`` rows."""
+
+    def get_uuid(self):
+        """Return the install's unique ID, creating it on first use.
+
+        A new UUID4 string is generated and stored when the row does not
+        exist yet, or when it exists with a blank (``None`` or empty) value.
+        """
+        uuid_obj, created = self.get_or_create(
+            key=ExplorerValue.INSTALL_UUID,
+            defaults={"value": str(uuid.uuid4())}
+        )
+        # A freshly created row already holds the UUID from ``defaults``, so
+        # only a pre-existing row with a blank value needs to be populated.
+        if not created and not uuid_obj.value:
+            uuid_obj.value = str(uuid.uuid4())
+            uuid_obj.save()
+        return uuid_obj.value
+
+    def get_startup_last_send(self):
+        """Return the stored Unix timestamp as a float, or ``None``.
+
+        ``None`` is returned when the row is missing or its value is blank.
+        """
+        try:
+            timestamp = self.get(key=ExplorerValue.STARTUP_METRIC_LAST_SEND).value
+        except ExplorerValue.DoesNotExist:
+            return None
+        return float(timestamp) if timestamp else None
+
+    def set_startup_last_send(self, ts):
+        """Store ``str(ts)`` as the last-send timestamp, creating the row if needed."""
+        self.update_or_create(
+            key=ExplorerValue.STARTUP_METRIC_LAST_SEND,
+            defaults={"value": str(ts)}
+        )
+
+
+# Key/value storage model; the permitted keys are the constants listed in
+# EXPLORER_SETTINGS_CHOICES below.
+class ExplorerValue(models.Model):
+    # Short codes stored in the ``key`` column (must fit max_length=5).
+    INSTALL_UUID = "UUID"
+    STARTUP_METRIC_LAST_SEND = "SMLS"
+    EXPLORER_SETTINGS_CHOICES = [
+        (INSTALL_UUID, "Install Unique ID"),
+        (STARTUP_METRIC_LAST_SEND, "Startup metric last send"),
+    ]
+
+    # NOTE(review): ``key`` is not declared unique, yet ExplorerValueManager looks
+    # rows up by key with get()/get_or_create() - confirm duplicates cannot occur.
+    key = models.CharField(max_length=5, choices=EXPLORER_SETTINGS_CHOICES)
+    # Free-form text payload; may be NULL or blank.
+    value = models.TextField(null=True, blank=True)
+
+    objects = ExplorerValueManager()
diff --git a/explorer/telemetry.py b/explorer/telemetry.py
@@ -0,0 +1,152 @@
+# Anonymous usage stats
+# Opt-out by setting EXPLORER_ENABLE_ANONYMOUS_STATS = False in settings
+
+import logging
+import time
+import requests
+import json
+import threading
+from enum import Enum, auto
+from django.core.cache import cache
+from django.db import connection
+from django.db.models import Count
+from django.db.migrations.recorder import MigrationRecorder
+from django.conf import settings
+
+logger = logging.getLogger(__name__)
+
+
+def instance_identifier():
+    """Return this install's unique ID, cached for one day.
+
+    On a cache miss (or a falsy cached value) the ID is read via
+    ``ExplorerValue.objects.get_uuid()`` and written back to the cache.
+    """
+    from explorer.models import ExplorerValue
+    cache_key = "explorer_instance_identifier"
+    one_day = 60 * 60 * 24
+
+    identifier = cache.get(cache_key)
+    if identifier:
+        return identifier
+
+    identifier = ExplorerValue.objects.get_uuid()
+    cache.set(cache_key, identifier, one_day)
+    return identifier
+
+
+# Enum base class whose auto() members take their own name as their value.
+class SelfNamedEnum(Enum):
+
+    @staticmethod
+    def _generate_next_value_(name, start, count, last_values):
+        # Hook consulted by enum.auto(); returning ``name`` makes each
+        # auto-valued member's value equal to the member's name.
+        return name
+
+
+# Names of the telemetry events. Because of SelfNamedEnum, each member's value
+# is its own name (e.g. StatNames.QUERY_RUN.value == "QUERY_RUN").
+class StatNames(SelfNamedEnum):
+
+    QUERY_RUN = auto()
+    QUERY_STREAM = auto()
+    # Stat treats this name as the "summary" stat (see Stat.is_summary).
+    STARTUP_STATS = auto()
+    ASSISTANT_RUN = auto()
+
+
+class Stat:
+    """A single telemetry data point tagged with the install ID and a timestamp.
+
+    The instance ``__dict__`` (instanceId, time, value, name) is what gets
+    serialized to JSON and sent by ``track()``.
+    """
+
+    STAT_COLLECTION_INTERVAL = 60 * 10  # Ten minutes
+    STARTUP_STAT_COLLECTION_INTERVAL = 60 * 60 * 24 * 7  # A week
+
+    def __init__(self, name: StatNames, value):
+        """Capture the install ID, the current time, the payload and the stat name."""
+        self.instanceId = instance_identifier()
+        self.time = time.time()
+        self.value = value
+        self.name = name.value
+
+    @property
+    def is_summary(self):
+        """True when this stat is the STARTUP_STATS summary."""
+        return self.name == StatNames.STARTUP_STATS.value
+
+    def should_send_summary_stats(self):
+        """Return True if summary stats were never sent or were last sent a week or more ago."""
+        from explorer.models import ExplorerValue
+        last_send = ExplorerValue.objects.get_startup_last_send()
+        if not last_send:
+            return True
+        return self.time - last_send >= self.STARTUP_STAT_COLLECTION_INTERVAL
+
+    def send_summary_stats(self):
+        """Gather and track the summary stats, then record this stat's time as the last send."""
+        from explorer.models import ExplorerValue
+        payload = _gather_summary_stats()
+        Stat(StatNames.STARTUP_STATS, payload).track()
+        ExplorerValue.objects.set_startup_last_send(self.time)
+
+    def track(self):
+        """Send this stat in a background thread, subject to throttling.
+
+        Does nothing when EXPLORER_ENABLE_ANONYMOUS_STATS is falsy. Non-summary
+        stats are sent at most once per STAT_COLLECTION_INTERVAL; summary
+        stats bypass that throttle.
+        """
+        from explorer import app_settings
+        if not app_settings.EXPLORER_ENABLE_ANONYMOUS_STATS:
+            return
+
+        cache_key = "last_stat_sent_time"
+        last_sent_time = cache.get(cache_key, 0)
+        # Summary stats are tracked with a different time interval
+        if self.is_summary or self.time - last_sent_time >= self.STAT_COLLECTION_INTERVAL:
+            data = json.dumps(self.__dict__)
+            thread = threading.Thread(target=_send, args=(data,))
+            thread.start()
+            # Give the entry an explicit lifetime. Without one it falls back to
+            # the cache backend's default TIMEOUT (300 seconds unless configured),
+            # so the marker would vanish before the interval has elapsed.
+            cache.set(cache_key, self.time, self.STAT_COLLECTION_INTERVAL)
+
+        # Every time we send any tracking, see if we have recently sent overall summary stats
+        # Of course, sending the summary stats calls .track(), so we need to NOT call track()
+        # again if we are in fact already in the process of sending summary stats. Otherwise,
+        # we will end up in infinite recursion of track() calls.
+        if not self.is_summary and self.should_send_summary_stats():
+            self.send_summary_stats()
+
+
+def _send(data):
+    """POST the JSON-encoded ``data`` to the collection endpoint, best effort.
+
+    Any failure is logged as a warning and otherwise ignored, so telemetry
+    can never break the calling code.
+    """
+    from explorer import app_settings
+    try:
+        # Always bound the request: without a timeout, an unresponsive endpoint
+        # would keep the sender thread blocked indefinitely.
+        requests.post(app_settings.EXPLORER_COLLECT_ENDPOINT_URL,
+                      data=data,
+                      headers={"Content-Type": "application/json"},
+                      timeout=10)
+    except Exception as e:
+        logger.warning(f"Failed to send stats: {e}")
+
+
+def _get_install_quarter():
+    """Return the quarter of the earliest applied explorer migration.
+
+    The result is formatted like ``"Q2-2024"``; ``None`` is returned when no
+    explorer migration has been recorded.
+    """
+    earliest = (
+        MigrationRecorder.Migration.objects
+        .filter(app="explorer")
+        .order_by("applied")
+        .first()
+    )
+    if earliest is None:
+        return None
+
+    applied = earliest.applied
+    quarter = (applied.month - 1) // 3 + 1  # Months 1-3 -> 1, 4-6 -> 2, ...
+    return f"Q{quarter}-{applied.year}"
+
+
+def _gather_summary_stats():
+    """Build the payload of coarse, rounded install-level statistics.
+
+    Returns a dict of bucketed counts and settings flags, or a dict with a
+    single ``"error"`` key if anything raises while gathering them.
+    """
+    from explorer import app_settings
+    from explorer.models import Query, QueryLog
+    import explorer
+
+    try:
+        log_agg = QueryLog.objects.aggregate(
+            total_count=Count("*"),
+            unique_run_by_user_count=Count("run_by_user_id", distinct=True)
+        )
+        query_agg = Query.objects.aggregate(
+            total_count=Count("*"),
+            unique_connection_count=Count("connection", distinct=True)
+        )
+
+        # Round the counts (to the nearest 10 or 5) to provide additional anonymity
+        summary = {}
+        summary["total_log_count"] = round(log_agg["total_count"] * 0.1) * 10
+        summary["unique_run_by_user_count"] = round(log_agg["unique_run_by_user_count"] * 0.2) * 5
+        summary["total_query_count"] = round(query_agg["total_count"] * 0.1) * 10
+        summary["unique_connection_count"] = round(query_agg["unique_connection_count"] * 0.2) * 5
+        summary["default_database"] = connection.vendor
+        summary["explorer_install_quarter"] = _get_install_quarter()
+        summary["debug"] = settings.DEBUG
+        summary["tasks_enabled"] = app_settings.ENABLE_TASKS
+        summary["unsafe_rendering"] = app_settings.UNSAFE_RENDERING
+        summary["transform_count"] = len(app_settings.EXPLORER_TRANSFORMS)
+        summary["assistant_enabled"] = app_settings.EXPLORER_AI_API_KEY is not None
+        summary["version"] = explorer.get_version()
+        summary["charts_enabled"] = app_settings.EXPLORER_CHARTS_ENABLED
+        return summary
+    except Exception as exc:
+        return {"error": f"error gathering stats: {exc}"}