From dfea2ffc8a13cec28b9b393875c85262a00c153e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 22 Feb 2024 11:54:58 -0500 Subject: [PATCH 01/49] Start adding backend support to start and stop qa crawl jobs --- backend/btrixcloud/crawlmanager.py | 27 +++- backend/btrixcloud/crawls.py | 87 +++++++++++++ backend/btrixcloud/k8sapi.py | 49 +++++++- backend/btrixcloud/models.py | 31 +++++ chart/app-templates/crawl_qa_job.yaml | 28 +++++ chart/app-templates/crawler_qa.yaml | 171 ++++++++++++++++++++++++++ 6 files changed, 386 insertions(+), 7 deletions(-) create mode 100644 chart/app-templates/crawl_qa_job.yaml create mode 100644 chart/app-templates/crawler_qa.yaml diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index a8fcfd8344..b6ffcac17e 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -177,6 +177,25 @@ async def create_crawl_job( warc_prefix=warc_prefix, ) + async def create_crawl_qa_job( + self, + oid: UUID, + crawl_id: str, + storage: StorageRef, + qa_source: str, + userid: str, + crawler_image: str, + profile_filename: str = None, + ) -> str: + """create new crawl qa job""" + storage_secret = storage.get_storage_secret_name(str(oid)) + + await self.has_storage_secret(storage_secret) + + return await self.new_crawl_qa_job( + userid, crawl_id, oid, storage, qa_source, crawler_image, profile_filename + ) + async def update_crawl_config( self, crawlconfig: CrawlConfig, update: UpdateCrawlConfig, profile_filename=None ) -> bool: @@ -293,14 +312,16 @@ async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict: """Set the crawl scale (job parallelism) on the specified job""" return await self._patch_job(crawl_id, {"scale": scale}) - async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict: + async def shutdown_crawl( + self, crawl_id: str, graceful=True, qa_crawl=False + ) -> dict: """Request a crawl cancelation or stop by calling an API on the job pod/container, returning the result""" if graceful: patch = {"stopping": True} - return await self._patch_job(crawl_id, patch) + return await self._patch_job(crawl_id, patch, qa_crawl=qa_crawl) - return await self.delete_crawl_job(crawl_id) + return await self.delete_crawl_job(crawl_id, qa_crawl=qa_crawl) async def delete_crawl_configs_for_org(self, org: str) -> None: """Delete all crawl configs for given org""" diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 74897c63ab..a1d35ad293 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -5,6 +5,7 @@ import json import re import urllib.parse +from datetime import datetime from uuid import UUID from typing import Optional, List, Dict, Union @@ -31,6 +32,8 @@ PaginatedResponse, RUNNING_AND_STARTING_STATES, ALL_CRAWL_STATES, + QACrawl, + QACrawlIn, ) @@ -553,6 +556,76 @@ async def get_crawl_stats( return crawls_data + async def add_new_qa_crawl(self, qa_crawl_id, crawl_id, user): + """Add QA crawl subdoc to crawl document""" + qa_crawl = QACrawl( + id=qa_crawl_id, + started=datetime.now(), + userid=user.id, + userName=user.name, + state="starting", + image=crawler_image, + ) + try: + await self.crawls.update_one( + {"_id": "crawl_id"}, {"$push": {"qa": qa_crawl.dict()}} + ) + # pylint: disable=broad-exception-caught + except Exception as exc: + print( + f"Error adding QA Crawl {qa_crawl_id} to crawl doc {crawl_id}", + flush=True, + ) + + async def start_crawl_qa_job( + self, qa_crawl_in: QACrawlIn, crawl_id: str, org: Organization, user: User + ): + 
"""Start crawl QA job + + TODO: Do we need a configmap here?""" + crawler_image = self.crawl_configs.get_channel_crawler_image( + qa_crawl_in.crawlerChannel + ) + if not crawler_image: + raise HTTPException(status_code=404, detail="crawler_image_not_found") + + if await self.orgs.storage_quota_reached(org.id): + raise HTTPException(status_code=403, detail="storage_quota_reached") + + if await self.orgs.exec_mins_quota_reached(org.id): + raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") + + crawl = await self.get_crawl(crawl_id, org) + if not crawl.files: + raise HTTPException(status_code=404, detail="crawl_files_not_found") + + # TODO: For now, let's just use the first WACZ. Eventually we'll want to + # pass them all in as an array or a multi-WACZ + qa_source = crawl.files[0] + + try: + qa_crawl_id = await self.crawl_manager.create_crawl_qa_job( + org.id, + crawl_id, + org.storage, + qa_source.filename, + str(user.id), + crawler_image, + qa_crawl_in.profileFilename, + ) + await self.add_new_qa_crawl(qa_crawl_id, crawl_id, user, crawler_image) + return qa_crawl_id + + except Exception as exc: + # pylint: disable=raise-missing-from + raise HTTPException( + status_code=500, detail=f"Error starting QA crawl: {exc}" + ) + + async def stop_crawl_qa_job(self, qa_crawl_id: str, org: Organization): + """Stop crawl QA job""" + return await self.crawl_manager.shutdown_crawl(qa_crawl_id) + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id): @@ -738,6 +811,20 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)): async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_crawl(crawl_id, org, "crawl") + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["crawls", "qa"]) + async def start_crawl_qa_job( + qa_crawl_in: QACrawlIn, + crawl_id: str, + org: Organization = Depends(org_crawl_dep), + user: User = Depends(user_dep), + ): + qa_crawl_id = await ops.start_crawl_qa_job(qa_crawl_in, crawl_id, org, user) + return {"started": qa_crawl_id} + + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["crawls", "qa"]) + async def stop_crawl_qa_job(crawl_id, org: Organization = Depends(org_crawl_dep)): + return await ops.stop_crawl_qa_job(crawl_id, org) + @app.get( "/orgs/all/crawls/{crawl_id}", tags=["crawls"], diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 73663934e6..290cf5ce0c 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -2,6 +2,7 @@ import os import traceback +from uuid import uuid4 import yaml @@ -120,6 +121,36 @@ async def new_crawl_job(self, *args, **kwargs) -> str: return crawl_id + # pylint: disable=too-many-arguments, too-many-locals + def new_crawl_qa_job_yaml( + self, userid, crawl_id, oid, storage, qa_source, crawler_image, profile_filename + ): + """load job template from yaml""" + qa_crawl_id = uuid4() + + params = { + "id": str(qa_crawl_id), + "crawl_id": crawl_id, + "oid": oid, + "userid": userid, + "storage_name": str(storage), + "crawler_image": crawler_image, + "profile_filename": profile_filename, + "qa_source": qa_source, + } + + data = self.templates.env.get_template("crawl_qa_job.yaml").render(params) + return qa_crawl_id, data + + async def new_crawl_qa_job(self, *args, **kwargs) -> str: + """load and init qa crawl job via k8s api""" + qa_crawl_id, data = self.new_qa_crawl_job_yaml(*args, **kwargs) + + # create job directly + await 
self.create_from_yaml(data) + + return qa_crawl_id + async def create_from_yaml(self, doc, namespace=None): """init k8s objects from yaml""" yml_document_all = yaml.safe_load_all(doc) @@ -170,15 +201,19 @@ async def has_storage_secret(self, storage_secret) -> bool: status_code=400, detail="invalid_config_missing_storage_secret" ) - async def delete_crawl_job(self, crawl_id): + async def delete_crawl_job(self, crawl_id, qa_crawl=False): """delete custom crawljob object""" try: + name = f"crawljob-{crawl_id}" + if qa_crawl: + name = f"qa-{name}" + await self.custom_api.delete_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, plural="crawljobs", - name=f"crawljob-{crawl_id}", + name=name, grace_period_seconds=0, # delete as background to allow operator to do proper cleanup propagation_policy="Background", @@ -215,7 +250,9 @@ async def get_profile_browser(self, browserid): name=f"profilejob-{browserid}", ) - async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict: + async def _patch_job( + self, crawl_id, body, pluraltype="crawljobs", qa_crawl=False + ) -> dict: content_type = self.api_client.default_headers.get("Content-Type") try: @@ -223,12 +260,16 @@ async def _patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict: "Content-Type", "application/merge-patch+json" ) + name = f"{pluraltype[:-1]}-{crawl_id}" + if qa_crawl: + name = f"qa-{name}" + await self.custom_api.patch_namespaced_custom_object( group="btrix.cloud", version="v1", namespace=self.namespace, plural=pluraltype, - name=f"{pluraltype[:-1]}-{crawl_id}", + name=name, body={"spec": body}, ) return {"success": True} diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index ad914f09b6..c8fdbad636 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -655,6 +655,35 @@ class CrawlScale(BaseModel): scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1 # type: ignore +# ============================================================================ +class QACrawl(BaseModel): + """Subdocument to track QA runs for given crawl""" + + id: UUID + started: datetime + finished: Optional[datetime] = None + + userid: UUID + userName: Optional[str] + + state: str + stopping: Optional[bool] = False + + image: Optional[str] = None + + crawlExecSeconds: int = 0 + + files: Optional[List[CrawlFile]] = [] + + +# ============================================================================ +class QACrawlIn(BaseModel): + """Input model for QA crawls""" + + crawlerChannel: str = "default" + profileFilename: Optional[str] = None + + # ============================================================================ class Crawl(BaseCrawl, CrawlConfigCore): """Store State of a Crawl (Finished or Running)""" @@ -676,6 +705,8 @@ class Crawl(BaseCrawl, CrawlConfigCore): image: Optional[str] + qa: Optional[List[QACrawl]] = [] + # ============================================================================ class CrawlCompleteIn(BaseModel): diff --git a/chart/app-templates/crawl_qa_job.yaml b/chart/app-templates/crawl_qa_job.yaml new file mode 100644 index 0000000000..83f7e85292 --- /dev/null +++ b/chart/app-templates/crawl_qa_job.yaml @@ -0,0 +1,28 @@ +apiVersion: btrix.cloud/v1 +kind: CrawlJob +metadata: + name: qa-crawljob-{{ id }} + labels: + id: "{{ id }}" + crawlId: "{{ crawl_id }}" + role: "job" + btrix.org: "{{ oid }}" + btrix.user: "{{ userid }}" + btrix.storage: "{{ storage_name }}" + +spec: + selector: + matchLabels: + qaCrawl: "{{ id }}" + + id: "{{ id }}" + 
crawlId: "{{ crawl_id }}" + userid: "{{ userid }}" + oid: "{{ oid }}" + crawlerImage: "{{ crawler_image }}" + profileFilename: "{{ profile_filename }}" + qaSource: "{{ qa_source }}" + ttlSecondsAfterFinished: 30 + + storageName: "{{ storage_name }}" + diff --git a/chart/app-templates/crawler_qa.yaml b/chart/app-templates/crawler_qa.yaml new file mode 100644 index 0000000000..63d84faec4 --- /dev/null +++ b/chart/app-templates/crawler_qa.yaml @@ -0,0 +1,171 @@ +# ------- +# PVC +# ------- + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: qa-{{ name }} + namespace: {{ namespace }} + labels: + crawl-qa: {{ id }} + role: crawler + +spec: + accessModes: + - ReadWriteOnce + + resources: + requests: + storage: {{ crawler_storage }} + + {% if volume_storage_class %} + storageClassName: {{ volume_storage_class }} + {% endif %} + + + +# ------- +# CRAWLER +# ------- +{% if not do_restart %} +--- +apiVersion: v1 +kind: Pod +metadata: + name: {{ name }} + namespace: {{ namespace }} + labels: + crawl-qa: {{ id }} + role: crawler + +spec: + hostname: {{ name }} + subdomain: crawler + + {% if priorityClassName %} + priorityClassName: {{ priorityClassName }} + {% endif %} + + restartPolicy: OnFailure + + terminationGracePeriodSeconds: {{ termination_grace_secs }} + volumes: + - name: crawl-data + persistentVolumeClaim: + claimName: {{ name }} + + affinity: +{% if crawler_node_type %} + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nodeType + operator: In + values: + - "{{ crawler_node_type }}" +{% endif %} + + podAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 10 + podAffinityTerm: + topologyKey: "kubernetes.io/hostname" + labelSelector: + matchExpressions: + - key: crawl + operator: In + values: + - {{ id }} + + tolerations: + - key: nodeType + operator: Equal + value: crawling + effect: NoSchedule + - key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + effect: NoExecute + - key: node.kubernetes.io/unreachable + operator: Exists + effect: NoExecute + tolerationSeconds: 300 + + containers: + - name: crawler + image: {{ crawler_image }} + imagePullPolicy: {{ crawler_image_pull_policy }} + command: + - qa + - --qaSource + - {{ qa_source }} + - --redisStoreUrl + - {{ redis_url }} + {%- if profile_filename %} + - --profile + - "@{{ profile_filename }}" + {%- endif %} + + volumeMounts: + - name: crawl-data + mountPath: /crawls + + envFrom: + - configMapRef: + name: shared-crawler-config + + - secretRef: + name: {{ storage_secret }} + + {% if signing_secret %} + - secretRef: + name: {{ signing_secret }} + {% endif %} + + env: + - name: CRAWL_QA_ID + value: "{{ id }}" + + - name: WEBHOOK_URL + value: "{{ redis_url }}/crawls-done" + + - name: STORE_PATH + value: "{{ storage_path }}" + + - name: STORE_FILENAME + value: "{{ storage_filename }}" + + - name: STORE_USER + value: "{{ userid }}" + + {% if crawler_socks_proxy_host %} + - name: SOCKS_HOST + value: "{{ crawler_socks_proxy_host }}" + {% if crawler_socks_proxy_port %} + - name: SOCKS_PORT + value: "{{ crawler_socks_proxy_port }}" + {% endif %} + {% endif %} + + resources: + limits: + memory: "{{ memory }}" + + requests: + cpu: "{{ cpu }}" + memory: "{{ memory }}" + + {% if crawler_liveness_port and crawler_liveness_port != '0' %} + livenessProbe: + httpGet: + path: /healthz + port: {{ crawler_liveness_port }} + + initialDelaySeconds: 15 + periodSeconds: 120 + failureThreshold: 3 + {% endif %} + +{% endif %} From 
e8f0da35e441a1e4766f1f1d3470b89074e20f72 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 22 Feb 2024 17:41:49 -0500 Subject: [PATCH 02/49] Fixups --- backend/btrixcloud/crawlmanager.py | 1 + backend/btrixcloud/crawls.py | 8 ++++---- backend/btrixcloud/k8sapi.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index b6ffcac17e..a1eb368181 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -4,6 +4,7 @@ import asyncio import secrets import json +from uuid import UUID from typing import Optional, Dict from datetime import timedelta diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index a1d35ad293..c27f118a72 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -556,7 +556,7 @@ async def get_crawl_stats( return crawls_data - async def add_new_qa_crawl(self, qa_crawl_id, crawl_id, user): + async def add_new_qa_crawl(self, qa_crawl_id, crawl_id, user, crawler_image): """Add QA crawl subdoc to crawl document""" qa_crawl = QACrawl( id=qa_crawl_id, @@ -573,7 +573,7 @@ async def add_new_qa_crawl(self, qa_crawl_id, crawl_id, user): # pylint: disable=broad-exception-caught except Exception as exc: print( - f"Error adding QA Crawl {qa_crawl_id} to crawl doc {crawl_id}", + f"Error adding QA Crawl {qa_crawl_id} to crawl doc {crawl_id}: {exc}", flush=True, ) @@ -622,7 +622,7 @@ async def start_crawl_qa_job( status_code=500, detail=f"Error starting QA crawl: {exc}" ) - async def stop_crawl_qa_job(self, qa_crawl_id: str, org: Organization): + async def stop_crawl_qa_job(self, qa_crawl_id: str): """Stop crawl QA job""" return await self.crawl_manager.shutdown_crawl(qa_crawl_id) @@ -823,7 +823,7 @@ async def start_crawl_qa_job( @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["crawls", "qa"]) async def stop_crawl_qa_job(crawl_id, org: Organization = Depends(org_crawl_dep)): - return await ops.stop_crawl_qa_job(crawl_id, org) + return await ops.stop_crawl_qa_job(crawl_id) @app.get( "/orgs/all/crawls/{crawl_id}", diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 290cf5ce0c..e9a88264da 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -144,7 +144,7 @@ def new_crawl_qa_job_yaml( async def new_crawl_qa_job(self, *args, **kwargs) -> str: """load and init qa crawl job via k8s api""" - qa_crawl_id, data = self.new_qa_crawl_job_yaml(*args, **kwargs) + qa_crawl_id, data = self.new_crawl_qa_job_yaml(*args, **kwargs) # create job directly await self.create_from_yaml(data) From 5dbab3f09a329cba24e3fe632bff13e1947c3fab Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 16:09:37 -0500 Subject: [PATCH 03/49] WIP: Add replay.json endpoint for QA crawls --- backend/btrixcloud/basecrawls.py | 21 ++++++++++++----- backend/btrixcloud/crawls.py | 40 ++++++++++++++++++++++++++++++++ backend/btrixcloud/models.py | 7 ++++++ 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 22b71073d1..32ad4607f7 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -124,12 +124,12 @@ async def get_crawl_raw( return res - async def _files_to_resources(self, files, org, crawlid): + async def _files_to_resources(self, files, org, crawlid, qa: bool = False): if not files: return [] crawl_files = [CrawlFile(**data) for data in files] - return await 
self._resolve_signed_urls(crawl_files, org, crawlid) + return await self._resolve_signed_urls(crawl_files, org, qa, crawlid) async def get_crawl( self, @@ -432,7 +432,11 @@ async def _resolve_crawl_refs( return crawl async def _resolve_signed_urls( - self, files: List[CrawlFile], org: Organization, crawl_id: Optional[str] = None + self, + files: List[CrawlFile], + org: Organization, + qa: bool = False, + crawl_id: Optional[str] = None, ): if not files: print("no files") @@ -451,12 +455,17 @@ async def _resolve_signed_urls( presigned_url = await self.storage_ops.get_presigned_url( org, file_, self.presign_duration_seconds ) + + prefix = "files" + if qa: + prefix = f"qa.{prefix}" + await self.crawls.find_one_and_update( - {"files.filename": file_.filename}, + {f"{prefix}.filename": file_.filename}, { "$set": { - "files.$.presignedUrl": presigned_url, - "files.$.expireAt": exp, + f"{prefix}.$.presignedUrl": presigned_url, + f"{prefix}.$.expireAt": exp, } }, ) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index c27f118a72..62f0c30cf2 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -34,6 +34,7 @@ ALL_CRAWL_STATES, QACrawl, QACrawlIn, + QACrawlWithResources, ) @@ -626,6 +627,26 @@ async def stop_crawl_qa_job(self, qa_crawl_id: str): """Stop crawl QA job""" return await self.crawl_manager.shutdown_crawl(qa_crawl_id) + async def get_qa_crawl( + self, crawl_id: str, qa_crawl_id: str, org: Optional[Organization] = None + ) -> QACrawlWithResources: + """Fetch QA Crawl""" + crawl_raw = await self.get_crawl(crawl_id, org, "crawl") + qa_crawl = None + for qa_run in crawl_raw.get("qa", []): + if qa_run.get("id") == qa_crawl_id: + qa_crawl = qa_run + + if not qa_crawl: + raise HTTPException(status_code=404, detail="crawl_qa_not_found") + + qa_crawl["resources"] = await self._files_to_resources( + qa_crawl.get("files"), org, None, qa=True + ) + qa_crawl.pop("files", None) + + return QACrawlWithResources(**qa_crawl) + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id): @@ -811,6 +832,25 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)): async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_crawl(crawl_id, org, "crawl") + @app.get( + "/orgs/all/crawls/{crawl_id}/qa/{qa_crawl_id}/replay.json", + tags=["crawls"], + response_model=QACrawlWithResources, + ) + async def get_qa_crawl_admin(crawl_id, user: User = Depends(user_dep)): + if not user.is_superuser: + raise HTTPException(status_code=403, detail="Not Allowed") + + return await ops.get_qa_crawl(crawl_id, qa_crawl_id) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_crawl_id}/replay.json", + tags=["crawls"], + response_model=QACrawlWithResources, + ) + async def get_qa_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_qa_crawl(crawl_id, qa_crawl_id, org) + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["crawls", "qa"]) async def start_crawl_qa_job( qa_crawl_in: QACrawlIn, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index c8fdbad636..95e1b335bd 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -676,6 +676,13 @@ class QACrawl(BaseModel): files: Optional[List[CrawlFile]] = [] +# ============================================================================ +class QACrawlWithResources(QACrawl): + """QA crawl output model including resources""" 
+ + resources: Optional[List[CrawlFileOut]] = [] + + # ============================================================================ class QACrawlIn(BaseModel): """Input model for QA crawls""" From f238d254d7c62f83c37359d9802c85b6b1100c85 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 16:12:42 -0500 Subject: [PATCH 04/49] Add API endpoint to return QACrawls associated with a crawl --- backend/btrixcloud/crawls.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 62f0c30cf2..c427394c7e 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -627,11 +627,20 @@ async def stop_crawl_qa_job(self, qa_crawl_id: str): """Stop crawl QA job""" return await self.crawl_manager.shutdown_crawl(qa_crawl_id) + async def get_qa_crawls( + self, crawl_id: str, org: Optional[Organization] = None + ) -> List[QACrawl]: + """Return list of QA crawls""" + crawl = await self.get_crawl(crawl_id, org, "crawl") + if not crawl.qa: + raise HTTPException(status_code=404, detail="crawl_qa_not_found") + return crawl.qa + async def get_qa_crawl( self, crawl_id: str, qa_crawl_id: str, org: Optional[Organization] = None ) -> QACrawlWithResources: """Fetch QA Crawl""" - crawl_raw = await self.get_crawl(crawl_id, org, "crawl") + crawl_raw = await self.get_crawl_raw(crawl_id, org, "crawl") qa_crawl = None for qa_run in crawl_raw.get("qa", []): if qa_run.get("id") == qa_crawl_id: @@ -865,6 +874,14 @@ async def start_crawl_qa_job( async def stop_crawl_qa_job(crawl_id, org: Organization = Depends(org_crawl_dep)): return await ops.stop_crawl_qa_job(crawl_id) + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa", + tags=["crawls"], + response_model=List[QACrawl], + ) + async def get_qa_crawls(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_qa_crawls(crawl_id, org) + @app.get( "/orgs/all/crawls/{crawl_id}", tags=["crawls"], From 31ea444bbc20a2c90c0da4bb278a2d65ba73ba97 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 16:13:24 -0500 Subject: [PATCH 05/49] Flesh out method docstring --- backend/btrixcloud/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index c427394c7e..7323e5a57a 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -639,7 +639,7 @@ async def get_qa_crawls( async def get_qa_crawl( self, crawl_id: str, qa_crawl_id: str, org: Optional[Organization] = None ) -> QACrawlWithResources: - """Fetch QA Crawl""" + """Fetch QA Crawl with resources for replay.json""" crawl_raw = await self.get_crawl_raw(crawl_id, org, "crawl") qa_crawl = None for qa_run in crawl_raw.get("qa", []): From 447b1426f406054bbaf94ae2e5648c2f8df78e89 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 16:17:00 -0500 Subject: [PATCH 06/49] Fixups --- backend/btrixcloud/crawls.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 7323e5a57a..5466e8f2d3 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -846,7 +846,7 @@ async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): tags=["crawls"], response_model=QACrawlWithResources, ) - async def get_qa_crawl_admin(crawl_id, user: User = Depends(user_dep)): + async def get_qa_crawl_admin(crawl_id, qa_crawl_id, user: User = Depends(user_dep)): if not 
user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") @@ -857,7 +857,9 @@ async def get_qa_crawl_admin(crawl_id, user: User = Depends(user_dep)): tags=["crawls"], response_model=QACrawlWithResources, ) - async def get_qa_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): + async def get_qa_crawl( + crawl_id, qa_crawl_id, org: Organization = Depends(org_viewer_dep) + ): return await ops.get_qa_crawl(crawl_id, qa_crawl_id, org) @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["crawls", "qa"]) @@ -870,9 +872,14 @@ async def start_crawl_qa_job( qa_crawl_id = await ops.start_crawl_qa_job(qa_crawl_in, crawl_id, org, user) return {"started": qa_crawl_id} - @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["crawls", "qa"]) - async def stop_crawl_qa_job(crawl_id, org: Organization = Depends(org_crawl_dep)): - return await ops.stop_crawl_qa_job(crawl_id) + @app.post( + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_crawl_id}/stop", tags=["crawls", "qa"] + ) + async def stop_crawl_qa_job( + crawl_id, qa_crawl_id, org: Organization = Depends(org_crawl_dep) + ): + # pylint: disable=unused-argument + return await ops.stop_crawl_qa_job(qa_crawl_id) @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa", From b1514f245815c38be711d6bbc54285d05ab2180e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 16:20:41 -0500 Subject: [PATCH 07/49] Minor touchups --- backend/btrixcloud/crawls.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 5466e8f2d3..98c9ea7e05 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -632,29 +632,28 @@ async def get_qa_crawls( ) -> List[QACrawl]: """Return list of QA crawls""" crawl = await self.get_crawl(crawl_id, org, "crawl") - if not crawl.qa: - raise HTTPException(status_code=404, detail="crawl_qa_not_found") - return crawl.qa + return crawl.qa or [] async def get_qa_crawl( self, crawl_id: str, qa_crawl_id: str, org: Optional[Organization] = None ) -> QACrawlWithResources: """Fetch QA Crawl with resources for replay.json""" crawl_raw = await self.get_crawl_raw(crawl_id, org, "crawl") - qa_crawl = None + + qa_dict = None for qa_run in crawl_raw.get("qa", []): if qa_run.get("id") == qa_crawl_id: - qa_crawl = qa_run + qa_dict = qa_run - if not qa_crawl: + if not qa_dict: raise HTTPException(status_code=404, detail="crawl_qa_not_found") - qa_crawl["resources"] = await self._files_to_resources( - qa_crawl.get("files"), org, None, qa=True + qa_dict["resources"] = await self._files_to_resources( + qa_dict.get("files"), org, None, qa=True ) - qa_crawl.pop("files", None) + qa_dict.pop("files", None) - return QACrawlWithResources(**qa_crawl) + return QACrawlWithResources(**qa_dict) # ============================================================================ From 67481accb3387802f45cba7403fc870083cf6934 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 26 Feb 2024 17:01:16 -0500 Subject: [PATCH 08/49] Use qa-crawlid-ts naming scheme for QA crawls --- backend/btrixcloud/crawls.py | 4 ++-- backend/btrixcloud/k8sapi.py | 5 +++-- backend/btrixcloud/models.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 98c9ea7e05..35374f0e77 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -600,8 +600,8 @@ async def start_crawl_qa_job( if not crawl.files: raise HTTPException(status_code=404, 
detail="crawl_files_not_found") - # TODO: For now, let's just use the first WACZ. Eventually we'll want to - # pass them all in as an array or a multi-WACZ + # TODO: Give this the crawl's replay.json endpoint instead? + # For now, only passing the first WACZ for initial testing qa_source = crawl.files[0] try: diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index e9a88264da..b5ec5f888d 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -2,7 +2,6 @@ import os import traceback -from uuid import uuid4 import yaml @@ -126,7 +125,9 @@ def new_crawl_qa_job_yaml( self, userid, crawl_id, oid, storage, qa_source, crawler_image, profile_filename ): """load job template from yaml""" - qa_crawl_id = uuid4() + + ts_now = dt_now().strftime("%Y%m%d%H%M%S") + qa_crawl_id = f"qa-{crawl_id}-{ts_now}" params = { "id": str(qa_crawl_id), diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 95e1b335bd..fc23ca8faf 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -659,7 +659,7 @@ class CrawlScale(BaseModel): class QACrawl(BaseModel): """Subdocument to track QA runs for given crawl""" - id: UUID + id: str started: datetime finished: Optional[datetime] = None From af5c866d903e37414c253c38ecf14d5c74b56308 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 7 Mar 2024 11:14:12 -0800 Subject: [PATCH 09/49] qa run work, unifying interface --- backend/btrixcloud/crawls.py | 70 ----------- backend/btrixcloud/models.py | 14 +-- backend/btrixcloud/operator/crawls.py | 11 +- backend/btrixcloud/operator/models.py | 1 + chart/app-templates/crawl_job.yaml | 10 +- chart/app-templates/crawl_qa_job.yaml | 28 ----- chart/app-templates/crawler.yaml | 9 +- chart/app-templates/crawler_qa.yaml | 171 -------------------------- chart/templates/priorities.yaml | 12 ++ 9 files changed, 39 insertions(+), 287 deletions(-) delete mode 100644 chart/app-templates/crawl_qa_job.yaml delete mode 100644 chart/app-templates/crawler_qa.yaml diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 35374f0e77..bb3eb0a6ed 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -557,76 +557,6 @@ async def get_crawl_stats( return crawls_data - async def add_new_qa_crawl(self, qa_crawl_id, crawl_id, user, crawler_image): - """Add QA crawl subdoc to crawl document""" - qa_crawl = QACrawl( - id=qa_crawl_id, - started=datetime.now(), - userid=user.id, - userName=user.name, - state="starting", - image=crawler_image, - ) - try: - await self.crawls.update_one( - {"_id": "crawl_id"}, {"$push": {"qa": qa_crawl.dict()}} - ) - # pylint: disable=broad-exception-caught - except Exception as exc: - print( - f"Error adding QA Crawl {qa_crawl_id} to crawl doc {crawl_id}: {exc}", - flush=True, - ) - - async def start_crawl_qa_job( - self, qa_crawl_in: QACrawlIn, crawl_id: str, org: Organization, user: User - ): - """Start crawl QA job - - TODO: Do we need a configmap here?""" - crawler_image = self.crawl_configs.get_channel_crawler_image( - qa_crawl_in.crawlerChannel - ) - if not crawler_image: - raise HTTPException(status_code=404, detail="crawler_image_not_found") - - if await self.orgs.storage_quota_reached(org.id): - raise HTTPException(status_code=403, detail="storage_quota_reached") - - if await self.orgs.exec_mins_quota_reached(org.id): - raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") - - crawl = await self.get_crawl(crawl_id, org) - if not crawl.files: - raise 
HTTPException(status_code=404, detail="crawl_files_not_found") - - # TODO: Give this the crawl's replay.json endpoint instead? - # For now, only passing the first WACZ for initial testing - qa_source = crawl.files[0] - - try: - qa_crawl_id = await self.crawl_manager.create_crawl_qa_job( - org.id, - crawl_id, - org.storage, - qa_source.filename, - str(user.id), - crawler_image, - qa_crawl_in.profileFilename, - ) - await self.add_new_qa_crawl(qa_crawl_id, crawl_id, user, crawler_image) - return qa_crawl_id - - except Exception as exc: - # pylint: disable=raise-missing-from - raise HTTPException( - status_code=500, detail=f"Error starting QA crawl: {exc}" - ) - - async def stop_crawl_qa_job(self, qa_crawl_id: str): - """Stop crawl QA job""" - return await self.crawl_manager.shutdown_crawl(qa_crawl_id) - async def get_qa_crawls( self, crawl_id: str, org: Optional[Organization] = None ) -> List[QACrawl]: diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index fde3ba2626..2b0f634275 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -670,7 +670,7 @@ class CrawlScale(BaseModel): # ============================================================================ -class QACrawl(BaseModel): +class QARun(BaseModel): """Subdocument to track QA runs for given crawl""" id: str @@ -691,20 +691,12 @@ class QACrawl(BaseModel): # ============================================================================ -class QACrawlWithResources(QACrawl): +class QARunWithResources(QARun): """QA crawl output model including resources""" resources: Optional[List[CrawlFileOut]] = [] -# ============================================================================ -class QACrawlIn(BaseModel): - """Input model for QA crawls""" - - crawlerChannel: str = "default" - profileFilename: Optional[str] = None - - # ============================================================================ class Crawl(BaseCrawl, CrawlConfigCore): """Store State of a Crawl (Finished or Running)""" @@ -726,7 +718,7 @@ class Crawl(BaseCrawl, CrawlConfigCore): image: Optional[str] - qa: Optional[List[QACrawl]] = [] + qa: Optional[Dict[str, QARun]] = {} # ============================================================================ diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index bab54232bb..54d23fa5bc 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -171,6 +171,7 @@ async def sync_crawls(self, data: MCSyncData): timeout=spec.get("timeout") or 0, max_crawl_size=int(spec.get("maxCrawlSize") or 0), scheduled=spec.get("manual") != "1", + qa_source_crawl_id=spec.get("qsSourceCrawlId") ) # shouldn't get here, crawl should already be finalizing when canceled @@ -241,7 +242,8 @@ async def sync_crawls(self, data: MCSyncData): params["storage_path"] = storage_path params["storage_secret"] = storage_secret - params["profile_filename"] = configmap["PROFILE_FILENAME"] + if not spec.qa_source_crawl_id: + params["profile_filename"] = configmap["PROFILE_FILENAME"] # only resolve if not already set # not automatically updating image for existing crawls @@ -306,7 +308,12 @@ def _load_crawler(self, params, i, status, children): if params.get("do_restart"): print(f"Restart {name}") - params["priorityClassName"] = f"crawl-instance-{i}" + if spec.qa_source_crawl_id: + params["qa_source"] = self.get_crawl_replay_url() + + params["priorityClassName"] = f"qa-crawl-instance-{i}" + else: + params["priorityClassName"] = f"crawl-instance-{i}" 
return self.load_from_yaml("crawler.yaml", params) diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 2102a11779..b7cd0bf1b2 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -60,6 +60,7 @@ class CrawlSpec(BaseModel): scheduled: bool = False timeout: int = 0 max_crawl_size: int = 0 + qa_source_crawl_id: Optional[str] # ============================================================================ diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 96fec0fc18..53a8873b33 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -1,10 +1,10 @@ apiVersion: btrix.cloud/v1 kind: CrawlJob metadata: - name: crawljob-{{ id }} + name: {{ "qa-" if is_qa else "" }}crawljob-{{ id }} labels: crawl: "{{ id }}" - role: "job" + role: {{ "qa-job" if is_qa else "job" }} btrix.org: "{{ oid }}" btrix.user: "{{ userid }}" btrix.storage: "{{ storage_name }}" @@ -19,8 +19,10 @@ spec: cid: "{{ cid }}" oid: "{{ oid }}" scale: {{ scale }} - maxCrawlSize: {{ max_crawl_size }} - timeout: {{ timeout }} + + maxCrawlSize: {{ max_crawl_size if not is_qa else 0 }} + timeout: {{ timeout if not is_qa else 0 }} + manual: {{ manual }} crawlerChannel: "{{ crawler_channel }}" ttlSecondsAfterFinished: 30 diff --git a/chart/app-templates/crawl_qa_job.yaml b/chart/app-templates/crawl_qa_job.yaml deleted file mode 100644 index 83f7e85292..0000000000 --- a/chart/app-templates/crawl_qa_job.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: btrix.cloud/v1 -kind: CrawlJob -metadata: - name: qa-crawljob-{{ id }} - labels: - id: "{{ id }}" - crawlId: "{{ crawl_id }}" - role: "job" - btrix.org: "{{ oid }}" - btrix.user: "{{ userid }}" - btrix.storage: "{{ storage_name }}" - -spec: - selector: - matchLabels: - qaCrawl: "{{ id }}" - - id: "{{ id }}" - crawlId: "{{ crawl_id }}" - userid: "{{ userid }}" - oid: "{{ oid }}" - crawlerImage: "{{ crawler_image }}" - profileFilename: "{{ profile_filename }}" - qaSource: "{{ qa_source }}" - ttlSecondsAfterFinished: 30 - - storageName: "{{ storage_name }}" - diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index b3e7a0c92c..1b9aa8b72d 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -54,7 +54,6 @@ spec: - name: crawl-config configMap: name: crawl-config-{{ cid }} - - name: crawl-data persistentVolumeClaim: claimName: {{ name }} @@ -102,6 +101,7 @@ spec: image: {{ crawler_image }} imagePullPolicy: {{ crawler_image_pull_policy }} command: + {% if not qa_source %} - crawl - --config - /tmp/crawl-config.json @@ -112,11 +112,18 @@ spec: - "@{{ profile_filename }}" {%- endif %} + {% else %} + - qa + - --qaSource + - {{ qa_source }} + {% endif %} volumeMounts: + {% if not qa_source %} - name: crawl-config mountPath: /tmp/crawl-config.json subPath: crawl-config.json readOnly: True + {% endif %} - name: crawl-data mountPath: /crawls diff --git a/chart/app-templates/crawler_qa.yaml b/chart/app-templates/crawler_qa.yaml deleted file mode 100644 index 63d84faec4..0000000000 --- a/chart/app-templates/crawler_qa.yaml +++ /dev/null @@ -1,171 +0,0 @@ -# ------- -# PVC -# ------- - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: qa-{{ name }} - namespace: {{ namespace }} - labels: - crawl-qa: {{ id }} - role: crawler - -spec: - accessModes: - - ReadWriteOnce - - resources: - requests: - storage: {{ crawler_storage }} - - {% if volume_storage_class %} - storageClassName: 
{{ volume_storage_class }} - {% endif %} - - - -# ------- -# CRAWLER -# ------- -{% if not do_restart %} ---- -apiVersion: v1 -kind: Pod -metadata: - name: {{ name }} - namespace: {{ namespace }} - labels: - crawl-qa: {{ id }} - role: crawler - -spec: - hostname: {{ name }} - subdomain: crawler - - {% if priorityClassName %} - priorityClassName: {{ priorityClassName }} - {% endif %} - - restartPolicy: OnFailure - - terminationGracePeriodSeconds: {{ termination_grace_secs }} - volumes: - - name: crawl-data - persistentVolumeClaim: - claimName: {{ name }} - - affinity: -{% if crawler_node_type %} - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodeType - operator: In - values: - - "{{ crawler_node_type }}" -{% endif %} - - podAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - topologyKey: "kubernetes.io/hostname" - labelSelector: - matchExpressions: - - key: crawl - operator: In - values: - - {{ id }} - - tolerations: - - key: nodeType - operator: Equal - value: crawling - effect: NoSchedule - - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - effect: NoExecute - - key: node.kubernetes.io/unreachable - operator: Exists - effect: NoExecute - tolerationSeconds: 300 - - containers: - - name: crawler - image: {{ crawler_image }} - imagePullPolicy: {{ crawler_image_pull_policy }} - command: - - qa - - --qaSource - - {{ qa_source }} - - --redisStoreUrl - - {{ redis_url }} - {%- if profile_filename %} - - --profile - - "@{{ profile_filename }}" - {%- endif %} - - volumeMounts: - - name: crawl-data - mountPath: /crawls - - envFrom: - - configMapRef: - name: shared-crawler-config - - - secretRef: - name: {{ storage_secret }} - - {% if signing_secret %} - - secretRef: - name: {{ signing_secret }} - {% endif %} - - env: - - name: CRAWL_QA_ID - value: "{{ id }}" - - - name: WEBHOOK_URL - value: "{{ redis_url }}/crawls-done" - - - name: STORE_PATH - value: "{{ storage_path }}" - - - name: STORE_FILENAME - value: "{{ storage_filename }}" - - - name: STORE_USER - value: "{{ userid }}" - - {% if crawler_socks_proxy_host %} - - name: SOCKS_HOST - value: "{{ crawler_socks_proxy_host }}" - {% if crawler_socks_proxy_port %} - - name: SOCKS_PORT - value: "{{ crawler_socks_proxy_port }}" - {% endif %} - {% endif %} - - resources: - limits: - memory: "{{ memory }}" - - requests: - cpu: "{{ cpu }}" - memory: "{{ memory }}" - - {% if crawler_liveness_port and crawler_liveness_port != '0' %} - livenessProbe: - httpGet: - path: /healthz - port: {{ crawler_liveness_port }} - - initialDelaySeconds: 15 - periodSeconds: 120 - failureThreshold: 3 - {% endif %} - -{% endif %} diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 4b63c15970..7eec9cb1a1 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -11,6 +11,18 @@ description: "Priority for crawl instance #{{ . }}" {{- end }} +{{- range untilStep 0 (int .Values.max_crawl_scale) 1 }} +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: crawl-qa-instance-{{ . }} +value: -{{ add .Values.max_crawl_scale . }} +globalDefault: false +description: "Priority for QA crawl instance #{{ . 
}}" + +{{- end }} + # Lower Priority for Background Jobs --- apiVersion: scheduling.k8s.io/v1 From 93b2652df73a7db94709e1118e10be20491a7ea2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 7 Mar 2024 12:58:45 -0800 Subject: [PATCH 10/49] more refactoring of apis for qa --- backend/btrixcloud/basecrawls.py | 12 +- backend/btrixcloud/crawlconfigs.py | 19 +-- backend/btrixcloud/crawlmanager.py | 41 ++++--- backend/btrixcloud/crawls.py | 159 +++++++++++++++++++------- backend/btrixcloud/k8sapi.py | 34 +----- backend/btrixcloud/models.py | 1 + backend/btrixcloud/operator/crawls.py | 9 +- chart/app-templates/crawl_job.yaml | 9 +- 8 files changed, 173 insertions(+), 111 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 32ad4607f7..b81820e8ce 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -124,12 +124,14 @@ async def get_crawl_raw( return res - async def _files_to_resources(self, files, org, crawlid, qa: bool = False): + async def _files_to_resources( + self, files, org, crawlid, qa_run_id: Optional[str] = None + ): if not files: return [] crawl_files = [CrawlFile(**data) for data in files] - return await self._resolve_signed_urls(crawl_files, org, qa, crawlid) + return await self._resolve_signed_urls(crawl_files, org, crawlid, qa_run_id) async def get_crawl( self, @@ -435,8 +437,8 @@ async def _resolve_signed_urls( self, files: List[CrawlFile], org: Organization, - qa: bool = False, crawl_id: Optional[str] = None, + qa_run_id: Optional[str] = None, ): if not files: print("no files") @@ -457,8 +459,8 @@ async def _resolve_signed_urls( ) prefix = "files" - if qa: - prefix = f"qa.{prefix}" + if qa_run_id: + prefix = f"qa.{qa_run_id}.{prefix}" await self.crawls.find_one_and_update( {f"{prefix}.filename": file_.filename}, diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 9690ebe55f..5a8f8dc937 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -814,8 +814,9 @@ async def get_crawl_config_search_values(self, org): "workflowIds": workflow_ids, } - async def run_now(self, cid: UUID, org: Organization, user: User): - """run specified crawlconfig now""" + async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig: + """prepare for running a crawl, returning crawlconfig and + validating that running crawls is allowed""" crawlconfig = await self.get_crawl_config(cid, org.id) if not crawlconfig: @@ -823,11 +824,6 @@ async def run_now(self, cid: UUID, org: Organization, user: User): status_code=404, detail=f"Crawl Config '{cid}' not found" ) - if await self.get_running_crawl(crawlconfig): - raise HTTPException(status_code=400, detail="crawl_already_running") - - crawl_id = None - # ensure crawlconfig exists try: await self.crawl_manager.get_configmap(crawlconfig.id) @@ -841,6 +837,15 @@ async def run_now(self, cid: UUID, org: Organization, user: User): if await self.org_ops.exec_mins_quota_reached(org.id): raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") + return crawlconfig + + async def run_now(self, cid: UUID, org: Organization, user: User): + """run specified crawlconfig now""" + crawlconfig = await self.prepare_for_run_crawl(cid, org) + + if await self.get_running_crawl(crawlconfig): + raise HTTPException(status_code=400, detail="crawl_already_running") + try: crawl_id = await self.crawl_manager.create_crawl_job( crawlconfig, diff --git a/backend/btrixcloud/crawlmanager.py 
b/backend/btrixcloud/crawlmanager.py index a1eb368181..536da58274 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -4,7 +4,6 @@ import asyncio import secrets import json -from uuid import UUID from typing import Optional, Dict from datetime import timedelta @@ -178,23 +177,35 @@ async def create_crawl_job( warc_prefix=warc_prefix, ) - async def create_crawl_qa_job( + async def create_qa_crawl_job( self, - oid: UUID, - crawl_id: str, + crawlconfig: CrawlConfig, storage: StorageRef, - qa_source: str, userid: str, - crawler_image: str, - profile_filename: str = None, + qa_source: str, ) -> str: - """create new crawl qa job""" - storage_secret = storage.get_storage_secret_name(str(oid)) + """create new QA Run crawl job with qa source crawl id""" + cid = str(crawlconfig.id) + + storage_secret = storage.get_storage_secret_name(str(crawlconfig.oid)) await self.has_storage_secret(storage_secret) - return await self.new_crawl_qa_job( - userid, crawl_id, oid, storage, qa_source, crawler_image, profile_filename + ts_now = dt_now().strftime("%Y%m%d%H%M%S") + crawl_id = f"qa-{ts_now}-{cid[:12]}" + + return await self.new_crawl_job( + cid, + userid, + crawlconfig.oid, + storage, + crawlconfig.crawlerChannel, + 1, + 0, + 0, + warc_prefix="qa", + crawl_id=crawl_id, + qa_source=qa_source, ) async def update_crawl_config( @@ -313,16 +324,14 @@ async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict: """Set the crawl scale (job parallelism) on the specified job""" return await self._patch_job(crawl_id, {"scale": scale}) - async def shutdown_crawl( - self, crawl_id: str, graceful=True, qa_crawl=False - ) -> dict: + async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict: """Request a crawl cancelation or stop by calling an API on the job pod/container, returning the result""" if graceful: patch = {"stopping": True} - return await self._patch_job(crawl_id, patch, qa_crawl=qa_crawl) + return await self._patch_job(crawl_id, patch) - return await self.delete_crawl_job(crawl_id, qa_crawl=qa_crawl) + return await self.delete_crawl_job(crawl_id) async def delete_crawl_configs_for_org(self, org: str) -> None: """Delete all crawl configs for given org""" diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index bb3eb0a6ed..18cc6f098c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -27,14 +27,13 @@ Crawl, CrawlOut, CrawlOutWithResources, + QARun, + QARunWithResources, Organization, User, PaginatedResponse, RUNNING_AND_STARTING_STATES, ALL_CRAWL_STATES, - QACrawl, - QACrawlIn, - QACrawlWithResources, ) @@ -557,33 +556,107 @@ async def get_crawl_stats( return crawls_data - async def get_qa_crawls( - self, crawl_id: str, org: Optional[Organization] = None - ) -> List[QACrawl]: - """Return list of QA crawls""" + async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User): + """Start crawl QA run""" + crawl = await self.get_crawl(crawl_id, org, "crawl") - return crawl.qa or [] - async def get_qa_crawl( - self, crawl_id: str, qa_crawl_id: str, org: Optional[Organization] = None - ) -> QACrawlWithResources: - """Fetch QA Crawl with resources for replay.json""" - crawl_raw = await self.get_crawl_raw(crawl_id, org, "crawl") + if crawl.qa_active: + raise HTTPException(status_code=400, detail="qa_already_running") + + crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org) + + try: + qa_run_id = await self.crawl_manager.create_qa_crawl_job( + crawlconfig, + org.storage, 
+ userid=str(user.id), + qa_source=crawl_id, + ) + + image = self.crawl_configs.get_channel_crawler_image( + crawlconfig.crawlerChannel + ) + + qa_run = QARun( + id=qa_run_id, + started=datetime.now(), + userid=user.id, + userName=user.name, + state="starting", + image=image, + ) + + await self.crawls.find_one_and_update( + {"_id": crawl_id}, + { + "$set": { + "qa": {qa_run_id: qa_run.dict()}, + "qa_active": qa_run_id, + } + }, + ) + + return qa_run_id + + except Exception as exc: + # pylint: disable=raise-missing-from + raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}") + + async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): + """Stop crawl QA run""" + crawl = await self.get_crawl(crawl_id, org, "crawl") + + if not crawl.active_qa: + raise HTTPException(status_code=400, detail="invalid_qa_run_id") + + try: + result = await self.crawl_manager.shutdown_crawl( + crawl.active_qa, graceful=True + ) - qa_dict = None - for qa_run in crawl_raw.get("qa", []): - if qa_run.get("id") == qa_crawl_id: - qa_dict = qa_run + if result.get("error") == "Not Found": + # treat as success, qa crawl no longer exists, so mark as no qa + result = {"success": True} - if not qa_dict: + if result.get("success"): + await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl", "oid": org.id}, + {"$set": {"active_qa": None}}, + ) + + return result + + except Exception as exc: + # pylint: disable=raise-missing-from + # if reached here, probably crawl doesn't exist anymore + raise HTTPException( + status_code=404, detail=f"crawl_not_found, (details: {exc})" + ) + + async def get_qa_runs( + self, crawl_id: str, org: Optional[Organization] = None + ) -> List[QARun]: + """Return list of QA runs""" + crawl = await self.get_crawl(crawl_id, org, "crawl") + return list(crawl.qa.values()) or [] + + async def get_qa_run_for_replay( + self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None + ) -> QARun: + """Fetch QA runs with resources for replay.json""" + crawl = await self.get_crawl(crawl_id, org, "crawl") + + qa_run = crawl.qa.get(qa_run_id) + if not qa_run: raise HTTPException(status_code=404, detail="crawl_qa_not_found") - qa_dict["resources"] = await self._files_to_resources( - qa_dict.get("files"), org, None, qa=True + resources = await self._files_to_resources( + qa_run.files, org, crawlid=crawl_id, qa_run_id=qa_run_id ) - qa_dict.pop("files", None) + qa_run.files = None - return QACrawlWithResources(**qa_dict) + return QARunWithResources(resources=resources, **qa_run.dict()) # ============================================================================ @@ -770,53 +843,53 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)): async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_crawl(crawl_id, org, "crawl") + # QA APIs + # --------------------- + @app.get( - "/orgs/all/crawls/{crawl_id}/qa/{qa_crawl_id}/replay.json", + "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", tags=["crawls"], - response_model=QACrawlWithResources, + response_model=QARunWithResources, ) - async def get_qa_crawl_admin(crawl_id, qa_crawl_id, user: User = Depends(user_dep)): + async def get_qa_crawl_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_qa_crawl(crawl_id, qa_crawl_id) + return await ops.get_qa_run_for_replay(crawl_id, qa_run_id) @app.get( - 
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_crawl_id}/replay.json", + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", tags=["crawls"], - response_model=QACrawlWithResources, + response_model=QARunWithResources, ) async def get_qa_crawl( - crawl_id, qa_crawl_id, org: Organization = Depends(org_viewer_dep) + crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep) ): - return await ops.get_qa_crawl(crawl_id, qa_crawl_id, org) + return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org) @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["crawls", "qa"]) - async def start_crawl_qa_job( - qa_crawl_in: QACrawlIn, + async def start_crawl_qa_run( crawl_id: str, org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): - qa_crawl_id = await ops.start_crawl_qa_job(qa_crawl_in, crawl_id, org, user) - return {"started": qa_crawl_id} + qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user) + return {"started": qa_run_id} - @app.post( - "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_crawl_id}/stop", tags=["crawls", "qa"] - ) - async def stop_crawl_qa_job( - crawl_id, qa_crawl_id, org: Organization = Depends(org_crawl_dep) + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["crawls", "qa"]) + async def stop_crawl_qa_run( + crawl_id: str, org: Organization = Depends(org_crawl_dep) ): # pylint: disable=unused-argument - return await ops.stop_crawl_qa_job(qa_crawl_id) + return await ops.stop_crawl_qa_run(crawl_id, org) @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa", tags=["crawls"], - response_model=List[QACrawl], + response_model=List[QARun], ) - async def get_qa_crawls(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_qa_crawls(crawl_id, org) + async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_qa_runs(crawl_id, org) @app.get( "/orgs/all/crawls/{crawl_id}", diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index b5ec5f888d..e4399dd63d 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -87,6 +87,7 @@ def new_crawl_job_yaml( manual=True, crawl_id=None, warc_prefix="", + qa_source="", ): """load job template from yaml""" if not crawl_id: @@ -106,6 +107,7 @@ def new_crawl_job_yaml( "manual": "1" if manual else "0", "crawler_channel": crawler_channel, "warc_prefix": warc_prefix, + "qa_source": qa_source, } data = self.templates.env.get_template("crawl_job.yaml").render(params) @@ -120,38 +122,6 @@ async def new_crawl_job(self, *args, **kwargs) -> str: return crawl_id - # pylint: disable=too-many-arguments, too-many-locals - def new_crawl_qa_job_yaml( - self, userid, crawl_id, oid, storage, qa_source, crawler_image, profile_filename - ): - """load job template from yaml""" - - ts_now = dt_now().strftime("%Y%m%d%H%M%S") - qa_crawl_id = f"qa-{crawl_id}-{ts_now}" - - params = { - "id": str(qa_crawl_id), - "crawl_id": crawl_id, - "oid": oid, - "userid": userid, - "storage_name": str(storage), - "crawler_image": crawler_image, - "profile_filename": profile_filename, - "qa_source": qa_source, - } - - data = self.templates.env.get_template("crawl_qa_job.yaml").render(params) - return qa_crawl_id, data - - async def new_crawl_qa_job(self, *args, **kwargs) -> str: - """load and init qa crawl job via k8s api""" - qa_crawl_id, data = self.new_crawl_qa_job_yaml(*args, **kwargs) - - # create job directly - await self.create_from_yaml(data) - - return qa_crawl_id - async def create_from_yaml(self, doc, namespace=None): """init k8s objects from 
yaml""" yml_document_all = yaml.safe_load_all(doc) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 2b0f634275..2c27c48be5 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -718,6 +718,7 @@ class Crawl(BaseCrawl, CrawlConfigCore): image: Optional[str] + active_qa: Optional[str] qa: Optional[Dict[str, QARun]] = {} diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 54d23fa5bc..3b2f5bdfd4 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -171,7 +171,7 @@ async def sync_crawls(self, data: MCSyncData): timeout=spec.get("timeout") or 0, max_crawl_size=int(spec.get("maxCrawlSize") or 0), scheduled=spec.get("manual") != "1", - qa_source_crawl_id=spec.get("qsSourceCrawlId") + qa_source_crawl_id=spec.get("qaSourceCrawlId"), ) # shouldn't get here, crawl should already be finalizing when canceled @@ -269,6 +269,9 @@ async def sync_crawls(self, data: MCSyncData): else: params["force_restart"] = False + if spec.qa_source_crawl_id: + params["qa_source"] = "" # self.get_crawl_replay_url() + for i in range(0, status.scale): children.extend(self._load_crawler(params, i, status, data.children)) @@ -308,9 +311,7 @@ def _load_crawler(self, params, i, status, children): if params.get("do_restart"): print(f"Restart {name}") - if spec.qa_source_crawl_id: - params["qa_source"] = self.get_crawl_replay_url() - + if params.get("qa_source"): params["priorityClassName"] = f"qa-crawl-instance-{i}" else: params["priorityClassName"] = f"crawl-instance-{i}" diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 53a8873b33..7b55e17b5b 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -1,10 +1,10 @@ apiVersion: btrix.cloud/v1 kind: CrawlJob metadata: - name: {{ "qa-" if is_qa else "" }}crawljob-{{ id }} + name: {{ "qa-" if qa_source else "" }}crawljob-{{ id }} labels: crawl: "{{ id }}" - role: {{ "qa-job" if is_qa else "job" }} + role: {{ "qa-job" if qa_source else "job" }} btrix.org: "{{ oid }}" btrix.user: "{{ userid }}" btrix.storage: "{{ storage_name }}" @@ -20,8 +20,9 @@ spec: oid: "{{ oid }}" scale: {{ scale }} - maxCrawlSize: {{ max_crawl_size if not is_qa else 0 }} - timeout: {{ timeout if not is_qa else 0 }} + maxCrawlSize: {{ max_crawl_size if not qa_source else 0 }} + timeout: {{ timeout if not qa_source else 0 }} + qaSourceCrawlId: "{{ qa_source }}" manual: {{ manual }} crawlerChannel: "{{ crawler_channel }}" From e8495296fc3116f256f1fabc6cd2388d2eebb758 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 8 Mar 2024 14:38:47 -0800 Subject: [PATCH 11/49] update operator + crawl_ops calls to pass crawl_id + source_crawl_id and treat as 'crawl_or_qa_run_id' and 'qa_source_crawl_id' in crawl ops --- backend/btrixcloud/crawls.py | 82 ++++++++++--- backend/btrixcloud/models.py | 59 ++++----- backend/btrixcloud/operator/crawls.py | 170 +++++++++++++------------- backend/btrixcloud/operator/models.py | 2 +- backend/btrixcloud/pages.py | 12 +- 5 files changed, 185 insertions(+), 140 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 18cc6f098c..69048fed30 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -8,7 +8,7 @@ from datetime import datetime from uuid import UUID -from typing import Optional, List, Dict, Union +from typing import Optional, List, Dict, Union, Any from fastapi import Depends, HTTPException from 
fastapi.responses import StreamingResponse @@ -419,53 +419,103 @@ async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add): return {"success": True} async def update_crawl_state_if_allowed( - self, crawl_id, state, allowed_from, **kwargs + self, + crawl_or_qa_run_id: str, + qa_source_crawl_id: str, + state: str, + allowed_from: List[str], + **kwargs, ): """update crawl state and other properties in db if state has changed""" + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + kwargs["state"] = state - query = {"_id": crawl_id, "type": "crawl"} + if qa_source_crawl_id: + kwargs = {f"qa.{crawl_or_qa_run_id}": kwargs} + + query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"} if allowed_from: query["state"] = {"$in": allowed_from} return await self.crawls.find_one_and_update(query, {"$set": kwargs}) - async def update_running_crawl_stats(self, crawl_id, stats): + async def update_running_crawl_stats( + self, crawl_or_qa_run_id: str, qa_source_crawl_id: str, stats + ): """update running crawl stats""" + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + field = "stats" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}.stats" query = {"_id": crawl_id, "type": "crawl", "state": "running"} - return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}}) + return await self.crawls.find_one_and_update(query, {"$set": {field: stats}}) - async def inc_crawl_exec_time(self, crawl_id, exec_time, last_updated_time): + async def inc_crawl_exec_time( + self, + crawl_or_qa_run_id: str, + qa_source_crawl_id: str, + exec_time, + last_updated_time, + ): """increment exec time""" + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." + return await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}}, { - "$inc": {"crawlExecSeconds": exec_time}, - "$set": {"_lut": last_updated_time}, + "_id": crawl_id, + "type": "crawl", + f"{prefix}_lut": {"$ne": last_updated_time}, + }, + { + "$inc": {f"{prefix}crawlExecSeconds": exec_time}, + "$set": {f"{prefix}_lut": last_updated_time}, }, ) - async def get_crawl_state(self, crawl_id): + async def get_crawl_state( + self, crawl_or_qa_run_id: str, qa_source_crawl_id: Optional[str] = None + ): """return current crawl state of a crawl""" + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + state = "state" if not qa_source_crawl_id else f"$qa.{qa_source_crawl_id}.state" + finished = ( + "finished" + if not qa_source_crawl_id + else f"$qa.{qa_source_crawl_id}.finished" + ) + res = await self.crawls.find_one( - {"_id": crawl_id}, projection=["state", "finished"] + {"_id": crawl_id}, projection={"state": state, "finished": finished} ) if not res: return None, None return res.get("state"), res.get("finished") - async def add_crawl_error(self, crawl_id: str, error: str): + async def add_crawl_error( + self, + crawl_or_qa_run_id: str, + qa_source_crawl_id: str, + error: str, + ): """add crawl error from redis to mongodb errors field""" + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + field = "errors" if not qa_source_crawl_id else "qa.{crawl_or_qa_run_id}.errors" + await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$push": {"errors": error}} + {"_id": crawl_id}, {"$push": {field: error}} ) - async def add_crawl_file(self, crawl_id, crawl_file, size): + async def add_crawl_file( + self, crawl_or_qa_run_id, qa_source_crawl_id, crawl_file, size + ): """add new crawl file to crawl""" + 
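# Convention used by these crawl-ops helpers: when qa_source_crawl_id is set,
# the update still targets the QA source crawl's document, but every field is
# written under a "qa.<qa_run_id>." dotted prefix, so the run's state, stats,
# exec time, errors and files land in its own QARun subdocument.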
crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + prefix = "" if not qa_source_crawl_id else "qa.{crawl_or_qa_run_id}." + await self.crawls.find_one_and_update( {"_id": crawl_id}, { - "$push": {"files": crawl_file.dict()}, - "$inc": {"fileCount": 1, "fileSize": size}, + "$push": {f"{prefix}files": crawl_file.dict()}, + "$inc": {f"{prefix}fileCount": 1, f"{prefix}fileSize": size}, }, ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 2c27c48be5..5ea4a933fc 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -532,36 +532,49 @@ class ReviewStatus(str, Enum): # ============================================================================ -class BaseCrawl(BaseMongoModel): - """Base Crawl object (representing crawls, uploads and manual sessions)""" +class CoreCrawlable: + # pylint: disable=too-few-public-methods + """Core properties for crawlable run (crawl or qa run)""" id: str - type: str - userid: UUID userName: Optional[str] - oid: UUID started: datetime finished: Optional[datetime] = None - name: Optional[str] = "" - state: str + crawlExecSeconds: int = 0 + + image: Optional[str] + + stopping: Optional[bool] = False + stats: Optional[Dict[str, int]] = None files: Optional[List[CrawlFile]] = [] - description: Optional[str] = "" + fileSize: int = 0 + fileCount: int = 0 errors: Optional[List[str]] = [] - collectionIds: Optional[List[UUID]] = [] - fileSize: int = 0 - fileCount: int = 0 +# ============================================================================ +class BaseCrawl(CoreCrawlable, BaseMongoModel): + """Base Crawl object (representing crawls, uploads and manual sessions)""" + + type: str + + oid: UUID + + name: Optional[str] = "" + + description: Optional[str] = "" + + collectionIds: Optional[List[UUID]] = [] reviewStatus: Optional[ReviewStatus] = None @@ -670,25 +683,9 @@ class CrawlScale(BaseModel): # ============================================================================ -class QARun(BaseModel): +class QARun(CoreCrawlable, BaseModel): """Subdocument to track QA runs for given crawl""" - id: str - started: datetime - finished: Optional[datetime] = None - - userid: UUID - userName: Optional[str] - - state: str - stopping: Optional[bool] = False - - image: Optional[str] = None - - crawlExecSeconds: int = 0 - - files: Optional[List[CrawlFile]] = [] - # ============================================================================ class QARunWithResources(QARun): @@ -712,12 +709,6 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] - stopping: Optional[bool] = False - - crawlExecSeconds: int = 0 - - image: Optional[str] - active_qa: Optional[str] qa: Optional[Dict[str, QARun]] = {} diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 3b2f5bdfd4..4488182c20 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -113,21 +113,33 @@ async def sync_crawls(self, data: MCSyncData): pods = data.children[POD] + crawl = CrawlSpec( + id=crawl_id, + cid=cid, + oid=oid, + storage=StorageRef(spec["storageName"]), + crawler_channel=spec.get("crawlerChannel"), + scale=spec.get("scale", 1), + started=data.parent["metadata"]["creationTimestamp"], + stopping=spec.get("stopping", False), + timeout=spec.get("timeout") or 0, + max_crawl_size=int(spec.get("maxCrawlSize") or 0), + scheduled=spec.get("manual") != "1", + qa_source_crawl_id=spec.get("qaSourceCrawlId"), + ) + # if finalizing, crawl is being deleted if 
data.finalizing: if not status.finished: # if can't cancel, already finished - if not await self.cancel_crawl( - crawl_id, UUID(cid), UUID(oid), status, data.children[POD] - ): + if not await self.cancel_crawl(crawl, status, data.children[POD]): # instead of fetching the state (that was already set) # return exception to ignore this request, keep previous # finished state raise HTTPException(status_code=400, detail="out_of_sync_status") return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -140,10 +152,9 @@ async def sync_crawls(self, data: MCSyncData): print( f"warn crawl {crawl_id} finished but not deleted, post-finish taking too long?" ) - self.run_task(self.k8s.delete_crawl_job(crawl_id)) + self.run_task(self.k8s.delete_crawl_job(crawl.id)) return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -155,25 +166,10 @@ async def sync_crawls(self, data: MCSyncData): # pylint: disable=bare-except, broad-except except: # fail crawl if config somehow missing, shouldn't generally happen - await self.fail_crawl(crawl_id, UUID(cid), UUID(oid), status, pods) + await self.fail_crawl(crawl, status, pods) return self._empty_response(status) - crawl = CrawlSpec( - id=crawl_id, - cid=cid, - oid=oid, - storage=StorageRef(spec["storageName"]), - crawler_channel=spec.get("crawlerChannel"), - scale=spec.get("scale", 1), - started=data.parent["metadata"]["creationTimestamp"], - stopping=spec.get("stopping", False), - timeout=spec.get("timeout") or 0, - max_crawl_size=int(spec.get("maxCrawlSize") or 0), - scheduled=spec.get("manual") != "1", - qa_source_crawl_id=spec.get("qaSourceCrawlId"), - ) - # shouldn't get here, crawl should already be finalizing when canceled # just in case, handle canceled-but-not-finalizing here if status.state == "canceled": @@ -189,9 +185,7 @@ async def sync_crawls(self, data: MCSyncData): and not data.children[PVC] and await self.org_ops.storage_quota_reached(crawl.oid) ): - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, "skipped_quota_reached" - ) + await self.mark_finished(crawl, status, "skipped_quota_reached") return self._empty_response(status) if status.state in ("starting", "waiting_org_limit"): @@ -199,7 +193,7 @@ async def sync_crawls(self, data: MCSyncData): return self._empty_response(status) await self.set_state( - "starting", status, crawl.id, allowed_from=["waiting_org_limit"] + "starting", status, crawl, allowed_from=["waiting_org_limit"] ) if len(pods): @@ -219,8 +213,7 @@ async def sync_crawls(self, data: MCSyncData): if status.finished: return await self.finalize_response( - crawl_id, - UUID(oid), + crawl, status, spec, data.children, @@ -228,7 +221,7 @@ async def sync_crawls(self, data: MCSyncData): ) await self.increment_pod_exec_time( - pods, status, crawl.id, crawl.oid, EXEC_TIME_UPDATE_SECS + pods, crawl, status, EXEC_TIME_UPDATE_SECS ) else: @@ -388,7 +381,14 @@ def sync_resources(self, status, name, pod, children): src = pvc["spec"]["resources"]["requests"] resources.storage = int(parse_quantity(src.get("storage"))) - async def set_state(self, state, status, crawl_id, allowed_from, **kwargs): + async def set_state( + self, + state: str, + status: CrawlStatus, + crawl: CrawlSpec, + allowed_from: list[str], + **kwargs, + ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, otherwise get current state from db and return @@ -415,15 +415,21 @@ async def set_state(self, state, 
status, crawl_id, allowed_from, **kwargs): """ if not allowed_from or status.state in allowed_from: res = await self.crawl_ops.update_crawl_state_if_allowed( - crawl_id, state=state, allowed_from=allowed_from, **kwargs + crawl.id, + crawl.qa_source_crawl_id, + state=state, + allowed_from=allowed_from, + **kwargs, ) if res: - print(f"Setting state: {status.state} -> {state}, {crawl_id}") + print(f"Setting state: {status.state} -> {state}, {crawl.id}") status.state = state return True # get actual crawl state - actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id) + actual_state, finished = await self.crawl_ops.get_crawl_state( + crawl.id, crawl.qa_source_crawl_id + ) if actual_state: status.state = actual_state if finished: @@ -436,7 +442,7 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs): if status.state != state: print( - f"Not setting state: {status.state} -> {state}, {crawl_id} not allowed" + f"Not setting state: {status.state} -> {state}, {crawl.id} not allowed" ) return False @@ -503,23 +509,21 @@ async def can_start_new(self, crawl: CrawlSpec, data: MCSyncData, status): i += 1 await self.set_state( - "waiting_org_limit", status, crawl.id, allowed_from=["starting"] + "waiting_org_limit", status, crawl, allowed_from=["starting"] ) return False async def cancel_crawl( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, pods: dict, ) -> bool: """Mark crawl as canceled""" - if not await self.mark_finished(crawl_id, cid, oid, status, "canceled"): + if not await self.mark_finished(crawl, status, "canceled"): return False - await self.mark_for_cancelation(crawl_id) + await self.mark_for_cancelation(crawl.id) if not status.canceled: for name, pod in pods.items(): @@ -544,9 +548,7 @@ async def cancel_crawl( async def fail_crawl( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, pods: dict, stats=None, @@ -554,9 +556,7 @@ async def fail_crawl( """Mark crawl as failed, log crawl state and print crawl logs, if possible""" prev_state = status.state - if not await self.mark_finished( - crawl_id, cid, oid, status, "failed", stats=stats - ): + if not await self.mark_finished(crawl, status, "failed", stats=stats): return False if not self.log_failed_crawl_lines or prev_state == "failed": @@ -581,8 +581,7 @@ def _empty_response(self, status): async def finalize_response( self, - crawl_id: str, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, spec: dict, children: dict, @@ -590,7 +589,7 @@ async def finalize_response( ): """ensure crawl id ready for deletion""" - redis_pod = f"redis-{crawl_id}" + redis_pod = f"redis-{crawl.id}" new_children = [] finalized = False @@ -601,7 +600,7 @@ async def finalize_response( # if has other pods, keep redis pod until they are removed if len(pods) > 1: new_children = self._load_redis(params, status, children) - await self.increment_pod_exec_time(pods, status, crawl_id, oid) + await self.increment_pod_exec_time(pods, crawl, status) # keep pvs until pods are removed if new_children: @@ -613,7 +612,7 @@ async def finalize_response( ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL) finished = from_k8s_date(status.finished) if (dt_now() - finished).total_seconds() > ttl > 0: - print("CrawlJob expired, deleting: " + crawl_id) + print("CrawlJob expired, deleting: " + crawl.id) finalized = True else: finalized = True @@ -670,7 +669,7 @@ async def sync_crawl_state( await self.set_state( "waiting_capacity", status, - crawl.id, + crawl, 
allowed_from=RUNNING_AND_STARTING_ONLY, ) @@ -702,7 +701,7 @@ async def sync_crawl_state( if await self.set_state( "running", status, - crawl.id, + crawl, allowed_from=["starting", "waiting_capacity"], ): self.run_task( @@ -725,12 +724,16 @@ async def sync_crawl_state( page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") while page_crawled: page_dict = json.loads(page_crawled) - await self.page_ops.add_page_to_db(page_dict, crawl.id, crawl.oid) + await self.page_ops.add_page_to_db( + page_dict, crawl.id, crawl.qa_source_crawl_id, crawl.oid + ) page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") while crawl_error: - await self.crawl_ops.add_crawl_error(crawl.id, crawl_error) + await self.crawl_ops.add_crawl_error( + crawl.id, crawl.qa_source_crawl_id, crawl_error + ) crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") # ensure filesAdded and filesAddedSize always set @@ -828,9 +831,8 @@ def handle_terminated_pod(self, name, role, status, terminated): async def increment_pod_exec_time( self, pods: dict[str, dict], + crawl: CrawlSpec, status: CrawlStatus, - crawl_id: str, - oid: UUID, min_duration=0, ) -> None: """inc exec time tracking""" @@ -916,7 +918,7 @@ async def increment_pod_exec_time( if exec_time: if not await self.crawl_ops.inc_crawl_exec_time( - crawl_id, exec_time, status.lastUpdatedTime + crawl.id, crawl.qa_source_crawl_id, exec_time, status.lastUpdatedTime ): # if lastUpdatedTime is same as previous, something is wrong, don't update! print( @@ -925,7 +927,7 @@ async def increment_pod_exec_time( ) return - await self.org_ops.inc_org_time_stats(oid, exec_time, True) + await self.org_ops.inc_org_time_stats(crawl.oid, exec_time, True) status.crawlExecTime += exec_time status.elapsedCrawlTime += max_duration @@ -1034,7 +1036,9 @@ async def add_file_to_crawl(self, cc_data, crawl, redis): await redis.incr("filesAddedSize", filecomplete.size) - await self.crawl_ops.add_crawl_file(crawl.id, crawl_file, filecomplete.size) + await self.crawl_ops.add_crawl_file( + crawl.id, crawl.qa_source_crawl_id, crawl_file, filecomplete.size + ) try: await self.background_job_ops.create_replica_jobs( @@ -1115,7 +1119,9 @@ async def update_crawl_state( status.size = stats["size"] status.sizeHuman = humanize.naturalsize(status.size) - await self.crawl_ops.update_running_crawl_stats(crawl.id, stats) + await self.crawl_ops.update_running_crawl_stats( + crawl.id, crawl.qa_source_crawl_id, stats + ) for key, value in sizes.items(): value = int(value) @@ -1151,9 +1157,7 @@ async def update_crawl_state( # check if one-page crawls actually succeeded # if only one page found, and no files, assume failed if status.pagesFound == 1 and not status.filesAdded: - await self.fail_crawl( - crawl.id, crawl.cid, crawl.oid, status, pods, stats - ) + await self.fail_crawl(crawl, status, pods, stats) return status if status.stopReason in ("stopped_by_user", "stopped_quota_reached"): @@ -1161,21 +1165,15 @@ async def update_crawl_state( else: state = "complete" - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, state, crawl, stats - ) + await self.mark_finished(crawl, status, state, stats) # check if all crawlers failed elif status_count.get("failed", 0) >= crawl.scale: # if stopping, and no pages finished, mark as canceled if status.stopping and not status.pagesDone: - await self.mark_finished( - crawl.id, crawl.cid, crawl.oid, status, "canceled", crawl, stats - ) + await self.mark_finished(crawl, status, 
"canceled", stats) else: - await self.fail_crawl( - crawl.id, crawl.cid, crawl.oid, status, pods, stats - ) + await self.fail_crawl(crawl, status, pods, stats) # check for other statuses else: @@ -1192,7 +1190,7 @@ async def update_crawl_state( new_status = "pending-wait" if new_status: await self.set_state( - new_status, status, crawl.id, allowed_from=RUNNING_STATES + new_status, status, crawl, allowed_from=RUNNING_STATES ) return status @@ -1200,12 +1198,9 @@ async def update_crawl_state( # pylint: disable=too-many-arguments async def mark_finished( self, - crawl_id: str, - cid: UUID, - oid: UUID, + crawl: CrawlSpec, status: CrawlStatus, state: str, - crawl=None, stats=None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" @@ -1223,7 +1218,7 @@ async def mark_finished( # if set_state returns false, already set to same status, return if not await self.set_state( - state, status, crawl_id, allowed_from=allowed_from, **kwargs + state, status, crawl, allowed_from=allowed_from, **kwargs ): print("already finished, ignoring mark_finished") if not status.finished: @@ -1233,14 +1228,15 @@ async def mark_finished( status.finished = to_k8s_date(finished) - if crawl and state in SUCCESSFUL_STATES: - await self.inc_crawl_complete_stats(crawl, finished) + if not crawl.qa_source_crawl_id: + if crawl and state in SUCCESSFUL_STATES: + await self.inc_crawl_complete_stats(crawl, finished) - self.run_task( - self.do_crawl_finished_tasks( - crawl_id, cid, oid, status.filesAddedSize, state + self.run_task( + self.do_crawl_finished_tasks( + crawl.id, crawl.cid, crawl.oid, status.filesAddedSize, state + ) ) - ) return True diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index b7cd0bf1b2..31d488bcf1 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -60,7 +60,7 @@ class CrawlSpec(BaseModel): scheduled: bool = False timeout: int = 0 max_crawl_size: int = 0 - qa_source_crawl_id: Optional[str] + qa_source_crawl_id: str = "" # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 91c5c3bb7e..3e0a31b720 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -57,18 +57,26 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): if not page_dict.get("url"): continue - await self.add_page_to_db(page_dict, crawl_id, crawl.oid) + await self.add_page_to_db(page_dict, crawl_id, "", crawl.oid) # pylint: disable=broad-exception-caught, raise-missing-from except Exception as err: print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) - async def add_page_to_db(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID): + async def add_page_to_db( + self, + page_dict: Dict[str, Any], + crawl_or_qa_run_id: str, + qa_source_crawl_id: str, + oid: UUID, + ): """Add page to database""" page_id = page_dict.get("id") if not page_id: print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) page_id = uuid4() + crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + try: status = page_dict.get("status") if not status and page_dict.get("loadState"): From f2ca46cd16fdd2a646e07b4c18b203a9fc5452b7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 8 Mar 2024 14:58:57 -0800 Subject: [PATCH 12/49] store qa compare data in unified PageQACompare from page_dict --- backend/btrixcloud/models.py | 22 ++++++++++----------- backend/btrixcloud/pages.py | 38 
++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 5ea4a933fc..af7a4f1db8 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1449,6 +1449,15 @@ class PageNote(BaseModel): userName: str +# ============================================================================ +class PageQACompare(BaseModel): + """Model for updating pages from QA run""" + + screenshotMatch: Optional[int] = None + textMatch: Optional[int] = None + resourceCounts: Optional[Dict[str, Dict[str, int]]] + + # ============================================================================ class Page(BaseMongoModel): """Model for crawl pages""" @@ -1462,9 +1471,7 @@ class Page(BaseMongoModel): status: Optional[int] = None # automated heuristics, keyed by QA run id - screenshotMatch: Optional[Dict[str, float]] = {} - textMatch: Optional[Dict[str, float]] = {} - resourceCounts: Optional[Dict[str, Dict[str, int]]] = {} + qa: Optional[Dict[str, PageQACompare]] = {} # manual review userid: Optional[UUID] = None @@ -1478,12 +1485,3 @@ class PageOut(Page): """Model for pages output""" status: Optional[int] = 200 - - -# ============================================================================ -class PageQAUpdate(BaseModel): - """Model for updating pages from QA run""" - - screenshotMatch: Optional[int] = None - textMatch: Optional[int] = None - resourceCounts: Optional[Dict[str, Dict[str, int]]] diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 3e0a31b720..bd7b59473f 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -11,7 +11,7 @@ Page, PageOut, PageReviewUpdate, - PageQAUpdate, + PageQACompare, Organization, PaginatedResponse, User, @@ -108,6 +108,17 @@ async def add_page_to_db( f"Error adding page {page_id} from crawl {crawl_id} to db: {err}", flush=True, ) + return + + # qa data + if qa_source_crawl_id: + compare_dict = page_dict.get("compare") + if compare_dict is None: + print("QA Run, but compare data missing!") + return + + compare = PageQACompare(**compare_dict) + await self.add_qa_compare(page_id, oid, crawl_or_qa_run_id, compare) async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): """Delete crawl pages from db""" @@ -149,38 +160,23 @@ async def get_page( page_raw = await self.get_page_raw(page_id, oid, crawl_id) return Page.from_dict(page_raw) - async def update_page_qa( - self, - page_id: UUID, - oid: UUID, - qa_run_id: str, - update: PageQAUpdate, + async def add_qa_compare( + self, page_id: UUID, oid: UUID, qa_run_id: str, compare: PageQACompare ) -> Dict[str, bool]: """Update page heuristics and mime/type from QA run""" - query = update.dict(exclude_unset=True) - if len(query) == 0: - raise HTTPException(status_code=400, detail="no_update_data") - - keyed_fields = ("screenshotMatch", "textMatch", "resourceCounts") - for field in keyed_fields: - score = query.get(field) - if score: - query[f"{field}.{qa_run_id}"] = score - query.pop(field, None) - - query["modified"] = datetime.utcnow().replace(microsecond=0, tzinfo=None) + # modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) result = await self.pages.find_one_and_update( {"_id": page_id, "oid": oid}, - {"$set": query}, + {"$set": {f"qa.{qa_run_id}", compare.dict()}}, return_document=pymongo.ReturnDocument.AFTER, ) if not result: raise HTTPException(status_code=404, detail="page_not_found") - return {"updated": True} + return True 
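For reference, a minimal standalone sketch of the keyed layout this update writes, assuming a motor client and hypothetical database, collection and score values; only the "qa.<qa_run_id>" key and the PageQACompare field names come from the models above, and "$set" takes a field-to-value mapping:

    from motor.motor_asyncio import AsyncIOMotorClient

    async def save_and_read_qa_compare(page_id, oid, qa_run_id):
        # database/collection names are assumptions for illustration
        pages = AsyncIOMotorClient()["btrix"]["pages"]
        compare = {"screenshotMatch": 98, "textMatch": 95, "resourceCounts": {}}
        # one PageQACompare dict per QA run, keyed under "qa.<qa_run_id>" on the page
        await pages.find_one_and_update(
            {"_id": page_id, "oid": oid},
            {"$set": {f"qa.{qa_run_id}": compare}},
        )
        # read a single run's comparison back with a dotted-path projection
        return await pages.find_one({"_id": page_id}, {f"qa.{qa_run_id}": True})

Keying by run id is what lets the Page model above hold results from multiple QA runs side by side.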
async def update_page_approval( self, From 3e1bda2ae5e6338f3d066554064f6e87f6298ee3 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 8 Mar 2024 17:51:35 -0800 Subject: [PATCH 13/49] fixes --- backend/btrixcloud/basecrawls.py | 2 ++ backend/btrixcloud/operator/crawls.py | 13 ++++++++----- chart/templates/priorities.yaml | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index b81820e8ce..15d1cacae0 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -156,6 +156,8 @@ async def get_crawl( res.pop("files", None) res.pop("errors", None) + print("RES", res) + crawl = cls_type.from_dict(res) if crawl.type == "crawl": diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 4488182c20..9770076cf5 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -132,11 +132,11 @@ async def sync_crawls(self, data: MCSyncData): if data.finalizing: if not status.finished: # if can't cancel, already finished - if not await self.cancel_crawl(crawl, status, data.children[POD]): + await self.cancel_crawl(crawl, status, data.children[POD]) # instead of fetching the state (that was already set) # return exception to ignore this request, keep previous # finished state - raise HTTPException(status_code=400, detail="out_of_sync_status") + # raise HTTPException(status_code=400, detail="out_of_sync_status") return await self.finalize_response( crawl, @@ -233,10 +233,13 @@ async def sync_crawls(self, data: MCSyncData): storage_path = crawl.storage.get_storage_extra_path(oid) storage_secret = crawl.storage.get_storage_secret_name(oid) + if not crawl.qa_source_crawl_id: + params["profile_filename"] = configmap["PROFILE_FILENAME"] + else: + storage_path += "qa/" + params["storage_path"] = storage_path params["storage_secret"] = storage_secret - if not spec.qa_source_crawl_id: - params["profile_filename"] = configmap["PROFILE_FILENAME"] # only resolve if not already set # not automatically updating image for existing crawls @@ -262,7 +265,7 @@ async def sync_crawls(self, data: MCSyncData): else: params["force_restart"] = False - if spec.qa_source_crawl_id: + if crawl.qa_source_crawl_id: params["qa_source"] = "" # self.get_crawl_replay_url() for i in range(0, status.scale): diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 7eec9cb1a1..42427df881 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -17,7 +17,7 @@ apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: name: crawl-qa-instance-{{ . }} -value: -{{ add .Values.max_crawl_scale . }} +value: -{{ add 100 . }} globalDefault: false description: "Priority for QA crawl instance #{{ . 
}}" @@ -29,7 +29,7 @@ apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: name: bg-jobs -value: -100 +value: -1000 globalDefault: false description: "Priority for background jobs" From 8b0e6c7b6c57e2e7b87f3b6aed1c963a0b9911e8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 8 Mar 2024 21:04:09 -0800 Subject: [PATCH 14/49] refactor basecrawls / crawls / uploads getters: - rename original get_crawl -> get_crawl_out as its used for api response - basecrawls has get_base_crawl and get_crawl_raw and get_crawl_out - crawls has get_crawl - uploads has get_uplads --- backend/btrixcloud/background_jobs.py | 10 +- backend/btrixcloud/basecrawls.py | 147 +++++++++---------------- backend/btrixcloud/crawls.py | 148 +++++++++++++++++++------- backend/btrixcloud/main.py | 4 +- backend/btrixcloud/main_op.py | 2 +- backend/btrixcloud/models.py | 7 +- backend/btrixcloud/operator/crawls.py | 24 +++-- backend/btrixcloud/pages.py | 9 +- backend/btrixcloud/uploads.py | 17 ++- backend/btrixcloud/webhooks.py | 4 +- chart/app-templates/crawler.yaml | 7 +- 11 files changed, 216 insertions(+), 163 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index e77f96077d..b8cd420ea5 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -403,11 +403,11 @@ async def get_replica_job_file( profile = await self.profile_ops.get_profile(UUID(job.object_id), org) return BaseFile(**profile.resource.dict()) - item_res = await self.base_crawl_ops.get_crawl_raw(job.object_id, org) - matching_file = [ - f for f in item_res.get("files", []) if f["filename"] == job.file_path - ][0] - return BaseFile(**matching_file) + item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org) + matching_file = [f for f in item_res.files if f.filename == job.file_path][ + 0 + ] + return matching_file # pylint: disable=broad-exception-caught, raise-missing-from except Exception: raise HTTPException(status_code=404, detail="file_not_found") diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 15d1cacae0..ef1d3cf75e 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -5,7 +5,6 @@ from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast from uuid import UUID import urllib.parse -import contextlib import asyncio from fastapi import HTTPException, Depends @@ -30,7 +29,6 @@ if TYPE_CHECKING: from .crawlconfigs import CrawlConfigOps - from .crawlmanager import CrawlManager from .users import UserManager from .orgs import OrgOps from .colls import CollectionOps @@ -41,7 +39,7 @@ else: CrawlConfigOps = UserManager = OrgOps = CollectionOps = PageOps = object - CrawlManager = StorageOps = EventWebhookOps = BackgroundJobOps = object + StorageOps = EventWebhookOps = BackgroundJobOps = object # Presign duration must be less than 604800 seconds (one week), # so set this one minute short of a week. 
@@ -57,7 +55,6 @@ class BaseCrawlOps: # pylint: disable=duplicate-code, too-many-arguments, too-many-locals crawl_configs: CrawlConfigOps - crawl_manager: CrawlManager user_manager: UserManager orgs: OrgOps colls: CollectionOps @@ -71,7 +68,6 @@ def __init__( mdb, users: UserManager, orgs: OrgOps, - crawl_manager: CrawlManager, crawl_configs: CrawlConfigOps, colls: CollectionOps, storage_ops: StorageOps, @@ -79,7 +75,6 @@ def __init__( background_job_ops: BackgroundJobOps, ): self.crawls = mdb["crawls"] - self.crawl_manager = crawl_manager self.crawl_configs = crawl_configs self.user_manager = users self.orgs = orgs @@ -107,7 +102,7 @@ async def get_crawl_raw( org: Optional[Organization] = None, type_: Optional[str] = None, project: Optional[dict[str, bool]] = None, - ): + ) -> Dict[str, Any]: """Get data for single crawl""" query: dict[str, object] = {"_id": crawlid} @@ -133,32 +128,49 @@ async def _files_to_resources( crawl_files = [CrawlFile(**data) for data in files] return await self._resolve_signed_urls(crawl_files, org, crawlid, qa_run_id) - async def get_crawl( + async def get_wacz_files(self, crawl_id: str, org: Organization): + """Return list of WACZ files associated with crawl.""" + wacz_files = [] + crawl = await self.get_base_crawl(crawl_id, org) + for file_ in crawl.files: + if file_.filename.endswith(".wacz"): + wacz_files.append(file_) + return wacz_files + + async def get_base_crawl( self, crawlid: str, org: Optional[Organization] = None, type_: Optional[str] = None, - cls_type: Type[Union[CrawlOut, CrawlOutWithResources]] = CrawlOutWithResources, - ): - """Get data for single base crawl""" + project: Optional[dict[str, bool]] = None, + ) -> BaseCrawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, type_, project) + return BaseCrawl.from_dict(res) + + async def get_crawl_out( + self, + crawlid: str, + org: Optional[Organization] = None, + type_: Optional[str] = None, + skip_resources=False, + ) -> CrawlOutWithResources: + """Get crawl data for api output""" res = await self.get_crawl_raw(crawlid, org, type_) - if cls_type == CrawlOutWithResources: + if not skip_resources: res["resources"] = await self._files_to_resources( res.get("files"), org, crawlid ) - if res.get("collectionIds"): - res["collections"] = await self.colls.get_collection_names( - res.get("collectionIds") - ) + coll_ids = res.get("collectionIds") + if coll_ids: + res["collections"] = await self.colls.get_collection_names(coll_ids) res.pop("files", None) res.pop("errors", None) - print("RES", res) - - crawl = cls_type.from_dict(res) + crawl = CrawlOutWithResources.from_dict(res) if crawl.type == "crawl": crawl = await self._resolve_crawl_refs(crawl, org) @@ -186,9 +198,9 @@ async def _update_crawl_collections( self, crawl_id: str, org: Organization, collection_ids: List[UUID] ): """Update crawl collections to match updated list.""" - crawl = await self.get_crawl(crawl_id, org, cls_type=CrawlOut) + crawl = await self.get_crawl_out(crawl_id, org, skip_resources=True) - prior_coll_ids = set(crawl.collectionIds) + prior_coll_ids = set(crawl.collectionIds or []) updated_coll_ids = set(collection_ids) # Add new collections @@ -268,50 +280,7 @@ async def add_crawl_file_replica( ) async def shutdown_crawl(self, crawl_id: str, org: Organization, graceful: bool): - """stop or cancel specified crawl""" - crawl = await self.get_crawl_raw(crawl_id, org) - if crawl.get("type") != "crawl": - return - - result = None - try: - result = await self.crawl_manager.shutdown_crawl( - 
crawl_id, graceful=graceful - ) - - if result.get("success"): - if graceful: - await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl", "oid": org.id}, - {"$set": {"stopping": True}}, - ) - return result - - except Exception as exc: - # pylint: disable=raise-missing-from - # if reached here, probably crawl doesn't exist anymore - raise HTTPException( - status_code=404, detail=f"crawl_not_found, (details: {exc})" - ) - - # if job no longer running, canceling is considered success, - # but graceful stoppage is not possible, so would be a failure - if result.get("error") == "Not Found": - if not graceful: - await self.update_crawl_state(crawl_id, "canceled") - crawl = await self.get_crawl_raw(crawl_id, org) - if not await self.crawl_configs.stats_recompute_last( - crawl["cid"], 0, -1 - ): - raise HTTPException( - status_code=404, - detail=f"crawl_config_not_found: {crawl['cid']}", - ) - - return {"success": True} - - # return whatever detail may be included in the response - raise HTTPException(status_code=400, detail=result) + """placeholder, implemented in crawls, base version does nothing""" async def delete_crawls( self, @@ -326,17 +295,17 @@ async def delete_crawls( size = 0 for crawl_id in delete_list.crawl_ids: - crawl = await self.get_crawl_raw(crawl_id, org) - if crawl.get("type") != type_: + crawl = await self.get_base_crawl(crawl_id, org) + if crawl.type != type_: continue # Ensure user has appropriate permissions for all crawls in list: # - Crawler users can delete their own crawls # - Org owners can delete any crawls in org - if user and (crawl.get("userid") != user.id) and not org.is_owner(user): + if user and (crawl.userid != user.id) and not org.is_owner(user): raise HTTPException(status_code=403, detail="not_allowed") - if type_ == "crawl" and not crawl.get("finished"): + if type_ == "crawl" and not crawl.finished: try: await self.shutdown_crawl(crawl_id, org, graceful=False) except Exception as exc: @@ -351,7 +320,7 @@ async def delete_crawls( crawl_size = await self._delete_crawl_files(crawl, org) size += crawl_size - cid = crawl.get("cid") + cid = str(crawl.cid) if cid: if cids_to_update.get(cid): cids_to_update[cid]["inc"] += 1 @@ -381,9 +350,8 @@ async def delete_crawls( return res.deleted_count, cids_to_update, quota_reached - async def _delete_crawl_files(self, crawl_raw: Dict[str, Any], org: Organization): + async def _delete_crawl_files(self, crawl: BaseCrawl, org: Organization): """Delete files associated with crawl from storage.""" - crawl = BaseCrawl.from_dict(crawl_raw) size = 0 for file_ in crawl.files: size += file_.size @@ -397,9 +365,9 @@ async def _delete_crawl_files(self, crawl_raw: Dict[str, Any], org: Organization async def delete_crawl_files(self, crawl_id: str, oid: UUID): """Delete crawl files""" - crawl_raw = await self.get_crawl_raw(crawl_id) + crawl = await self.get_base_crawl(crawl_id) org = await self.orgs.get_org_by_id(oid) - return await self._delete_crawl_files(crawl_raw, org) + return await self._delete_crawl_files(crawl, org) async def _resolve_crawl_refs( self, @@ -494,25 +462,13 @@ async def _resolve_signed_urls( return out_files - @contextlib.asynccontextmanager - async def get_redis(self, crawl_id): - """get redis url for crawl id""" - redis_url = self.crawl_manager.get_redis_url(crawl_id) - - redis = await self.crawl_manager.get_redis_client(redis_url) - - try: - yield redis - finally: - await redis.close() - async def add_to_collection( self, crawl_ids: List[str], collection_id: UUID, org: Organization ): """Add 
crawls to collection.""" for crawl_id in crawl_ids: - crawl_raw = await self.get_crawl_raw(crawl_id, org) - crawl_collections = crawl_raw.get("collectionIds") + crawl = await self.get_base_crawl(crawl_id, org) + crawl_collections = crawl.collectionIds if crawl_collections and crawl_id in crawl_collections: raise HTTPException( status_code=400, detail="crawl_already_in_collection" @@ -662,11 +618,10 @@ async def delete_crawls_all_types( uploads: list[str] = [] for crawl_id in delete_list.crawl_ids: - crawl = await self.get_crawl_raw(crawl_id, org) - type_ = crawl.get("type") - if type_ == "crawl": + crawl = await self.get_base_crawl(crawl_id, org) + if crawl.type == "crawl": crawls.append(crawl_id) - if type_ == "upload": + if crawl.type == "upload": uploads.append(crawl_id) crawls_length = len(crawls) @@ -817,7 +772,7 @@ async def get_all_crawls_search_values( response_model=CrawlOutWithResources, ) async def get_base_crawl(crawl_id: str, org: Organization = Depends(org_crawl_dep)): - return await ops.get_crawl(crawl_id, org) + return await ops.get_crawl_out(crawl_id, org) @app.get( "/orgs/all/all-crawls/{crawl_id}/replay.json", @@ -828,15 +783,15 @@ async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None) + return await ops.get_crawl_out(crawl_id, None) @app.get( "/orgs/{oid}/all-crawls/{crawl_id}/replay.json", tags=["all-crawls"], response_model=CrawlOutWithResources, ) - async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org) + async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_crawl_out(crawl_id, org) @app.patch("/orgs/{oid}/all-crawls/{crawl_id}", tags=["all-crawls"]) async def update_crawl( diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 69048fed30..544393e4e6 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -4,6 +4,7 @@ import json import re +import contextlib import urllib.parse from datetime import datetime from uuid import UUID @@ -18,6 +19,7 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv from .basecrawls import BaseCrawlOps +from .crawlmanager import CrawlManager from .models import ( UpdateCrawl, DeleteCrawlList, @@ -42,13 +44,15 @@ # ============================================================================ +# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods class CrawlOps(BaseCrawlOps): """Crawl Ops""" - # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods + crawl_manager: CrawlManager - def __init__(self, *args): + def __init__(self, crawl_manager: CrawlManager, *args): super().__init__(*args) + self.crawl_manager = crawl_manager self.crawl_configs.set_crawl_ops(self) self.colls.set_crawl_ops(self) self.event_webhook_ops.set_crawl_ops(self) @@ -79,6 +83,28 @@ async def init_index(self): await self.crawls.create_index([("state", pymongo.HASHED)]) await self.crawls.create_index([("fileSize", pymongo.DESCENDING)]) + async def get_crawl( + self, + crawlid: str, + org: Optional[Organization] = None, + project: Optional[dict[str, bool]] = None, + ) -> Crawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, "crawl", project) + return 
Crawl.from_dict(res) + + @contextlib.asynccontextmanager + async def get_redis(self, crawl_id): + """get redis url for crawl id""" + redis_url = self.crawl_manager.get_redis_url(crawl_id) + + redis = await self.crawl_manager.get_redis_client(redis_url) + + try: + yield redis + finally: + await redis.close() + async def list_crawls( self, org: Optional[Organization] = None, @@ -225,16 +251,6 @@ async def delete_crawls( return {"deleted": True, "storageQuotaReached": quota_reached} - async def get_wacz_files(self, crawl_id: str, org: Organization): - """Return list of WACZ files associated with crawl.""" - wacz_files = [] - crawl_raw = await self.get_crawl_raw(crawl_id, org) - crawl = Crawl.from_dict(crawl_raw) - for file_ in crawl.files: - if file_.filename.endswith(".wacz"): - wacz_files.append(file_) - return wacz_files - # pylint: disable=too-many-arguments async def add_new_crawl( self, @@ -288,9 +304,9 @@ async def update_crawl_scale( self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User ): """Update crawl scale in the db""" - crawl = await self.get_crawl_raw(crawl_id, org) + crawl = await self.get_crawl(crawl_id, org) update = UpdateCrawlConfig(scale=crawl_scale.scale) - await self.crawl_configs.update_crawl_config(crawl["cid"], org, user, update) + await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update) result = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl", "oid": org.id}, @@ -391,11 +407,11 @@ async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add): """add new exclusion to config or remove exclusion from config for given crawl_id, update config on crawl""" - crawl_raw = await self.get_crawl_raw(crawl_id, org, project={"cid": True}) + crawl = await self.get_crawl(crawl_id, org, project={"cid": True}) - cid = crawl_raw.get("cid") + cid = crawl.cid - scale = crawl_raw.get("scale", 1) + scale = crawl.scale or 1 async with self.get_redis(crawl_id) as redis: query = { @@ -498,7 +514,9 @@ async def add_crawl_error( ): """add crawl error from redis to mongodb errors field""" crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - field = "errors" if not qa_source_crawl_id else "qa.{crawl_or_qa_run_id}.errors" + field = ( + "errors" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}.errors" + ) await self.crawls.find_one_and_update( {"_id": crawl_id}, {"$push": {field: error}} @@ -509,7 +527,7 @@ async def add_crawl_file( ): """add new crawl file to crawl""" crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - prefix = "" if not qa_source_crawl_id else "qa.{crawl_or_qa_run_id}." + prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." 
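# with the interpolated prefix, the $push/$inc below target either the crawl's
# own files/fileCount/fileSize or qa.<run_id>.files etc. when this is a QA run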
await self.crawls.find_one_and_update( {"_id": crawl_id}, @@ -530,9 +548,10 @@ async def get_crawl_seeds( skip = (page - 1) * page_size upper_bound = skip + page_size - crawl_raw = await self.get_crawl_raw(crawl_id, org) + crawl = await self.get_crawl(crawl_id, org) + if not crawl.config or not crawl.config.seeds: + return [], 0 try: - crawl = Crawl.from_dict(crawl_raw) return crawl.config.seeds[skip:upper_bound], len(crawl.config.seeds) # pylint: disable=broad-exception-caught except Exception: @@ -606,14 +625,63 @@ async def get_crawl_stats( return crawls_data + async def shutdown_crawl( + self, crawl_id: str, org: Organization, graceful: bool + ) -> Dict[str, bool]: + """stop or cancel specified crawl""" + crawl = await self.get_base_crawl(crawl_id, org) + if crawl and crawl.type != "crawl": + raise HTTPException(status_code=400, detail="not_a_crawl") + + result = None + try: + result = await self.crawl_manager.shutdown_crawl( + crawl_id, graceful=graceful + ) + + if result.get("success"): + if graceful: + await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl", "oid": org.id}, + {"$set": {"stopping": True}}, + ) + return result + + except Exception as exc: + # pylint: disable=raise-missing-from + # if reached here, probably crawl doesn't exist anymore + raise HTTPException( + status_code=404, detail=f"crawl_not_found, (details: {exc})" + ) + + # if job no longer running, canceling is considered success, + # but graceful stoppage is not possible, so would be a failure + if result.get("error") == "Not Found": + if not graceful: + await self.update_crawl_state(crawl_id, "canceled") + crawl = await self.get_crawl(crawl_id, org) + if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1): + raise HTTPException( + status_code=404, + detail=f"crawl_config_not_found: {crawl.cid}", + ) + + return {"success": True} + + # return whatever detail may be included in the response + raise HTTPException(status_code=400, detail=result) + async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User): """Start crawl QA run""" - crawl = await self.get_crawl(crawl_id, org, "crawl") + crawl = await self.get_crawl(crawl_id, org) if crawl.qa_active: raise HTTPException(status_code=400, detail="qa_already_running") + if not crawl.cid: + raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") + crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org) try: @@ -655,14 +723,14 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): """Stop crawl QA run""" - crawl = await self.get_crawl(crawl_id, org, "crawl") + crawl = await self.get_crawl(crawl_id, org) - if not crawl.active_qa: + if not crawl.qa_active: raise HTTPException(status_code=400, detail="invalid_qa_run_id") try: result = await self.crawl_manager.shutdown_crawl( - crawl.active_qa, graceful=True + crawl.qa_active, graceful=True ) if result.get("error") == "Not Found": @@ -688,23 +756,24 @@ async def get_qa_runs( self, crawl_id: str, org: Optional[Organization] = None ) -> List[QARun]: """Return list of QA runs""" - crawl = await self.get_crawl(crawl_id, org, "crawl") - return list(crawl.qa.values()) or [] + crawl = await self.get_crawl(crawl_id, org) + return list(crawl.qa.values()) if crawl.qa else [] async def get_qa_run_for_replay( self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None ) -> QARun: """Fetch QA runs with resources for replay.json""" - crawl = 
await self.get_crawl(crawl_id, org, "crawl") + crawl = await self.get_crawl(crawl_id, org) + qa = crawl.qa or {} + qa_run = qa.get(qa_run_id) - qa_run = crawl.qa.get(qa_run_id) if not qa_run: raise HTTPException(status_code=404, detail="crawl_qa_not_found") resources = await self._files_to_resources( qa_run.files, org, crawlid=crawl_id, qa_run_id=qa_run_id ) - qa_run.files = None + qa_run.files = [] return QARunWithResources(resources=resources, **qa_run.dict()) @@ -729,11 +798,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id): # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, too-many-statements -def init_crawls_api(app, user_dep, *args): +def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code - ops = CrawlOps(*args) + ops = CrawlOps(crawl_manager, *args) org_viewer_dep = ops.orgs.org_viewer_dep org_crawl_dep = ops.orgs.org_crawl_dep @@ -883,15 +952,15 @@ async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None, "crawl") + return await ops.get_crawl_out(crawl_id, None, "crawl") @app.get( "/orgs/{oid}/crawls/{crawl_id}/replay.json", tags=["crawls"], response_model=CrawlOutWithResources, ) - async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org, "crawl") + async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): + return await ops.get_crawl_out(crawl_id, org, "crawl") # QA APIs # --------------------- @@ -1076,7 +1145,7 @@ async def stream_crawl_logs( logLevel: Optional[str] = None, context: Optional[str] = None, ): - crawl = await ops.get_crawl(crawl_id, org, "crawl") + crawl = await ops.get_crawl(crawl_id, org) log_levels = [] contexts = [] @@ -1111,14 +1180,13 @@ async def get_crawl_errors( page: int = 1, org: Organization = Depends(org_viewer_dep), ): - crawl_raw = await ops.get_crawl_raw(crawl_id, org) - crawl = Crawl.from_dict(crawl_raw) + crawl = await ops.get_crawl(crawl_id, org) skip = (page - 1) * pageSize upper_bound = skip + pageSize - errors = crawl.errors[skip:upper_bound] + errors = crawl.errors[skip:upper_bound] if crawl.errors else [] parsed_errors = parse_jsonl_error_messages(errors) - return paginated_format(parsed_errors, len(crawl.errors), page, pageSize) + return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize) return ops diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 4afd8afea0..c6f7df3ded 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -126,10 +126,10 @@ def main(): base_crawl_init = ( app, current_active_user, + # to basecrawls mdb, user_manager, org_ops, - crawl_manager, crawl_config_ops, coll_ops, storage_ops, @@ -139,7 +139,7 @@ def main(): base_crawl_ops = init_base_crawls_api(*base_crawl_init) - crawls = init_crawls_api(*base_crawl_init) + crawls = init_crawls_api(crawl_manager, *base_crawl_init) page_ops = init_pages_api( app, mdb, crawls, org_ops, storage_ops, current_active_user diff --git a/backend/btrixcloud/main_op.py b/backend/btrixcloud/main_op.py index ba6555dba2..e278a3644a 100644 --- a/backend/btrixcloud/main_op.py +++ b/backend/btrixcloud/main_op.py @@ -77,10 +77,10 @@ def main(): coll_ops = CollectionOps(mdb, 
crawl_manager, org_ops, event_webhook_ops) crawl_ops = CrawlOps( + crawl_manager, mdb, user_manager, org_ops, - crawl_manager, crawl_config_ops, coll_ops, storage_ops, diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index af7a4f1db8..a7b32d6c7f 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -532,7 +532,7 @@ class ReviewStatus(str, Enum): # ============================================================================ -class CoreCrawlable: +class CoreCrawlable(BaseModel): # pylint: disable=too-few-public-methods """Core properties for crawlable run (crawl or qa run)""" @@ -554,7 +554,7 @@ class CoreCrawlable: stats: Optional[Dict[str, int]] = None - files: Optional[List[CrawlFile]] = [] + files: List[CrawlFile] = [] fileSize: int = 0 fileCount: int = 0 @@ -569,6 +569,7 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): type: str oid: UUID + cid: Optional[UUID] = None name: Optional[str] = "" @@ -709,7 +710,7 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] - active_qa: Optional[str] + qa_active: Optional[str] qa: Optional[Dict[str, QARun]] = {} diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 9770076cf5..fa278288d8 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -7,7 +7,6 @@ import json from uuid import UUID -from fastapi import HTTPException import humanize @@ -133,10 +132,10 @@ async def sync_crawls(self, data: MCSyncData): if not status.finished: # if can't cancel, already finished await self.cancel_crawl(crawl, status, data.children[POD]) - # instead of fetching the state (that was already set) - # return exception to ignore this request, keep previous - # finished state - # raise HTTPException(status_code=400, detail="out_of_sync_status") + # instead of fetching the state (that was already set) + # return exception to ignore this request, keep previous + # finished state + # raise HTTPException(status_code=400, detail="out_of_sync_status") return await self.finalize_response( crawl, @@ -266,7 +265,7 @@ async def sync_crawls(self, data: MCSyncData): params["force_restart"] = False if crawl.qa_source_crawl_id: - params["qa_source"] = "" # self.get_crawl_replay_url() + params["qa_source_json"] = self.get_crawl_replay_json(crawl.qs_source_crawl_id) if len(pods) else "" for i in range(0, status.scale): children.extend(self._load_crawler(params, i, status, data.children)) @@ -314,6 +313,10 @@ def _load_crawler(self, params, i, status, children): return self.load_from_yaml("crawler.yaml", params) + async def _get_crawl_replay_json(self, crawl_id): + crawl = await self.crawl_ops.get_crawl_out(crawl_id) + return json.dumps(crawl.dict()) + # pylint: disable=too-many-arguments async def _resolve_scale( self, @@ -1043,6 +1046,10 @@ async def add_file_to_crawl(self, cc_data, crawl, redis): crawl.id, crawl.qa_source_crawl_id, crawl_file, filecomplete.size ) + # no replicas for QA for now + if crawl.qa_source_crawl_id: + return True + try: await self.background_job_ops.create_replica_jobs( crawl.oid, crawl_file, crawl.id, "crawl" @@ -1231,6 +1238,7 @@ async def mark_finished( status.finished = to_k8s_date(finished) + # Regular Crawl Finished if not crawl.qa_source_crawl_id: if crawl and state in SUCCESSFUL_STATES: await self.inc_crawl_complete_stats(crawl, finished) @@ -1241,6 +1249,10 @@ async def mark_finished( ) ) + # QA Run Finished + else: + await self.k8s.delete_crawl_job(crawl.id) + return True # 
pylint: disable=too-many-arguments diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index bd7b59473f..cc8d936c1c 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 +from pprint import pprint + from fastapi import Depends, HTTPException import pymongo @@ -49,7 +51,7 @@ def __init__(self, mdb, crawl_ops, org_ops, storage_ops): async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): """Add pages to database from WACZ files""" try: - crawl = await self.crawl_ops.get_crawl(crawl_id, None) + crawl = await self.crawl_ops.get_crawl_out(crawl_id, None) org = await self.org_ops.get_org_by_id(crawl.oid) wacz_files = await self.crawl_ops.get_wacz_files(crawl_id, org) stream = await self.storage_ops.sync_stream_pages_from_wacz(org, wacz_files) @@ -77,6 +79,9 @@ async def add_page_to_db( crawl_id = qa_source_crawl_id or crawl_or_qa_run_id + print("Adding Page Data") + pprint(page_dict) + try: status = page_dict.get("status") if not status and page_dict.get("loadState"): @@ -162,7 +167,7 @@ async def get_page( async def add_qa_compare( self, page_id: UUID, oid: UUID, qa_run_id: str, compare: PageQACompare - ) -> Dict[str, bool]: + ) -> bool: """Update page heuristics and mime/type from QA run""" # modified = datetime.utcnow().replace(microsecond=0, tzinfo=None) diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 6f8f6474a0..2b3f6e2023 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -39,6 +39,15 @@ class UploadOps(BaseCrawlOps): """upload ops""" + async def get_upload( + self, + crawlid: str, + org: Optional[Organization] = None, + ) -> UploadedCrawl: + """Get crawl data for internal use""" + res = await self.get_crawl_raw(crawlid, org, "upload") + return UploadedCrawl.from_dict(res) + # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods, too-many-function-args # pylint: disable=too-many-arguments, too-many-locals, duplicate-code, invalid-name async def upload_stream( @@ -60,7 +69,7 @@ async def upload_stream( prev_upload = None if replaceId: try: - prev_upload = await self.get_crawl_raw(replaceId, org, "upload") + prev_upload = await self.get_upload(replaceId, org) except HTTPException: # not found replaceId = None @@ -371,7 +380,7 @@ async def list_uploads( response_model=CrawlOut, ) async def get_upload(crawlid: str, org: Organization = Depends(org_crawl_dep)): - return await ops.get_crawl(crawlid, org, "upload") + return await ops.get_crawl_out(crawlid, org, "upload") @app.get( "/orgs/all/uploads/{crawl_id}/replay.json", @@ -382,7 +391,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") - return await ops.get_crawl(crawl_id, None, "upload") + return await ops.get_crawl_out(crawl_id, None, "upload") @app.get( "/orgs/{oid}/uploads/{crawl_id}/replay.json", @@ -390,7 +399,7 @@ async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)): response_model=CrawlOutWithResources, ) async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)): - return await ops.get_crawl(crawl_id, org, "upload") + return await ops.get_crawl_out(crawl_id, org, "upload") @app.patch("/orgs/{oid}/uploads/{crawl_id}", tags=["uploads"]) async def update_uploads_api( diff --git a/backend/btrixcloud/webhooks.py 
b/backend/btrixcloud/webhooks.py index aafe10e80c..5b3ec99243 100644 --- a/backend/btrixcloud/webhooks.py +++ b/backend/btrixcloud/webhooks.py @@ -195,12 +195,12 @@ async def _create_item_finished_notification( body: Union[CrawlFinishedBody, UploadFinishedBody], ): """Create webhook notification for finished crawl/upload.""" - crawl = await self.crawl_ops.get_crawl(crawl_id, org) + crawl = await self.crawl_ops.get_crawl_out(crawl_id, org) if not crawl: print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True) return - body.resources = crawl.resources + body.resources = crawl.resources or [] notification = WebhookNotification( id=uuid4(), diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 1b9aa8b72d..272da96a90 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -115,8 +115,11 @@ spec: {% else %} - qa - --qaSource - - {{ qa_source }} - {% endif %} + - /tmp/ + - --redisStoreUrl + - {{ redis_url }} + - --writePagesToRedis + {% endif %} volumeMounts: {% if not qa_source %} - name: crawl-config From 23f5a24c88792b60e2e7ec1f6013b9e5c5e4dcf7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 9 Mar 2024 09:26:52 -0800 Subject: [PATCH 15/49] various fixes, add separate configmap for qa qa runs work! --- backend/btrixcloud/basecrawls.py | 24 +++++++------- backend/btrixcloud/crawls.py | 30 ++++++++++------- backend/btrixcloud/k8sapi.py | 14 +++----- backend/btrixcloud/operator/crawls.py | 47 ++++++++++++++++++--------- backend/btrixcloud/storages.py | 2 ++ chart/app-templates/crawl_job.yaml | 2 +- chart/app-templates/crawler.yaml | 10 +++--- chart/app-templates/qa_configmap.yaml | 14 ++++++++ chart/templates/operators.yaml | 5 +++ chart/templates/priorities.yaml | 2 +- 10 files changed, 96 insertions(+), 54 deletions(-) create mode 100644 chart/app-templates/qa_configmap.yaml diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index ef1d3cf75e..a53fd7c66b 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -121,7 +121,7 @@ async def get_crawl_raw( async def _files_to_resources( self, files, org, crawlid, qa_run_id: Optional[str] = None - ): + ) -> List[CrawlFileOut]: if not files: return [] @@ -184,15 +184,15 @@ async def get_crawl_out( return crawl - async def get_resource_resolved_raw_crawl( - self, crawlid: str, org: Organization, type_=None - ): - """return single base crawl with resources resolved""" - res = await self.get_crawl_raw(crawlid=crawlid, type_=type_, org=org) - res["resources"] = await self._files_to_resources( - res.get("files"), org, res["_id"] - ) - return res + async def get_internal_crawl_out(self, crawl_id): + """add internal prefix for relative paths""" + crawl_out = await self.get_crawl_out(crawl_id) + resources = crawl_out.resources or [] + for file_ in resources: + if file_.path.startswith("/"): + file_.path = "http://browsertrix-cloud-frontend.default" + file_.path + + return crawl_out async def _update_crawl_collections( self, crawl_id: str, org: Organization, collection_ids: List[UUID] @@ -409,10 +409,10 @@ async def _resolve_signed_urls( org: Organization, crawl_id: Optional[str] = None, qa_run_id: Optional[str] = None, - ): + ) -> List[CrawlFileOut]: if not files: print("no files") - return + return [] delta = timedelta(seconds=self.presign_duration_seconds) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 544393e4e6..e661ed0a03 100644 --- a/backend/btrixcloud/crawls.py +++ 
b/backend/btrixcloud/crawls.py @@ -440,20 +440,25 @@ async def update_crawl_state_if_allowed( qa_source_crawl_id: str, state: str, allowed_from: List[str], - **kwargs, + finished: Optional[datetime] = None, + stats: Optional[Dict[str, Any]] = None, ): """update crawl state and other properties in db if state has changed""" crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - kwargs["state"] = state - if qa_source_crawl_id: - kwargs = {f"qa.{crawl_or_qa_run_id}": kwargs} + prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." + + update: Dict[str, Any] = {f"{prefix}state": state} + if finished: + update[f"{prefix}finished"] = finished + if stats: + update[f"{prefix}stats"] = stats query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"} if allowed_from: - query["state"] = {"$in": allowed_from} + query[f"{prefix}state"] = {"$in": allowed_from} - return await self.crawls.find_one_and_update(query, {"$set": kwargs}) + return await self.crawls.find_one_and_update(query, {"$set": update}) async def update_running_crawl_stats( self, crawl_or_qa_run_id: str, qa_source_crawl_id: str, stats @@ -492,9 +497,11 @@ async def get_crawl_state( ): """return current crawl state of a crawl""" crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - state = "state" if not qa_source_crawl_id else f"$qa.{qa_source_crawl_id}.state" + state = ( + "$state" if not qa_source_crawl_id else f"$qa.{qa_source_crawl_id}.state" + ) finished = ( - "finished" + "$finished" if not qa_source_crawl_id else f"$qa.{qa_source_crawl_id}.finished" ) @@ -504,6 +511,7 @@ async def get_crawl_state( ) if not res: return None, None + print("get_crawl_state", res) return res.get("state"), res.get("finished") async def add_crawl_error( @@ -676,8 +684,8 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) crawl = await self.get_crawl(crawl_id, org) - if crawl.qa_active: - raise HTTPException(status_code=400, detail="qa_already_running") + # if crawl.qa_active: + # raise HTTPException(status_code=400, detail="qa_already_running") if not crawl.cid: raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") @@ -709,7 +717,7 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) {"_id": crawl_id}, { "$set": { - "qa": {qa_run_id: qa_run.dict()}, + f"qa.{qa_run_id}": qa_run.dict(), "qa_active": qa_run_id, } }, diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index e4399dd63d..d429f0b98e 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -30,7 +30,9 @@ def __init__(self): self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers" self.custom_resources = {} - self.templates = Jinja2Templates(directory=get_templates_dir()) + self.templates = Jinja2Templates( + directory=get_templates_dir(), autoescape=False + ) config.load_incluster_config() self.client = client @@ -172,12 +174,10 @@ async def has_storage_secret(self, storage_secret) -> bool: status_code=400, detail="invalid_config_missing_storage_secret" ) - async def delete_crawl_job(self, crawl_id, qa_crawl=False): + async def delete_crawl_job(self, crawl_id): """delete custom crawljob object""" try: name = f"crawljob-{crawl_id}" - if qa_crawl: - name = f"qa-{name}" await self.custom_api.delete_namespaced_custom_object( group="btrix.cloud", @@ -221,9 +221,7 @@ async def get_profile_browser(self, browserid): name=f"profilejob-{browserid}", ) - async def _patch_job( - self, crawl_id, body, pluraltype="crawljobs", qa_crawl=False - ) -> dict: + async def 
_patch_job(self, crawl_id, body, pluraltype="crawljobs") -> dict: content_type = self.api_client.default_headers.get("Content-Type") try: @@ -232,8 +230,6 @@ async def _patch_job( ) name = f"{pluraltype[:-1]}-{crawl_id}" - if qa_crawl: - name = f"qa-{name}" await self.custom_api.patch_namespaced_custom_object( group="btrix.cloud", diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index fa278288d8..ce6d4bd281 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -3,7 +3,8 @@ import traceback import os from pprint import pprint -from typing import Optional +from typing import Optional, Any +from datetime import datetime import json from uuid import UUID @@ -265,7 +266,8 @@ async def sync_crawls(self, data: MCSyncData): params["force_restart"] = False if crawl.qa_source_crawl_id: - params["qa_source_json"] = self.get_crawl_replay_json(crawl.qs_source_crawl_id) if len(pods) else "" + params["qa_source_crawl_id"] = crawl.qa_source_crawl_id + children.extend(await self._load_qa_configmap(params, data.children)) for i in range(0, status.scale): children.extend(self._load_crawler(params, i, status, data.children)) @@ -292,6 +294,20 @@ def _load_redis(self, params, status, children): return self.load_from_yaml("redis.yaml", params) + async def _load_qa_configmap(self, params, children): + qa_source_crawl_id = params["qa_source_crawl_id"] + name = f"qa-replay-{qa_source_crawl_id}" + + if name in children[CMAP]: + return [children[CMAP][name]] + + crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id) + + params["name"] = name + params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) + + return self.load_from_yaml("qa_configmap.yaml", params) + def _load_crawler(self, params, i, status, children): name = f"crawl-{params['id']}-{i}" has_pod = name in children[POD] @@ -306,17 +322,13 @@ def _load_crawler(self, params, i, status, children): if params.get("do_restart"): print(f"Restart {name}") - if params.get("qa_source"): + if params.get("qa_source_crawl_id"): params["priorityClassName"] = f"qa-crawl-instance-{i}" else: params["priorityClassName"] = f"crawl-instance-{i}" return self.load_from_yaml("crawler.yaml", params) - async def _get_crawl_replay_json(self, crawl_id): - crawl = await self.crawl_ops.get_crawl_out(crawl_id) - return json.dumps(crawl.dict()) - # pylint: disable=too-many-arguments async def _resolve_scale( self, @@ -393,7 +405,8 @@ async def set_state( status: CrawlStatus, crawl: CrawlSpec, allowed_from: list[str], - **kwargs, + finished: Optional[datetime] = None, + stats: Optional[dict[str, Any]] = None, ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, @@ -425,7 +438,8 @@ async def set_state( crawl.qa_source_crawl_id, state=state, allowed_from=allowed_from, - **kwargs, + finished=finished, + stats=stats, ) if res: print(f"Setting state: {status.state} -> {state}, {crawl.id}") @@ -557,7 +571,7 @@ async def fail_crawl( crawl: CrawlSpec, status: CrawlStatus, pods: dict, - stats=None, + stats: Optional[dict[str, Any]] = None, ) -> bool: """Mark crawl as failed, log crawl state and print crawl logs, if possible""" prev_state = status.state @@ -1211,16 +1225,12 @@ async def mark_finished( crawl: CrawlSpec, status: CrawlStatus, state: str, - stats=None, + stats: Optional[dict[str, Any]] = None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" finished = dt_now() - 
kwargs = {"finished": finished} - if stats: - kwargs["stats"] = stats - if state in SUCCESSFUL_STATES: allowed_from = RUNNING_STATES else: @@ -1228,7 +1238,12 @@ async def mark_finished( # if set_state returns false, already set to same status, return if not await self.set_state( - state, status, crawl, allowed_from=allowed_from, **kwargs + state, + status, + crawl, + allowed_from=allowed_from, + finished=finished, + stats=stats, ): print("already finished, ignoring mark_finished") if not status.finished: diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 4161da1d53..ebb14b8ccb 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -481,9 +481,11 @@ async def get_presigned_url( and s3storage.access_endpoint_url and s3storage.access_endpoint_url != s3storage.endpoint_url ): + print("PRE", presigned_url) presigned_url = presigned_url.replace( s3storage.endpoint_url, s3storage.access_endpoint_url ) + print("POST", presigned_url) return presigned_url diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 7b55e17b5b..4684e06047 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -1,7 +1,7 @@ apiVersion: btrix.cloud/v1 kind: CrawlJob metadata: - name: {{ "qa-" if qa_source else "" }}crawljob-{{ id }} + name: crawljob-{{ id }} labels: crawl: "{{ id }}" role: {{ "qa-job" if qa_source else "job" }} diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index 272da96a90..e9ea1834d0 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -53,7 +53,11 @@ spec: volumes: - name: crawl-config configMap: + {% if not qa_source_crawl_id %} name: crawl-config-{{ cid }} + {% else %} + name: qa-replay-{{ qa_source_crawl_id }} + {% endif %} - name: crawl-data persistentVolumeClaim: claimName: {{ name }} @@ -101,7 +105,7 @@ spec: image: {{ crawler_image }} imagePullPolicy: {{ crawler_image_pull_policy }} command: - {% if not qa_source %} + {% if not qa_source_crawl_id %} - crawl - --config - /tmp/crawl-config.json @@ -115,18 +119,16 @@ spec: {% else %} - qa - --qaSource - - /tmp/ + - /tmp/crawl-config.json - --redisStoreUrl - {{ redis_url }} - --writePagesToRedis {% endif %} volumeMounts: - {% if not qa_source %} - name: crawl-config mountPath: /tmp/crawl-config.json subPath: crawl-config.json readOnly: True - {% endif %} - name: crawl-data mountPath: /crawls diff --git a/chart/app-templates/qa_configmap.yaml b/chart/app-templates/qa_configmap.yaml new file mode 100644 index 0000000000..9fd9e4051b --- /dev/null +++ b/chart/app-templates/qa_configmap.yaml @@ -0,0 +1,14 @@ +# ------- +# CONFIGMAP +# ------- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ name }} + namespace: {{ namespace }} + labels: + crawl: {{ id }} + role: crawler + +data: + crawl-config.json: {{ qa_source_replay_json | tojson }} diff --git a/chart/templates/operators.yaml b/chart/templates/operators.yaml index 160e301bde..b7126edb4c 100644 --- a/chart/templates/operators.yaml +++ b/chart/templates/operators.yaml @@ -20,6 +20,11 @@ spec: updateStrategy: method: InPlace + - apiVersion: v1 + resource: configmaps + updateStrategy: + method: OnDelete + hooks: sync: webhook: diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 42427df881..97d80ada81 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -16,7 +16,7 @@ description: "Priority for crawl instance #{{ . 
}}" apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: - name: crawl-qa-instance-{{ . }} + name: qa-crawl-instance-{{ . }} value: -{{ add 100 . }} globalDefault: false description: "Priority for QA crawl instance #{{ . }}" From 5d662796ec4a727ae9117ff9704fe5fc24ff3dec Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 12:01:42 -0700 Subject: [PATCH 16/49] simplify qa model: - active qa data added to 'qa' - finished qa later added to 'qa_finished' when crawl finished - active qa cleared on operator finalize - default text extract to 'to-warc' for qa support --- backend/btrixcloud/crawls.py | 149 ++++++++++++++---------- backend/btrixcloud/models.py | 8 +- backend/btrixcloud/operator/crawls.py | 86 ++++++++------ backend/btrixcloud/operator/cronjobs.py | 4 +- backend/btrixcloud/operator/models.py | 10 ++ backend/btrixcloud/pages.py | 40 ++++--- chart/values.yaml | 4 +- 7 files changed, 181 insertions(+), 120 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index e661ed0a03..0dee17c1c0 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -26,6 +26,7 @@ CrawlConfig, UpdateCrawlConfig, CrawlScale, + CrawlFile, Crawl, CrawlOut, CrawlOutWithResources, @@ -35,6 +36,7 @@ User, PaginatedResponse, RUNNING_AND_STARTING_STATES, + SUCCESSFUL_STATES, ALL_CRAWL_STATES, ) @@ -436,17 +438,15 @@ async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add): async def update_crawl_state_if_allowed( self, - crawl_or_qa_run_id: str, - qa_source_crawl_id: str, + crawl_id: str, + is_qa: bool, state: str, allowed_from: List[str], finished: Optional[datetime] = None, stats: Optional[Dict[str, Any]] = None, ): """update crawl state and other properties in db if state has changed""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - - prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." + prefix = "" if not is_qa else "qa." update: Dict[str, Any] = {f"{prefix}state": state} if finished: @@ -460,25 +460,23 @@ async def update_crawl_state_if_allowed( return await self.crawls.find_one_and_update(query, {"$set": update}) - async def update_running_crawl_stats( - self, crawl_or_qa_run_id: str, qa_source_crawl_id: str, stats - ): + async def update_running_crawl_stats(self, crawl_id: str, is_qa: bool, stats): """update running crawl stats""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - field = "stats" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}.stats" + prefix = "" if not is_qa else "qa." query = {"_id": crawl_id, "type": "crawl", "state": "running"} - return await self.crawls.find_one_and_update(query, {"$set": {field: stats}}) + return await self.crawls.find_one_and_update( + query, {"$set": {f"{prefix}stats": stats}} + ) async def inc_crawl_exec_time( self, - crawl_or_qa_run_id: str, - qa_source_crawl_id: str, + crawl_id: str, + is_qa: bool, exec_time, last_updated_time, ): """increment exec time""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." + prefix = "" if not is_qa else "qa." 
return await self.crawls.find_one_and_update( { @@ -492,50 +490,36 @@ async def inc_crawl_exec_time( }, ) - async def get_crawl_state( - self, crawl_or_qa_run_id: str, qa_source_crawl_id: Optional[str] = None - ): + async def get_crawl_state(self, crawl_id: str, is_qa: bool): """return current crawl state of a crawl""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - state = ( - "$state" if not qa_source_crawl_id else f"$qa.{qa_source_crawl_id}.state" - ) - finished = ( - "$finished" - if not qa_source_crawl_id - else f"$qa.{qa_source_crawl_id}.finished" - ) + prefix = "" if not is_qa else "qa." res = await self.crawls.find_one( - {"_id": crawl_id}, projection={"state": state, "finished": finished} + {"_id": crawl_id}, + projection={"state": f"${prefix}state", "finished": f"${prefix}finished"}, ) if not res: return None, None - print("get_crawl_state", res) return res.get("state"), res.get("finished") async def add_crawl_error( self, - crawl_or_qa_run_id: str, - qa_source_crawl_id: str, + crawl_id: str, + is_qa: bool, error: str, ): """add crawl error from redis to mongodb errors field""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - field = ( - "errors" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}.errors" - ) + prefix = "" if not is_qa else "qa." await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$push": {field: error}} + {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}} ) async def add_crawl_file( - self, crawl_or_qa_run_id, qa_source_crawl_id, crawl_file, size + self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int ): """add new crawl file to crawl""" - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id - prefix = "" if not qa_source_crawl_id else f"qa.{crawl_or_qa_run_id}." + prefix = "" if not is_qa else "qa." 
await self.crawls.find_one_and_update( {"_id": crawl_id}, @@ -684,8 +668,8 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) crawl = await self.get_crawl(crawl_id, org) - # if crawl.qa_active: - # raise HTTPException(status_code=400, detail="qa_already_running") + if crawl.qa: + raise HTTPException(status_code=400, detail="qa_already_running") if not crawl.cid: raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") @@ -717,8 +701,7 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) {"_id": crawl_id}, { "$set": { - f"qa.{qa_run_id}": qa_run.dict(), - "qa_active": qa_run_id, + "qa": qa_run.dict(), } }, ) @@ -730,27 +713,19 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}") async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): - """Stop crawl QA run""" + """Stop crawl QA run, QA run removed when actually finished""" crawl = await self.get_crawl(crawl_id, org) - if not crawl.qa_active: - raise HTTPException(status_code=400, detail="invalid_qa_run_id") + if not crawl.qa: + raise HTTPException(status_code=400, detail="qa_not_running") try: - result = await self.crawl_manager.shutdown_crawl( - crawl.qa_active, graceful=True - ) + result = await self.crawl_manager.shutdown_crawl(crawl.qa.id, graceful=True) if result.get("error") == "Not Found": # treat as success, qa crawl no longer exists, so mark as no qa result = {"success": True} - if result.get("success"): - await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl", "oid": org.id}, - {"$set": {"active_qa": None}}, - ) - return result except Exception as exc: @@ -760,20 +735,59 @@ async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): status_code=404, detail=f"crawl_not_found, (details: {exc})" ) + async def delete_crawl_qa_run(self, crawl_id: str, qa_run_id: str): + """delete specified finished QA run""" + + res = await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, + {"$unset": {f"qa_finished.{qa_run_id}": ""}}, + ) + + await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) + + return res + + async def qa_run_clear(self, crawl_id: str): + """remove any active qa run, if set""" + await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, {"$set": {"qa": None}} + ) + + async def qa_run_finished(self, crawl_id: str): + """add qa run to finished list, if successful""" + crawl = await self.get_crawl(crawl_id) + + if not crawl.qa: + return False + + if crawl.qa.finished and crawl.qa.state in SUCCESSFUL_STATES: + query = {f"qa_finished.{crawl.qa.id}": crawl.qa.dict()} + + if await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, {"$set": query} + ): + return True + + return False + async def get_qa_runs( self, crawl_id: str, org: Optional[Organization] = None ) -> List[QARun]: """Return list of QA runs""" crawl = await self.get_crawl(crawl_id, org) - return list(crawl.qa.values()) if crawl.qa else [] + all_qa = list(crawl.qa_finished.values()) if crawl.qa_finished else [] + all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True) + # if crawl.qa and (len(all_qa) == 0 or crawl.qa.id != all_qa[0].id): + # all_qa.insert(0, crawl.qa) + return all_qa async def get_qa_run_for_replay( self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None ) -> QARun: """Fetch QA runs with resources for replay.json""" crawl = await 
self.get_crawl(crawl_id, org) - qa = crawl.qa or {} - qa_run = qa.get(qa_run_id) + qa_finished = crawl.qa_finished or {} + qa_run = qa_finished.get(qa_run_id) if not qa_run: raise HTTPException(status_code=404, detail="crawl_qa_not_found") @@ -975,7 +989,7 @@ async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): @app.get( "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", - tags=["crawls"], + tags=["qa"], response_model=QARunWithResources, ) async def get_qa_crawl_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)): @@ -986,7 +1000,7 @@ async def get_qa_crawl_admin(crawl_id, qa_run_id, user: User = Depends(user_dep) @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", - tags=["crawls"], + tags=["qa"], response_model=QARunWithResources, ) async def get_qa_crawl( @@ -994,7 +1008,7 @@ async def get_qa_crawl( ): return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org) - @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["crawls", "qa"]) + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"]) async def start_crawl_qa_run( crawl_id: str, org: Organization = Depends(org_crawl_dep), @@ -1003,21 +1017,30 @@ async def start_crawl_qa_run( qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user) return {"started": qa_run_id} - @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["crawls", "qa"]) + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["qa"]) async def stop_crawl_qa_run( crawl_id: str, org: Organization = Depends(org_crawl_dep) ): # pylint: disable=unused-argument return await ops.stop_crawl_qa_run(crawl_id, org) + @app.delete("/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}", tags=["qa"]) + async def delete_crawl_qa_run( + crawl_id: str, qa_run_id: str, org: Organization = Depends(org_crawl_dep) + ): + # pylint: disable=unused-argument + return await ops.delete_crawl_qa_run(crawl_id, qa_run_id) + @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa", - tags=["crawls"], + tags=["qa"], response_model=List[QARun], ) async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_qa_runs(crawl_id, org) + # ---- + @app.get( "/orgs/all/crawls/{crawl_id}", tags=["crawls"], diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index a7b32d6c7f..d5f791eb02 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -710,8 +710,8 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] - qa_active: Optional[str] - qa: Optional[Dict[str, QARun]] = {} + qa: Optional[QARun] = None + qa_finished: Optional[Dict[str, QARun]] = {} # ============================================================================ @@ -1456,13 +1456,15 @@ class PageQACompare(BaseModel): screenshotMatch: Optional[int] = None textMatch: Optional[int] = None - resourceCounts: Optional[Dict[str, Dict[str, int]]] + resourceCounts: Optional[Dict[str, int]] # ============================================================================ class Page(BaseMongoModel): """Model for crawl pages""" + id: UUID + oid: UUID crawl_id: str url: AnyHttpUrl diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index ce6d4bd281..90b37a333c 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -7,7 +7,6 @@ from datetime import datetime import json -from uuid import UUID import humanize @@ -233,7 +232,7 @@ async def sync_crawls(self, data: MCSyncData): storage_path = 
crawl.storage.get_storage_extra_path(oid) storage_secret = crawl.storage.get_storage_secret_name(oid) - if not crawl.qa_source_crawl_id: + if not crawl.is_qa: params["profile_filename"] = configmap["PROFILE_FILENAME"] else: storage_path += "qa/" @@ -434,8 +433,8 @@ async def set_state( """ if not allowed_from or status.state in allowed_from: res = await self.crawl_ops.update_crawl_state_if_allowed( - crawl.id, - crawl.qa_source_crawl_id, + crawl.db_crawl_id, + crawl.is_qa, state=state, allowed_from=allowed_from, finished=finished, @@ -448,7 +447,7 @@ async def set_state( # get actual crawl state actual_state, finished = await self.crawl_ops.get_crawl_state( - crawl.id, crawl.qa_source_crawl_id + crawl.db_crawl_id, crawl.is_qa ) if actual_state: status.state = actual_state @@ -637,6 +636,9 @@ async def finalize_response( else: finalized = True + if finalized and crawl.is_qa: + await self.crawl_ops.qa_run_clear(crawl.db_crawl_id) + return { "status": status.dict(exclude_none=True), "children": new_children, @@ -742,17 +744,19 @@ async def sync_crawl_state( file_done = await redis.lpop(self.done_key) page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") + qa_run_id = crawl.id if crawl.is_qa else None + while page_crawled: page_dict = json.loads(page_crawled) await self.page_ops.add_page_to_db( - page_dict, crawl.id, crawl.qa_source_crawl_id, crawl.oid + page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid ) page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") while crawl_error: await self.crawl_ops.add_crawl_error( - crawl.id, crawl.qa_source_crawl_id, crawl_error + crawl.db_crawl_id, crawl.is_qa, crawl_error ) crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") @@ -938,7 +942,7 @@ async def increment_pod_exec_time( if exec_time: if not await self.crawl_ops.inc_crawl_exec_time( - crawl.id, crawl.qa_source_crawl_id, exec_time, status.lastUpdatedTime + crawl.db_crawl_id, crawl.is_qa, exec_time, status.lastUpdatedTime ): # if lastUpdatedTime is same as previous, something is wrong, don't update! 
print( @@ -1057,11 +1061,11 @@ async def add_file_to_crawl(self, cc_data, crawl, redis): await redis.incr("filesAddedSize", filecomplete.size) await self.crawl_ops.add_crawl_file( - crawl.id, crawl.qa_source_crawl_id, crawl_file, filecomplete.size + crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size ) # no replicas for QA for now - if crawl.qa_source_crawl_id: + if crawl.is_qa: return True try: @@ -1144,7 +1148,7 @@ async def update_crawl_state( status.sizeHuman = humanize.naturalsize(status.size) await self.crawl_ops.update_running_crawl_stats( - crawl.id, crawl.qa_source_crawl_id, stats + crawl.db_crawl_id, crawl.is_qa, stats ) for key, value in sizes.items(): @@ -1254,50 +1258,64 @@ async def mark_finished( status.finished = to_k8s_date(finished) # Regular Crawl Finished - if not crawl.qa_source_crawl_id: - if crawl and state in SUCCESSFUL_STATES: - await self.inc_crawl_complete_stats(crawl, finished) - - self.run_task( - self.do_crawl_finished_tasks( - crawl.id, crawl.cid, crawl.oid, status.filesAddedSize, state - ) - ) + if not crawl.is_qa: + self.run_task(self.do_crawl_finished_tasks(crawl, status, finished, state)) # QA Run Finished else: - await self.k8s.delete_crawl_job(crawl.id) + self.run_task(self.do_qa_run_finished_tasks(crawl, state)) return True # pylint: disable=too-many-arguments async def do_crawl_finished_tasks( self, - crawl_id: str, - cid: UUID, - oid: UUID, - files_added_size: int, + crawl: CrawlSpec, + status: CrawlStatus, + finished: datetime, state: str, ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" - await self.crawl_config_ops.stats_recompute_last(cid, files_added_size, 1) + await self.crawl_config_ops.stats_recompute_last( + crawl.cid, status.filesAddedSize, 1 + ) - if state in SUCCESSFUL_STATES and oid: - await self.org_ops.inc_org_bytes_stored(oid, files_added_size, "crawl") - await self.coll_ops.add_successful_crawl_to_collections(crawl_id, cid) + if state in SUCCESSFUL_STATES and crawl.oid: + await self.inc_crawl_complete_stats(crawl, finished) + await self.org_ops.inc_org_bytes_stored( + crawl.oid, status.filesAddedSize, "crawl" + ) + await self.coll_ops.add_successful_crawl_to_collections(crawl.id, crawl.cid) if state in FAILED_STATES: - await self.crawl_ops.delete_crawl_files(crawl_id, oid) - await self.page_ops.delete_crawl_pages(crawl_id, oid) + await self.crawl_ops.delete_crawl_files(crawl.id, crawl.oid) + await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid) await self.event_webhook_ops.create_crawl_finished_notification( - crawl_id, oid, state + crawl.id, crawl.oid, state ) # finally, delete job - await self.k8s.delete_crawl_job(crawl_id) + await self.k8s.delete_crawl_job(crawl.id) + + # pylint: disable=too-many-arguments + async def do_qa_run_finished_tasks( + self, + crawl: CrawlSpec, + state: str, + ) -> None: + """Run tasks after qa run completes in asyncio.task coroutine.""" + + if state in SUCCESSFUL_STATES: + await self.crawl_ops.qa_run_finished(crawl.db_crawl_id) + + if state in FAILED_STATES: + await self.page_ops.delete_qa_run_from_pages(crawl.db_crawl_id, crawl.id) + + # finally, delete job + await self.k8s.delete_crawl_job(crawl.id) - async def inc_crawl_complete_stats(self, crawl, finished): + async def inc_crawl_complete_stats(self, crawl: CrawlSpec, finished: datetime): """Increment Crawl Stats""" started = from_k8s_date(crawl.started) diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 720fe9dd71..445e86fbca 100644 --- 
a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -48,7 +48,9 @@ async def sync_cronjob_crawl(self, data: MCDecoratorSyncData): name = metadata.get("name") crawl_id = name - actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id) + actual_state, finished = await self.crawl_ops.get_crawl_state( + crawl_id, is_qa=False + ) if finished: status = None # mark job as completed diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 31d488bcf1..acc28b9ffd 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -62,6 +62,16 @@ class CrawlSpec(BaseModel): max_crawl_size: int = 0 qa_source_crawl_id: str = "" + @property + def db_crawl_id(self) -> str: + """return actual crawl_id for db, if qa run""" + return self.qa_source_crawl_id or self.id + + @property + def is_qa(self) -> bool: + """return true if qa run""" + return bool(self.qa_source_crawl_id) + # ============================================================================ class PodResourcePercentage(BaseModel): diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index cc8d936c1c..1243de9580 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -4,8 +4,6 @@ from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union from uuid import UUID, uuid4 -from pprint import pprint - from fastapi import Depends, HTTPException import pymongo @@ -59,7 +57,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): if not page_dict.get("url"): continue - await self.add_page_to_db(page_dict, crawl_id, "", crawl.oid) + await self.add_page_to_db(page_dict, crawl_id, None, crawl.oid) # pylint: disable=broad-exception-caught, raise-missing-from except Exception as err: print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True) @@ -67,27 +65,24 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): async def add_page_to_db( self, page_dict: Dict[str, Any], - crawl_or_qa_run_id: str, - qa_source_crawl_id: str, + crawl_id: str, + qa_run_id: Optional[str], oid: UUID, ): """Add page to database""" - page_id = page_dict.get("id") + page_id: Optional[str] = page_dict.get("id") if not page_id: print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) - page_id = uuid4() - - crawl_id = qa_source_crawl_id or crawl_or_qa_run_id print("Adding Page Data") - pprint(page_dict) + page = None try: status = page_dict.get("status") if not status and page_dict.get("loadState"): status = 200 page = Page( - id=page_id, + id=page_id or uuid4(), oid=oid, crawl_id=crawl_id, url=page_dict.get("url"), @@ -106,7 +101,8 @@ async def add_page_to_db( ) ) except pymongo.errors.DuplicateKeyError: - return + pass + # pylint: disable=broad-except except Exception as err: print( @@ -116,14 +112,14 @@ async def add_page_to_db( return # qa data - if qa_source_crawl_id: - compare_dict = page_dict.get("compare") + if qa_run_id and page: + compare_dict = page_dict.get("comparison") if compare_dict is None: print("QA Run, but compare data missing!") return compare = PageQACompare(**compare_dict) - await self.add_qa_compare(page_id, oid, crawl_or_qa_run_id, compare) + await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare) async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): """Delete crawl pages from db""" @@ -165,7 +161,7 @@ async def get_page( page_raw = await self.get_page_raw(page_id, oid, crawl_id) return 
Page.from_dict(page_raw) - async def add_qa_compare( + async def add_qa_run_for_page( self, page_id: UUID, oid: UUID, qa_run_id: str, compare: PageQACompare ) -> bool: """Update page heuristics and mime/type from QA run""" @@ -174,15 +170,25 @@ async def add_qa_compare( result = await self.pages.find_one_and_update( {"_id": page_id, "oid": oid}, - {"$set": {f"qa.{qa_run_id}", compare.dict()}}, + {"$set": {f"qa.{qa_run_id}": compare.dict()}}, return_document=pymongo.ReturnDocument.AFTER, ) if not result: + print("*** page_id:", page_id, "oid:", oid, flush=True) raise HTTPException(status_code=404, detail="page_not_found") return True + async def delete_qa_run_from_pages(self, crawl_id: str, qa_run_id: str): + """delete pages""" + print("delete qa run") + result = await self.pages.update_many( + {"crawl_id": crawl_id}, {"$unset": {f"qa.{qa_run_id}": ""}} + ) + print(result) + return result + async def update_page_approval( self, page_id: UUID, diff --git a/chart/values.yaml b/chart/values.yaml index 92b5a9438d..aa0301582e 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -17,8 +17,8 @@ disk_utilization_threshold: 90 # crawler logging flags crawler_logging_opts: "stats,behaviors,debug" -# to enable, set to a value other than 'false' -crawler_extract_full_text: false +# to enable, set to one or more comma separate values: to-warc,to-pages,final-to-warc +crawler_extract_full_text: to-warc # max pages per crawl # set to non-zero value to enforce global max pages per crawl limit From 59e1d70d3dbdc5703e12e86de72cdb8369d28d71 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 13:18:55 -0700 Subject: [PATCH 17/49] rename qa_finished -> qaFinished fix qa replay additional type fixes, look up Org if missing, needed for resolving presigned URLs --- backend/btrixcloud/basecrawls.py | 22 +++++--- backend/btrixcloud/colls.py | 3 +- backend/btrixcloud/crawls.py | 78 +++++++++++++++------------ backend/btrixcloud/models.py | 32 +++++++++-- backend/btrixcloud/operator/crawls.py | 21 ++++---- backend/btrixcloud/orgs.py | 3 +- backend/btrixcloud/pages.py | 6 +-- backend/btrixcloud/storages.py | 2 - 8 files changed, 100 insertions(+), 67 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index a53fd7c66b..f432622d86 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -120,7 +120,11 @@ async def get_crawl_raw( return res async def _files_to_resources( - self, files, org, crawlid, qa_run_id: Optional[str] = None + self, + files: List[Dict], + org: Organization, + crawlid: str, + qa_run_id: Optional[str] = None, ) -> List[CrawlFileOut]: if not files: return [] @@ -159,21 +163,17 @@ async def get_crawl_out( res = await self.get_crawl_raw(crawlid, org, type_) if not skip_resources: - res["resources"] = await self._files_to_resources( - res.get("files"), org, crawlid - ) - coll_ids = res.get("collectionIds") if coll_ids: res["collections"] = await self.colls.get_collection_names(coll_ids) - res.pop("files", None) + files = res.pop("files", None) res.pop("errors", None) crawl = CrawlOutWithResources.from_dict(res) - if crawl.type == "crawl": - crawl = await self._resolve_crawl_refs(crawl, org) + if crawl.type == "crawl" and not skip_resources: + crawl = await self._resolve_crawl_refs(crawl, org, files) if crawl.config and crawl.config.seeds: crawl.config.seeds = None @@ -383,6 +383,12 @@ async def _resolve_crawl_refs( config = await self.crawl_configs.get_crawl_config( crawl.cid, org.id if org else None, 
active_only=False ) + + if not org: + org = await self.orgs.get_org_by_id(crawl.oid) + if not org: + raise HTTPException(status_code=400, detail="missing_org") + if config and config.config.seeds: if add_first_seed: first_seed = config.config.seeds[0] diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 497a9cc09a..1152fec4eb 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -337,8 +337,7 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): files = crawl.get("files", []) for file in files: total_size += file.get("size", 0) - if crawl.get("stats"): - page_count += crawl.get("stats", {}).get("done", 0) + page_count += crawl.stats.done if crawl.get("tags"): tags.extend(crawl.get("tags")) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 0dee17c1c0..b7ffa3cf19 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -26,11 +26,13 @@ CrawlConfig, UpdateCrawlConfig, CrawlScale, + CrawlStats, CrawlFile, Crawl, CrawlOut, CrawlOutWithResources, QARun, + QARunOut, QARunWithResources, Organization, User, @@ -299,7 +301,6 @@ async def add_new_crawl( return dt_now except pymongo.errors.DuplicateKeyError: - # print(f"Crawl Already Added: {crawl.id} - {crawl.state}") return None async def update_crawl_scale( @@ -443,7 +444,7 @@ async def update_crawl_state_if_allowed( state: str, allowed_from: List[str], finished: Optional[datetime] = None, - stats: Optional[Dict[str, Any]] = None, + stats: Optional[CrawlStats] = None, ): """update crawl state and other properties in db if state has changed""" prefix = "" if not is_qa else "qa." @@ -452,7 +453,7 @@ async def update_crawl_state_if_allowed( if finished: update[f"{prefix}finished"] = finished if stats: - update[f"{prefix}stats"] = stats + update[f"{prefix}stats"] = stats.dict() query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"} if allowed_from: @@ -460,12 +461,14 @@ async def update_crawl_state_if_allowed( return await self.crawls.find_one_and_update(query, {"$set": update}) - async def update_running_crawl_stats(self, crawl_id: str, is_qa: bool, stats): + async def update_running_crawl_stats( + self, crawl_id: str, is_qa: bool, stats: CrawlStats + ): """update running crawl stats""" prefix = "" if not is_qa else "qa." 
query = {"_id": crawl_id, "type": "crawl", "state": "running"} return await self.crawls.find_one_and_update( - query, {"$set": {f"{prefix}stats": stats}} + query, {"$set": {f"{prefix}stats": stats.dict()}} ) async def inc_crawl_exec_time( @@ -593,25 +596,13 @@ async def get_crawl_stats( if duration_seconds: data["duration"] = duration_seconds - done_stats = None - if crawl.get("stats") and crawl.get("stats").get("done"): - done_stats = crawl["stats"]["done"] - - data["pages"] = 0 - if done_stats: - data["pages"] = done_stats + data["pages"] = crawl.stats.done data["filesize"] = crawl.get("fileSize", 0) data["avg_page_time"] = 0 - if ( - done_stats - and done_stats != 0 - and started - and finished - and duration_seconds - ): - data["avg_page_time"] = int(duration_seconds / done_stats) + if crawl.stats.done != 0 and started and finished and duration_seconds: + data["avg_page_time"] = int(duration_seconds / crawl.stats.done) crawls_data.append(data) @@ -668,10 +659,20 @@ async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User) crawl = await self.get_crawl(crawl_id, org) + # can only QA finished crawls + if not crawl.finished: + raise HTTPException(status_code=400, detail="crawl_not_finished") + + # can only QA successfully finished crawls + if crawl.state not in SUCCESSFUL_STATES: + raise HTTPException(status_code=400, detail="crawl_did_not_succeed") + + # can only run one QA at a time if crawl.qa: raise HTTPException(status_code=400, detail="qa_already_running") - if not crawl.cid: + # not a valid crawl + if not crawl.cid or crawl.type != "crawl": raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org) @@ -740,7 +741,7 @@ async def delete_crawl_qa_run(self, crawl_id: str, qa_run_id: str): res = await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl"}, - {"$unset": {f"qa_finished.{qa_run_id}": ""}}, + {"$unset": {f"qaFinished.{qa_run_id}": ""}}, ) await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) @@ -761,7 +762,7 @@ async def qa_run_finished(self, crawl_id: str): return False if crawl.qa.finished and crawl.qa.state in SUCCESSFUL_STATES: - query = {f"qa_finished.{crawl.qa.id}": crawl.qa.dict()} + query = {f"qaFinished.{crawl.qa.id}": crawl.qa.dict()} if await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl"}, {"$set": query} @@ -772,29 +773,38 @@ async def qa_run_finished(self, crawl_id: str): async def get_qa_runs( self, crawl_id: str, org: Optional[Organization] = None - ) -> List[QARun]: + ) -> List[QARunOut]: """Return list of QA runs""" - crawl = await self.get_crawl(crawl_id, org) - all_qa = list(crawl.qa_finished.values()) if crawl.qa_finished else [] - all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True) + crawl_data = await self.get_crawl_raw( + crawl_id, org, "crawl", project={"qaFinished": True} + ) + qa_finished = crawl_data.get("qaFinished") or {} + all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()] + all_qa.sort(key=lambda x: x.finished, reverse=True) # if crawl.qa and (len(all_qa) == 0 or crawl.qa.id != all_qa[0].id): # all_qa.insert(0, crawl.qa) return all_qa async def get_qa_run_for_replay( self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None - ) -> QARun: + ) -> QARunWithResources: """Fetch QA runs with resources for replay.json""" crawl = await self.get_crawl(crawl_id, org) - qa_finished = crawl.qa_finished or {} + qa_finished = crawl.qaFinished or {} qa_run = 
qa_finished.get(qa_run_id) if not qa_run: raise HTTPException(status_code=404, detail="crawl_qa_not_found") - resources = await self._files_to_resources( - qa_run.files, org, crawlid=crawl_id, qa_run_id=qa_run_id + if not org: + org = await self.orgs.get_org_by_id(crawl.oid) + if not org: + raise HTTPException(status_code=400, detail="missing_org") + + resources = await self._resolve_signed_urls( + qa_run.files, org, crawl.id, qa_run_id ) + qa_run.files = [] return QARunWithResources(resources=resources, **qa_run.dict()) @@ -992,7 +1002,7 @@ async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): tags=["qa"], response_model=QARunWithResources, ) - async def get_qa_crawl_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)): + async def get_qa_run_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)): if not user.is_superuser: raise HTTPException(status_code=403, detail="Not Allowed") @@ -1003,7 +1013,7 @@ async def get_qa_crawl_admin(crawl_id, qa_run_id, user: User = Depends(user_dep) tags=["qa"], response_model=QARunWithResources, ) - async def get_qa_crawl( + async def get_qa_run( crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep) ): return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org) @@ -1034,7 +1044,7 @@ async def delete_crawl_qa_run( @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa", tags=["qa"], - response_model=List[QARun], + response_model=List[QARunOut], ) async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_qa_runs(crawl_id, org) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index d5f791eb02..b5d52ce94c 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -531,6 +531,15 @@ class ReviewStatus(str, Enum): FAILURE = "failure" +# ============================================================================ +class CrawlStats(BaseModel): + """Crawl Stats for pages and size""" + + found: int = 0 + done: int = 0 + size: int = 0 + + # ============================================================================ class CoreCrawlable(BaseModel): # pylint: disable=too-few-public-methods @@ -550,9 +559,7 @@ class CoreCrawlable(BaseModel): image: Optional[str] - stopping: Optional[bool] = False - - stats: Optional[Dict[str, int]] = None + stats: CrawlStats = CrawlStats() files: List[CrawlFile] = [] @@ -612,7 +619,7 @@ class CrawlOut(BaseMongoModel): state: str - stats: Optional[Dict[str, int]] + stats: Optional[CrawlStats] fileSize: int = 0 fileCount: int = 0 @@ -695,6 +702,20 @@ class QARunWithResources(QARun): resources: Optional[List[CrawlFileOut]] = [] +# ============================================================================ +class QARunOut(BaseModel): + """QA Run Output""" + + id: str + + userName: Optional[str] + + started: datetime + finished: datetime + + stats: CrawlStats = CrawlStats() + + # ============================================================================ class Crawl(BaseCrawl, CrawlConfigCore): """Store State of a Crawl (Finished or Running)""" @@ -709,9 +730,10 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] + stopping: Optional[bool] = False qa: Optional[QARun] = None - qa_finished: Optional[Dict[str, QARun]] = {} + qaFinished: Optional[Dict[str, QARun]] = {} # ============================================================================ diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 90b37a333c..9a24bebb1f 100644 --- 
a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -20,6 +20,7 @@ RUNNING_AND_STARTING_STATES, SUCCESSFUL_STATES, FAILED_STATES, + CrawlStats, CrawlFile, CrawlCompleteIn, StorageRef, @@ -405,7 +406,7 @@ async def set_state( crawl: CrawlSpec, allowed_from: list[str], finished: Optional[datetime] = None, - stats: Optional[dict[str, Any]] = None, + stats: Optional[CrawlStats] = None, ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, @@ -570,7 +571,7 @@ async def fail_crawl( crawl: CrawlSpec, status: CrawlStatus, pods: dict, - stats: Optional[dict[str, Any]] = None, + stats: Optional[CrawlStats] = None, ) -> bool: """Mark crawl as failed, log crawl state and print crawl logs, if possible""" prev_state = status.state @@ -1110,7 +1111,9 @@ async def is_crawl_stopping( return None - async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str): + async def get_redis_crawl_stats( + self, redis: Redis, crawl_id: str + ) -> tuple[CrawlStats, dict[str, Any]]: """get page stats""" try: # crawler >0.9.0, done key is a value @@ -1123,7 +1126,7 @@ async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str): sizes = await redis.hgetall(f"{crawl_id}:size") archive_size = sum(int(x) for x in sizes.values()) - stats = {"found": pages_found, "done": pages_done, "size": archive_size} + stats = CrawlStats(found=pages_found, done=pages_done, size=archive_size) return stats, sizes async def update_crawl_state( @@ -1139,12 +1142,12 @@ async def update_crawl_state( stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) # need to add size of previously completed WACZ files as well! - stats["size"] += status.filesAddedSize + stats.size += status.filesAddedSize # update status - status.pagesDone = stats["done"] - status.pagesFound = stats["found"] - status.size = stats["size"] + status.pagesDone = stats.done + status.pagesFound = stats.found + status.size = stats.size status.sizeHuman = humanize.naturalsize(status.size) await self.crawl_ops.update_running_crawl_stats( @@ -1229,7 +1232,7 @@ async def mark_finished( crawl: CrawlSpec, status: CrawlStatus, state: str, - stats: Optional[dict[str, Any]] = None, + stats: Optional[CrawlStats] = None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index b5534aa5d6..7da0424983 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -617,8 +617,7 @@ async def get_org_metrics(self, org: Organization): crawl_count += 1 if type_ == "upload": upload_count += 1 - if item.get("stats"): - page_count += item.get("stats", {}).get("done", 0) + page_count += item.stats.done profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 1243de9580..3a2f39ce9a 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -49,7 +49,7 @@ def __init__(self, mdb, crawl_ops, org_ops, storage_ops): async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): """Add pages to database from WACZ files""" try: - crawl = await self.crawl_ops.get_crawl_out(crawl_id, None) + crawl = await self.crawl_ops.get_crawl(crawl_id) org = await self.org_ops.get_org_by_id(crawl.oid) wacz_files = await self.crawl_ops.get_wacz_files(crawl_id, org) stream = await 
self.storage_ops.sync_stream_pages_from_wacz(org, wacz_files) @@ -74,7 +74,6 @@ async def add_page_to_db( if not page_id: print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True) - print("Adding Page Data") page = None try: @@ -175,18 +174,15 @@ async def add_qa_run_for_page( ) if not result: - print("*** page_id:", page_id, "oid:", oid, flush=True) raise HTTPException(status_code=404, detail="page_not_found") return True async def delete_qa_run_from_pages(self, crawl_id: str, qa_run_id: str): """delete pages""" - print("delete qa run") result = await self.pages.update_many( {"crawl_id": crawl_id}, {"$unset": {f"qa.{qa_run_id}": ""}} ) - print(result) return result async def update_page_approval( diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index ebb14b8ccb..4161da1d53 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -481,11 +481,9 @@ async def get_presigned_url( and s3storage.access_endpoint_url and s3storage.access_endpoint_url != s3storage.endpoint_url ): - print("PRE", presigned_url) presigned_url = presigned_url.replace( s3storage.endpoint_url, s3storage.access_endpoint_url ) - print("POST", presigned_url) return presigned_url From caf82f1aa0fa88e3eed05ee1d38710fd22f89998 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 14:12:46 -0700 Subject: [PATCH 18/49] various type fixes to fix tests: - ensure .from_dict() has proper generic type - use type Crawl objects for better type safety - add tags to BaseCrawl - fix delete_crawls() typing test: update for page model change --- backend/btrixcloud/basecrawls.py | 16 +++---- backend/btrixcloud/colls.py | 14 +++--- backend/btrixcloud/crawlconfigs.py | 11 +++-- backend/btrixcloud/crawls.py | 45 +++++++++---------- backend/btrixcloud/db.py | 10 +++-- .../migrations/migration_0004_config_seeds.py | 6 ++- backend/btrixcloud/models.py | 9 ++-- backend/btrixcloud/orgs.py | 11 ++--- backend/btrixcloud/pages.py | 2 +- backend/test/test_run_crawl.py | 4 +- 10 files changed, 71 insertions(+), 57 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index f432622d86..fefe4028d2 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -162,17 +162,17 @@ async def get_crawl_out( """Get crawl data for api output""" res = await self.get_crawl_raw(crawlid, org, type_) + files = res.pop("files", None) + res.pop("errors", None) + if not skip_resources: coll_ids = res.get("collectionIds") if coll_ids: res["collections"] = await self.colls.get_collection_names(coll_ids) - files = res.pop("files", None) - res.pop("errors", None) - crawl = CrawlOutWithResources.from_dict(res) - if crawl.type == "crawl" and not skip_resources: + if not skip_resources: crawl = await self._resolve_crawl_refs(crawl, org, files) if crawl.config and crawl.config.seeds: crawl.config.seeds = None @@ -288,9 +288,9 @@ async def delete_crawls( delete_list: DeleteCrawlList, type_: str, user: Optional[User] = None, - ): + ) -> tuple[int, dict[UUID, dict[str, int]], bool]: """Delete a list of crawls by id for given org""" - cids_to_update: dict[str, dict[str, int]] = {} + cids_to_update: dict[UUID, dict[str, int]] = {} size = 0 @@ -320,7 +320,7 @@ async def delete_crawls( crawl_size = await self._delete_crawl_files(crawl, org) size += crawl_size - cid = str(crawl.cid) + cid = crawl.cid if cid: if cids_to_update.get(cid): cids_to_update[cid]["inc"] += 1 @@ -373,8 +373,8 @@ async def _resolve_crawl_refs( self, crawl: 
Union[CrawlOut, CrawlOutWithResources], org: Optional[Organization], + files: Optional[list[dict]], add_first_seed: bool = True, - files: Optional[list[dict]] = None, ): """Resolve running crawl data""" # pylint: disable=too-many-branches diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 1152fec4eb..9d830b1305 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -20,6 +20,7 @@ CollIdName, UpdateColl, AddRemoveCrawlList, + BaseCrawl, CrawlOutWithResources, Organization, PaginatedResponse, @@ -330,16 +331,17 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size = 0 tags = [] - async for crawl in self.crawls.find({"collectionIds": collection_id}): - if crawl["state"] not in SUCCESSFUL_STATES: + async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): + crawl = BaseCrawl.from_dict(crawl_raw) + if crawl.state not in SUCCESSFUL_STATES: continue crawl_count += 1 - files = crawl.get("files", []) + files = crawl.files or [] for file in files: - total_size += file.get("size", 0) + total_size += file.size page_count += crawl.stats.done - if crawl.get("tags"): - tags.extend(crawl.get("tags")) + if crawl.tags: + tags.extend(crawl.tags) sorted_tags = [tag for tag, count in Counter(tags).most_common()] diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 5a8f8dc937..3fb1d09700 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -24,6 +24,7 @@ CrawlConfig, CrawlConfigOut, CrawlConfigIdNameOut, + CrawlOut, EmptyStr, UpdateCrawlConfig, Organization, @@ -559,7 +560,9 @@ async def get_crawl_config_ids_for_profile( results = [CrawlConfigIdNameOut.from_dict(res) for res in results] return results - async def get_running_crawl(self, crawlconfig: CrawlConfig): + async def get_running_crawl( + self, crawlconfig: Union[CrawlConfig, CrawlConfigOut] + ) -> Optional[CrawlOut]: """Return the id of currently running crawl for this config, if any""" # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id) crawls, _ = await self.crawl_ops.list_crawls( @@ -619,13 +622,15 @@ async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1): return result is not None - def _add_curr_crawl_stats(self, crawlconfig, crawl): + def _add_curr_crawl_stats( + self, crawlconfig: CrawlConfigOut, crawl: Optional[CrawlOut] + ): """Add stats from current running crawl, if any""" if not crawl: return crawlconfig.lastCrawlState = crawl.state - crawlconfig.lastCrawlSize = crawl.stats.get("size", 0) if crawl.stats else 0 + crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0 crawlconfig.lastCrawlStopping = crawl.stopping async def get_crawl_config_out(self, cid: UUID, org: Organization): diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index b7ffa3cf19..ceae2797f5 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -226,7 +226,7 @@ async def list_crawls( crawl = cls.from_dict(result) files = result.get("files") if resources else None crawl = await self._resolve_crawl_refs( - crawl, org, add_first_seed=False, files=files + crawl, org, files=files, add_first_seed=False ) crawls.append(crawl) @@ -566,42 +566,38 @@ async def get_crawl_stats( if org: query["oid"] = org.id - async for crawl in self.crawls.find(query): + async for crawl_raw in self.crawls.find(query): + crawl = Crawl.from_dict(crawl_raw) data: Dict[str, Union[str, int]] = {} - data["id"] = 
str(crawl.get("_id")) + data["id"] = crawl.id - oid = crawl.get("oid") - data["oid"] = str(oid) - data["org"] = org_slugs[oid] + data["oid"] = str(crawl.oid) + data["org"] = org_slugs[crawl.oid] - data["cid"] = str(crawl.get("cid")) - crawl_name = crawl.get("name") - data["name"] = f'"{crawl_name}"' if crawl_name else "" - data["state"] = crawl.get("state") + data["cid"] = crawl.id + data["name"] = f'"{crawl.name}"' if crawl.name else "" + data["state"] = crawl.state - userid = crawl.get("userid") - data["userid"] = str(userid) - data["user"] = user_emails.get(userid) + data["userid"] = str(crawl.userid) + data["user"] = user_emails.get(crawl.userid) - started = crawl.get("started") - finished = crawl.get("finished") - - data["started"] = str(started) - data["finished"] = str(finished) + data["started"] = str(crawl.started) + data["finished"] = str(crawl.finished) data["duration"] = 0 - if started and finished: - duration = finished - started + duration_seconds = 0 + if crawl.started and crawl.finished: + duration = crawl.finished - crawl.started duration_seconds = int(duration.total_seconds()) if duration_seconds: data["duration"] = duration_seconds data["pages"] = crawl.stats.done - data["filesize"] = crawl.get("fileSize", 0) + data["filesize"] = crawl.fileSize data["avg_page_time"] = 0 - if crawl.stats.done != 0 and started and finished and duration_seconds: + if crawl.stats.done != 0 and duration_seconds: data["avg_page_time"] = int(duration_seconds / crawl.stats.done) crawls_data.append(data) @@ -807,7 +803,10 @@ async def get_qa_run_for_replay( qa_run.files = [] - return QARunWithResources(resources=resources, **qa_run.dict()) + qa_run_dict = qa_run.dict() + qa_run_dict["resources"] = resources + + return QARunWithResources(**qa_run_dict) # ============================================================================ diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index e9064a47f8..ba70248a76 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -8,7 +8,7 @@ import asyncio from uuid import UUID -from typing import Optional, Union +from typing import Optional, Union, TypeVar, Type import motor.motor_asyncio from pydantic import BaseModel @@ -198,6 +198,10 @@ async def create_indexes( await user_manager.init_index() +# ============================================================================ +T = TypeVar("T") + + # ============================================================================ class BaseMongoModel(BaseModel): """Base pydantic model that is also a mongo doc""" @@ -210,10 +214,10 @@ def id_str(self): return str(self.id) @classmethod - def from_dict(cls, data): + def from_dict(cls: Type[T], data: dict) -> T: """convert dict from mongo to a class""" if not data: - return None + return cls() data["id"] = data.pop("_id") return cls(**data) diff --git a/backend/btrixcloud/migrations/migration_0004_config_seeds.py b/backend/btrixcloud/migrations/migration_0004_config_seeds.py index 61f599ff95..8f6e5ff6c2 100644 --- a/backend/btrixcloud/migrations/migration_0004_config_seeds.py +++ b/backend/btrixcloud/migrations/migration_0004_config_seeds.py @@ -102,12 +102,14 @@ async def migrate_up(self): # Test migration async for config_dict in crawl_configs.find({}): config = CrawlConfig.from_dict(config_dict) - for seed in config.config.seeds: + seeds = config.config.seeds or [] + for seed in seeds: assert isinstance(seed, Seed) assert seed.url async for crawl_dict in crawls.find({}): crawl = Crawl.from_dict(crawl_dict) - for seed in crawl.config.seeds: 
+ seeds = crawl.config.seeds or [] + for seed in seeds: assert isinstance(seed, Seed) assert seed.url diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index b5d52ce94c..1afaa4ee05 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -387,6 +387,8 @@ def get_raw_config(self): class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional): """Crawl Config Output""" + id: UUID + lastCrawlStopping: Optional[bool] = False profileName: Optional[str] firstSeed: Optional[str] @@ -580,8 +582,12 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): name: Optional[str] = "" + stopping: Optional[bool] = False + description: Optional[str] = "" + tags: Optional[List[str]] = [] + collectionIds: Optional[List[UUID]] = [] reviewStatus: Optional[ReviewStatus] = None @@ -730,7 +736,6 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] - stopping: Optional[bool] = False qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} @@ -763,8 +768,6 @@ class UploadedCrawl(BaseCrawl): type: Literal["upload"] = "upload" - tags: Optional[List[str]] = [] - # ============================================================================ class UpdateUpload(UpdateCrawl): diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 7da0424983..6aaf38309c 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -20,6 +20,7 @@ SUCCESSFUL_STATES, RUNNING_STATES, STARTING_STATES, + BaseCrawl, Organization, StorageRef, OrgQuotas, @@ -608,14 +609,14 @@ async def get_org_metrics(self, org: Organization): upload_count = 0 page_count = 0 - async for item in self.crawls_db.find({"oid": org.id}): - if item["state"] not in SUCCESSFUL_STATES: + async for item_data in self.crawls_db.find({"oid": org.id}): + item = BaseCrawl.from_dict(item_data) + if item.state not in SUCCESSFUL_STATES: continue archived_item_count += 1 - type_ = item.get("type") - if type_ == "crawl": + if item.type == "crawl": crawl_count += 1 - if type_ == "upload": + if item.type == "upload": upload_count += 1 page_count += item.stats.done diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 3a2f39ce9a..2fc71d6f89 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -317,7 +317,7 @@ async def list_pages( page: int = 1, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, - ) -> Tuple[List[Page], int]: + ) -> Tuple[List[PageOut], int]: """List all pages in crawl""" # pylint: disable=duplicate-code, too-many-locals # Zero-index page for query diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 08eb770dd2..d7a25e8d9b 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -453,9 +453,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page.get("title") or page.get("title") is None assert page["load_state"] - assert page["screenshotMatch"] == {} - assert page["textMatch"] == {} - assert page["resourceCounts"] == {} + assert page["qa"] == {} assert page["notes"] == [] assert page.get("userid") is None From 4a72080c2bb328b1d859d9afa868dae1477824a7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 17:20:25 -0700 Subject: [PATCH 19/49] move finished qa run to fininshed in finalizer, perform clear qa + add to finished atomatically set ttl for qa crawljobs to 0 to finalize more quickly add shared 'qaCrawlExecSeconds' to crawl to track shared qa exec time usage --- 
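Roughly, the clear-and-archive step described above can be pictured as one Mongo update that unsets the active `qa` field and, when the run ended in a successful state, copies it into `qaFinished` keyed by the run id. A minimal sketch, assuming `crawls` is a Motor collection and reusing the `SUCCESSFUL_STATES` constant from models.py; the function and argument names here are illustrative only, not the exact implementation:

async def qa_run_finished_sketch(crawls, crawl_id: str, qa: dict) -> bool:
    """Clear active QA and archive it under qaFinished in a single $set."""
    from btrixcloud.models import SUCCESSFUL_STATES  # real constant lives in models.py

    update: dict = {"qa": None}
    # Only archive runs that actually finished in a successful terminal state.
    if qa.get("finished") and qa.get("state") in SUCCESSFUL_STATES:
        update[f"qaFinished.{qa['id']}"] = qa
    res = await crawls.find_one_and_update(
        {"_id": crawl_id, "type": "crawl"},
        {"$set": update},
    )
    return res is not None

Combining both writes in one update avoids a window where a retried finalizer could see the QA run in neither field.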
backend/btrixcloud/crawls.py | 27 +++++++++++++++------------ backend/btrixcloud/models.py | 6 ++++-- backend/btrixcloud/operator/crawls.py | 9 +++++---- chart/app-templates/crawl_job.yaml | 2 +- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index ceae2797f5..834307028d 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -479,17 +479,24 @@ async def inc_crawl_exec_time( last_updated_time, ): """increment exec time""" - prefix = "" if not is_qa else "qa." + # update both crawl-shared qa exec seconds and per-qa run exec seconds + if is_qa: + inc_update = { + "qaCrawlExecSeconds": exec_time, + "qa.crawlExecSeconds": exec_time, + } + else: + inc_update = {"crawlExecSeconds": exec_time} return await self.crawls.find_one_and_update( { "_id": crawl_id, "type": "crawl", - f"{prefix}_lut": {"$ne": last_updated_time}, + "_lut": {"$ne": last_updated_time}, }, { - "$inc": {f"{prefix}crawlExecSeconds": exec_time}, - "$set": {f"{prefix}_lut": last_updated_time}, + "$inc": inc_update, + "$set": {"_lut": last_updated_time}, }, ) @@ -744,21 +751,17 @@ async def delete_crawl_qa_run(self, crawl_id: str, qa_run_id: str): return res - async def qa_run_clear(self, crawl_id: str): - """remove any active qa run, if set""" - await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl"}, {"$set": {"qa": None}} - ) - async def qa_run_finished(self, crawl_id: str): - """add qa run to finished list, if successful""" + """clear active qa, add qa run to finished list, if successful""" crawl = await self.get_crawl(crawl_id) if not crawl.qa: return False + query: Dict[str, Any] = {"qa": None} + if crawl.qa.finished and crawl.qa.state in SUCCESSFUL_STATES: - query = {f"qaFinished.{crawl.qa.id}": crawl.qa.dict()} + query[f"qaFinished.{crawl.qa.id}"] = crawl.qa.dict() if await self.crawls.find_one_and_update( {"_id": crawl_id, "type": "crawl"}, {"$set": query} diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 1afaa4ee05..c627439653 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -582,8 +582,6 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): name: Optional[str] = "" - stopping: Optional[bool] = False - description: Optional[str] = "" tags: Optional[List[str]] = [] @@ -737,6 +735,10 @@ class Crawl(BaseCrawl, CrawlConfigCore): # schedule: Optional[str] manual: Optional[bool] + stopping: Optional[bool] = False + + qaCrawlExecSeconds: int = 0 + qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 9a24bebb1f..4ded22bc9c 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -631,14 +631,14 @@ async def finalize_response( if status.finished: ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL) finished = from_k8s_date(status.finished) - if (dt_now() - finished).total_seconds() > ttl > 0: + if (dt_now() - finished).total_seconds() > ttl >= 0: print("CrawlJob expired, deleting: " + crawl.id) finalized = True else: finalized = True if finalized and crawl.is_qa: - await self.crawl_ops.qa_run_clear(crawl.db_crawl_id) + await self.crawl_ops.qa_run_finished(crawl.db_crawl_id) return { "status": status.dict(exclude_none=True), @@ -1309,8 +1309,9 @@ async def do_qa_run_finished_tasks( ) -> None: """Run tasks after qa run completes in asyncio.task coroutine.""" - if state in SUCCESSFUL_STATES: - await 
self.crawl_ops.qa_run_finished(crawl.db_crawl_id) + # now done in finalizer + # if state in SUCCESSFUL_STATES: + # await self.crawl_ops.qa_run_finished(crawl.db_crawl_id) if state in FAILED_STATES: await self.page_ops.delete_qa_run_from_pages(crawl.db_crawl_id, crawl.id) diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 4684e06047..3255e56f99 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -26,7 +26,7 @@ spec: manual: {{ manual }} crawlerChannel: "{{ crawler_channel }}" - ttlSecondsAfterFinished: 30 + ttlSecondsAfterFinished: {{ 30 if not qa_source else 0 }} warcPrefix: "{{ warc_prefix }}" storageName: "{{ storage_name }}" From ec9240b892302f5d6db968fdb59713835707713d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 17:27:08 -0700 Subject: [PATCH 20/49] page ops: create index on 'crawl_id' --- backend/btrixcloud/db.py | 11 +++++++++-- backend/btrixcloud/pages.py | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index ba70248a76..93207ea12a 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -96,7 +96,13 @@ async def update_and_prepare_db( if await run_db_migrations(mdb, user_manager, page_ops): await drop_indexes(mdb) await create_indexes( - org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager + org_ops, + crawl_ops, + crawl_config_ops, + coll_ops, + invite_ops, + user_manager, + page_ops, ) await user_manager.create_super_user() await org_ops.create_default_org() @@ -186,7 +192,7 @@ async def drop_indexes(mdb): # ============================================================================ # pylint: disable=too-many-arguments async def create_indexes( - org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager + org_ops, crawl_ops, crawl_config_ops, coll_ops, invite_ops, user_manager, page_ops ): """Create database indexes.""" print("Creating database indexes", flush=True) @@ -196,6 +202,7 @@ async def create_indexes( await coll_ops.init_index() await invite_ops.init_index() await user_manager.init_index() + await page_ops.init_index() # ============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 2fc71d6f89..1a1e236ebc 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -46,6 +46,10 @@ def __init__(self, mdb, crawl_ops, org_ops, storage_ops): self.org_ops = org_ops self.storage_ops = storage_ops + async def init_index(self): + """init index for pages db collection""" + await self.pages.create_index([("crawl_id", pymongo.HASHED)]) + async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str): """Add pages to database from WACZ files""" try: From 5942a1b9e0a5a1ee121b38246f53c73c8623a465 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 10 Mar 2024 17:40:21 -0700 Subject: [PATCH 21/49] lint fix --- backend/btrixcloud/operator/crawls.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 892db71687..d79f41cc68 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -227,7 +227,9 @@ async def sync_crawls(self, data: MCSyncData): else: status.scale = crawl.scale now = dt_now() - await self.crawl_ops.inc_crawl_exec_time(crawl.db_crawl_id, crawl.is_qa, 0, now) + await 
self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, 0, now + ) status.lastUpdatedTime = to_k8s_date(now) children = self._load_redis(params, status, data.children) @@ -870,7 +872,9 @@ async def increment_pod_exec_time( ) if not update_start_time: - await self.crawl_ops.inc_crawl_exec_time(crawl.db_crawl_id, crawl.is_qa, 0, now) + await self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, 0, now + ) status.lastUpdatedTime = to_k8s_date(now) return @@ -956,7 +960,9 @@ async def increment_pod_exec_time( flush=True, ) - await self.crawl_ops.inc_crawl_exec_time(crawl.db_crawl_id, crawl.is_qa, exec_time, now) + await self.crawl_ops.inc_crawl_exec_time( + crawl.db_crawl_id, crawl.is_qa, exec_time, now + ) status.lastUpdatedTime = to_k8s_date(now) def should_mark_waiting(self, state, started): From b2581fcbab64150c0c9f7f4259e7498d938d6d6a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 08:04:59 -0400 Subject: [PATCH 22/49] additional pages APIs: - /qa//pages - /activeQA - split Page model into core Page, Page with all qa, and Page with single QA data - support filtering with qa_run_id, lte / gte, qa_range_field query args - fix typo in qa stats, use float instead of int! --- backend/btrixcloud/crawls.py | 19 +++++- backend/btrixcloud/models.py | 36 +++++++++--- backend/btrixcloud/operator/crawls.py | 8 ++- backend/btrixcloud/pages.py | 83 ++++++++++++++++++++++++--- 4 files changed, 129 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 0455dae5c2..4a3c510593 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -791,6 +791,16 @@ async def get_qa_runs( # all_qa.insert(0, crawl.qa) return all_qa + async def get_active_qa( + self, crawl_id: str, org: Optional[Organization] = None + ) -> Optional[QARunOut]: + """return just the active QA, if any""" + crawl_data = await self.get_crawl_raw( + crawl_id, org, "crawl", project={"qa": True} + ) + qa = crawl_data.get("qa") + return QARunOut(**qa) if qa else None + async def get_qa_run_for_replay( self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None ) -> QARunWithResources: @@ -1005,7 +1015,6 @@ async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)): # QA APIs # --------------------- - @app.get( "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json", tags=["qa"], @@ -1058,6 +1067,14 @@ async def delete_crawl_qa_run( async def get_qa_runs(crawl_id, org: Organization = Depends(org_viewer_dep)): return await ops.get_qa_runs(crawl_id, org) + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/activeQA", + tags=["qa"], + response_model=Dict[str, Optional[QARunOut]], + ) + async def get_active_qa(crawl_id, org: Organization = Depends(org_viewer_dep)): + return {"qa": await ops.get_active_qa(crawl_id, org)} + # ---- @app.get( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index c627439653..787fffae72 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1481,28 +1481,27 @@ class PageNote(BaseModel): class PageQACompare(BaseModel): """Model for updating pages from QA run""" - screenshotMatch: Optional[int] = None - textMatch: Optional[int] = None + screenshotMatch: Optional[float] = None + textMatch: Optional[float] = None resourceCounts: Optional[Dict[str, int]] # ============================================================================ class Page(BaseMongoModel): - """Model for crawl pages""" + """Core page data, no QA""" id: UUID 
oid: UUID crawl_id: str + + # core page data url: AnyHttpUrl title: Optional[str] = None timestamp: Optional[datetime] = None load_state: Optional[int] = None status: Optional[int] = None - # automated heuristics, keyed by QA run id - qa: Optional[Dict[str, PageQACompare]] = {} - # manual review userid: Optional[UUID] = None modified: Optional[datetime] = None @@ -1510,8 +1509,31 @@ class Page(BaseMongoModel): notes: List[PageNote] = [] +# ============================================================================ +class PageWithAllQA(Page): + """Model for core page data + qa""" + + # automated heuristics, keyed by QA run id + qa: Optional[Dict[str, PageQACompare]] = {} + + # ============================================================================ class PageOut(Page): - """Model for pages output""" + """Model for pages output, no QA""" status: Optional[int] = 200 + + +# ============================================================================ +class PageOutWithSingleQA(Page): + """Page out with single QA entry""" + + qa: Optional[PageQACompare] = None + + +# ============================================================================ +class PagesAndResources(BaseModel): + """moage for qa configmap data, pages + resources""" + + resources: List[CrawlFileOut] = [] + pages: List[PageOut] = [] diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d79f41cc68..97c5db80a6 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -24,6 +24,7 @@ CrawlFile, CrawlCompleteIn, StorageRef, + PagesAndResources, ) from btrixcloud.utils import ( @@ -305,10 +306,15 @@ async def _load_qa_configmap(self, params, children): if name in children[CMAP]: return [children[CMAP][name]] + pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000) + crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id) + res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages) + params["name"] = name - params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) + params["qa_source_replay_json"] = res_and_pages.json() + # params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) return self.load_from_yaml("qa_configmap.yaml", params) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 1a1e236ebc..4759e50dd9 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -10,6 +10,7 @@ from .models import ( Page, PageOut, + PageOutWithSingleQA, PageReviewUpdate, PageQACompare, Organization, @@ -122,6 +123,8 @@ async def add_page_to_db( return compare = PageQACompare(**compare_dict) + print("Adding QA Run Data for Page", page_dict.get("url"), compare) + await self.add_qa_run_for_page(page.id, oid, qa_run_id, compare) async def delete_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None): @@ -315,23 +318,41 @@ async def delete_page_notes( async def list_pages( self, - org: Organization, crawl_id: str, + org: Optional[Organization] = None, + qa_run_id: Optional[str] = None, + qa_range_field: Optional[str] = None, + qa_range_field_gte: Optional[float] = None, + qa_range_field_lte: Optional[float] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, sort_direction: Optional[int] = -1, - ) -> Tuple[List[PageOut], int]: + ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]: """List all pages in crawl""" - # pylint: disable=duplicate-code, too-many-locals + # 
pylint: disable=duplicate-code, too-many-locals, too-many-branches # Zero-index page for query page = page - 1 skip = page_size * page query: dict[str, object] = { - "oid": org.id, "crawl_id": crawl_id, } + if org: + query["oid"] = org.id + + if qa_run_id: + query["qa"] = {"$exists": qa_run_id} + + range_match = {} + + if qa_range_field_gte: + range_match["$gte"] = qa_range_field_gte + if qa_range_field_lte: + range_match["$lte"] = qa_range_field_lte + + if qa_range_field: + query[f"qa.{qa_run_id}.{qa_range_field}"] = range_match aggregate = [{"$match": query}] @@ -340,12 +361,25 @@ async def list_pages( # - automated heuristics like screenshot_comparison (dict keyed by QA run id) # - Ensure notes sorting works okay with notes in list sort_fields = ("url", "title", "notes", "approved", "notes") - if sort_by not in sort_fields: + qa_sort_fields = ("screenshotMatch", "textMatch") + if sort_by not in sort_fields and sort_by not in qa_sort_fields: raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") + + if sort_by in qa_sort_fields: + if not qa_run_id: + raise HTTPException( + status_code=400, detail="qa_run_id_missing_for_qa_sort" + ) + + sort_by = f"qa.{qa_run_id}.{sort_by}" + aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + if qa_run_id: + aggregate.extend([{"$set": {"qa": f"qa.{qa_run_id}"}}]) + aggregate.extend( [ { @@ -371,9 +405,10 @@ async def list_pages( except (IndexError, ValueError): total = 0 - pages = [PageOut.from_dict(data) for data in items] + if qa_run_id: + return [PageOutWithSingleQA.from_dict(data) for data in items], total - return pages, total + return [PageOut.from_dict(data) for data in items], total # ============================================================================ @@ -471,8 +506,40 @@ async def get_pages_list( ): """Retrieve paginated list of pages""" pages, total = await ops.list_pages( - org, crawl_id=crawl_id, + org=org, + page_size=pageSize, + page=page, + sort_by=sortBy, + sort_direction=sortDirection, + ) + return paginated_format(pages, total, page, pageSize) + + @app.get( + "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages", + tags=["pages", "qa"], + response_model=PaginatedResponse, + ) + async def get_pages_list_with_qa( + crawl_id: str, + qa_run_id: str, + qa_range_field: Optional[str] = None, + qa_range_field_gte: Optional[float] = None, + qa_range_field_lte: Optional[float] = None, + org: Organization = Depends(org_crawl_dep), + pageSize: int = DEFAULT_PAGE_SIZE, + page: int = 1, + sortBy: Optional[str] = None, + sortDirection: Optional[int] = -1, + ): + """Retrieve paginated list of pages""" + pages, total = await ops.list_pages( + crawl_id=crawl_id, + org=org, + qa_run_id=qa_run_id, + qa_range_field=qa_range_field, + qa_range_field_gte=qa_range_field_gte, + qa_range_field_lte=qa_range_field_lte, page_size=pageSize, page=page, sort_by=sortBy, From 4d332e9ee1ce167329c8ee49bef5ed8473ed2100 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 08:14:00 -0400 Subject: [PATCH 23/49] update test, remove 'qa' from default /pages response --- backend/test/test_run_crawl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index d7a25e8d9b..8fdce57112 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -453,8 +453,6 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert 
page.get("title") or page.get("title") is None assert page["load_state"] - assert page["qa"] == {} - assert page["notes"] == [] assert page.get("userid") is None assert page.get("modified") is None From cdf7a715de2805369ea4e8a5c877b0c6de347dc5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 11:17:37 -0400 Subject: [PATCH 24/49] Adjust priorities without changing bg-jobs --- chart/templates/priorities.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 97d80ada81..373d939240 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -17,7 +17,7 @@ apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: name: qa-crawl-instance-{{ . }} -value: -{{ add 100 . }} +value: -{{ add 50 . }} globalDefault: false description: "Priority for QA crawl instance #{{ . }}" @@ -29,7 +29,7 @@ apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: name: bg-jobs -value: -1000 +value: -100 globalDefault: false description: "Priority for background jobs" From 87a62a8da03f70ea2157ef377defc7af8ec009b1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 11:51:03 -0400 Subject: [PATCH 25/49] Change qa crawl priority back --- chart/templates/priorities.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 373d939240..073d41222a 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -17,7 +17,7 @@ apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: name: qa-crawl-instance-{{ . }} -value: -{{ add 50 . }} +value: -{{ add 100 . }} globalDefault: false description: "Priority for QA crawl instance #{{ . 
}}" From 9b4b39327f0c82ea8465fb29523aebf5c2821c61 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 11:57:22 -0400 Subject: [PATCH 26/49] Allow None for stats for reverse compatibility --- backend/btrixcloud/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 787fffae72..9e6ccfb315 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -561,7 +561,7 @@ class CoreCrawlable(BaseModel): image: Optional[str] - stats: CrawlStats = CrawlStats() + stats: Optional[CrawlStats] = CrawlStats() files: List[CrawlFile] = [] From 8b3b6c057d62bf42604eaf78542a353c6f2443da Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 12:02:18 -0400 Subject: [PATCH 27/49] Add conditional for if base crawl has stats --- backend/btrixcloud/orgs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 6aaf38309c..dd866efc6b 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -618,7 +618,8 @@ async def get_org_metrics(self, org: Organization): crawl_count += 1 if item.type == "upload": upload_count += 1 - page_count += item.stats.done + if item.stats: + page_count += item.stats.done profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( From 914d6ffe5ac4f0105a19dc9753c6f637cd02179a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 12:34:47 -0400 Subject: [PATCH 28/49] bg job priority: rename to 'bg-job' (to avoid conflict) and change to -1000 --- chart/app-templates/replica_job.yaml | 2 +- chart/templates/priorities.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/chart/app-templates/replica_job.yaml b/chart/app-templates/replica_job.yaml index 30870d3fb9..88a7da17b8 100644 --- a/chart/app-templates/replica_job.yaml +++ b/chart/app-templates/replica_job.yaml @@ -13,7 +13,7 @@ spec: template: spec: restartPolicy: Never - priorityClassName: bg-jobs + priorityClassName: bg-job podFailurePolicy: rules: - action: FailJob diff --git a/chart/templates/priorities.yaml b/chart/templates/priorities.yaml index 073d41222a..9acb5ae8ac 100644 --- a/chart/templates/priorities.yaml +++ b/chart/templates/priorities.yaml @@ -28,8 +28,8 @@ description: "Priority for QA crawl instance #{{ . 
}}" apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: - name: bg-jobs -value: -100 + name: bg-job +value: -1000 globalDefault: false description: "Priority for background jobs" From fc7b1ef442203f005d382eb79cdc713304998fcf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 12:49:37 -0400 Subject: [PATCH 29/49] lint: fix conditional for stats being optional --- backend/btrixcloud/colls.py | 3 ++- backend/btrixcloud/crawls.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 9d830b1305..6a23fe50d2 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -339,7 +339,8 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): files = crawl.files or [] for file in files: total_size += file.size - page_count += crawl.stats.done + if crawl.stats: + page_count += crawl.stats.done if crawl.tags: tags.extend(crawl.tags) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 4a3c510593..1ae5e156e4 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -606,12 +606,13 @@ async def get_crawl_stats( if duration_seconds: data["duration"] = duration_seconds - data["pages"] = crawl.stats.done + if crawl.stats: + data["pages"] = crawl.stats.done data["filesize"] = crawl.fileSize data["avg_page_time"] = 0 - if crawl.stats.done != 0 and duration_seconds: + if crawl.stats and crawl.stats.done != 0 and duration_seconds: data["avg_page_time"] = int(duration_seconds / crawl.stats.done) crawls_data.append(data) From 896787f6776bf6af2ad916c105f160f9da3cbd56 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 13:11:10 -0400 Subject: [PATCH 30/49] Remove duplicate notes field in sort_fields --- backend/btrixcloud/pages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 4759e50dd9..5058d506df 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -360,7 +360,7 @@ async def list_pages( # Sorting options to add: # - automated heuristics like screenshot_comparison (dict keyed by QA run id) # - Ensure notes sorting works okay with notes in list - sort_fields = ("url", "title", "notes", "approved", "notes") + sort_fields = ("url", "title", "notes", "approved") qa_sort_fields = ("screenshotMatch", "textMatch") if sort_by not in sort_fields and sort_by not in qa_sort_fields: raise HTTPException(status_code=400, detail="invalid_sort_by") From 20a3083952de1195c31c9e654e9f4122c96aa1b6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 12 Mar 2024 13:11:24 -0400 Subject: [PATCH 31/49] Print log pages data in operator --- backend/btrixcloud/operator/crawls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 97c5db80a6..f10de63203 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -758,6 +758,8 @@ async def sync_crawl_state( qa_run_id = crawl.id if crawl.is_qa else None while page_crawled: + print("PAGE DATA", flush=True) + print(page_crawled, flush=True) page_dict = json.loads(page_crawled) await self.page_ops.add_page_to_db( page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid From bc7c6680d6b7436797c156096a320c6ca3f8db2f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 21:23:25 -0400 Subject: [PATCH 32/49] additional fixes: - fix qa run stats update while running 
- fix qa run pages, add filterBy, gte, gt, lte and lt query args - include active QA in QA list, update model --- backend/btrixcloud/crawls.py | 11 ++++---- backend/btrixcloud/models.py | 6 ++++- backend/btrixcloud/pages.py | 50 +++++++++++++++++++++++------------- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 1ae5e156e4..a4f0ee9840 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -466,7 +466,7 @@ async def update_running_crawl_stats( ): """update running crawl stats""" prefix = "" if not is_qa else "qa." - query = {"_id": crawl_id, "type": "crawl", "state": "running"} + query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"} return await self.crawls.find_one_and_update( query, {"$set": {f"{prefix}stats": stats.dict()}} ) @@ -783,13 +783,14 @@ async def get_qa_runs( ) -> List[QARunOut]: """Return list of QA runs""" crawl_data = await self.get_crawl_raw( - crawl_id, org, "crawl", project={"qaFinished": True} + crawl_id, org, "crawl", project={"qaFinished": True, "qa": True} ) qa_finished = crawl_data.get("qaFinished") or {} all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()] - all_qa.sort(key=lambda x: x.finished, reverse=True) - # if crawl.qa and (len(all_qa) == 0 or crawl.qa.id != all_qa[0].id): - # all_qa.insert(0, crawl.qa) + all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True) + qa = crawl_data.get("qa") + if qa: + all_qa.insert(0, QARunOut(**qa)) return all_qa async def get_active_qa( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 9e6ccfb315..4a4a9510fb 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -715,7 +715,11 @@ class QARunOut(BaseModel): userName: Optional[str] started: datetime - finished: datetime + finished: Optional[datetime] = None + + state: str + + crawlExecSeconds: int = 0 stats: CrawlStats = CrawlStats() diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 5058d506df..24d36d6f00 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -321,9 +321,11 @@ async def list_pages( crawl_id: str, org: Optional[Organization] = None, qa_run_id: Optional[str] = None, - qa_range_field: Optional[str] = None, - qa_range_field_gte: Optional[float] = None, - qa_range_field_lte: Optional[float] = None, + qa_filter_by: Optional[str] = None, + qa_gte: Optional[float] = None, + qa_gt: Optional[float] = None, + qa_lte: Optional[float] = None, + qa_lt: Optional[float] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, @@ -344,15 +346,22 @@ async def list_pages( if qa_run_id: query["qa"] = {"$exists": qa_run_id} - range_match = {} + range_filter = {} - if qa_range_field_gte: - range_match["$gte"] = qa_range_field_gte - if qa_range_field_lte: - range_match["$lte"] = qa_range_field_lte + if qa_gte: + range_filter["$gte"] = qa_gte + if qa_lte: + range_filter["$lte"] = qa_lte + if qa_gt: + range_filter["$gt"] = qa_gt + if qa_lt: + range_filter["$lt"] = qa_lt - if qa_range_field: - query[f"qa.{qa_run_id}.{qa_range_field}"] = range_match + if qa_filter_by: + if not range_filter: + raise HTTPException(status_code=400, detail="range_missing") + + query[f"qa.{qa_run_id}.{qa_filter_by}"] = range_filter aggregate = [{"$match": query}] @@ -377,8 +386,9 @@ async def list_pages( aggregate.extend([{"$sort": {sort_by: sort_direction}}]) - if qa_run_id: - aggregate.extend([{"$set": {"qa": 
f"qa.{qa_run_id}"}}]) + if qa_run_id: + aggregate.extend([{"$set": {"qa": f"$qa.{qa_run_id}"}}]) + # aggregate.extend([{"$project": {"qa": f"$qa.{qa_run_id}"}}]) aggregate.extend( [ @@ -523,9 +533,11 @@ async def get_pages_list( async def get_pages_list_with_qa( crawl_id: str, qa_run_id: str, - qa_range_field: Optional[str] = None, - qa_range_field_gte: Optional[float] = None, - qa_range_field_lte: Optional[float] = None, + filterBy: Optional[str] = None, + gte: Optional[float] = None, + gt: Optional[float] = None, + lte: Optional[float] = None, + lt: Optional[float] = None, org: Organization = Depends(org_crawl_dep), pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, @@ -537,9 +549,11 @@ async def get_pages_list_with_qa( crawl_id=crawl_id, org=org, qa_run_id=qa_run_id, - qa_range_field=qa_range_field, - qa_range_field_gte=qa_range_field_gte, - qa_range_field_lte=qa_range_field_lte, + qa_filter_by=filterBy, + qa_gte=gte, + qa_gt=gt, + qa_lte=lte, + qa_lt=lt, page_size=pageSize, page=page, sort_by=sortBy, From 3c58cb9c85726bbccba769c13775fe0f0510c924 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 21:38:04 -0400 Subject: [PATCH 33/49] page model rename: rename load_state -> loadState, timestamp -> ts to match pages.jsonl data model --- backend/btrixcloud/models.py | 4 ++-- backend/btrixcloud/pages.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 4a4a9510fb..70c5a8c5ae 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1502,8 +1502,8 @@ class Page(BaseMongoModel): # core page data url: AnyHttpUrl title: Optional[str] = None - timestamp: Optional[datetime] = None - load_state: Optional[int] = None + ts: Optional[datetime] = None + loadState: Optional[int] = None status: Optional[int] = None # manual review diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 24d36d6f00..bba937d5e6 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -91,9 +91,9 @@ async def add_page_to_db( crawl_id=crawl_id, url=page_dict.get("url"), title=page_dict.get("title"), - load_state=page_dict.get("loadState"), + loadState=page_dict.get("loadState"), status=status, - timestamp=( + ts=( from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else datetime.now() From ce58e38f9ea2a3c6a52366805982ea2f9868c328 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 12 Mar 2024 22:50:06 -0400 Subject: [PATCH 34/49] tests: fix page test --- backend/test/test_run_crawl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 8fdce57112..4780a9780b 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -431,9 +431,9 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["oid"] assert page["crawl_id"] assert page["url"] - assert page["timestamp"] + assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["load_state"] + assert page["loadState"] # Test GET page endpoint global page_id @@ -449,9 +449,9 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["oid"] assert page["crawl_id"] assert page["url"] - assert page["timestamp"] + assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["load_state"] + assert page["loadState"] assert page["notes"] == [] assert page.get("userid") is None 
@@ -480,9 +480,9 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id): assert page["oid"] assert page["crawl_id"] assert page["url"] - assert page["timestamp"] + assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["load_state"] + assert page["loadState"] assert page["notes"] == [] assert page["userid"] From ff9592e628aa7112d13f5f384cd27c202de8d80c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Mar 2024 00:53:39 -0400 Subject: [PATCH 35/49] initial QA tests! use 1.1.0-beta.0 crawler for QA support --- backend/test/test_qa.py | 137 ++++++++++++++++++++++++++++++++++++++++ chart/test/test.yaml | 2 +- 2 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 backend/test/test_qa.py diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py new file mode 100644 index 0000000000..f435e2970e --- /dev/null +++ b/backend/test/test_qa.py @@ -0,0 +1,137 @@ +from .conftest import API_PREFIX, HOST_PREFIX +import requests +import time + +qa_run_id = None + + +def test_run_qa(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start", + headers=crawler_auth_headers, + ) + + assert r.status_code == 200 + + data = r.json() + assert data["started"] + global qa_run_id + qa_run_id = data["started"] + + +def test_run_qa_already_running(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start", + headers=crawler_auth_headers, + ) + + assert r.status_code == 400 + assert r.json()["detail"] == "qa_already_running" + + +def test_active_qa(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA", + headers=crawler_auth_headers, + ) + + data = r.json() + qa = data["qa"] + + assert qa + assert qa["state"] + assert qa["started"] + assert not qa["finished"] + + +def test_qa_list(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + data = r.json() + + assert len(data) == 1 + + qa = data[0] + assert qa + assert qa["state"] + assert qa["started"] + assert not qa["finished"] + + +def test_wait_for_complete(crawler_crawl_id, crawler_auth_headers, default_org_id): + count = 0 + completed = False + while count < 24: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA", + headers=crawler_auth_headers, + ) + + data = r.json() + if not data["qa"]: + completed = True + break + + time.sleep(5) + count += 1 + + assert completed + + +def test_qa_completed(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + data = r.json() + + assert len(data) == 1 + + qa = data[0] + assert qa + assert qa["state"] == "complete" + assert qa["started"] + assert qa["finished"] + assert qa["stats"]["found"] == 1 + assert qa["stats"]["done"] == 1 + + +def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages", + headers=crawler_auth_headers, + ) + data = r.json() + print(data) + assert len(data["items"]) == 1 + page = data["items"][0] + assert 
page["title"] == "Webrecorder" + assert page["url"] == "https://webrecorder.net/" + assert page["qa"]["textMatch"] == 1.0 + assert page["qa"]["screenshotMatch"] == 1.0 + assert page["qa"]["resourceCounts"] == {"crawlGood": 15, "crawlBad": 0, "replayGood": 15, "replayBad": 1} + +def test_qa_replay(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json", + headers=crawler_auth_headers, + ) + data = r.json() + assert len(data["resources"]) == 1 + assert data["resources"][0]["path"] + + +def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop", + headers=crawler_auth_headers, + ) + + assert r.status_code == 400 + assert r.json()["detail"] == "qa_not_running" + + diff --git a/chart/test/test.yaml b/chart/test/test.yaml index 91af1846e2..b75a948bc9 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -22,7 +22,7 @@ crawler_channels: image: "docker.io/webrecorder/browsertrix-crawler:latest" - id: test - image: "docker.io/webrecorder/browsertrix-crawler:1.0.0-beta.4" + image: "docker.io/webrecorder/browsertrix-crawler:1.1.0-beta.0" mongo_auth: # specify either username + password (for local mongo) From 571b3063fe0a7a31d501abbc32a22978f7acc7ea Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Mar 2024 01:42:24 -0400 Subject: [PATCH 36/49] qa: fix qa deletion, add test for deleting qa run, ensuring removed from both crawl and pages --- backend/btrixcloud/crawls.py | 5 ++++- backend/btrixcloud/pages.py | 2 +- backend/test/test_qa.py | 31 ++++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index a4f0ee9840..24004a71a0 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -755,9 +755,12 @@ async def delete_crawl_qa_run(self, crawl_id: str, qa_run_id: str): {"$unset": {f"qaFinished.{qa_run_id}": ""}}, ) + if not res: + return {"deleted": False} + await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) - return res + return {"deleted": True} async def qa_run_finished(self, crawl_id: str): """clear active qa, add qa run to finished list, if successful""" diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index bba937d5e6..230f18ff69 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -344,7 +344,7 @@ async def list_pages( query["oid"] = org.id if qa_run_id: - query["qa"] = {"$exists": qa_run_id} + query[f"qa.{qa_run_id}"] = {"$exists": True} range_filter = {} diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py index f435e2970e..61766704d2 100644 --- a/backend/test/test_qa.py +++ b/backend/test/test_qa.py @@ -113,7 +113,13 @@ def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id): assert page["url"] == "https://webrecorder.net/" assert page["qa"]["textMatch"] == 1.0 assert page["qa"]["screenshotMatch"] == 1.0 - assert page["qa"]["resourceCounts"] == {"crawlGood": 15, "crawlBad": 0, "replayGood": 15, "replayBad": 1} + assert page["qa"]["resourceCounts"] == { + "crawlGood": 15, + "crawlBad": 0, + "replayGood": 15, + "replayBad": 1, + } + def test_qa_replay(crawler_crawl_id, crawler_auth_headers, default_org_id): r = requests.get( @@ -135,3 +141,26 @@ def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_ assert 
r.json()["detail"] == "qa_not_running" +def test_delete_qa_run(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.delete( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}", + headers=crawler_auth_headers, + ) + + assert r.status_code == 200 + assert r.json()["deleted"] == True + + # deleted from finished qa list + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa", + headers=crawler_auth_headers, + ) + + assert len(r.json()) == 0 + + # deleted from pages + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages", + headers=crawler_auth_headers, + ) + assert len(r.json()["items"]) == 0 From b544a590db90f812fd07154c80440aae69baf2be Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Mar 2024 07:59:29 -0700 Subject: [PATCH 37/49] Update backend/btrixcloud/basecrawls.py Co-authored-by: Tessa Walsh --- backend/btrixcloud/basecrawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index fefe4028d2..f4d11a9182 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -436,7 +436,7 @@ async def _resolve_signed_urls( prefix = "files" if qa_run_id: - prefix = f"qa.{qa_run_id}.{prefix}" + prefix = f"qaFinished.{qa_run_id}.{prefix}" await self.crawls.find_one_and_update( {f"{prefix}.filename": file_.filename}, From d61b8d91860eb85e6de57298e305dd4b781157b2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Mar 2024 12:19:37 -0400 Subject: [PATCH 38/49] also track qa-only exec time and usage, add qausage and qaexectime to org model --- backend/btrixcloud/models.py | 14 ++++++++++++++ backend/btrixcloud/operator/crawls.py | 6 ++++-- backend/btrixcloud/orgs.py | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 70c5a8c5ae..b68e8fb01e 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -964,8 +964,15 @@ class OrgOut(BaseMongoModel): storageQuotaReached: Optional[bool] execMinutesQuotaReached: Optional[bool] + # total usage and exec time usage: Optional[Dict[str, int]] crawlExecSeconds: Dict[str, int] = {} + + # qa only usage + exec time + qausage: Optional[Dict[str, int]] = {} + qacrawlExecSeconds: Dict[str, int] = {} + + # exec time limits monthlyExecSeconds: Dict[str, int] = {} extraExecSeconds: Dict[str, int] = {} giftedExecSeconds: Dict[str, int] = {} @@ -999,8 +1006,15 @@ class Organization(BaseMongoModel): bytesStoredUploads: int = 0 bytesStoredProfiles: int = 0 + # total usage + exec time usage: Dict[str, int] = {} crawlExecSeconds: Dict[str, int] = {} + + # qa only usage + exec time + qausage: Dict[str, int] = {} + qacrawlExecSeconds: Dict[str, int] = {} + + # exec time limits monthlyExecSeconds: Dict[str, int] = {} extraExecSeconds: Dict[str, int] = {} giftedExecSeconds: Dict[str, int] = {} diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index f10de63203..059bd7b144 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -959,7 +959,9 @@ async def increment_pod_exec_time( max_duration = max(duration, max_duration) if exec_time: - await self.org_ops.inc_org_time_stats(crawl.oid, exec_time, True) + await self.org_ops.inc_org_time_stats( + crawl.oid, exec_time, True, crawl.is_qa + ) status.crawlExecTime += exec_time 
status.elapsedCrawlTime += max_duration @@ -1338,7 +1340,7 @@ async def inc_crawl_complete_stats(self, crawl: CrawlSpec, finished: datetime): print(f"Duration: {duration}", flush=True) - await self.org_ops.inc_org_time_stats(crawl.oid, duration) + await self.org_ops.inc_org_time_stats(crawl.oid, duration, False, crawl.is_qa) async def mark_for_cancelation(self, crawl_id): """mark crawl as canceled in redis""" diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index dd866efc6b..9ed56daa5c 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -499,18 +499,21 @@ async def set_origin(self, org: Organization, request: Request): {"_id": org.id}, {"$set": {"origin": origin}} ) - async def inc_org_time_stats(self, oid, duration, is_exec_time=False): + async def inc_org_time_stats(self, oid, duration, is_exec_time=False, is_qa=False): """inc crawl duration stats for org Overage is applied only to crawlExecSeconds - monthlyExecSeconds, giftedExecSeconds, and extraExecSeconds are added to only up to quotas + + If is_qa is true, also update seperate qa only counter """ - # pylint: disable=too-many-return-statements + # pylint: disable=too-many-return-statements, too-many-locals key = "crawlExecSeconds" if is_exec_time else "usage" yymm = datetime.utcnow().strftime("%Y-%m") - await self.orgs.find_one_and_update( - {"_id": oid}, {"$inc": {f"{key}.{yymm}": duration}} - ) + inc_query = {f"{key}.{yymm}": duration} + if is_qa: + inc_query[f"qa{key}.{yymm}"] = duration + await self.orgs.find_one_and_update({"_id": oid}, {"$inc": inc_query}) if not is_exec_time: return From 310780a8a87c00626a22138af3fd4231415c7ef7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 13 Mar 2024 12:28:19 -0400 Subject: [PATCH 39/49] change delete endpoint to use POST, accept a list of qa_run_ids --- backend/btrixcloud/crawls.py | 31 ++++++++++++++++++------------- backend/btrixcloud/models.py | 7 +++++++ backend/test/test_qa.py | 5 +++-- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 24004a71a0..64df8d488c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -34,6 +34,7 @@ QARun, QARunOut, QARunWithResources, + DeleteQARunList, Organization, User, PaginatedResponse, @@ -747,20 +748,22 @@ async def stop_crawl_qa_run(self, crawl_id: str, org: Organization): status_code=404, detail=f"crawl_not_found, (details: {exc})" ) - async def delete_crawl_qa_run(self, crawl_id: str, qa_run_id: str): + async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList): """delete specified finished QA run""" - res = await self.crawls.find_one_and_update( - {"_id": crawl_id, "type": "crawl"}, - {"$unset": {f"qaFinished.{qa_run_id}": ""}}, - ) + count = 0 + for qa_run_id in delete_list.qa_run_ids: + res = await self.crawls.find_one_and_update( + {"_id": crawl_id, "type": "crawl"}, + {"$unset": {f"qaFinished.{qa_run_id}": ""}}, + ) - if not res: - return {"deleted": False} + if res: + count += 1 - await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) + await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id) - return {"deleted": True} + return {"deleted": count} async def qa_run_finished(self, crawl_id: str): """clear active qa, add qa run to finished list, if successful""" @@ -1057,12 +1060,14 @@ async def stop_crawl_qa_run( # pylint: disable=unused-argument return await ops.stop_crawl_qa_run(crawl_id, org) - 
@app.delete("/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}", tags=["qa"]) - async def delete_crawl_qa_run( - crawl_id: str, qa_run_id: str, org: Organization = Depends(org_crawl_dep) + @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/delete", tags=["qa"]) + async def delete_crawl_qa_runs( + crawl_id: str, + qa_run_ids: DeleteQARunList, + org: Organization = Depends(org_crawl_dep), ): # pylint: disable=unused-argument - return await ops.delete_crawl_qa_run(crawl_id, qa_run_id) + return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids) @app.get( "/orgs/{oid}/crawls/{crawl_id}/qa", diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index b68e8fb01e..1dabaea6f6 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -682,6 +682,13 @@ class DeleteCrawlList(BaseModel): crawl_ids: List[str] +# ============================================================================ +class DeleteQARunList(BaseModel): + """delete qa run list POST body""" + + qa_run_ids: List[str] + + # ============================================================================ ### AUTOMATED CRAWLS ### diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py index 61766704d2..a9d575e0aa 100644 --- a/backend/test/test_qa.py +++ b/backend/test/test_qa.py @@ -142,8 +142,9 @@ def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_ def test_delete_qa_run(crawler_crawl_id, crawler_auth_headers, default_org_id): - r = requests.delete( - f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}", + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete", + json={"qa_run_ids": [qa_run_id]}, headers=crawler_auth_headers, ) From d512defd5aa5f32c1b1330e0cba5b78a7d8de7c2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 15 Mar 2024 14:00:33 -0400 Subject: [PATCH 40/49] bump kubernetes-asyncio to 29.0.0 --- backend/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/requirements.txt b/backend/requirements.txt index 0877636c95..f8204b2116 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -9,7 +9,7 @@ email-validator #fastapi-users[mongodb]==9.2.2 loguru aiofiles -kubernetes-asyncio==25.11.0 +kubernetes-asyncio==29.0.0 kubernetes aiobotocore redis>=5.0.0 From ce72edba911f05497f050c43f8ca66cf47594d44 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Mar 2024 07:34:46 -0400 Subject: [PATCH 41/49] fix qa org keys to be properly cased --- backend/btrixcloud/orgs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 9ed56daa5c..804d60aa92 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -512,7 +512,8 @@ async def inc_org_time_stats(self, oid, duration, is_exec_time=False, is_qa=Fals yymm = datetime.utcnow().strftime("%Y-%m") inc_query = {f"{key}.{yymm}": duration} if is_qa: - inc_query[f"qa{key}.{yymm}"] = duration + qa_key = "qaCrawlExecSeconds" if is_exec_time else "qaUsage" + inc_query[f"{qa_key}.{yymm}"] = duration await self.orgs.find_one_and_update({"_id": oid}, {"$inc": inc_query}) if not is_exec_time: From 43e2d0bc84fe0c2f276b865fdc39eac4d37b06c8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Mar 2024 17:41:05 -0700 Subject: [PATCH 42/49] additional type fixes --- backend/btrixcloud/crawls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py 
index 8660f31e46..4711bec709 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1222,7 +1222,7 @@ async def stream_crawl_logs( logLevel: Optional[str] = None, context: Optional[str] = None, ): - crawl = await ops.get_crawl(crawl_id, org) + crawl = await ops.get_crawl_out(crawl_id, org) log_levels = [] contexts = [] @@ -1234,7 +1234,7 @@ async def stream_crawl_logs( # If crawl is finished, stream logs from WACZ files using presigned urls if crawl.finished: resp = await ops.storage_ops.sync_stream_wacz_logs( - crawl.resources, log_levels, contexts + crawl.resources or [], log_levels, contexts ) return StreamingResponse( resp, From cbf3c9992438701f2ad969a7b702ed15c6af1331 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Mar 2024 22:31:32 -0700 Subject: [PATCH 43/49] fix Page constructor --- backend/btrixcloud/operator/models.py | 2 +- backend/btrixcloud/pages.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index acc28b9ffd..f5a2f41473 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -60,7 +60,7 @@ class CrawlSpec(BaseModel): scheduled: bool = False timeout: int = 0 max_crawl_size: int = 0 - qa_source_crawl_id: str = "" + qa_source_crawl_id: Optional[str] = "" @property def db_crawl_id(self) -> str: diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index cf1505c98f..8c721e3167 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -99,9 +99,9 @@ def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUI crawl_id=crawl_id, url=page_dict.get("url"), title=page_dict.get("title"), - load_state=page_dict.get("loadState"), + loadState=page_dict.get("loadState"), status=status, - timestamp=( + ts=( from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else datetime.now() @@ -148,6 +148,7 @@ async def add_page_to_db( flush=True, ) return + # qa data if qa_run_id and page: compare_dict = page_dict.get("comparison") From fa4442051eacae31b64939235addbf3f0672c7f4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Mar 2024 22:51:40 -0700 Subject: [PATCH 44/49] fix test, update crawler image to 1.1.0-beta.1 --- backend/test/test_run_crawl.py | 2 +- chart/test/test.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 7ef70c609a..a62848d4c0 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -518,7 +518,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_ assert page["oid"] assert page["crawl_id"] assert page["url"] - assert page["timestamp"] + assert page["ts"] assert page.get("title") or page.get("title") is None assert page["load_state"] assert page["status"] diff --git a/chart/test/test.yaml b/chart/test/test.yaml index b75a948bc9..b867e49713 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -22,7 +22,7 @@ crawler_channels: image: "docker.io/webrecorder/browsertrix-crawler:latest" - id: test - image: "docker.io/webrecorder/browsertrix-crawler:1.1.0-beta.0" + image: "docker.io/webrecorder/browsertrix-crawler:1.1.0-beta.1" mongo_auth: # specify either username + password (for local mongo) From 039f273d9abf4e743f4dcb760b20709c71f7fa37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Mar 2024 09:49:59 -0700 Subject: [PATCH 45/49] fix load_state -> loadState --- 
backend/test/test_run_crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index a62848d4c0..59720c40dc 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -520,7 +520,7 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_ assert page["url"] assert page["ts"] assert page.get("title") or page.get("title") is None - assert page["load_state"] + assert page["loadState"] assert page["status"] # Ensure only superuser can re-add pages for all crawls in an org From f5d1c02c88e5b1bd97735daa9db91b3d7f5b34f1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Mar 2024 09:55:35 -0700 Subject: [PATCH 46/49] fix models to contain qaUsage, qaCrawlExecSeconds in org and org out --- backend/btrixcloud/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 1dabaea6f6..70485119ab 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -976,8 +976,8 @@ class OrgOut(BaseMongoModel): crawlExecSeconds: Dict[str, int] = {} # qa only usage + exec time - qausage: Optional[Dict[str, int]] = {} - qacrawlExecSeconds: Dict[str, int] = {} + qaUsage: Optional[Dict[str, int]] = {} + qaCrawlExecSeconds: Dict[str, int] = {} # exec time limits monthlyExecSeconds: Dict[str, int] = {} @@ -1018,8 +1018,8 @@ class Organization(BaseMongoModel): crawlExecSeconds: Dict[str, int] = {} # qa only usage + exec time - qausage: Dict[str, int] = {} - qacrawlExecSeconds: Dict[str, int] = {} + qaUsage: Dict[str, int] = {} + qaCrawlExecSeconds: Dict[str, int] = {} # exec time limits monthlyExecSeconds: Dict[str, int] = {} From 9e9f352ed68cbf5c12d696948f92d7366bbc6e0b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Mar 2024 10:37:27 -0700 Subject: [PATCH 47/49] rename FRONTEND_ALIAS -> FRONTEND_ORIGIN add resolve_internal_access_path() to storages which prepends the frontend_origin in get_internal_crawl_out() --- backend/btrixcloud/basecrawls.py | 5 +++-- backend/btrixcloud/storages.py | 27 +++++++++++++++------------ chart/templates/configmap.yaml | 2 +- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index f4d11a9182..a02ab3fac4 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -63,6 +63,8 @@ class BaseCrawlOps: background_job_ops: BackgroundJobOps page_ops: PageOps + presign_duration: int + def __init__( self, mdb, @@ -189,8 +191,7 @@ async def get_internal_crawl_out(self, crawl_id): crawl_out = await self.get_crawl_out(crawl_id) resources = crawl_out.resources or [] for file_ in resources: - if file_.path.startswith("/"): - file_.path = "http://browsertrix-cloud-frontend.default" + file_.path + file_.path = self.storage_ops.resolve_internal_access_path(file_.path) return crawl_out diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index a28878fc4b..835aadffee 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -72,17 +72,20 @@ class StorageOps: org_ops: OrgOps crawl_manager: CrawlManager + is_local_minio: bool + frontend_origin: str + def __init__(self, org_ops, crawl_manager) -> None: self.org_ops = org_ops self.crawl_manager = crawl_manager self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO")) - self.frontend_alias = os.environ.get( - "FRONTEND_ALIAS", 
"http://browsertrix-cloud-frontend" + frontend_origin = os.environ.get( + "FRONTEND_ORIGIN", "http://browsertrix-cloud-frontend" ) - self.default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") - self.frontend_url = f"{self.frontend_alias}.{self.default_namespace}" + default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default") + self.frontend_origin = f"{frontend_origin}.{default_namespace}" with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh: storage_list = json.loads(fh.read()) @@ -320,6 +323,12 @@ async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None resp = await client.put_object(Bucket=bucket, Key=key, Body=data) assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200 + def resolve_internal_access_path(self, path): + """Resolve relative path for internal access to minio bucket""" + if path.startswith("/"): + return self.frontend_origin + path + return path + def get_org_relative_path( self, org: Organization, ref: StorageRef, file_path: str ) -> str: @@ -601,10 +610,7 @@ def organize_based_on_instance_number( wacz_log_streams: List[Iterator[dict]] = [] for wacz_file in instance_list: - wacz_url = wacz_file.path - if wacz_url.startswith("/data"): - wacz_url = f"{self.frontend_url}{wacz_url}" - + wacz_url = self.resolve_internal_access_path(wacz_file.path) with RemoteZip(wacz_url) as remote_zip: log_files: List[ZipInfo] = [ f @@ -649,10 +655,7 @@ def stream_page_lines( page_generators: List[Iterator[Dict[Any, Any]]] = [] for wacz_file in wacz_files: - wacz_url = wacz_file.path - if wacz_url.startswith("/data"): - wacz_url = f"{self.frontend_url}{wacz_url}" - + wacz_url = self.resolve_internal_access_path(wacz_file.path) with RemoteZip(wacz_url) as remote_zip: page_files: List[ZipInfo] = [ f diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 6efb48a86b..c19255d9c1 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -12,7 +12,7 @@ data: DEFAULT_NAMESPACE: {{ .Release.Namespace }} - FRONTEND_ALIAS: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }} + FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }} CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local" From bc3bc4427a802da2d1c40dc4af3b8446cefca3dd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Mar 2024 14:26:32 -0700 Subject: [PATCH 48/49] add test for qaCrawlExecuted and qaUsage, ensure qaUsage is populated as well by calling inc_crawl_complete_stats for both regular crawls and qa runs --- backend/btrixcloud/models.py | 1 + backend/btrixcloud/operator/crawls.py | 11 ++++------- backend/test/test_qa.py | 22 +++++++++++++++++++++- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 70485119ab..2b468699dc 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -635,6 +635,7 @@ class CrawlOut(BaseMongoModel): collectionIds: Optional[List[UUID]] = [] crawlExecSeconds: int = 0 + qaCrawlExecSeconds: int = 0 # automated crawl fields config: Optional[RawCrawlConfig] diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 059bd7b144..c33afe1d59 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1272,9 +1272,12 @@ async def mark_finished( status.finished = to_k8s_date(finished) + if state in SUCCESSFUL_STATES: + await 
self.inc_crawl_complete_stats(crawl, finished) + # Regular Crawl Finished if not crawl.is_qa: - self.run_task(self.do_crawl_finished_tasks(crawl, status, finished, state)) + self.run_task(self.do_crawl_finished_tasks(crawl, status, state)) # QA Run Finished else: @@ -1287,7 +1290,6 @@ async def do_crawl_finished_tasks( self, crawl: CrawlSpec, status: CrawlStatus, - finished: datetime, state: str, ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" @@ -1296,7 +1298,6 @@ async def do_crawl_finished_tasks( ) if state in SUCCESSFUL_STATES and crawl.oid: - await self.inc_crawl_complete_stats(crawl, finished) await self.org_ops.inc_org_bytes_stored( crawl.oid, status.filesAddedSize, "crawl" ) @@ -1321,10 +1322,6 @@ async def do_qa_run_finished_tasks( ) -> None: """Run tasks after qa run completes in asyncio.task coroutine.""" - # now done in finalizer - # if state in SUCCESSFUL_STATES: - # await self.crawl_ops.qa_run_finished(crawl.db_crawl_id) - if state in FAILED_STATES: await self.page_ops.delete_qa_run_from_pages(crawl.db_crawl_id, crawl.id) diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py index a9d575e0aa..6029541843 100644 --- a/backend/test/test_qa.py +++ b/backend/test/test_qa.py @@ -1,6 +1,7 @@ from .conftest import API_PREFIX, HOST_PREFIX import requests import time +from datetime import datetime qa_run_id = None @@ -98,6 +99,26 @@ def test_qa_completed(crawler_crawl_id, crawler_auth_headers, default_org_id): assert qa["finished"] assert qa["stats"]["found"] == 1 assert qa["stats"]["done"] == 1 + assert qa["crawlExecSeconds"] > 0 + + +def test_qa_org_stats(crawler_crawl_id, crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}", + headers=crawler_auth_headers, + ) + crawl_stats = r.json() + assert crawl_stats["qaCrawlExecSeconds"] > 0 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}", + headers=crawler_auth_headers, + ) + org_stats = r.json() + + yymm = datetime.utcnow().strftime("%Y-%m") + assert org_stats["qaCrawlExecSeconds"][yymm] > 0 + assert org_stats["qaUsage"][yymm] > 0 def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id): @@ -106,7 +127,6 @@ def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id): headers=crawler_auth_headers, ) data = r.json() - print(data) assert len(data["items"]) == 1 page = data["items"][0] assert page["title"] == "Webrecorder" From 55994a180963ba32829fe83705d5328acfb70431 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Mar 2024 16:04:00 -0700 Subject: [PATCH 49/49] filterBy -> filterQABy --- backend/btrixcloud/pages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 8c721e3167..231ff9cb58 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -600,7 +600,7 @@ async def get_pages_list( async def get_pages_list_with_qa( crawl_id: str, qa_run_id: str, - filterBy: Optional[str] = None, + filterQABy: Optional[str] = None, gte: Optional[float] = None, gt: Optional[float] = None, lte: Optional[float] = None, @@ -616,7 +616,7 @@ async def get_pages_list_with_qa( crawl_id=crawl_id, org=org, qa_run_id=qa_run_id, - qa_filter_by=filterBy, + qa_filter_by=filterQABy, qa_gte=gte, qa_gt=gt, qa_lte=lte,