diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index e45ae1ac2..025ab70d8 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -222,16 +222,6 @@ async def delete_crawls(
 
         return {"deleted": True, "storageQuotaReached": quota_reached}
 
-    async def get_wacz_files(self, crawl_id: str, org: Organization):
-        """Return list of WACZ files associated with crawl."""
-        wacz_files = []
-        crawl_raw = await self.get_crawl_raw(crawl_id, org)
-        crawl = Crawl.from_dict(crawl_raw)
-        for file_ in crawl.files:
-            if file_.filename.endswith(".wacz"):
-                wacz_files.append(file_)
-        return wacz_files
-
     # pylint: disable=too-many-arguments
     async def add_new_crawl(
         self,
@@ -889,11 +879,10 @@ async def stream_crawl_logs(
         if context:
             contexts = context.split(",")
 
-        # If crawl is finished, stream logs from WACZ files
+        # If crawl is finished, stream logs from WACZ files using presigned urls
         if crawl.finished:
-            wacz_files = await ops.get_wacz_files(crawl_id, org)
             resp = await ops.storage_ops.sync_stream_wacz_logs(
-                org, wacz_files, log_levels, contexts
+                crawl.resources, log_levels, contexts
             )
             return StreamingResponse(
                 resp,
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 91c5c3bb7..b3af72b3d 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1,5 +1,7 @@
 """crawl pages"""
 
+import asyncio
+import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
@@ -42,51 +44,83 @@ class PageOps:
 
     def __init__(self, mdb, crawl_ops, org_ops, storage_ops):
         self.pages = mdb["pages"]
+        self.crawls = mdb["crawls"]
         self.crawl_ops = crawl_ops
         self.org_ops = org_ops
         self.storage_ops = storage_ops
 
-    async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str):
+    async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
         """Add pages to database from WACZ files"""
+        pages_buffer: List[Page] = []
         try:
             crawl = await self.crawl_ops.get_crawl(crawl_id, None)
-            org = await self.org_ops.get_org_by_id(crawl.oid)
-            wacz_files = await self.crawl_ops.get_wacz_files(crawl_id, org)
-            stream = await self.storage_ops.sync_stream_pages_from_wacz(org, wacz_files)
+            stream = await self.storage_ops.sync_stream_wacz_pages(crawl.resources)
             for page_dict in stream:
                 if not page_dict.get("url"):
                     continue
 
-                await self.add_page_to_db(page_dict, crawl_id, crawl.oid)
+                if len(pages_buffer) > batch_size:
+                    await self._add_pages_to_db(pages_buffer)
+
+                pages_buffer.append(
+                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
+                )
+
+            # Add any remaining pages in buffer to db
+            if pages_buffer:
+                await self._add_pages_to_db(pages_buffer)
+
+            print(f"Added pages for crawl {crawl_id} to db", flush=True)
         # pylint: disable=broad-exception-caught, raise-missing-from
         except Exception as err:
+            traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
-    async def add_page_to_db(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
-        """Add page to database"""
+    def _get_page_from_dict(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+        """Return Page object from dict"""
         page_id = page_dict.get("id")
         if not page_id:
             print(f'Page {page_dict.get("url")} has no id - assigning UUID', flush=True)
             page_id = uuid4()
 
+        status = page_dict.get("status")
+        if not status and page_dict.get("loadState"):
+            status = 200
+
+        return Page(
+            id=page_id,
+            oid=oid,
+            crawl_id=crawl_id,
+            url=page_dict.get("url"),
+            title=page_dict.get("title"),
+            load_state=page_dict.get("loadState"),
+            status=status,
+            timestamp=(
+                from_k8s_date(page_dict.get("ts"))
+                if page_dict.get("ts")
+                else datetime.now()
+            ),
+        )
+
+    async def _add_pages_to_db(self, pages: List[Page]):
+        """Add batch of pages to db in one insert"""
+        result = await self.pages.insert_many(
+            [
+                page.to_dict(
+                    exclude_unset=True, exclude_none=True, exclude_defaults=True
+                )
+                for page in pages
+            ]
+        )
+        if not result.inserted_ids:
+            # pylint: disable=broad-exception-raised
+            raise Exception("No pages inserted")
+
+    async def add_page_to_db(self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID):
+        """Add page to database"""
+        page = self._get_page_from_dict(page_dict, crawl_id, oid)
+
         try:
-            status = page_dict.get("status")
-            if not status and page_dict.get("loadState"):
-                status = 200
-            page = Page(
-                id=page_id,
-                oid=oid,
-                crawl_id=crawl_id,
-                url=page_dict.get("url"),
-                title=page_dict.get("title"),
-                load_state=page_dict.get("loadState"),
-                status=status,
-                timestamp=(
-                    from_k8s_date(page_dict.get("ts"))
-                    if page_dict.get("ts")
-                    else datetime.now()
-                ),
-            )
             await self.pages.insert_one(
                 page.to_dict(
                     exclude_unset=True, exclude_none=True, exclude_defaults=True
@@ -97,7 +131,7 @@ async def add_page_to_db(self, page_dict: Dict[str, Any], crawl_id: str, oid: UU
         # pylint: disable=broad-except
         except Exception as err:
             print(
-                f"Error adding page {page_id} from crawl {crawl_id} to db: {err}",
+                f"Error adding page {page.id} from crawl {crawl_id} to db: {err}",
                 flush=True,
             )
 
@@ -360,6 +394,20 @@ async def list_pages(
 
         return pages, total
 
+    async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
+        """Delete existing pages for crawl and re-add from WACZs."""
+        await self.delete_crawl_pages(crawl_id, oid)
+        print(f"Deleted pages for crawl {crawl_id}", flush=True)
+        await self.add_crawl_pages_to_db_from_wacz(crawl_id)
+
+    async def re_add_all_crawl_pages(self, oid: UUID):
+        """Re-add pages for all crawls in org"""
+        crawl_ids = await self.crawls.distinct(
+            "_id", {"type": "crawl", "finished": {"$ne": None}}
+        )
+        for crawl_id in crawl_ids:
+            await self.re_add_crawl_pages(crawl_id, oid)
+
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
@@ -371,6 +419,25 @@ def init_pages_api(app, mdb, crawl_ops, org_ops, storage_ops, user_dep):
 
     org_crawl_dep = org_ops.org_crawl_dep
 
+    @app.post("/orgs/{oid}/crawls/all/pages/reAdd", tags=["pages"])
+    async def re_add_all_crawl_pages(
+        org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep)
+    ):
+        """Re-add pages for all crawls in org (superuser only)"""
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        asyncio.create_task(ops.re_add_all_crawl_pages(org.id))
+        return {"started": True}
+
+    @app.post("/orgs/{oid}/crawls/{crawl_id}/pages/reAdd", tags=["pages"])
+    async def re_add_crawl_pages(
+        crawl_id: str, org: Organization = Depends(org_crawl_dep)
+    ):
+        """Re-add pages for crawl"""
+        asyncio.create_task(ops.re_add_crawl_pages(crawl_id, org.id))
+        return {"started": True}
+
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/pages/{page_id}",
         tags=["pages"],
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 4161da1d5..6d40cf04f 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -14,18 +14,20 @@
 )
 from urllib.parse import urlsplit
 from contextlib import asynccontextmanager
+from itertools import chain
 
 import asyncio
 import heapq
 import zlib
 import json
-import itertools
 import os
 
 from datetime import datetime
+from zipfile import ZipInfo
 
 from fastapi import Depends, HTTPException
 from stream_zip import stream_zip, NO_COMPRESSION_64
+from remotezip import RemoteZip
 
 import aiobotocore.session
 import boto3
@@ -43,10 +45,6 @@
     S3StorageIn,
     OrgStorageRefs,
 )
-from .zip import (
-    sync_get_zip_file,
-    sync_get_filestream,
-)
 
 from .utils import is_bool, slug_from_name
 
@@ -80,6 +78,12 @@ def __init__(self, org_ops, crawl_manager) -> None:
 
         self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO"))
 
+        self.frontend_alias = os.environ.get(
+            "FRONTEND_ALIAS", "http://browsertrix-cloud-frontend"
+        )
+        self.default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default")
+        self.frontend_url = f"{self.frontend_alias}.{self.default_namespace}"
+
         with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh:
             storage_list = json.loads(fh.read())
 
@@ -512,75 +516,53 @@ async def _delete_file(
 
         return status_code == 204
 
-    async def sync_stream_pages_from_wacz(
-        self,
-        org: Organization,
-        wacz_files: List[CrawlFile],
+    async def sync_stream_wacz_pages(
+        self, wacz_files: List[CrawlFileOut]
     ) -> Iterator[Dict[Any, Any]]:
-        """Return stream of page dicts from last of WACZs"""
-        async with self.get_sync_client(org) as (client, bucket, key):
-            loop = asyncio.get_event_loop()
+        """Return stream of pages from specified WACZ files"""
+        loop = asyncio.get_event_loop()
 
-            stream = await loop.run_in_executor(
-                None,
-                self._sync_get_pages,
-                wacz_files,
-                client,
-                bucket,
-                key,
-            )
+        resp = await loop.run_in_executor(None, self._sync_get_pages, wacz_files)
 
-            return stream
+        return resp
 
     async def sync_stream_wacz_logs(
         self,
-        org: Organization,
-        wacz_files: List[CrawlFile],
+        wacz_files: List[CrawlFileOut],
         log_levels: List[str],
         contexts: List[str],
     ) -> Iterator[bytes]:
         """Return filtered stream of logs from specified WACZs sorted by timestamp"""
-        async with self.get_sync_client(org) as (client, bucket, key):
-            loop = asyncio.get_event_loop()
-
-            resp = await loop.run_in_executor(
-                None,
-                self._sync_get_logs,
-                wacz_files,
-                log_levels,
-                contexts,
-                client,
-                bucket,
-                key,
-            )
+        loop = asyncio.get_event_loop()
+
+        resp = await loop.run_in_executor(
+            None,
+            self._sync_get_logs,
+            wacz_files,
+            log_levels,
+            contexts,
+        )
 
-            return resp
+        return resp
 
     def _sync_get_logs(
         self,
-        wacz_files: List[CrawlFile],
+        wacz_files: List[CrawlFileOut],
         log_levels: List[str],
         contexts: List[str],
-        client,
-        bucket: str,
-        key: str,
     ) -> Iterator[bytes]:
         """Generate filtered stream of logs from specified WACZs sorted by timestamp"""
         # pylint: disable=too-many-function-args
 
         def stream_log_lines(
-            wacz_key, wacz_filename, cd_start, log_zipinfo
+            log_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
         ) -> Iterator[dict]:
             """Pass lines as json objects"""
+            filename = log_zipinfo.filename
 
-            print(
-                f"Fetching log {log_zipinfo.filename} from {wacz_filename}", flush=True
-            )
-
-            line_iter: Iterator[bytes] = sync_get_filestream(
-                client, bucket, wacz_key, log_zipinfo, cd_start
-            )
+            print(f"Fetching log {filename} from {wacz_filename}", flush=True)
 
+            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
                 yield _parse_json(line.decode("utf-8", errors="ignore"))
@@ -597,14 +579,14 @@ def stream_json_lines(
                 yield json_str.encode("utf-8")
 
         def organize_based_on_instance_number(
-            wacz_files: List[CrawlFile],
-        ) -> List[List[CrawlFile]]:
+            wacz_files: List[CrawlFileOut],
+        ) -> List[List[CrawlFileOut]]:
             """Place wacz_files into their own list based on instance number"""
-            wacz_files.sort(key=lambda file: file.filename)
-            waczs_groups: Dict[str, List[CrawlFile]] = {}
+            wacz_files.sort(key=lambda file: file.name)
+            waczs_groups: Dict[str, List[CrawlFileOut]] = {}
             for file in wacz_files:
-                instance_number = file.filename[
-                    file.filename.rfind("-") + 1 : file.filename.rfind(".")
+                instance_number = file.name[
+                    file.name.rfind("-") + 1 : file.name.rfind(".")
                 ]
                 if instance_number in waczs_groups:
                     waczs_groups[instance_number].append(file)
@@ -619,24 +601,24 @@ def organize_based_on_instance_number(
             wacz_log_streams: List[Iterator[dict]] = []
 
             for wacz_file in instance_list:
-                wacz_key = key + wacz_file.filename
-                cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
-
-                log_files = [
-                    f
-                    for f in zip_file.filelist
-                    if f.filename.startswith("logs/") and not f.is_dir()
-                ]
-                log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)
-
-                for log_zipinfo in log_files:
-                    wacz_log_streams.append(
-                        stream_log_lines(
-                            wacz_key, wacz_file.filename, cd_start, log_zipinfo
+                wacz_url = wacz_file.path
+                if wacz_url.startswith("/data"):
+                    wacz_url = f"{self.frontend_url}{wacz_url}"
+
+                with RemoteZip(wacz_url) as remote_zip:
+                    log_files: List[ZipInfo] = [
+                        f
+                        for f in remote_zip.infolist()
+                        if f.filename.startswith("logs/") and not f.is_dir()
+                    ]
+                    log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)
+
+                    for log_zipinfo in log_files:
+                        wacz_log_streams.append(
+                            stream_log_lines(log_zipinfo, wacz_url, wacz_file.name)
                         )
-                    )
 
-            log_generators.append(itertools.chain(*wacz_log_streams))
+            log_generators.append(chain(*wacz_log_streams))
 
         heap_iter = heapq.merge(*log_generators, key=lambda entry: entry["timestamp"])
@@ -644,50 +626,53 @@ def _sync_get_pages(
         self,
-        wacz_files: List[CrawlFile],
-        client,
-        bucket: str,
-        key: str,
+        wacz_files: List[CrawlFileOut],
     ) -> Iterator[Dict[Any, Any]]:
         """Generate stream of page dicts from specified WACZs"""
         # pylint: disable=too-many-function-args
 
         def stream_page_lines(
-            wacz_key, wacz_filename, cd_start, pagefile_zipinfo
+            pagefile_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
         ) -> Iterator[Dict[Any, Any]]:
             """Pass lines as json objects"""
+            filename = pagefile_zipinfo.filename
+
             print(
-                f"Fetching JSON lines from {pagefile_zipinfo.filename} in {wacz_filename}",
+                f"Fetching JSON lines from {filename} in {wacz_filename}",
                 flush=True,
             )
 
-            line_iter: Iterator[bytes] = sync_get_filestream(
-                client, bucket, wacz_key, pagefile_zipinfo, cd_start
-            )
+            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
                 yield _parse_json(line.decode("utf-8", errors="ignore"))
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []
 
         for wacz_file in wacz_files:
-            wacz_key = key + wacz_file.filename
-            cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
-
-            page_files = [
-                f
-                for f in zip_file.filelist
-                if f.filename.startswith("pages/")
-                and f.filename.endswith(".jsonl")
-                and not f.is_dir()
-            ]
-            for pagefile_zipinfo in page_files:
-                page_generators.append(
-                    stream_page_lines(
-                        wacz_key, wacz_file.filename, cd_start, pagefile_zipinfo
+            wacz_url = wacz_file.path
+            if wacz_url.startswith("/data"):
+                wacz_url = f"{self.frontend_url}{wacz_url}"
+
+            with RemoteZip(wacz_url) as remote_zip:
+                page_files: List[ZipInfo] = [
+                    f
+                    for f in remote_zip.infolist()
+                    if f.filename.startswith("pages/")
+                    and f.filename.endswith(".jsonl")
+                    and not f.is_dir()
+                ]
+                for pagefile_zipinfo in page_files:
+                    page_generators.append(
+                        stream_page_lines(pagefile_zipinfo, wacz_url, wacz_file.name)
                     )
-                )
 
-        return itertools.chain(*page_generators)
+        return chain.from_iterable(page_generators)
+
+    def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]:
+        """Return iterator of lines in remote file as bytes"""
+        with RemoteZip(wacz_url) as remote_zip:
+            with remote_zip.open(filename) as file_stream:
+                yield from file_stream
 
     def _sync_dl(
         self, all_files: List[CrawlFileOut], client: S3Client, bucket: str, key: str
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 1b60fe010..66774ba8c 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -120,14 +120,3 @@ def stream_dict_list_as_csv(data: List[Dict[str, Union[str, int]]], filename: st
         media_type="text/csv",
         headers={"Content-Disposition": f"attachment;filename={filename}"},
     )
-
-
-async def gather_tasks_with_concurrency(*tasks, n=5):
-    """Limit concurrency to n tasks at a time"""
-    semaphore = asyncio.Semaphore(n)
-
-    async def semaphore_task(task):
-        async with semaphore:
-            return await task
-
-    return await asyncio.gather(*(semaphore_task(task) for task in tasks))
diff --git a/backend/btrixcloud/zip.py b/backend/btrixcloud/zip.py
deleted file mode 100644
index bead89e3b..000000000
--- a/backend/btrixcloud/zip.py
+++ /dev/null
@@ -1,200 +0,0 @@
-"""
-Methods for interacting with zip/WACZ files
-"""
-
-import io
-import struct
-import zipfile
-import zlib
-
-
-# ============================================================================
-EOCD_RECORD_SIZE = 22
-ZIP64_EOCD_RECORD_SIZE = 56
-ZIP64_EOCD_LOCATOR_SIZE = 20
-
-MAX_STANDARD_ZIP_SIZE = 4_294_967_295
-
-CHUNK_SIZE = 1024 * 256
-
-
-# ============================================================================
-def sync_get_filestream(client, bucket, key, file_zipinfo, cd_start):
-    """Return uncompressed byte stream of file in WACZ"""
-    # pylint: disable=too-many-locals
-    file_head = sync_fetch(
-        client, bucket, key, cd_start + file_zipinfo.header_offset + 26, 4
-    )
-    name_len = parse_little_endian_to_int(file_head[0:2])
-    extra_len = parse_little_endian_to_int(file_head[2:4])
-
-    content = sync_fetch_stream(
-        client,
-        bucket,
-        key,
-        cd_start + file_zipinfo.header_offset + 30 + name_len + extra_len,
-        file_zipinfo.compress_size,
-    )
-
-    decompress = False
-    if file_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
-        decompress = True
-
-    return sync_iter_lines(content, decompress=decompress)
-
-
-def sync_iter_lines(chunk_iter, decompress=False, keepends=True):
-    """
-    Iter by lines, adapted from botocore
-    """
-    pending = b""
-    for chunk in chunk_iter:
-        if decompress:
-            chunk = zlib.decompressobj(-zlib.MAX_WBITS).decompress(chunk)
-        lines = (pending + chunk).splitlines(True)
-        for line in lines[:-1]:
-            yield line.splitlines(keepends)[0]
-        pending = lines[-1]
-    if pending:
-        yield pending.splitlines(keepends)[0]
-
-
-async def get_zip_file(client, bucket, key):
-    """Fetch enough of the WACZ file be able to read the zip filelist"""
-    file_size = await get_file_size(client, bucket, key)
-    eocd_record = await fetch(
-        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
-    )
-
-    if file_size <= MAX_STANDARD_ZIP_SIZE:
-        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
-        central_directory = await fetch(client, bucket, key, cd_start, cd_size)
-        return (
-            cd_start,
-            zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
-        )
-
-    zip64_eocd_record = await fetch(
-        client,
-        bucket,
-        key,
-        file_size
-        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
-        ZIP64_EOCD_RECORD_SIZE,
-    )
-    zip64_eocd_locator = await fetch(
-        client,
-        bucket,
-        key,
-        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
-        ZIP64_EOCD_LOCATOR_SIZE,
-    )
-    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
-    central_directory = await fetch(client, bucket, key, cd_start, cd_size)
-    return (
-        cd_start,
-        zipfile.ZipFile(
-            io.BytesIO(
-                central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
-            )
-        ),
-    )
-
-
-def sync_get_zip_file(client, bucket, key):
-    """Fetch enough of the WACZ file be able to read the zip filelist"""
-    file_size = sync_get_file_size(client, bucket, key)
-    eocd_record = sync_fetch(
-        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
-    )
-
-    if file_size <= MAX_STANDARD_ZIP_SIZE:
-        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
-        central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
-        with zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)) as zip_file:
-            return (cd_start, zip_file)
-
-    zip64_eocd_record = sync_fetch(
-        client,
-        bucket,
-        key,
-        file_size
-        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
-        ZIP64_EOCD_RECORD_SIZE,
-    )
-    zip64_eocd_locator = sync_fetch(
-        client,
-        bucket,
-        key,
-        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
-        ZIP64_EOCD_LOCATOR_SIZE,
-    )
-    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
-    central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
-    with zipfile.ZipFile(
-        io.BytesIO(
-            central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
-        )
-    ) as zip_file:
-        return (cd_start, zip_file)
-
-
-async def get_file_size(client, bucket, key):
-    """Get WACZ file size from HEAD request"""
-    head_response = await client.head_object(Bucket=bucket, Key=key)
-    return head_response["ContentLength"]
-
-
-def sync_get_file_size(client, bucket, key):
-    """Get WACZ file size from HEAD request"""
-    head_response = client.head_object(Bucket=bucket, Key=key)
-    return head_response["ContentLength"]
-
-
-async def fetch(client, bucket, key, start, length):
-    """Fetch a byte range from a file in object storage"""
-    end = start + length - 1
-    response = await client.get_object(
-        Bucket=bucket, Key=key, Range=f"bytes={start}-{end}"
-    )
-    return await response["Body"].read()
-
-
-def sync_fetch(client, bucket, key, start, length):
-    """Fetch a byte range from a file in object storage"""
-    end = start + length - 1
-    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
-    return response["Body"].read()
-
-
-def sync_fetch_stream(client, bucket, key, start, length):
-    """Fetch a byte range from a file in object storage as a stream"""
-    end = start + length - 1
-    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
-    return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
-
-
-def get_central_directory_metadata_from_eocd(eocd):
-    """Get central directory start and size"""
-    cd_size = parse_little_endian_to_int(eocd[12:16])
-    cd_start = parse_little_endian_to_int(eocd[16:20])
-    return cd_start, cd_size
-
-
-def get_central_directory_metadata_from_eocd64(eocd64):
-    """Get central directory start and size for zip64"""
-    cd_size = parse_little_endian_to_int(eocd64[40:48])
-    cd_start = parse_little_endian_to_int(eocd64[48:56])
-    return cd_start, cd_size
-
-
-def parse_little_endian_to_int(little_endian_bytes):
-    """Convert little endian used in zip spec to int"""
-    byte_length = len(little_endian_bytes)
-    format_character = "q"
-    if byte_length == 4:
-        format_character = "i"
-    elif byte_length == 2:
-        format_character = "h"
-
-    return struct.unpack("<" + format_character, little_endian_bytes)[0]
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 0877636c9..30a6c0f32 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -28,3 +28,4 @@ types_aiobotocore_s3
 types-redis
 types-python-slugify
 types-pyYAML
+remotezip
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 08eb770dd..26e58638a 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -434,6 +434,7 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["timestamp"]
         assert page.get("title") or page.get("title") is None
         assert page["load_state"]
+        assert page["status"]
 
     # Test GET page endpoint
     global page_id
@@ -494,6 +495,46 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["approved"]
 
 
+def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
+    # Re-add pages and verify they were correctly added
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["started"]
+
+    time.sleep(10)
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] >= 0
+
+    pages = data["items"]
+    assert pages
+
+    for page in pages:
+        assert page["id"]
+        assert page["oid"]
+        assert page["crawl_id"]
+        assert page["url"]
+        assert page["timestamp"]
+        assert page.get("title") or page.get("title") is None
+        assert page["load_state"]
+        assert page["status"]
+
+    # Ensure only superuser can re-add pages for all crawls in an org
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/all/pages/reAdd",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 403
+
+
 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
     note_text = "testing"
     updated_note_text = "updated"
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 570b1ccd8..6efb48a86 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -12,6 +12,8 @@ data:
 
   DEFAULT_NAMESPACE: {{ .Release.Namespace }}
 
+  FRONTEND_ALIAS: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }}
+
   CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local"
 
   DEFAULT_ORG: "{{ .Values.default_org }}"
diff --git a/chart/values.yaml b/chart/values.yaml
index 92b5a9438..2dbb84557 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -130,6 +130,8 @@ frontend_memory: "64Mi"
 # if using ingress, this value is ignored
 local_service_port: 30870
 
+frontend_alias: "http://browsertrix-cloud-frontend"
+
 # MongoDB Image
 # =========================================
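
For reference, the reAdd endpoints added in backend/btrixcloud/pages.py can be exercised the same way the test suite above does; a minimal sketch, assuming a base API URL and auth headers like those used in backend/test/test_run_crawl.py (api_prefix, org_id, crawl_id, and auth_headers are placeholders supplied by the caller):

    import requests

    def re_add_crawl_pages(api_prefix: str, org_id: str, crawl_id: str, auth_headers: dict) -> bool:
        """Kick off a background re-import of pages from the crawl's WACZ files."""
        r = requests.post(
            f"{api_prefix}/orgs/{org_id}/crawls/{crawl_id}/pages/reAdd",
            headers=auth_headers,
        )
        r.raise_for_status()
        # The endpoint returns immediately; pages are re-added by a fire-and-forget asyncio task.
        return r.json()["started"]

Because the work runs asynchronously on the server, callers should poll the crawl's pages endpoint afterwards (as the test does with a short sleep) rather than expect the re-added pages to be visible immediately.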