-
-
Notifications
You must be signed in to change notification settings - Fork 28
/
migration_0026_crawl_pages.py
47 lines (34 loc) · 1.42 KB
/
migration_0026_crawl_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
Migration 0026 -- Crawl Pages
"""
from btrixcloud.migrations import BaseMigration
from btrixcloud.utils import gather_tasks_with_concurrency
# Version string for this migration; passed to BaseMigration as
# ``migration_version`` by the Migration class in this module.
MIGRATION_VERSION = "0026"
class Migration(BaseMigration):
    """Migration 0026: add page records for crawls that have none."""

    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)
        # "page_ops" is a required keyword argument; a missing key raises
        # KeyError here.
        self.page_ops = kwargs["page_ops"]

    async def migrate_up(self):
        """Perform migration up.

        Collects the ids of finished documents of type "crawl", removes those
        that already appear as a ``crawl_id`` in the pages collection, and
        calls ``page_ops.add_crawl_pages_to_db_from_wacz`` for each remaining
        id. Any exception raised while awaiting those calls is printed and
        not re-raised.
        """
        # pylint: disable=duplicate-code
        crawls_coll = self.mdb["crawls"]
        pages_coll = self.mdb["pages"]

        finished_ids = await crawls_coll.distinct(
            "_id", {"type": "crawl", "finished": {"$ne": None}}
        )
        ids_with_pages = await pages_coll.distinct("crawl_id")

        # Crawls that are finished but have no page documents yet.
        missing_ids = set(finished_ids) - set(ids_with_pages)

        if missing_ids:
            import_coros = [
                self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)
                for crawl_id in missing_ids
            ]

            try:
                await gather_tasks_with_concurrency(*import_coros)
            # pylint: disable=broad-exception-caught, raise-missing-from
            except Exception as err:
                print(f"Error adding pages to db: {err}", flush=True)