From 19d47b1f488869fee43dc79d3a3749f850dafa6d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Apr 2024 15:16:43 -0700 Subject: [PATCH] fix incorrect number of total pages if one of the seeds is a redirect. Following the changes in webrecorder/browsertrix-crawler#475 and webrecorder/browsertrix-crawler#509, the crawler adds a redirected seed to the seen list. To account for this, the number of extra seeds needs to be subtracted from the size of the seen list to get the actual page count. --- backend/btrixcloud/operator/crawls.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 98fdae105..5a2e19e4a 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1178,6 +1178,11 @@ async def get_redis_crawl_stats( pages_done = await redis.llen(f"{crawl_id}:d") pages_found = await redis.scard(f"{crawl_id}:s") + # account for extra seeds and subtract from seen list + extra_seeds = await redis.llen(f"{crawl_id}:extraSeeds") + if extra_seeds: + pages_found -= extra_seeds + sizes = await redis.hgetall(f"{crawl_id}:size") archive_size = sum(int(x) for x in sizes.values())