From bf5cbb08f5c1689928c9e18fb8bd37a78c056921 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 26 Mar 2024 11:21:48 -0700
Subject: [PATCH] fixes redirected seed (from #475) being counted against page
 limit (#509)

- subtract extraSeeds when computing limit
- don't include redirected seeds in the seen list when serializing
- tests: adjust saved-state-test to also check total pages when crawl is done

fixes #508 (for 1.0.3 release)
---
 package.json              |  2 +-
 src/util/state.ts         | 13 +++++-----
 tests/saved-state.test.js | 51 ++++++++++++++++++++++++++-------------
 3 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/package.json b/package.json
index 6f6cc5771..20242b25f 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
diff --git a/src/util/state.ts b/src/util/state.ts
index ad365680e..69d09fa89 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -91,6 +91,7 @@ declare module "ioredis" {
     pkey: string,
     qkey: string,
     skey: string,
+    esKey: string,
     url: string,
     score: number,
     data: string,
@@ -193,9 +194,9 @@ export class RedisCrawlState {
 
   _initLuaCommands(redis: Redis) {
     redis.defineCommand("addqueue", {
-      numberOfKeys: 3,
+      numberOfKeys: 4,
       lua: `
-local size = redis.call('scard', KEYS[3]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
@@ -486,6 +487,7 @@ return 0;
       this.pkey,
       this.qkey,
       this.skey,
+      this.esKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -604,7 +606,8 @@ return 0;
 
     for (const result of someResults) {
       const json = JSON.parse(result);
-      seenSet.delete(json.url);
+      // extra seeds are stored with "newUrl" rather than "url"
+      seenSet.delete(json.url || json.newUrl);
       results.push(result);
     }
   }
@@ -702,10 +705,6 @@ return 0;
     return parseInt(done || "0");
   }
 
-  async numSeen() {
-    return await this.redis.scard(this.skey);
-  }
-
   async numPending() {
     const res = await this.redis.hlen(this.pkey);
diff --git a/tests/saved-state.test.js b/tests/saved-state.test.js
index 00b222749..a5176ce3e 100644
--- a/tests/saved-state.test.js
+++ b/tests/saved-state.test.js
@@ -4,17 +4,15 @@ import path from "path";
 import yaml from "js-yaml";
 import Redis from "ioredis";
 
+
+const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+
+
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-async function waitContainer(containerId) {
-  try {
-    execSync(`docker kill -s SIGINT ${containerId}`);
-  } catch (e) {
-    return;
-  }
-
+async function waitContainerDone(containerId) {
   // containerId is initially the full id, but docker ps
   // only prints the short id (first 12 characters)
   containerId = containerId.slice(0, 12);
@@ -32,6 +30,17 @@
   }
 }
 
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
+
 let savedStateFile;
 let state;
 let numDone;
@@ -43,7 +52,7 @@ test("check crawl interrupted + saved state written", async () => {
 
   try {
     containerId = execSync(
-      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
+      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
       { encoding: "utf-8" },
       //wait.callback,
     );
   } catch (error) {
     console.log(error);
   }
 
-  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
-
   // remove existing pagesFile to support reentrancy
   try {
     fs.unlinkSync(pagesFile);
@@ -77,7 +84,7 @@
     await sleep(500);
   }
 
-  await waitContainer(containerId);
+  await killContainer(containerId);
 
   const savedStates = fs.readdirSync(
     "test-crawls/collections/int-state-test/crawls",
@@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {
 
   const saved = yaml.load(savedState);
 
-  expect(!!saved.state).toBe(true);
   state = saved.state;
-  numDone = state.finished.length;
+  finished = state.finished;
+
+  numDone = finished.length;
   numQueued = state.queued.length;
 
+  expect(!!state).toBe(true);
   expect(numDone > 0).toEqual(true);
   expect(numQueued > 0).toEqual(true);
   expect(numDone + numQueued).toEqual(10);
@@ -110,8 +119,6 @@
   expect(state.extraSeeds).toEqual([
     `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
   ]);
-
-  finished = state.finished;
 });
 
 test("check crawl restarted with saved state", async () => {
@@ -119,7 +126,7 @@
 
   try {
     containerId = execSync(
-      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -149,7 +156,7 @@
   } catch (e) {
     console.log(e);
   } finally {
-    await waitContainer(containerId);
+    await waitContainerDone(containerId);
 
     try {
       await redis.disconnect();
@@ -158,3 +165,13 @@
     }
   }
 });
+
+test("ensure correct number of pages was written", () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(10 + 1);
+});
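
For readers tracing the limit logic: the patched addqueue script computes the effective page count as scard(seen) - llen(extraSeeds) before comparing against the limit. A minimal TypeScript sketch of that check, with illustrative names that are not from the codebase:

// Sketch of the addqueue limit check after this patch (illustrative names).
// KEYS[3] is the seen set, KEYS[4] the extra-seeds list, ARGV[4] the limit.
function isAtLimit(
  seenCount: number, // redis.call('scard', KEYS[3])
  extraSeedCount: number, // redis.call('llen', KEYS[4])
  limit: number, // tonumber(ARGV[4]); 0 means unlimited
): boolean {
  // A redirected seed adds one entry to the seen set (its final URL)
  // without representing an additional page, so discount it here.
  const size = seenCount - extraSeedCount;
  return limit > 0 && size >= limit;
}

Before the fix, size was simply scard(seen), so a seed that redirected (here https://www.webrecorder.net/ -> https://webrecorder.net/) consumed two slots of --limit 10 and the crawl could finish one page short, which is what the new page-count test guards against.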
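Similarly, the seen-list hunk needs the json.url || json.newUrl fallback because serialized entries come in two shapes. A short TypeScript sketch (the extra-seed shape matches the test expectation above; the regular entry's other fields are illustrative):

// Regular queue entries carry "url"; extra seeds (recorded redirects)
// carry "newUrl" instead.
const seenSet = new Set([
  "https://www.webrecorder.net/", // original seed
  "https://webrecorder.net/", // redirect target, tracked as an extra seed
]);

const entries = [
  `{"url":"https://www.webrecorder.net/","seedId":0,"depth":0}`, // illustrative fields
  `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`, // shape from the test above
];

for (const entry of entries) {
  const json = JSON.parse(entry);
  // Extra seeds have no "url" field; without the fallback the redirect
  // target would remain in the serialized seen list and be counted
  // against the page limit again on restart.
  seenSet.delete(json.url || json.newUrl);
}

console.log(seenSet.size); // 0 -- both entries removed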