From bf5cbb08f5c1689928c9e18fb8bd37a78c056921 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 26 Mar 2024 11:21:48 -0700 Subject: [PATCH 1/4] fixes redirected seed (from #475) being counted againt page limit: (#509) - subtract extraSeeds when computing limit - don't include redirect seeds in seen list when serializing - tests: adjust saved-state-test to also check total pages when crawl is done fixes #508 (for 1.0.3 release) --- package.json | 2 +- src/util/state.ts | 13 +++++----- tests/saved-state.test.js | 51 ++++++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/package.json b/package.json index 6f6cc5771..20242b25f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.0.2", + "version": "1.0.3", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/util/state.ts b/src/util/state.ts index ad365680e..69d09fa89 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -91,6 +91,7 @@ declare module "ioredis" { pkey: string, qkey: string, skey: string, + esKey: string, url: string, score: number, data: string, @@ -193,9 +194,9 @@ export class RedisCrawlState { _initLuaCommands(redis: Redis) { redis.defineCommand("addqueue", { - numberOfKeys: 3, + numberOfKeys: 4, lua: ` -local size = redis.call('scard', KEYS[3]); +local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]); local limit = tonumber(ARGV[4]); if limit > 0 and size >= limit then return 1; @@ -486,6 +487,7 @@ return 0; this.pkey, this.qkey, this.skey, + this.esKey, url, this._getScore(data), JSON.stringify(data), @@ -604,7 +606,8 @@ return 0; for (const result of someResults) { const json = JSON.parse(result); - seenSet.delete(json.url); + //for extra seeds + seenSet.delete(json.url || json.newUrl); results.push(result); } } @@ -702,10 +705,6 @@ return 0; return parseInt(done || "0"); } - async numSeen() { - return await this.redis.scard(this.skey); - } - async numPending() { const res = await this.redis.hlen(this.pkey); diff --git a/tests/saved-state.test.js b/tests/saved-state.test.js index 00b222749..a5176ce3e 100644 --- a/tests/saved-state.test.js +++ b/tests/saved-state.test.js @@ -4,17 +4,15 @@ import path from "path"; import yaml from "js-yaml"; import Redis from "ioredis"; + +const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl"; + + function sleep(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } -async function waitContainer(containerId) { - try { - execSync(`docker kill -s SIGINT ${containerId}`); - } catch (e) { - return; - } - +async function waitContainerDone(containerId) { // containerId is initially the full id, but docker ps // only prints the short id (first 12 characters) containerId = containerId.slice(0, 12); @@ -32,6 +30,17 @@ async function waitContainer(containerId) { } } +async function killContainer(containerId) { + try { + execSync(`docker kill -s SIGINT ${containerId}`); + } catch (e) { + return; + } + + await waitContainerDone(containerId); +} + + let savedStateFile; let state; let numDone; @@ -43,7 +52,7 @@ test("check crawl interrupted + saved state written", async () => { try { containerId = execSync( - "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10", + "docker run -d -v $PWD/test-crawls:/crawls -v 
$PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"", { encoding: "utf-8" }, //wait.callback, ); @@ -51,8 +60,6 @@ test("check crawl interrupted + saved state written", async () => { console.log(error); } - const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl"; - // remove existing pagesFile to support reentrancy try { fs.unlinkSync(pagesFile); @@ -77,7 +84,7 @@ test("check crawl interrupted + saved state written", async () => { await sleep(500); } - await waitContainer(containerId); + await killContainer(containerId); const savedStates = fs.readdirSync( "test-crawls/collections/int-state-test/crawls", @@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => { const saved = yaml.load(savedState); - expect(!!saved.state).toBe(true); state = saved.state; - numDone = state.finished.length; + finished = state.finished; + + numDone = finished.length; numQueued = state.queued.length; + expect(!!state).toBe(true); expect(numDone > 0).toEqual(true); expect(numQueued > 0).toEqual(true); expect(numDone + numQueued).toEqual(10); @@ -110,8 +119,6 @@ test("check parsing saved state + page done + queue present", () => { expect(state.extraSeeds).toEqual([ `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`, ]); - - finished = state.finished; }); test("check crawl restarted with saved state", async () => { @@ -119,7 +126,7 @@ test("check crawl restarted with saved state", async () => { try { containerId = execSync( - `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, + `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`, { encoding: "utf-8" }, ); } catch (error) { @@ -149,7 +156,7 @@ test("check crawl restarted with saved state", async () => { } catch (e) { console.log(e); } finally { - await waitContainer(containerId); + await waitContainerDone(containerId); try { await redis.disconnect(); @@ -158,3 +165,13 @@ test("check crawl restarted with saved state", async () => { } } }); + +test("ensure correct number of pages was written", () => { + const pages = fs + .readFileSync(pagesFile, { encoding: "utf-8" }) + .trim() + .split("\n"); + + // first line is the header + expect(pages.length).toBe(10 + 1); +}); From b707d004471275486cd6672f58edcc808d68f34a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 26 Mar 2024 13:54:51 -0700 Subject: [PATCH 2/4] sitemap improvements: gz support + application/xml + extraHops fix (#511) sitemap fixes, follow up to #496 - support parsing sitemap urls that end in .gz with gzip decompression - support both `application/xml` and `text/xml` as valid sitemap content-types (add test for both) - ignore extraHops for sitemap found URLs by setting to past extraHops limit (otherwise, all sitemap URLs would be treated as links from seed page) --- src/crawler.ts | 7 +++++- src/util/sitemapper.ts | 47 ++++++++++++++++++++++++++++--------- tests/sitemap-parse.test.js | 18 +++++++++++--- 3 
files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index e0289c078..7b7b63f32 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior(); let finished = false; + // disable extraHops for sitemap found URLs by setting to extraHops limit + 1 + // otherwise, all sitemap found URLs would be eligible for additional hops + const extraHopsDisabled = this.params.extraHops + 1; + await new Promise((resolve) => { sitemapper.on("end", () => { resolve(); @@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior(); finished = true; } }); + sitemapper.on("url", ({ url }) => { const count = sitemapper.count; if (count % 10 ** power === 0) { @@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior(); "sitemap", ); } - this.queueInScopeUrls(seedId, [url], 0); + this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled); if (count >= 100 && !resolved) { logger.info( "Sitemap partially parsed, continue parsing large sitemap in the background", diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index 3f3609d51..438c94b64 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -11,6 +11,9 @@ import { sleep } from "./timing.js"; const SITEMAP_CONCURRENCY = 5; +const TEXT_CONTENT_TYPE = ["text/plain"]; +const XML_CONTENT_TYPES = ["text/xml", "application/xml"]; + export type SitemapOpts = { headers?: Record; @@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter { } } - async tryFetch(url: string, expectedCT?: string | null) { + async tryFetch(url: string, expectedCT?: string[] | null) { try { logger.debug( "Detecting Sitemap: fetching", @@ -101,7 +104,7 @@ export class SitemapReader extends EventEmitter { } const ct = resp.headers.get("content-type"); - if (expectedCT && ct && ct.split(";")[0] != expectedCT) { + if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) { logger.debug( "Detecting Sitemap: invalid content-type", { ct }, @@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter { if (sitemap === DETECT_SITEMAP) { logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap"); fullUrl = new URL("/robots.txt", seedUrl).href; - resp = await this.tryFetch(fullUrl, "text/plain"); + resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE); if (resp) { isRobots = true; } else { fullUrl = new URL("/sitemap.xml", seedUrl).href; - resp = await this.tryFetch(fullUrl, "text/xml"); + resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES); if (resp) { isSitemap = true; } @@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter { fullUrl = new URL(sitemap, seedUrl).href; let expected = null; if (fullUrl.endsWith(".xml")) { - expected = "text/xml"; + expected = XML_CONTENT_TYPES; isSitemap = true; } else if (fullUrl.endsWith(".txt")) { - expected = "text/plain"; + expected = TEXT_CONTENT_TYPE; isRobots = true; } resp = await this.tryFetch(fullUrl, expected); @@ -218,8 +221,22 @@ export class SitemapReader extends EventEmitter { } private async _parseSitemapFromResponse(url: string, resp: Response) { + let stream; + + const { body } = resp; + if (!body) { + throw new Error("missing response body"); + } + // decompress .gz sitemaps + if (url.endsWith(".gz")) { + const ds = new DecompressionStream("gzip"); + stream = body.pipeThrough(ds); + } else { + stream = body; + } + const readableNodeStream = Readable.fromWeb( - resp.body as ReadableStream, + stream as ReadableStream, ); this.initSaxParser(url, readableNodeStream); } @@ -244,6 +261,8 @@ export 
class SitemapReader extends EventEmitter { let currUrl: string | null; let lastmod: Date | null = null; + let errCount = 0; + let otherTags = 0; parserStream.on("end", async () => { @@ -358,10 +377,16 @@ export class SitemapReader extends EventEmitter { this.pending.delete(url); return; } - logger.warn("Sitemap error parsing XML", { err }, "sitemap"); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - (parserStream._parser as any).error = null; - parserStream._parser.resume(); + logger.warn( + "Sitemap error parsing XML", + { url, err, errCount }, + "sitemap", + ); + if (errCount++ < 3) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (parserStream._parser as any).error = null; + parserStream._parser.resume(); + } }); sourceStream.pipe(parserStream); diff --git a/tests/sitemap-parse.test.js b/tests/sitemap-parse.test.js index b4f3ab6d6..29e156e25 100644 --- a/tests/sitemap-parse.test.js +++ b/tests/sitemap-parse.test.js @@ -1,7 +1,6 @@ import child_process from "child_process"; import Redis from "ioredis"; - function sleep(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } @@ -30,8 +29,8 @@ async function waitContainer(containerId) { } } -async function runCrawl(numExpected, url, sitemap="", limit=0) { - const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"}); +async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") { + const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"}); await sleep(2000); @@ -66,6 +65,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) { } expect(finished).toBeGreaterThanOrEqual(numExpected); + + if (numExpectedLessThan) { + expect(finished).toBeLessThanOrEqual(numExpectedLessThan); + } } test("test sitemap fully finish", async () => { @@ -80,3 +83,12 @@ test("test sitemap with limit, specific URL", async () => { await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000); }); +test("test sitemap with application/xml content-type", async () => { + await runCrawl(10, "https://bitarchivist.net/", "", 0); +}); + + +test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => { + await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); +}); + From 7a34295e1efd8a6df38bb30de178adb600a8f0d6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 26 Mar 2024 13:56:49 -0700 Subject: [PATCH 3/4] update add-exclusion.test.js from main --- tests/add-exclusion.test.js | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/add-exclusion.test.js b/tests/add-exclusion.test.js index 87f10f4ef..71a1d240f 100644 --- a/tests/add-exclusion.test.js +++ b/tests/add-exclusion.test.js @@ -1,6 +1,10 @@ import { exec } from "child_process"; import Redis from "ioredis"; +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + test("dynamically add exclusion while crawl is running", async () => { let callback = null; @@ -12,7 +16,7 @@ test("dynamically add exclusion while crawl is running", async () => { try { 
exec( - "docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", + "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", { shell: "/bin/bash" }, callback, ); @@ -20,18 +24,18 @@ test("dynamically add exclusion while crawl is running", async () => { console.log(error); } - await new Promise((resolve) => setTimeout(resolve, 3000)); + await sleep(3000); - const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true }); + const redis = new Redis("redis://127.0.0.1:36382/0", { lazyConnect: true, retryStrategy: () => null }) - await redis.connect({ maxRetriesPerRequest: 50 }); + await redis.connect(); while (true) { if (Number(await redis.zcard("test:q")) > 1) { break; } - await new Promise((resolve) => setTimeout(resolve, 500)); + await sleep(500); } const uids = await redis.hkeys("test:status"); @@ -48,6 +52,5 @@ test("dynamically add exclusion while crawl is running", async () => { expect(stdout.indexOf("Add Exclusion") > 0).toBe(true); expect(stdout.indexOf("Removing excluded URL") > 0).toBe(true); - - await redis.disconnect(); }); + From c2846d1a3bf81364f9d2ed75e49f2cbc2b24048e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 26 Mar 2024 14:02:33 -0700 Subject: [PATCH 4/4] test: merge sitemap-parse test changes from main --- tests/sitemap-parse.test.js | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/sitemap-parse.test.js b/tests/sitemap-parse.test.js index 29e156e25..ed067d719 100644 --- a/tests/sitemap-parse.test.js +++ b/tests/sitemap-parse.test.js @@ -30,20 +30,17 @@ async function waitContainer(containerId) { } async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") { - const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"}); + const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"}); - await sleep(2000); + await sleep(3000); - const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true }); + const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null }); let finished = 0; try { await redis.connect({ maxRetriesPerRequest: 100, - retryStrategy(times) { - return times < 100 ? 
1000 : null; - }, }); while (true) { @@ -57,11 +54,6 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh console.error(e); } finally { await waitContainer(containerId); - try { - await redis.disconnect(); - } catch (e) { - // ignore - } } expect(finished).toBeGreaterThanOrEqual(numExpected); @@ -91,4 +83,3 @@ test("test sitemap with application/xml content-type", async () => { test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => { await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); }); -
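The gzip handling added to src/util/sitemapper.ts in PATCH 2/4 is the least obvious change in this series, so a condensed TypeScript sketch of the idea follows. It mirrors the `_parseSitemapFromResponse` change in the diff above, but it is only a simplified illustration: the helper name `openSitemapStream` and the bare `fetch` call are not part of the patches, and the real reader also applies the content-type checks and error handling shown in the diff.

import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";

// Fetch a sitemap and hand back a Node stream for the SAX parser,
// gunzipping *.gz responses on the fly via the web-standard
// DecompressionStream, as in PATCH 2/4. Illustrative helper only,
// not code from the patches.
async function openSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  const { body } = resp;
  if (!resp.ok || !body) {
    throw new Error(`sitemap fetch failed: ${resp.status}`);
  }
  // compressed sitemaps (e.g. sitemap.xml.gz) are decompressed before
  // parsing; everything else is passed through unchanged
  const stream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;
  return Readable.fromWeb(stream as ReadableStream);
}

Using the web-standard DecompressionStream avoids pulling in a zlib wrapper and composes directly with the Readable.fromWeb conversion the reader already performed, which appears to be why the patch takes this route.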