diff --git a/src/crawler.ts b/src/crawler.ts
index 5ae57fa16..81c765a5b 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2152,6 +2152,10 @@ self.__bx_behaviors.selectMainBehavior();
 
     let finished = false;
 
+    // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
+    // otherwise, all sitemap found URLs would be eligible for additional hops
+    const extraHopsDisabled = this.params.extraHops + 1;
+
     await new Promise<void>((resolve) => {
       sitemapper.on("end", () => {
         resolve();
@@ -2165,6 +2169,7 @@ self.__bx_behaviors.selectMainBehavior();
           finished = true;
         }
       });
+
       sitemapper.on("url", ({ url }) => {
         const count = sitemapper.count;
         if (count % 10 ** power === 0) {
@@ -2178,7 +2183,7 @@ self.__bx_behaviors.selectMainBehavior();
             "sitemap",
           );
         }
-        this.queueInScopeUrls(seedId, [url], 0);
+        this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
         if (count >= 100 && !resolved) {
           logger.info(
             "Sitemap partially parsed, continue parsing large sitemap in the background",
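The extraHopsDisabled value above works because, judging by the comment, the fourth argument to queueInScopeUrls() is the extraHops value recorded for the queued URLs: a value already above the configured --extraHops limit means a sitemap-found URL can never be admitted through the extra-hop allowance and must match the scope rules on its own. A standalone sketch of that comparison (admittedAsExtraHop is an illustrative helper, not the crawler's actual scope check):

// Illustrative helper only: a URL whose recorded extraHops value already
// exceeds the configured limit can never be admitted as an "extra hop",
// so it has to be in scope on its own.
function admittedAsExtraHop(recordedExtraHops: number, extraHopsLimit: number): boolean {
  return recordedExtraHops <= extraHopsLimit;
}

const extraHopsLimit = 1; // e.g. --extraHops 1
console.log(admittedAsExtraHop(1, extraHopsLimit));                  // true: a link one hop out of scope
console.log(admittedAsExtraHop(extraHopsLimit + 1, extraHopsLimit)); // false: sitemap URL queued with extraHopsDisabled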
diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index 3f3609d51..438c94b64 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -11,6 +11,9 @@ import { sleep } from "./timing.js";
 
 const SITEMAP_CONCURRENCY = 5;
 
+const TEXT_CONTENT_TYPE = ["text/plain"];
+const XML_CONTENT_TYPES = ["text/xml", "application/xml"];
+
 export type SitemapOpts = {
   headers?: Record<string, string>;
 
@@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
     }
   }
 
-  async tryFetch(url: string, expectedCT?: string | null) {
+  async tryFetch(url: string, expectedCT?: string[] | null) {
     try {
       logger.debug(
         "Detecting Sitemap: fetching",
@@ -101,7 +104,7 @@ export class SitemapReader extends EventEmitter {
       }
 
       const ct = resp.headers.get("content-type");
-      if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
+      if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
         logger.debug(
           "Detecting Sitemap: invalid content-type",
           { ct },
@@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter {
     if (sitemap === DETECT_SITEMAP) {
       logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
       fullUrl = new URL("/robots.txt", seedUrl).href;
-      resp = await this.tryFetch(fullUrl, "text/plain");
+      resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
       if (resp) {
         isRobots = true;
       } else {
         fullUrl = new URL("/sitemap.xml", seedUrl).href;
-        resp = await this.tryFetch(fullUrl, "text/xml");
+        resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
         if (resp) {
           isSitemap = true;
         }
       }
@@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter {
       fullUrl = new URL(sitemap, seedUrl).href;
       let expected = null;
       if (fullUrl.endsWith(".xml")) {
-        expected = "text/xml";
+        expected = XML_CONTENT_TYPES;
         isSitemap = true;
       } else if (fullUrl.endsWith(".txt")) {
-        expected = "text/plain";
+        expected = TEXT_CONTENT_TYPE;
         isRobots = true;
       }
       resp = await this.tryFetch(fullUrl, expected);
@@ -218,8 +221,22 @@ export class SitemapReader extends EventEmitter {
   }
 
   private async _parseSitemapFromResponse(url: string, resp: Response) {
+    let stream;
+
+    const { body } = resp;
+    if (!body) {
+      throw new Error("missing response body");
+    }
+    // decompress .gz sitemaps
+    if (url.endsWith(".gz")) {
+      const ds = new DecompressionStream("gzip");
+      stream = body.pipeThrough(ds);
+    } else {
+      stream = body;
+    }
+
     const readableNodeStream = Readable.fromWeb(
-      resp.body as ReadableStream,
+      stream as ReadableStream,
     );
     this.initSaxParser(url, readableNodeStream);
   }
@@ -244,6 +261,8 @@
     let currUrl: string | null;
     let lastmod: Date | null = null;
 
+    let errCount = 0;
+    let otherTags = 0;
 
     parserStream.on("end", async () => {
@@ -358,10 +377,16 @@
         this.pending.delete(url);
         return;
       }
-      logger.warn("Sitemap error parsing XML", { err }, "sitemap");
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      (parserStream._parser as any).error = null;
-      parserStream._parser.resume();
+      logger.warn(
+        "Sitemap error parsing XML",
+        { url, err, errCount },
+        "sitemap",
+      );
+      if (errCount++ < 3) {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        (parserStream._parser as any).error = null;
+        parserStream._parser.resume();
+      }
     });
 
     sourceStream.pipe(parserStream);
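For context on the .gz handling added to _parseSitemapFromResponse(): DecompressionStream is a web-standard global in Node.js 18+, so a gzipped sitemap body can be decompressed on the fly before being handed to the SAX parser. A self-contained sketch of the same pattern (openSitemapStream is an illustrative name, not a helper in this module):

import { Readable } from "node:stream";
import type { ReadableStream as NodeReadableStream } from "node:stream/web";

// Illustrative sketch: fetch a sitemap URL and expose it as a Node Readable,
// decompressing on the fly when the URL points at a .gz sitemap.
async function openSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }
  // DecompressionStream("gzip") is built into Node.js 18+
  const webStream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;
  return Readable.fromWeb(webStream as NodeReadableStream);
}

// usage (URL is a placeholder): pipe the result into any line/XML parser
// openSitemapStream("https://example.com/sitemap.xml.gz").then((s) => s.pipe(process.stdout));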
diff --git a/src/util/state.ts b/src/util/state.ts
index 92f5e40f0..1b11dc3c9 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -97,6 +97,7 @@ declare module "ioredis" {
       pkey: string,
       qkey: string,
       skey: string,
+      esKey: string,
       url: string,
       score: number,
       data: string,
@@ -199,9 +200,9 @@ export class RedisCrawlState {
 
   _initLuaCommands(redis: Redis) {
     redis.defineCommand("addqueue", {
-      numberOfKeys: 3,
+      numberOfKeys: 4,
       lua: `
-local size = redis.call('scard', KEYS[3]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
@@ -506,6 +507,7 @@ return 0;
       this.pkey,
       this.qkey,
       this.skey,
+      this.esKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -624,7 +626,8 @@ return 0;
 
       for (const result of someResults) {
         const json = JSON.parse(result);
-        seenSet.delete(json.url);
+        //for extra seeds
+        seenSet.delete(json.url || json.newUrl);
         results.push(result);
       }
     }
@@ -722,10 +725,6 @@ return 0;
     return parseInt(done || "0");
   }
 
-  async numSeen() {
-    return await this.redis.scard(this.skey);
-  }
-
   async numPending() {
     const res = await this.redis.hlen(this.pkey);
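The addqueue change above makes the page-limit check count the "seen" set minus the extra-seeds list, so seeds added because of redirects no longer consume the --limit budget. A standalone sketch of that accounting using ioredis the same way (the command name and demo keys are illustrative, not the crawler's key schema; it assumes a local Redis):

import Redis from "ioredis";

// assumes a Redis server on localhost:6379
const redis = new Redis();

// effective size = seen set minus extra seeds, mirroring the Lua change above
redis.defineCommand("effectiveseen", {
  numberOfKeys: 2,
  lua: `return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]);`,
});

async function main() {
  await redis.sadd("demo:seen", "https://example.com/", "https://example.com/redirect-target/");
  await redis.rpush(
    "demo:extraSeeds",
    JSON.stringify({ origSeedId: 0, newUrl: "https://example.com/redirect-target/" }),
  );
  // defineCommand attaches the method at runtime, so cast for TypeScript
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const size = await (redis as any).effectiveseen("demo:seen", "demo:extraSeeds");
  console.log(size); // 1 -> the redirected seed no longer counts toward --limit
  redis.disconnect();
}

main().catch(console.error);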
diff --git a/tests/saved-state.test.js b/tests/saved-state.test.js
index 03c96bc1c..8651a7e0c 100644
--- a/tests/saved-state.test.js
+++ b/tests/saved-state.test.js
@@ -4,17 +4,15 @@ import path from "path";
 import yaml from "js-yaml";
 import Redis from "ioredis";
 
+
+const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+
+
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-async function waitContainer(containerId) {
-  try {
-    execSync(`docker kill -s SIGINT ${containerId}`);
-  } catch (e) {
-    return;
-  }
-
+async function waitContainerDone(containerId) {
   // containerId is initially the full id, but docker ps
   // only prints the short id (first 12 characters)
   containerId = containerId.slice(0, 12);
@@ -32,6 +30,17 @@
   }
 }
 
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
+
 let savedStateFile;
 let state;
 let numDone;
@@ -43,7 +52,7 @@ test("check crawl interrupted + saved state written", async () => {
 
   try {
     containerId = execSync(
-      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
+      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
      { encoding: "utf-8" },
      //wait.callback,
    );
@@ -51,8 +60,6 @@
   } catch (error) {
     console.log(error);
   }
 
-  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
-
   // remove existing pagesFile to support reentrancy
   try {
     fs.unlinkSync(pagesFile);
@@ -77,7 +84,7 @@
     await sleep(500);
   }
 
-  await waitContainer(containerId);
+  await killContainer(containerId);
 
   const savedStates = fs.readdirSync(
     "test-crawls/collections/int-state-test/crawls",
@@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {
 
   const saved = yaml.load(savedState);
 
-  expect(!!saved.state).toBe(true);
   state = saved.state;
-  numDone = state.finished.length;
+  finished = state.finished;
+
+  numDone = finished.length;
   numQueued = state.queued.length;
 
+  expect(!!state).toBe(true);
   expect(numDone > 0).toEqual(true);
   expect(numQueued > 0).toEqual(true);
   expect(numDone + numQueued).toEqual(10);
@@ -110,8 +119,6 @@
   expect(state.extraSeeds).toEqual([
     `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
   ]);
-
-  finished = state.finished;
 });
 
 test("check crawl restarted with saved state", async () => {
@@ -121,7 +128,7 @@
 
   try {
     containerId = execSync(
-      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -148,6 +155,16 @@
   } catch (e) {
     console.log(e);
   } finally {
-    await waitContainer(containerId);
+    await waitContainerDone(containerId);
   }
 });
+
+test("ensure correct number of pages was written", () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(10 + 1);
+});
diff --git a/tests/sitemap-parse.test.js b/tests/sitemap-parse.test.js
index 7815da0f7..ed067d719 100644
--- a/tests/sitemap-parse.test.js
+++ b/tests/sitemap-parse.test.js
@@ -29,8 +29,8 @@ async function waitContainer(containerId) {
   }
 }
 
-async function runCrawl(numExpected, url, sitemap="", limit=0) {
-  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
+async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
+  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});
 
   await sleep(3000);
 
@@ -57,6 +57,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
   }
 
   expect(finished).toBeGreaterThanOrEqual(numExpected);
+
+  if (numExpectedLessThan) {
+    expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
+  }
 }
 
 test("test sitemap fully finish", async () => {
@@ -70,3 +74,12 @@ test("test sitemap with limit", async () => {
 
 test("test sitemap with limit, specific URL", async () => {
   await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
 });
+
+test("test sitemap with application/xml content-type", async () => {
+  await runCrawl(10, "https://bitarchivist.net/", "", 0);
+});
+
+
+test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
+  await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
+});
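The two new tests exercise the sitemapper changes directly: the bitarchivist.net crawl covers a sitemap served as application/xml, and the mozilla.org run with --extraHops 1 --scopeType page confirms that sitemap-discovered URLs get no extra hops (between 1 and 100 pages finished, well under the 2000 limit). A small illustration of why the content-type fix matters, comparing the old single-string check in tryFetch() with the new list-based one (the sample header value is hypothetical):

// hypothetical header value a server might send for its sitemap
const ct = "application/xml; charset=utf-8";

// old check: a single expected string -> "application/xml" was rejected
const rejectedBefore = ct.split(";")[0] != "text/xml";

// new check: a list of accepted types -> "application/xml" now passes
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];
const rejectedAfter = !XML_CONTENT_TYPES.includes(ct.split(";")[0]);

console.log(rejectedBefore, rejectedAfter); // true false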