diff --git a/src/crawler.ts b/src/crawler.ts
index e0289c078..7b7b63f32 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();
 
     let finished = false;
 
+    // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
+    // otherwise, all sitemap found URLs would be eligible for additional hops
+    const extraHopsDisabled = this.params.extraHops + 1;
+
     await new Promise<void>((resolve) => {
       sitemapper.on("end", () => {
         resolve();
@@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
           finished = true;
         }
       });
+
       sitemapper.on("url", ({ url }) => {
         const count = sitemapper.count;
         if (count % 10 ** power === 0) {
@@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
             "sitemap",
          );
         }
-        this.queueInScopeUrls(seedId, [url], 0);
+        this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
         if (count >= 100 && !resolved) {
           logger.info(
             "Sitemap partially parsed, continue parsing large sitemap in the background",
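
Note on the crawler.ts hunks above: sitemap-discovered URLs are queued at depth 0 but with an extraHops value of `this.params.extraHops + 1`, one past the configured limit, so the out-of-scope "extra hop" exemption can never admit them. A minimal sketch of the scope decision this relies on (the `isIncluded` helper, its parameters, and the values below are illustrative stand-ins, not the crawler's actual API):

```ts
// Illustrative sketch only -- not the crawler's real scope logic.
// An out-of-scope URL is admitted only while its accumulated extraHops
// count is still within the configured --extraHops limit.
function isIncluded(
  matchesScope: boolean,
  extraHops: number,
  maxExtraHops: number,
): boolean {
  if (matchesScope) {
    return true; // in-scope URLs are always eligible
  }
  // extra-hops exemption: allow a bounded number of out-of-scope hops
  return extraHops > 0 && extraHops <= maxExtraHops;
}

const maxExtraHops = 1; // e.g. --extraHops 1

// an out-of-scope link found on a crawled page (one hop taken): admitted
console.log(isIncluded(false, 1, maxExtraHops)); // true

// a sitemap URL queued with extraHops = maxExtraHops + 1: the exemption
// can never apply, so only genuinely in-scope sitemap URLs are crawled
console.log(isIncluded(false, maxExtraHops + 1, maxExtraHops)); // false
```

This is what the new `--extraHops 1 --scopeType page` test at the bottom of the patch checks: even with a `--limit` of 2000 and thousands of sitemap URLs queued, the finished count must stay at or below 100 instead of growing by one extra hop per sitemap URL.
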
diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index 3f3609d51..438c94b64 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -11,6 +11,9 @@
 import { sleep } from "./timing.js";
 
 const SITEMAP_CONCURRENCY = 5;
 
+const TEXT_CONTENT_TYPE = ["text/plain"];
+const XML_CONTENT_TYPES = ["text/xml", "application/xml"];
+
 export type SitemapOpts = {
   headers?: Record<string, string>;
 
@@ -83,7 +86,7 @@
     }
   }
 
-  async tryFetch(url: string, expectedCT?: string | null) {
+  async tryFetch(url: string, expectedCT?: string[] | null) {
     try {
       logger.debug(
         "Detecting Sitemap: fetching",
@@ -101,7 +104,7 @@
       }
 
       const ct = resp.headers.get("content-type");
-      if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
+      if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
         logger.debug(
           "Detecting Sitemap: invalid content-type",
           { ct },
@@ -129,12 +132,12 @@
     if (sitemap === DETECT_SITEMAP) {
       logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
       fullUrl = new URL("/robots.txt", seedUrl).href;
-      resp = await this.tryFetch(fullUrl, "text/plain");
+      resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
       if (resp) {
         isRobots = true;
       } else {
         fullUrl = new URL("/sitemap.xml", seedUrl).href;
-        resp = await this.tryFetch(fullUrl, "text/xml");
+        resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
         if (resp) {
           isSitemap = true;
         }
@@ -144,10 +147,10 @@
       fullUrl = new URL(sitemap, seedUrl).href;
       let expected = null;
       if (fullUrl.endsWith(".xml")) {
-        expected = "text/xml";
+        expected = XML_CONTENT_TYPES;
         isSitemap = true;
       } else if (fullUrl.endsWith(".txt")) {
-        expected = "text/plain";
+        expected = TEXT_CONTENT_TYPE;
         isRobots = true;
       }
       resp = await this.tryFetch(fullUrl, expected);
@@ -218,8 +221,22 @@
   }
 
   private async _parseSitemapFromResponse(url: string, resp: Response) {
+    let stream;
+
+    const { body } = resp;
+    if (!body) {
+      throw new Error("missing response body");
+    }
+    // decompress .gz sitemaps
+    if (url.endsWith(".gz")) {
+      const ds = new DecompressionStream("gzip");
+      stream = body.pipeThrough(ds);
+    } else {
+      stream = body;
+    }
+
     const readableNodeStream = Readable.fromWeb(
-      resp.body as ReadableStream,
+      stream as ReadableStream,
     );
     this.initSaxParser(url, readableNodeStream);
   }
@@ -244,6 +261,8 @@
     let currUrl: string | null;
     let lastmod: Date | null = null;
 
+    let errCount = 0;
+    let otherTags = 0;
 
     parserStream.on("end", async () => {
@@ -358,10 +377,16 @@
         this.pending.delete(url);
         return;
       }
-      logger.warn("Sitemap error parsing XML", { err }, "sitemap");
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      (parserStream._parser as any).error = null;
-      parserStream._parser.resume();
+      logger.warn(
+        "Sitemap error parsing XML",
+        { url, err, errCount },
+        "sitemap",
+      );
+      if (errCount++ < 3) {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        (parserStream._parser as any).error = null;
+        parserStream._parser.resume();
+      }
     });
 
     sourceStream.pipe(parserStream);
diff --git a/tests/sitemap-parse.test.js b/tests/sitemap-parse.test.js
index b4f3ab6d6..29e156e25 100644
--- a/tests/sitemap-parse.test.js
+++ b/tests/sitemap-parse.test.js
@@ -1,7 +1,6 @@
 import child_process from "child_process";
 import Redis from "ioredis";
 
-
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
@@ -30,8 +29,8 @@ async function waitContainer(containerId) {
   }
 }
 
-async function runCrawl(numExpected, url, sitemap="", limit=0) {
-  const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
+async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
+  const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});
 
   await sleep(2000);
@@ -66,6 +65,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
   }
 
   expect(finished).toBeGreaterThanOrEqual(numExpected);
+
+  if (numExpectedLessThan) {
+    expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
+  }
 }
 
 test("test sitemap fully finish", async () => {
@@ -80,3 +83,12 @@ test("test sitemap with limit, specific URL", async () => {
   await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
 });
 
+test("test sitemap with application/xml content-type", async () => {
+  await runCrawl(10, "https://bitarchivist.net/", "", 0);
+});
+
+
+test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
+  await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
+});
+
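
The `.gz` handling in `_parseSitemapFromResponse` leans on the WHATWG `DecompressionStream`, available as a global in recent Node versions (17+). A self-contained sketch of the same pattern (the `openSitemapStream` helper and the usage URL are illustrative, not part of the crawler):

```ts
import { Readable } from "node:stream";
import type { ReadableStream } from "node:stream/web";

// Sketch: fetch a sitemap and return a Node readable suitable for piping
// into a SAX parser, gunzipping on the fly when the URL ends in .gz.
async function openSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }
  // DecompressionStream("gzip") decompresses lazily as chunks arrive,
  // so even a very large gzipped sitemap is never buffered in full
  const stream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;
  return Readable.fromWeb(stream as ReadableStream);
}

// usage (illustrative URL):
// const nodeStream = await openSitemapStream("https://example.com/sitemap.xml.gz");
// nodeStream.pipe(saxParserStream);
```

Keeping everything as streams is the point of the design: the SAX parser downstream sees plain XML bytes either way, and no temporary file or in-memory buffer is needed.
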
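
The parser-error change is also worth a note: previously every XML error was cleared and parsing resumed unconditionally, which could loop indefinitely on badly broken XML; the patch caps recovery at three attempts via `errCount`. A sketch of the same resume-with-a-cap pattern against the `sax` API directly (the cap of 3 mirrors the diff; the stream options and handler wiring here are illustrative):

```ts
import sax from "sax";

// Sketch: a sax stream that tolerates up to 3 malformed fragments,
// then lets the parse fail instead of resuming forever.
const parserStream = sax.createStream(false, { trim: true, lowercase: true });
let errCount = 0;

parserStream.on("error", (err: Error) => {
  console.warn("XML parse error", { err, errCount });
  if (errCount++ < 3) {
    // clear the internal error state and resume, as in the diff above;
    // sax's typings don't expose these internals, hence the cast
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const inner = (parserStream as any)._parser;
    inner.error = null;
    inner.resume();
  }
});
```

Together with the widened content-type check (`XML_CONTENT_TYPES` now accepts both `text/xml` and `application/xml`, exercised by the new bitarchivist.net test), this makes the reader tolerant of mildly nonconforming sitemaps without hanging on hopelessly broken ones.
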