From f174658486c379b62e28de94548ba51860ec9b6d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 25 Mar 2024 17:44:09 -0700
Subject: [PATCH] sitemap fix, follow up to #496

- support parsing sitemap urls that end in .gz with gzip decompression
- ignore extraHops for sitemap-found URLs by setting it past the extraHops
  limit (otherwise, all sitemap URLs would be treated as links from the
  seed page)
---
 src/crawler.ts         |  7 ++++++-
 src/util/sitemapper.ts | 32 +++++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index e0289c078..7b7b63f32 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();
 
     let finished = false;
 
+    // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
+    // otherwise, all sitemap found URLs would be eligible for additional hops
+    const extraHopsDisabled = this.params.extraHops + 1;
+
     await new Promise((resolve) => {
       sitemapper.on("end", () => {
         resolve();
@@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
           finished = true;
         }
       });
+
       sitemapper.on("url", ({ url }) => {
         const count = sitemapper.count;
         if (count % 10 ** power === 0) {
@@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
             "sitemap",
           );
         }
-        this.queueInScopeUrls(seedId, [url], 0);
+        this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
         if (count >= 100 && !resolved) {
           logger.info(
             "Sitemap partially parsed, continue parsing large sitemap in the background",
diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index 3f3609d51..25c5de5de 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -218,8 +218,22 @@ export class SitemapReader extends EventEmitter {
   }
 
   private async _parseSitemapFromResponse(url: string, resp: Response) {
+    let stream;
+
+    const { body } = resp;
+    if (!body) {
+      throw new Error("missing response body");
+    }
+    // decompress .gz sitemaps
+    if (url.endsWith(".gz")) {
+      const ds = new DecompressionStream("gzip");
+      stream = body.pipeThrough(ds);
+    } else {
+      stream = body;
+    }
+
     const readableNodeStream = Readable.fromWeb(
-      resp.body as ReadableStream,
+      stream as ReadableStream,
     );
     this.initSaxParser(url, readableNodeStream);
   }
@@ -244,6 +258,8 @@ export class SitemapReader extends EventEmitter {
     let currUrl: string | null;
     let lastmod: Date | null = null;
 
+    let errCount = 0;
+
     let otherTags = 0;
 
     parserStream.on("end", async () => {
@@ -358,10 +374,16 @@
         this.pending.delete(url);
         return;
       }
-      logger.warn("Sitemap error parsing XML", { err }, "sitemap");
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      (parserStream._parser as any).error = null;
-      parserStream._parser.resume();
+      logger.warn(
+        "Sitemap error parsing XML",
+        { url, err, errCount },
+        "sitemap",
+      );
+      if (errCount++ < 3) {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        (parserStream._parser as any).error = null;
+        parserStream._parser.resume();
+      }
     });
 
     sourceStream.pipe(parserStream);
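--
Editor's sketches, not part of the patch:

The .gz branch in _parseSitemapFromResponse can be exercised standalone. A
minimal sketch, assuming Node.js 18+ (global fetch and DecompressionStream)
and the web-stream interop in node:stream; the function name
parseGzippedSitemap is hypothetical:

    import { Readable } from "node:stream";
    import { ReadableStream } from "node:stream/web";

    async function parseGzippedSitemap(url: string): Promise<void> {
      const resp = await fetch(url);
      const { body } = resp;
      if (!body) {
        throw new Error("missing response body");
      }
      // gunzip compressed sitemaps (e.g. sitemap.xml.gz) on the fly
      const stream = url.endsWith(".gz")
        ? body.pipeThrough(new DecompressionStream("gzip"))
        : body;
      // bridge the web stream into a Node Readable, as the patch does
      const nodeStream = Readable.fromWeb(stream as ReadableStream);
      for await (const chunk of nodeStream) {
        process.stdout.write(chunk); // decompressed XML bytes
      }
    }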
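Why extraHopsDisabled = this.params.extraHops + 1 turns the allowance off:
queueInScopeUrls forwards extraHops into the scope check, where an
out-of-scope URL is only kept while its extraHops stays within the
configured limit. A simplified stand-in for that check; the function
isInScopeSimplified and its shape are assumptions, not the crawler's
actual API:

    // simplified stand-in for the crawler's scope check (assumption, not
    // the real queueInScopeUrls/isInScope signature)
    function isInScopeSimplified(
      inScope: boolean,       // URL matched the seed's scope rules
      extraHops: number,      // hop budget already consumed for this URL
      extraHopsLimit: number, // configured extraHops limit
    ): boolean {
      // out-of-scope URLs are still queued while extraHops <= limit;
      // seeding sitemap URLs with limit + 1 makes this branch unreachable
      return inScope || extraHops <= extraHopsLimit;
    }

    // sitemap URL passed extraHops = limit + 1: queued only if in scope
    console.log(isInScopeSimplified(false, 2, 1)); // false
    // link found on a seed page, within the one allowed extra hop
    console.log(isInScopeSimplified(false, 1, 1)); // true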
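The bounded retry in the XML error handler relies on sax's documented
recovery idiom: clear parser.error, then call resume() to keep parsing past
malformed markup. A self-contained sketch against the sax package with the
same cap of three recoveries as the patch; the sample XML and handler
wiring here are illustrative only:

    import sax from "sax";

    const parserStream = sax.createStream(false, { lowercase: true });
    let errCount = 0;

    parserStream.on("error", (err: Error) => {
      console.warn("Sitemap error parsing XML", err.message, errCount);
      if (errCount++ < 3) {
        // clear the error and resume, mirroring the patch
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (parserStream._parser as any).error = null;
        parserStream._parser.resume();
      }
    });
    parserStream.on("opentag", (node) => {
      console.log("tag:", node.name);
    });

    parserStream.write(
      "<urlset><url><loc>https://example.com/</loc></url></urlset>",
    );
    parserStream.end();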