Skip to content

Commit

Permalink
sitemap fix, follow up to #496
Browse files Browse the repository at this point in the history
- support parsing sitemap urls that end in .gz with gzip decompression
- ignore extraHops for sitemap-found URLs by setting their extraHops count past the extraHops limit
(otherwise, all sitemap URLs would be treated as links from the seed page)
  • Loading branch information
ikreymer committed Mar 26, 2024
1 parent 22a7351 commit f174658
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
7 changes: 6 additions & 1 deletion src/crawler.ts
Expand Up @@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
Expand All @@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
Expand All @@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
Expand Down
32 changes: 27 additions & 5 deletions src/util/sitemapper.ts
Expand Up @@ -218,8 +218,22 @@ export class SitemapReader extends EventEmitter {
}

/**
 * Streams a fetched sitemap response into the SAX parser.
 *
 * Sitemaps whose URL ends in `.gz` are gunzipped on the fly, unless the
 * response is labelled `Content-Encoding: gzip`. In that case the fetch
 * implementation is expected to have decoded the body already, and a second
 * decompression pass would fail on the plain XML.
 *
 * @param url - URL the sitemap was fetched from; also handed to the parser.
 * @param resp - fetch response whose body carries the sitemap data.
 * @throws Error if the response has no body.
 */
private async _parseSitemapFromResponse(url: string, resp: Response) {
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }

  // A gzip content-encoding means the transport layer already decoded the body.
  const contentEncoding = resp.headers.get("content-encoding") ?? "";
  const alreadyDecoded = contentEncoding.toLowerCase().includes("gzip");

  // decompress .gz sitemaps that were delivered as raw gzip files
  const stream =
    url.endsWith(".gz") && !alreadyDecoded
      ? body.pipeThrough(new DecompressionStream("gzip"))
      : body;

  // Only the (possibly decompressed) stream is passed on; the raw resp.body
  // must not be handed to the parser, or the gzip handling is bypassed.
  const readableNodeStream = Readable.fromWeb(
    stream as ReadableStream<Uint8Array>,
  );
  this.initSaxParser(url, readableNodeStream);
}
Expand All @@ -244,6 +258,8 @@ export class SitemapReader extends EventEmitter {
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
Expand Down Expand Up @@ -358,10 +374,16 @@ export class SitemapReader extends EventEmitter {
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
Expand Down

0 comments on commit f174658

Please sign in to comment.