Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sitemap improvements: gz support + application/xml + extraHops fix #511

Merged
merged 3 commits into from Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/crawler.ts
Expand Up @@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// Disable extraHops for sitemap-found URLs by queueing them with a value of
// (extraHops limit + 1); otherwise every sitemap-found URL would be eligible for additional hops.
const extraHopsDisabled = this.params.extraHops + 1;
tw4l marked this conversation as resolved.
Show resolved Hide resolved

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
Expand All @@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
Expand All @@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
Expand Down
32 changes: 27 additions & 5 deletions src/util/sitemapper.ts
Expand Up @@ -218,8 +218,22 @@ export class SitemapReader extends EventEmitter {
}

/**
 * Turns a fetched sitemap response into a Node readable stream and hands it,
 * together with the sitemap URL, to `initSaxParser`.
 *
 * @param url - URL the sitemap was fetched from; also used to decide whether
 *   the body must be gunzipped (suffix check only).
 * @param resp - Fetch response whose body holds the sitemap data.
 * @throws Error if the response has no body.
 */
private async _parseSitemapFromResponse(url: string, resp: Response) {
// Holds either the raw body or the gunzipped body, chosen below.
let stream;

const { body } = resp;
if (!body) {
throw new Error("missing response body");
}
// decompress .gz sitemaps
// The decision is made purely from the URL ending in ".gz"; response headers
// are not consulted here.
if (url.endsWith(".gz")) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
} else {
stream = body;
}

// Convert the web stream into a Node.js Readable for the parser.
// NOTE(review): the two argument lines below look like the removed
// (`resp.body`) and added (`stream`) sides of the same diff line; presumably
// only `stream` is meant to be passed so the gzip branch takes effect —
// confirm against the merged file.
const readableNodeStream = Readable.fromWeb(
resp.body as ReadableStream<Uint8Array>,
stream as ReadableStream<Uint8Array>,
);
this.initSaxParser(url, readableNodeStream);
}
Expand All @@ -244,6 +258,8 @@ export class SitemapReader extends EventEmitter {
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
Expand Down Expand Up @@ -358,10 +374,16 @@ export class SitemapReader extends EventEmitter {
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
Expand Down