Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sitemap improvements: gz support + application/xml + extraHops fix #511

Merged
merged 3 commits into from Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/crawler.ts
Expand Up @@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;
tw4l marked this conversation as resolved.
Show resolved Hide resolved

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
Expand All @@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
Expand All @@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
Expand Down
47 changes: 36 additions & 11 deletions src/util/sitemapper.ts
Expand Up @@ -11,6 +11,9 @@ import { sleep } from "./timing.js";

const SITEMAP_CONCURRENCY = 5;

// Content types accepted by tryFetch() for robots.txt / ".txt" sitemap sources.
const TEXT_CONTENT_TYPE = ["text/plain"];
// Content types accepted by tryFetch() for "/sitemap.xml" / ".xml" sitemap sources.
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

export type SitemapOpts = {
headers?: Record<string, string>;

Expand Down Expand Up @@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
}
}

async tryFetch(url: string, expectedCT?: string | null) {
async tryFetch(url: string, expectedCT?: string[] | null) {
try {
logger.debug(
"Detecting Sitemap: fetching",
Expand All @@ -101,7 +104,7 @@ export class SitemapReader extends EventEmitter {
}

const ct = resp.headers.get("content-type");
if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
logger.debug(
"Detecting Sitemap: invalid content-type",
{ ct },
Expand Down Expand Up @@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter {
if (sitemap === DETECT_SITEMAP) {
logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
fullUrl = new URL("/robots.txt", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/plain");
resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
if (resp) {
isRobots = true;
} else {
fullUrl = new URL("/sitemap.xml", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/xml");
resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
if (resp) {
isSitemap = true;
}
Expand All @@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter {
fullUrl = new URL(sitemap, seedUrl).href;
let expected = null;
if (fullUrl.endsWith(".xml")) {
expected = "text/xml";
expected = XML_CONTENT_TYPES;
isSitemap = true;
} else if (fullUrl.endsWith(".txt")) {
expected = "text/plain";
expected = TEXT_CONTENT_TYPE;
isRobots = true;
}
resp = await this.tryFetch(fullUrl, expected);
Expand Down Expand Up @@ -218,8 +221,22 @@ export class SitemapReader extends EventEmitter {
}

/**
 * Pipes the body of a fetched sitemap response into the SAX parser.
 *
 * When `url` ends with ".gz" the body is first passed through a gzip
 * DecompressionStream so the parser receives plain XML.
 *
 * @param url - URL the response was fetched from; also handed to the parser.
 * @param resp - Fetch response whose body contains the sitemap.
 * @throws Error if the response has no body.
 */
private async _parseSitemapFromResponse(url: string, resp: Response) {
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }

  // decompress .gz sitemaps
  const stream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;

  // Hand ONLY the (possibly decompressed) stream to Readable.fromWeb: its
  // second parameter is an options object, so passing the raw body first and
  // the stream second would parse the undecompressed bytes.
  const readableNodeStream = Readable.fromWeb(
    stream as ReadableStream<Uint8Array>,
  );
  this.initSaxParser(url, readableNodeStream);
}
Expand All @@ -244,6 +261,8 @@ export class SitemapReader extends EventEmitter {
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
Expand Down Expand Up @@ -358,10 +377,16 @@ export class SitemapReader extends EventEmitter {
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
Expand Down
18 changes: 15 additions & 3 deletions tests/sitemap-parse.test.js
@@ -1,7 +1,6 @@
import child_process from "child_process";
import Redis from "ioredis";


// Returns a promise that settles (with no value) once `ms` milliseconds have elapsed.
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
Expand Down Expand Up @@ -30,8 +29,8 @@ async function waitContainer(containerId) {
}
}

async function runCrawl(numExpected, url, sitemap="", limit=0) {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

await sleep(2000);

Expand Down Expand Up @@ -66,6 +65,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
}

expect(finished).toBeGreaterThanOrEqual(numExpected);

if (numExpectedLessThan) {
expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
}
}

test("test sitemap fully finish", async () => {
Expand All @@ -80,3 +83,12 @@ test("test sitemap with limit, specific URL", async () => {
await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
});

// Crawl a site with no explicit sitemap argument and no page limit;
// at least `minFinished` pages must finish.
test("test sitemap with application/xml content-type", async () => {
  const minFinished = 10;
  const seedUrl = "https://bitarchivist.net/";
  const sitemap = "";
  const limit = 0;
  await runCrawl(minFinished, seedUrl, sitemap, limit);
});


// With page scope and one extra hop, the finished count must stay within
// [minFinished, maxFinished]; the upper bound is what catches extraHops
// being applied to sitemap-found URLs.
test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
  const minFinished = 1;
  const maxFinished = 100;
  const seedUrl = "https://www.mozilla.org/";
  const sitemap = "";
  const limit = 2000;
  const extraArgs = "--extraHops 1 --scopeType page";
  await runCrawl(minFinished, seedUrl, sitemap, limit, maxFinished, extraArgs);
});