sitemap improvements: gz support + application/xml + extraHops fix (#511)

Sitemap fixes, follow-up to #496:
- support parsing sitemap URLs that end in .gz, with gzip decompression (see the sketch below)
- accept both `application/xml` and `text/xml` as valid sitemap content-types (with a test for each)
- ignore extraHops for sitemap-found URLs by setting their hop count past the extraHops limit (otherwise, all sitemap URLs would be treated as links from the seed page)
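A minimal standalone sketch of the first two fixes, assuming Node.js 18+ (global fetch and DecompressionStream); the helper name and shape are illustrative, not the crawler's actual API:

import { Readable } from "node:stream";
import type { ReadableStream as NodeReadableStream } from "node:stream/web";

const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

// Illustrative helper: fetch a sitemap URL, accept either XML content-type,
// and return a Node Readable, gunzipping on the fly when the URL ends in .gz
async function fetchSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  if (!resp.ok || !resp.body) {
    throw new Error(`failed to fetch sitemap: ${url}`);
  }

  const ct = resp.headers.get("content-type");
  if (ct && !XML_CONTENT_TYPES.includes(ct.split(";")[0])) {
    throw new Error(`unexpected sitemap content-type: ${ct}`);
  }

  // DecompressionStream("gzip") is part of the web Compression Streams API,
  // available as a global in Node.js 18+
  const body = url.endsWith(".gz")
    ? resp.body.pipeThrough(new DecompressionStream("gzip"))
    : resp.body;

  return Readable.fromWeb(body as unknown as NodeReadableStream<Uint8Array>);
}

The resulting Node stream can be fed to the same sax parser as before; only the source differs.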
ikreymer committed Mar 26, 2024
1 parent bf5cbb0 commit b707d00
Showing 3 changed files with 57 additions and 15 deletions.
7 changes: 6 additions & 1 deletion src/crawler.ts
@@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
@@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
@@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
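A tiny illustration of the intent behind extraHopsDisabled (hypothetical values, not the crawler's actual scope logic): a URL queued with a hop count already past the configured limit can never qualify for extra hops, so sitemap-found URLs are crawled only if they match the seed's scope on their own.

const extraHopsLimit = 1;                      // e.g. --extraHops 1
const extraHopsDisabled = extraHopsLimit + 1;  // value passed for sitemap-found URLs
const eligibleForExtraHops = (hops: number) => hops <= extraHopsLimit;

eligibleForExtraHops(0);                 // true: a link found on the seed page
eligibleForExtraHops(extraHopsDisabled); // false: a sitemap-found URL never gets extra hops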
47 changes: 36 additions & 11 deletions src/util/sitemapper.ts
@@ -11,6 +11,9 @@ import { sleep } from "./timing.js";

const SITEMAP_CONCURRENCY = 5;

const TEXT_CONTENT_TYPE = ["text/plain"];
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

export type SitemapOpts = {
headers?: Record<string, string>;

@@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
}
}

async tryFetch(url: string, expectedCT?: string | null) {
async tryFetch(url: string, expectedCT?: string[] | null) {
try {
logger.debug(
"Detecting Sitemap: fetching",
@@ -101,7 +104,7 @@ }
}

const ct = resp.headers.get("content-type");
if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
logger.debug(
"Detecting Sitemap: invalid content-type",
{ ct },
@@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter {
if (sitemap === DETECT_SITEMAP) {
logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
fullUrl = new URL("/robots.txt", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/plain");
resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
if (resp) {
isRobots = true;
} else {
fullUrl = new URL("/sitemap.xml", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/xml");
resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
if (resp) {
isSitemap = true;
}
@@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter {
fullUrl = new URL(sitemap, seedUrl).href;
let expected = null;
if (fullUrl.endsWith(".xml")) {
expected = "text/xml";
expected = XML_CONTENT_TYPES;
isSitemap = true;
} else if (fullUrl.endsWith(".txt")) {
expected = "text/plain";
expected = TEXT_CONTENT_TYPE;
isRobots = true;
}
resp = await this.tryFetch(fullUrl, expected);
@@ -218,8 +221,22 @@ }
}

private async _parseSitemapFromResponse(url: string, resp: Response) {
let stream;

const { body } = resp;
if (!body) {
throw new Error("missing response body");
}
// decompress .gz sitemaps
if (url.endsWith(".gz")) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
} else {
stream = body;
}

const readableNodeStream = Readable.fromWeb(
resp.body as ReadableStream<Uint8Array>,
stream as ReadableStream<Uint8Array>,
);
this.initSaxParser(url, readableNodeStream);
}
@@ -244,6 +261,8 @@ export class SitemapReader extends EventEmitter {
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
@@ -358,10 +377,16 @@ this.pending.delete(url);
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
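The new error handling above relies on sax's recovery mechanism. A hedged sketch of that pattern (assuming the sax package used by initSaxParser; strict mode here is illustrative): clear the error stored on the internal parser and resume, but give up after a few consecutive failures.

import sax from "sax";

const saxStream = sax.createStream(true);
let errCount = 0;

saxStream.on("error", (err) => {
  console.warn("XML parse error", err);
  if (errCount++ < 3) {
    // sax keeps the last error on its internal parser; it must be cleared
    // before parsing can continue
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const parser = (saxStream as any)._parser;
    parser.error = null;
    parser.resume();
  }
  // after a few errors, stop resuming and let the parse end with the error
});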
18 changes: 15 additions & 3 deletions tests/sitemap-parse.test.js
@@ -1,7 +1,6 @@
import child_process from "child_process";
import Redis from "ioredis";


function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
@@ -30,8 +29,8 @@ async function waitContainer(containerId) {
}
}

async function runCrawl(numExpected, url, sitemap="", limit=0) {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

await sleep(2000);

@@ -66,6 +65,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
}

expect(finished).toBeGreaterThanOrEqual(numExpected);

if (numExpectedLessThan) {
expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
}
}

test("test sitemap fully finish", async () => {
@@ -80,3 +83,12 @@ test("test sitemap with limit, specific URL", async () => {
await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
});

test("test sitemap with application/xml content-type", async () => {
await runCrawl(10, "https://bitarchivist.net/", "", 0);
});


test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});
