sitemap improvements: gz support + application/xml + extraHops fix (#511)

Sitemap fixes, follow-up to #496:
- support parsing sitemap URLs that end in .gz, with gzip decompression (see the sketch below)
- accept both `application/xml` and `text/xml` as valid sitemap content-types (with a test for each)
- ignore extraHops for sitemap-found URLs by setting their hop count past the extraHops limit (otherwise, all sitemap URLs would be treated as links from the seed page)
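A minimal standalone sketch of the first two fixes, assuming Node.js 18+ (global fetch and DecompressionStream); the helper name and shape are illustrative, not the crawler's actual API:

import { Readable } from "node:stream";
import type { ReadableStream as NodeReadableStream } from "node:stream/web";

const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

// Illustrative helper: fetch a sitemap URL, accept either XML content-type,
// and return a Node Readable, gunzipping on the fly when the URL ends in .gz
async function fetchSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  if (!resp.ok || !resp.body) {
    throw new Error(`failed to fetch sitemap: ${url}`);
  }

  const ct = resp.headers.get("content-type");
  if (ct && !XML_CONTENT_TYPES.includes(ct.split(";")[0])) {
    throw new Error(`unexpected sitemap content-type: ${ct}`);
  }

  // DecompressionStream("gzip") is part of the web Compression Streams API,
  // available as a global in Node.js 18+
  const body = url.endsWith(".gz")
    ? resp.body.pipeThrough(new DecompressionStream("gzip"))
    : resp.body;

  return Readable.fromWeb(body as unknown as NodeReadableStream<Uint8Array>);
}

The resulting Node stream can be fed to the same sax parser as before; only the source differs.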
ikreymer committed Mar 26, 2024
1 parent bf5cbb0 commit b707d00
Showing 3 changed files with 57 additions and 15 deletions.
7 changes: 6 additions & 1 deletion src/crawler.ts
@@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
@@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
@@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
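A tiny illustration of the intent behind extraHopsDisabled (hypothetical values, not the crawler's actual scope logic): a URL queued with a hop count already past the configured limit can never qualify for extra hops, so sitemap-found URLs are crawled only if they match the seed's scope on their own.

const extraHopsLimit = 1;                      // e.g. --extraHops 1
const extraHopsDisabled = extraHopsLimit + 1;  // value passed for sitemap-found URLs
const eligibleForExtraHops = (hops: number) => hops <= extraHopsLimit;

eligibleForExtraHops(0);                 // true: a link found on the seed page
eligibleForExtraHops(extraHopsDisabled); // false: a sitemap-found URL never gets extra hops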
47 changes: 36 additions & 11 deletions src/util/sitemapper.ts
@@ -11,6 +11,9 @@ import { sleep } from "./timing.js";

const SITEMAP_CONCURRENCY = 5;

const TEXT_CONTENT_TYPE = ["text/plain"];
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

export type SitemapOpts = {
headers?: Record<string, string>;

@@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
}
}

async tryFetch(url: string, expectedCT?: string | null) {
async tryFetch(url: string, expectedCT?: string[] | null) {
try {
logger.debug(
"Detecting Sitemap: fetching",
@@ -101,7 +104,7 @@ }
}

const ct = resp.headers.get("content-type");
if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
logger.debug(
"Detecting Sitemap: invalid content-type",
{ ct },
@@ -129,12 +132,12 @@ export class SitemapReader extends EventEmitter {
if (sitemap === DETECT_SITEMAP) {
logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
fullUrl = new URL("/robots.txt", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/plain");
resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
if (resp) {
isRobots = true;
} else {
fullUrl = new URL("/sitemap.xml", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/xml");
resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
if (resp) {
isSitemap = true;
}
@@ -144,10 +147,10 @@ export class SitemapReader extends EventEmitter {
fullUrl = new URL(sitemap, seedUrl).href;
let expected = null;
if (fullUrl.endsWith(".xml")) {
expected = "text/xml";
expected = XML_CONTENT_TYPES;
isSitemap = true;
} else if (fullUrl.endsWith(".txt")) {
expected = "text/plain";
expected = TEXT_CONTENT_TYPE;
isRobots = true;
}
resp = await this.tryFetch(fullUrl, expected);
@@ -218,8 +221,22 @@ }
}

private async _parseSitemapFromResponse(url: string, resp: Response) {
let stream;

const { body } = resp;
if (!body) {
throw new Error("missing response body");
}
// decompress .gz sitemaps
if (url.endsWith(".gz")) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
} else {
stream = body;
}

const readableNodeStream = Readable.fromWeb(
resp.body as ReadableStream<Uint8Array>,
stream as ReadableStream<Uint8Array>,
);
this.initSaxParser(url, readableNodeStream);
}
@@ -244,6 +261,8 @@ export class SitemapReader extends EventEmitter {
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
@@ -358,10 +377,16 @@ this.pending.delete(url);
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
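The new error handling above relies on sax's recovery mechanism. A hedged sketch of that pattern (assuming the sax package used by initSaxParser; strict mode here is illustrative): clear the error stored on the internal parser and resume, but give up after a few consecutive failures.

import sax from "sax";

const saxStream = sax.createStream(true);
let errCount = 0;

saxStream.on("error", (err) => {
  console.warn("XML parse error", err);
  if (errCount++ < 3) {
    // sax keeps the last error on its internal parser; it must be cleared
    // before parsing can continue
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const parser = (saxStream as any)._parser;
    parser.error = null;
    parser.resume();
  }
  // after a few errors, stop resuming and let the parse end with the error
});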
18 changes: 15 additions & 3 deletions tests/sitemap-parse.test.js
@@ -1,7 +1,6 @@
import child_process from "child_process";
import Redis from "ioredis";


function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
@@ -30,8 +29,8 @@ async function waitContainer(containerId) {
}
}

async function runCrawl(numExpected, url, sitemap="", limit=0) {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

await sleep(2000);

@@ -66,6 +65,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
}

expect(finished).toBeGreaterThanOrEqual(numExpected);

if (numExpectedLessThan) {
expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
}
}

test("test sitemap fully finish", async () => {
@@ -80,3 +83,12 @@ test("test sitemap with limit, specific URL", async () => {
await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
});

test("test sitemap with application/xml content-type", async () => {
await runCrawl(10, "https://bitarchivist.net/", "", 0);
});


test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});
