From f174658486c379b62e28de94548ba51860ec9b6d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 25 Mar 2024 17:44:09 -0700
Subject: [PATCH] sitemap fix, follow up to #496

- support parsing sitemap urls that end in .gz with gzip decompression
- ignore extraHops for sitemap-found URLs by setting it past the extraHops
  limit (otherwise, all sitemap URLs would be treated as links from the
  seed page)
---
 src/crawler.ts         |  7 ++++++-
 src/util/sitemapper.ts | 32 +++++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index e0289c078..7b7b63f32 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2106,6 +2106,10 @@ self.__bx_behaviors.selectMainBehavior();
 
     let finished = false;
 
+    // disable extraHops for sitemap found URLs by setting to extraHops limit + 1
+    // otherwise, all sitemap found URLs would be eligible for additional hops
+    const extraHopsDisabled = this.params.extraHops + 1;
+
     await new Promise((resolve) => {
       sitemapper.on("end", () => {
         resolve();
@@ -2119,6 +2123,7 @@ self.__bx_behaviors.selectMainBehavior();
           finished = true;
         }
       });
+
       sitemapper.on("url", ({ url }) => {
         const count = sitemapper.count;
         if (count % 10 ** power === 0) {
@@ -2132,7 +2137,7 @@ self.__bx_behaviors.selectMainBehavior();
             "sitemap",
           );
         }
-        this.queueInScopeUrls(seedId, [url], 0);
+        this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
         if (count >= 100 && !resolved) {
           logger.info(
             "Sitemap partially parsed, continue parsing large sitemap in the background",
diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts
index 3f3609d51..25c5de5de 100644
--- a/src/util/sitemapper.ts
+++ b/src/util/sitemapper.ts
@@ -218,8 +218,22 @@ export class SitemapReader extends EventEmitter {
   }
 
   private async _parseSitemapFromResponse(url: string, resp: Response) {
+    let stream;
+
+    const { body } = resp;
+    if (!body) {
+      throw new Error("missing response body");
+    }
+    // decompress .gz sitemaps
+    if (url.endsWith(".gz")) {
+      const ds = new DecompressionStream("gzip");
+      stream = body.pipeThrough(ds);
+    } else {
+      stream = body;
+    }
+
     const readableNodeStream = Readable.fromWeb(
-      resp.body as ReadableStream,
+      stream as ReadableStream,
     );
     this.initSaxParser(url, readableNodeStream);
   }
@@ -244,6 +258,8 @@ export class SitemapReader extends EventEmitter {
     let currUrl: string | null;
     let lastmod: Date | null = null;
 
+    let errCount = 0;
+
     let otherTags = 0;
 
     parserStream.on("end", async () => {
@@ -358,10 +374,16 @@
         this.pending.delete(url);
         return;
       }
-      logger.warn("Sitemap error parsing XML", { err }, "sitemap");
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      (parserStream._parser as any).error = null;
-      parserStream._parser.resume();
+      logger.warn(
+        "Sitemap error parsing XML",
+        { url, err, errCount },
+        "sitemap",
+      );
+      if (errCount++ < 3) {
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        (parserStream._parser as any).error = null;
+        parserStream._parser.resume();
+      }
     });
 
     sourceStream.pipe(parserStream);
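--
Editor's sketches, not part of the patch:

The .gz branch in _parseSitemapFromResponse can be exercised standalone. A
minimal sketch, assuming Node.js 18+ (global fetch and DecompressionStream)
and the web-stream interop in node:stream; the function name
parseGzippedSitemap is hypothetical:

    import { Readable } from "node:stream";
    import { ReadableStream } from "node:stream/web";

    async function parseGzippedSitemap(url: string): Promise<void> {
      const resp = await fetch(url);
      const { body } = resp;
      if (!body) {
        throw new Error("missing response body");
      }
      // gunzip compressed sitemaps (e.g. sitemap.xml.gz) on the fly
      const stream = url.endsWith(".gz")
        ? body.pipeThrough(new DecompressionStream("gzip"))
        : body;
      // bridge the web stream into a Node Readable, as the patch does
      const nodeStream = Readable.fromWeb(stream as ReadableStream);
      for await (const chunk of nodeStream) {
        process.stdout.write(chunk); // decompressed XML bytes
      }
    }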
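Why extraHopsDisabled = this.params.extraHops + 1 turns the allowance off:
queueInScopeUrls forwards extraHops into the scope check, where an
out-of-scope URL is only kept while its extraHops stays within the
configured limit. A simplified stand-in for that check; the function
isInScopeSimplified and its shape are assumptions, not the crawler's
actual API:

    // simplified stand-in for the crawler's scope check (assumption, not
    // the real queueInScopeUrls/isInScope signature)
    function isInScopeSimplified(
      inScope: boolean,       // URL matched the seed's scope rules
      extraHops: number,      // hop budget already consumed for this URL
      extraHopsLimit: number, // configured extraHops limit
    ): boolean {
      // out-of-scope URLs are still queued while extraHops <= limit;
      // seeding sitemap URLs with limit + 1 makes this branch unreachable
      return inScope || extraHops <= extraHopsLimit;
    }

    // sitemap URL passed extraHops = limit + 1: queued only if in scope
    console.log(isInScopeSimplified(false, 2, 1)); // false
    // link found on a seed page, within the one allowed extra hop
    console.log(isInScopeSimplified(false, 1, 1)); // true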
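The bounded retry in the XML error handler relies on sax's documented
recovery idiom: clear parser.error, then call resume() to keep parsing past
malformed markup. A self-contained sketch against the sax package with the
same cap of three recoveries as the patch; the sample XML and handler
wiring here are illustrative only:

    import sax from "sax";

    const parserStream = sax.createStream(false, { lowercase: true });
    let errCount = 0;

    parserStream.on("error", (err: Error) => {
      console.warn("Sitemap error parsing XML", err.message, errCount);
      if (errCount++ < 3) {
        // clear the error and resume, mirroring the patch
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (parserStream._parser as any).error = null;
        parserStream._parser.resume();
      }
    });
    parserStream.on("opentag", (node) => {
      console.log("tag:", node.name);
    });

    parserStream.write(
      "<urlset><url><loc>https://example.com/</loc></url></urlset>",
    );
    parserStream.end();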