fix: introduce linting config #1

Merged
merged 1 commit into from Mar 21, 2024
2 changes: 2 additions & 0 deletions .eslintignore
@@ -0,0 +1,2 @@
.vscode/*
coverage/*
14 changes: 14 additions & 0 deletions .eslintrc.cjs
@@ -0,0 +1,14 @@


module.exports = {
root: true,
extends: '@adobe/helix',
overrides: [
{
files: ['*.test.js'],
rules: {
'no-unused-expressions': 'off',
},
},
],
};
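Note on the override above: `no-unused-expressions` is commonly switched off for test files because chai-style assertions such as `expect(x).to.be.true` are bare member expressions that the rule would otherwise flag. A minimal sketch of the kind of test line this permits (the file name and the mocha/chai setup are assumptions, not part of this PR):

// example.test.js (hypothetical file matched by the '*.test.js' override)
import { expect } from 'chai'; // assumption: chai provides the assertions

describe('sanitizeFilename', () => {
  it('strips slashes and spaces', () => {
    const name = 'a/b c'.replace(/[^a-zA-Z0-9]/g, '_');
    // A bare member-expression assertion like this is what
    // `no-unused-expressions` would flag without the override.
    expect(name === 'a_b_c').to.be.true;
  });
});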
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ node_modules
reports
.DS_Store
/assessment/output
.idea/
170 changes: 100 additions & 70 deletions all-count-pages-in-sitemaps.js
@@ -1,16 +1,28 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import zlib from 'zlib';
import fetch from 'node-fetch';
import dotenv from 'dotenv';
import fs from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { parseStringPromise } from 'xml2js';
import { makeSpaceCatApiCall} from './lib.js';
// eslint-disable-next-line import/no-unresolved
import { makeSpaceCatApiCall } from './lib.js';

dotenv.config();

const __USER_AGENT_HEADER = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const __visitedSitemaps = [];
const userAgentHeader = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const visitedSitemaps = [];

const REPORTS_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'reports');
const EXECUTE_SINGLE_SITE_REPORT = '';
@@ -22,33 +34,30 @@ if (!fs.existsSync(REPORTS_DIR)) {

const hrtimeToSeconds = (hrtime) => {
// hrtime is an array: [seconds, nanoseconds]
const totalNanoseconds = hrtime[0] * 1e9 + hrtime[1]; // Convert seconds to nanoseconds and add the nanoseconds
// Convert seconds to nanoseconds and add the nanoseconds
const totalNanoseconds = hrtime[0] * 1e9 + hrtime[1];
return totalNanoseconds / 1e9;
}

const sanitizeFilename = (url) => {
return url.replace(/[^a-zA-Z0-9]/g, '_');
};

const reportExists = (site) => {
return fs.existsSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`));
}
const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_');

const reportSite = (site) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(`Report for ${site}`);
fs.writeFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `Report for ${site}\n`);
report(site, `Date: ${Date.now()}`);
}
const reportExists = (site) => fs.existsSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`));

const report = (site, message) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(message);
fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), message + "\n");
}
fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${message}\n`);
};

const reportSite = (site) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(`Report for ${site}`);
fs.writeFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `Report for ${site}\n`);
report(site, `Date: ${Date.now()}`);
};

const reportPages = (site, pages) => {
report(site, `Total Pages: ${pages.length}`);
pages.forEach(page => fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${page}\n`) );
}
pages.forEach((page) => fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${page}\n`));
};

/*
Example output:
@@ -59,18 +68,18 @@ const getSpacecatSitesUrls = async () => {
return response
.filter((item) => item.deliveryType === 'aem_edge')
.map((item) => item.baseURL);
}

};

async function fetchSitemapUrls(siteUrl) {
let sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
let urls = [];
const sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
const urls = [];

function parseRobotsTxt(robotsTxt) {
try {
const regex = /Sitemap:\s*(https?:\/\/[^\s]+)/g;
let match;
let sitemaps = [];
const sitemaps = [];
// eslint-disable-next-line no-cond-assign
while ((match = regex.exec(robotsTxt)) !== null) {
sitemaps.push(match[1]);
}
@@ -81,69 +90,86 @@ async function fetchSitemapUrls(siteUrl) {
}

async function parseSitemap(xml, source) {
if (__visitedSitemaps.includes(sitemapUrl)) return;
if (visitedSitemaps.includes(source)) return; // Ensure to use `source` instead of `sitemapUrl`
try {
const result = await parseStringPromise(xml);
const fetchPromises = [];

if (result.urlset && result.urlset.url) {
for (let urlEntry of result.urlset.url) {
result.urlset.url.forEach((urlEntry) => {
urls.push(urlEntry.loc[0]);
}
});
} else if (result.sitemapindex && result.sitemapindex.sitemap) {
for (let sitemap of result.sitemapindex.sitemap) {
result.sitemapindex.sitemap.forEach((sitemap) => {
const sitemapIndexUrl = sitemap.loc[0];
if (__visitedSitemaps.includes(sitemapIndexUrl)) break;
__visitedSitemaps.push(sitemapIndexUrl);
if (visitedSitemaps.includes(sitemapIndexUrl)) return;
visitedSitemaps.push(sitemapIndexUrl);
report(siteUrl, `Found Sitemap in Index: ${sitemapIndexUrl}`);
const response = await fetch(sitemapIndexUrl, __USER_AGENT_HEADER);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed);
} else {
// Handle regular sitemap
const xmlText = await response.text();
await parseSitemap(xmlText); // Recursively parse nested sitemaps
}
}

// Create a fetch promise and add it to the array
const fetchPromise = fetch(sitemapIndexUrl, userAgentHeader)
.then((response) => {
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
return null; // Return null to handle this in the subsequent .then()
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
return response.arrayBuffer().then((buffer) => {
const decompressed = zlib.gunzipSync(Buffer.from(buffer)).toString();
// Recursively parse nested sitemaps
return parseSitemap(decompressed, sitemapIndexUrl);
});
} else {
// Handle regular sitemap
// Recursively parse nested sitemaps
return response.text().then((xmlText) => parseSitemap(xmlText, sitemapIndexUrl));
}
});

fetchPromises.push(fetchPromise);
});
}

// Wait for all fetch operations to complete
await Promise.all(fetchPromises);
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
console.error(`Error in ${sitemapUrl}: ${error}. Source: ${source}`);
visitedSitemaps.push(source); // Ensure to use `source` instead of `sitemapUrl`
console.error(`Error in ${source}: ${error}. Source: ${source}`);
}
}

// Check robots.txt for the sitemap URL(s)
try {
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), __USER_AGENT_HEADER);
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), userAgentHeader);
if (robotsResponse.ok) {
const robotsTxt = await robotsResponse.text();
const robotsSitemapUrls = parseRobotsTxt(robotsTxt);
if (robotsSitemapUrls && robotsSitemapUrls.length > 0) {
// Process each sitemap found in robots.txt
for (const robotsSitemapUrl of robotsSitemapUrls) {
if (__visitedSitemaps.includes(robotsSitemapUrl)) break;
// Create a list of promises for processing each sitemap found in robots.txt
const sitemapFetchPromises = robotsSitemapUrls.map(async (robotsSitemapUrl) => {
if (visitedSitemaps.includes(robotsSitemapUrl)) {
return; // Skip already visited sitemaps
}
report(siteUrl, `Found Sitemap in robots.txt: ${robotsSitemapUrl}`);
const response = await fetch(robotsSitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(robotsSitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
report(siteUrl, `Sitemap not found at ${robotsSitemapUrl}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
}
return urls; // Return early if sitemap URLs are found in robots.txt
});

// Wait for all sitemap processing promises to complete
await Promise.all(sitemapFetchPromises);
return urls; // Return the collected URLs after processing all sitemaps
}
}
} catch (error) {
@@ -152,22 +178,22 @@ async function fetchSitemapUrls(siteUrl) {

// Fetch and parse the default sitemap if no sitemap URL is found in robots.txt
try {
const response = await fetch(sitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(sitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
} else {
report(siteUrl, `Found Sitemap in default location: ${sitemapUrl}`);
let xml;
if (response.headers.get('content-type').includes('application/x-gzip')) {
const buffer = Buffer.from(await response.arrayBuffer());
const xml = zlib.gunzipSync(buffer).toString();
zlib.gunzipSync(buffer).toString();
} else {
xml = await response.text();
}
await parseSitemap(xml, sitemapUrl);
}
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
visitedSitemaps.push(sitemapUrl);
report(siteUrl, `Error fetching default sitemap ${siteUrl}: ${error}`);
}

@@ -179,12 +205,16 @@ async function fetchSitemapUrls(siteUrl) {
const totalStartTime = process.hrtime();
let totalPages = 0;

const siteUrls = EXECUTE_SINGLE_SITE_REPORT ? [EXECUTE_SINGLE_SITE_REPORT] : await getSpacecatSitesUrls();
const siteUrls = EXECUTE_SINGLE_SITE_REPORT
? [EXECUTE_SINGLE_SITE_REPORT]
: await getSpacecatSitesUrls();

for (const siteUrl of siteUrls) {
if (!reportExists(siteUrl)) {
const startTime = process.hrtime();
console.log(`Processing: ${siteUrl}`);
reportSite(siteUrl);
// eslint-disable-next-line no-await-in-loop
const pages = await fetchSitemapUrls(siteUrl);
totalPages += pages.length;
reportPages(siteUrl, pages);
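Most of the JavaScript changes above replace `await` calls inside loops with collected promises that are awaited together via `Promise.all`, which is the usual way to satisfy ESLint's `no-await-in-loop` rule (the outer per-site loop keeps an explicit `eslint-disable` comment instead). A minimal sketch of that pattern, using hypothetical names rather than code from this PR:

import fetch from 'node-fetch';

// Hypothetical helper: fetch several sitemap URLs concurrently instead of
// awaiting each one inside the loop body.
async function fetchAllSitemaps(sitemapUrls, userAgentHeader) {
  const fetchPromises = sitemapUrls.map((url) => fetch(url, userAgentHeader)
    .then((response) => (response.ok ? response.text() : null)));
  // All requests are in flight at once; results come back in input order.
  return Promise.all(fetchPromises);
}

Non-OK responses resolve to null in this sketch, so a missing sitemap does not reject the whole batch; a network error on any request would still reject it.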