Skip to content

Commit

Permalink
Merge pull request #1 from basecode/spacecat-eslint-config
Browse files Browse the repository at this point in the history
fix: introduce linting config
  • Loading branch information
basecode committed Mar 21, 2024
2 parents a1ffd12 + 74989d1 commit 643243f
Show file tree
Hide file tree
Showing 12 changed files with 3,828 additions and 426 deletions.
2 changes: 2 additions & 0 deletions .eslintignore
@@ -0,0 +1,2 @@
.vscode/*
coverage/*
14 changes: 14 additions & 0 deletions .eslintrc.cjs
@@ -0,0 +1,14 @@


// ESLint configuration for the repository (CommonJS module, per the .cjs extension).
// NOTE(review): leading indentation appears to have been stripped by the diff paste;
// the original file presumably uses standard 2-space indent — confirm against the repo.
module.exports = {
// Treat this directory as the project root; do not search parent dirs for configs.
root: true,
// Extends the shared '@adobe/helix' ESLint config.
extends: '@adobe/helix',
overrides: [
{
// Test files only: relax rules that conflict with test idioms.
files: ['*.test.js'],
rules: {
// Presumably disabled to allow chai-style expression assertions
// (e.g. expect(x).to.be.true) — verify against the test suite.
'no-unused-expressions': 'off',
},
},
],
};
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -3,3 +3,4 @@ node_modules
reports
.DS_Store
/assessment/output
.idea/
170 changes: 100 additions & 70 deletions all-count-pages-in-sitemaps.js
@@ -1,16 +1,28 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import zlib from 'zlib';
import fetch from 'node-fetch';
import dotenv from 'dotenv';
import fs from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { parseStringPromise } from 'xml2js';
import { makeSpaceCatApiCall} from './lib.js';
// eslint-disable-next-line import/no-unresolved
import { makeSpaceCatApiCall } from './lib.js';

dotenv.config();

const __USER_AGENT_HEADER = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const __visitedSitemaps = [];
const userAgentHeader = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const visitedSitemaps = [];

const REPORTS_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'reports');
const EXECUTE_SINGLE_SITE_REPORT = '';
Expand All @@ -22,33 +34,30 @@ if (!fs.existsSync(REPORTS_DIR)) {

/**
 * Converts a process.hrtime() tuple to fractional seconds.
 *
 * The pasted diff left both the old and new versions of the accumulator line in
 * place, producing a duplicate `const` declaration; this keeps the final version.
 *
 * @param {[number, number]} hrtime - Pair of [seconds, nanoseconds] as returned
 *   by process.hrtime().
 * @returns {number} Elapsed time in seconds (fractional).
 */
const hrtimeToSeconds = (hrtime) => {
  // Convert seconds to nanoseconds, add the nanosecond remainder, scale back down.
  const totalNanoseconds = hrtime[0] * 1e9 + hrtime[1];
  return totalNanoseconds / 1e9;
};

/**
 * Report-file helpers.
 *
 * The pasted diff interleaved the OLD and NEW versions of these helpers
 * (duplicate `const sanitizeFilename`, duplicate `reportExists`, a `report`
 * body containing both the string-concat and template-literal append lines),
 * which is a redeclaration SyntaxError as written. This block keeps only the
 * final (post-merge) version of each helper, ordered so `report` is declared
 * before the helpers that call it.
 *
 * Relies on module-level `fs`, `path`, `REPORTS_DIR`, and
 * `EXECUTE_SINGLE_SITE_REPORT` declared earlier in the file.
 */

// Builds a filesystem-safe filename from a URL by replacing every
// non-alphanumeric character with an underscore.
const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_');

// True if a report file for the given site already exists in REPORTS_DIR.
const reportExists = (site) => fs.existsSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`));

// Appends one line to the site's report file; also echoes to stdout when
// running a single-site report.
const report = (site, message) => {
  if (EXECUTE_SINGLE_SITE_REPORT) console.log(message);
  fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${message}\n`);
};

// Starts (truncates) the report file for a site and stamps it with the
// current epoch-millisecond timestamp.
const reportSite = (site) => {
  if (EXECUTE_SINGLE_SITE_REPORT) console.log(`Report for ${site}`);
  fs.writeFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `Report for ${site}\n`);
  report(site, `Date: ${Date.now()}`);
};

// Writes the page count followed by one line per page URL.
const reportPages = (site, pages) => {
  report(site, `Total Pages: ${pages.length}`);
  pages.forEach((page) => fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${page}\n`));
};

/*
Example output:
Expand All @@ -59,18 +68,18 @@ const getSpacecatSitesUrls = async () => {
return response
.filter((item) => item.deliveryType === 'aem_edge')
.map((item) => item.baseURL);
}

};

async function fetchSitemapUrls(siteUrl) {
let sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
let urls = [];
const sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
const urls = [];

function parseRobotsTxt(robotsTxt) {
try {
const regex = /Sitemap:\s*(https?:\/\/[^\s]+)/g;
let match;
let sitemaps = [];
const sitemaps = [];
// eslint-disable-next-line no-cond-assign
while ((match = regex.exec(robotsTxt)) !== null) {
sitemaps.push(match[1]);
}
Expand All @@ -81,69 +90,86 @@ async function fetchSitemapUrls(siteUrl) {
}

async function parseSitemap(xml, source) {
if (__visitedSitemaps.includes(sitemapUrl)) return;
if (visitedSitemaps.includes(source)) return; // Ensure to use `source` instead of `sitemapUrl`
try {
const result = await parseStringPromise(xml);
const fetchPromises = [];

if (result.urlset && result.urlset.url) {
for (let urlEntry of result.urlset.url) {
result.urlset.url.forEach((urlEntry) => {
urls.push(urlEntry.loc[0]);
}
});
} else if (result.sitemapindex && result.sitemapindex.sitemap) {
for (let sitemap of result.sitemapindex.sitemap) {
result.sitemapindex.sitemap.forEach((sitemap) => {
const sitemapIndexUrl = sitemap.loc[0];
if (__visitedSitemaps.includes(sitemapIndexUrl)) break;
__visitedSitemaps.push(sitemapIndexUrl);
if (visitedSitemaps.includes(sitemapIndexUrl)) return;
visitedSitemaps.push(sitemapIndexUrl);
report(siteUrl, `Found Sitemap in Index: ${sitemapIndexUrl}`);
const response = await fetch(sitemapIndexUrl, __USER_AGENT_HEADER);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed);
} else {
// Handle regular sitemap
const xmlText = await response.text();
await parseSitemap(xmlText); // Recursively parse nested sitemaps
}
}

// Create a fetch promise and add it to the array
const fetchPromise = fetch(sitemapIndexUrl, userAgentHeader)
.then((response) => {
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
return null; // Return null to handle this in the subsequent .then()
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
return response.arrayBuffer().then((buffer) => {
const decompressed = zlib.gunzipSync(Buffer.from(buffer)).toString();
// Recursively parse nested sitemaps
return parseSitemap(decompressed, sitemapIndexUrl);
});
} else {
// Handle regular sitemap
// Recursively parse nested sitemaps
return response.text().then((xmlText) => parseSitemap(xmlText, sitemapIndexUrl));
}
});

fetchPromises.push(fetchPromise);
});
}

// Wait for all fetch operations to complete
await Promise.all(fetchPromises);
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
console.error(`Error in ${sitemapUrl}: ${error}. Source: ${source}`);
visitedSitemaps.push(source); // Ensure to use `source` instead of `sitemapUrl`
console.error(`Error in ${source}: ${error}. Source: ${source}`);
}
}

// Check robots.txt for the sitemap URL(s)
try {
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), __USER_AGENT_HEADER);
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), userAgentHeader);
if (robotsResponse.ok) {
const robotsTxt = await robotsResponse.text();
const robotsSitemapUrls = parseRobotsTxt(robotsTxt);
if (robotsSitemapUrls && robotsSitemapUrls.length > 0) {
// Process each sitemap found in robots.txt
for (const robotsSitemapUrl of robotsSitemapUrls) {
if (__visitedSitemaps.includes(robotsSitemapUrl)) break;
// Create a list of promises for processing each sitemap found in robots.txt
const sitemapFetchPromises = robotsSitemapUrls.map(async (robotsSitemapUrl) => {
if (visitedSitemaps.includes(robotsSitemapUrl)) {
return; // Skip already visited sitemaps
}
report(siteUrl, `Found Sitemap in robots.txt: ${robotsSitemapUrl}`);
const response = await fetch(robotsSitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(robotsSitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
report(siteUrl, `Sitemap not found at ${robotsSitemapUrl}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
}
return urls; // Return early if sitemap URLs are found in robots.txt
});

// Wait for all sitemap processing promises to complete
await Promise.all(sitemapFetchPromises);
return urls; // Return the collected URLs after processing all sitemaps
}
}
} catch (error) {
Expand All @@ -152,22 +178,22 @@ async function fetchSitemapUrls(siteUrl) {

// Fetch and parse the default sitemap if no sitemap URL is found in robots.txt
try {
const response = await fetch(sitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(sitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
} else {
report(siteUrl, `Found Sitemap in default location: ${sitemapUrl}`);
let xml;
if (response.headers.get('content-type').includes('application/x-gzip')) {
const buffer = Buffer.from(await response.arrayBuffer());
const xml = zlib.gunzipSync(buffer).toString();
zlib.gunzipSync(buffer).toString();
} else {
xml = await response.text();
}
await parseSitemap(xml, sitemapUrl);
}
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
visitedSitemaps.push(sitemapUrl);
report(siteUrl, `Error fetching default sitemap ${siteUrl}: ${error}`);
}

Expand All @@ -179,12 +205,16 @@ async function fetchSitemapUrls(siteUrl) {
const totalStartTime = process.hrtime();
let totalPages = 0;

const siteUrls = EXECUTE_SINGLE_SITE_REPORT ? [EXECUTE_SINGLE_SITE_REPORT] : await getSpacecatSitesUrls();
const siteUrls = EXECUTE_SINGLE_SITE_REPORT
? [EXECUTE_SINGLE_SITE_REPORT]
: await getSpacecatSitesUrls();

for (const siteUrl of siteUrls) {
if (!reportExists(siteUrl)) {
const startTime = process.hrtime();
console.log(`Processing: ${siteUrl}`);
reportSite(siteUrl);
// eslint-disable-next-line no-await-in-loop
const pages = await fetchSitemapUrls(siteUrl);
totalPages += pages.length;
reportPages(siteUrl, pages);
Expand Down

0 comments on commit 643243f

Please sign in to comment.