fix: introduce linting config #1

Merged
merged 1 commit into from Mar 21, 2024
2 changes: 2 additions & 0 deletions .eslintignore
@@ -0,0 +1,2 @@
.vscode/*
coverage/*
14 changes: 14 additions & 0 deletions .eslintrc.cjs
@@ -0,0 +1,14 @@


module.exports = {
root: true,
extends: '@adobe/helix',
overrides: [
{
files: ['*.test.js'],
rules: {
'no-unused-expressions': 'off',
},
},
],
};
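Note on the override above: `no-unused-expressions` is commonly switched off for test files because chai-style assertions such as `expect(x).to.be.true` are bare member expressions that the rule would otherwise flag. A minimal sketch of the kind of test line this permits (the file name and the mocha/chai setup are assumptions, not part of this PR):

// example.test.js (hypothetical file matched by the '*.test.js' override)
import { expect } from 'chai'; // assumption: chai provides the assertions

describe('sanitizeFilename', () => {
  it('strips slashes and spaces', () => {
    const name = 'a/b c'.replace(/[^a-zA-Z0-9]/g, '_');
    // A bare member-expression assertion like this is what
    // `no-unused-expressions` would flag without the override.
    expect(name === 'a_b_c').to.be.true;
  });
});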
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@ node_modules
reports
.DS_Store
/assessment/output
.idea/
170 changes: 100 additions & 70 deletions all-count-pages-in-sitemaps.js
@@ -1,16 +1,28 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import zlib from 'zlib';
import fetch from 'node-fetch';
import dotenv from 'dotenv';
import fs from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { parseStringPromise } from 'xml2js';
import { makeSpaceCatApiCall} from './lib.js';
// eslint-disable-next-line import/no-unresolved
import { makeSpaceCatApiCall } from './lib.js';

dotenv.config();

const __USER_AGENT_HEADER = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const __visitedSitemaps = [];
const userAgentHeader = { headers: { 'User-Agent': 'basecode/seo-research-crawler/1.0' } };
const visitedSitemaps = [];

const REPORTS_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'reports');
const EXECUTE_SINGLE_SITE_REPORT = '';
@@ -22,33 +34,30 @@ if (!fs.existsSync(REPORTS_DIR)) {

const hrtimeToSeconds = (hrtime) => {
// hrtime is an array: [seconds, nanoseconds]
const totalNanoseconds = hrtime[0] * 1e9 + hrtime[1]; // Convert seconds to nanoseconds and add the nanoseconds
// Convert seconds to nanoseconds and add the nanoseconds
const totalNanoseconds = hrtime[0] * 1e9 + hrtime[1];
return totalNanoseconds / 1e9;
}

const sanitizeFilename = (url) => {
return url.replace(/[^a-zA-Z0-9]/g, '_');
};

const reportExists = (site) => {
return fs.existsSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`));
}
const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_');

const reportSite = (site) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(`Report for ${site}`);
fs.writeFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `Report for ${site}\n`);
report(site, `Date: ${Date.now()}`);
}
const reportExists = (site) => fs.existsSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`));

const report = (site, message) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(message);
fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), message + "\n");
}
fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${message}\n`);
};

const reportSite = (site) => {
if (EXECUTE_SINGLE_SITE_REPORT) console.log(`Report for ${site}`);
fs.writeFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `Report for ${site}\n`);
report(site, `Date: ${Date.now()}`);
};

const reportPages = (site, pages) => {
report(site, `Total Pages: ${pages.length}`);
pages.forEach(page => fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${page}\n`) );
}
pages.forEach((page) => fs.appendFileSync(path.join(REPORTS_DIR, `${sanitizeFilename(site)}.txt`), `${page}\n`));
};

/*
Example output:
@@ -59,18 +68,18 @@ const getSpacecatSitesUrls = async () => {
return response
.filter((item) => item.deliveryType === 'aem_edge')
.map((item) => item.baseURL);
}

};

async function fetchSitemapUrls(siteUrl) {
let sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
let urls = [];
const sitemapUrl = new URL('sitemap.xml', siteUrl).toString(); // Default sitemap location
const urls = [];

function parseRobotsTxt(robotsTxt) {
try {
const regex = /Sitemap:\s*(https?:\/\/[^\s]+)/g;
let match;
let sitemaps = [];
const sitemaps = [];
// eslint-disable-next-line no-cond-assign
while ((match = regex.exec(robotsTxt)) !== null) {
sitemaps.push(match[1]);
}
@@ -81,69 +90,86 @@ async function fetchSitemapUrls(siteUrl) {
}

async function parseSitemap(xml, source) {
if (__visitedSitemaps.includes(sitemapUrl)) return;
if (visitedSitemaps.includes(source)) return; // Ensure to use `source` instead of `sitemapUrl`
try {
const result = await parseStringPromise(xml);
const fetchPromises = [];

if (result.urlset && result.urlset.url) {
for (let urlEntry of result.urlset.url) {
result.urlset.url.forEach((urlEntry) => {
urls.push(urlEntry.loc[0]);
}
});
} else if (result.sitemapindex && result.sitemapindex.sitemap) {
for (let sitemap of result.sitemapindex.sitemap) {
result.sitemapindex.sitemap.forEach((sitemap) => {
const sitemapIndexUrl = sitemap.loc[0];
if (__visitedSitemaps.includes(sitemapIndexUrl)) break;
__visitedSitemaps.push(sitemapIndexUrl);
if (visitedSitemaps.includes(sitemapIndexUrl)) return;
visitedSitemaps.push(sitemapIndexUrl);
report(siteUrl, `Found Sitemap in Index: ${sitemapIndexUrl}`);
const response = await fetch(sitemapIndexUrl, __USER_AGENT_HEADER);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed);
} else {
// Handle regular sitemap
const xmlText = await response.text();
await parseSitemap(xmlText); // Recursively parse nested sitemaps
}
}

// Create a fetch promise and add it to the array
const fetchPromise = fetch(sitemapIndexUrl, userAgentHeader)
.then((response) => {
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Error in ${sitemapIndexUrl}, Status: ${response.status}, Content-Type: ${response.headers.get('content-type')}, Source: ${source}`);
return null; // Return null to handle this in the subsequent .then()
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
report(siteUrl, '..and gzipped');
return response.arrayBuffer().then((buffer) => {
const decompressed = zlib.gunzipSync(Buffer.from(buffer)).toString();
// Recursively parse nested sitemaps
return parseSitemap(decompressed, sitemapIndexUrl);
});
} else {
// Handle regular sitemap
// Recursively parse nested sitemaps
return response.text().then((xmlText) => parseSitemap(xmlText, sitemapIndexUrl));
}
});

fetchPromises.push(fetchPromise);
});
}

// Wait for all fetch operations to complete
await Promise.all(fetchPromises);
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
console.error(`Error in ${sitemapUrl}: ${error}. Source: ${source}`);
visitedSitemaps.push(source); // Ensure to use `source` instead of `sitemapUrl`
console.error(`Error in ${source}: ${error}. Source: ${source}`);
}
}

// Check robots.txt for the sitemap URL(s)
try {
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), __USER_AGENT_HEADER);
const robotsResponse = await fetch(new URL('robots.txt', siteUrl).toString(), userAgentHeader);
if (robotsResponse.ok) {
const robotsTxt = await robotsResponse.text();
const robotsSitemapUrls = parseRobotsTxt(robotsTxt);
if (robotsSitemapUrls && robotsSitemapUrls.length > 0) {
// Process each sitemap found in robots.txt
for (const robotsSitemapUrl of robotsSitemapUrls) {
if (__visitedSitemaps.includes(robotsSitemapUrl)) break;
// Create a list of promises for processing each sitemap found in robots.txt
const sitemapFetchPromises = robotsSitemapUrls.map(async (robotsSitemapUrl) => {
if (visitedSitemaps.includes(robotsSitemapUrl)) {
return; // Skip already visited sitemaps
}
report(siteUrl, `Found Sitemap in robots.txt: ${robotsSitemapUrl}`);
const response = await fetch(robotsSitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(robotsSitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
report(siteUrl, `Sitemap not found at ${robotsSitemapUrl}`);
} else if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
if (response.headers.get('content-type').includes('application/x-gzip')) {
// Handle gzipped sitemap
const buffer = Buffer.from(await response.arrayBuffer());
const decompressed = zlib.gunzipSync(buffer).toString();
await parseSitemap(decompressed, robotsSitemapUrl);
} else {
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
// Handle regular sitemap
const xml = await response.text();
await parseSitemap(xml, robotsSitemapUrl);
}
}
return urls; // Return early if sitemap URLs are found in robots.txt
});

// Wait for all sitemap processing promises to complete
await Promise.all(sitemapFetchPromises);
return urls; // Return the collected URLs after processing all sitemaps
}
}
} catch (error) {
@@ -152,22 +178,22 @@ async function fetchSitemapUrls(siteUrl) {

// Fetch and parse the default sitemap if no sitemap URL is found in robots.txt
try {
const response = await fetch(sitemapUrl, __USER_AGENT_HEADER);
const response = await fetch(sitemapUrl, userAgentHeader);
if (!response.ok || response.status === '404' || response.headers.get('content-type').includes('text/html')) {
report(siteUrl, `Sitemap not found at ${sitemapUrl}`);
} else {
report(siteUrl, `Found Sitemap in default location: ${sitemapUrl}`);
let xml;
if (response.headers.get('content-type').includes('application/x-gzip')) {
const buffer = Buffer.from(await response.arrayBuffer());
const xml = zlib.gunzipSync(buffer).toString();
zlib.gunzipSync(buffer).toString();
} else {
xml = await response.text();
}
await parseSitemap(xml, sitemapUrl);
}
} catch (error) {
__visitedSitemaps.push(sitemapUrl);
visitedSitemaps.push(sitemapUrl);
report(siteUrl, `Error fetching default sitemap ${siteUrl}: ${error}`);
}

@@ -179,12 +205,16 @@ async function fetchSitemapUrls(siteUrl) {
const totalStartTime = process.hrtime();
let totalPages = 0;

const siteUrls = EXECUTE_SINGLE_SITE_REPORT ? [EXECUTE_SINGLE_SITE_REPORT] : await getSpacecatSitesUrls();
const siteUrls = EXECUTE_SINGLE_SITE_REPORT
? [EXECUTE_SINGLE_SITE_REPORT]
: await getSpacecatSitesUrls();

for (const siteUrl of siteUrls) {
if (!reportExists(siteUrl)) {
const startTime = process.hrtime();
console.log(`Processing: ${siteUrl}`);
reportSite(siteUrl);
// eslint-disable-next-line no-await-in-loop
const pages = await fetchSitemapUrls(siteUrl);
totalPages += pages.length;
reportPages(siteUrl, pages);
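Most of the JavaScript changes above replace `await` calls inside loops with collected promises that are awaited together via `Promise.all`, which is the usual way to satisfy ESLint's `no-await-in-loop` rule (the outer per-site loop keeps an explicit `eslint-disable` comment instead). A minimal sketch of that pattern, using hypothetical names rather than code from this PR:

import fetch from 'node-fetch';

// Hypothetical helper: fetch several sitemap URLs concurrently instead of
// awaiting each one inside the loop body.
async function fetchAllSitemaps(sitemapUrls, userAgentHeader) {
  const fetchPromises = sitemapUrls.map((url) => fetch(url, userAgentHeader)
    .then((response) => (response.ok ? response.text() : null)));
  // All requests are in flight at once; results come back in input order.
  return Promise.all(fetchPromises);
}

Non-OK responses resolve to null in this sketch, so a missing sitemap does not reject the whole batch; a network error on any request would still reject it.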