Skip to content

Commit

Permalink
feat: introduce top-pages command using ahrefs (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
dzehnder committed Mar 25, 2024
1 parent 46b4aa9 commit 94d692a
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 33 deletions.
7 changes: 7 additions & 0 deletions README.md
Expand Up @@ -5,3 +5,10 @@ Misc SEO research

`npm run all <baseUrl>`

## How to trigger canonical assessment

`node ./assessment/canonical.js <baseUrl> [options]`

Options:
- `--top-pages=<number>` - Run audit for top pages (default 200), based on estimated organic traffic
- `--sitemap=<sitemapUrl>` - Specify a specific sitemap location (default: fetched from robots.txt or /sitemap.xml). Especially useful for pages in development, as they are not yet listed in robots.txt or sitemap_index.xml
15 changes: 13 additions & 2 deletions all-assessments.js
@@ -1,5 +1,16 @@
import {canonical} from "./assessment/canonical.js";
import {sitemap} from "./assessment/sitemap.js";
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import { canonical } from './assessment/canonical.js';
import { sitemap } from './assessment/sitemap.js';

(async () => {
await sitemap;
Expand Down
85 changes: 85 additions & 0 deletions assessment/ahrefs-lib.js
@@ -0,0 +1,85 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import path from 'path';
import fs from 'fs';
import { fileURLToPath } from 'url';
import { csv2json, json2csv } from 'json-2-csv';
import { generateFileName } from './file-lib.js';

// Base URL of the Ahrefs v3 API; endpoint paths are appended to it when building request URLs.
const AHREFS_API_BASE_URL = 'https://api.ahrefs.com/v3';
// Absolute path of the `output` directory located next to this module file.
const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');

/**
 * Sends an authenticated GET request to the Ahrefs API and parses the JSON response.
 *
 * @param {string} endpoint - Path appended to AHREFS_API_BASE_URL,
 *   e.g. `/site-explorer/top-pages`.
 * @param {Object} [queryParams={}] - Query parameters; keys and values are URI-encoded.
 *   Entries whose value is `null` or `undefined` are left out of the query string.
 * @returns {Promise<{result: *, fullAuditRef: string}>} The parsed JSON body together
 *   with the full URL that was requested.
 * @throws {Error} If the AHREFS_API_KEY environment variable is not set, if the response
 *   status is not OK, or if the response body cannot be parsed as JSON.
 */
const sendRequest = async (endpoint, queryParams = {}) => {
  const apiKey = process.env.AHREFS_API_KEY;
  if (!apiKey) {
    // Fail fast with an actionable message instead of sending "Bearer undefined".
    throw new Error('Ahrefs API key is missing: set the AHREFS_API_KEY environment variable.');
  }

  // Skip unset values so they are not serialized as the strings "undefined" / "null".
  const queryString = Object.entries(queryParams)
    .filter(([, value]) => value !== undefined && value !== null)
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`)
    .join('&');

  const fullAuditRef = `${AHREFS_API_BASE_URL}${endpoint}${queryString ? `?${queryString}` : ''}`;
  const response = await fetch(fullAuditRef, {
    method: 'GET',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
  });

  if (!response.ok) {
    throw new Error(`Ahrefs API request failed with status: ${response.status}`);
  }

  try {
    const result = await response.json();
    return {
      result,
      fullAuditRef,
    };
  } catch (e) {
    // Keep the original parse error reachable for debugging.
    throw new Error(`Error parsing Ahrefs API response: ${e.message}`, { cause: e });
  }
};

/**
 * Returns the top pages for a target from Ahrefs, using a CSV file in OUTPUT_DIR as cache.
 *
 * If a cache file named `<generateFileName(target, "top-pages-<limit>")>-<timestamp>.csv`
 * exists, its parsed content is returned and no API call is made. Otherwise the
 * `/site-explorer/top-pages` endpoint is queried (selecting `url` and `sum_traffic`,
 * ordered by `sum_traffic_merged`, comparing today with the date 30 days ago), the
 * returned pages are written to a new cache file, and the pages are returned.
 *
 * @param {string} target - Site to query; also part of the cache file name.
 * @param {number} limit - Number of pages to request; also part of the cache file name.
 * @returns {Promise<Array<Object>>} Page records, from the cache file or the API response.
 * @throws {Error} If the API response contains no `pages`, or if the request fails.
 */
export const getTopPages = async (target, limit) => {
  const filePrefix = `${generateFileName(target, `top-pages-${limit}`)}-`;

  // Make sure the cache directory exists so the first run does not fail with ENOENT.
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });

  // Require the full `<prefix>-<digits>.csv` shape. A bare prefix match would also pick up
  // files of another target whose sanitized name merely starts with this target's name
  // (e.g. `example_com_blog` when looking for `example_com`).
  const existingFile = fs.readdirSync(OUTPUT_DIR).find((file) => (
    file.startsWith(filePrefix) && /^\d+\.csv$/.test(file.slice(filePrefix.length))
  ));
  if (existingFile) {
    console.log(`Using cached file to avoid Ahrefs API call: ${existingFile}`);
    const cachedContent = fs.readFileSync(path.join(OUTPUT_DIR, existingFile));
    return csv2json(cachedContent.toString());
  }

  const queryParams = {
    select: [
      'url',
      'sum_traffic',
    ].join(','),
    limit,
    order_by: 'sum_traffic_merged',
    target,
    // Dates are sent as YYYY-MM-DD (the date part of the ISO timestamp).
    date: new Date().toISOString().split('T')[0],
    date_compared: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0],
    output: 'json',
  };

  const { result } = await sendRequest('/site-explorer/top-pages', queryParams);
  if (!result?.pages) {
    throw new Error('No pages found in Ahrefs API response.');
  }

  // Save the result as CSV so later runs can reuse it instead of calling the API again.
  const csvResult = json2csv(result.pages);
  const FILE_PATH = path.join(OUTPUT_DIR, `${filePrefix}${Date.now()}.csv`);
  fs.writeFileSync(FILE_PATH, csvResult);
  return result.pages;
};
8 changes: 2 additions & 6 deletions assessment/assessment-lib.js
Expand Up @@ -10,18 +10,14 @@
* governing permissions and limitations under the License.
*/

import { fileURLToPath } from 'url';
import fs from 'fs';
import path from 'path';
import { json2csv } from 'json-2-csv';
import { generateFileName, OUTPUT_DIR } from './file-lib.js';
import { getSiteByBaseUrl } from '../spacecat-lib.js';

export const USER_AGENT = 'basecode/seo-research-crawler/1.0';

const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');

const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase();

const hrtimeToSeconds = (hrtime) => {
const [seconds, nanoseconds] = hrtime; // Destructuring for clarity
return (seconds * 1e9 + nanoseconds) / 1e9; // Simplified calculation
Expand All @@ -39,7 +35,7 @@ export const createAssessment = async (userSite, userTitle) => {
console.log('Check if URL is qualified to be assessed. Needs to be part of spacecat catalogue');
const SITE = await getSiteByBaseUrl(userSite);
const SITE_URL = SITE.baseURL;
const FILE_PATH = path.join(OUTPUT_DIR, `${sanitizeFilename(userTitle)}-${sanitizeFilename(SITE_URL)}-${Date.now()}.csv`);
const FILE_PATH = path.join(OUTPUT_DIR, `${generateFileName(SITE_URL, userTitle)}-${Date.now()}.csv`);

console.log(`${userTitle}: Assessment for ${SITE_URL}`);

Expand Down
90 changes: 70 additions & 20 deletions assessment/canonical.js
Expand Up @@ -12,11 +12,18 @@
import { JSDOM } from 'jsdom';
import { createAssessment } from './assessment-lib.js';
import { fetchSitemapsFromBaseUrl } from './sitemap.js';
import { getTopPages } from './ahrefs-lib.js';

const TRACKING_PARAM = '?utm';
const userSiteUrl = process.argv[2];

const checkForCanonical = async (url, assessment) => {
const options = {
topPages: undefined,
sitemapSrc: undefined,
};

// eslint-disable-next-line consistent-return
const checkForCanonical = async (url, assessment, source = 'ahrefs', retries = 3, backoff = 300) => {
try {
const response = await fetch(url);
const contentType = response.headers.get('content-type');
Expand All @@ -32,9 +39,10 @@ const checkForCanonical = async (url, assessment) => {
if (canonicalLink) {
assessment.addColumn({
url,
source,
canonicalExists: true,
response: response.status,
presentInSiteMap: url === canonicalLink,
presentInSiteMap: source === 'sitemap' ? url === canonicalLink : '',
www: url.startsWith('https://www.'),
hasTrailingSlash: url.endsWith('/'),
hasHtmlExtension: url.endsWith('.html'),
Expand All @@ -54,37 +62,79 @@ const checkForCanonical = async (url, assessment) => {
});
}
} catch (error) {
assessment.addColumn({
url,
error: `Error fetching URL ${url}: ${error.message}`,
});
if (retries > 0) {
console.log(`Error fetching URL ${url}: ${error.message}. Retrying in ${backoff}ms`);
await new Promise((resolve) => {
setTimeout(resolve, backoff);
});
return checkForCanonical(url, assessment, source, retries - 1, backoff * 2);
} else {
assessment.addColumn({
url,
error: `Error fetching URL ${url}: ${error.message} after ${retries} retries`,
});
}
}
};

const canonicalAudit = async (siteUrl, assessment) => {
// TODO: fetch sitemap url from file if already exists
const sitemaps = await fetchSitemapsFromBaseUrl(siteUrl);
return Promise.all(sitemaps.map((sitemap) => {
if (sitemap.page) {
return checkForCanonical(sitemap.page, assessment);
}
}));
if (options.topPages) {
// if top pages are specified, get pages from ahrefs
// default, get pages from sitemap
console.log(`Fetching top ${options.topPages} pages from Ahrefs`);
const pages = await getTopPages(siteUrl, options.topPages);
// eslint-disable-next-line consistent-return,array-callback-return
return Promise.all(pages.map((page) => {
if (page.url && page.sum_traffic > 0) {
return checkForCanonical(page.url, assessment);
}
}));
} else {
console.log(`Fetching pages from sitemap ${options.sitemapSrc ? `provided at ${options.sitemapSrc}` : ''}`);
const pages = await fetchSitemapsFromBaseUrl(siteUrl, options.sitemapSrc);
// eslint-disable-next-line array-callback-return,consistent-return
return Promise.all(pages.map((page) => {
if (page.page) {
return checkForCanonical(page.page, assessment, 'sitemap');
}
}));
}
};

export const canonical = (async () => {
process.argv.slice(3).forEach((arg) => {
if (arg.startsWith('--top-pages')) {
const [, value] = arg.split('=');
const number = parseInt(value, 10);
if (Number.isNaN(number) || number <= 0) {
console.log('Defaulting to top 200 pages');
options.topPages = 200;
} else {
options.topPages = number;
}
} else if (arg.startsWith('--sitemap')) {
const [, value] = arg.split('=');
options.sitemapSrc = value;
} else {
console.error(`Error: Unknown option '${arg}'`);
process.exit(1);
}
});
const assessment = await createAssessment(userSiteUrl, 'Canonical');
assessment.setRowHeadersAndDefaults({
url: '',
canonicalExists: false,
source: '',
canonicalExists: '',
response: '',
presentInSiteMap: false,
www: undefined,
hasTrailingSlash: undefined,
hasHtmlExtension: undefined,
hasTrackingParams: undefined,
presentInSiteMap: '',
www: '',
hasTrailingSlash: '',
hasHtmlExtension: '',
hasTrackingParams: '',
error: '',
warning: '',
});
await canonicalAudit(userSiteUrl, assessment);
await canonicalAudit(userSiteUrl, assessment, options);
assessment.end();
process.exit(0);
})();
19 changes: 19 additions & 0 deletions assessment/file-lib.js
@@ -0,0 +1,19 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import path from 'path';
import { fileURLToPath } from 'url';

// Absolute path of the `output` directory located next to this module file.
export const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');
/**
 * Converts an arbitrary string (typically a URL) into a lowercase, filename-safe token.
 * Every character other than an ASCII letter or digit is replaced by an underscore.
 *
 * @param {string} url - The text to sanitize.
 * @returns {string} The sanitized, lowercased text.
 */
export const sanitizeFilename = (url) => {
  const underscored = url.replace(/[^a-zA-Z0-9]/g, '_');
  return underscored.toLowerCase();
};

/**
 * Builds the base name used for output files: `<sanitized title>-<sanitized site URL>`.
 *
 * @param {string} siteUrl - Site URL the file belongs to.
 * @param {string} title - Title describing the file's content.
 * @returns {string} Both parts sanitized and joined with a hyphen, title first.
 */
export const generateFileName = (siteUrl, title) => {
  const parts = [title, siteUrl].map((part) => sanitizeFilename(part));
  return parts.join('-');
};
15 changes: 10 additions & 5 deletions assessment/sitemap.js
Expand Up @@ -110,7 +110,12 @@ async function fetchSitemapsFromRobots(siteUrl) {
return fetchSitemapsFromSource(sitemapSources);
}

export async function fetchSitemapsFromBaseUrl(url) {
export async function fetchSitemapsFromBaseUrl(url, sitemapSrc) {
if (sitemapSrc) {
return fetchSitemapsFromSource([
{ url: new URL(sitemapSrc, url).toString(), source: 'user provided' },
]);
}
let sitemaps = await fetchSitemapsFromRobots(userSiteUrl);
if (!sitemaps.length) {
sitemaps = await fetchSitemapsFromSource([
Expand All @@ -123,7 +128,7 @@ export async function fetchSitemapsFromBaseUrl(url) {
}
}
return sitemaps;
};
}

export const sitemap = (async () => {
const assessment = await createAssessment(userSiteUrl, 'Sitemap');
Expand All @@ -138,10 +143,10 @@ export const sitemap = (async () => {
const sitemaps = await fetchSitemapsFromBaseUrl(userSiteUrl);

// Assessment for sitemaps
sitemaps.forEach(async (sitemap) => {
if (sitemap.url) {
sitemaps.forEach(async (sm) => {
if (sm.url) {
assessment.addColumn({
sitemapOrPage: sitemap.url, source: sitemap.source, locs: sitemap.locs, error: sitemap.error || '', warning: sitemap.warning || '',
sitemapOrPage: sm.url, source: sm.source, locs: sm.locs, error: sm.error || '', warning: sm.warning || '',
});
}
});
Expand Down

0 comments on commit 94d692a

Please sign in to comment.