Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce ahrefs lib #3

Merged
merged 11 commits into from Mar 25, 2024
9 changes: 9 additions & 0 deletions README.md
Expand Up @@ -5,3 +5,12 @@ Misc SEO research

`npm run all <baseUrl>`

## How to trigger canonical assessment

`node ./assessment/canonical.js <baseUrl> [options]`

Options:
- `--all` - Run audit for all pages listed in sitemap
- `--top-pages=<number>` - Run audit for top pages (default 200), based on estimated organic traffic
- `--sitemap=<sitemapUrl>` - Specify a specific sitemap location; especially useful for pages in development, as they are not yet listed in robots.txt or sitemap_index.xml
- `--ignore-ahrefs-cache` - Top pages are locally cached to reduce API calls to Ahrefs. This option ignores the Ahrefs cache and fetches fresh data
dzehnder marked this conversation as resolved.
Show resolved Hide resolved
15 changes: 13 additions & 2 deletions all-assessments.js
@@ -1,5 +1,16 @@
import {canonical} from "./assessment/canonical.js";
import {sitemap} from "./assessment/sitemap.js";
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
import { canonical } from './assessment/canonical.js';
import { sitemap } from './assessment/sitemap.js';

(async () => {
await sitemap;
Expand Down
85 changes: 85 additions & 0 deletions assessment/ahrefs-lib.js
@@ -0,0 +1,85 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import path from 'path';
import fs from 'fs';
import { fileURLToPath } from 'url';
import { csv2json, json2csv } from 'json-2-csv';
import { generateFileName } from './file-lib.js';

// Base URL of the Ahrefs API (v3); endpoint paths are appended to it when building requests.
const AHREFS_API_BASE_URL = 'https://api.ahrefs.com/v3';
// Absolute path of the "output" directory that sits next to this module.
const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');

/**
 * Sends an authenticated GET request to the Ahrefs API and parses the JSON response.
 *
 * @param {string} endpoint - Path appended to AHREFS_API_BASE_URL,
 *   e.g. '/site-explorer/top-pages'.
 * @param {object} [queryParams={}] - Query parameters; keys and values are URI-encoded.
 * @returns {Promise<{result: object, fullAuditRef: string}>} The parsed response body and
 *   the full request URL that produced it.
 * @throws {Error} If the AHREFS_API_KEY environment variable is not set, if the response
 *   status is not OK, or if the response body cannot be parsed as JSON (the parser error
 *   is attached as `cause`).
 */
const sendRequest = async (endpoint, queryParams = {}) => {
  // Fail fast instead of sending "Bearer undefined" and getting an opaque auth failure.
  const apiKey = process.env.AHREFS_API_KEY;
  if (!apiKey) {
    throw new Error('AHREFS_API_KEY environment variable is not set.');
  }

  const queryString = Object.entries(queryParams)
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`)
    .join('&');
  const fullAuditRef = `${AHREFS_API_BASE_URL}${endpoint}${queryString ? `?${queryString}` : ''}`;

  const response = await fetch(fullAuditRef, {
    method: 'GET',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
  });

  if (!response.ok) {
    throw new Error(`Ahrefs API request failed with status: ${response.status}`);
  }

  let result;
  try {
    result = await response.json();
  } catch (e) {
    // Keep the original parser error reachable for debugging.
    throw new Error(`Error parsing Ahrefs API response: ${e.message}`, { cause: e });
  }

  return {
    result,
    fullAuditRef,
  };
};

/**
 * Returns the top pages of a site, ordered by the Ahrefs `sum_traffic_merged` metric.
 *
 * Results are cached as CSV files in OUTPUT_DIR, named
 * `<generateFileName(target, 'top-pages')>-<timestamp>.csv`. When a cache file for the
 * target exists it is returned instead of calling the Ahrefs API.
 *
 * @param {string} target - Site to query; also used to derive the cache file name.
 * @param {number} limit - Maximum number of pages requested from Ahrefs. Note that a
 *   cached result is returned as stored, regardless of the limit it was fetched with.
 * @param {boolean} [ignoreCache=false] - When true, skip the cache lookup and always
 *   fetch fresh data (a new cache file is still written).
 * @returns {Promise<object[]>} Page records with `url` and `sum_traffic` fields.
 * @throws {Error} If the Ahrefs response contains no `pages` property, or if the
 *   underlying request fails.
 */
export const getTopPages = async (target, limit, ignoreCache = false) => {
  // Include the trailing '-' so that e.g. "example.org" does not pick up the cache file
  // written for "example.org.uk" (both sanitize to the same leading characters).
  const cachePrefix = `${generateFileName(target, 'top-pages')}-`;

  // The output directory may not exist yet on a fresh checkout; treat that as "no cache".
  if (!ignoreCache && fs.existsSync(OUTPUT_DIR)) {
    const cachedFiles = fs.readdirSync(OUTPUT_DIR)
      .filter((file) => file.startsWith(cachePrefix) && file.endsWith('.csv'))
      .sort();
    // File names end in a millisecond timestamp, so the last one after sorting is the newest.
    const existingFile = cachedFiles[cachedFiles.length - 1];
    if (existingFile) {
      console.log(`Using cached file to avoid Ahrefs API call: ${existingFile}`);
      const cachedContent = fs.readFileSync(path.join(OUTPUT_DIR, existingFile));
      return csv2json(cachedContent.toString());
    }
  }

  const today = new Date().toISOString().split('T')[0];
  // Comparison date is 30 days before now.
  const thirtyDaysAgo = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString().split('T')[0];
  const queryParams = {
    select: [
      'url',
      'sum_traffic',
    ].join(','),
    limit,
    order_by: 'sum_traffic_merged',
    target,
    date: today,
    date_compared: thirtyDaysAgo,
    output: 'json',
  };

  const { result } = await sendRequest('/site-explorer/top-pages', queryParams);
  if (!result.pages) {
    throw new Error('No pages found in Ahrefs API response.');
  }

  // Save the result as CSV so later runs can reuse it without another API call.
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const csvResult = json2csv(result.pages);
  const FILE_PATH = path.join(OUTPUT_DIR, `${generateFileName(target, 'top-pages')}-${Date.now()}.csv`);
  fs.writeFileSync(FILE_PATH, csvResult);
  return result.pages;
};
15 changes: 5 additions & 10 deletions assessment/assessment-lib.js
Expand Up @@ -10,18 +10,13 @@
* governing permissions and limitations under the License.
*/

import { fileURLToPath } from 'url';
import fs from 'fs';
import path from 'path';
import { json2csv } from 'json-2-csv';
import { getSiteByBaseUrl } from '../spacecat-lib.js';
import { generateFileName, OUTPUT_DIR } from './file-lib.js';

export const USER_AGENT = 'basecode/seo-research-crawler/1.0';

const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');

const sanitizeFilename = (url) => url.replace(/[^a-zA-Z0-9]/g, '_').toLowerCase();

const hrtimeToSeconds = (hrtime) => {
const [seconds, nanoseconds] = hrtime; // Destructuring for clarity
return (seconds * 1e9 + nanoseconds) / 1e9; // Simplified calculation
Expand All @@ -37,11 +32,11 @@ export const createAssessment = async (userSite, userTitle) => {
}

console.log('Check if URL is qualified to be assessed. Needs to be part of spacecat catalogue');
const SITE = await getSiteByBaseUrl(userSite);
const SITE_URL = SITE.baseURL;
const FILE_PATH = path.join(OUTPUT_DIR, `${sanitizeFilename(userTitle)}-${sanitizeFilename(SITE_URL)}-${Date.now()}.csv`);
// const SITE = await getSiteByBaseUrl(userSite);
// const SITE_URL = SITE.baseURL;
const FILE_PATH = path.join(OUTPUT_DIR, `${generateFileName(userSite, userTitle)}-${Date.now()}.csv`);

console.log(`${userTitle}: Assessment for ${SITE_URL}`);
console.log(`${userTitle}: Assessment for ${userSite}`);

let rowHeadersAndDefaults;

Expand Down
94 changes: 74 additions & 20 deletions assessment/canonical.js
Expand Up @@ -12,11 +12,20 @@
import { JSDOM } from 'jsdom';
import { createAssessment } from './assessment-lib.js';
import { fetchSitemapsFromBaseUrl } from './sitemap.js';
import { getTopPages } from './ahrefs-lib.js';

const TRACKING_PARAM = '?utm';
const userSiteUrl = process.argv[2];

const checkForCanonical = async (url, assessment) => {
const options = {
all: false,
topPages: 200,
ignoreAhrefsCache: false,
sitemapSrc: undefined,
};

// eslint-disable-next-line consistent-return
const checkForCanonical = async (url, assessment, source = 'ahrefs', retries = 3, backoff = 300) => {
try {
const response = await fetch(url);
const contentType = response.headers.get('content-type');
Expand All @@ -32,9 +41,10 @@ const checkForCanonical = async (url, assessment) => {
if (canonicalLink) {
assessment.addColumn({
url,
source,
canonicalExists: true,
response: response.status,
presentInSiteMap: url === canonicalLink,
presentInSiteMap: source === 'sitemap' ? url === canonicalLink : '',
www: url.startsWith('https://www.'),
hasTrailingSlash: url.endsWith('/'),
hasHtmlExtension: url.endsWith('.html'),
Expand All @@ -54,37 +64,81 @@ const checkForCanonical = async (url, assessment) => {
});
}
} catch (error) {
assessment.addColumn({
url,
error: `Error fetching URL ${url}: ${error.message}`,
});
if (retries > 0) {
console.log(`Error fetching URL ${url}: ${error.message}. Retrying in ${backoff}ms`);
await new Promise((resolve) => {
setTimeout(resolve, backoff);
});
return checkForCanonical(url, assessment, source, retries - 1, backoff * 2);
} else {
assessment.addColumn({
url,
error: `Error fetching URL ${url}: ${error.message} after ${retries} retries`,
});
}
}
};

/**
 * Runs the canonical check for the pages of a site.
 *
 * Page selection depends on the options:
 * - `all` or `sitemapSrc` set: every entry returned by fetchSitemapsFromBaseUrl that has a
 *   `page` property is checked, with source 'sitemap'.
 * - otherwise: the top `topPages` pages from Ahrefs are checked, restricted to entries
 *   that have a `url` and a `sum_traffic` greater than 0.
 *
 * @param {string} siteUrl - Site to audit.
 * @param {object} assessment - Assessment object handed through to checkForCanonical.
 * @param {object} [auditOptions=options] - Audit options (`all`, `topPages`,
 *   `ignoreAhrefsCache`, `sitemapSrc`); defaults to the module-level `options`.
 * @returns {Promise<Array>} Resolves once every selected page has been checked.
 */
const canonicalAudit = async (siteUrl, assessment, auditOptions = options) => {
  if (auditOptions.all || auditOptions.sitemapSrc) {
    console.log('Fetching all pages from sitemap');
    const sitemapEntries = await fetchSitemapsFromBaseUrl(siteUrl, auditOptions.sitemapSrc);
    return Promise.all(
      sitemapEntries
        .filter((entry) => entry.page)
        .map((entry) => checkForCanonical(entry.page, assessment, 'sitemap')),
    );
  }

  console.log(`Fetching top ${auditOptions.topPages} pages from Ahrefs`);
  // The cache flag is forwarded as a third argument; it only has an effect if getTopPages
  // supports a cache-bypass parameter, otherwise it is ignored.
  const topPages = await getTopPages(
    siteUrl,
    auditOptions.topPages,
    auditOptions.ignoreAhrefsCache,
  );
  return Promise.all(
    topPages
      .filter((page) => page.url && page.sum_traffic > 0)
      .map((page) => checkForCanonical(page.url, assessment)),
  );
};

/**
 * Entry point of the canonical assessment; runs as soon as the module is loaded.
 *
 * Parses the command-line options that follow the site URL (process.argv[3] onwards),
 * creates the 'Canonical' assessment, declares its row headers, runs the audit once and
 * ends the assessment.
 *
 * Supported options:
 * - `--all`                  audit all pages listed in the sitemap
 * - `--top-pages=<number>`   number of top pages to audit (positive integer)
 * - `--ignore-ahrefs-cache`  sets `options.ignoreAhrefsCache`
 * - `--sitemap=<sitemapUrl>` explicit sitemap location
 *
 * Any unknown or invalid option prints an error and exits the process with code 1.
 */
export const canonical = (async () => {
  // Returns everything after the first '=' of an argument, or undefined when there is
  // none. Splitting on the first '=' only keeps values that themselves contain '='
  // (e.g. a sitemap URL with a query string) intact.
  const valueOf = (arg) => {
    const separatorIndex = arg.indexOf('=');
    return separatorIndex === -1 ? undefined : arg.slice(separatorIndex + 1);
  };

  process.argv.slice(3).forEach((arg) => {
    if (arg === '--all') {
      options.all = true;
    } else if (arg.startsWith('--top-pages')) {
      const number = Number.parseInt(valueOf(arg), 10);
      if (Number.isNaN(number) || number <= 0) {
        console.error('Error: --top-pages must be a positive integer.');
        process.exit(1);
      }
      options.topPages = number;
    } else if (arg === '--ignore-ahrefs-cache') {
      options.ignoreAhrefsCache = true;
    } else if (arg.startsWith('--sitemap')) {
      const value = valueOf(arg);
      // Without a value the option would be silently ignored; reject it instead.
      if (!value) {
        console.error('Error: --sitemap requires a value, e.g. --sitemap=<sitemapUrl>.');
        process.exit(1);
      }
      options.sitemapSrc = value;
    } else {
      console.error(`Error: Unknown option '${arg}'`);
      process.exit(1);
    }
  });

  const assessment = await createAssessment(userSiteUrl, 'Canonical');
  // Column names of the report with the default value used for each column.
  assessment.setRowHeadersAndDefaults({
    url: '',
    source: '',
    canonicalExists: '',
    response: '',
    presentInSiteMap: '',
    www: '',
    hasTrailingSlash: '',
    hasHtmlExtension: '',
    hasTrackingParams: '',
    error: '',
    warning: '',
  });
  await canonicalAudit(userSiteUrl, assessment, options);
  assessment.end();
})();
19 changes: 19 additions & 0 deletions assessment/file-lib.js
@@ -0,0 +1,19 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import path from 'path';
import { fileURLToPath } from 'url';

// Absolute path of the "output" directory that sits next to this module.
export const OUTPUT_DIR = path.join(path.dirname(fileURLToPath(import.meta.url)), 'output');
/**
 * Turns an arbitrary string (typically a URL) into a filename-safe token.
 *
 * Every character that is not an ASCII letter or digit is replaced by '_', and the
 * result is then lower-cased.
 *
 * @param {string} url - Text to sanitize.
 * @returns {string} The sanitized, lower-case text.
 */
export const sanitizeFilename = (url) => {
  const nonAlphanumeric = /[^0-9A-Za-z]/g;
  const underscored = url.replace(nonAlphanumeric, '_');
  return underscored.toLowerCase();
};

/**
 * Builds the base file name for an output file: the sanitized title, a '-', and the
 * sanitized site URL.
 *
 * @param {string} siteUrl - Site the file belongs to.
 * @param {string} title - Short label of the report.
 * @returns {string} `<sanitized title>-<sanitized siteUrl>`.
 */
export const generateFileName = (siteUrl, title) => {
  const titlePart = sanitizeFilename(title);
  const sitePart = sanitizeFilename(siteUrl);
  return [titlePart, sitePart].join('-');
};
15 changes: 10 additions & 5 deletions assessment/sitemap.js
Expand Up @@ -110,7 +110,12 @@ async function fetchSitemapsFromRobots(siteUrl) {
return fetchSitemapsFromSource(sitemapSources);
}

export async function fetchSitemapsFromBaseUrl(url) {
export async function fetchSitemapsFromBaseUrl(url, sitemapSrc) {
if (sitemapSrc) {
return fetchSitemapsFromSource([
{ url: new URL(sitemapSrc, url).toString(), source: 'user provided' },
]);
}
let sitemaps = await fetchSitemapsFromRobots(userSiteUrl);
if (!sitemaps.length) {
sitemaps = await fetchSitemapsFromSource([
Expand All @@ -123,7 +128,7 @@ export async function fetchSitemapsFromBaseUrl(url) {
}
}
return sitemaps;
};
}

export const sitemap = (async () => {
const assessment = await createAssessment(userSiteUrl, 'Sitemap');
Expand All @@ -138,10 +143,10 @@ export const sitemap = (async () => {
const sitemaps = await fetchSitemapsFromBaseUrl(userSiteUrl);

// Assessment for sitemaps
sitemaps.forEach(async (sitemap) => {
if (sitemap.url) {
sitemaps.forEach(async (sm) => {
if (sm.url) {
assessment.addColumn({
sitemapOrPage: sitemap.url, source: sitemap.source, locs: sitemap.locs, error: sitemap.error || '', warning: sitemap.warning || '',
sitemapOrPage: sm.url, source: sm.source, locs: sm.locs, error: sm.error || '', warning: sm.warning || '',
});
}
});
Expand Down