Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SAX-based sitemap parser #497

Merged
merged 18 commits on Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion package.json
Expand Up @@ -17,6 +17,7 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.3",
"crc": "^4.3.2",
Expand All @@ -27,8 +28,8 @@
"minio": "^7.1.3",
"p-queue": "^7.3.4",
"puppeteer-core": "^20.8.2",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"sitemapper": "^3.2.6",
"tsc": "^2.0.4",
"uuid": "8.3.2",
"warcio": "^2.2.1",
Expand Down
105 changes: 76 additions & 29 deletions src/crawler.ts
Expand Up @@ -13,7 +13,6 @@ import {
PageCallbacks,
} from "./util/state.js";

import Sitemapper from "sitemapper";
import yaml from "js-yaml";

import * as warcio from "warcio";
Expand Down Expand Up @@ -53,6 +52,7 @@ import { OriginOverride } from "./util/originoverride.js";
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { SitemapReader } from "./util/sitemapper.js";

const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false,
Expand All @@ -70,6 +70,7 @@ const behaviors = fs.readFileSync(

const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;
const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

const POST_CRAWL_STATES = [
"generate-wacz",
Expand Down Expand Up @@ -1241,7 +1242,13 @@ self.__bx_behaviors.selectMainBehavior();
}

if (seed.sitemap) {
await this.parseSitemap(seed.sitemap, i, this.params.sitemapFromDate);
await timedRun(
this.parseSitemap(seed.sitemap, i),
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
"Sitemap initial fetch timed out",
{ sitemap: seed.sitemap, seed: seed.url },
"sitemap",
);
}
}

Expand Down Expand Up @@ -2052,40 +2059,80 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

async parseSitemap(url: string, seedId: number, sitemapFromDate: number) {
// handle sitemap last modified date if passed
let lastmodFromTimestamp = undefined;
const dateObj = new Date(sitemapFromDate);
if (isNaN(dateObj.getTime())) {
logger.info(
"Fetching full sitemap (fromDate not specified/valid)",
{ url, sitemapFromDate },
"sitemap",
);
} else {
lastmodFromTimestamp = dateObj.getTime();
logger.info(
"Fetching and filtering sitemap by date",
{ url, sitemapFromDate },
"sitemap",
);
/**
 * Stream URLs from a seed's sitemap into the crawl queue using the
 * SAX-based SitemapReader.
 *
 * Resolves once the sitemap is fully parsed OR once 100+ URLs have been
 * seen — in the latter case parsing continues in the background so a huge
 * sitemap does not block crawl startup.
 *
 * @param url    - sitemap (or sitemap index) URL to fetch
 * @param seedId - index of the seed these URLs are scoped/queued under
 */
async parseSitemap(url: string, seedId: number) {
  // Sitemap completion is persisted in crawl state, so an interrupted
  // crawl that already finished the sitemap does not re-fetch it.
  if (await this.crawlState.isSitemapDone()) {
    logger.info("Sitemap already processed, skipping", "sitemap");
    return;
  }

  const fromDate = this.params.sitemapFromDate;
  const toDate = this.params.sitemapToDate;
  const headers = this.headers;

  logger.info(
    "Fetching sitemap",
    // fix: "to" previously logged fromDate, so the --sitemapTo bound
    // was never shown; it must reflect toDate.
    { from: fromDate || "<any date>", to: toDate || "<any date>" },
    "sitemap",
  );
  // Shared counter: SitemapReader increments it per emitted URL, and the
  // handlers below read it for progress logging / early-resolve logic.
  const counter = { value: 0 };
  const sitemapper = new SitemapReader({
    headers,
    fromDate,
    toDate,
    counter,
    limit: this.pageLimit,
  });

  try {
    await sitemapper.parseSitemap(url);
  } catch (e) {
    logger.warn("Sitemap parse failed", { url, ...formatErr(e) }, "sitemap");
  }

  // Progress is logged at powers of 10 (every 10th, 100th, 1000th, ...),
  // capped so the interval never exceeds 10^4 URLs.
  let power = 1;
  // True once the awaited promise has resolved early (large sitemap
  // continuing in the background); prevents double-resolve bookkeeping.
  let resolved = false;
  // Guards the "end" handling so duplicate end events are ignored.
  let finished = false;

  await new Promise<void>((resolve) => {
    sitemapper.on("end", () => {
      resolve();
      if (!finished) {
        logger.info(
          "Sitemap Parsing Finished",
          { urlsFound: counter.value, limitHit: sitemapper.atLimit() },
          "sitemap",
        );
        // fire-and-forget: state write need not block the end handler
        void this.crawlState.markSitemapDone();
        finished = true;
      }
    });

    sitemapper.on("url", ({ url }) => {
      const count = counter.value;

      if (count % 10 ** power === 0) {
        if (count % 10 ** (power + 1) === 0 && power <= 3) {
          power++;
        }
        const sitemapsQueued = sitemapper.getSitemapsQueued();
        logger.debug(
          "Sitemap URLs processed so far",
          { count, sitemapsQueued },
          "sitemap",
        );
      }
      // fire-and-forget: queueing must not serialize sitemap streaming
      void this.queueInScopeUrls(seedId, [url], 0);
      if (count >= 100 && !resolved) {
        logger.info(
          "Sitemap partially parsed, continue parsing large sitemap in the background",
          { urlsFound: count },
          "sitemap",
        );
        resolve();
        resolved = true;
      }
    });
  });
}

async combineWARC() {
Expand Down
8 changes: 7 additions & 1 deletion src/util/argParser.ts
Expand Up @@ -287,7 +287,13 @@ class ArgParser {
sitemapFromDate: {
alias: "sitemapFrom",
describe:
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
"If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},

sitemapToDate: {
alias: "sitemapTo",
describe:
"If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},

statsFilename: {
Expand Down