SAX-based sitemap parser #497

Merged Mar 19, 2024 (18 commits)
docs/docs/user-guide/common-options.md (9 additions, 1 deletion)
@@ -10,12 +10,20 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar

The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.

## Ad Blocking

Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) can be disabled or customized using [Browser Profiles](browser-profiles.md).

Browsertrix Crawler also supports blocking ads from being loaded during capture based on [Steven Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking based on this list, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
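
For example, a crawl with ad blocking and a replacement message might be invoked as follows (a sketch using the standard Docker image; the seed URL and message are placeholders):

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --blockAds \
  --adBlockMessage "Ad resource blocked during crawl"
```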

## Sitemap Parsing

The `--sitemap` option can be used to have the crawler parse a sitemap and queue any discovered URLs, while respecting the crawl's scoping rules and limits. Browsertrix Crawler can parse regular sitemaps as well as sitemap indices that point to nested sitemaps.

By default, `--sitemap` will look for a sitemap at `<your-seed>/sitemap.xml`. If a website's sitemap is hosted at a different URL, pass that URL with the flag: `--sitemap <sitemap url>`.

The `--sitemapFrom`/`--sitemapFromDate` and `--sitemapTo`/`--sitemapToDate` options restrict extraction to pages within a specific date range. If set, they filter sitemap URLs to those dated greater than or equal to (>=) or less than or equal to (<=) the provided ISO date string (`YYYY-MM-DD`, `YYYY-MM-DDTHH:MM:SS`, or a partial date), respectively.
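
For example, to queue only pages that a sitemap lists as modified during 2023 (a sketch; the seed URL and dates are placeholders):

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ --sitemap \
  --sitemapFromDate 2023-01-01 --sitemapToDate 2023-12-31
```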

## Custom Warcinfo Fields

Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under the `warcinfo` section, or specified individually via the command line.
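
For example, in the YAML config (the field names here are illustrative, not a fixed schema):

```yaml
warcinfo:
  operator: archive-crawling-team
  hostname: crawler-host-01
```

The same fields can be passed on the command line, e.g. `--warcinfo.operator archive-crawling-team`.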
package.json (2 additions, 1 deletion)
@@ -17,6 +17,7 @@
},
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.3",
"crc": "^4.3.2",
@@ -27,8 +28,8 @@
"minio": "^7.1.3",
"p-queue": "^7.3.4",
"puppeteer-core": "^20.8.2",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"sitemapper": "^3.2.6",
"tsc": "^2.0.4",
"uuid": "8.3.2",
"warcio": "^2.2.1",
src/crawler.ts (83 additions, 29 deletions)
@@ -13,7 +13,6 @@ import {
PageCallbacks,
} from "./util/state.js";

import yaml from "js-yaml";

import * as warcio from "warcio";
@@ -53,6 +52,8 @@ import { OriginOverride } from "./util/originoverride.js";
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";

const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false,
@@ -70,6 +71,7 @@ const behaviors = fs.readFileSync(

const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;
const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

const POST_CRAWL_STATES = [
"generate-wacz",
@@ -1241,7 +1243,13 @@ self.__bx_behaviors.selectMainBehavior();
}

if (seed.sitemap) {
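// wait up to SITEMAP_INITIAL_FETCH_TIMEOUT_SECS for the initial sitemap fetch;
// parseSitemap returns early for large sitemaps and keeps parsing in the background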
await timedRun(
this.parseSitemap(seed, i),
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
"Sitemap initial fetch timed out",
{ sitemap: seed.sitemap, seed: seed.url },
"sitemap",
);
}
}

@@ -2052,40 +2060,86 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
if (!sitemap) {
return;
}

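// the shared crawl state records sitemap completion, so an interrupted and
// restarted crawl does not re-parse the same sitemap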
if (await this.crawlState.isSitemapDone()) {
logger.info("Sitemap already processed, skipping", "sitemap");
return;
}

const fromDate = this.params.sitemapFromDate;
const toDate = this.params.sitemapToDate;
const headers = this.headers;

logger.info(
"Fetching sitemap",
{ from: fromDate || "<any date>", to: toDate || "<any date>" },
"sitemap",
);
const sitemapper = new SitemapReader({
headers,
fromDate,
toDate,
limit: this.pageLimit,
});

try {
await sitemapper.parse(sitemap, url);
} catch (e) {
logger.warn("Error fetching sites from sitemap", e, "sitemap");
logger.warn(
"Sitemap for seed failed",
{ url, sitemap, ...formatErr(e) },
"sitemap",
);
return;
}

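// log progress at power-of-ten intervals: every 10 URLs at first,
// backing off to every 100, 1,000, and finally 10,000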
let power = 1;
let resolved = false;

let finished = false;

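// resolve (and let the crawl proceed) once parsing ends or the first 100 URLs
// are queued; a larger sitemap keeps parsing in the background while pages are crawled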
await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
if (!finished) {
logger.info(
"Sitemap Parsing Finished",
{ urlsFound: sitemapper.count, limitHit: sitemapper.atLimit() },
"sitemap",
);
this.crawlState.markSitemapDone();
finished = true;
}
});
sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
if (count % 10 ** (power + 1) === 0 && power <= 3) {
power++;
}
const sitemapsQueued = sitemapper.getSitemapsQueued();
logger.debug(
"Sitemap URLs processed so far",
{ count, sitemapsQueued },
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
{ urlsFound: count },
"sitemap",
);
resolve();
resolved = true;
}
});
});
}

async combineWARC() {
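
To make the new flow easier to follow, here is a minimal standalone sketch of driving the `SitemapReader` util, using only the options, methods, and events that appear in this diff; the header values, dates, and seed URL are placeholders:

```ts
import { SitemapReader } from "./util/sitemapper.js";

// a sketch, not the crawler's exact wiring
const reader = new SitemapReader({
  headers: { "User-Agent": "browsertrix-crawler" }, // placeholder headers
  fromDate: "2023-01-01", // keep URLs with lastmod >= this date
  toDate: "2023-12-31", // keep URLs with lastmod <= this date
  limit: 1000, // stop once this many URLs have been emitted
});

// each in-scope URL is emitted as soon as it is parsed
reader.on("url", ({ url }: { url: string }) => {
  console.log("queueing", url);
});

// "end" fires when all (possibly nested) sitemaps are exhausted or the limit is hit
reader.on("end", () => {
  console.log(`found ${reader.count} URLs, at limit: ${reader.atLimit()}`);
});

// "<detect>" (DETECT_SITEMAP) asks the reader to locate the sitemap itself,
// e.g. at <seed>/sitemap.xml; a concrete sitemap URL can be passed instead
await reader.parse("<detect>", "https://example.com/");
```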
src/util/argParser.ts (7 additions, 1 deletion)
@@ -287,7 +287,13 @@ class ArgParser {
sitemapFromDate: {
alias: "sitemapFrom",
describe:
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
"If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},

sitemapToDate: {
alias: "sitemapTo",
describe:
"If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},

statsFilename: {
src/util/constants.ts (3 additions, 0 deletions)
@@ -9,6 +9,9 @@ export const WAIT_UNTIL_OPTS = [
"networkidle0",
"networkidle2",
];

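// sentinel used in place of a sitemap URL when the crawler should
// auto-detect the sitemap location (e.g. <seed>/sitemap.xml)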
export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
src/util/seeds.ts (2 additions, 5 deletions)
@@ -144,12 +144,9 @@ export class ScopedSeed {

resolveSiteMap(sitemap: boolean | string | null): string | null {
if (sitemap === true) {
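// "<detect>" matches DETECT_SITEMAP in util/constants.ts: detection is
// deferred to the sitemap reader rather than hardcoding /sitemap.xml here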
return "<detect>";
} else if (typeof sitemap === "string") {
return sitemap;
}

return null;