Fixes from 1.0.3 release -> main #517

Merged 5 commits into main on Mar 26, 2024
src/crawler.ts (6 additions, 1 deletion)
@@ -2152,6 +2152,10 @@ self.__bx_behaviors.selectMainBehavior();

let finished = false;

// disable extraHops for sitemap found URLs by setting to extraHops limit + 1
// otherwise, all sitemap found URLs would be eligible for additional hops
const extraHopsDisabled = this.params.extraHops + 1;

await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
@@ -2165,6 +2169,7 @@ self.__bx_behaviors.selectMainBehavior();
finished = true;
}
});

sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
@@ -2178,7 +2183,7 @@ self.__bx_behaviors.selectMainBehavior();
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
this.queueInScopeUrls(seedId, [url], 0, extraHopsDisabled);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
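The extra-hops handling in this file relies on arithmetic rather than a dedicated flag: sitemap-discovered URLs are queued with a hop count that already exceeds the configured limit, so they can never qualify for additional hops. A rough sketch of the idea in TypeScript, with hypothetical names (the crawler's real scope checks live elsewhere in crawler.ts):

// Sketch only: why queueing sitemap URLs at (configured limit + 1) disables
// extra hops for them. An out-of-scope link is normally queued as long as
// the hop count it would inherit from its parent stays within the limit.
function mayQueueOutOfScopeLink(parentExtraHops: number, limit: number): boolean {
  const inheritedHops = parentExtraHops + 1;
  return inheritedHops <= limit;
}

// With --extraHops 1:
mayQueueOutOfScopeLink(0, 1); // true  -> a regular page may contribute one extra hop
mayQueueOutOfScopeLink(2, 1); // false -> a sitemap URL (queued at limit + 1) may not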
src/util/sitemapper.ts (36 additions, 11 deletions)
@@ -11,6 +11,9 @@ import { sleep } from "./timing.js";

const SITEMAP_CONCURRENCY = 5;

const TEXT_CONTENT_TYPE = ["text/plain"];
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

export type SitemapOpts = {
headers?: Record<string, string>;

@@ -83,7 +86,7 @@ export class SitemapReader extends EventEmitter {
}
}

async tryFetch(url: string, expectedCT?: string | null) {
async tryFetch(url: string, expectedCT?: string[] | null) {
try {
logger.debug(
"Detecting Sitemap: fetching",
@@ -101,7 +104,7 @@
}

const ct = resp.headers.get("content-type");
if (expectedCT && ct && ct.split(";")[0] != expectedCT) {
if (expectedCT && ct && !expectedCT.includes(ct.split(";")[0])) {
logger.debug(
"Detecting Sitemap: invalid content-type",
{ ct },
@@ -129,12 +132,12 @@
if (sitemap === DETECT_SITEMAP) {
logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
fullUrl = new URL("/robots.txt", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/plain");
resp = await this.tryFetch(fullUrl, TEXT_CONTENT_TYPE);
if (resp) {
isRobots = true;
} else {
fullUrl = new URL("/sitemap.xml", seedUrl).href;
resp = await this.tryFetch(fullUrl, "text/xml");
resp = await this.tryFetch(fullUrl, XML_CONTENT_TYPES);
if (resp) {
isSitemap = true;
}
@@ -144,10 +147,10 @@
fullUrl = new URL(sitemap, seedUrl).href;
let expected = null;
if (fullUrl.endsWith(".xml")) {
expected = "text/xml";
expected = XML_CONTENT_TYPES;
isSitemap = true;
} else if (fullUrl.endsWith(".txt")) {
expected = "text/plain";
expected = TEXT_CONTENT_TYPE;
isRobots = true;
}
resp = await this.tryFetch(fullUrl, expected);
@@ -218,8 +221,22 @@
}

private async _parseSitemapFromResponse(url: string, resp: Response) {
let stream;

const { body } = resp;
if (!body) {
throw new Error("missing response body");
}
// decompress .gz sitemaps
if (url.endsWith(".gz")) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
} else {
stream = body;
}

const readableNodeStream = Readable.fromWeb(
resp.body as ReadableStream<Uint8Array>,
stream as ReadableStream<Uint8Array>,
);
this.initSaxParser(url, readableNodeStream);
}
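The decompression added here uses the web-standard DecompressionStream together with Node's Readable.fromWeb, both available in recent Node.js releases (18+). A self-contained sketch of the same pattern, isolated from the reader class (the helper name is made up for illustration):

import { Readable } from "node:stream";
import { ReadableStream } from "node:stream/web";

// Fetch a sitemap URL and return a Node stream, transparently gunzipping
// *.gz sitemaps before they reach the XML parser.
async function openSitemapStream(url: string): Promise<Readable> {
  const resp = await fetch(url);
  const { body } = resp;
  if (!body) {
    throw new Error("missing response body");
  }
  const webStream = url.endsWith(".gz")
    ? body.pipeThrough(new DecompressionStream("gzip"))
    : body;
  return Readable.fromWeb(webStream as unknown as ReadableStream<Uint8Array>);
}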
@@ -244,6 +261,8 @@
let currUrl: string | null;
let lastmod: Date | null = null;

let errCount = 0;

let otherTags = 0;

parserStream.on("end", async () => {
@@ -358,10 +377,16 @@
this.pending.delete(url);
return;
}
logger.warn("Sitemap error parsing XML", { err }, "sitemap");
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
logger.warn(
"Sitemap error parsing XML",
{ url, err, errCount },
"sitemap",
);
if (errCount++ < 3) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(parserStream._parser as any).error = null;
parserStream._parser.resume();
}
});

sourceStream.pipe(parserStream);
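Earlier in this file, the expected content-type was switched from a single string to a list, which is what lets application/xml responses pass sitemap detection (exercised by the new sitemap-parse test further down). A standalone example of that check (the helper is made up; the reader inlines the same logic in tryFetch):

// Parameters such as charset are stripped before comparing against the
// accepted content types; a missing header is not treated as a mismatch.
const XML_CONTENT_TYPES = ["text/xml", "application/xml"];

function isAcceptedContentType(ct: string | null, expected: string[]): boolean {
  if (!ct) {
    return true;
  }
  return expected.includes(ct.split(";")[0]);
}

isAcceptedContentType("application/xml; charset=UTF-8", XML_CONTENT_TYPES); // true
isAcceptedContentType("text/html", XML_CONTENT_TYPES);                      // false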
src/util/state.ts (6 additions, 7 deletions)
@@ -97,6 +97,7 @@ declare module "ioredis" {
pkey: string,
qkey: string,
skey: string,
esKey: string,
url: string,
score: number,
data: string,
@@ -199,9 +200,9 @@ export class RedisCrawlState {

_initLuaCommands(redis: Redis) {
redis.defineCommand("addqueue", {
numberOfKeys: 3,
numberOfKeys: 4,
lua: `
local size = redis.call('scard', KEYS[3]);
local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
local limit = tonumber(ARGV[4]);
if limit > 0 and size >= limit then
return 1;
@@ -506,6 +507,7 @@
this.pkey,
this.qkey,
this.skey,
this.esKey,
url,
this._getScore(data),
JSON.stringify(data),
@@ -624,7 +626,8 @@

for (const result of someResults) {
const json = JSON.parse(result);
seenSet.delete(json.url);
//for extra seeds
seenSet.delete(json.url || json.newUrl);
results.push(result);
}
}
@@ -722,10 +725,6 @@
return parseInt(done || "0");
}

async numSeen() {
return await this.redis.scard(this.skey);
}

async numPending() {
const res = await this.redis.hlen(this.pkey);

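The addqueue change above subtracts the length of the extra-seeds list from the seen-set size so that redirected seeds are not billed against --limit. In TypeScript terms the check is roughly the following (a sketch of the Lua logic that runs inside Redis, not code from this PR):

// KEYS[3] is the seen set and KEYS[4] the extra-seeds list; an extra-seed
// entry (e.g. {"origSeedId":0,"newUrl":"https://webrecorder.net/"}) is
// presumably also counted in the seen set, so its slot is given back before
// comparing against the configured page limit.
function queueIsFull(seenSetSize: number, extraSeedsLen: number, limit: number): boolean {
  const size = seenSetSize - extraSeedsLen;
  return limit > 0 && size >= limit;
}

queueIsFull(10, 0, 10); // true  -> addqueue returns 1 and the URL is not queued
queueIsFull(10, 1, 10); // false -> the extra-seed entry leaves room for one more page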
tests/saved-state.test.js (34 additions, 17 deletions)
@@ -4,17 +4,15 @@ import path from "path";
import yaml from "js-yaml";
import Redis from "ioredis";


const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";


function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}

async function waitContainer(containerId) {
try {
execSync(`docker kill -s SIGINT ${containerId}`);
} catch (e) {
return;
}

async function waitContainerDone(containerId) {
// containerId is initially the full id, but docker ps
// only prints the short id (first 12 characters)
containerId = containerId.slice(0, 12);
@@ -32,6 +30,17 @@ async function waitContainer(containerId) {
}
}

async function killContainer(containerId) {
try {
execSync(`docker kill -s SIGINT ${containerId}`);
} catch (e) {
return;
}

await waitContainerDone(containerId);
}


let savedStateFile;
let state;
let numDone;
@@ -43,16 +52,14 @@ test("check crawl interrupted + saved state written", async () => {

try {
containerId = execSync(
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
{ encoding: "utf-8" },
//wait.callback,
);
} catch (error) {
console.log(error);
}

const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";

// remove existing pagesFile to support reentrancy
try {
fs.unlinkSync(pagesFile);
@@ -77,7 +84,7 @@ test("check crawl interrupted + saved state written", async () => {
await sleep(500);
}

await waitContainer(containerId);
await killContainer(containerId);

const savedStates = fs.readdirSync(
"test-crawls/collections/int-state-test/crawls",
@@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {

const saved = yaml.load(savedState);

expect(!!saved.state).toBe(true);
state = saved.state;
numDone = state.finished.length;
finished = state.finished;

numDone = finished.length;
numQueued = state.queued.length;

expect(!!state).toBe(true);
expect(numDone > 0).toEqual(true);
expect(numQueued > 0).toEqual(true);
expect(numDone + numQueued).toEqual(10);
@@ -110,8 +119,6 @@
expect(state.extraSeeds).toEqual([
`{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
]);

finished = state.finished;
});

test("check crawl restarted with saved state", async () => {
@@ -121,7 +128,7 @@

try {
containerId = execSync(
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
{ encoding: "utf-8" },
);
} catch (error) {
@@ -148,6 +155,16 @@ test("check crawl restarted with saved state", async () => {
} catch (e) {
console.log(e);
} finally {
await waitContainer(containerId);
await waitContainerDone(containerId);
}
});

test("ensure correct number of pages was written", () => {
const pages = fs
.readFileSync(pagesFile, { encoding: "utf-8" })
.trim()
.split("\n");

// first line is the header
expect(pages.length).toBe(10 + 1);
});
tests/sitemap-parse.test.js (15 additions, 2 deletions)
@@ -29,8 +29,8 @@ async function waitContainer(containerId) {
}
}

async function runCrawl(numExpected, url, sitemap="", limit=0) {
const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

await sleep(3000);

@@ -57,6 +57,10 @@ async function runCrawl(numExpected, url, sitemap="", limit=0) {
}

expect(finished).toBeGreaterThanOrEqual(numExpected);

if (numExpectedLessThan) {
expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
}
}

test("test sitemap fully finish", async () => {
@@ -70,3 +74,12 @@ test("test sitemap with limit", async () => {
test("test sitemap with limit, specific URL", async () => {
await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
});

test("test sitemap with application/xml content-type", async () => {
await runCrawl(10, "https://bitarchivist.net/", "", 0);
});


test("test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});