fixes redirected seed (from #475) being counted against page limit: (#509)

- subtract extraSeeds when computing limit
- don't include redirect seeds in seen list when serializing
- tests: adjust saved-state-test to also check total pages when crawl is done

fixes #508
(for 1.0.3 release)
ikreymer committed Mar 26, 2024
1 parent 22a7351 commit bf5cbb0
Showing 3 changed files with 41 additions and 25 deletions.
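
The heart of the fix is in the `addqueue` Lua script below: the page limit used to be checked against the raw size of the seen set, but a redirected seed (from #475) adds an extra entry to that set for the redirect target, so such crawls hit the limit one page early. The corrected check subtracts the length of the extra-seeds list first. A minimal sketch of the equivalent logic in TypeScript, with illustrative names (`isAtPageLimit`, `seenKey`, `extraSeedsKey`) that are not from the codebase:

import Redis from "ioredis";

// Sketch only: the real check runs atomically inside the "addqueue"
// Lua script; this mirrors its arithmetic with plain ioredis calls.
async function isAtPageLimit(
  redis: Redis,
  seenKey: string, // set of every URL ever queued (skey)
  extraSeedsKey: string, // list of serialized redirect seeds (esKey)
  limit: number,
): Promise<boolean> {
  const seen = await redis.scard(seenKey); // counts redirect targets too
  const extraSeeds = await redis.llen(extraSeedsKey);
  // a redirected seed contributes two seen entries but only one page,
  // so subtract the extra seeds before comparing against the limit
  return limit > 0 && seen - extraSeeds >= limit;
}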
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
13 changes: 6 additions & 7 deletions src/util/state.ts
@@ -91,6 +91,7 @@ declare module "ioredis" {
       pkey: string,
       qkey: string,
       skey: string,
+      esKey: string,
       url: string,
       score: number,
       data: string,
@@ -193,9 +194,9 @@ export class RedisCrawlState {
 
   _initLuaCommands(redis: Redis) {
     redis.defineCommand("addqueue", {
-      numberOfKeys: 3,
+      numberOfKeys: 4,
       lua: `
-local size = redis.call('scard', KEYS[3]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
@@ -486,6 +487,7 @@ return 0;
       this.pkey,
       this.qkey,
       this.skey,
+      this.esKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -604,7 +606,8 @@ return 0;
 
       for (const result of someResults) {
         const json = JSON.parse(result);
-        seenSet.delete(json.url);
+        //for extra seeds
+        seenSet.delete(json.url || json.newUrl);
         results.push(result);
       }
     }
@@ -702,10 +705,6 @@ return 0;
     return parseInt(done || "0");
   }
 
-  async numSeen() {
-    return await this.redis.scard(this.skey);
-  }
-
  async numPending() {
    const res = await this.redis.hlen(this.pkey);
 
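The serializer change above is the complement: extra-seed records are stored as JSON carrying `newUrl` rather than `url` (the test below checks for `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`), so pruning the seen set via `json.url || json.newUrl` also keeps redirect seeds out of the serialized seen list, and the `numSeen()` helper, which returned the raw seen-set size, is removed. A small sketch of the two record shapes involved; the `QueueEntry` type is invented here for illustration:

type QueueEntry =
  | { url: string } // regular queued page
  | { origSeedId: number; newUrl: string }; // redirect-seed record

// Mirrors `json.url || json.newUrl` from the diff above.
function entryUrl(entry: QueueEntry): string {
  return "url" in entry ? entry.url : entry.newUrl;
}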
51 changes: 34 additions & 17 deletions tests/saved-state.test.js
@@ -4,17 +4,15 @@ import path from "path";
 import yaml from "js-yaml";
 import Redis from "ioredis";
 
+const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-async function waitContainer(containerId) {
-  try {
-    execSync(`docker kill -s SIGINT ${containerId}`);
-  } catch (e) {
-    return;
-  }
-
+async function waitContainerDone(containerId) {
   // containerId is initially the full id, but docker ps
   // only prints the short id (first 12 characters)
   containerId = containerId.slice(0, 12);
@@ -32,6 +30,17 @@ async function waitContainer(containerId) {
   }
 }
 
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
+
 let savedStateFile;
 let state;
 let numDone;
@@ -43,16 +52,14 @@ test("check crawl interrupted + saved state written", async () => {
 
   try {
     containerId = execSync(
-      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
+      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
       { encoding: "utf-8" },
       //wait.callback,
     );
   } catch (error) {
     console.log(error);
   }
 
-  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
-
   // remove existing pagesFile to support reentrancy
   try {
     fs.unlinkSync(pagesFile);
@@ -77,7 +84,7 @@
     await sleep(500);
   }
 
-  await waitContainer(containerId);
+  await killContainer(containerId);
 
   const savedStates = fs.readdirSync(
     "test-crawls/collections/int-state-test/crawls",
@@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {
 
   const saved = yaml.load(savedState);
 
-  expect(!!saved.state).toBe(true);
   state = saved.state;
-  numDone = state.finished.length;
+  finished = state.finished;
+
+  numDone = finished.length;
   numQueued = state.queued.length;
 
+  expect(!!state).toBe(true);
   expect(numDone > 0).toEqual(true);
   expect(numQueued > 0).toEqual(true);
   expect(numDone + numQueued).toEqual(10);
@@ -110,16 +119,14 @@
   expect(state.extraSeeds).toEqual([
     `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
   ]);
-
-  finished = state.finished;
 });
 
 test("check crawl restarted with saved state", async () => {
   let containerId = null;
 
   try {
     containerId = execSync(
-      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      `docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -149,7 +156,7 @@ test("check crawl restarted with saved state", async () => {
   } catch (e) {
     console.log(e);
   } finally {
-    await waitContainer(containerId);
+    await waitContainerDone(containerId);
 
     try {
       await redis.disconnect();
@@ -158,3 +165,13 @@
     }
   }
 });
+
+test("ensure correct number of pages was written", () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(10 + 1);
+});
