Fixes redirected seed (from #475) being counted against page limit #509

Merged (2 commits) on Mar 26, 2024
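
The bug in brief: when a seed URL redirects (here, https://www.webrecorder.net/ to https://webrecorder.net/), the crawler stores the final URL as an "extra seed" and also adds it to the seen set, so the seen-set size overcounted crawled pages by one per redirected seed and the crawl stopped short of `--limit`. The fix subtracts the extra-seeds list length wherever the seen-set size is compared against the page limit. A minimal TypeScript sketch of the corrected check (names simplified and invented; the real check lives in the Lua script shown below):

```ts
// Simplified sketch of the page-limit gate, not the actual RedisCrawlState API.
// A redirected seed appears in both the seen set and the extra-seeds list,
// so it must be subtracted before comparing against the limit.
function atPageLimit(seenCount: number, extraSeedCount: number, limit: number): boolean {
  const effectiveCount = seenCount - extraSeedCount;
  return limit > 0 && effectiveCount >= limit;
}

// With --limit 10 and one redirected seed among 10 seen URLs, queueing is
// still allowed (before this fix, the raw count of 10 hit the limit early):
console.log(atPageLimit(10, 1, 10)); // false
```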
package.json: 2 changes (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
src/util/state.ts: 13 changes (6 additions, 7 deletions)
@@ -91,6 +91,7 @@ declare module "ioredis" {
   pkey: string,
   qkey: string,
   skey: string,
+  esKey: string,
   url: string,
   score: number,
   data: string,
@@ -193,9 +194,9 @@ export class RedisCrawlState {

   _initLuaCommands(redis: Redis) {
     redis.defineCommand("addqueue", {
-      numberOfKeys: 3,
+      numberOfKeys: 4,
       lua: `
-local size = redis.call('scard', KEYS[3]);
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
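
In the updated script, `KEYS[3]` is the seen-URL set (`skey`) and the new `KEYS[4]` is the extra-seeds list (`esKey`), both passed at the call site further down. The same computation expressed with plain ioredis calls, as a non-atomic sketch for clarity (the real check runs atomically inside the `addqueue` script):

```ts
import Redis from "ioredis";

// Non-atomic equivalent of `scard(KEYS[3]) - llen(KEYS[4])` from the Lua above:
// unique URLs ever queued, minus redirected-seed records counted in both places.
async function effectiveSeenCount(redis: Redis, skey: string, esKey: string): Promise<number> {
  const seen = await redis.scard(skey); // members of the seen set
  const extraSeeds = await redis.llen(esKey); // entries in the extra-seeds list
  return seen - extraSeeds;
}
```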
@@ -486,6 +487,7 @@ return 0;
       this.pkey,
       this.qkey,
       this.skey,
+      this.esKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -604,7 +606,8 @@ return 0;

     for (const result of someResults) {
       const json = JSON.parse(result);
-      seenSet.delete(json.url);
+      // for extra seeds
+      seenSet.delete(json.url || json.newUrl);
       results.push(result);
     }
   }
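
This loop deserializes records of two shapes: regular entries carry a `url` field, while extra-seed records are stored as `{"origSeedId": ..., "newUrl": ...}` (the exact string asserted in the test below). The fallback to `json.newUrl` lets both shapes be matched against the seen set; the surrounding function is only partially shown in this diff. An illustrative sketch:

```ts
// Illustrative record shapes for the loop above (type names are invented):
type QueueRecord = { url: string };
type ExtraSeedRecord = { origSeedId: number; newUrl: string };

const someResults: string[] = [
  `{"url":"https://webrecorder.net/about"}`,
  `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`, // extra-seed shape
];

const seenSet = new Set(["https://webrecorder.net/about", "https://webrecorder.net/"]);

for (const result of someResults) {
  const json = JSON.parse(result) as Partial<QueueRecord & ExtraSeedRecord>;
  seenSet.delete(json.url || json.newUrl || ""); // extra seeds have newUrl, not url
}
console.log(seenSet.size); // 0, both shapes matched against the seen set
```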
@@ -702,10 +705,6 @@ return 0;
     return parseInt(done || "0");
   }
 
-  async numSeen() {
-    return await this.redis.scard(this.skey);
-  }
-
   async numPending() {
     const res = await this.redis.hlen(this.pkey);
 
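
`numSeen()` returned a raw `SCARD` of the seen set, which now overcounts by one per redirected seed; the PR simply deletes it. Purely as a hypothetical, a corrected accessor would have to apply the same subtraction as the Lua script (this method is not part of the PR):

```ts
import Redis from "ioredis";

// Hypothetical corrected accessor, NOT in this PR (which removes numSeen outright):
async function numSeen(redis: Redis, skey: string, esKey: string): Promise<number> {
  // seen-set size minus redirected-seed records, mirroring the addqueue check
  return (await redis.scard(skey)) - (await redis.llen(esKey));
}
```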
tests/saved-state.test.js: 51 changes (34 additions, 17 deletions)
@@ -4,17 +4,15 @@ import path from "path";
 import yaml from "js-yaml";
 import Redis from "ioredis";
 
+const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
+
 
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
-async function waitContainer(containerId) {
-  try {
-    execSync(`docker kill -s SIGINT ${containerId}`);
-  } catch (e) {
-    return;
-  }
-
+async function waitContainerDone(containerId) {
   // containerId is initially the full id, but docker ps
   // only prints the short id (first 12 characters)
   containerId = containerId.slice(0, 12);
@@ -32,6 +30,17 @@ async function waitContainer(containerId) {
   }
 }
 
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
+
 let savedStateFile;
 let state;
 let numDone;
@@ -43,16 +52,14 @@ test("check crawl interrupted + saved state written", async () => {

   try {
     containerId = execSync(
-      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
+      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
       { encoding: "utf-8" },
       //wait.callback,
     );
   } catch (error) {
     console.log(error);
   }
 
-  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
-
   // remove existing pagesFile to support reentrancy
   try {
     fs.unlinkSync(pagesFile);
@@ -77,7 +84,7 @@ test("check crawl interrupted + saved state written", async () => {
     await sleep(500);
   }
 
-  await waitContainer(containerId);
+  await killContainer(containerId);
 
   const savedStates = fs.readdirSync(
     "test-crawls/collections/int-state-test/crawls",
@@ -97,11 +104,13 @@ test("check parsing saved state + page done + queue present", () => {

   const saved = yaml.load(savedState);
 
+  expect(!!saved.state).toBe(true);
   state = saved.state;
-  numDone = state.finished.length;
+  finished = state.finished;
+
+  numDone = finished.length;
   numQueued = state.queued.length;
 
-  expect(!!state).toBe(true);
   expect(numDone > 0).toEqual(true);
   expect(numQueued > 0).toEqual(true);
   expect(numDone + numQueued).toEqual(10);
@@ -110,16 +119,14 @@ expect(state.extraSeeds).toEqual([
   expect(state.extraSeeds).toEqual([
     `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
   ]);
-
-  finished = state.finished;
 });
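
For reference, the shape of the saved state this test parses, expressed as a TypeScript interface (field names taken from the assertions above; the extraSeeds entry is asserted verbatim, everything else is illustrative):

```ts
// Shape of `saved.state` as exercised by this test (illustrative only):
interface SavedCrawlState {
  finished: string[]; // URLs already crawled before the interrupt
  queued: string[]; // serialized entries still waiting in the queue
  extraSeeds: string[]; // e.g. `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`
}
```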

test("check crawl restarted with saved state", async () => {
let containerId = null;

try {
containerId = execSync(
`docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
`docker run -d -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
{ encoding: "utf-8" },
);
} catch (error) {
@@ -149,7 +156,7 @@ test("check crawl restarted with saved state", async () => {
   } catch (e) {
     console.log(e);
   } finally {
-    await waitContainer(containerId);
+    await waitContainerDone(containerId);
 
     try {
       await redis.disconnect();
@@ -158,3 +165,13 @@ }
     }
   }
 });
+
+test("ensure correct number of pages was written", () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(10 + 1);
+});
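
The new assertion relies on pages.jsonl containing one header line followed by one JSON line per crawled page, so an interrupted-then-resumed crawl with `--limit 10` must end up with exactly 11 lines. A small helper sketch with the same arithmetic (the JSONL header format itself is not spelled out in this diff):

```ts
import fs from "fs";

// Count page entries in a pages.jsonl file: total non-empty lines minus the header.
function countPages(path: string): number {
  const lines = fs.readFileSync(path, { encoding: "utf-8" }).trim().split("\n");
  return lines.length - 1; // first line is the header
}

// e.g. countPages("test-crawls/collections/int-state-test/pages/pages.jsonl") === 10
```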