-
-
Notifications
You must be signed in to change notification settings - Fork 71
/
sitemap-parse.test.js
85 lines (66 loc) · 2.36 KB
/
sitemap-parse.test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import child_process from "child_process";
import Redis from "ioredis";
/**
 * Pause an async caller for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that fulfills with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Send SIGINT to a docker container and wait until it no longer shows up
 * in `docker ps`.
 *
 * If the `docker kill` command itself fails, the function returns right
 * away without polling. Errors from `docker ps` are logged and the poll
 * is retried.
 *
 * @param {string} containerId - Full container id.
 * @returns {Promise<void>} Resolves once the container is no longer listed.
 */
async function waitContainer(containerId) {
  try {
    child_process.execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    // Signal could not be delivered: nothing to wait for.
    return;
  }

  // `docker ps -q` lists only the short id (first 12 characters),
  // whereas the caller hands us the full id.
  const shortId = containerId.slice(0, 12);

  let stillListed = true;
  while (stillListed) {
    try {
      const listing = child_process.execSync("docker ps -q", { encoding: "utf-8" });
      stillListed = listing.includes(shortId);
    } catch (err) {
      // Keep polling: a failed listing says nothing about the container.
      console.error(err);
    }

    if (stillListed) {
      await sleep(500);
    }
  }
}
/**
 * Start a detached browsertrix-crawler container for a sitemap crawl, poll
 * the crawler's Redis until enough entries show up, then stop the container
 * and assert on the observed count.
 *
 * The container's Redis port 6379 is published on host port 36381, and the
 * size of the sorted set `test:q` is polled (presumably the queue for
 * CRAWL_ID=test -- confirm against the crawler's Redis schema).
 *
 * @param {number} numExpected - Minimum `test:q` size that must be reached.
 * @param {string} url - Seed URL passed to `--url`.
 * @param {string} [sitemap=""] - Value for `--sitemap`; empty leaves the flag without a value.
 * @param {number} [limit=0] - Value for `--limit`.
 * @param {number} [numExpectedLessThan=0] - When non-zero, upper bound asserted on the observed count.
 * @param {string} [extra=""] - Extra CLI arguments appended verbatim to the crawl command.
 * @returns {Promise<void>}
 */
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});

  // Give the container a moment to start before the first connection attempt.
  await sleep(3000);

  // Client options belong on the constructor: `connect()` only accepts an
  // optional callback, so options handed to it are silently ignored.
  const redis = new Redis("redis://127.0.0.1:36381/0", {
    lazyConnect: true,
    retryStrategy: () => null,
    maxRetriesPerRequest: 100,
  });

  let finished = 0;

  try {
    await redis.connect();

    // Poll until the expected count is reached. If the connection goes away
    // first, `zcard` rejects and the last observed count is what gets asserted.
    while (true) {
      finished = await redis.zcard("test:q");

      if (finished >= numExpected) {
        break;
      }
    }
  } catch (e) {
    console.error(e);
  } finally {
    try {
      // Always stop the container so host port 36381 is free for the next test.
      await waitContainer(containerId);
    } finally {
      // Release the client socket explicitly instead of leaving it dangling.
      redis.disconnect();
    }
  }

  expect(finished).toBeGreaterThanOrEqual(numExpected);

  if (numExpectedLessThan) {
    expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
  }
}
// Each entry pairs a jest test title with the argument list forwarded
// verbatim to runCrawl (numExpected, url, sitemap, limit, ...).
const sitemapCases = [
  ["test sitemap fully finish", [8036, "https://www.mozilla.org/", "", 0]],
  ["test sitemap with limit", [1900, "https://www.mozilla.org/", "", 2000]],
  [
    "test sitemap with limit, specific URL",
    [1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000],
  ],
  ["test sitemap with application/xml content-type", [10, "https://bitarchivist.net/", "", 0]],
  [
    "test sitemap with narrow scope, extraHops, to ensure extraHops don't apply to sitemap",
    [1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"],
  ],
];

// Register the cases in declaration order, one jest test per entry.
for (const [title, crawlArgs] of sitemapCases) {
  test(title, async () => {
    await runCrawl(...crawlArgs);
  });
}