-
-
Notifications
You must be signed in to change notification settings - Fork 71
/
basic_crawl.test.js
122 lines (106 loc) · 3.67 KB
/
basic_crawl.test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import child_process from "child_process";
import fs from "fs";
import path from "path";
import md5 from "md5";
// Runs a crawl of https://example.com/ through the browsertrix-crawler docker
// image and then unzips the resulting WACZ next to the collection, into
// test-crawls/collections/wr-net/wacz.
test("ensure basic crawl run with docker run passes", async () => {
  const crawlCommand =
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix';

  const unzipCommand =
    "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz";

  // Disabled step, kept for reference (WACZ validation inside the container):
  //   docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz

  // execSync throws if a command fails, which is what fails this test.
  for (const command of [crawlCommand, unzipCommand]) {
    child_process.execSync(command);
  }
});
// Every file in the collection's archive directory must carry the
// "custom-prefix-" name prefix and be smaller than 10000 bytes (the values
// passed to the crawl as --warcPrefix and --rolloverSize).
test("check that individual WARCs have correct prefix and are under rollover size", () => {
  const archiveDir = "test-crawls/collections/wr-net/archive";
  const rolloverSize = 10000;

  const warcNames = fs.readdirSync(archiveDir);

  // Guard against a vacuous pass: with an empty directory the loop below
  // would not run a single assertion.
  expect(warcNames.length).toBeGreaterThan(0);

  for (const warcName of warcNames) {
    // Matchers receive the real values so a failure reports the offending
    // file name / size instead of just "expected true, received false".
    expect(warcName).toMatch(/^custom-prefix-/);

    const { size } = fs.statSync(path.join(archiveDir, warcName));
    expect(size).toBeLessThan(rolloverSize);
  }
});
// Passes when at least one entry of the collection directory has a name
// ending in "_0.warc.gz".
// NOTE(review): the title says "archive folder", but the directory listed
// here is the collection root (test-crawls/collections/wr-net) - confirm
// which one is intended before renaming either.
test("check that a combined warc file exists in the archive folder", () => {
  const collectionEntries = fs.readdirSync("test-crawls/collections/wr-net");

  const hasCombinedWarc = collectionEntries.some((entry) =>
    entry.endsWith("_0.warc.gz"),
  );

  expect(hasCombinedWarc).toBe(true);
});
// Looks at every file in the archive directory of the unzipped WACZ and
// passes when at least one of them is smaller than 10000 bytes.
test("check that a combined warc file is under the rolloverSize", () => {
  const archivedNames = fs.readdirSync(
    path.join("test-crawls/collections/wr-net/wacz", "archive"),
  );

  // Every file is stat'ed, even once a small enough one has been seen.
  const byteSizes = archivedNames.map(
    (name) =>
      fs.statSync(
        path.join("test-crawls/collections/wr-net/wacz/archive/", name),
      ).size,
  );

  const anyUnderLimit = byteSizes.some((bytes) => bytes < 10000);
  expect(anyUnderLimit ? 1 : 0).toEqual(1);
});
// The crawl output must contain pages/pages.jsonl inside the collection.
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
  const pagesFile = "test-crawls/collections/wr-net/pages/pages.jsonl";
  const isPresent = fs.existsSync(pagesFile);

  expect(isPresent).toBe(true);
});
// The unzipped WACZ must contain pages/pages.jsonl as well.
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
  const waczPagesFile = "test-crawls/collections/wr-net/wacz/pages/pages.jsonl";
  const isPresent = fs.existsSync(waczPagesFile);

  expect(isPresent).toBe(true);
});
// Compares the md5 of the "text" field taken from the second line of three
// pages.jsonl files: the one written into the collection, the one inside the
// unzipped WACZ, and the checked-in fixture. All three must be identical.
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
  // Reads a JSONL file, parses its second line (index 1) as JSON and hashes
  // that record's "text" property.
  // NOTE(review): presumably line 0 is a header record and line 1 the first
  // crawled page - confirm against the pages.jsonl format.
  const hashSecondLineText = (jsonlPath) => {
    const secondLine = fs.readFileSync(jsonlPath, "utf8").split("\n")[1];
    return md5(JSON.parse(secondLine)["text"]);
  };

  // Each name matches the file it is computed from.
  const waczHash = hashSecondLineText(
    "test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
  );
  const crawlHash = hashSecondLineText(
    "test-crawls/collections/wr-net/pages/pages.jsonl",
  );
  const fixtureHash = hashSecondLineText("tests/fixtures/pages.jsonl");

  expect(crawlHash).toEqual(fixtureHash);
  expect(crawlHash).toEqual(waczHash);
});
// datapackage.json in the unzipped WACZ must exist and carry the title and
// description that were passed to the crawl ("test title" / "test description").
test("check that the supplied title and description made it into datapackage.json", () => {
  const manifestPath = "test-crawls/collections/wr-net/wacz/datapackage.json";

  expect(fs.existsSync(manifestPath)).toBe(true);

  const { title, description } = JSON.parse(
    fs.readFileSync(manifestPath, "utf8"),
  );

  expect(title).toEqual("test title");
  expect(description).toEqual("test description");
});