// crawl.js — paginated JSON API crawler (49 lines)
// CLI setup.
// Usage: node crawl <url> <job-name> [start-page]
//   url        - base endpoint of a paginated JSON API
//   job-name   - subdirectory under data/ where pages are written
//   start-page - optional 1-based page to resume from (defaults to 1)
if (process.argv.length < 4) {
console.log("Usage: node crawl <url> <job-name> <start-page = 1>");
// Missing required arguments is an error: exit non-zero so shells/schedulers notice.
process.exit(1);
}
const url = process.argv[2];
const jobName = process.argv[3];
// Always pass a radix to parseInt; `?? '1'` supplies the default only when
// the argument is absent, and keeps the operand a string as parseInt expects.
const startPage = Number.parseInt(process.argv[4] ?? '1', 10);
const https = require('https');
const fs = require('fs');
// Main crawl loop: follow the API's `next` links starting from startPage,
// writing each raw JSON page to data/<jobName>/<page>.json until the
// response no longer carries a `next` URL.
(async function() {
try {
// Ensure the output directory exists up front; the original writes
// silently failed (callback-only error log) when it was missing.
await fs.promises.mkdir(`data/${jobName}`, { recursive: true });
let page = startPage;
let next = `${url}?format=json&page=${page}`;
while (next) {
const res = await get(next);
// Await the write so a disk failure aborts the crawl here instead of
// being lost in a fire-and-forget callback while the loop races ahead.
await fs.promises.writeFile(`data/${jobName}/${page}.json`, res, 'utf8');
// The API signals pagination via a `next` URL (null/absent on the last page).
next = JSON.parse(res).next;
page++;
}
} catch (err) {
console.error(err);
// Exit non-zero so callers can detect that the crawl did not complete.
process.exit(1);
}
})();
/**
 * Fetch a URL over HTTPS and resolve with the full response body as a string.
 *
 * @param {string} url - Absolute https:// URL to request.
 * @returns {Promise<string>} Response body decoded as UTF-8.
 * @throws Rejects on network errors or non-2xx HTTP status codes.
 */
function get(url) {
console.log('[GET] ' + url);
return new Promise((resolve, reject) => {
https.get(url, res => {
// Fail fast on HTTP errors; otherwise JSON.parse downstream chokes on
// an HTML error page with a far more confusing message.
if (res.statusCode < 200 || res.statusCode >= 300) {
res.resume(); // drain the response so the socket can be reused
reject(new Error(`Request failed with status ${res.statusCode}: ${url}`));
return;
}
const chunks = [];
res.on('data', chunk => {
chunks.push(chunk);
});
res.on('end', () => {
// Concatenate raw Buffers before decoding: joining with '' stringifies
// each chunk separately, corrupting multi-byte UTF-8 sequences that
// happen to be split across chunk boundaries.
resolve(Buffer.concat(chunks).toString('utf8'));
});
})
.on('error', err => {
reject(err);
});
});
}