Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to using JS WACZ #505

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions Dockerfile
Expand Up @@ -20,16 +20,16 @@ ENV PROXY_HOST=localhost \

WORKDIR /app

ADD requirements.txt /app/
RUN pip install -U setuptools; pip install -r requirements.txt
#ADD requirements.txt /app/
#RUN pip install -U setuptools; pip install -r requirements.txt

ADD package.json /app/

# to allow forcing rebuilds from this stage
ARG REBUILD

# Prefetch tldextract so pywb is able to boot in environments with limited internet access
RUN tldextract --update
#RUN tldextract --update

# Download and format ad host blocklist as JSON
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
Expand Down
4 changes: 4 additions & 0 deletions package.json
Expand Up @@ -16,6 +16,7 @@
"prepare": "husky install"
},
"dependencies": {
"@harvard-lil/js-wacz": "^0.1.0",
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.16.12",
Expand Down Expand Up @@ -60,5 +61,8 @@
"jest": {
"transform": {},
"testTimeout": 90000
},
"resolutions": {
"wrap-ansi": "7.0.0"
}
}
165 changes: 73 additions & 92 deletions src/crawler.ts
Expand Up @@ -3,6 +3,7 @@ import path from "path";
import fs, { WriteStream } from "fs";
import os from "os";
import fsp from "fs/promises";
import readline from "readline";

import {
RedisCrawlState,
Expand All @@ -18,6 +19,9 @@ import yaml from "js-yaml";

import * as warcio from "warcio";

// @ts-expect-error TODO fill in why error is expected
import { WACZ } from "@harvard-lil/js-wacz";

import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import {
Expand All @@ -31,7 +35,7 @@ import {
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { logger, formatErr, LogDetails } from "./util/logger.js";
import { logger, formatErr, LogDetails, WACZLogger } from "./util/logger.js";
import {
WorkerOpts,
WorkerState,
Expand Down Expand Up @@ -1284,11 +1288,6 @@ self.__bx_behaviors.selectMainBehavior();
"Non-Seed Pages",
);

this.adBlockRules = new AdBlockRules(
this.captureBasePrefix,
this.params.adBlockMessage,
);

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(
this.params.blockRules,
Expand Down Expand Up @@ -1412,31 +1411,31 @@ self.__bx_behaviors.selectMainBehavior();
}

if (this.params.generateCDX) {
// just move cdx files from tmp-cdx -> indexes at this point
logger.info("Generating CDX");
await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });

const indexer = new warcio.CDXIndexer({ format: "cdxj" });

await this.crawlState.setStatus("generate-cdx");

const warcList = await fsp.readdir(this.archivesDir);
const warcListFull = warcList.map((filename) =>
path.join(this.archivesDir, filename),
);
const indexesDir = path.join(this.collDir, "indexes");
await fsp.mkdir(indexesDir, { recursive: true });

//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
const params = [
"-o",
path.join(this.collDir, "indexes", "index.cdxj"),
...warcListFull,
];
const indexResult = await this.awaitProcess(
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
);
if (indexResult === 0) {
logger.debug("Indexing complete, CDX successfully created");
} else {
logger.error("Error indexing and generating CDX", {
"status code": indexResult,
});
}
const indexFile = path.join(indexesDir, "index.cdxj");
const destFh = fs.createWriteStream(indexFile);

const archiveDir = path.join(this.collDir, "archive");
const archiveFiles = await fsp.readdir(archiveDir);
const warcFiles = archiveFiles.filter((f) => f.endsWith(".warc.gz"));

const files = warcFiles.map((warcFile) => {
return {
reader: fs.createReadStream(path.join(archiveDir, warcFile)),
filename: warcFile,
};
});

await indexer.writeAll(files, destFh);
}

logger.info("Crawling done");
Expand All @@ -1459,6 +1458,13 @@ self.__bx_behaviors.selectMainBehavior();
}
}

// remove tmp-cdx, now that it's already been added to the WACZ and/or
// copied to indexes
await fsp.rm(this.tempCdxDir, {
recursive: true,
force: true,
});

if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
this.done = true;
logger.info("All done, waiting for signal...");
Expand Down Expand Up @@ -1515,69 +1521,45 @@ self.__bx_behaviors.selectMainBehavior();
const waczFilename = this.params.collection.concat(".wacz");
const waczPath = path.join(this.collDir, waczFilename);

const createArgs = [
"create",
"-o",
waczPath,
"--pages",
this.seedPagesFile,
"--extra-pages",
this.otherPagesFile,
"--copy-pages",
"--log-directory",
this.logDir,
];
const waczLogger = new WACZLogger(logger);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const waczOpts: Record<string, any> = {
input: warcFileList.map((x) => path.join(this.archivesDir, x)),
output: waczPath,
pages: this.pagesDir,
detectPages: false,
indexFromWARCs: false,
logDirectory: this.logDir,
log: waczLogger,
};

if (process.env.WACZ_SIGN_URL) {
createArgs.push("--signing-url");
createArgs.push(process.env.WACZ_SIGN_URL);
waczOpts.signingUrl = process.env.WACZ_SIGN_URL;
if (process.env.WACZ_SIGN_TOKEN) {
createArgs.push("--signing-token");
createArgs.push(process.env.WACZ_SIGN_TOKEN);
waczOpts.signingToken = process.env.WACZ_SIGN_TOKEN;
}
}

if (this.params.title) {
createArgs.push("--title");
createArgs.push(this.params.title);
waczOpts.title = this.params.title;
}

if (this.params.description) {
createArgs.push("--desc");
createArgs.push(this.params.description);
waczOpts.description = this.params.description;
}

createArgs.push("-f");

warcFileList.forEach((val) =>
createArgs.push(path.join(this.archivesDir, val)),
);

// create WACZ
const waczResult = await this.awaitProcess(
child_process.spawn("wacz", createArgs, { detached: RUN_DETACHED }),
);

if (waczResult !== 0) {
logger.error("Error creating WACZ", { "status code": waczResult });
try {
const wacz = new WACZ(waczOpts);
await this._addCDXJ(wacz);
await wacz.process();
} catch (e) {
logger.error("Error creating WACZ", e);
logger.fatal("Unable to write WACZ successfully");
}

logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);

// Verify WACZ
/*
const validateArgs = ["validate"];
validateArgs.push("-f");
validateArgs.push(waczPath);

const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));

if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult);
logger.fatal("Unable to verify WACZ created successfully");
}
*/
if (this.storage) {
await this.crawlState.setStatus("uploading-wacz");
const filename = process.env.STORE_FILENAME || "@ts-@id.wacz";
Expand All @@ -1590,29 +1572,28 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

awaitProcess(proc: ChildProcess) {
const stdout: string[] = [];
const stderr: string[] = [];
// todo: replace with js-wacz impl eventually
async _addCDXJ(wacz: WACZ) {
const dirPath = this.tempCdxDir;

proc.stdout!.on("data", (data) => {
stdout.push(data.toString());
});
try {
const cdxjFiles = await fsp.readdir(dirPath);

proc.stderr!.on("data", (data) => {
stderr.push(data.toString());
});
for (let i = 0; i < cdxjFiles.length; i++) {
const cdxjFile = path.join(dirPath, cdxjFiles[i]);

return new Promise((resolve) => {
proc.on("close", (code) => {
if (stdout.length) {
logger.debug(stdout.join("\n"));
}
if (stderr.length && this.params.logging.includes("debug")) {
logger.debug(stderr.join("\n"));
logger.debug(`CDXJ: Reading entries from ${cdxjFile}`);
const rl = readline.createInterface({
input: fs.createReadStream(cdxjFile),
});

for await (const line of rl) {
wacz.addCDXJ(line + "\n");
}
resolve(code);
});
});
}
} catch (err) {
logger.error("CDXJ Indexing Error", err);
}
}

logMemory() {
Expand Down
33 changes: 33 additions & 0 deletions src/util/logger.ts
Expand Up @@ -51,6 +51,7 @@ export const LOG_CONTEXT_TYPES = [
"crawlStatus",
"links",
"sitemap",
"wacz",
"replay",
] as const;

Expand Down Expand Up @@ -173,6 +174,14 @@ class Logger {
}
}

trace(content: string | object, context: LogContext = "general") {
if (typeof content === "string") {
this.logAsJSON(content, {}, context, "trace");
} else {
this.logAsJSON("Stacktrace", content, context, "trace");
}
}

fatal(
message: string,
data = {},
Expand All @@ -191,3 +200,27 @@ class Logger {
}

export const logger = new Logger();

export class WACZLogger {
logger: Logger;

constructor(loggerInstance: Logger) {
this.logger = loggerInstance;
}

info(message: string) {
logger.info(message, {}, "wacz");
}

error(message: string) {
logger.error(message, {}, "wacz");
}

warn(message: string) {
logger.warn(message, {}, "wacz");
}

trace(content: string | object) {
logger.trace(content, "wacz");
}
}
2 changes: 0 additions & 2 deletions src/util/warcwriter.ts
Expand Up @@ -58,8 +58,6 @@ export class WARCWriter implements IndexerOffsetLength {
}) {
this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir;
// for now, disabling CDX
this.tempCdxDir = undefined;
this.logDetails = logDetails;
this.gzip = gzip;
this.rolloverSize = rolloverSize;
Expand Down
6 changes: 3 additions & 3 deletions tests/basic_crawl.test.js
Expand Up @@ -8,9 +8,9 @@ test("ensure basic crawl run with docker run passes", async () => {
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
);

child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
);
// child_process.execSync(
// "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
// );

child_process.execSync(
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
Expand Down
6 changes: 3 additions & 3 deletions tests/mult_url_crawl_with_favicon.test.js
Expand Up @@ -6,9 +6,9 @@ test("ensure multi url crawl run with docker run passes", async () => {
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
);

child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
);
// child_process.execSync(
// "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
// );
});

test("check that the favicon made it into the pages jsonl file", () => {
Expand Down
10 changes: 5 additions & 5 deletions tests/redis_crawl_state.js
Expand Up @@ -12,8 +12,8 @@ test("ensure crawl run with redis passes", async () => {
redis.kill("SIGINT");
});

test("check that wacz created is valid", () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
);
});
// test("check that wacz created is valid", () => {
// child_process.execSync(
// "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
// );
// });