diff --git a/bin/cli.js b/bin/cli.js index c117c37..47412d7 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -37,8 +37,8 @@ program.command('create') 'Path to output .wacz file.', 'archive.wacz') .option( '-p --pages ', - 'Path to a jsonl files to be used to replace pages.jsonl. ' + - 'If not provided, js-wacz will attempt to detect pages.') + 'Path to a directory of pages JSONL files to copy into WACZ as-is. ' + + 'If --pages is not provided, js-wacz will attempt to detect pages.') .option( '--url ', 'If provided, will be used as the "main page url" in datapackage.json.') @@ -115,6 +115,7 @@ program.command('create') description: values?.desc, signingUrl: values?.signingUrl, signingToken: values?.signingToken, + pages: values?.pages, log }) } catch (err) { @@ -122,28 +123,6 @@ program.command('create') return } - // Ingest user-provided pages.jsonl file, if any. - if (values?.pages) { - try { - log.info(`pages.jsonl: Reading entries from ${values?.pages}`) - const rl = readline.createInterface({ input: createReadStream(values.pages) }) - - for await (const line of rl) { - const page = JSON.parse(line) - - if (!page?.url) { - continue - } - - log.info(`Adding ${page.url}.`) - archive.addPage(page?.url, page?.title, page?.ts) - } - } catch (err) { - log.trace(err) - log.error('An error occurred while processing user-provided pages.jsonl.') - } - } - // Ingest user-provided CDX files, if any. if (values?.cdxj) { try { diff --git a/constants.js b/constants.js index a560fa0..539534c 100644 --- a/constants.js +++ b/constants.js @@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url)) */ export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}` +/** + * Path to the fixtures folder pages sub-directory. 
+ * @constant + */ +export const PAGES_DIR_FIXTURES_PATH = `${FIXTURES_PATH}pages${sep}` + +/** + * Path to the pages.jsonl fixture + * @constant + */ +export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl` + +/** + * Path to the extraPages.jsonl fixture + * @constant + */ +export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl` + /** * Colors scheme for log level. * @constant diff --git a/fixtures/pages/extraPages.jsonl b/fixtures/pages/extraPages.jsonl new file mode 100644 index 0000000..f0c15cc --- /dev/null +++ b/fixtures/pages/extraPages.jsonl @@ -0,0 +1,4 @@ +{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} +{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"} +{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} +{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/fixtures/pages/invalid.jsonl b/fixtures/pages/invalid.jsonl new file mode 100644 index 0000000..89930b9 --- /dev/null +++ b/fixtures/pages/invalid.jsonl @@ -0,0 +1,2 @@ +{id": "extra-pages", "title": "Extra Pages"} +{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/fixtures/pages/invalid.txt b/fixtures/pages/invalid.txt new file mode 100644 index 
0000000..f1fe4c4 --- /dev/null +++ b/fixtures/pages/invalid.txt @@ -0,0 +1 @@ +Not a JSONL file diff --git a/fixtures/pages/pages.jsonl b/fixtures/pages/pages.jsonl new file mode 100644 index 0000000..ffee2c7 --- /dev/null +++ b/fixtures/pages/pages.jsonl @@ -0,0 +1,2 @@ +{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"} +{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"} diff --git a/index.js b/index.js index 2378a9d..146b7d7 100644 --- a/index.js +++ b/index.js @@ -3,7 +3,9 @@ import fs from 'fs/promises' import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line import { createHash } from 'crypto' -import { basename, sep } from 'path' +import { basename, sep, resolve } from 'path' +import * as readline from 'node:readline/promises' +import assert from 'node:assert/strict' import { Deflate } from 'pako' import { globSync } from 'glob' @@ -177,6 +179,12 @@ export class WACZ { */ archiveStream = null + /** + * Path to directory of pages JSONL files to copy as-is into WACZ. + * @type {?string} + */ + pagesDir = null + /** * @param {WACZOptions} options - See {@link WACZOptions} for details. 
*/ @@ -276,6 +284,11 @@ export class WACZ { this.detectPages = false } + if (options?.pages) { + this.detectPages = false + this.pagesDir = String(options?.pages).trim() + } + if (options?.indexFromWARCs === false) { this.indexFromWARCs = false } @@ -359,7 +372,11 @@ export class WACZ { await this.writeIndexesToZip() info('Writing pages.jsonl to WACZ') - await this.writePagesToZip() + if (!this.pagesDir) { + await this.writePagesToZip() + } else { + await this.copyPagesFilesToZip() + } info('Writing WARCs to WACZ') await this.writeWARCsToZip() @@ -582,6 +599,62 @@ export class WACZ { } } + /** + * Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP. + * @returns {Promise<void>} + */ + copyPagesFilesToZip = async () => { + this.stateCheck() + + const { pagesDir, log, addFileToZip } = this + + if (!pagesDir) { + throw new Error('Error copying pages files, no directory specified.') + } + + const pagesFiles = await fs.readdir(pagesDir) + + for (let i = 0; i < pagesFiles.length; i++) { + const filename = pagesFiles[i] + const filenameLower = filename.toLowerCase() + const pagesFile = resolve(this.pagesDir, filename) + + if (!filenameLower.endsWith('.jsonl')) { + log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`) + continue + } + + let isValidJSONL = true + + // Ensure file is valid JSONL + const rl = readline.createInterface({ input: createReadStream(pagesFile) }) + let lineIndex = 0 + + for await (const line of rl) { + try { + const page = JSON.parse(line) + if (lineIndex === 0) { + assert(page.format) + assert(page.id) + } else { + assert(page.url) + assert(page.ts) + } + lineIndex++ + } catch (err) { + isValidJSONL = false + log.trace(err) + log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`) + break + } + } + + if (isValidJSONL) { + await addFileToZip(pagesFile, `pages/${filename}`) + } + } + } + /** + * Streams all the files listed in `this.WARCs` to the output ZIP. 
* @returns {Promise} diff --git a/index.test.js b/index.test.js index 7927c67..2eaf526 100644 --- a/index.test.js +++ b/index.test.js @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip' import * as dotenv from 'dotenv' import { WACZ } from './index.js' -import { FIXTURES_PATH } from './constants.js' +import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js' import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import // Loads env vars from .env if provided @@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) = assert.equal(archive.detectPages, false) }) +test('WACZ constructor accounts for options.pages if provided.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH }) + assert.equal(archive.detectPages, false) + assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH) +}) + test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => { const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}] @@ -333,3 +339,40 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc // Delete temp file await fs.unlink(options.output) }) + +test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => { + const options = { + input: FIXTURE_INPUT, + output: '../tmp.wacz', + url: 'https://lil.law.harvard.edu', + title: 'WACZ Title', + description: 'WACZ Description', + pages: PAGES_DIR_FIXTURES_PATH + } + + const archive = new WACZ(options) + + await archive.process(false) + + const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line + + // File in pages fixture directory that are invalid JSONL or have wrong extension + // should not be copied into the WACZ. 
+ await assert.rejects(async () => await zip.entryData('pages/invalid.jsonl')) + await assert.rejects(async () => await zip.entryData('pages/invalid.txt')) + + // pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures + // they were copied from. + const datapackage = JSON.parse(await zip.entryData('datapackage.json')) + + const datapackagePages = datapackage.resources.filter(entry => entry.path === 'pages/pages.jsonl')[0] + const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH) + assert.equal(datapackagePages.hash, pagesFixtureHash) + + const datapackageExtraPages = datapackage.resources.filter(entry => entry.path === 'pages/extraPages.jsonl')[0] + const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH) + assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash) + + // Delete temp file + await fs.unlink(options.output) +})