Modify --pages option to copy pages files directly into WACZ #92

Merged
merged 6 commits on Mar 22, 2024
Changes from 4 commits
27 changes: 3 additions & 24 deletions bin/cli.js
@@ -37,8 +37,8 @@ program.command('create')
'Path to output .wacz file.', 'archive.wacz')
.option(
'-p --pages <string>',
'Path to a jsonl files to be used to replace pages.jsonl. ' +
'If not provided, js-wacz will attempt to detect pages.')
'Path to a directory of pages JSONL files to copy into WACZ as-is. ' +
'If --pages is not provided, js-wacz will attempt to detect pages.')
.option(
'--url <string>',
'If provided, will be used as the "main page url" in datapackage.json.')
@@ -115,35 +115,14 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
return
}

// Ingest user-provided pages.jsonl file, if any.
if (values?.pages) {
try {
log.info(`pages.jsonl: Reading entries from ${values?.pages}`)
const rl = readline.createInterface({ input: createReadStream(values.pages) })

for await (const line of rl) {
const page = JSON.parse(line)

if (!page?.url) {
continue
}

log.info(`Adding ${page.url}.`)
archive.addPage(page?.url, page?.title, page?.ts)
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided pages.jsonl.')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
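
For context on how the reworked --pages option is consumed: the CLI forwards it to the WACZ constructor as the `pages` option. A minimal sketch of the equivalent programmatic use, based on the constructor options and process() call exercised in the tests further down; the input glob, output path and pages directory are illustrative values:

```js
import { WACZ } from './index.js' // imported from the local module, as in the tests

const archive = new WACZ({
  input: 'collection/archive/*.warc.gz', // WARCs to package (illustrative glob)
  output: 'collection/archive.wacz',
  // Directory whose JSONL files (e.g. pages.jsonl, extraPages.jsonl) are
  // copied into the WACZ as-is; providing it disables automatic page detection.
  pages: 'collection/pages/'
})

await archive.process() // indexes the WARCs and writes the .wacz to disk
```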
18 changes: 18 additions & 0 deletions constants.js
@@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url))
*/
export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}`

/**
* Path to the fixtures folder pages sub-directory.
* @constant
*/
export const PAGES_DIR_FIXTURES_PATH = `${FIXTURES_PATH}pages${sep}`

/**
* Path to the pages.jsonl fixture
* @constant
*/
export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl`

/**
* Path to the extraPages.jsonl fixture
* @constant
*/
export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl`

/**
* Colors scheme for log level.
* @constant
4 changes: 4 additions & 0 deletions fixtures/pages/extraPages.jsonl
@@ -0,0 +1,4 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"}
{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
2 changes: 2 additions & 0 deletions fixtures/pages/invalid.jsonl
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions fixtures/pages/invalid.txt
@@ -0,0 +1 @@
Not a JSONL file
2 changes: 2 additions & 0 deletions fixtures/pages/pages.jsonl
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"}
65 changes: 63 additions & 2 deletions index.js
@@ -3,7 +3,8 @@
import fs from 'fs/promises'
import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line
import { createHash } from 'crypto'
import { basename, sep } from 'path'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'

import { Deflate } from 'pako'
import { globSync } from 'glob'
@@ -177,6 +178,12 @@ export class WACZ {
*/
archiveStream = null

/**
* Path to directory of pages JSONL files to copy as-is into WACZ.
* @type {?string}
*/
pagesDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
@@ -276,6 +283,11 @@
this.detectPages = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}
@@ -359,7 +371,11 @@
await this.writeIndexesToZip()

info('Writing pages.jsonl to WACZ')
await this.writePagesToZip()
if (!this.pagesDir) {
await this.writePagesToZip()
} else {
await this.copyPagesFilesToZip()
}

info('Writing WARCs to WACZ')
await this.writeWARCsToZip()
@@ -582,6 +598,51 @@
}
}

/**
* Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP.
* @returns {Promise<void>}
*/
copyPagesFilesToZip = async () => {
this.stateCheck()

const { pagesDir, log, addFileToZip } = this

if (!pagesDir) {
throw new Error('Error copying pages files, no directory specified.')
}

const pagesFiles = await fs.readdir(pagesDir)

for (let i = 0; i < pagesFiles.length; i++) {
const filename = pagesFiles[i]
const filenameLower = filename.toLowerCase()
const pagesFile = resolve(this.pagesDir, filename)

if (!filenameLower.endsWith('.jsonl')) {
log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`)
continue
}

let isValidJSONL = true

// Ensure file is valid JSONL
const rl = readline.createInterface({ input: createReadStream(pagesFile) })
for await (const line of rl) {
try {
JSON.parse(line)

Review comment thread on this line:

Collaborator:
Thanks for the edits @tw4l!

I think we're almost there - we should maybe add a little bit of validation against the spec, just to make sure those are indeed pages files.

We could check:

  • That the first item contains format and id properties
  • That subsequent entries contain url and ts properties

Maybe using Node's assert since we're in a try / catch ?

What do you think?
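
A rough sketch of the check being proposed here, using Node's strict assert inside the existing per-line try/catch (the `lineIndex` counter is hypothetical, and this is not necessarily the code that was ultimately pushed):

```js
import { strict as assert } from 'node:assert'

// Inside the for-await loop over the lines of a pages file;
// `lineIndex` is a hypothetical counter starting at 0 for the first line.
const page = JSON.parse(line)

if (lineIndex === 0) {
  // The first entry is the header: it must declare a format and an id.
  assert(page.format)
  assert(page.id)
} else {
  // Subsequent entries are pages: each must carry a url and a timestamp.
  assert(page.url)
  assert(page.ts)
}
```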

Collaborator Author (tw4l):
Yeah nice suggestion! May as well get this right while we're focused on it :)

Collaborator Author (tw4l):
Commit pushed!

Collaborator Author (tw4l), Mar 21, 2024:
Ilya raised a related point: in older versions of the crawler, the pages files occasionally included invalid lines, we think because of text extraction that wasn't truncated.

It may be safer if less performant to filter per-line rather than per-file, but write valid lines as-is into the correct file in the WACZ. What do you think?
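
A rough sketch of the per-line alternative described above: skip unparseable lines and write only the valid ones into the WACZ. The `addStringToZip` helper is hypothetical (js-wacz's addFileToZip copies whole files from disk), and the PR ultimately kept the per-file validation instead:

```js
// Hypothetical per-line filtering inside copyPagesFilesToZip():
const validLines = []

const rl = readline.createInterface({ input: createReadStream(pagesFile) })
for await (const line of rl) {
  try {
    JSON.parse(line) // keep only well-formed JSONL entries
    validLines.push(line)
  } catch {
    log.warn(`Pages: Skipping invalid line in ${pagesFile}`)
  }
}

// Writing the filtered content back would need a string/Buffer variant of
// addFileToZip; `addStringToZip` is assumed here for illustration only.
await this.addStringToZip(validLines.join('\n') + '\n', `pages/${filename}`)
```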

Collaborator:
I'm now inclined to say that we can generate valid pages.jsonl and just fail if it's invalid, but yeah, another option would be, on first failure, to skip the failing lines and reserialize only valid ones (similar to old behavior). But not sure if it's needed at this point.

Collaborator:
I think either approach works for me; in both cases, we are unlikely to end up with invalid pages.jsonl files added to the archive. I am also not concerned about performance for this step.

Let me know if you'd like to add line-by-line filtering or not 😄 . Otherwise: this is great and I'm happy to test / approve / merge.

Thank you both!

Collaborator Author (tw4l):
Sounds like we're going to stick with per-file and just make sure the crawler isn't writing any invalid pages files to begin with, so feel free to test/approve/merge, thank you!

} catch (err) {
isValidJSONL = false
log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
break
}
}

if (isValidJSONL) {
await addFileToZip(pagesFile, `pages/${filename}`)
}
}
}

/**
* Streams all the files listed in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
45 changes: 44 additions & 1 deletion index.test.js
@@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
@@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
assert.equal(archive.detectPages, false)
assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

@@ -333,3 +339,40 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc
// Delete temp file
await fs.unlink(options.output)
})

test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => {
const options = {
input: FIXTURE_INPUT,
output: '../tmp.wacz',
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pages: PAGES_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)

await archive.process(false)

const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

// Files in the pages fixture directory that are invalid JSONL or have the wrong extension
// should not be copied into the WACZ.
await assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
await assert.rejects(async () => await zip.entryData('pages/invalid.txt'))

// pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures
// they were copied from.
const datapackage = JSON.parse(await zip.entryData('datapackage.json'))

const datapackagePages = datapackage.resources.filter(entry => entry.path === 'pages/pages.jsonl')[0]
const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH)
assert.equal(datapackagePages.hash, pagesFixtureHash)

const datapackageExtraPages = datapackage.resources.filter(entry => entry.path === 'pages/extraPages.jsonl')[0]
const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH)
assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash)

// Delete temp file
await fs.unlink(options.output)
})