From 216c46483770b9479e53ad2830840a3333d0b0fc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 20 Mar 2024 16:33:36 -0400 Subject: [PATCH 1/6] Add --pagesDir option to copy pages files directly into WACZ pages.jsonl and extraPages.jsonl files will be copied, other files are ignored. --- bin/cli.js | 8 ++++- constants.js | 18 ++++++++++ fixtures/pages/extraPages.jsonl | 4 +++ fixtures/pages/invalidName.jsonl | 2 ++ fixtures/pages/pages.jsonl | 2 ++ index.js | 59 ++++++++++++++++++++++++++++++-- index.test.js | 44 +++++++++++++++++++++++- 7 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 fixtures/pages/extraPages.jsonl create mode 100644 fixtures/pages/invalidName.jsonl create mode 100644 fixtures/pages/pages.jsonl diff --git a/bin/cli.js b/bin/cli.js index c117c37..1eaffb8 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -38,7 +38,12 @@ program.command('create') .option( '-p --pages ', 'Path to a jsonl files to be used to replace pages.jsonl. ' + - 'If not provided, js-wacz will attempt to detect pages.') + 'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.') + .option( + '--pages-dir ', + 'Path to a directory of pages files to copy into WACZ as-is. ' + + 'Only files nomed pages.jsonl or extraPages.jsonl will be copied. ' + + 'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.') .option( '--url ', 'If provided, will be used as the "main page url" in datapackage.json.') @@ -115,6 +120,7 @@ program.command('create') description: values?.desc, signingUrl: values?.signingUrl, signingToken: values?.signingToken, + pagesDir: values?.pagesDir, log }) } catch (err) { diff --git a/constants.js b/constants.js index a560fa0..539534c 100644 --- a/constants.js +++ b/constants.js @@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url)) */ export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}` +/** + * Path to the fixtures folder pages sub-directory. + * @constant + */ +export const PAGES_DIR_FIXTURES_PATH = `${FIXTURES_PATH}pages${sep}` + +/** + * Path to the pages.jsonl fixture + * @constant + */ +export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl` + +/** + * Path to the extraPages.jsonl fixture + * @constant + */ +export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl` + /** * Colors scheme for log level. * @constant diff --git a/fixtures/pages/extraPages.jsonl b/fixtures/pages/extraPages.jsonl new file mode 100644 index 0000000..f0c15cc --- /dev/null +++ b/fixtures/pages/extraPages.jsonl @@ -0,0 +1,4 @@ +{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} +{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"} +{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} +{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/fixtures/pages/invalidName.jsonl b/fixtures/pages/invalidName.jsonl new file mode 100644 index 0000000..14205f7 --- /dev/null +++ b/fixtures/pages/invalidName.jsonl @@ -0,0 +1,2 @@ +{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} +{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/fixtures/pages/pages.jsonl b/fixtures/pages/pages.jsonl new file mode 100644 index 0000000..ffee2c7 --- /dev/null +++ b/fixtures/pages/pages.jsonl @@ -0,0 +1,2 @@ +{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"} +{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"} diff --git a/index.js b/index.js index 2378a9d..6007cb4 100644 --- a/index.js +++ b/index.js @@ -3,7 +3,7 @@ import fs from 'fs/promises' import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line import { createHash } from 'crypto' -import { basename, sep } from 'path' +import { basename, sep, resolve } from 'path' import { Deflate } from 'pako' import { globSync } from 'glob' @@ -177,6 +177,12 @@ export class WACZ { */ archiveStream = null + /** + * Path to directory of pages.jsonl files to copy as-is into WACZ. + * @type {?string} + */ + pagesDir = null + /** * @param {WACZOptions} options - See {@link WACZOptions} for details. */ @@ -276,6 +282,11 @@ export class WACZ { this.detectPages = false } + if (options?.pagesDir) { + this.detectPages = false + this.pagesDir = String(options?.pagesDir).trim() + } + if (options?.indexFromWARCs === false) { this.indexFromWARCs = false } @@ -359,7 +370,11 @@ export class WACZ { await this.writeIndexesToZip() info('Writing pages.jsonl to WACZ') - await this.writePagesToZip() + if (!this.pagesDir) { + await this.writePagesToZip() + } else { + await this.copyPagesFilesToZip() + } info('Writing WARCs to WACZ') await this.writeWARCsToZip() @@ -582,6 +597,46 @@ export class WACZ { } } + /** + * Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP. + * @returns {Promise} + */ + copyPagesFilesToZip = async () => { + this.stateCheck() + + const { pagesDir, log, addFileToZip } = this + + const allowedPagesFiles = ['pages.jsonl', 'extraPages.jsonl'] + + if (!this.pagesDir) { + throw new Error('Error copying pages files, no directory specified.') + } + + try { + const pagesFiles = await fs.readdir(pagesDir) + + for (let i = 0; i < pagesFiles.length; i++) { + const filename = pagesFiles[i] + const pagesFile = resolve(this.pagesDir, filename) + + if (!allowedPagesFiles.includes(filename)) { + log.warn(`Pages: Skipping file ${pagesFile}, not pages.jsonl or extraPages.jsonl`) + continue + } + + let destination = 'pages/pages.jsonl' + if (filename === 'extraPages.jsonl') { + destination = 'pages/extraPages.jsonl' + } + + await addFileToZip(pagesFile, destination) + } + } catch (err) { + log.trace(err) + throw new Error('An error occurred while copying pages files into WACZ.') + } + } + /** * Streams all the files listes in `this.WARCs` to the output ZIP. * @returns {Promise} diff --git a/index.test.js b/index.test.js index 7927c67..0add326 100644 --- a/index.test.js +++ b/index.test.js @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip' import * as dotenv from 'dotenv' import { WACZ } from './index.js' -import { FIXTURES_PATH } from './constants.js' +import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js' import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import // Loads env vars from .env if provided @@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) = assert.equal(archive.detectPages, false) }) +test('WACZ constructor accounts for options.pagesDir if provided.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT, pagesDir: PAGES_DIR_FIXTURES_PATH }) + assert.equal(archive.detectPages, false) + assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH) +}) + test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => { const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}] @@ -333,3 +339,39 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc // Delete temp file await fs.unlink(options.output) }) + +test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => { + const options = { + input: FIXTURE_INPUT, + output: '../tmp.wacz', + url: 'https://lil.law.harvard.edu', + title: 'WACZ Title', + description: 'WACZ Description', + pagesDir: PAGES_DIR_FIXTURES_PATH + } + + const archive = new WACZ(options) + + await archive.process(false) + + const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line + + // File in pages fixture directory not named pages.jsonl or extraPages.jsonl + // should not exist in the WACZ. + assert.rejects(async() => await zip.entryData('pages/invalidName.jsonl')) + + // pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures + // they were copied from. + const datapackage = JSON.parse(await zip.entryData('datapackage.json')) + + const datapackagePages = datapackage.resources.filter(entry => entry.path === 'pages/pages.jsonl')[0] + const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH) + assert.equal(datapackagePages.hash, pagesFixtureHash) + + const datapackageExtraPages = datapackage.resources.filter(entry => entry.path === 'pages/extraPages.jsonl')[0] + const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH) + assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash) + + // Delete temp file + await fs.unlink(options.output) +}) From 6a61f723d4571c8056f770cb5df920d39d882fdb Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 20 Mar 2024 17:28:29 -0400 Subject: [PATCH 2/6] Fix linting --- index.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.test.js b/index.test.js index 0add326..98d4755 100644 --- a/index.test.js +++ b/index.test.js @@ -358,7 +358,7 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f // File in pages fixture directory not named pages.jsonl or extraPages.jsonl // should not exist in the WACZ. - assert.rejects(async() => await zip.entryData('pages/invalidName.jsonl')) + assert.rejects(async () => await zip.entryData('pages/invalidName.jsonl')) // pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures // they were copied from. From d8ba5f2e353c4493cb8180e47ad348f50fe5d429 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 20 Mar 2024 17:29:06 -0400 Subject: [PATCH 3/6] Fix help text typo --- bin/cli.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/cli.js b/bin/cli.js index 1eaffb8..643de35 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -42,7 +42,7 @@ program.command('create') .option( '--pages-dir ', 'Path to a directory of pages files to copy into WACZ as-is. ' + - 'Only files nomed pages.jsonl or extraPages.jsonl will be copied. ' + + 'Only files named pages.jsonl or extraPages.jsonl will be copied. ' + 'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.') .option( '--url ', From 02ef17a29d32eb67809920c6ccbfca4948f31a4f Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 21 Mar 2024 11:53:11 -0400 Subject: [PATCH 4/6] Make code review revisions - Replace existing -p/--pages implementation rather than adding another option - Rather than hardcoding allowed names, check that JSONL files passed have correct extension and are well-formed JSON lines - Modify tests and fixtures to account for new logic --- bin/cli.js | 33 ++---------- .../{invalidName.jsonl => invalid.jsonl} | 4 +- fixtures/pages/invalid.txt | 1 + index.js | 50 +++++++++++-------- index.test.js | 13 ++--- 5 files changed, 41 insertions(+), 60 deletions(-) rename fixtures/pages/{invalidName.jsonl => invalid.jsonl} (61%) create mode 100644 fixtures/pages/invalid.txt diff --git a/bin/cli.js b/bin/cli.js index 643de35..47412d7 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -37,13 +37,8 @@ program.command('create') 'Path to output .wacz file.', 'archive.wacz') .option( '-p --pages ', - 'Path to a jsonl files to be used to replace pages.jsonl. ' + - 'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.') - .option( - '--pages-dir ', - 'Path to a directory of pages files to copy into WACZ as-is. ' + - 'Only files named pages.jsonl or extraPages.jsonl will be copied. ' + - 'If neither --pages nor --pages-dir is provided, js-wacz will attempt to detect pages.') + 'Path to a directory of pages JSONL files to copy into WACZ as-is. ' + + 'If --pages is not provided, js-wacz will attempt to detect pages.') .option( '--url ', 'If provided, will be used as the "main page url" in datapackage.json.') @@ -120,7 +115,7 @@ program.command('create') description: values?.desc, signingUrl: values?.signingUrl, signingToken: values?.signingToken, - pagesDir: values?.pagesDir, + pages: values?.pages, log }) } catch (err) { @@ -128,28 +123,6 @@ program.command('create') return } - // Ingest user-provided pages.jsonl file, if any. - if (values?.pages) { - try { - log.info(`pages.jsonl: Reading entries from ${values?.pages}`) - const rl = readline.createInterface({ input: createReadStream(values.pages) }) - - for await (const line of rl) { - const page = JSON.parse(line) - - if (!page?.url) { - continue - } - - log.info(`Adding ${page.url}.`) - archive.addPage(page?.url, page?.title, page?.ts) - } - } catch (err) { - log.trace(err) - log.error('An error occurred while processing user-provided pages.jsonl.') - } - } - // Ingest user-provided CDX files, if any. if (values?.cdxj) { try { diff --git a/fixtures/pages/invalidName.jsonl b/fixtures/pages/invalid.jsonl similarity index 61% rename from fixtures/pages/invalidName.jsonl rename to fixtures/pages/invalid.jsonl index 14205f7..193f4ec 100644 --- a/fixtures/pages/invalidName.jsonl +++ b/fixtures/pages/invalid.jsonl @@ -1,2 +1,2 @@ -{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} -{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} +{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages" +{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/fixtures/pages/invalid.txt b/fixtures/pages/invalid.txt new file mode 100644 index 0000000..f1fe4c4 --- /dev/null +++ b/fixtures/pages/invalid.txt @@ -0,0 +1 @@ +Not a JSONL file diff --git a/index.js b/index.js index 6007cb4..fd24042 100644 --- a/index.js +++ b/index.js @@ -4,6 +4,7 @@ import fs from 'fs/promises' import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line import { createHash } from 'crypto' import { basename, sep, resolve } from 'path' +import * as readline from 'node:readline/promises' import { Deflate } from 'pako' import { globSync } from 'glob' @@ -178,7 +179,7 @@ export class WACZ { archiveStream = null /** - * Path to directory of pages.jsonl files to copy as-is into WACZ. + * Path to directory of pages JSONL files to copy as-is into WACZ. * @type {?string} */ pagesDir = null @@ -282,9 +283,9 @@ export class WACZ { this.detectPages = false } - if (options?.pagesDir) { + if (options?.pages) { this.detectPages = false - this.pagesDir = String(options?.pagesDir).trim() + this.pagesDir = String(options?.pages).trim() } if (options?.indexFromWARCs === false) { @@ -606,34 +607,39 @@ export class WACZ { const { pagesDir, log, addFileToZip } = this - const allowedPagesFiles = ['pages.jsonl', 'extraPages.jsonl'] - - if (!this.pagesDir) { + if (!pagesDir) { throw new Error('Error copying pages files, no directory specified.') } - try { - const pagesFiles = await fs.readdir(pagesDir) + const pagesFiles = await fs.readdir(pagesDir) - for (let i = 0; i < pagesFiles.length; i++) { - const filename = pagesFiles[i] - const pagesFile = resolve(this.pagesDir, filename) + for (let i = 0; i < pagesFiles.length; i++) { + const filename = pagesFiles[i] + const filenameLower = filename.toLowerCase() + const pagesFile = resolve(this.pagesDir, filename) - if (!allowedPagesFiles.includes(filename)) { - log.warn(`Pages: Skipping file ${pagesFile}, not pages.jsonl or extraPages.jsonl`) - continue - } + if (!filenameLower.endsWith('.jsonl')) { + log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`) + continue + } + + let isValidJSONL = true - let destination = 'pages/pages.jsonl' - if (filename === 'extraPages.jsonl') { - destination = 'pages/extraPages.jsonl' + // Ensure file is valid JSONL + const rl = readline.createInterface({ input: createReadStream(pagesFile) }) + for await (const line of rl) { + try { + JSON.parse(line) + } catch (err) { + isValidJSONL = false + log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`) + break } + } - await addFileToZip(pagesFile, destination) + if (isValidJSONL) { + await addFileToZip(pagesFile, `pages/${filename}`) } - } catch (err) { - log.trace(err) - throw new Error('An error occurred while copying pages files into WACZ.') } } diff --git a/index.test.js b/index.test.js index 98d4755..2eaf526 100644 --- a/index.test.js +++ b/index.test.js @@ -74,8 +74,8 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) = assert.equal(archive.detectPages, false) }) -test('WACZ constructor accounts for options.pagesDir if provided.', async (_t) => { - const archive = new WACZ({ input: FIXTURE_INPUT, pagesDir: PAGES_DIR_FIXTURES_PATH }) +test('WACZ constructor accounts for options.pages if provided.', async (_t) => { + const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH }) assert.equal(archive.detectPages, false) assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH) }) @@ -347,7 +347,7 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f url: 'https://lil.law.harvard.edu', title: 'WACZ Title', description: 'WACZ Description', - pagesDir: PAGES_DIR_FIXTURES_PATH + pages: PAGES_DIR_FIXTURES_PATH } const archive = new WACZ(options) @@ -356,9 +356,10 @@ test('WACZ.process with pagesDir option creates valid WACZ with provided pages f const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line - // File in pages fixture directory not named pages.jsonl or extraPages.jsonl - // should not exist in the WACZ. - assert.rejects(async () => await zip.entryData('pages/invalidName.jsonl')) + // File in pages fixture directory that are invalid JSONL or have wrong extension + // should not be copied into the WACZ. + assert.rejects(async () => await zip.entryData('pages/invalid.jsonl')) + assert.rejects(async () => await zip.entryData('pages/invalid.txt')) // pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures // they were copied from. From dd847e785e8f05da7aa56103eeb96843aaa95234 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 21 Mar 2024 17:51:32 -0400 Subject: [PATCH 5/6] Check conformance of pages files against spec --- fixtures/pages/invalid.jsonl | 4 ++-- index.js | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fixtures/pages/invalid.jsonl b/fixtures/pages/invalid.jsonl index 193f4ec..89930b9 100644 --- a/fixtures/pages/invalid.jsonl +++ b/fixtures/pages/invalid.jsonl @@ -1,2 +1,2 @@ -{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages" -{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} +{id": "extra-pages", "title": "Extra Pages"} +{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} diff --git a/index.js b/index.js index fd24042..631b137 100644 --- a/index.js +++ b/index.js @@ -5,6 +5,7 @@ import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs import { createHash } from 'crypto' import { basename, sep, resolve } from 'path' import * as readline from 'node:readline/promises' +import assert from 'node:assert/strict' import { Deflate } from 'pako' import { globSync } from 'glob' @@ -627,9 +628,19 @@ export class WACZ { // Ensure file is valid JSONL const rl = readline.createInterface({ input: createReadStream(pagesFile) }) + let lineIndex = 0 + for await (const line of rl) { try { - JSON.parse(line) + const page = JSON.parse(line) + if (lineIndex === 0) { + assert(page.format) + assert(page.id) + } else { + assert(page.url) + assert(page.ts) + } + lineIndex++ } catch (err) { isValidJSONL = false log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`) From 729affaf390b1553c3514b18b2266899884ff1aa Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 22 Mar 2024 12:02:39 -0400 Subject: [PATCH 6/6] Add trace logging for error when pages aren't validated Co-authored-by: Matteo Cargnelutti --- index.js | 1 + 1 file changed, 1 insertion(+) diff --git a/index.js b/index.js index 631b137..146b7d7 100644 --- a/index.js +++ b/index.js @@ -643,6 +643,7 @@ export class WACZ { lineIndex++ } catch (err) { isValidJSONL = false + log.trace(err) log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`) break }