Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify --pages option to copy pages files directly into WACZ #92

Merged
merged 6 commits into from Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 3 additions & 24 deletions bin/cli.js
Expand Up @@ -37,8 +37,8 @@ program.command('create')
'Path to output .wacz file.', 'archive.wacz')
.option(
'-p --pages <string>',
'Path to a jsonl files to be used to replace pages.jsonl. ' +
'If not provided, js-wacz will attempt to detect pages.')
'Path to a directory of pages JSONL files to copy into WACZ as-is. ' +
'If --pages is not provided, js-wacz will attempt to detect pages.')
.option(
'--url <string>',
'If provided, will be used as the "main page url" in datapackage.json.')
Expand Down Expand Up @@ -115,35 +115,14 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
return
}

// Ingest user-provided pages.jsonl file, if any.
if (values?.pages) {
try {
log.info(`pages.jsonl: Reading entries from ${values?.pages}`)
const rl = readline.createInterface({ input: createReadStream(values.pages) })

for await (const line of rl) {
const page = JSON.parse(line)

if (!page?.url) {
continue
}

log.info(`Adding ${page.url}.`)
archive.addPage(page?.url, page?.title, page?.ts)
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided pages.jsonl.')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
Expand Down
18 changes: 18 additions & 0 deletions constants.js
Expand Up @@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url))
*/
export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}`

/**
* Path to the fixtures folder pages sub-directory.
* @constant
*/
export const PAGES_DIR_FIXTURES_PATH = `${FIXTURES_PATH}pages${sep}`

/**
* Path to the pages.jsonl fixture
* @constant
*/
export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl`

/**
* Path to the extraPages.jsonl fixture
* @constant
*/
export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl`

/**
* Colors scheme for log level.
* @constant
Expand Down
4 changes: 4 additions & 0 deletions fixtures/pages/extraPages.jsonl
@@ -0,0 +1,4 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"}
{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
2 changes: 2 additions & 0 deletions fixtures/pages/invalid.jsonl
@@ -0,0 +1,2 @@
{id": "extra-pages", "title": "Extra Pages"}
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions fixtures/pages/invalid.txt
@@ -0,0 +1 @@
Not a JSONL file
2 changes: 2 additions & 0 deletions fixtures/pages/pages.jsonl
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"}
76 changes: 74 additions & 2 deletions index.js
Expand Up @@ -3,7 +3,9 @@
import fs from 'fs/promises'
import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line
import { createHash } from 'crypto'
import { basename, sep } from 'path'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'
import assert from 'node:assert/strict'

import { Deflate } from 'pako'
import { globSync } from 'glob'
Expand Down Expand Up @@ -177,6 +179,12 @@ export class WACZ {
*/
archiveStream = null

/**
* Path to directory of pages JSONL files to copy as-is into WACZ.
* @type {?string}
*/
pagesDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
Expand Down Expand Up @@ -276,6 +284,11 @@ export class WACZ {
this.detectPages = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}
Expand Down Expand Up @@ -359,7 +372,11 @@ export class WACZ {
await this.writeIndexesToZip()

info('Writing pages.jsonl to WACZ')
await this.writePagesToZip()
if (!this.pagesDir) {
await this.writePagesToZip()
} else {
await this.copyPagesFilesToZip()
}

info('Writing WARCs to WACZ')
await this.writeWARCsToZip()
Expand Down Expand Up @@ -582,6 +599,61 @@ export class WACZ {
}
}

/**
* Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP.
* @returns {Promise<void>}
*/
copyPagesFilesToZip = async () => {
this.stateCheck()

const { pagesDir, log, addFileToZip } = this

if (!pagesDir) {
throw new Error('Error copying pages files, no directory specified.')
}

const pagesFiles = await fs.readdir(pagesDir)

for (let i = 0; i < pagesFiles.length; i++) {
const filename = pagesFiles[i]
const filenameLower = filename.toLowerCase()
const pagesFile = resolve(this.pagesDir, filename)

if (!filenameLower.endsWith('.jsonl')) {
log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`)
continue
}

let isValidJSONL = true

// Ensure file is valid JSONL
const rl = readline.createInterface({ input: createReadStream(pagesFile) })
let lineIndex = 0

for await (const line of rl) {
try {
const page = JSON.parse(line)
if (lineIndex === 0) {
assert(page.format)
assert(page.id)
} else {
assert(page.url)
assert(page.ts)
}
lineIndex++
} catch (err) {
isValidJSONL = false
log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
tw4l marked this conversation as resolved.
Show resolved Hide resolved
break
}
}

if (isValidJSONL) {
await addFileToZip(pagesFile, `pages/${filename}`)
}
}
}

/**
* Streams all the files listes in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
Expand Down
45 changes: 44 additions & 1 deletion index.test.js
Expand Up @@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
Expand Down Expand Up @@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
assert.equal(archive.detectPages, false)
assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

Expand Down Expand Up @@ -333,3 +339,40 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc
// Delete temp file
await fs.unlink(options.output)
})

test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => {
const options = {
input: FIXTURE_INPUT,
output: '../tmp.wacz',
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pages: PAGES_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)

await archive.process(false)

const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

// File in pages fixture directory that are invalid JSONL or have wrong extension
// should not be copied into the WACZ.
assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
assert.rejects(async () => await zip.entryData('pages/invalid.txt'))

// pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures
// they were copied from.
const datapackage = JSON.parse(await zip.entryData('datapackage.json'))

const datapackagePages = datapackage.resources.filter(entry => entry.path === 'pages/pages.jsonl')[0]
const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH)
assert.equal(datapackagePages.hash, pagesFixtureHash)

const datapackageExtraPages = datapackage.resources.filter(entry => entry.path === 'pages/extraPages.jsonl')[0]
const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH)
assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash)

// Delete temp file
await fs.unlink(options.output)
})