Modify --pages option to copy pages files directly into WACZ #92

Merged
merged 6 commits on Mar 22, 2024
Changes from 4 commits
27 changes: 3 additions & 24 deletions bin/cli.js
@@ -37,8 +37,8 @@ program.command('create')
'Path to output .wacz file.', 'archive.wacz')
.option(
'-p --pages <string>',
'Path to a jsonl files to be used to replace pages.jsonl. ' +
'If not provided, js-wacz will attempt to detect pages.')
'Path to a directory of pages JSONL files to copy into WACZ as-is. ' +
'If --pages is not provided, js-wacz will attempt to detect pages.')
.option(
'--url <string>',
'If provided, will be used as the "main page url" in datapackage.json.')
@@ -115,35 +115,14 @@ program.command('create')
description: values?.desc,
signingUrl: values?.signingUrl,
signingToken: values?.signingToken,
pages: values?.pages,
log
})
} catch (err) {
log.error(`${err}`) // Show simplified report
return
}

// Ingest user-provided pages.jsonl file, if any.
if (values?.pages) {
try {
log.info(`pages.jsonl: Reading entries from ${values?.pages}`)
const rl = readline.createInterface({ input: createReadStream(values.pages) })

for await (const line of rl) {
const page = JSON.parse(line)

if (!page?.url) {
continue
}

log.info(`Adding ${page.url}.`)
archive.addPage(page?.url, page?.title, page?.ts)
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided pages.jsonl.')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
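
For context on how the reworked --pages option is consumed: the CLI forwards it to the WACZ constructor as the `pages` option. A minimal sketch of the equivalent programmatic use, based on the constructor options and process() call exercised in the tests further down; the input glob, output path and pages directory are illustrative values:

```js
import { WACZ } from './index.js' // imported from the local module, as in the tests

const archive = new WACZ({
  input: 'collection/archive/*.warc.gz', // WARCs to package (illustrative glob)
  output: 'collection/archive.wacz',
  // Directory whose JSONL files (e.g. pages.jsonl, extraPages.jsonl) are
  // copied into the WACZ as-is; providing it disables automatic page detection.
  pages: 'collection/pages/'
})

await archive.process() // indexes the WARCs and writes the .wacz to disk
```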
18 changes: 18 additions & 0 deletions constants.js
@@ -16,6 +16,24 @@ export const BASE_PATH = dirname(fileURLToPath(import.meta.url))
*/
export const FIXTURES_PATH = `${BASE_PATH}${sep}fixtures${sep}`

/**
* Path to the fixtures folder pages sub-directory.
* @constant
*/
export const PAGES_DIR_FIXTURES_PATH = `${FIXTURES_PATH}pages${sep}`

/**
* Path to the pages.jsonl fixture
* @constant
*/
export const PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}pages.jsonl`

/**
* Path to the extraPages.jsonl fixture
* @constant
*/
export const EXTRA_PAGES_FIXTURE_PATH = `${PAGES_DIR_FIXTURES_PATH}extraPages.jsonl`

/**
* Colors scheme for log level.
* @constant
4 changes: 4 additions & 0 deletions fixtures/pages/extraPages.jsonl
@@ -0,0 +1,4 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"}
{"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"}
{"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
{"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
2 changes: 2 additions & 0 deletions fixtures/pages/invalid.jsonl
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"
{"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": None, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"}
1 change: 1 addition & 0 deletions fixtures/pages/invalid.txt
@@ -0,0 +1 @@
Not a JSONL file
2 changes: 2 additions & 0 deletions fixtures/pages/pages.jsonl
@@ -0,0 +1,2 @@
{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
{"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"}
65 changes: 63 additions & 2 deletions index.js
@@ -3,7 +3,8 @@
import fs from 'fs/promises'
import { createWriteStream, createReadStream, WriteStream, unlinkSync } from 'fs' // eslint-disable-line
import { createHash } from 'crypto'
import { basename, sep } from 'path'
import { basename, sep, resolve } from 'path'
import * as readline from 'node:readline/promises'

import { Deflate } from 'pako'
import { globSync } from 'glob'
@@ -177,6 +178,12 @@ export class WACZ {
*/
archiveStream = null

/**
* Path to directory of pages JSONL files to copy as-is into WACZ.
* @type {?string}
*/
pagesDir = null

/**
* @param {WACZOptions} options - See {@link WACZOptions} for details.
*/
@@ -276,6 +283,11 @@
this.detectPages = false
}

if (options?.pages) {
this.detectPages = false
this.pagesDir = String(options?.pages).trim()
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}
@@ -359,7 +371,11 @@
await this.writeIndexesToZip()

info('Writing pages.jsonl to WACZ')
await this.writePagesToZip()
if (!this.pagesDir) {
await this.writePagesToZip()
} else {
await this.copyPagesFilesToZip()
}

info('Writing WARCs to WACZ')
await this.writeWARCsToZip()
@@ -582,6 +598,51 @@
}
}

/**
* Copies pages.jsonl and extraPages.jsonl files in this.pagesDir into ZIP.
* @returns {Promise<void>}
*/
copyPagesFilesToZip = async () => {
this.stateCheck()

const { pagesDir, log, addFileToZip } = this

if (!pagesDir) {
throw new Error('Error copying pages files, no directory specified.')
}

const pagesFiles = await fs.readdir(pagesDir)

for (let i = 0; i < pagesFiles.length; i++) {
const filename = pagesFiles[i]
const filenameLower = filename.toLowerCase()
const pagesFile = resolve(this.pagesDir, filename)

if (!filenameLower.endsWith('.jsonl')) {
log.warn(`Pages: Skipping file ${pagesFile}, does not end with jsonl extension`)
continue
}

let isValidJSONL = true

// Ensure file is valid JSONL
const rl = readline.createInterface({ input: createReadStream(pagesFile) })
for await (const line of rl) {
try {
JSON.parse(line)

Review comment thread on this line:

Collaborator:
Thanks for the edits @tw4l!

I think we're almost there - we should maybe add a little bit of validation against the spec, just to make sure those are indeed pages files.

We could check:

  • That the first item contains format and id properties
  • That subsequent entries contain url and ts properties

Maybe using Node's assert since we're in a try / catch ?

What do you think?
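
A rough sketch of the check being proposed here, using Node's strict assert inside the existing per-line try/catch (the `lineIndex` counter is hypothetical, and this is not necessarily the code that was ultimately pushed):

```js
import { strict as assert } from 'node:assert'

// Inside the for-await loop over the lines of a pages file;
// `lineIndex` is a hypothetical counter starting at 0 for the first line.
const page = JSON.parse(line)

if (lineIndex === 0) {
  // The first entry is the header: it must declare a format and an id.
  assert(page.format)
  assert(page.id)
} else {
  // Subsequent entries are pages: each must carry a url and a timestamp.
  assert(page.url)
  assert(page.ts)
}
```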

Collaborator Author (tw4l):
Yeah nice suggestion! May as well get this right while we're focused on it :)

Collaborator Author (tw4l):
Commit pushed!

Collaborator Author (tw4l), Mar 21, 2024:
Ilya raised a related point: in older versions of the crawler, the pages files occasionally included invalid lines, we think because of text extraction that wasn't truncated.

It may be safer if less performant to filter per-line rather than per-file, but write valid lines as-is into the correct file in the WACZ. What do you think?
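
A rough sketch of the per-line alternative described above: skip unparseable lines and write only the valid ones into the WACZ. The `addStringToZip` helper is hypothetical (js-wacz's addFileToZip copies whole files from disk), and the PR ultimately kept the per-file validation instead:

```js
// Hypothetical per-line filtering inside copyPagesFilesToZip():
const validLines = []

const rl = readline.createInterface({ input: createReadStream(pagesFile) })
for await (const line of rl) {
  try {
    JSON.parse(line) // keep only well-formed JSONL entries
    validLines.push(line)
  } catch {
    log.warn(`Pages: Skipping invalid line in ${pagesFile}`)
  }
}

// Writing the filtered content back would need a string/Buffer variant of
// addFileToZip; `addStringToZip` is assumed here for illustration only.
await this.addStringToZip(validLines.join('\n') + '\n', `pages/${filename}`)
```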

Collaborator:
I'm now inclined to say that we can generate valid pages.jsonl and just fail if it's invalid, but yeah, another option would be, on first failure, to skip the failing lines and reserialize only valid ones (similar to old behavior). But not sure if it's needed at this point.

Collaborator:
I think either approach works for me; in both cases, we are unlikely to end up with invalid pages.jsonl files added to the archive. I am also not concerned about performance for this step.

Let me know if you'd like to add line-by-line filtering or not 😄 . Otherwise: this is great and I'm happy to test / approve / merge.

Thank you both!

Collaborator Author (tw4l):
Sounds like we're going to stick with per-file and just make sure the crawler isn't writing any invalid pages files to begin with, so feel free to test/approve/merge, thank you!

} catch (err) {
isValidJSONL = false
log.warn(`Pages: Skipping file ${pagesFile}, not valid JSONL`)
break
}
}

if (isValidJSONL) {
await addFileToZip(pagesFile, `pages/${filename}`)
}
}
}

/**
* Streams all the files listed in `this.WARCs` to the output ZIP.
* @returns {Promise<void>}
45 changes: 44 additions & 1 deletion index.test.js
@@ -11,7 +11,7 @@ import StreamZip from 'node-stream-zip'
import * as dotenv from 'dotenv'

import { WACZ } from './index.js'
import { FIXTURES_PATH } from './constants.js'
import { FIXTURES_PATH, PAGES_DIR_FIXTURES_PATH, PAGES_FIXTURE_PATH, EXTRA_PAGES_FIXTURE_PATH } from './constants.js'
import { assertSHA256WithPrefix, assertValidWACZSignatureFormat } from './utils/assertions.js' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import

// Loads env vars from .env if provided
@@ -74,6 +74,12 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor accounts for options.pages if provided.', async (_t) => {
const archive = new WACZ({ input: FIXTURE_INPUT, pages: PAGES_DIR_FIXTURES_PATH })
assert.equal(archive.detectPages, false)
assert.equal(archive.pagesDir, PAGES_DIR_FIXTURES_PATH)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

@@ -333,3 +339,40 @@ test('WACZ.process runs the entire process and writes a valid .wacz to disk, acc
// Delete temp file
await fs.unlink(options.output)
})

test('WACZ.process with pagesDir option creates valid WACZ with provided pages files.', async (_t) => {
const options = {
input: FIXTURE_INPUT,
output: '../tmp.wacz',
url: 'https://lil.law.harvard.edu',
title: 'WACZ Title',
description: 'WACZ Description',
pages: PAGES_DIR_FIXTURES_PATH
}

const archive = new WACZ(options)

await archive.process(false)

const zip = new StreamZip.async({ file: options.output }) // eslint-disable-line

// Files in the pages fixture directory that are invalid JSONL or have the wrong extension
// should not be copied into the WACZ.
await assert.rejects(async () => await zip.entryData('pages/invalid.jsonl'))
await assert.rejects(async () => await zip.entryData('pages/invalid.txt'))

// pages/pages.jsonl and pages/extraPages.jsonl should have same hash as fixtures
// they were copied from.
const datapackage = JSON.parse(await zip.entryData('datapackage.json'))

const datapackagePages = datapackage.resources.filter(entry => entry.path === 'pages/pages.jsonl')[0]
const pagesFixtureHash = await archive.sha256(PAGES_FIXTURE_PATH)
assert.equal(datapackagePages.hash, pagesFixtureHash)

const datapackageExtraPages = datapackage.resources.filter(entry => entry.path === 'pages/extraPages.jsonl')[0]
const extraPagesFixtureHash = await archive.sha256(EXTRA_PAGES_FIXTURE_PATH)
assert.equal(datapackageExtraPages.hash, extraPagesFixtureHash)

// Delete temp file
await fs.unlink(options.output)
})