Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to use existing CDXJ rather than indexing from WARCs #89

Merged
merged 2 commits into from Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Expand Up @@ -100,6 +100,14 @@ If not provided, **js-wacz** is going to attempt to detect pages in WARC records
js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl
```

### --cdxj

Allows passing a directory of existing CDXJ files to use instead of indexing from WARCs. Must be used in combination with `--pages`.

```bash
js-wacz create -f "collection/*.warc.gz" --pages collection/pages.jsonl --cdxj collection/indexes/
```

### --url

If provided, will be used as the [`mainPageUrl` attribute for `datapackage.json`](https://specs.webrecorder.net/wacz/1.1.1/#datapackage-json).
Expand Down
40 changes: 40 additions & 0 deletions bin/cli.js
@@ -1,6 +1,8 @@
#! /usr/bin/env node

import { createReadStream } from 'fs'
import fs from 'fs/promises'
import { resolve } from 'path'
import * as readline from 'node:readline/promises'

import log from 'loglevel'
Expand Down Expand Up @@ -59,6 +61,10 @@ program.command('create')
.option(
'--log-level <string>',
'Can be "silent", "trace", "debug", "info", "warn", "error"', 'info')
.option('--cdxj <string>',
'Path to a directory containing CDXJ indices to merge into final WACZ CDXJ. ' +
'If not provided, js-wacz will reindex from WARCS. Must be used in combination ' +
'with --pages, since using this option will skip reading the WARC files.')
.action(async (name, options, command) => {
/** @type {Object} */
const values = options._optionValues
Expand Down Expand Up @@ -93,6 +99,11 @@ program.command('create')
return
}

if (values?.cdxj && !values?.pages) {
console.error('Error: --cdxj option must be used in combination with --pages.')
return
}

// Pass options to WACZ
try {
archive = new WACZ({
Expand Down Expand Up @@ -133,6 +144,35 @@ program.command('create')
}
}

// Ingest user-provided CDX files, if any.
if (values?.cdxj) {
try {
const dirPath = values?.cdxj
const cdxjFiles = await fs.readdir(dirPath)
const allowedExts = ['cdx', 'cdxj']

for (let i = 0; i < cdxjFiles.length; i++) {
const cdxjFile = resolve(dirPath, cdxjFiles[i])

const ext = cdxjFile.split('.').pop()
if (!allowedExts.includes(ext)) {
log.info(`CDXJ: Skipping file ${cdxjFile}, not a CDXJ file`)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tw4l Nitpick: I'd make that a warning maybe?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, just pushed the change :)

continue
}

log.info(`CDXJ: Reading entries from ${cdxjFile}`)
const rl = readline.createInterface({ input: createReadStream(cdxjFile) })

for await (const line of rl) {
archive.addCDXJ(line + '\n')
}
}
} catch (err) {
log.trace(err)
log.error('An error occurred while processing user-provided CDXJ indices.')
}
}

// Main process
try {
await archive.process()
Expand Down
29 changes: 27 additions & 2 deletions index.js
Expand Up @@ -76,6 +76,12 @@ export class WACZ {
*/
detectPages = true

/**
* From WACZOptions.indexFromWARCs.
* @type {boolean}
*/
indexFromWARCs = true

/**
* From WACZOptions.url.
* @type {?string}
Expand Down Expand Up @@ -270,6 +276,10 @@ export class WACZ {
this.detectPages = false
}

if (options?.indexFromWARCs === false) {
this.indexFromWARCs = false
}

if (options?.url) {
try {
new URL(options.url) // eslint-disable-line
Expand Down Expand Up @@ -337,8 +347,10 @@ export class WACZ {
info('Initializing indexer')
this.initWorkerPool()

info('Indexing WARCS')
await this.indexWARCs()
if (this.indexFromWARCs) {
info('Indexing WARCS')
await this.indexWARCs()
}

info('Harvesting sorted indexes from trees')
this.harvestArraysFromTrees()
Expand Down Expand Up @@ -792,6 +804,19 @@ export class WACZ {
return page
}

/**
* Manually adds a single CDXJ entry to `this.cdxTree`.
* Calling this method automatically turns indexing from WARCs off
* (`this.indexFromWARCs` is set to `false`).
*
* `this.stateCheck()` runs first, before any state is modified.
* The entry is inserted with `setIfNotPresent`; presumably an entry that is
* already in the tree is left untouched — confirm against the tree implementation.
*
* @param {string} cdjx - CDXJ entry as a string. NOTE(review): the CLI caller
* passes each line with a trailing newline appended — confirm whether callers
* are expected to include it.
* @returns {void}
*/
addCDXJ = (cdjx) => {
this.stateCheck()
// Entries are being supplied by hand, so WARC indexing is switched off.
this.indexFromWARCs = false

this.cdxTree.setIfNotPresent(cdjx, true)
}

/**
* Adds a file to the output ZIP stream.
* Automatically keeps trace of file in `this.resources` so it can be referenced in datapackage.json.
Expand Down
25 changes: 25 additions & 0 deletions index.test.js
Expand Up @@ -74,6 +74,20 @@ test('WACZ constructor accounts for options.detectPages if valid.', async (_t) =
assert.equal(archive.detectPages, false)
})

test('WACZ constructor ignores options.indexFromWARCs if invalid.', async (_t) => {
  // Every value here is something other than the literal `false`,
  // so the instance is expected to keep reporting `true`.
  const invalidValues = ['foo', {}, Buffer.alloc(0), 12, () => {}]

  invalidValues.forEach((indexFromWARCs) => {
    const { indexFromWARCs: actual } = new WACZ({ input: FIXTURE_INPUT, indexFromWARCs })
    assert.equal(actual, true)
  })
})

test('WACZ constructor accounts for options.indexFromWARCs if valid.', async (_t) => {
  // An explicit `false` is expected to be reflected on the instance.
  const options = { input: FIXTURE_INPUT, indexFromWARCs: false }
  const { indexFromWARCs } = new WACZ(options)

  assert.equal(indexFromWARCs, false)
})

test('WACZ constructor ignores options.url if invalid.', async (_t) => {
const scenarios = ['foo', {}, Buffer.alloc(0), 12, () => {}]

Expand Down Expand Up @@ -178,6 +192,17 @@ test('addPage adds entry to pagesTree and turns detectPages off.', async (_t) =>
assert.equal(archive.pagesTree.length, 1)
})

test('addCDXJ adds entry to cdxTree and turns indexFromWARCs off.', async (_t) => {
  const entry = 'net,webrecorder)/ 20240307070734 {"url":"https://webrecorder.net/","mime":"text/html","status":200,"digest":"16966a2a2909825ad1d9a6f1b2f4833c8fe43428cb9920d0f974bd7b3d73c31d","length":3941,"offset":0,"filename":"rec-8bc4bd095683-20240307070734658-0.warc.gz"}'
  const archive = new WACZ({ input: FIXTURE_INPUT })

  // Baseline: WARC indexing is on and the CDX tree is empty.
  assert.equal(archive.indexFromWARCs, true)
  assert.equal(archive.cdxTree.length, 0)

  archive.addCDXJ(entry)

  // After one manual entry: WARC indexing is off and the tree holds that entry.
  assert.equal(archive.indexFromWARCs, false)
  assert.equal(archive.cdxTree.length, 1)
})

// Note: if `TEST_SIGNING_URL` / `TEST_SIGNING_TOKEN` are present, this will also test the signing feature.
test('WACZ.process runs the entire process and writes a valid .wacz to disk, accounting for options.', async (_t) => {
//
Expand Down