diff --git a/.github/workflows/nodejs.yml b/.github/workflows/nodejs.yml
index 76bc5bd..a25080f 100644
--- a/.github/workflows/nodejs.yml
+++ b/.github/workflows/nodejs.yml
@@ -10,5 +10,5 @@ jobs:
   nodejs:
     uses: zakodium/workflows/.github/workflows/nodejs.yml@nodejs-v1
     with:
-      node-version-matrix: '[14, 16]'
+      node-version-matrix: '[14, 16, 18]'
       lint-check-types: true
diff --git a/.gitignore b/.gitignore
index 7db52e3..81c3507 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,4 +119,6 @@ dist
 lib
 lib-esm
 
-big.xml
\ No newline at end of file
+big.xml
+
+script/medline.xml
\ No newline at end of file
diff --git a/package.json b/package.json
index 7b16073..850fde6 100644
--- a/package.json
+++ b/package.json
@@ -39,9 +39,9 @@
   "homepage": "https://github.com/cheminfo/arraybuffer-xml-parser#readme",
   "devDependencies": {
     "@types/he": "^1.1.2",
-    "@types/jest": "^27.5.0",
+    "@types/jest": "^27.5.1",
     "cheminfo-build": "^1.1.11",
-    "eslint": "^8.15.0",
+    "eslint": "^8.16.0",
     "eslint-config-cheminfo-typescript": "^10.4.0",
     "he": "^1.2.0",
     "iobuffer": "^5.1.0",
@@ -49,8 +49,8 @@
     "pako": "^2.0.4",
     "prettier": "^2.6.2",
     "rimraf": "^3.0.2",
-    "ts-jest": "^28.0.2",
-    "typescript": "^4.6.4",
+    "ts-jest": "^28.0.3",
+    "typescript": "^4.7.2",
     "uint8-base64": "^0.1.1"
   },
   "dependencies": {
diff --git a/script/medline.mjs b/script/medline.mjs
new file mode 100644
index 0000000..a98d454
--- /dev/null
+++ b/script/medline.mjs
@@ -0,0 +1,19 @@
+import { parseStream } from '../lib/index.js';
+import { open } from 'fs/promises';
+
+/*
+In order to test this script you should first build the package: `npm run prepack`
+And you also need a (big) file from medline called 'medline.xml'
+*/
+
+async function doAll() {
+  const file = await open(new URL('medline.xml', import.meta.url), 'r');
+  const stream = file.readableWebStream();
+  let i = 0;
+  for await (const entry of parseStream(stream, 'PubmedArticle')) {
+    console.log(entry);
+    console.log(i++);
+  }
+}
+
+doAll();
diff --git a/src/__tests__/parseStream.test.ts b/src/__tests__/parseStream.test.ts
new file mode 100644
index 0000000..b40a06e
--- /dev/null
+++ b/src/__tests__/parseStream.test.ts
@@ -0,0 +1,63 @@
+import { open } from 'fs/promises';
+import { join } from 'path';
+
+import { parseStream } from '../parseStream';
+
+describe('parseStream', () => {
+  it('simple case', async () => {
+    // eslint-disable-next-line jest/no-if
+    if (Number(process.versions.node.split('.')[0]) >= 18) {
+      const file = await open(join(__dirname, 'assets/sample.xml'), 'r');
+      const CHUNK_SIZE = 10;
+      const transformStream = new TransformStream({
+        start: function start() {}, // required.
+        transform: async function transform(chunk, controller) {
+          if (chunk === null) controller.terminate();
+          chunk = new Uint8Array(await chunk);
+          for (let i = 0; i < chunk.length; i += CHUNK_SIZE) {
+            controller.enqueue(chunk.slice(i, i + CHUNK_SIZE));
+          }
+        },
+      });
+
+      const results = [];
+      //@ts-expect-error feature is too new
+      const readableStream = file.readableWebStream();
+      for await (let entry of parseStream(
+        readableStream.pipeThrough(transformStream),
+        'address',
+      )) {
+        results.push(entry);
+        //console.log(entry);
+      }
+      expect(results).toMatchInlineSnapshot(`
+        Array [
+          Object {
+            "buildingNo": 1,
+            "city": "New York",
+            "flatNo": 1,
+            "street": "Park Ave",
+          },
+          Object {
+            "buildingNo": 33,
+            "city": "Boston",
+            "flatNo": 24,
+            "street": "Centre St",
+          },
+          Object {
+            "buildingNo": 1,
+            "city": "Moscow",
+            "flatNo": 2,
+            "street": "Kahovka",
+          },
+          Object {
+            "buildingNo": 3,
+            "city": "Tula",
+            "flatNo": 78,
+            "street": "Lenina",
+          },
+        ]
+      `);
+    }
+  });
+});
diff --git a/src/index.ts b/src/index.ts
index dd1a55c..6386175 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1 +1,2 @@
 export * from './parse';
+export * from './parseStream';
diff --git a/src/parseStream.ts b/src/parseStream.ts
new file mode 100644
index 0000000..a2e3093
--- /dev/null
+++ b/src/parseStream.ts
@@ -0,0 +1,25 @@
+import {
+  defaultOptions,
+  StreamParseOptions,
+} from './traversable/defaultOptions';
+import { getTraversableGenerator } from './traversable/getTraversableGenerator';
+import { traversableToJSON } from './traversableToJSON';
+
+/**
+ * Parse a web stream representing an XML document and emit objects
+ */
+export async function* parseStream(
+  readableStream: ReadableStream,
+  lookupTagName: string,
+  options: StreamParseOptions = {},
+) {
+  options = { ...defaultOptions, ...options };
+
+  for await (const traversableEntry of getTraversableGenerator(
+    readableStream,
+    lookupTagName,
+    options,
+  )) {
+    yield traversableToJSON(traversableEntry, options);
+  }
+}
diff --git a/src/traversable/closingIndexForOpeningTag.ts b/src/traversable/closingIndexForOpeningTag.ts
index e242c23..fa1ef89 100644
--- a/src/traversable/closingIndexForOpeningTag.ts
+++ b/src/traversable/closingIndexForOpeningTag.ts
@@ -1,5 +1,11 @@
 import { decoder } from './utils/utf8Decoder';
 
+/**
+ * Search for the corresponding closing tag '>'
+ * @param data - XML data as a byte array
+ * @param i - index of the first byte after the opening '<'
+ * @returns the decoded tag content and the index of the closing '>'
+ */
 export function closingIndexForOpeningTag(
   data: Uint8Array,
   i: number,
@@ -25,8 +31,5 @@
     }
     endIndex++;
   }
-  return {
-    data: decoder.decode(data.subarray(i, i + endIndex)),
-    index: 0,
-  };
+  throw new Error('Could not find closing tag');
 }
diff --git a/src/traversable/defaultOptions.ts b/src/traversable/defaultOptions.ts
index 20c5d8b..524b948 100644
--- a/src/traversable/defaultOptions.ts
+++ b/src/traversable/defaultOptions.ts
@@ -7,6 +7,20 @@ export const decoder = {
     return utf8Decoder.decode(array);
   },
 };
+
+export interface StreamParseOptions extends ParseOptions {
+  /**
+   * What is the maximal size (in bytes) of an entry
+   * @default 1e7
+   */
+  maxEntrySize?: number;
+  /**
+   * What is the maximal size (in bytes) of the buffer
+   * @default 2e8
+   */
+  maxBufferSize?: number;
+}
+
 export interface ParseOptions {
   /**
    * should we remove ascii < 32
@@ -92,6 +106,7 @@
    */
   stopNodes?: string[];
 }
+
 export const defaultOptions: ParseOptions = {
   trimValues: true,
   attributeNamePrefix: '$',
diff --git a/src/traversable/getTraversableGenerator.ts b/src/traversable/getTraversableGenerator.ts
new file mode 100644
index 0000000..a50aec8
--- /dev/null
+++ b/src/traversable/getTraversableGenerator.ts
@@ -0,0 +1,275 @@
+import { XMLNode } from '../XMLNode';
+import { arrayIndexOf } from '../bufferUtils/arrayIndexOf';
+import { arrayTrim } from '../bufferUtils/arrayTrim';
+
+import { closingIndexForOpeningTag } from './closingIndexForOpeningTag';
+import { StreamParseOptions } from './defaultOptions';
+import { findClosingIndex } from './findClosingIndex';
+import { parseAttributesString } from './parseAttributesString';
+import { concat } from './utils/concat';
+import { removeNameSpaceIfNeeded } from './utils/removeNameSpaceIfNeeded';
+import { decoder } from './utils/utf8Decoder';
+
+export async function* getTraversableGenerator(
+  readableStream: ReadableStream,
+  lookupTagName: string,
+  options: StreamParseOptions,
+) {
+  let dataSize = 0;
+  let dataIndex = 0;
+  let currentNode: XMLNode | undefined;
+  let lastMatchingClosedIndex = 0;
+  const reader = readableStream.getReader();
+  let chunk = await reader.read();
+  let endStream = chunk.done;
+  let xmlData = new Uint8Array(chunk.value);
+
+  const { maxEntrySize = 1e7, maxBufferSize = 2e8 } = options;
+
+  for (let i = 0; i < xmlData.length; i++) {
+    if (xmlData.length - i < maxEntrySize && !endStream) {
+      // TODO we should remove from xmlData what was processed
+      if (lastMatchingClosedIndex > 0) {
+        i -= lastMatchingClosedIndex;
+        xmlData = xmlData.slice(lastMatchingClosedIndex);
+        lastMatchingClosedIndex = 0;
+      }
+      let currentLength = xmlData.length;
+      const newChunks = [];
+      while (currentLength < maxBufferSize && !endStream) {
+        chunk = await reader.read();
+        endStream = chunk.done;
+        if (!endStream) {
+          const newChunk = new Uint8Array(chunk.value);
+          newChunks.push(newChunk);
+          currentLength += newChunk.length;
+        }
+      }
+
+      const newXmlData = new Uint8Array(currentLength);
+      let currentShift = 0;
+      newXmlData.set(xmlData, currentShift);
+      currentShift += xmlData.length;
+      for (let chunk of newChunks) {
+        newXmlData.set(chunk, currentShift);
+        currentShift += chunk.length;
+      }
+      xmlData = newXmlData;
+    }
+
+    if (xmlData[i] === 0x3c) {
+      // <
+      const xmlData1 = xmlData[i + 1];
+      const xmlData2 = xmlData[i + 2];
+      if (xmlData1 === 0x2f) {
+        // </ closing tag
+        const closeIndex = findClosingIndex(
+          xmlData,
+          [0x3e], //>
+          i,
+          'Closing Tag is not closed.',
+        );
+        let tagName = decoder.decode(
+          arrayTrim(xmlData.subarray(i + 2, closeIndex), {}),
+        );
+        tagName = removeNameSpaceIfNeeded(tagName, options);
+
+        if (currentNode) {
+          const value = options.trimValues
+            ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+            : xmlData.subarray(dataIndex, dataIndex + dataSize);
+          if (currentNode.value === undefined) {
+            currentNode.value = value;
+          } else {
+            currentNode.value = concat(currentNode.value, value);
+          }
+          if (
+            options.stopNodes?.length &&
+            options.stopNodes.includes(currentNode.tagName)
+          ) {
+            currentNode.children = {};
+            if (currentNode.attributes === undefined) {
+              currentNode.attributes = {};
+            }
+            currentNode.value = xmlData.subarray(currentNode.startIndex + 1, i);
+          }
+          if (tagName === lookupTagName) {
+            yield currentNode;
+            lastMatchingClosedIndex = i;
+          }
+          currentNode = currentNode.parent as XMLNode;
+        }
+        i = closeIndex;
+        dataSize = 0;
+        dataIndex = i + 1;
+      } else if (xmlData1 === 0x3f) {
+        // <? processing instruction
+        i = findClosingIndex(xmlData, [0x3f, 0x3e], i, 'Pi Tag is not closed.');
+        dataSize = 0;
+        dataIndex = i + 1;
+      } else if (xmlData1 === 0x21 && xmlData2 === 0x2d) {
+        // <!-- comment
+        i = findClosingIndex(
+          xmlData,
+          [0x2d, 0x2d, 0x3e], //-->
+          i,
+          'Comment is not closed.',
+        );
+        if (currentNode && dataSize !== 0) {
+          if (currentNode.tagName !== '!xml') {
+            currentNode.value = concat(
+              currentNode.value,
+              options.trimValues
+                ? arrayTrim(xmlData.subarray(dataIndex, dataSize + dataIndex))
+                : xmlData.subarray(dataIndex, dataSize + dataIndex),
+            );
+          }
+        }
+        dataSize = 0;
+        dataIndex = i + 1;
+        //!D
+      } else if (xmlData1 === 0x21 && xmlData2 === 0x44) {
+        // <!DOCTYPE
+        const closeIndex = findClosingIndex(
+          xmlData,
+          [0x3e], //>
+          i,
+          'DOCTYPE is not closed.',
+        );
+        const tagExp = xmlData.subarray(i, closeIndex);
+        if (arrayIndexOf(tagExp, [0x5b]) >= 0) {
+          i = arrayIndexOf(xmlData, [0x5d, 0x3e], i) + 1;
+        } else {
+          i = closeIndex;
+        } //![
+      } else if (xmlData1 === 0x21 && xmlData2 === 0x5b) {
+        // <![CDATA[ ... ]]>
+        const closeIndex =
+          findClosingIndex(
+            xmlData,
+            [0x5d, 0x5d, 0x3e], //]]>
+            i,
+            'CDATA is not closed.',
+          ) - 2;
+        const tagExp = xmlData.subarray(i + 9, closeIndex);
+
+        //considerations
+        //1. CDATA will always have a parent node
+        //2. A tag with CDATA is not a leaf node, so its value would be of string type.
+        if (dataSize !== 0) {
+          const value = options.trimValues
+            ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+            : xmlData.subarray(dataIndex, dataIndex + dataSize);
+
+          if (currentNode) currentNode.value = concat(currentNode.value, value);
+        }
+
+        if (options.cdataTagName) {
+          //add cdata node
+          const childNode = new XMLNode(
+            options.cdataTagName,
+            currentNode,
+            tagExp,
+          );
+          if (currentNode) currentNode.addChild(childNode);
+          //add rest value to parent node
+          if (tagExp) {
+            childNode.value = tagExp;
+          }
+        } else {
+          if (currentNode) {
+            currentNode.value = concat(currentNode.value, tagExp);
+          }
+        }
+
+        i = closeIndex + 2;
+        dataSize = 0;
+        dataIndex = i + 1;
+      } else {
+        //Opening a normal tag
+        const parsedOpeningTag = closingIndexForOpeningTag(xmlData, i + 1);
+        let tagData = parsedOpeningTag.data.replace(/\r?\n|\t/g, ' ');
+        const closeIndex = parsedOpeningTag.index;
+        const separatorIndex = tagData.indexOf(' ');
+        let shouldBuildAttributesMap = true;
+        let tagName =
+          separatorIndex >= 0
+            ? tagData.substr(0, separatorIndex).replace(/\s+$/, '')
+            : tagData;
+        let tagAttributes =
+          separatorIndex >= 0 ? tagData.substr(separatorIndex + 1) : '';
+        if (options.ignoreNameSpace) {
+          const colonIndex = tagName.indexOf(':');
+          if (colonIndex !== -1) {
+            tagName = tagName.substr(colonIndex + 1);
+            shouldBuildAttributesMap =
+              tagName !== parsedOpeningTag.data.substr(colonIndex + 1);
+          }
+        }
+
+        //save text to parent node
+        if (currentNode && dataSize !== 0) {
+          if (currentNode.tagName !== '!xml') {
+            currentNode.value = concat(
+              currentNode.value,
+              options.trimValues
+                ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+                : xmlData.subarray(dataIndex, dataIndex + dataSize),
+            );
+          }
+        }
+
+        if (tagData.length > 0 && tagData.endsWith('/')) {
+          // selfClosing tag
+          // TODO we should check if it matches the tag and create the currentNode
+          if (currentNode) {
+            if (tagAttributes) {
+              // <tag attr="value"/>
+              tagAttributes = tagAttributes.substr(0, tagAttributes.length - 1);
+            } else {
+              // <tag/>
+              tagName = tagName.substr(0, tagName.length - 1);
+            }
+
+            const childNode = new XMLNode(tagName, currentNode, '');
+            if (tagAttributes) {
+              childNode.attributes = parseAttributesString(
+                tagAttributes,
+                options,
+              );
+            }
+            currentNode.addChild(childNode);
+          }
+        } else {
+          //opening tag
+
+          if (currentNode || tagName === lookupTagName) {
+            const childNode = new XMLNode(tagName, currentNode);
+            if (
+              options.stopNodes?.length &&
+              options.stopNodes.includes(childNode.tagName)
+            ) {
+              childNode.startIndex = closeIndex;
+            }
+            if (tagAttributes && shouldBuildAttributesMap) {
+              childNode.attributes = parseAttributesString(
+                tagAttributes,
+                options,
+              );
+            }
+            if (currentNode) currentNode.addChild(childNode);
+            currentNode = childNode;
+          }
+        }
+
+        i = closeIndex;
+        dataSize = 0;
+        dataIndex = i + 1;
+      }
+    } else {
+      dataSize++;
+    }
+  }
+}
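
For reference, a minimal usage sketch of the streaming API introduced by this changeset (not part of the diff itself). It assumes Node.js 18+, where FileHandle.readableWebStream() is available; 'data.xml' and 'item' are placeholder names, and the two options shown are the StreamParseOptions defaults from defaultOptions.ts:

  import { open } from 'fs/promises';
  import { parseStream } from 'arraybuffer-xml-parser';

  const file = await open('data.xml', 'r');
  // parseStream is an async generator: it yields one plain object per matching
  // element, so a large document never has to be held in memory as a whole.
  for await (const entry of parseStream(file.readableWebStream(), 'item', {
    maxEntrySize: 1e7, // maximal size (in bytes) of a single entry
    maxBufferSize: 2e8, // maximal size (in bytes) of the internal buffer
  })) {
    console.log(entry);
  }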