diff --git a/.github/workflows/nodejs.yml b/.github/workflows/nodejs.yml
index 76bc5bd..a25080f 100644
--- a/.github/workflows/nodejs.yml
+++ b/.github/workflows/nodejs.yml
@@ -10,5 +10,5 @@ jobs:
nodejs:
uses: zakodium/workflows/.github/workflows/nodejs.yml@nodejs-v1
with:
- node-version-matrix: '[14, 16]'
+ node-version-matrix: '[14, 16, 18]'
lint-check-types: true
diff --git a/.gitignore b/.gitignore
index 7db52e3..81c3507 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,4 +119,6 @@ dist

lib
lib-esm
-big.xml
\ No newline at end of file
+big.xml
+
+script/medline.xml
\ No newline at end of file
diff --git a/package.json b/package.json
index 7b16073..850fde6 100644
--- a/package.json
+++ b/package.json
@@ -39,9 +39,9 @@
"homepage": "https://github.com/cheminfo/arraybuffer-xml-parser#readme",
"devDependencies": {
"@types/he": "^1.1.2",
- "@types/jest": "^27.5.0",
+ "@types/jest": "^27.5.1",
"cheminfo-build": "^1.1.11",
- "eslint": "^8.15.0",
+ "eslint": "^8.16.0",
"eslint-config-cheminfo-typescript": "^10.4.0",
"he": "^1.2.0",
"iobuffer": "^5.1.0",
@@ -49,8 +49,8 @@
"pako": "^2.0.4",
"prettier": "^2.6.2",
"rimraf": "^3.0.2",
- "ts-jest": "^28.0.2",
- "typescript": "^4.6.4",
+ "ts-jest": "^28.0.3",
+ "typescript": "^4.7.2",
"uint8-base64": "^0.1.1"
},
"dependencies": {
diff --git a/script/medline.mjs b/script/medline.mjs
new file mode 100644
index 0000000..a98d454
--- /dev/null
+++ b/script/medline.mjs
@@ -0,0 +1,19 @@
+import { parseStream } from '../lib/index.js';
+import { open } from 'fs/promises';
+
+/*
+In order to test this script you should first build the package: `npm run prepack`
+And you also need a (big) file from medline called 'medline.xml'
+*/
+
+async function doAll() {
+ const file = await open(new URL('medline.xml', import.meta.url), 'r');
+ const stream = file.readableWebStream();
+ let i = 0;
+ for await (const entry of parseStream(stream, 'PubmedArticle')) {
+ console.log(entry);
+ console.log(i++);
+ }
+}
+
+doAll();
diff --git a/src/__tests__/parseStream.test.ts b/src/__tests__/parseStream.test.ts
new file mode 100644
index 0000000..b40a06e
--- /dev/null
+++ b/src/__tests__/parseStream.test.ts
@@ -0,0 +1,63 @@
+import { open } from 'fs/promises';
+import { join } from 'path';
+
+import { parseStream } from '../parseStream';
+
+describe('parseStream', () => {
+ it('simple case', async () => {
+ // eslint-disable-next-line jest/no-if
+ if (Number(process.versions.node.split('.')[0]) >= 18) {
+ const file = await open(join(__dirname, 'assets/sample.xml'), 'r');
+ const CHUNK_SIZE = 10;
+ const transformStream = new TransformStream({
+ start: function start() {}, // required.
+ transform: async function transform(chunk, controller) {
+ if (chunk === null) controller.terminate();
+ chunk = new Uint8Array(await chunk);
+ for (let i = 0; i < chunk.length; i += CHUNK_SIZE) {
+ controller.enqueue(chunk.slice(i, i + CHUNK_SIZE));
+ }
+ },
+ });
+
+ const results = [];
+ //@ts-expect-error feature is too new
+ const readableStream = file.readableWebStream();
+ for await (let entry of parseStream(
+ readableStream.pipeThrough(transformStream),
+ 'address',
+ )) {
+ results.push(entry);
+ //console.log(entry);
+ }
+ expect(results).toMatchInlineSnapshot(`
+ Array [
+ Object {
+ "buildingNo": 1,
+ "city": "New York",
+ "flatNo": 1,
+ "street": "Park Ave",
+ },
+ Object {
+ "buildingNo": 33,
+ "city": "Boston",
+ "flatNo": 24,
+ "street": "Centre St",
+ },
+ Object {
+ "buildingNo": 1,
+ "city": "Moscow",
+ "flatNo": 2,
+ "street": "Kahovka",
+ },
+ Object {
+ "buildingNo": 3,
+ "city": "Tula",
+ "flatNo": 78,
+ "street": "Lenina",
+ },
+ ]
+ `);
+ }
+ });
+});
diff --git a/src/index.ts b/src/index.ts
index dd1a55c..6386175 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1 +1,2 @@
export * from './parse';
+export * from './parseStream';
diff --git a/src/parseStream.ts b/src/parseStream.ts
new file mode 100644
index 0000000..a2e3093
--- /dev/null
+++ b/src/parseStream.ts
@@ -0,0 +1,25 @@
+import {
+ defaultOptions,
+ StreamParseOptions,
+} from './traversable/defaultOptions';
+import { getTraversableGenerator } from './traversable/getTraversableGenerator';
+import { traversableToJSON } from './traversableToJSON';
+
+/**
+ * Parse a web stream representing an XML and emit objects
+ */
+export async function* parseStream(
+ readableStream: ReadableStream,
+ lookupTagName: string,
+ options: StreamParseOptions = {},
+) {
+ options = { ...defaultOptions, ...options };
+
+ for await (const traversableEntry of getTraversableGenerator(
+ readableStream,
+ lookupTagName,
+ options,
+ )) {
+ yield traversableToJSON(traversableEntry, options);
+ }
+}
diff --git a/src/traversable/closingIndexForOpeningTag.ts b/src/traversable/closingIndexForOpeningTag.ts
index e242c23..fa1ef89 100644
--- a/src/traversable/closingIndexForOpeningTag.ts
+++ b/src/traversable/closingIndexForOpeningTag.ts
@@ -1,5 +1,11 @@
import { decoder } from './utils/utf8Decoder';

+/**
+ * Search for the corresponding closing tag '>'
+ * @param data
+ * @param i
+ * @returns
+ */
export function closingIndexForOpeningTag(
data: Uint8Array,
i: number,
@@ -25,8 +31,5 @@ export function closingIndexForOpeningTag(
}
endIndex++;
}
- return {
- data: decoder.decode(data.subarray(i, i + endIndex)),
- index: 0,
- };
+ throw new Error('Could not find closing tag');
}
diff --git a/src/traversable/defaultOptions.ts b/src/traversable/defaultOptions.ts
index 20c5d8b..524b948 100644
--- a/src/traversable/defaultOptions.ts
+++ b/src/traversable/defaultOptions.ts
@@ -7,6 +7,20 @@ export const decoder = {
return utf8Decoder.decode(array);
},
};
+
+export interface StreamParseOptions extends ParseOptions {
+ /**
+ * What is the maximal size (in bytes) of an entry
+ * @default 1e7
+ */
+ maxEntrySize?: number;
+ /**
+ * What is the maximal size for the buffer
+ * @default 2e8
+ */
+ maxBufferSize?: number;
+}
+
export interface ParseOptions {
/**
* should we remove ascii < 32
@@ -92,6 +106,7 @@ export interface ParseOptions {
*/
stopNodes?: string[];
}
+
export const defaultOptions: ParseOptions = {
trimValues: true,
attributeNamePrefix: '$',
diff --git a/src/traversable/getTraversableGenerator.ts b/src/traversable/getTraversableGenerator.ts
new file mode 100644
index 0000000..a50aec8
--- /dev/null
+++ b/src/traversable/getTraversableGenerator.ts
@@ -0,0 +1,275 @@
+import { XMLNode } from '../XMLNode';
+import { arrayIndexOf } from '../bufferUtils/arrayIndexOf';
+import { arrayTrim } from '../bufferUtils/arrayTrim';
+
+import { closingIndexForOpeningTag } from './closingIndexForOpeningTag';
+import { StreamParseOptions } from './defaultOptions';
+import { findClosingIndex } from './findClosingIndex';
+import { parseAttributesString } from './parseAttributesString';
+import { concat } from './utils/concat';
+import { removeNameSpaceIfNeeded } from './utils/removeNameSpaceIfNeeded';
+import { decoder } from './utils/utf8Decoder';
+
+export async function* getTraversableGenerator(
+ readableStream: ReadableStream,
+ lookupTagName: string,
+ options: StreamParseOptions,
+) {
+ let dataSize = 0;
+ let dataIndex = 0;
+ let currentNode: XMLNode | undefined;
+ let lastMatchingClosedIndex = 0;
+ const reader = readableStream.getReader();
+ let chunk = await reader.read();
+ let endStream = chunk.done;
+ let xmlData = new Uint8Array(chunk.value);
+
+ const { maxEntrySize = 1e7, maxBufferSize = 2e8 } = options;
+
+ for (let i = 0; i < xmlData.length; i++) {
+ if (xmlData.length - i < maxEntrySize && !endStream) {
+ // TODO we should remove from xmlData what was processed
+ if (lastMatchingClosedIndex > 0) {
+ i -= lastMatchingClosedIndex;
+ xmlData = xmlData.slice(lastMatchingClosedIndex);
+ lastMatchingClosedIndex = 0;
+ }
+ let currentLength = xmlData.length;
+ const newChunks = [];
+ while (currentLength < maxBufferSize && !endStream) {
+ chunk = await reader.read();
+ endStream = chunk.done;
+ if (!endStream) {
+ const newChunk = new Uint8Array(chunk.value);
+ newChunks.push(newChunk);
+ currentLength += newChunk.length;
+ }
+ }
+
+ const newXmlData = new Uint8Array(currentLength);
+ let currentShift = 0;
+ newXmlData.set(xmlData, currentShift);
+ currentShift += xmlData.length;
+ for (let chunk of newChunks) {
+ newXmlData.set(chunk, currentShift);
+ currentShift += chunk.length;
+ }
+ xmlData = newXmlData;
+ }
+
+ if (xmlData[i] === 0x3c) {
+ // <
+ const xmlData1 = xmlData[i + 1];
+ const xmlData2 = xmlData[i + 2];
+ if (xmlData1 === 0x2f) {
+ // Closing Tag
+ const closeIndex = findClosingIndex(
+ xmlData,
+ [0x3e], //>
+ i,
+ 'Closing Tag is not closed.',
+ );
+ let tagName = decoder.decode(
+ arrayTrim(xmlData.subarray(i + 2, closeIndex), {}),
+ );
+ tagName = removeNameSpaceIfNeeded(tagName, options);
+
+ if (currentNode) {
+ const value = options.trimValues
+ ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+ : xmlData.subarray(dataIndex, dataIndex + dataSize);
+ if (currentNode.value === undefined) {
+ currentNode.value = value;
+ } else {
+ currentNode.value = concat(currentNode.value, value);
+ }
+ if (
+ options.stopNodes?.length &&
+ options.stopNodes.includes(currentNode.tagName)
+ ) {
+ currentNode.children = {};
+ if (currentNode.attributes === undefined) {
+ currentNode.attributes = {};
+ }
+ currentNode.value = xmlData.subarray(currentNode.startIndex + 1, i);
+ }
+ if (tagName === lookupTagName) {
+ yield currentNode;
+ lastMatchingClosedIndex = i;
+ }
+ currentNode = currentNode.parent as XMLNode;
+ }
+ i = closeIndex;
+ dataSize = 0;
+ dataIndex = i + 1;
+ } else if (xmlData1 === 0x3f) {
+ // PI, processing instruction
+ i = findClosingIndex(xmlData, [0x3f, 0x3e], i, 'Pi Tag is not closed.');
+ } else if (
+ //!-- comment
+ xmlData1 === 0x21 &&
+ xmlData2 === 0x2d &&
+ xmlData[i + 3] === 0x2d
+ ) {
+ i = findClosingIndex(
+ xmlData,
+ [0x2d, 0x2d, 0x3e], //-->
+ i,
+ 'Comment is not closed.',
+ );
+ if (currentNode && dataSize !== 0) {
+ if (currentNode.tagName !== '!xml') {
+ currentNode.value = concat(
+ currentNode.value,
+ options.trimValues
+ ? arrayTrim(xmlData.subarray(dataIndex, dataSize + dataIndex))
+ : xmlData.subarray(dataIndex, dataSize + dataIndex),
+ );
+ }
+ }
+ dataSize = 0;
+ dataIndex = i + 1;
+ //!D
+ } else if (xmlData1 === 0x21 && xmlData2 === 0x44) {
+ // <!DOCTYPE
+ const closeIndex = findClosingIndex(
+ xmlData,
+ [0x3e], //>
+ i,
+ 'DOCTYPE is not closed.',
+ );
+ const tagExp = xmlData.subarray(i, closeIndex);
+ if (arrayIndexOf(tagExp, [0x5b]) >= 0) {
+ i = arrayIndexOf(xmlData, [0x5d, 0x3e], i) + 1;
+ } else {
+ i = closeIndex;
+ } //![
+ } else if (xmlData1 === 0x21 && xmlData2 === 0x5b) {
+ // <![CDATA[some stuff]]>
+ const closeIndex =
+ findClosingIndex(
+ xmlData,
+ [0x5d, 0x5d, 0x3e], //]]>
+ i,
+ 'CDATA is not closed.',
+ ) - 2;
+ const tagExp = xmlData.subarray(i + 9, closeIndex);
+
+ //considerations
+ //1. CDATA will always have parent node
+ //2. A tag with CDATA is not a leaf node so it's value would be string type.
+ if (dataSize !== 0) {
+ const value = options.trimValues
+ ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+ : xmlData.subarray(dataIndex, dataIndex + dataSize);
+
+ if (currentNode) currentNode.value = concat(currentNode.value, value);
+ }
+
+ if (options.cdataTagName) {
+ //add cdata node
+ const childNode = new XMLNode(
+ options.cdataTagName,
+ currentNode,
+ tagExp,
+ );
+ if (currentNode) currentNode.addChild(childNode);
+ //add rest value to parent node
+ if (tagExp) {
+ childNode.value = tagExp;
+ }
+ } else {
+ if (currentNode) {
+ currentNode.value = concat(currentNode.value, tagExp);
+ }
+ }
+
+ i = closeIndex + 2;
+ dataSize = 0;
+ dataIndex = i + 1;
+ } else {
+ //Opening a normal tag
+ const parsedOpeningTag = closingIndexForOpeningTag(xmlData, i + 1);
+ let tagData = parsedOpeningTag.data.replace(/\r?\n|\t/g, ' ');
+ const closeIndex = parsedOpeningTag.index;
+ const separatorIndex = tagData.indexOf(' ');
+ let shouldBuildAttributesMap = true;
+ let tagName =
+ separatorIndex >= 0
+ ? tagData.substr(0, separatorIndex).replace(/\s+$/, '')
+ : tagData;
+ let tagAttributes =
+ separatorIndex >= 0 ? tagData.substr(separatorIndex + 1) : '';
+ if (options.ignoreNameSpace) {
+ const colonIndex = tagName.indexOf(':');
+ if (colonIndex !== -1) {
+ tagName = tagName.substr(colonIndex + 1);
+ shouldBuildAttributesMap =
+ tagName !== parsedOpeningTag.data.substr(colonIndex + 1);
+ }
+ }
+
+ //save text to parent node
+ if (currentNode && dataSize !== 0) {
+ if (currentNode.tagName !== '!xml') {
+ currentNode.value = concat(
+ currentNode.value,
+ options.trimValues
+ ? arrayTrim(xmlData.subarray(dataIndex, dataIndex + dataSize))
+ : xmlData.subarray(dataIndex, dataIndex + dataSize),
+ );
+ }
+ }
+
+ if (tagData.length > 0 && tagData.endsWith('/')) {
+ // selfClosing tag
+ // TODO we should check if it match the tag and crete the currentNode
+ if (currentNode) {
+ if (tagAttributes) {
+ //
+ tagAttributes = tagAttributes.substr(0, tagAttributes.length - 1);
+ } else {
+ //
+ tagName = tagName.substr(0, tagName.length - 1);
+ }
+
+ const childNode = new XMLNode(tagName, currentNode, '');
+ if (tagAttributes) {
+ childNode.attributes = parseAttributesString(
+ tagAttributes,
+ options,
+ );
+ }
+ currentNode.addChild(childNode);
+ }
+ } else {
+ //opening tag
+
+ if (currentNode || tagName === lookupTagName) {
+ const childNode = new XMLNode(tagName, currentNode);
+ if (
+ options.stopNodes?.length &&
+ options.stopNodes.includes(childNode.tagName)
+ ) {
+ childNode.startIndex = closeIndex;
+ }
+ if (tagAttributes && shouldBuildAttributesMap) {
+ childNode.attributes = parseAttributesString(
+ tagAttributes,
+ options,
+ );
+ }
+ if (currentNode) currentNode.addChild(childNode);
+ currentNode = childNode;
+ }
+ }
+
+ i = closeIndex;
+ dataSize = 0;
+ dataIndex = i + 1;
+ }
+ } else {
+ dataSize++;
+ }
+ }
+}