/
index.js
167 lines (153 loc) · 5.01 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* eslint-disable security/detect-non-literal-fs-filename */
const autoParse = require("auto-parse");
const fixUtf8 = require("fix-utf8");
const fp = require("fastify-plugin");
const fs = require("fs").promises;
const glob = require("glob");
const { JSDOM } = require("jsdom");
const path = require("upath");
const { Poppler } = require("node-poppler");
const { v4 } = require("uuid");
/**
 * @author Frazer Smith
 * @description Pre-handler plugin that uses Poppler to convert Buffer containing
 * PDF file in `req.body` to HTML and places HTML file in a temporary directory.
 * `req` object is decorated with `conversionResults` object detailing document
 * location and contents.
 *
 * Three hooks are registered:
 * - `onRequest` initialises `req.conversionResults`.
 * - `preHandler` runs the conversion and stores the resulting HTML in
 * `req.conversionResults.body`.
 * - `onSend` deletes the temporary files generated for the request.
 *
 * Poppler options are resolved per request in this order of precedence
 * (lowest first): built-in defaults, `options.pdfToHtmlOptions`, accepted
 * query string params. The `options` object itself is never mutated.
 * @param {Function} server - Fastify instance.
 * @param {object} options - Plugin config values.
 * @param {string} options.binPath - Path to Poppler binary.
 * @param {object} options.pdfToHtmlOptions - Refer to
 * https://github.com/Fdawgs/node-poppler/blob/master/API.md#Poppler+pdfToHtml
 * for options.
 * @param {string} options.pdfToHtmlOptions.outputEncoding - Sets the encoding to use for text output.
 * @param {string=} options.tempDirectory - Directory for temporarily storing
 * files during conversion.
 */
async function plugin(server, options) {
    // Query string params that are allowed to be forwarded to Poppler
    const pdfToHtmlAcceptedParams = new Set([
        "exchangePdfLinks",
        "extractHidden",
        "firstPageToConvert",
        "ignoreImages",
        "imageFormat",
        "lastPageToConvert",
        "noDrm",
        "noMergeParagraph",
        "outputEncoding",
        "ownerPassword",
        "userPassword",
        "wordBreakThreshold",
        "zoom",
    ]);

    server.addHook("onRequest", async (req) => {
        req.conversionResults = { body: undefined };
    });

    // "onSend" hook used instead of "onResponse" ensures
    // cancelled request temp data is also removed
    server.addHook("onSend", async (req, res) => {
        if (req?.conversionResults?.docLocation) {
            // Every file generated for this request shares the `id` prefix
            const { directory, id } = req.conversionResults.docLocation;
            const files = glob.sync(`${path.joinSafe(directory, id)}*`);
            await Promise.all(files.map((file) => fs.unlink(file)));
        }
        // NOTE(review): Fastify treats the resolved value of an async "onSend"
        // hook as the replacement payload - confirm returning `res` is intended
        return res;
    });

    server.addHook("preHandler", async (req, res) => {
        /**
         * Build a pruned copy of the query string params (some of the params
         * may be used in other plugins), converting the accepted ones to
         * literal values to allow Poppler module to use them
         */
        const query = Object.fromEntries(
            Object.entries({ ...req.query })
                .filter(([key]) => pdfToHtmlAcceptedParams.has(key))
                .map(([key, value]) => [key, autoParse(value)])
        );

        /**
         * Default settings the plugin needs to get up and running, overridden
         * by plugin options and then by query string params.
         * `pdfToHtmlOptions` is rebuilt as a fresh object for every request so
         * that params from one request cannot leak into `options` (and thus
         * into following requests), and so that a partial
         * `options.pdfToHtmlOptions` does not discard the defaults
         */
        const config = {
            binPath: undefined,
            tempDirectory: path.joinSafe(__dirname, "..", "temp"),
            ...options,
            pdfToHtmlOptions: {
                complexOutput: true,
                outputEncoding: "UTF-8",
                singlePage: true,
                ...options?.pdfToHtmlOptions,
                ...query,
            },
        };

        const directory = path.normalizeTrim(config.tempDirectory);
        const poppler = new Poppler(config.binPath);

        // Create temp directory if missing; `recursive` makes this a no-op
        // when the directory already exists
        await fs.mkdir(directory, { recursive: true });

        // Build temporary file for Poppler to write to, and following plugins to read from
        const id = v4();
        const tempFile = path.joinSafe(directory, id);
        req.conversionResults.docLocation = {
            directory,
            html: tempFile,
            id,
        };

        try {
            await poppler.pdfToHtml(
                req.body,
                `${tempFile}.html`,
                config.pdfToHtmlOptions
            );
        } catch (err) {
            /**
             * Poppler will throw if the .pdf file provided
             * by client is malformed, thus client error code
             */
            /* istanbul ignore else */
            if (/Syntax Error:/.test(err)) {
                throw res.badRequest();
            } else {
                throw err;
            }
        }

        // Poppler appends `-html` to the file name, thus the template literal here
        const dom = new JSDOM(
            await fs.readFile(`${tempFile}-html.html`, {
                encoding: config.pdfToHtmlOptions.outputEncoding,
            })
        );

        // Remove excess title and meta tags left behind by Poppler,
        // keeping only the first of each
        ["title", "meta"].forEach((selector) => {
            const elements = dom.window.document.querySelectorAll(selector);
            for (let index = 1; index < elements.length; index += 1) {
                elements[index].parentNode.removeChild(elements[index]);
            }
        });

        /**
         * `fixUtf8` function replaces most common incorrectly converted
         * Windows-1252 to UTF-8 results with HTML equivalents.
         * Refer to https://www.i18nqa.com/debug/utf8-debug.html for more info.
         */
        req.conversionResults.body = fixUtf8(dom.serialize());

        res.header(
            "content-type",
            `text/html; charset=${config.pdfToHtmlOptions.outputEncoding.toLowerCase()}`
        );
    });
}
// Metadata handed to fastify-plugin alongside the plugin function:
// the supported Fastify version range, the plugin's registered name,
// and the plugins that must be registered before this one
const pluginMetadata = {
    fastify: "3.x",
    name: "pdf-to-html",
    dependencies: ["fastify-sensible"],
};

module.exports = fp(plugin, pluginMetadata);