Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[api-minor] Include the document /Lang attribute in the textContent-data #17941

Merged
merged 1 commit into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/core/catalog.js
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ class Catalog {
return shadow(
this,
"lang",
typeof lang === "string" ? stringToPDFString(lang) : null
lang && typeof lang === "string" ? stringToPDFString(lang) : null
);
}

Expand Down
59 changes: 31 additions & 28 deletions src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -395,10 +395,9 @@ class Page {
}

loadResources(keys) {
if (!this.resourcesPromise) {
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
}
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");

return this.resourcesPromise.then(() => {
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
return objectLoader.load();
Expand Down Expand Up @@ -625,7 +624,7 @@ class Page {
});
}

extractTextContent({
async extractTextContent({
handler,
task,
includeMarkedContent,
Expand All @@ -639,31 +638,35 @@ class Page {
"Properties",
"XObject",
]);
const langPromise = this.pdfManager.ensureCatalog("lang");

const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
return dataPromises.then(([contentStream]) => {
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
const [contentStream, , lang] = await Promise.all([
contentStreamPromise,
resourcesPromise,
langPromise,
]);
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});

return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
lang,
});
}

Expand Down
3 changes: 3 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2307,6 +2307,7 @@ class PartialEvaluator {
sink,
seenStyles = new Set(),
viewBox,
lang = null,
markedContentData = null,
disableNormalization = false,
keepWhiteSpace = false,
Expand All @@ -2323,6 +2324,7 @@ class PartialEvaluator {
const textContent = {
items: [],
styles: Object.create(null),
lang,
};
const textContentItem = {
initialized: false,
Expand Down Expand Up @@ -3296,6 +3298,7 @@ class PartialEvaluator {
sink: sinkWrapper,
seenStyles,
viewBox,
lang,
markedContentData,
disableNormalization,
keepWhiteSpace,
Expand Down
3 changes: 3 additions & 0 deletions src/display/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name.
* @property {string | null} lang - The document /Lang attribute.
*/

/**
Expand Down Expand Up @@ -1677,6 +1678,7 @@ class PDFPageProxy {
resolve(textContent);
return;
}
textContent.lang ??= value.lang;
Snuffleupagus marked this conversation as resolved.
Show resolved Hide resolved
Object.assign(textContent.styles, value.styles);
textContent.items.push(...value.items);
pump();
Expand All @@ -1687,6 +1689,7 @@ class PDFPageProxy {
const textContent = {
items: [],
styles: Object.create(null),
lang: null,
};
pump();
});
Expand Down
24 changes: 14 additions & 10 deletions src/display/text_layer.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
const ascentCache = new Map();
let _canvasContext = null;

function getCtx() {
function getCtx(lang = null) {
if (!_canvasContext) {
// We don't use an OffscreenCanvas here because we use serif/sans serif
// fonts with it and they depend on the locale.
Expand All @@ -89,13 +89,13 @@ function cleanupTextLayer() {
_canvasContext = null;
}

function getAscent(fontFamily) {
function getAscent(fontFamily, lang) {
const cachedAscent = ascentCache.get(fontFamily);
if (cachedAscent) {
return cachedAscent;
}

const ctx = getCtx();
const ctx = getCtx(lang);

const savedFont = ctx.font;
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
Expand Down Expand Up @@ -162,7 +162,7 @@ function getAscent(fontFamily) {
return DEFAULT_FONT_ASCENT;
}

function appendText(task, geom) {
function appendText(task, geom, lang) {
// Initialize all used properties to keep the caches monomorphic.
const textDiv = document.createElement("span");
const textDivProperties = {
Expand All @@ -184,7 +184,7 @@ function appendText(task, geom) {
const fontFamily =
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
const fontHeight = Math.hypot(tx[2], tx[3]);
const fontAscent = fontHeight * getAscent(fontFamily);
const fontAscent = fontHeight * getAscent(fontFamily, lang);

let left, top;
if (angle === 0) {
Expand Down Expand Up @@ -324,7 +324,7 @@ class TextLayerRenderTask {
div: null,
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
properties: null,
ctx: getCtx(),
ctx: null,
};
this._styleCache = Object.create(null);
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
Expand Down Expand Up @@ -371,7 +371,11 @@ class TextLayerRenderTask {
/**
* @private
*/
_processItems(items) {
_processItems(items, lang) {
if (!this._layoutTextParams.ctx) {
this._textDivProperties.set(this._rootContainer, { lang });
this._layoutTextParams.ctx = getCtx(lang);
}
const textDivs = this._textDivs,
textContentItemsStr = this._textContentItemsStr;

Expand Down Expand Up @@ -403,7 +407,7 @@ class TextLayerRenderTask {
continue;
}
textContentItemsStr.push(item.str);
appendText(this, item);
appendText(this, item, lang);
}
}

Expand Down Expand Up @@ -440,7 +444,7 @@ class TextLayerRenderTask {
}

Object.assign(styleCache, value.styles);
this._processItems(value.items);
this._processItems(value.items, value.lang);
pump();
}, this._capability.reject);
};
Expand Down Expand Up @@ -476,7 +480,7 @@ function updateTextLayer({
}

if (mustRescale) {
const ctx = getCtx();
const ctx = getCtx(textDivProperties.get(container)?.lang);
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
const params = {
prevFontSize: null,
Expand Down
6 changes: 4 additions & 2 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -3128,10 +3128,11 @@ describe("api", function () {
});

it("gets text content", async function () {
const { items, styles } = await page.getTextContent();
const { items, styles, lang } = await page.getTextContent();

expect(items.length).toEqual(15);
expect(objectSize(styles)).toEqual(5);
expect(lang).toEqual("en");

const text = mergeText(items);
expect(text).toEqual(`Table Of Content
Expand All @@ -3146,13 +3147,14 @@ page 1 / 3`);
);
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items, styles } = await pdfPage.getTextContent({
const { items, styles, lang } = await pdfPage.getTextContent({
disableNormalization: true,
});
expect(items.length).toEqual(1);
// Font name will be a random object id.
const fontName = items[0].fontName;
expect(Object.keys(styles)).toEqual([fontName]);
expect(lang).toEqual(null);

expect(items[0]).toEqual({
dir: "ltr",
Expand Down