Fixed font name encoding issue: #194, #246

ArtifexSoftware · Jan 5, 2024 · 248a319 · 248a319
1 parent 769a220
commit 248a319
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 1 deletion.
diff --git a/pdf2docx/font/Fonts.py b/pdf2docx/font/Fonts.py
@@ -69,6 +69,7 @@ def extract(cls, fitz_doc):
         fonts = []
         for xref in xrefs:
             basename, ext, _, buffer = fitz_doc.extract_font(xref)
+            basename = bytes(ord(c) for c in basename).decode()
             name = cls._normalized_font_name(basename)
 
             try:

diff --git a/pdf2docx/text/TextSpan.py b/pdf2docx/text/TextSpan.py
@@ -55,7 +55,8 @@ def __init__(self, raw:dict=None):
         # font metrics
         # line_height is the standard single line height used in relative line spacing,
         # while exact line spacing is used when line_height==-1 by default.
-        self.font = raw.get('font', '')
+        font_name = raw.get('font', '')
+        self.font = bytes(ord(c) for c in font_name).decode() # in case unicode in font name
         self.size = raw.get('size', 12.0)
         self.ascender = raw.get('ascender', 1.0)
         self.descender = raw.get('descender', 0.0)