-
Notifications
You must be signed in to change notification settings - Fork 340
/
TextSpan.py
439 lines (351 loc) · 15.5 KB
/
TextSpan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
'''Text Span object based on the PDF raw dict extracted with ``PyMuPDF``.

For the data structure of a span, refer to
this `link <https://pymupdf.readthedocs.io/en/latest/textpage.html>`_::

    {
        # raw dict
        ---------------------------
        'bbox': (x0,y0,x1,y1),
        'color': sRGB,
        'font': fontname,
        'size': fontsize,
        'flags': fontflags,
        'chars': [ chars ],

        # added dict
        ----------------------------
        'text': text,
        'style': [
            {
                'type': int,
                'color': int,
                'uri': str # for hyperlink
            },
            ...
        ]
    }
'''
import fitz
from docx.shared import Pt, RGBColor
from docx.oxml.ns import qn
from .Char import Char
from ..common.Element import Element
from ..common.share import (RectType, rgb_value, rgb_component, decode)
from ..common import constants
from ..common import docx
from ..shape.Shape import Shape
class TextSpan(Element):
'''Object representing text span.'''
def __init__(self, raw:dict=None):
    '''Initialize the text span from a raw dict.

    Args:
        raw (dict, optional): Span dict. Besides the keys extracted with ``PyMuPDF``,
            the added keys written by ``store()`` (e.g. ``text``, ``style``) are
            read as well. Missing keys fall back to the defaults used below.
    '''
    raw = raw or {}
    self.color = raw.get('color', 0)
    self.flags = raw.get('flags', 0)

    # create chars, then filter out the empty ones;
    # the replacement character is ignored as well, see issue#256
    chars = [Char(c) for c in raw.get('chars', [])] # type: list[Char]
    self.chars = [char for char in chars if char.c not in ('', '\ufffd')]

    # text used when no chars are available: not an original key from PyMuPDF
    self._text = raw.get('text', '')

    # font metrics
    # line_height is the standard single line height used in relative line spacing,
    # while exact line spacing is used when line_height==-1 by default.
    self.font = decode(raw.get('font', '')) # in case unicode in font name
    self.size = raw.get('size', 12.0)
    self.ascender = raw.get('ascender', 1.0)
    self.descender = raw.get('descender', 0.0)
    self.line_height = raw.get('line_height', -1) # not an original key

    # introduced attributes
    # a list of dict: { 'type': int, 'color': int }
    # NOTE: take a copy of the list, so styles appended to this span later are not
    # written back into the input dict (or shared with other spans created from the
    # same dict); a stored null value is treated as no style.
    self.style = list(raw.get('style') or [])

    # char spacing between adjacent two chars -> pdf operator Tc
    # positive to expand space, otherwise condense
    # just an attribute placeholder: not used yet
    self.char_spacing = raw.get('char_spacing', 0.0)

    # init text span element
    super().__init__(raw)

    # in rare case, the font is unnamed, so change font and update bbox accordingly
    if self.chars and 'UNNAMED' in self.font.upper():
        self._change_font_and_update_bbox(constants.DEFAULT_FONT_NAME)
@property
def text(self):
    '''Span text: joined from the contained chars if there are any, otherwise the
    text stored directly on the span.'''
    if self.chars:
        return ''.join(char.c for char in self.chars)
    return self._text

@text.setter
def text(self, value):
    '''Store span text directly. It is only returned by the getter when no chars
    are stored, e.g. a span restored from json.'''
    self._text = value
def cal_bbox(self):
    '''Calculate the bbox as the union of the bboxes of all contained chars.

    Returns:
        fitz.Rect: Union of the char bboxes; a default ``fitz.Rect()`` if no chars.
    '''
    union = fitz.Rect()
    for item in self.chars:
        union |= item.bbox
    return union
@property
def is_valid_line_height(self):
    '''Whether a line height is set, i.e. ``line_height`` is not the default -1.'''
    return self.line_height != -1
def _change_font_and_update_bbox(self, font_name:str):
    '''Set new font, and update font size, span/char bbox accordingly.

    It's generally used for span with unnamed fonts.
    See this `issue <https://github.com/pymupdf/PyMuPDF/issues/642>`_.

    In corner case, where the PDF file contains unnamed and not embedded fonts, the
    span bbox extracted from ``PyMuPDF`` is not correct. ``PyMuPDF`` provides a feature
    to replace these unnamed fonts with specified fonts, then extract correct bbox from
    the updated PDF. Since we care less about the original PDF itself but its layout,
    the idea here is to set a default font for text spans with unnamed fonts, and
    estimate the updated bbox with method from ``fitz.TextWriter``.

    .. note::
        The span must contain at least one char, since the origin of the first char
        is used as the insertion point.

    Args:
        font_name (str): Font name.
    '''
    # set new font property
    self.font = font_name
    content = self.text

    # text length under the new font with current size:
    # shrink the font size if the text gets wider than the original span
    new_font = fitz.Font(font_name)
    text_width = new_font.text_length(content, fontsize=self.size)
    if text_width > self.bbox.width:
        self.size *= self.bbox.width / text_width

    # estimate occupied rect when added with TextWriter
    left, _, right, bottom = self.bbox
    writer = fitz.TextWriter((0, 0, right, bottom))
    text_rect, _ = writer.append(
        self.chars[0].origin, # the bottom left point of the first character
        content,
        font=new_font,
        fontsize=self.size
    )

    # update span bbox
    # - x-direction: use original horizontal range
    # - y-direction: centerline defined by estimated vertical range, and height by font size
    margin = (text_rect.height - self.size) / 2.0
    top = text_rect.y0 + margin
    bottom = text_rect.y1 - margin
    self.update_bbox((left, top, right, bottom))

    # update contained char bbox: keep horizontal range, apply the new vertical range
    for char in self.chars:
        char_left, _, char_right, _ = char.bbox
        char.update_bbox((char_left, top, char_right, bottom))
def add(self, char:Char):
    '''Append a char to this span and extend the span bbox to include it.

    Args:
        char (Char): Char to add.
    '''
    self.chars.append(char)
    self.union_bbox(char)
def lstrip(self):
    '''Remove blanks at the left side, but keep one blank.

    Nothing is done unless the span text starts with at least two spaces.

    Returns:
        bool: ``True`` if chars were removed and the bbox was updated, otherwise
        ``False``.
    '''
    # A span without chars (e.g. restored from json) has nothing to remove, and its
    # bbox can't be recalculated from chars: leave it untouched rather than resetting
    # the bbox to an empty rect.
    if not self.chars:
        return False

    original_text = self.text
    if not original_text.startswith(' '*2):
        return False

    # count the leading blanks, then drop all of them but the last one
    num_blanks = len(original_text) - len(original_text.lstrip())
    self.chars = self.chars[num_blanks-1:]
    self.update_bbox(rect=self.cal_bbox())
    return True
def rstrip(self):
    '''Remove blanks at the right side, but keep one blank.

    Nothing is done unless the span text ends with at least two spaces.

    Returns:
        bool: ``True`` if chars were removed and the bbox was updated, otherwise
        ``False``.
    '''
    # A span without chars (e.g. restored from json) has nothing to remove, and its
    # bbox can't be recalculated from chars: leave it untouched rather than resetting
    # the bbox to an empty rect.
    if not self.chars:
        return False

    original_text = self.text
    if not original_text.endswith(' '*2):
        return False

    # count the trailing blanks, then drop all of them but the first one
    num_blanks = len(original_text) - len(original_text.rstrip())
    self.chars = self.chars[:1-num_blanks]
    self.update_bbox(rect=self.cal_bbox())
    return True
def store(self):
    '''Store the span properties in a dict.

    The dict created by the parent class is extended with the span attributes.
    The chars are not stored for space saving; the joined text is stored instead.

    Returns:
        dict: Stored span properties.
    '''
    data = super().store()
    data.update(
        color=self.color,
        font=self.font,
        size=self.size,
        line_height=self.line_height,
        flags=self.flags,
        text=self.text,
        style=self.style,
        char_spacing=self.char_spacing,
    )
    return data
def plot(self, page, color:tuple):
    '''Plot the span with the parent class method, using ``color`` for both stroke
    and fill, and a zero border width.

    Args:
        page: Target page passed through to the parent ``plot``.
        color (tuple): Color used for both stroke and fill.
    '''
    super().plot(page, stroke=color, fill=color, width=0)
def split(self, rect:Shape, horizontal:bool=True):
    """Split span with the intersection: span-intersection-span.

    The chars covered by ``rect`` form the middle part, for which the text style
    represented by ``rect`` is parsed; the chars before and after it keep the
    original style.

    Args:
        rect (Shape): Target shape to split this text span.
        horizontal (bool, optional): Text direction. Defaults to True.

    Returns:
        list: Split text spans. ``[self]`` is returned when ``rect`` doesn't intersect
        this span, or when it covers none of the chars.
    """
    # any intersection in this span?
    # NOTE: didn't consider the case that an underline is out of a span
    intsec = rect.bbox & self.bbox

    # no, then add this span as it is
    # Note the case bool(intsec)=True but intsec.get_area()=0
    if intsec.is_empty:
        return [self]

    # chars in the format rectangle, each one paired with its index in this span
    index_chars = [
        (i, char) for i, char in enumerate(self.chars)
        if char.contained_in_rect(rect, horizontal)
    ]

    # intersected, but no char is covered: there's nothing to format, so keep the span
    # as it is. Otherwise all chars would end up in a "right part" whose bbox is cut
    # down to the region behind the intersection (or the span would be dropped entirely
    # when it has no chars at all).
    if not index_chars:
        return [self]

    # expand the intersection area, e.g. for strike through line,
    # the intersection is a `line`, i.e. a rectangle with very small height,
    # so expand the height direction to span height
    if horizontal:
        intsec.y0 = self.bbox.y0
        intsec.y1 = self.bbox.y1
    else:
        intsec.x0 = self.bbox.x0
        intsec.x1 = self.bbox.x1

    # the target chars are taken as a continuous sequence: [pos, pos_end)
    pos = index_chars[0][0]
    pos_end = pos + len(index_chars)

    # yes, then split spans:
    # - add new style to the intersection part
    # - keep the original style for the rest
    split_spans = [] # type: list[TextSpan]

    # left part if exists
    if pos > 0:
        if horizontal:
            bbox = (self.bbox.x0, self.bbox.y0, intsec.x0, self.bbox.y1)
        else:
            bbox = (self.bbox.x0, intsec.y1, self.bbox.x1, self.bbox.y1)
        split_span = self.copy().update_bbox(bbox)
        split_span.chars = self.chars[0:pos]
        split_spans.append(split_span)

    # middle intersection part: always exists at this point
    bbox = (intsec.x0, intsec.y0, intsec.x1, intsec.y1)
    split_span = self.copy().update_bbox(bbox)
    split_span.chars = self.chars[pos:pos_end]
    split_span._parse_text_format(rect, horizontal) # update style
    split_spans.append(split_span)

    # right part if exists
    if pos_end < len(self.chars):
        if horizontal:
            bbox = (intsec.x1, self.bbox.y0, self.bbox.x1, self.bbox.y1)
        else:
            bbox = (self.bbox.x0, self.bbox.y0, self.bbox.x1, intsec.y0)
        split_span = self.copy().update_bbox(bbox)
        split_span.chars = self.chars[pos_end:]
        split_spans.append(split_span)

    return split_spans
def _parse_text_format(self, rect:Shape, horizontal:bool=True):
    """Parse text style based on the position to a rect shape.

    A hyperlink shape is recorded directly. Any other shape is classified as
    highlight, underline or strike-through by comparing its extent and position with
    this span; the detected type is written to ``rect.type``, and a style entry is
    appended to ``self.style``.

    Args:
        rect (Shape): Target rect shape representing potential text style.
        horizontal (bool, optional): Horizontal text direction. Defaults to True.

    Returns:
        bool: Parsed text style successfully or not.
    """
    # table border/shading is not a text style: skip
    if any(rect.equal_to_type(t) for t in (RectType.BORDER, RectType.SHADING)):
        return False

    # hyperlink: no geometry check needed
    if rect.equal_to_type(RectType.HYPERLINK):
        link_style = {
            'type': rect.type,
            'color': rect.color,
            'uri': rect.uri
        }
        self.style.append(link_style)
        return True

    # bbox components across the text direction:
    # (y0, y1) for horizontal text, (x0, x1) otherwise
    lower, upper = (1, 3) if horizontal else (0, 2)

    # region height of the rect and the span it is applied to
    rect_height = rect.bbox[upper] - rect.bbox[lower]
    span_height = self.bbox[upper] - self.bbox[lower]

    # distance to span bottom border
    distance = abs(self.bbox[upper] - rect.bbox[lower])

    if rect_height >= 0.5*span_height:
        # highlight: both the rect height and overlap must be large enough;
        # in general, highlight color isn't white
        if rect.color != rgb_value((1,1,1)) and \
                self.get_main_bbox(rect, constants.FACTOR_MAJOR):
            rect.type = RectType.HIGHLIGHT

    elif distance <= 0.25*span_height:
        # near to bottom of span -> underline
        rect.type = RectType.UNDERLINE

    elif 0.35*span_height < distance < 0.75*span_height:
        # near to center of span -> strike-through-line
        rect.type = RectType.STRIKE

    # check rect type again: no style is added for a still undetermined shape
    if not rect.is_determined:
        return False

    self.style.append({
        'type': rect.type,
        'color': rect.color
    })
    return True
def intersects(self, rect):
    '''Create new TextSpan object with chars contained in given bbox.

    Args:
        rect (fitz.Rect): Target bbox.

    Returns:
        TextSpan: A copy of this span if it's fully contained in ``rect``; an empty
        span if there's no intersection; otherwise a copy holding only the chars
        with at least a half part in ``rect``.
    '''
    # fully contained in bbox: take the entire span
    if rect.contains(self.bbox):
        return self.copy()

    # no intersection at all: empty span
    if not rect.intersects(self.bbox):
        return TextSpan()

    # partly intersected: start from a copy without chars and bbox,
    # then collect the chars one by one
    clipped = self.copy()
    clipped.chars.clear()
    clipped.update_bbox((0.0,0.0,0.0,0.0))

    for char in self.chars:
        # skip chars with less than a half part in the target bbox
        if not char.get_main_bbox(rect, constants.FACTOR_A_HALF):
            continue
        clipped.chars.append(char)
        clipped.union_bbox(char)

    return clipped
def make_docx(self, paragraph):
    '''Add text span to a docx paragraph, and set text style, e.g.
    font, color, underline, hyperlink, etc.

    .. note::
        Hyperlink and its style is parsed separately from pdf. For instance, regarding
        a general hyperlink with an underline, the text and uri is parsed as hyperlink
        itself, while the underline is treated as a normal text style.

    Args:
        paragraph: Target docx paragraph the run/hyperlink is added to.
    '''
    # first hyperlink style of a non-blank span, if any
    hyperlink = next(
        (style for style in self.style
            if style['type']==RectType.HYPERLINK.value and self.text.strip()),
        None
    )

    # create hyperlink in particular, otherwise add a run directly
    if hyperlink is None:
        run = paragraph.add_run(self.text)
    else:
        run = docx.add_hyperlink(paragraph, hyperlink['uri'], self.text)

    # set text style, e.g. font, underline and highlight
    self._set_text_format(run)

    # set character spacing
    if self.char_spacing:
        docx.set_char_spacing(run, self.char_spacing)
def _set_text_format(self, docx_run):
    '''Set text format for ``python-docx.run`` object.

    Font flags, font name, color and size are applied first, then the styles parsed
    from PDF rectangles, i.e. highlight, underline and strike-through-line.

    Args:
        docx_run: Target run, see
            https://python-docx.readthedocs.io/en/latest/api/text.html#docx.text.run.Font
    '''
    # basic font style
    # `flags` is an integer, encoding bool of font properties:
    # bit 0: superscripted (2^0)
    # bit 1: italic (2^1)
    # bit 2: serifed (2^2)
    # bit 3: monospaced (2^3)
    # bit 4: bold (2^4)
    flags = self.flags
    docx_run.superscript = bool(flags & 1)
    docx_run.italic = bool(flags & 2)
    docx_run.bold = bool(flags & 16)

    # font name, set for CJK characters in addition
    name = self.font
    docx_run.font.name = name
    docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), name)

    # font color
    docx_run.font.color.rgb = RGBColor(*rgb_component(self.color))

    # font size
    # NOTE: only x.0 and x.5 is accepted in docx, so set character scaling accordingly
    # if the font size doesn't meet this condition.
    rounded_size = round(self.size*2)/2.0
    docx_run.font.size = Pt(rounded_size)

    # adjust by set scaling: ratio of the real size to the rounded one
    scale = self.size / (rounded_size or self.size or 1)
    if abs(scale-1.0)>=0.01:
        docx.set_char_scaling(docx_run, scale)

    # font style parsed from PDF rectangles:
    # e.g. highlight, underline, strike-through-line
    for style in self.style:
        kind = style['type']

        if kind==RectType.HIGHLIGHT.value:
            # Built-in method is provided to set highlight in python-docx, but supports
            # only limited colors; so, set character shading instead.
            docx.set_char_shading(docx_run, style['color'])

        elif kind==RectType.UNDERLINE.value:
            # underline set with built-in method `font.underline` has a same color with
            # text. So, set a different color with xml if necessary
            if self.color==style['color']:
                docx_run.font.underline = True
            else:
                docx.set_char_underline(docx_run, style['color'])

        elif kind==RectType.STRIKE.value:
            # same color with text for strike line
            docx_run.font.strike = True