-
-
Notifications
You must be signed in to change notification settings - Fork 229
/
core.py
362 lines (314 loc) · 15.9 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# pylint:disable-msg=E0611,I1101
"""
Extraction configuration and processing functions.
"""
import logging
import sys
import warnings
from copy import deepcopy
from lxml.etree import XPath, strip_tags
# own
from .baseline import baseline
from .external import compare_extraction
from .filters import (LANGID_FLAG, check_html_lang, duplicate_test,
language_filter)
from .hashing import content_fingerprint
from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning
from .main_extractor import extract_comments, extract_content
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, Extractor, use_config
from .utils import load_html, normalize_unicode
from .xml import build_json_output, control_xml_output, xmltotxt, xmltocsv
from .xpaths import REMOVE_COMMENTS_XPATH
LOGGER = logging.getLogger(__name__)
def determine_returnstring(document, options):
    """Convert the XML tree to the requested output format, clean the
    result, and return it as a single normalized string."""
    # XML (TEI) steps
    if 'xml' in options.format:
        # last cleaning pass: drop empty leaf elements, keeping <graphic>
        # nodes and anything sitting inside <code> (formatting is preserved there)
        for node in document.body.iter('*'):
            is_empty_leaf = len(node) == 0 and not node.text and not node.tail
            if node.tag != 'graphic' and is_empty_leaf:
                container = node.getparent()
                if container is not None and container.tag != 'code':
                    container.remove(node)
        # build output tree
        output = control_xml_output(document, options)
    # CSV
    elif options.format == 'csv':
        output = xmltocsv(document, options.formatting)
    # JSON
    elif options.format == 'json':
        output = build_json_output(document)
    # Markdown and TXT
    else:
        output = xmltotxt(document.body, options.formatting)
        if document.commentsbody is not None:
            comments_text = xmltotxt(document.commentsbody, options.formatting)
            output = f"{output}\n{comments_text}".strip()
    # normalize Unicode format (defaults to NFC)
    return normalize_unicode(output)
def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                    favor_precision=False, favor_recall=False,
                    include_comments=True, output_format="python", target_language=None,
                    include_tables=True, include_images=False, include_formatting=False,
                    include_links=False, deduplicate=False,
                    date_extraction_params=None,
                    only_with_metadata=False, with_metadata=False,
                    max_tree_size=None, url_blacklist=None, author_blacklist=None,
                    as_dict=True, prune_xpath=None,
                    config=DEFAULT_CONFIG, options=None):
    """Internal function for text extraction returning bare Python variables.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        no_fallback: Use faster heuristics and skip backup extraction.
        favor_precision: prefer less text but correct extraction.
        favor_recall: prefer more text even when unsure.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format, Python being the default
            and the interest of this internal function.
            Other values: "csv", "json", "markdown", "txt", "xml", and "xmltei".
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (present in XML format, converted to markdown otherwise).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        with_metadata: Deprecated, use only_with_metadata instead.
        max_tree_size: Discard documents with too many elements.
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        as_dict: Legacy option, return a dictionary instead of a class with attributes.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A Python dict() containing all the extracted information or None.

    Raises:
        ValueError: Extraction problem.
    """
    # deprecation warnings
    if with_metadata is True:
        only_with_metadata = with_metadata
        warnings.warn(
            # message fixed: the closing quote now encloses only the option name
            '"with_metadata" will be deprecated in a future version, use "only_with_metadata" instead',
            PendingDeprecationWarning
        )
    # planned follow-up deprecation, kept for reference:
    #if no_fallback is True:
    #    fast = no_fallback
    #warnings.warn(
    #    '"no_fallback" will be deprecated in a future version, use "fast" instead',
    #    PendingDeprecationWarning
    #)

    # load data
    try:
        tree = load_html(filecontent)
        if tree is None:
            LOGGER.error('empty HTML tree: %s', url)
            raise ValueError

        # regroup extraction options into a single object
        if not options or not isinstance(options, Extractor):
            options = Extractor(
                config=config, output_format=output_format,
                fast=no_fallback, precision=favor_precision, recall=favor_recall,
                comments=include_comments, formatting=include_formatting, links=include_links,
                images=include_images, tables=include_tables,
                dedup=deduplicate, lang=target_language, max_tree_size=max_tree_size,
                url=url, only_with_metadata=only_with_metadata,
                author_blacklist=author_blacklist, url_blacklist=url_blacklist,
                date_params=date_extraction_params
            )

        # quick and dirty HTML lang check
        if options.lang and (options.fast or LANGID_FLAG is False):
            if check_html_lang(tree, options.lang) is False:
                LOGGER.error('wrong HTML meta language: %s', options.source)
                raise ValueError

        # extract metadata if necessary
        if options.format not in ("markdown", "txt"):
            document = extract_metadata(tree, options.url, options.date_params, options.fast, options.author_blacklist)
            # cut short if extracted URL in blacklist
            if document.url in options.url_blacklist:
                LOGGER.warning('blacklisted URL: %s', document.url)
                raise ValueError
            # cut short if core elements are missing
            if options.only_with_metadata and any(
                x is None for x in
                [document.date, document.title, document.url]
            ):
                LOGGER.error('no metadata: %s', options.source)
                raise ValueError
        else:
            document = Document()

        # prune all xpath expressions that the user specified
        # no backup as this is under full control of the user
        if prune_xpath is not None:
            if isinstance(prune_xpath, str):
                prune_xpath = [prune_xpath]
            tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

        # backup (or not) for further processing
        tree_backup_1 = deepcopy(tree) if not options.fast else None
        tree_backup_2 = deepcopy(tree)

        # clean + use LXML cleaner
        cleaned_tree = tree_cleaning(tree, options)
        cleaned_tree_backup = deepcopy(cleaned_tree)

        # convert tags, the rest does not work without conversion
        cleaned_tree = convert_tags(cleaned_tree, options, options.url or document.url)

        # comments first, then remove
        if options.comments:
            commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
        else:
            commentsbody, temp_comments, len_comments = None, '', 0
        if options.precision:
            cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

        # extract content
        postbody, temp_text, len_text = extract_content(cleaned_tree, options)

        # compare with external extractors if the fast mode is off
        if not options.fast:
            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, postbody, temp_text, len_text, options)
        # add baseline as additional fallback
        # rescue: try to use the original/dirty tree
        if len_text < options.min_extracted_size:
            postbody, temp_text, len_text = baseline(tree_backup_2)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)

        # tree size sanity check
        if options.max_tree_size:
            # first remedy: strip highlighting tags
            if len(postbody) > options.max_tree_size:
                LOGGER.debug('output tree too long: %s', len(postbody))
                strip_tags(postbody, 'hi')
            # still too long, raise an error
            if len(postbody) > options.max_tree_size:
                LOGGER.debug('output tree too long: %s, discarding %s', len(postbody), options.source)
                raise ValueError

        # size checks
        if len_comments < options.min_extracted_comm_size:
            LOGGER.debug('not enough comments: %s', options.source)
        if len_text < options.min_output_size and \
           len_comments < options.min_output_comm_size:
            LOGGER.debug('text and comments not long enough: %s %s %s', len_text, len_comments, options.source)
            raise ValueError

        # check duplicates at body level
        if options.dedup and duplicate_test(postbody, options) is True:
            LOGGER.debug('discarding duplicate document: %s', options.source)
            raise ValueError

        # sanity check on language
        if options.lang:
            is_not_target_lang, document = language_filter(temp_text, temp_comments, options.lang, document)
            if is_not_target_lang is True:
                LOGGER.debug('wrong language: %s', options.source)
                raise ValueError

    except (TypeError, ValueError):
        # options may not have been built yet if loading failed early,
        # fall back to the raw url for the log message instead of crashing
        LOGGER.warning('discarding data: %s', options.source if isinstance(options, Extractor) else url)
        return None

    # special case: python variables
    if options.format == 'python':
        document.text = xmltotxt(postbody, options.formatting)
        if options.comments:
            document.comments = xmltotxt(commentsbody, options.formatting)
            document.commentsbody = commentsbody
        document.raw_text = document.text
    else:
        document.raw_text, document.commentsbody = temp_text, commentsbody
    document.body = postbody

    return document if not as_dict else document.as_dict()
def extract(filecontent, url=None, record_id=None, no_fallback=False,
            favor_precision=False, favor_recall=False,
            include_comments=True, output_format="txt",
            tei_validation=False, target_language=None,
            include_tables=True, include_images=False, include_formatting=False,
            include_links=False, deduplicate=False,
            date_extraction_params=None,
            only_with_metadata=False, with_metadata=False,
            max_tree_size=None, url_blacklist=None, author_blacklist=None,
            settingsfile=None, prune_xpath=None,
            config=DEFAULT_CONFIG, options=None,
            **kwargs):
    """Main function exposed by the package:
    Wrapper for text extraction and conversion to chosen output format.

    Args:
        filecontent: HTML code as string.
        url: URL of the webpage.
        record_id: Add an ID to the metadata.
        no_fallback: Skip the backup extraction with readability-lxml and justext.
        favor_precision: prefer less text but correct extraction.
        favor_recall: when unsure, prefer more text.
        include_comments: Extract comments along with the main text.
        output_format: Define an output format:
            "csv", "json", "markdown", "txt", "xml", and "xmltei".
        tei_validation: Validate the XML-TEI output with respect to the TEI standard.
        target_language: Define a language to discard invalid documents (ISO 639-1 format).
        include_tables: Take into account information within the HTML <table> element.
        include_images: Take images into account (experimental).
        include_formatting: Keep structural elements related to formatting
            (only valuable if output_format is set to XML).
        include_links: Keep links along with their targets (experimental).
        deduplicate: Remove duplicate segments and documents.
        date_extraction_params: Provide extraction parameters to htmldate as dict().
        only_with_metadata: Only keep documents featuring all essential metadata
            (date, title, url).
        with_metadata: Deprecated, use only_with_metadata instead.
        max_tree_size: Discard documents with too many elements.
        url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
        author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
        settingsfile: Use a configuration file to override the standard settings.
        prune_xpath: Provide an XPath expression to prune the tree before extraction.
            can be str or list of str.
        config: Directly provide a configparser configuration.
        options: Directly provide a whole extractor configuration.

    Returns:
        A string in the desired format or None.
    """
    # older, deprecated functions
    if kwargs and any(key in kwargs for key in
                      ('csv_output', 'json_output', 'tei_output', 'xml_output')):
        raise NameError(
            'Deprecated argument: use output_format instead, e.g. output_format="xml"'
        )
    # todo: add with_metadata later

    # markdown switch: must happen BEFORE the options object is built below,
    # otherwise the reassigned local would never reach the extractor (the
    # original placement after the Extractor() call made this a no-op)
    include_formatting = include_formatting or output_format == "markdown"

    # regroup extraction options
    if not options or not isinstance(options, Extractor):
        options = Extractor(
            config=use_config(settingsfile, config), output_format=output_format,
            fast=no_fallback, precision=favor_precision, recall=favor_recall,
            comments=include_comments, formatting=include_formatting, links=include_links,
            images=include_images, tables=include_tables,
            dedup=deduplicate, lang=target_language, max_tree_size=max_tree_size,
            url=url, only_with_metadata=only_with_metadata,
            tei_validation=tei_validation,
            author_blacklist=author_blacklist, url_blacklist=url_blacklist,
            date_params=date_extraction_params
        )

    # extraction
    try:
        document = bare_extraction(
            filecontent, options=options,
            with_metadata=with_metadata,
            as_dict=False, prune_xpath=prune_xpath,
        )
    except RuntimeError:
        LOGGER.error('Processing timeout for %s', url)
        document = None

    # post-processing
    if document is None:
        return None
    if options.format not in ("markdown", "txt"):
        # add record ID to metadata
        document.id = record_id
        # calculate fingerprint
        if document.raw_text is not None:
            document.fingerprint = content_fingerprint(str(document.title) + " " + str(document.raw_text))

    # return
    return determine_returnstring(document, options)
def process_record(*args, **kwargs):
    """Deprecated extraction entry point: aborts with a pointer to extract()."""
    message = "process_record() is deprecated, use extract() instead"
    sys.exit(message)