Skip to content

Commit

Permalink
add focus parameter to extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 23, 2024
1 parent a0f1366 commit 50203bd
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 26 deletions.
2 changes: 1 addition & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ def test_table_processing():
]
assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']

options.recall = True
options.focus = "recall"
processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
result = [
(el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
else:
commentsbody, temp_comments, len_comments = None, '', 0
if options.precision:
if options.focus == "precision":
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
Expand Down
8 changes: 4 additions & 4 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
'''Decide whether to choose own or external extraction
based on a series of heuristics'''
# bypass for recall
if options.recall and len_text > options.min_extracted_size * 10:
if options.focus == "recall" and len_text > options.min_extracted_size * 10:
return body, text, len_text
algo_flag, jt_result = False, False
# prior cleaning
backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
if options.precision:
if options.focus == "precision":
backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
# try with readability
temppost_algo = try_readability(backup_tree)
Expand All @@ -75,7 +75,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
elif len(body.findall('.//table')) > len(body.findall('.//p')) and len_algo > options.min_extracted_size * 2:
algo_flag = True
# https://github.com/adbar/trafilatura/issues/354
elif options.recall and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
elif options.focus == "recall" and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
algo_flag = True
else:
LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, options.source)
Expand All @@ -88,7 +88,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
LOGGER.debug('using custom extraction: %s', options.source)
# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
# or options.recall is True ?
# or also when options.focus == "recall"?  (avoid writing `== "recall" is True`: Python would chain it as `... == "recall" and "recall" is True`, which is always False)
LOGGER.debug('unclean document triggering justext examination: %s', options.source)
# tree = prune_unwanted_sections(tree, {}, options)
body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def delete_element(element):
def tree_cleaning(tree, options):
"Prune the tree by discarding unwanted elements."
# determine cleaning strategy, use lists to keep it deterministic
favor_recall = options.focus == "recall"
cleaning_list, stripping_list = \
MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
if not options.tables:
Expand All @@ -61,15 +62,15 @@ def tree_cleaning(tree, options):
strip_tags(tree, stripping_list)

# prevent removal of paragraphs
if options.recall:
if favor_recall:
tcopy = deepcopy(tree)
p_test = tree.xpath('.//p[1]')

# delete targeted elements
for expression in cleaning_list:
for element in tree.getiterator(expression):
delete_element(element)
if options.recall and p_test and not tree.xpath('.//p[1]'):
if favor_recall and p_test and not tree.xpath('.//p[1]'):
tree = tcopy

return prune_html(tree)
Expand Down
29 changes: 15 additions & 14 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def handle_table(table_elem, potential_tags, options):
child.tag = "cell"
processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
# todo: lists in table cells
elif child.tag == "list" and options.recall:
elif child.tag == "list" and options.focus == "recall":
processed_subchild = handle_lists(child, options)
if processed_subchild is not None:
new_child_elem.append(processed_subchild)
Expand Down Expand Up @@ -475,7 +475,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
frame and throughout the document to recover potentially missing text parts'''
LOGGER.debug('Recovering wild text elements')
search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
if options.recall is True:
if options.focus == "recall":
potential_tags.update(['div', 'lb'])
search_expr += '|.//div|.//lb|.//list'
# prune
Expand All @@ -493,28 +493,29 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):

def prune_unwanted_sections(tree, potential_tags, options):
'Rule-based deletion of targeted document sections'
favor_precision = options.focus == "precision"
# prune the rest
tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
# decide if images are preserved
if 'graphic' not in potential_tags:
tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
# balance precision/recall
if options.recall is False:
if options.focus != "recall":
tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
if options.precision is True:
if favor_precision:
tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
# remove elements by link density
tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
# also filter fw/head, table and quote elements?
if options.precision is True:
if favor_precision:
# delete trailing titles
while len(tree) > 0 and (tree[-1].tag == 'head'):
tree[-1].getparent().remove(tree[-1])
tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=options.precision
tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=options.precision
tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
return tree


Expand All @@ -537,8 +538,8 @@ def _extract(tree, options):
# prune the subtree
subtree = prune_unwanted_sections(subtree, potential_tags, options)
# second pass?
# subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.precision)
if 'table' in potential_tags or options.precision is True:
# subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
if 'table' in potential_tags or options.focus == "precision":
for elem in subtree.iter('table'):
if link_density_test_tables(elem) is True:
elem.getparent().remove(elem)
Expand All @@ -547,9 +548,9 @@ def _extract(tree, options):
continue
# no paragraphs containing text, or not enough
ptest = subtree.xpath('//p//text()')
if options.recall is True:
if options.focus == "recall":
factor = 5
elif options.precision is True:
elif options.focus == "precision":
factor = 1
else:
factor = 3
Expand Down
13 changes: 9 additions & 4 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Extractor:
__slots__ = [
'config',
# general
'format', 'fast', 'precision', 'recall', 'comments',
'format', 'fast', 'focus', 'comments',
'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
# extraction size
'min_extracted_size', 'min_output_size',
Expand All @@ -66,8 +66,12 @@ def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
self._add_config(config)
self.format = output_format
self.fast = fast
self.precision = precision
self.recall = recall
if recall:
self.focus = "recall"
elif precision:
self.focus = "precision"
else:
self.focus = "balanced"
self.comments = comments
self.formatting = formatting or output_format == "markdown"
self.links = links
Expand Down Expand Up @@ -102,12 +106,13 @@ def args_to_extractor(args, url=None):
"Derive extractor configuration from CLI args."
options = Extractor(
config=use_config(filename=args.config_file), output_format=args.output_format,
precision=args.precision, recall=args.recall,
comments=args.no_comments, tables=args.no_tables,
dedup=args.deduplicate, lang=args.target_language,
url=url, only_with_metadata=args.only_with_metadata,
tei_validation=args.validate_tei
)
for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
for attr in ("fast", "formatting", "images", "links"):
setattr(options, attr, getattr(args, attr))
return options

Expand Down

0 comments on commit 50203bd

Please sign in to comment.