Skip to content

Commit

Permalink
add focus parameter to extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 23, 2024
1 parent a0f1366 commit 50203bd
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 26 deletions.
2 changes: 1 addition & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ def test_table_processing():
]
assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']

options.recall = True
options.focus = "recall"
processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
result = [
(el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
else:
commentsbody, temp_comments, len_comments = None, '', 0
if options.precision:
if options.focus == "precision":
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
Expand Down
8 changes: 4 additions & 4 deletions trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
'''Decide whether to choose own or external extraction
based on a series of heuristics'''
# bypass for recall
if options.recall and len_text > options.min_extracted_size * 10:
if options.focus == "recall" and len_text > options.min_extracted_size * 10:
return body, text, len_text
algo_flag, jt_result = False, False
# prior cleaning
backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
if options.precision:
if options.focus == "precision":
backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
# try with readability
temppost_algo = try_readability(backup_tree)
Expand All @@ -75,7 +75,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
elif len(body.findall('.//table')) > len(body.findall('.//p')) and len_algo > options.min_extracted_size * 2:
algo_flag = True
# https://github.com/adbar/trafilatura/issues/354
elif options.recall and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
elif options.focus == "recall" and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
algo_flag = True
else:
LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, options.source)
Expand All @@ -88,7 +88,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
LOGGER.debug('using custom extraction: %s', options.source)
# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
# or options.recall is True ?
# or also when options.focus == "recall"?  (avoid writing `== "recall" is True`: Python would chain it as `... == "recall" and "recall" is True`, which is always False)
LOGGER.debug('unclean document triggering justext examination: %s', options.source)
# tree = prune_unwanted_sections(tree, {}, options)
body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def delete_element(element):
def tree_cleaning(tree, options):
"Prune the tree by discarding unwanted elements."
# determine cleaning strategy, use lists to keep it deterministic
favor_recall = options.focus == "recall"
cleaning_list, stripping_list = \
MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
if not options.tables:
Expand All @@ -61,15 +62,15 @@ def tree_cleaning(tree, options):
strip_tags(tree, stripping_list)

# prevent removal of paragraphs
if options.recall:
if favor_recall:
tcopy = deepcopy(tree)
p_test = tree.xpath('.//p[1]')

# delete targeted elements
for expression in cleaning_list:
for element in tree.getiterator(expression):
delete_element(element)
if options.recall and p_test and not tree.xpath('.//p[1]'):
if favor_recall and p_test and not tree.xpath('.//p[1]'):
tree = tcopy

return prune_html(tree)
Expand Down
29 changes: 15 additions & 14 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def handle_table(table_elem, potential_tags, options):
child.tag = "cell"
processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
# todo: lists in table cells
elif child.tag == "list" and options.recall:
elif child.tag == "list" and options.focus == "recall":
processed_subchild = handle_lists(child, options)
if processed_subchild is not None:
new_child_elem.append(processed_subchild)
Expand Down Expand Up @@ -475,7 +475,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
frame and throughout the document to recover potentially missing text parts'''
LOGGER.debug('Recovering wild text elements')
search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
if options.recall is True:
if options.focus == "recall":
potential_tags.update(['div', 'lb'])
search_expr += '|.//div|.//lb|.//list'
# prune
Expand All @@ -493,28 +493,29 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):

def prune_unwanted_sections(tree, potential_tags, options):
'Rule-based deletion of targeted document sections'
favor_precision = options.focus == "precision"
# prune the rest
tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
# decide if images are preserved
if 'graphic' not in potential_tags:
tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
# balance precision/recall
if options.recall is False:
if options.focus != "recall":
tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
if options.precision is True:
if favor_precision:
tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
# remove elements by link density
tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=options.precision)
tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
# also filter fw/head, table and quote elements?
if options.precision is True:
if favor_precision:
# delete trailing titles
while len(tree) > 0 and (tree[-1].tag == 'head'):
tree[-1].getparent().remove(tree[-1])
tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=options.precision
tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=options.precision
tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
return tree


Expand All @@ -537,8 +538,8 @@ def _extract(tree, options):
# prune the subtree
subtree = prune_unwanted_sections(subtree, potential_tags, options)
# second pass?
# subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.precision)
if 'table' in potential_tags or options.precision is True:
# subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
if 'table' in potential_tags or options.focus == "precision":
for elem in subtree.iter('table'):
if link_density_test_tables(elem) is True:
elem.getparent().remove(elem)
Expand All @@ -547,9 +548,9 @@ def _extract(tree, options):
continue
# no paragraphs containing text, or not enough
ptest = subtree.xpath('//p//text()')
if options.recall is True:
if options.focus == "recall":
factor = 5
elif options.precision is True:
elif options.focus == "precision":
factor = 1
else:
factor = 3
Expand Down
13 changes: 9 additions & 4 deletions trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Extractor:
__slots__ = [
'config',
# general
'format', 'fast', 'precision', 'recall', 'comments',
'format', 'fast', 'focus', 'comments',
'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
# extraction size
'min_extracted_size', 'min_output_size',
Expand All @@ -66,8 +66,12 @@ def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
self._add_config(config)
self.format = output_format
self.fast = fast
self.precision = precision
self.recall = recall
if recall:
self.focus = "recall"
elif precision:
self.focus = "precision"
else:
self.focus = "balanced"
self.comments = comments
self.formatting = formatting or output_format == "markdown"
self.links = links
Expand Down Expand Up @@ -102,12 +106,13 @@ def args_to_extractor(args, url=None):
"Derive extractor configuration from CLI args."
options = Extractor(
config=use_config(filename=args.config_file), output_format=args.output_format,
precision=args.precision, recall=args.recall,
comments=args.no_comments, tables=args.no_tables,
dedup=args.deduplicate, lang=args.target_language,
url=url, only_with_metadata=args.only_with_metadata,
tei_validation=args.validate_tei
)
for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
for attr in ("fast", "formatting", "images", "links"):
setattr(options, attr, getattr(args, attr))
return options

Expand Down

0 comments on commit 50203bd

Please sign in to comment.