extractor: improve recall preset #571

Merged: 7 commits, Apr 24, 2024
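Taken together, the diff replaces the paired `precision`/`recall` booleans on the `Extractor` options with a single string-valued `focus` attribute (`"precision"`, `"recall"`, or `"balanced"`), consolidates the tree backups in `bare_extraction` around one `copy()` plus on-demand `deepcopy()` calls, adds a paragraph-preservation safeguard to `tree_cleaning` for the recall preset, splits `extract_content` into a thin wrapper around a new `_extract` helper, and extends the precision/recall unit tests accordingly.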
27 changes: 18 additions & 9 deletions tests/unit_tests.py
@@ -818,21 +818,30 @@ def test_precision_recall():
     '''test precision- and recall-oriented settings'''
     # the test cases could be better
     my_document = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
-    assert extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True) is not None
-    assert extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True) is not None
+    assert extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True) is not None
+    assert extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True) is not None

     my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>')
-    assert 'teaser text' in extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True)
-    assert 'teaser text' not in extract(my_document, config=ZERO_CONFIG, fast=True)
-    assert 'teaser text' not in extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' in extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' not in extract(copy(my_document), config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' not in extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)

     my_document = html.fromstring('<html><body><article><div><p><a href="test.html">1.</a><br/><a href="test2.html">2.</a></p></div></article></body></html>')
-    result = extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
     assert '1' not in result
-    result = extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
     assert '1' not in result

     my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><h2>Test</h2></div></body></html>')
-    result = extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
     assert 'content' in result and 'Test' not in result

+    my_document = html.fromstring('<html><body><article><aside><p>Here is the text.</p></aside></article></body></html>')
+    result = extract(copy(my_document), favor_recall=False, config=ZERO_CONFIG, fast=True)
+    assert result != "Here is the text."
+    result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
+    assert result == "Here is the text."


 def test_table_processing():
     options = DEFAULT_OPTIONS

@@ -1052,7 +1061,7 @@ def test_table_processing():
     ]
     assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']

-    options.recall = True
+    options.focus = "recall"
     processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
     result = [
         (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
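A likely motivation for the new `copy()` wrappers, judging from the reworked backups in core.py below: extraction can prune the tree it receives in place, so each assertion now gets its own copy instead of reusing an already-mutated `my_document`. A minimal sketch of the pattern, assuming only lxml and the public `extract` function:

```python
from copy import copy

from lxml import html
from trafilatura import extract

doc = html.fromstring('<html><body><p>This here is the text.</p></body></html>')

# extract() may prune and convert nodes in place, so every call
# receives its own copy and the original document stays reusable
precise = extract(copy(doc), favor_precision=True, fast=True)
broad = extract(copy(doc), favor_recall=True, fast=True)
assert precise is not None and broad is not None
```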
19 changes: 9 additions & 10 deletions trafilatura/core.py
@@ -7,7 +7,7 @@
 import sys
 import warnings

-from copy import deepcopy
+from copy import copy, deepcopy

 from lxml.etree import XPath, strip_tags

@@ -173,13 +173,12 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
                prune_xpath = [prune_xpath]
            tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

-        # backup (or not) for further processing
-        tree_backup_1 = deepcopy(tree) if not options.fast else None
-        tree_backup_2 = deepcopy(tree)
+        # backup for further processing
+        tree_backup = copy(tree)

-        # clean + use LXML cleaner
+        # clean
         cleaned_tree = tree_cleaning(tree, options)
-        cleaned_tree_backup = deepcopy(cleaned_tree)
+        cleaned_tree_backup = copy(cleaned_tree)

         # convert tags, the rest does not work without conversion
         cleaned_tree = convert_tags(cleaned_tree, options, options.url or document.url)
@@ -189,19 +188,19 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
            commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
         else:
            commentsbody, temp_comments, len_comments = None, '', 0
-        if options.precision:
+        if options.focus == "precision":
            cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

         # extract content
         postbody, temp_text, len_text = extract_content(cleaned_tree, options)

         # compare if necessary
         if not options.fast:
-            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, postbody, temp_text, len_text, options)
+            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
         # add baseline as additional fallback
         # rescue: try to use original/dirty tree # and favor_precision is False=?
         if len_text < options.min_extracted_size:
-            postbody, temp_text, len_text = baseline(tree_backup_2)
+            postbody, temp_text, len_text = baseline(deepcopy(tree_backup))
         LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)

         # tree size sanity check
@@ -215,7 +214,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
            LOGGER.debug('output tree too long: %s, discarding %s', len(postbody), options.source)
            raise ValueError
         # size checks
-        if len_comments < options.min_extracted_comm_size:
+        if options.comments and len_comments < options.min_extracted_comm_size:
            LOGGER.debug('not enough comments: %s', options.source)
         if len_text < options.min_output_size and \
            len_comments < options.min_output_comm_size:
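One backup now replaces the former `tree_backup_1`/`tree_backup_2` pair: in lxml even a shallow `copy()` of an element duplicates its whole subtree, so the function keeps a single pristine copy and hands a fresh `deepcopy()` to each consumer that mutates its input (`compare_extraction`, `baseline`). A small sketch of the idea, with `mutating_consumer` as a hypothetical stand-in for those calls:

```python
from copy import copy, deepcopy

from lxml import html

tree = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
tree_backup = copy(tree)  # in lxml this already duplicates the subtree

def mutating_consumer(t):
    # hypothetical stand-in for compare_extraction()/baseline(),
    # which prune and rewrite the tree they receive
    for p in t.findall('.//p'):
        p.getparent().remove(p)
    return t

# each consumer gets its own deep copy; the backup itself stays intact
mutating_consumer(deepcopy(tree_backup))
assert tree_backup.find('.//p') is not None
```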
7 changes: 3 additions & 4 deletions trafilatura/external.py
@@ -46,12 +46,12 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     '''Decide whether to choose own or external extraction
     based on a series of heuristics'''
     # bypass for recall
-    if options.recall and len_text > options.min_extracted_size * 10:
+    if options.focus == "recall" and len_text > options.min_extracted_size * 10:
         return body, text, len_text
     algo_flag, jt_result = False, False
     # prior cleaning
     backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
-    if options.precision:
+    if options.focus == "precision":
         backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
     # try with readability
     temppost_algo = try_readability(backup_tree)
@@ -75,7 +75,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     elif len(body.findall('.//table')) > len(body.findall('.//p')) and len_algo > options.min_extracted_size * 2:
         algo_flag = True
     # https://github.com/adbar/trafilatura/issues/354
-    elif options.recall and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
+    elif options.focus == "recall" and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
         algo_flag = True
     else:
         LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, options.source)
@@ -88,7 +88,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
         LOGGER.debug('using custom extraction: %s', options.source)
     # override faulty extraction: try with justext
     if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
-        # or options.recall is True ?
         LOGGER.debug('unclean document triggering justext examination: %s', options.source)
         # tree = prune_unwanted_sections(tree, {}, options)
         body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
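The bypass at the top of `compare_extraction` now keys off the same string value. Condensed as a hedged helper (name and signature hypothetical; the threshold is taken from the diff):

```python
def trusts_first_extraction(focus: str, len_text: int, min_extracted_size: int) -> bool:
    # under the recall preset, a result more than ten times the configured
    # minimum size is accepted as-is and the readability/justext
    # comparison passes are skipped entirely
    return focus == "recall" and len_text > min_extracted_size * 10
```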
9 changes: 9 additions & 0 deletions trafilatura/htmlprocessing.py
@@ -43,6 +43,7 @@ def delete_element(element):
 def tree_cleaning(tree, options):
     "Prune the tree by discarding unwanted elements."
     # determine cleaning strategy, use lists to keep it deterministic
+    favor_recall = options.focus == "recall"
     cleaning_list, stripping_list = \
         MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
     if not options.tables:
@@ -60,10 +61,18 @@ def tree_cleaning(tree, options):
     # strip targeted elements
     strip_tags(tree, stripping_list)

+    # prevent removal of paragraphs
+    run_p_test = False
+    if options.focus == "recall" and tree.find('.//p') is not None:
+        tcopy = deepcopy(tree)
+        run_p_test = True
+
     # delete targeted elements
     for expression in cleaning_list:
         for element in tree.getiterator(expression):
             delete_element(element)
+    if run_p_test and tree.find('.//p') is None:
+        tree = tcopy

     return prune_html(tree)
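The safeguard works by snapshot and rollback: before the destructive pass, a recall-oriented run deep-copies any tree that still contains a `<p>`, and restores that snapshot if cleaning removed every paragraph. A self-contained sketch of the same logic, assuming a simplified `cleaning_list` and plain lxml removal instead of `delete_element`:

```python
from copy import deepcopy

from lxml import html

def clean_with_paragraph_rollback(tree, cleaning_list, favor_recall=True):
    # snapshot the tree before pruning if paragraphs are present ...
    snapshot = None
    if favor_recall and tree.find('.//p') is not None:
        snapshot = deepcopy(tree)
    for tag in cleaning_list:
        for element in tree.findall(f'.//{tag}'):
            element.getparent().remove(element)
    # ... and roll back if the pruning deleted every <p> element
    if snapshot is not None and tree.find('.//p') is None:
        tree = snapshot
    return tree

doc = html.fromstring('<html><body><div><p>kept text</p></div></body></html>')
cleaned = clean_with_paragraph_rollback(doc, ['div'])
assert cleaned.find('.//p') is not None  # the snapshot was restored
```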
52 changes: 30 additions & 22 deletions trafilatura/main_extractor.py
@@ -379,7 +379,7 @@ def handle_table(table_elem, potential_tags, options):
                child.tag = "cell"
                processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
            # todo: lists in table cells
-           elif child.tag == "list" and options.recall:
+           elif child.tag == "list" and options.focus == "recall":
                processed_subchild = handle_lists(child, options)
                if processed_subchild is not None:
                    new_child_elem.append(processed_subchild)
@@ -475,7 +475,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
    frame and throughout the document to recover potentially missing text parts'''
    LOGGER.debug('Recovering wild text elements')
    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
-   if options.recall is True:
+   if options.focus == "recall":
        potential_tags.update(['div', 'lb'])
        search_expr += '|.//div|.//lb|.//list'
    # prune
@@ -493,46 +493,42 @@

 def prune_unwanted_sections(tree, potential_tags, options):
     'Rule-based deletion of targeted document sections'
+    favor_precision = options.focus == "precision"
     # prune the rest
     tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
     tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
     # decide if images are preserved
     if 'graphic' not in potential_tags:
         tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
     # balance precision/recall
-    if options.recall is False:
+    if options.focus != "recall":
         tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
-        if options.precision is True:
+        if favor_precision:
             tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
     # remove elements by link density
-    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=options.precision)
-    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=options.precision)
-    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=options.precision)
+    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
+    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
+    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
     # also filter fw/head, table and quote elements?
-    if options.precision is True:
+    if favor_precision:
         # delete trailing titles
         while len(tree) > 0 and (tree[-1].tag == 'head'):
             tree[-1].getparent().remove(tree[-1])
-        tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=options.precision
-        tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=options.precision
+        tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
+        tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
     return tree


-def extract_content(tree, options):
-    '''Find the main content of a page using a set of XPath expressions,
-    then extract relevant elements, strip them of unwanted subparts and
-    convert them'''
-    # backup
-    backup_tree = deepcopy(tree)
+def _extract(tree, options):
     # init
+    result_body = Element('body')
     potential_tags = set(TAG_CATALOG)
     if options.tables is True:
         potential_tags.update(['table', 'td', 'th', 'tr'])
     if options.images is True:
         potential_tags.add('graphic')
     if options.links is True:
         potential_tags.add('ref')
-    result_body = Element('body')
     # iterate
     for expr in BODY_XPATH:
         # select tree if the expression has been found
@@ -542,8 +538,8 @@
             # prune the subtree
             subtree = prune_unwanted_sections(subtree, potential_tags, options)
             # second pass?
-            # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.precision)
-            if 'table' in potential_tags or options.precision is True:
+            # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
+            if 'table' in potential_tags or options.focus == "precision":
                 for elem in subtree.iter('table'):
                     if link_density_test_tables(elem) is True:
                         elem.getparent().remove(elem)
@@ -552,9 +548,7 @@
                 continue
             # no paragraphs containing text, or not enough
             ptest = subtree.xpath('//p//text()')
-            if options.recall is True:
-                factor = 5
-            elif options.precision is True:
+            if options.focus == "precision":
                 factor = 1
             else:
                 factor = 3
@@ -581,6 +575,20 @@
             LOGGER.debug(expr)
             break
     temp_text = ' '.join(result_body.itertext()).strip()
+    return result_body, temp_text, potential_tags
+
+
+def extract_content(cleaned_tree, options):
+    '''Find the main content of a page using a set of XPath expressions,
+    then extract relevant elements, strip them of unwanted subparts and
+    convert them'''
+    # backup
+    backup_tree = deepcopy(cleaned_tree)
+
+    result_body, temp_text, potential_tags = _extract(cleaned_tree, options)
+    #if len(result_body) == 0:
+    #    result_body, temp_text, potential_tags = _extract(tree_backup, options)
+
     # try parsing wild <p> elements if nothing found or text too short
     # todo: test precision and recall settings here
     if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:
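Two things change here besides the `focus` migration: the paragraph-text threshold drops its recall-specific factor of 5 (precision keeps factor 1, every other preset now gets 3), and the body of `extract_content` moves into `_extract` so the wrapper only manages the backup and the wild-text rescue. A hedged restatement of the threshold rule (helper name hypothetical; the surrounding comparison against `ptest` is not shown in the diff):

```python
def paragraph_length_threshold(focus: str, min_extracted_size: int) -> int:
    # precision stays strict with factor 1; the old recall-only factor
    # of 5 is gone, so recall now shares the default factor of 3
    factor = 1 if focus == "precision" else 3
    return min_extracted_size * factor
```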
13 changes: 9 additions & 4 deletions trafilatura/settings.py
@@ -42,7 +42,7 @@ class Extractor:
     __slots__ = [
         'config',
         # general
-        'format', 'fast', 'precision', 'recall', 'comments',
+        'format', 'fast', 'focus', 'comments',
         'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
         # extraction size
         'min_extracted_size', 'min_output_size',
@@ -66,8 +66,12 @@ def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
         self._add_config(config)
         self.format = output_format
         self.fast = fast
-        self.precision = precision
-        self.recall = recall
+        if recall:
+            self.focus = "recall"
+        elif precision:
+            self.focus = "precision"
+        else:
+            self.focus = "balanced"
         self.comments = comments
         self.formatting = formatting or output_format == "markdown"
         self.links = links
@@ -102,12 +106,13 @@ def args_to_extractor(args, url=None):
     "Derive extractor configuration from CLI args."
     options = Extractor(
         config=use_config(filename=args.config_file), output_format=args.output_format,
+        precision=args.precision, recall=args.recall,
         comments=args.no_comments, tables=args.no_tables,
         dedup=args.deduplicate, lang=args.target_language,
         url=url, only_with_metadata=args.only_with_metadata,
         tei_validation=args.validate_tei
     )
-    for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
+    for attr in ("fast", "formatting", "images", "links"):
         setattr(options, attr, getattr(args, attr))
     return options
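The constructor keeps the two boolean keywords for backward compatibility and folds them into `focus`; because `recall` is checked first, it wins when both flags are set. A quick usage sketch, assuming the keyword-only defaults shown above:

```python
from trafilatura.settings import Extractor

assert Extractor().focus == "balanced"
assert Extractor(precision=True).focus == "precision"
assert Extractor(recall=True).focus == "recall"
# per the if/elif order in __init__, recall takes precedence
assert Extractor(precision=True, recall=True).focus == "recall"
```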