extractor: improve recall preset #571

Merged: 7 commits, Apr 24, 2024
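Taken together, the diff replaces the paired `precision`/`recall` booleans on the `Extractor` options with a single string-valued `focus` attribute (`"precision"`, `"recall"`, or `"balanced"`), consolidates the tree backups in `bare_extraction` around one `copy()` plus on-demand `deepcopy()` calls, adds a paragraph-preservation safeguard to `tree_cleaning` for the recall preset, splits `extract_content` into a thin wrapper around a new `_extract` helper, and extends the precision/recall unit tests accordingly.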
27 changes: 18 additions & 9 deletions tests/unit_tests.py
@@ -818,21 +818,30 @@ def test_precision_recall():
     '''test precision- and recall-oriented settings'''
     # the test cases could be better
     my_document = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
-    assert extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True) is not None
-    assert extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True) is not None
+    assert extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True) is not None
+    assert extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True) is not None

     my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><div><p>This here is the text.</p></div></body></html>')
-    assert 'teaser text' in extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True)
-    assert 'teaser text' not in extract(my_document, config=ZERO_CONFIG, fast=True)
-    assert 'teaser text' not in extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' in extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' not in extract(copy(my_document), config=ZERO_CONFIG, fast=True)
+    assert 'teaser text' not in extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)

     my_document = html.fromstring('<html><body><article><div><p><a href="test.html">1.</a><br/><a href="test2.html">2.</a></p></div></article></body></html>')
-    result = extract(my_document, favor_recall=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
     assert '1' not in result
-    result = extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
     assert '1' not in result

     my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><h2>Test</h2></div></body></html>')
-    result = extract(my_document, favor_precision=True, config=ZERO_CONFIG, fast=True)
+    result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, fast=True)
     assert 'content' in result and 'Test' not in result

+    my_document = html.fromstring('<html><body><article><aside><p>Here is the text.</p></aside></article></body></html>')
+    result = extract(copy(my_document), favor_recall=False, config=ZERO_CONFIG, fast=True)
+    assert result != "Here is the text."
+    result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, fast=True)
+    assert result == "Here is the text."


 def test_table_processing():
     options = DEFAULT_OPTIONS

@@ -1052,7 +1061,7 @@ def test_table_processing():
     ]
     assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']

-    options.recall = True
+    options.focus = "recall"
     processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
     result = [
         (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
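A likely motivation for the new `copy()` wrappers, judging from the reworked backups in core.py below: extraction can prune the tree it receives in place, so each assertion now gets its own copy instead of reusing an already-mutated `my_document`. A minimal sketch of the pattern, assuming only lxml and the public `extract` function:

```python
from copy import copy

from lxml import html
from trafilatura import extract

doc = html.fromstring('<html><body><p>This here is the text.</p></body></html>')

# extract() may prune and convert nodes in place, so every call
# receives its own copy and the original document stays reusable
precise = extract(copy(doc), favor_precision=True, fast=True)
broad = extract(copy(doc), favor_recall=True, fast=True)
assert precise is not None and broad is not None
```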
19 changes: 9 additions & 10 deletions trafilatura/core.py
@@ -7,7 +7,7 @@
 import sys
 import warnings

-from copy import deepcopy
+from copy import copy, deepcopy

 from lxml.etree import XPath, strip_tags

@@ -173,13 +173,12 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
                prune_xpath = [prune_xpath]
            tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

-        # backup (or not) for further processing
-        tree_backup_1 = deepcopy(tree) if not options.fast else None
-        tree_backup_2 = deepcopy(tree)
+        # backup for further processing
+        tree_backup = copy(tree)

-        # clean + use LXML cleaner
+        # clean
         cleaned_tree = tree_cleaning(tree, options)
-        cleaned_tree_backup = deepcopy(cleaned_tree)
+        cleaned_tree_backup = copy(cleaned_tree)

         # convert tags, the rest does not work without conversion
         cleaned_tree = convert_tags(cleaned_tree, options, options.url or document.url)
@@ -189,19 +188,19 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
            commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree, options)
         else:
            commentsbody, temp_comments, len_comments = None, '', 0
-        if options.precision:
+        if options.focus == "precision":
            cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

         # extract content
         postbody, temp_text, len_text = extract_content(cleaned_tree, options)

         # compare if necessary
         if not options.fast:
-            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, tree_backup_1, postbody, temp_text, len_text, options)
+            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
         # add baseline as additional fallback
         # rescue: try to use original/dirty tree # and favor_precision is False=?
         if len_text < options.min_extracted_size:
-            postbody, temp_text, len_text = baseline(tree_backup_2)
+            postbody, temp_text, len_text = baseline(deepcopy(tree_backup))
         LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)

         # tree size sanity check
@@ -215,7 +214,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
            LOGGER.debug('output tree too long: %s, discarding %s', len(postbody), options.source)
            raise ValueError
         # size checks
-        if len_comments < options.min_extracted_comm_size:
+        if options.comments and len_comments < options.min_extracted_comm_size:
            LOGGER.debug('not enough comments: %s', options.source)
         if len_text < options.min_output_size and \
            len_comments < options.min_output_comm_size:
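One backup now replaces the former `tree_backup_1`/`tree_backup_2` pair: in lxml even a shallow `copy()` of an element duplicates its whole subtree, so the function keeps a single pristine copy and hands a fresh `deepcopy()` to each consumer that mutates its input (`compare_extraction`, `baseline`). A small sketch of the idea, with `mutating_consumer` as a hypothetical stand-in for those calls:

```python
from copy import copy, deepcopy

from lxml import html

tree = html.fromstring('<html><body><p>This here is the text.</p></body></html>')
tree_backup = copy(tree)  # in lxml this already duplicates the subtree

def mutating_consumer(t):
    # hypothetical stand-in for compare_extraction()/baseline(),
    # which prune and rewrite the tree they receive
    for p in t.findall('.//p'):
        p.getparent().remove(p)
    return t

# each consumer gets its own deep copy; the backup itself stays intact
mutating_consumer(deepcopy(tree_backup))
assert tree_backup.find('.//p') is not None
```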
7 changes: 3 additions & 4 deletions trafilatura/external.py
@@ -46,12 +46,12 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     '''Decide whether to choose own or external extraction
     based on a series of heuristics'''
     # bypass for recall
-    if options.recall and len_text > options.min_extracted_size * 10:
+    if options.focus == "recall" and len_text > options.min_extracted_size * 10:
         return body, text, len_text
     algo_flag, jt_result = False, False
     # prior cleaning
     backup_tree = prune_unwanted_nodes(backup_tree, PAYWALL_DISCARD_XPATH)
-    if options.precision:
+    if options.focus == "precision":
         backup_tree = prune_unwanted_nodes(backup_tree, OVERALL_DISCARD_XPATH)
     # try with readability
     temppost_algo = try_readability(backup_tree)
@@ -75,7 +75,7 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
     elif len(body.findall('.//table')) > len(body.findall('.//p')) and len_algo > options.min_extracted_size * 2:
         algo_flag = True
     # https://github.com/adbar/trafilatura/issues/354
-    elif options.recall and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
+    elif options.focus == "recall" and not body.xpath('.//head') and temppost_algo.xpath('.//h2|.//h3|.//h4') and len_algo > len_text:
         algo_flag = True
     else:
         LOGGER.debug('extraction values: %s %s for %s', len_text, len_algo, options.source)
@@ -88,7 +88,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
         LOGGER.debug('using custom extraction: %s', options.source)
     # override faulty extraction: try with justext
     if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
-        # or options.recall is True ?
         LOGGER.debug('unclean document triggering justext examination: %s', options.source)
         # tree = prune_unwanted_sections(tree, {}, options)
         body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
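The bypass at the top of `compare_extraction` now keys off the same string value. Condensed as a hedged helper (name and signature hypothetical; the threshold is taken from the diff):

```python
def trusts_first_extraction(focus: str, len_text: int, min_extracted_size: int) -> bool:
    # under the recall preset, a result more than ten times the configured
    # minimum size is accepted as-is and the readability/justext
    # comparison passes are skipped entirely
    return focus == "recall" and len_text > min_extracted_size * 10
```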
9 changes: 9 additions & 0 deletions trafilatura/htmlprocessing.py
@@ -43,6 +43,7 @@ def delete_element(element):
 def tree_cleaning(tree, options):
     "Prune the tree by discarding unwanted elements."
     # determine cleaning strategy, use lists to keep it deterministic
+    favor_recall = options.focus == "recall"
     cleaning_list, stripping_list = \
         MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
     if not options.tables:
@@ -60,10 +61,18 @@ def tree_cleaning(tree, options):
     # strip targeted elements
     strip_tags(tree, stripping_list)

+    # prevent removal of paragraphs
+    run_p_test = False
+    if options.focus == "recall" and tree.find('.//p') is not None:
+        tcopy = deepcopy(tree)
+        run_p_test = True
+
     # delete targeted elements
     for expression in cleaning_list:
         for element in tree.getiterator(expression):
             delete_element(element)
+    if run_p_test and tree.find('.//p') is None:
+        tree = tcopy

     return prune_html(tree)
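The safeguard works by snapshot and rollback: before the destructive pass, a recall-oriented run deep-copies any tree that still contains a `<p>`, and restores that snapshot if cleaning removed every paragraph. A self-contained sketch of the same logic, assuming a simplified `cleaning_list` and plain lxml removal instead of `delete_element`:

```python
from copy import deepcopy

from lxml import html

def clean_with_paragraph_rollback(tree, cleaning_list, favor_recall=True):
    # snapshot the tree before pruning if paragraphs are present ...
    snapshot = None
    if favor_recall and tree.find('.//p') is not None:
        snapshot = deepcopy(tree)
    for tag in cleaning_list:
        for element in tree.findall(f'.//{tag}'):
            element.getparent().remove(element)
    # ... and roll back if the pruning deleted every <p> element
    if snapshot is not None and tree.find('.//p') is None:
        tree = snapshot
    return tree

doc = html.fromstring('<html><body><div><p>kept text</p></div></body></html>')
cleaned = clean_with_paragraph_rollback(doc, ['div'])
assert cleaned.find('.//p') is not None  # the snapshot was restored
```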
52 changes: 30 additions & 22 deletions trafilatura/main_extractor.py
@@ -379,7 +379,7 @@ def handle_table(table_elem, potential_tags, options):
                child.tag = "cell"
                processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
            # todo: lists in table cells
-           elif child.tag == "list" and options.recall:
+           elif child.tag == "list" and options.focus == "recall":
                processed_subchild = handle_lists(child, options)
                if processed_subchild is not None:
                    new_child_elem.append(processed_subchild)
@@ -475,7 +475,7 @@ def recover_wild_text(tree, result_body, options, potential_tags=TAG_CATALOG):
    frame and throughout the document to recover potentially missing text parts'''
    LOGGER.debug('Recovering wild text elements')
    search_expr = './/blockquote|.//code|.//p|.//pre|.//q|.//quote|.//table|.//div[contains(@class, \'w3-code\')]'
-   if options.recall is True:
+   if options.focus == "recall":
        potential_tags.update(['div', 'lb'])
        search_expr += '|.//div|.//lb|.//list'
    # prune
@@ -493,46 +493,42 @@

 def prune_unwanted_sections(tree, potential_tags, options):
     'Rule-based deletion of targeted document sections'
+    favor_precision = options.focus == "precision"
     # prune the rest
     tree = prune_unwanted_nodes(tree, OVERALL_DISCARD_XPATH, with_backup=True)
     tree = prune_unwanted_nodes(tree, PAYWALL_DISCARD_XPATH)
     # decide if images are preserved
     if 'graphic' not in potential_tags:
         tree = prune_unwanted_nodes(tree, DISCARD_IMAGE_ELEMENTS)
     # balance precision/recall
-    if options.recall is False:
+    if options.focus != "recall":
         tree = prune_unwanted_nodes(tree, TEASER_DISCARD_XPATH)
-        if options.precision is True:
+        if favor_precision:
             tree = prune_unwanted_nodes(tree, PRECISION_DISCARD_XPATH)
     # remove elements by link density
-    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=options.precision)
-    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=options.precision)
-    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=options.precision)
+    tree = delete_by_link_density(tree, 'div', backtracking=True, favor_precision=favor_precision)
+    tree = delete_by_link_density(tree, 'list', backtracking=False, favor_precision=favor_precision)
+    tree = delete_by_link_density(tree, 'p', backtracking=False, favor_precision=favor_precision)
     # also filter fw/head, table and quote elements?
-    if options.precision is True:
+    if favor_precision:
         # delete trailing titles
         while len(tree) > 0 and (tree[-1].tag == 'head'):
             tree[-1].getparent().remove(tree[-1])
-        tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=options.precision
-        tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=options.precision
+        tree = delete_by_link_density(tree, 'head', backtracking=False) # favor_precision=favor_precision
+        tree = delete_by_link_density(tree, 'quote', backtracking=False) # favor_precision=favor_precision
     return tree


-def extract_content(tree, options):
-    '''Find the main content of a page using a set of XPath expressions,
-    then extract relevant elements, strip them of unwanted subparts and
-    convert them'''
-    # backup
-    backup_tree = deepcopy(tree)
+def _extract(tree, options):
     # init
+    result_body = Element('body')
     potential_tags = set(TAG_CATALOG)
     if options.tables is True:
         potential_tags.update(['table', 'td', 'th', 'tr'])
     if options.images is True:
         potential_tags.add('graphic')
     if options.links is True:
         potential_tags.add('ref')
-    result_body = Element('body')
     # iterate
     for expr in BODY_XPATH:
         # select tree if the expression has been found
@@ -542,8 +538,8 @@
             # prune the subtree
             subtree = prune_unwanted_sections(subtree, potential_tags, options)
             # second pass?
-            # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.precision)
-            if 'table' in potential_tags or options.precision is True:
+            # subtree = delete_by_link_density(subtree, 'list', backtracking=False, favor_precision=options.focus == "precision")
+            if 'table' in potential_tags or options.focus == "precision":
                 for elem in subtree.iter('table'):
                     if link_density_test_tables(elem) is True:
                         elem.getparent().remove(elem)
@@ -552,9 +548,7 @@
                 continue
             # no paragraphs containing text, or not enough
             ptest = subtree.xpath('//p//text()')
-            if options.recall is True:
-                factor = 5
-            elif options.precision is True:
+            if options.focus == "precision":
                 factor = 1
             else:
                 factor = 3
@@ -581,6 +575,20 @@
             LOGGER.debug(expr)
             break
     temp_text = ' '.join(result_body.itertext()).strip()
+    return result_body, temp_text, potential_tags
+
+
+def extract_content(cleaned_tree, options):
+    '''Find the main content of a page using a set of XPath expressions,
+    then extract relevant elements, strip them of unwanted subparts and
+    convert them'''
+    # backup
+    backup_tree = deepcopy(cleaned_tree)
+
+    result_body, temp_text, potential_tags = _extract(cleaned_tree, options)
+    #if len(result_body) == 0:
+    #    result_body, temp_text, potential_tags = _extract(tree_backup, options)
+
     # try parsing wild <p> elements if nothing found or text too short
     # todo: test precision and recall settings here
     if len(result_body) == 0 or len(temp_text) < options.min_extracted_size:
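Two things change here besides the `focus` migration: the paragraph-text threshold drops its recall-specific factor of 5 (precision keeps factor 1, every other preset now gets 3), and the body of `extract_content` moves into `_extract` so the wrapper only manages the backup and the wild-text rescue. A hedged restatement of the threshold rule (helper name hypothetical; the surrounding comparison against `ptest` is not shown in the diff):

```python
def paragraph_length_threshold(focus: str, min_extracted_size: int) -> int:
    # precision stays strict with factor 1; the old recall-only factor
    # of 5 is gone, so recall now shares the default factor of 3
    factor = 1 if focus == "precision" else 3
    return min_extracted_size * factor
```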
13 changes: 9 additions & 4 deletions trafilatura/settings.py
@@ -42,7 +42,7 @@ class Extractor:
     __slots__ = [
         'config',
         # general
-        'format', 'fast', 'precision', 'recall', 'comments',
+        'format', 'fast', 'focus', 'comments',
         'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
         # extraction size
         'min_extracted_size', 'min_output_size',
@@ -66,8 +66,12 @@ def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
         self._add_config(config)
         self.format = output_format
         self.fast = fast
-        self.precision = precision
-        self.recall = recall
+        if recall:
+            self.focus = "recall"
+        elif precision:
+            self.focus = "precision"
+        else:
+            self.focus = "balanced"
         self.comments = comments
         self.formatting = formatting or output_format == "markdown"
         self.links = links
@@ -102,12 +106,13 @@ def args_to_extractor(args, url=None):
     "Derive extractor configuration from CLI args."
     options = Extractor(
         config=use_config(filename=args.config_file), output_format=args.output_format,
+        precision=args.precision, recall=args.recall,
         comments=args.no_comments, tables=args.no_tables,
         dedup=args.deduplicate, lang=args.target_language,
         url=url, only_with_metadata=args.only_with_metadata,
         tei_validation=args.validate_tei
     )
-    for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
+    for attr in ("fast", "formatting", "images", "links"):
         setattr(options, attr, getattr(args, attr))
     return options
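The constructor keeps the two boolean keywords for backward compatibility and folds them into `focus`; because `recall` is checked first, it wins when both flags are set. A quick usage sketch, assuming the keyword-only defaults shown above:

```python
from trafilatura.settings import Extractor

assert Extractor().focus == "balanced"
assert Extractor(precision=True).focus == "precision"
assert Extractor(recall=True).focus == "recall"
# per the if/elif order in __init__, recall takes precedence
assert Extractor(precision=True, recall=True).focus == "recall"
```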