Skip to content

Commit

Permalink
better recall strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 24, 2024
1 parent 0e73f78 commit ad4d5cd
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 9 deletions.
1 change: 0 additions & 1 deletion trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def compare_extraction(tree, backup_tree, body, text, len_text, options):
LOGGER.debug('using custom extraction: %s', options.source)
# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < options.min_extracted_size: # body.find(...)
# or options.focus == "recall" is True ?
LOGGER.debug('unclean document triggering justext examination: %s', options.source)
# tree = prune_unwanted_sections(tree, {}, options)
body2, text2, len_text2, jt_result = justext_rescue(tree, options, body, 0, '')
Expand Down
9 changes: 4 additions & 5 deletions trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,16 @@ def tree_cleaning(tree, options):

# prevent removal of paragraphs
run_p_test = False
- if options.focus == "recall":
+ if options.focus == "recall" and tree.find('.//p') is not None:
  tcopy = deepcopy(tree)
- if tree.find('.//p') is not None:
- run_p_test = True
+ run_p_test = True

# delete targeted elements
for expression in cleaning_list:
for element in tree.getiterator(expression):
delete_element(element)
- if run_p_test and tree.find('.//p') is None:
- tree = tcopy
+ if run_p_test and tree.find('.//p') is None:
+ tree = tcopy

return prune_html(tree)

Expand Down
4 changes: 1 addition & 3 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,9 +548,7 @@ def _extract(tree, options):
continue
# no paragraphs containing text, or not enough
ptest = subtree.xpath('//p//text()')
- if options.focus == "recall":
- factor = 5
- elif options.focus == "precision":
+ if options.focus == "precision":
factor = 1
else:
factor = 3
Expand Down

0 comments on commit ad4d5cd

Please sign in to comment.