Skip to content

Commit

Permalink
restore accuracy
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed Apr 23, 2024
1 parent 50203bd commit f22b0e2
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
13 changes: 7 additions & 6 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
import warnings

from copy import deepcopy
from copy import copy, deepcopy

from lxml.etree import XPath, strip_tags

Expand Down Expand Up @@ -173,11 +173,12 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
prune_xpath = [prune_xpath]
tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

# backup (or not) for further processing
tree_backup = deepcopy(tree)
# backup for further processing
tree_backup = copy(tree)

# clean + use LXML cleaner
# clean
cleaned_tree = tree_cleaning(tree, options)
cleaned_tree_backup = copy(cleaned_tree)

# convert tags, the rest does not work without conversion
cleaned_tree = convert_tags(cleaned_tree, options, options.url or document.url)
Expand All @@ -191,11 +192,11 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
postbody, temp_text, len_text = extract_content(deepcopy(tree_backup), deepcopy(cleaned_tree), options)
postbody, temp_text, len_text = extract_content(cleaned_tree, options)

# compare if necessary
if not options.fast:
postbody, temp_text, len_text = compare_extraction(deepcopy(cleaned_tree), deepcopy(tree_backup), postbody, temp_text, len_text, options)
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
# add baseline as additional fallback
# rescue: try to use original/dirty tree # and favor_precision is False=?
if len_text < options.min_extracted_size:
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ def _extract(tree, options):
return result_body, temp_text, potential_tags


def extract_content(tree_backup, cleaned_tree, options):
def extract_content(cleaned_tree, options):
'''Find the main content of a page using a set of XPath expressions,
then extract relevant elements, strip them of unwanted subparts and
convert them'''
Expand Down

0 comments on commit f22b0e2

Please sign in to comment.