CLI & downloads: revamp options and make sure they are used (#565)
* CLI & downloads: options as arg for buffered_downloads

* add options to fetch function

* refactoring
adbar committed Apr 23, 2024
1 parent f84e648 commit e151af6
Showing 7 changed files with 138 additions and 130 deletions.
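
The gist of the change, pieced together from the hunks below: extraction options are built once from the parsed CLI arguments via args_to_extractor() (now imported from trafilatura.settings) and handed down to the download layer through a new options argument of buffered_downloads(). A condensed sketch of that pattern follows; the names mirror the diff, the helper name process_queue is made up for the example, and `args` stands for the argparse namespace produced by trafilatura's CLI rather than something constructed here.

```python
from trafilatura.downloads import buffered_downloads, load_download_buffer
from trafilatura.settings import args_to_extractor


def process_queue(url_store, args):
    "Condensed variant of download_queue_processing below, yielding (url, html) pairs."
    # one options object built from the CLI args, reused everywhere downstream
    options = args_to_extractor(args)
    sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
    while not url_store.done:
        bufferlist, url_store = load_download_buffer(url_store, sleep_time)
        # the options object is now passed through to the download layer as well
        for url, result in buffered_downloads(bufferlist, args.parallel, options=options):
            if result is not None:
                yield url, result
```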
7 changes: 4 additions & 3 deletions tests/cli_tests.py
@@ -18,6 +18,7 @@
from trafilatura import cli, cli_utils, settings, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG
from trafilatura.settings import args_to_extractor

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
@@ -193,7 +194,7 @@ def test_sysoutput():
result = 'DADIDA'
cli_utils.write_result(result, args)
# process with backup directory and no counter
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
assert cli_utils.process_result('DADIDA', args, None, options) is None
# test keeping dir structure
testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
@@ -333,7 +334,7 @@ def test_file_processing():
args.input_dir = RESOURCES_DIR
cli_utils.file_processing_pipeline(args)
# test manually
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)

@@ -346,7 +347,7 @@ def test_cli_config_file():
with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
teststring = f.read()
args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
assert cli.examine(teststring, args, options=options) is None


21 changes: 11 additions & 10 deletions tests/downloads_tests.py
@@ -24,10 +24,9 @@
from courlan import UrlStore

from trafilatura.cli import parse_args
from trafilatura.cli_utils import (_args_to_extractor,
download_queue_processing,
from trafilatura.cli_utils import (download_queue_processing,
url_processing_pipeline)
from trafilatura.core import extract
from trafilatura.core import Extractor, extract
import trafilatura.downloads
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, Response,
_determine_headers, _handle_response,
@@ -36,7 +35,7 @@
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
from trafilatura.utils import decode_file, decode_response, load_html

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -48,6 +47,8 @@
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg'))

DEFAULT_OPTS = Extractor(config=DEFAULT_CONFIG)


def _reset_downloads_global_objects():
"""
@@ -101,8 +102,8 @@ def test_fetch():
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
assert response1.headers["x-powered-by"].startswith("httpbun")
assert _handle_response(url, response1, False, DEFAULT_CONFIG).data == _handle_response(url, response, False, DEFAULT_CONFIG).data
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_OPTS).data == _handle_response(url, response, False, DEFAULT_OPTS).data
assert _handle_response(url, response1, True, DEFAULT_OPTS) == _handle_response(url, response, True, DEFAULT_OPTS)
# response object
# too large response object
data = ""
@@ -111,14 +112,14 @@
response = Response(data, status, url)
# too large
response.data = b'ABC'*10000000
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
# too small
response.data = b'ABC'
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
# straight handling of response object
with open(os.path.join(RESOURCES_DIR, 'utf8.html'), 'rb') as filehandle:
response.data = filehandle.read()
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is not None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is not None
assert load_html(response) is not None
# nothing to see here
assert extract(response, url=response.url, config=ZERO_CONFIG) is None
@@ -198,7 +199,7 @@ def test_queue():
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
options = _args_to_extractor(args)
options = args_to_extractor(args)
options.config['DEFAULT']['SLEEP_TIME'] = '0.2'
results = download_queue_processing(url_store, args, None, options)
assert len(results[0]) == 5 and results[1] is None
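
The switch from DEFAULT_CONFIG to DEFAULT_OPTS in the test hunks above means the internal response handler now receives an Extractor options object built from a config rather than the config itself. A minimal sketch of that usage, mirroring the test code; the URL and payload are made up, and with decode=False the handler is expected to hand back the response object once the size checks taken from the options pass.

```python
from trafilatura.core import Extractor
from trafilatura.downloads import Response, _handle_response
from trafilatura.settings import DEFAULT_CONFIG

DEFAULT_OPTS = Extractor(config=DEFAULT_CONFIG)

# build a response object the same way the tests do
response = Response("", 200, "https://example.org/")
response.data = b"<html><body><p>Hello there, this is a test page.</p></body></html>"

# size limits now come from the options object instead of a raw config
result = _handle_response(response.url, response, False, DEFAULT_OPTS)
print(result is not None)
```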
51 changes: 19 additions & 32 deletions trafilatura/cli_utils.py
@@ -20,17 +20,18 @@
from trafilatura import spider

from .baseline import html2txt
from .core import Extractor, extract
from .core import extract
from .downloads import (add_to_compressed_dict, buffered_downloads,
load_download_buffer)
from .feeds import find_feed_urls
from .filters import LANGID_FLAG, language_classifier
from .hashing import generate_hash_filename
from .meta import reset_caches
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, use_config
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, args_to_extractor
from .sitemaps import sitemap_search
from .utils import URL_BLACKLIST_REGEX, make_chunks


LOGGER = logging.getLogger(__name__)

random.seed(345) # make generated file names reproducible
@@ -217,7 +218,7 @@ def download_queue_processing(url_store, args, counter, options):
while url_store.done is False:
bufferlist, url_store = load_download_buffer(url_store, options.config.getfloat('DEFAULT', 'SLEEP_TIME'))
# process downloads
for url, result in buffered_downloads(bufferlist, args.parallel):
for url, result in buffered_downloads(bufferlist, args.parallel, options=options):
# handle result
if result is not None:
options.url = url
@@ -235,12 +236,12 @@ def cli_discovery(args):
if args.list:
url_store.reset()

config = use_config(filename=args.config_file)
options = args_to_extractor(args)
func = partial(
find_feed_urls if args.feed else sitemap_search,
target_lang=args.target_language,
external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
external=options.config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
sleep_time=options.config.getfloat('DEFAULT', 'SLEEP_TIME')
)

# link discovery and storage
@@ -264,7 +265,7 @@ def cli_discovery(args):
if args.explore:
# add to compressed dict and crawl the remaining websites
control_dict = build_exploration_dict(url_store, input_urls, args)
cli_crawler(args, url_store=control_dict)
cli_crawler(args, url_store=control_dict, options=options)


def build_exploration_dict(url_store, input_urls, args):
@@ -282,11 +283,12 @@ def build_exploration_dict(url_store, input_urls, args):
return control_dict


def cli_crawler(args, n=30, url_store=None):
def cli_crawler(args, n=30, url_store=None, options=None):
'''Start a focused crawler which downloads a fixed number of URLs within a website
and prints the links found in the process'''
config = use_config(filename=args.config_file)
sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
if not options:
options = args_to_extractor(args)
sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
# counter = None
# load input URLs
if url_store is None:
@@ -307,7 +309,7 @@ def cli_crawler(args, n=30, url_store=None):
while spider.URL_STORE.done is False:
bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time)
# start several threads
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False):
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False, options=options):
base_url = get_base_url(url)
# handle result
if result is not None:
@@ -325,39 +327,24 @@
def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
config = use_config(filename=args.config_file)
min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
options = args_to_extractor(args)

for url, result in buffered_downloads(input_urls, args.parallel):
for url, result in buffered_downloads(input_urls, args.parallel, options=options):
if result is not None:
result = html2txt(result)
if result and len(result) > min_length and any(c.isalpha() for c in result):
if result and len(result) > options.min_extracted_size and any(c.isalpha() for c in result):
if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
print(url, flush=True)


def _args_to_extractor(args, url=None):
"Derive extractor configuration from CLI args."
options = Extractor(
config=use_config(filename=args.config_file), output_format=args.output_format,
comments=args.no_comments, tables=args.no_tables,
dedup=args.deduplicate, lang=args.target_language,
url=url, only_with_metadata=args.only_with_metadata,
tei_validation=args.validate_tei
)
for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
setattr(options, attr, getattr(args, attr))
return options


def url_processing_pipeline(args, url_store):
'''Aggregated functions to show a list and download and process an input list'''
# print list without further processing
if args.list:
url_store.print_unvisited_urls() # and not write_result()
return False # and not sys.exit(0)

options = _args_to_extractor(args)
options = args_to_extractor(args)

# initialize file counter if necessary
if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY:
@@ -383,7 +370,7 @@ def url_processing_pipeline(args, url_store):
def file_processing_pipeline(args):
'''Define batches for parallel file processing and perform the extraction'''
filecounter = None
options = _args_to_extractor(args)
options = args_to_extractor(args)
timeout = options.config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')

# max_tasks_per_child available in Python >= 3.11
@@ -403,7 +390,7 @@ def examine(htmlstring, args, url=None, options=None):
"""Generic safeguards and triggers"""
result = None
if not options:
options = _args_to_extractor(args, url)
options = args_to_extractor(args, url)
# safety check
if htmlstring is None:
sys.stderr.write('ERROR: empty document\n')
65 changes: 2 additions & 63 deletions trafilatura/core.py
@@ -19,8 +19,8 @@
from .hashing import content_fingerprint
from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning
from .main_extractor import extract_comments, extract_content
from .metadata import Document, extract_metadata, set_date_params
from .settings import DEFAULT_CONFIG, use_config
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, Extractor, use_config
from .utils import load_html, normalize_unicode
from .xml import build_json_output, control_xml_output, xmltotxt, xmltocsv
from .xpaths import REMOVE_COMMENTS_XPATH
@@ -29,67 +29,6 @@
LOGGER = logging.getLogger(__name__)


class Extractor:
"Defines a class to store all extraction options."
__slots__ = [
'config',
# general
'format', 'fast', 'precision', 'recall', 'comments',
'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
# extraction size
'min_extracted_size', 'min_output_size',
'min_output_comm_size', 'min_extracted_comm_size',
# deduplication
'min_duplcheck_size', 'max_repetitions',
# rest
'max_file_size', 'min_file_size', 'max_tree_size',
# meta
'source', 'url', 'only_with_metadata', 'tei_validation',
'date_params',
'author_blacklist', 'url_blacklist'
]
# consider dataclasses for Python 3.7+
def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
fast=False, precision=False, recall=False,
comments=True, formatting=False, links=False, images=False,
tables=True, dedup=False, lang=None, max_tree_size=None,
url=None, source=None, only_with_metadata=False, tei_validation=False,
author_blacklist=None, url_blacklist=None, date_params=None):
self._add_config(config)
self.format = output_format
self.fast = fast
self.precision = precision
self.recall = recall
self.comments = comments
self.formatting = formatting or output_format == "markdown"
self.links = links
self.images = images
self.tables = tables
self.dedup = dedup
self.lang = lang
self.max_tree_size = max_tree_size
self.url = url
self.source = url or source
self.only_with_metadata = only_with_metadata
self.tei_validation = tei_validation
self.author_blacklist = author_blacklist or set()
self.url_blacklist = url_blacklist or set()
self.date_params = date_params or \
set_date_params(self.config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'))

def _add_config(self, config):
"Store options loaded from config file."
self.min_extracted_size = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
self.min_output_size = config.getint('DEFAULT', 'MIN_OUTPUT_SIZE')
self.min_output_comm_size = config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE')
self.min_extracted_comm_size = config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE')
self.min_duplcheck_size = config.getint('DEFAULT', 'MIN_DUPLCHECK_SIZE')
self.max_repetitions = config.getint('DEFAULT', 'MAX_REPETITIONS')
self.max_file_size = config.getint('DEFAULT', 'MAX_FILE_SIZE')
self.min_file_size = config.getint('DEFAULT', 'MIN_FILE_SIZE')
self.config = config # todo: remove?


def determine_returnstring(document, options):
'''Convert XML tree to chosen format, clean the result and output it as a string'''
# XML (TEI) steps
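
The Extractor class removed here is imported back into core from trafilatura.settings (see the import hunk above), so existing from trafilatura.core import Extractor statements keep working. Assuming the relocated class keeps the constructor shown in the removed definition, building the options object directly looks like this sketch:

```python
from trafilatura.settings import DEFAULT_CONFIG, Extractor

options = Extractor(
    config=DEFAULT_CONFIG,   # size thresholds are copied from the config at construction time
    output_format="xml",
    links=True,
    tables=True,
    lang="en",
)
print(options.format, options.formatting)                  # "xml" False
print(options.min_extracted_size, options.min_file_size)   # values read from DEFAULT_CONFIG
```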
