CLI & downloads: revamp options and make sure they are used (#565)
* CLI & downloads: options as arg for buffered_downloads

* add options to fetch function

* refactoring
adbar committed Apr 23, 2024
1 parent f84e648 commit e151af6
Showing 7 changed files with 138 additions and 130 deletions.
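
The gist of the change, pieced together from the hunks below: extraction options are built once from the parsed CLI arguments via args_to_extractor() (now imported from trafilatura.settings) and handed down to the download layer through a new options argument of buffered_downloads(). A condensed sketch of that pattern follows; the names mirror the diff, the helper name process_queue is made up for the example, and `args` stands for the argparse namespace produced by trafilatura's CLI rather than something constructed here.

```python
from trafilatura.downloads import buffered_downloads, load_download_buffer
from trafilatura.settings import args_to_extractor


def process_queue(url_store, args):
    "Condensed variant of download_queue_processing below, yielding (url, html) pairs."
    # one options object built from the CLI args, reused everywhere downstream
    options = args_to_extractor(args)
    sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
    while not url_store.done:
        bufferlist, url_store = load_download_buffer(url_store, sleep_time)
        # the options object is now passed through to the download layer as well
        for url, result in buffered_downloads(bufferlist, args.parallel, options=options):
            if result is not None:
                yield url, result
```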
7 changes: 4 additions & 3 deletions tests/cli_tests.py
@@ -18,6 +18,7 @@
from trafilatura import cli, cli_utils, settings, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.filters import LANGID_FLAG
from trafilatura.settings import args_to_extractor

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
@@ -193,7 +194,7 @@ def test_sysoutput():
result = 'DADIDA'
cli_utils.write_result(result, args)
# process with backup directory and no counter
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
assert cli_utils.process_result('DADIDA', args, None, options) is None
# test keeping dir structure
testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
@@ -333,7 +334,7 @@ def test_file_processing():
args.input_dir = RESOURCES_DIR
cli_utils.file_processing_pipeline(args)
# test manually
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)

@@ -346,7 +347,7 @@ def test_cli_config_file():
with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
teststring = f.read()
args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
options = cli_utils._args_to_extractor(args)
options = args_to_extractor(args)
assert cli.examine(teststring, args, options=options) is None


21 changes: 11 additions & 10 deletions tests/downloads_tests.py
@@ -24,10 +24,9 @@
from courlan import UrlStore

from trafilatura.cli import parse_args
from trafilatura.cli_utils import (_args_to_extractor,
download_queue_processing,
from trafilatura.cli_utils import (download_queue_processing,
url_processing_pipeline)
from trafilatura.core import extract
from trafilatura.core import Extractor, extract
import trafilatura.downloads
from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, Response,
_determine_headers, _handle_response,
@@ -36,7 +35,7 @@
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
from trafilatura.utils import decode_file, decode_response, load_html

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -48,6 +47,8 @@
RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg'))

DEFAULT_OPTS = Extractor(config=DEFAULT_CONFIG)


def _reset_downloads_global_objects():
"""
@@ -101,8 +102,8 @@ def test_fetch():
if pycurl is not None:
response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
assert response1.headers["x-powered-by"].startswith("httpbun")
assert _handle_response(url, response1, False, DEFAULT_CONFIG).data == _handle_response(url, response, False, DEFAULT_CONFIG).data
assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
assert _handle_response(url, response1, False, DEFAULT_OPTS).data == _handle_response(url, response, False, DEFAULT_OPTS).data
assert _handle_response(url, response1, True, DEFAULT_OPTS) == _handle_response(url, response, True, DEFAULT_OPTS)
# response object
# too large response object
data = ""
@@ -111,14 +112,14 @@
response = Response(data, status, url)
# too large
response.data = b'ABC'*10000000
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
# too small
response.data = b'ABC'
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
# straight handling of response object
with open(os.path.join(RESOURCES_DIR, 'utf8.html'), 'rb') as filehandle:
response.data = filehandle.read()
assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is not None
assert _handle_response(response.url, response, False, DEFAULT_OPTS) is not None
assert load_html(response) is not None
# nothing to see here
assert extract(response, url=response.url, config=ZERO_CONFIG) is None
@@ -198,7 +199,7 @@ def test_queue():
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
options = _args_to_extractor(args)
options = args_to_extractor(args)
options.config['DEFAULT']['SLEEP_TIME'] = '0.2'
results = download_queue_processing(url_store, args, None, options)
assert len(results[0]) == 5 and results[1] is None
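
The switch from DEFAULT_CONFIG to DEFAULT_OPTS in the test hunks above means the internal response handler now receives an Extractor options object built from a config rather than the config itself. A minimal sketch of that usage, mirroring the test code; the URL and payload are made up, and with decode=False the handler is expected to hand back the response object once the size checks taken from the options pass.

```python
from trafilatura.core import Extractor
from trafilatura.downloads import Response, _handle_response
from trafilatura.settings import DEFAULT_CONFIG

DEFAULT_OPTS = Extractor(config=DEFAULT_CONFIG)

# build a response object the same way the tests do
response = Response("", 200, "https://example.org/")
response.data = b"<html><body><p>Hello there, this is a test page.</p></body></html>"

# size limits now come from the options object instead of a raw config
result = _handle_response(response.url, response, False, DEFAULT_OPTS)
print(result is not None)
```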
51 changes: 19 additions & 32 deletions trafilatura/cli_utils.py
@@ -20,17 +20,18 @@
from trafilatura import spider

from .baseline import html2txt
from .core import Extractor, extract
from .core import extract
from .downloads import (add_to_compressed_dict, buffered_downloads,
load_download_buffer)
from .feeds import find_feed_urls
from .filters import LANGID_FLAG, language_classifier
from .hashing import generate_hash_filename
from .meta import reset_caches
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, use_config
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, args_to_extractor
from .sitemaps import sitemap_search
from .utils import URL_BLACKLIST_REGEX, make_chunks


LOGGER = logging.getLogger(__name__)

random.seed(345) # make generated file names reproducible
@@ -217,7 +218,7 @@ def download_queue_processing(url_store, args, counter, options):
while url_store.done is False:
bufferlist, url_store = load_download_buffer(url_store, options.config.getfloat('DEFAULT', 'SLEEP_TIME'))
# process downloads
for url, result in buffered_downloads(bufferlist, args.parallel):
for url, result in buffered_downloads(bufferlist, args.parallel, options=options):
# handle result
if result is not None:
options.url = url
@@ -235,12 +236,12 @@ def cli_discovery(args):
if args.list:
url_store.reset()

config = use_config(filename=args.config_file)
options = args_to_extractor(args)
func = partial(
find_feed_urls if args.feed else sitemap_search,
target_lang=args.target_language,
external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
external=options.config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
sleep_time=options.config.getfloat('DEFAULT', 'SLEEP_TIME')
)

# link discovery and storage
@@ -264,7 +265,7 @@ def cli_discovery(args):
if args.explore:
# add to compressed dict and crawl the remaining websites
control_dict = build_exploration_dict(url_store, input_urls, args)
cli_crawler(args, url_store=control_dict)
cli_crawler(args, url_store=control_dict, options=options)


def build_exploration_dict(url_store, input_urls, args):
@@ -282,11 +283,12 @@ def build_exploration_dict(url_store, input_urls, args):
return control_dict


def cli_crawler(args, n=30, url_store=None):
def cli_crawler(args, n=30, url_store=None, options=None):
'''Start a focused crawler which downloads a fixed number of URLs within a website
and prints the links found in the process'''
config = use_config(filename=args.config_file)
sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
if not options:
options = args_to_extractor(args)
sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
# counter = None
# load input URLs
if url_store is None:
@@ -307,7 +309,7 @@ def cli_crawler(args, n=30, url_store=None):
while spider.URL_STORE.done is False:
bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time)
# start several threads
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False):
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False, options=options):
base_url = get_base_url(url)
# handle result
if result is not None:
@@ -325,39 +327,24 @@
def probe_homepage(args):
"Probe websites for extractable content and print the fitting ones."
input_urls = load_input_urls(args)
config = use_config(filename=args.config_file)
min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
options = args_to_extractor(args)

for url, result in buffered_downloads(input_urls, args.parallel):
for url, result in buffered_downloads(input_urls, args.parallel, options=options):
if result is not None:
result = html2txt(result)
if result and len(result) > min_length and any(c.isalpha() for c in result):
if result and len(result) > options.min_extracted_size and any(c.isalpha() for c in result):
if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
print(url, flush=True)


def _args_to_extractor(args, url=None):
"Derive extractor configuration from CLI args."
options = Extractor(
config=use_config(filename=args.config_file), output_format=args.output_format,
comments=args.no_comments, tables=args.no_tables,
dedup=args.deduplicate, lang=args.target_language,
url=url, only_with_metadata=args.only_with_metadata,
tei_validation=args.validate_tei
)
for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
setattr(options, attr, getattr(args, attr))
return options


def url_processing_pipeline(args, url_store):
'''Aggregated functions to show a list and download and process an input list'''
# print list without further processing
if args.list:
url_store.print_unvisited_urls() # and not write_result()
return False # and not sys.exit(0)

options = _args_to_extractor(args)
options = args_to_extractor(args)

# initialize file counter if necessary
if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY:
@@ -383,7 +370,7 @@ def url_processing_pipeline(args, url_store):
def file_processing_pipeline(args):
'''Define batches for parallel file processing and perform the extraction'''
filecounter = None
options = _args_to_extractor(args)
options = args_to_extractor(args)
timeout = options.config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')

# max_tasks_per_child available in Python >= 3.11
@@ -403,7 +390,7 @@ def examine(htmlstring, args, url=None, options=None):
"""Generic safeguards and triggers"""
result = None
if not options:
options = _args_to_extractor(args, url)
options = args_to_extractor(args, url)
# safety check
if htmlstring is None:
sys.stderr.write('ERROR: empty document\n')
65 changes: 2 additions & 63 deletions trafilatura/core.py
@@ -19,8 +19,8 @@
from .hashing import content_fingerprint
from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning
from .main_extractor import extract_comments, extract_content
from .metadata import Document, extract_metadata, set_date_params
from .settings import DEFAULT_CONFIG, use_config
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, Extractor, use_config
from .utils import load_html, normalize_unicode
from .xml import build_json_output, control_xml_output, xmltotxt, xmltocsv
from .xpaths import REMOVE_COMMENTS_XPATH
@@ -29,67 +29,6 @@
LOGGER = logging.getLogger(__name__)


class Extractor:
"Defines a class to store all extraction options."
__slots__ = [
'config',
# general
'format', 'fast', 'precision', 'recall', 'comments',
'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
# extraction size
'min_extracted_size', 'min_output_size',
'min_output_comm_size', 'min_extracted_comm_size',
# deduplication
'min_duplcheck_size', 'max_repetitions',
# rest
'max_file_size', 'min_file_size', 'max_tree_size',
# meta
'source', 'url', 'only_with_metadata', 'tei_validation',
'date_params',
'author_blacklist', 'url_blacklist'
]
# consider dataclasses for Python 3.7+
def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
fast=False, precision=False, recall=False,
comments=True, formatting=False, links=False, images=False,
tables=True, dedup=False, lang=None, max_tree_size=None,
url=None, source=None, only_with_metadata=False, tei_validation=False,
author_blacklist=None, url_blacklist=None, date_params=None):
self._add_config(config)
self.format = output_format
self.fast = fast
self.precision = precision
self.recall = recall
self.comments = comments
self.formatting = formatting or output_format == "markdown"
self.links = links
self.images = images
self.tables = tables
self.dedup = dedup
self.lang = lang
self.max_tree_size = max_tree_size
self.url = url
self.source = url or source
self.only_with_metadata = only_with_metadata
self.tei_validation = tei_validation
self.author_blacklist = author_blacklist or set()
self.url_blacklist = url_blacklist or set()
self.date_params = date_params or \
set_date_params(self.config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'))

def _add_config(self, config):
"Store options loaded from config file."
self.min_extracted_size = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
self.min_output_size = config.getint('DEFAULT', 'MIN_OUTPUT_SIZE')
self.min_output_comm_size = config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE')
self.min_extracted_comm_size = config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE')
self.min_duplcheck_size = config.getint('DEFAULT', 'MIN_DUPLCHECK_SIZE')
self.max_repetitions = config.getint('DEFAULT', 'MAX_REPETITIONS')
self.max_file_size = config.getint('DEFAULT', 'MAX_FILE_SIZE')
self.min_file_size = config.getint('DEFAULT', 'MIN_FILE_SIZE')
self.config = config # todo: remove?


def determine_returnstring(document, options):
'''Convert XML tree to chosen format, clean the result and output it as a string'''
# XML (TEI) steps
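
The Extractor class removed here is imported back into core from trafilatura.settings (see the import hunk above), so existing from trafilatura.core import Extractor statements keep working. Assuming the relocated class keeps the constructor shown in the removed definition, building the options object directly looks like this sketch:

```python
from trafilatura.settings import DEFAULT_CONFIG, Extractor

options = Extractor(
    config=DEFAULT_CONFIG,   # size thresholds are copied from the config at construction time
    output_format="xml",
    links=True,
    tables=True,
    lang="en",
)
print(options.format, options.formatting)                  # "xml" False
print(options.min_extracted_size, options.min_file_size)   # values read from DEFAULT_CONFIG
```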
