diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 7632850b..9d27e1f9 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -18,6 +18,7 @@ from trafilatura import cli, cli_utils, settings, spider
 from trafilatura.downloads import add_to_compressed_dict, fetch_url
 from trafilatura.filters import LANGID_FLAG
+from trafilatura.settings import args_to_extractor
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
 RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
@@ -193,7 +194,7 @@ def test_sysoutput():
     result = 'DADIDA'
     cli_utils.write_result(result, args)
     # process with backup directory and no counter
-    options = cli_utils._args_to_extractor(args)
+    options = args_to_extractor(args)
     assert cli_utils.process_result('DADIDA', args, None, options) is None
     # test keeping dir structure
     testargs = ['', '-i', 'myinputdir/', '-o', 'test/', '--keep-dirs']
@@ -333,7 +334,7 @@ def test_file_processing():
     args.input_dir = RESOURCES_DIR
     cli_utils.file_processing_pipeline(args)
     # test manually
-    options = cli_utils._args_to_extractor(args)
+    options = args_to_extractor(args)
     for f in cli_utils.generate_filelist(args.input_dir):
         cli_utils.file_processing(f, args, options=options)
 
@@ -346,7 +347,7 @@ def test_cli_config_file():
     with open(os.path.join(RESOURCES_DIR, 'httpbin_sample.html'), 'r', encoding="utf-8") as f:
         teststring = f.read()
     args.config_file = os.path.join(RESOURCES_DIR, args.config_file)
-    options = cli_utils._args_to_extractor(args)
+    options = args_to_extractor(args)
     assert cli.examine(teststring, args, options=options) is None
 
 
diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py
index 03652a2e..9d1c8066 100644
--- a/tests/downloads_tests.py
+++ b/tests/downloads_tests.py
@@ -24,10 +24,9 @@
 from courlan import UrlStore
 
 from trafilatura.cli import parse_args
-from trafilatura.cli_utils import (_args_to_extractor,
-                                   download_queue_processing,
+from trafilatura.cli_utils import (download_queue_processing,
                                    url_processing_pipeline)
-from trafilatura.core import extract
+from trafilatura.core import Extractor, extract
 import trafilatura.downloads
 from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, Response,
                                    _determine_headers, _handle_response,
@@ -36,7 +35,7 @@
                                    _urllib3_is_live_page, add_to_compressed_dict,
                                    fetch_url, is_live_page, load_download_buffer)
-from trafilatura.settings import DEFAULT_CONFIG, use_config
+from trafilatura.settings import DEFAULT_CONFIG, args_to_extractor, use_config
 from trafilatura.utils import decode_file, decode_response, load_html
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -48,6 +47,8 @@
 RESOURCES_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'resources')
 UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg'))
 
+DEFAULT_OPTS = Extractor(config=DEFAULT_CONFIG)
+
 
 def _reset_downloads_global_objects():
     """
@@ -101,8 +102,8 @@ def test_fetch():
     if pycurl is not None:
         response1 = _send_pycurl_request('https://httpbun.com/status/200', True, True, DEFAULT_CONFIG)
         assert response1.headers["x-powered-by"].startswith("httpbun")
-        assert _handle_response(url, response1, False, DEFAULT_CONFIG).data == _handle_response(url, response, False, DEFAULT_CONFIG).data
-        assert _handle_response(url, response1, True, DEFAULT_CONFIG) == _handle_response(url, response, True, DEFAULT_CONFIG)
+        assert _handle_response(url, response1, False, DEFAULT_OPTS).data == _handle_response(url, response, False, DEFAULT_OPTS).data
+        assert _handle_response(url, response1, True, DEFAULT_OPTS) == _handle_response(url, response, True, DEFAULT_OPTS)
     # response object
     # too large response object
     data = ""
@@ -111,14 +112,14 @@ def test_fetch():
     response = Response(data, status, url)
     # too large
     response.data = b'ABC'*10000000
-    assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
+    assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
     # too small
     response.data = b'ABC'
-    assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is None
+    assert _handle_response(response.url, response, False, DEFAULT_OPTS) is None
     # straight handling of response object
     with open(os.path.join(RESOURCES_DIR, 'utf8.html'), 'rb') as filehandle:
         response.data = filehandle.read()
-    assert _handle_response(response.url, response, False, DEFAULT_CONFIG) is not None
+    assert _handle_response(response.url, response, False, DEFAULT_OPTS) is not None
     assert load_html(response) is not None
     # nothing to see here
     assert extract(response, url=response.url, config=ZERO_CONFIG) is None
@@ -198,7 +199,7 @@ def test_queue():
     url_store = add_to_compressed_dict(inputurls)
     args.archived = True
     args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
-    options = _args_to_extractor(args)
+    options = args_to_extractor(args)
     options.config['DEFAULT']['SLEEP_TIME'] = '0.2'
     results = download_queue_processing(url_store, args, None, options)
     assert len(results[0]) == 5 and results[1] is None
diff --git a/trafilatura/cli_utils.py b/trafilatura/cli_utils.py
index cea883da..5d8f8d4a 100644
--- a/trafilatura/cli_utils.py
+++ b/trafilatura/cli_utils.py
@@ -20,17 +20,18 @@
 from trafilatura import spider
 
 from .baseline import html2txt
-from .core import Extractor, extract
+from .core import extract
 from .downloads import (add_to_compressed_dict, buffered_downloads,
                         load_download_buffer)
 from .feeds import find_feed_urls
 from .filters import LANGID_FLAG, language_classifier
 from .hashing import generate_hash_filename
 from .meta import reset_caches
-from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, use_config
+from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, args_to_extractor
 from .sitemaps import sitemap_search
 from .utils import URL_BLACKLIST_REGEX, make_chunks
 
+
 LOGGER = logging.getLogger(__name__)
 
 random.seed(345)  # make generated file names reproducible
@@ -217,7 +218,7 @@ def download_queue_processing(url_store, args, counter, options):
     while url_store.done is False:
         bufferlist, url_store = load_download_buffer(url_store, options.config.getfloat('DEFAULT', 'SLEEP_TIME'))
         # process downloads
-        for url, result in buffered_downloads(bufferlist, args.parallel):
+        for url, result in buffered_downloads(bufferlist, args.parallel, options=options):
             # handle result
             if result is not None:
                 options.url = url
@@ -235,12 +236,12 @@ def cli_discovery(args):
     if args.list:
         url_store.reset()
 
-    config = use_config(filename=args.config_file)
+    options = args_to_extractor(args)
     func = partial(
         find_feed_urls if args.feed else sitemap_search,
         target_lang=args.target_language,
-        external=config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
-        sleep_time=config.getfloat('DEFAULT', 'SLEEP_TIME')
+        external=options.config.getboolean('DEFAULT', 'EXTERNAL_URLS'),
+        sleep_time=options.config.getfloat('DEFAULT', 'SLEEP_TIME')
     )
 
     # link discovery and storage
@@ -264,7 +265,7 @@ def cli_discovery(args):
     if args.explore:
         # add to compressed dict and crawl the remaining websites
         control_dict = build_exploration_dict(url_store, input_urls, args)
-        cli_crawler(args, url_store=control_dict)
+        cli_crawler(args, url_store=control_dict, options=options)
 
 
 def build_exploration_dict(url_store, input_urls, args):
@@ -282,11 +283,12 @@ def build_exploration_dict(url_store, input_urls, args):
     return control_dict
 
 
-def cli_crawler(args, n=30, url_store=None):
+def cli_crawler(args, n=30, url_store=None, options=None):
     '''Start a focused crawler which downloads a fixed number of URLs within a website
        and prints the links found in the process'''
-    config = use_config(filename=args.config_file)
-    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
+    if not options:
+        options = args_to_extractor(args)
+    sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
     # counter = None
     # load input URLs
     if url_store is None:
@@ -307,7 +309,7 @@ def cli_crawler(args, n=30, url_store=None):
     while spider.URL_STORE.done is False:
         bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time)
         # start several threads
-        for url, result in buffered_downloads(bufferlist, args.parallel, decode=False):
+        for url, result in buffered_downloads(bufferlist, args.parallel, decode=False, options=options):
             base_url = get_base_url(url)
             # handle result
             if result is not None:
@@ -325,31 +327,16 @@
 def probe_homepage(args):
     "Probe websites for extractable content and print the fitting ones."
     input_urls = load_input_urls(args)
-    config = use_config(filename=args.config_file)
-    min_length = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
+    options = args_to_extractor(args)
 
-    for url, result in buffered_downloads(input_urls, args.parallel):
+    for url, result in buffered_downloads(input_urls, args.parallel, options=options):
         if result is not None:
             result = html2txt(result)
-            if result and len(result) > min_length and any(c.isalpha() for c in result):
+            if result and len(result) > options.min_extracted_size and any(c.isalpha() for c in result):
                 if not LANGID_FLAG or not args.target_language or language_classifier(result, "") == args.target_language:
                     print(url, flush=True)
 
 
-def _args_to_extractor(args, url=None):
-    "Derive extractor configuration from CLI args."
-    options = Extractor(
-        config=use_config(filename=args.config_file), output_format=args.output_format,
-        comments=args.no_comments, tables=args.no_tables,
-        dedup=args.deduplicate, lang=args.target_language,
-        url=url, only_with_metadata=args.only_with_metadata,
-        tei_validation=args.validate_tei
-    )
-    for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
-        setattr(options, attr, getattr(args, attr))
-    return options
-
-
 def url_processing_pipeline(args, url_store):
     '''Aggregated functions to show a list and download and process an input list'''
     # print list without further processing
@@ -357,7 +344,7 @@
         url_store.print_unvisited_urls()  # and not write_result()
         return False  # and not sys.exit(0)
 
-    options = _args_to_extractor(args)
+    options = args_to_extractor(args)
 
     # initialize file counter if necessary
     if url_store.total_url_number() > MAX_FILES_PER_DIRECTORY:
@@ -383,7 +370,7 @@
 def file_processing_pipeline(args):
     '''Define batches for parallel file processing and perform the extraction'''
     filecounter = None
-    options = _args_to_extractor(args)
+    options = args_to_extractor(args)
     timeout = options.config.getint('DEFAULT', 'EXTRACTION_TIMEOUT')
 
     # max_tasks_per_child available in Python >= 3.11
@@ -403,7 +390,7 @@ def examine(htmlstring, args, url=None, options=None):
     """Generic safeguards and triggers"""
     result = None
     if not options:
-        options = _args_to_extractor(args, url)
+        options = args_to_extractor(args, url)
     # safety check
     if htmlstring is None:
         sys.stderr.write('ERROR: empty document\n')
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 18b02c22..b0d5874a 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -19,8 +19,8 @@
 from .hashing import content_fingerprint
 from .htmlprocessing import convert_tags, prune_unwanted_nodes, tree_cleaning
 from .main_extractor import extract_comments, extract_content
-from .metadata import Document, extract_metadata, set_date_params
-from .settings import DEFAULT_CONFIG, use_config
+from .metadata import Document, extract_metadata
+from .settings import DEFAULT_CONFIG, Extractor, use_config
 from .utils import load_html, normalize_unicode
 from .xml import build_json_output, control_xml_output, xmltotxt, xmltocsv
 from .xpaths import REMOVE_COMMENTS_XPATH
@@ -29,67 +29,6 @@
 LOGGER = logging.getLogger(__name__)
 
 
-class Extractor:
-    "Defines a class to store all extraction options."
-    __slots__ = [
-        'config',
-        # general
-        'format', 'fast', 'precision', 'recall', 'comments',
-        'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
-        # extraction size
-        'min_extracted_size', 'min_output_size',
-        'min_output_comm_size', 'min_extracted_comm_size',
-        # deduplication
-        'min_duplcheck_size', 'max_repetitions',
-        # rest
-        'max_file_size', 'min_file_size', 'max_tree_size',
-        # meta
-        'source', 'url', 'only_with_metadata', 'tei_validation',
-        'date_params',
-        'author_blacklist', 'url_blacklist'
-    ]
-    # consider dataclasses for Python 3.7+
-    def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
-                 fast=False, precision=False, recall=False,
-                 comments=True, formatting=False, links=False, images=False,
-                 tables=True, dedup=False, lang=None, max_tree_size=None,
-                 url=None, source=None, only_with_metadata=False, tei_validation=False,
-                 author_blacklist=None, url_blacklist=None, date_params=None):
-        self._add_config(config)
-        self.format = output_format
-        self.fast = fast
-        self.precision = precision
-        self.recall = recall
-        self.comments = comments
-        self.formatting = formatting or output_format == "markdown"
-        self.links = links
-        self.images = images
-        self.tables = tables
-        self.dedup = dedup
-        self.lang = lang
-        self.max_tree_size = max_tree_size
-        self.url = url
-        self.source = url or source
-        self.only_with_metadata = only_with_metadata
-        self.tei_validation = tei_validation
-        self.author_blacklist = author_blacklist or set()
-        self.url_blacklist = url_blacklist or set()
-        self.date_params = date_params or \
-            set_date_params(self.config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'))
-
-    def _add_config(self, config):
-        "Store options loaded from config file."
-        self.min_extracted_size = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
-        self.min_output_size = config.getint('DEFAULT', 'MIN_OUTPUT_SIZE')
-        self.min_output_comm_size = config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE')
-        self.min_extracted_comm_size = config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE')
-        self.min_duplcheck_size = config.getint('DEFAULT', 'MIN_DUPLCHECK_SIZE')
-        self.max_repetitions = config.getint('DEFAULT', 'MAX_REPETITIONS')
-        self.max_file_size = config.getint('DEFAULT', 'MAX_FILE_SIZE')
-        self.min_file_size = config.getint('DEFAULT', 'MIN_FILE_SIZE')
-        self.config = config  # todo: remove?
-
-
 def determine_returnstring(document, options):
     '''Convert XML tree to chosen format, clean the result and output it as a string'''
     # XML (TEI) steps
diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py
index 281ed7e0..83abbd44 100644
--- a/trafilatura/downloads.py
+++ b/trafilatura/downloads.py
@@ -8,6 +8,7 @@
 import warnings
 
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from functools import partial
 from io import BytesIO
 from time import sleep
 
@@ -34,8 +35,7 @@
 except ImportError:
     from importlib_metadata import version
 
-
-from .settings import DEFAULT_CONFIG
+from .settings import DEFAULT_CONFIG, Extractor
 from .utils import URL_BLACKLIST_REGEX, decode_file, make_chunks
 
 
@@ -164,15 +164,15 @@
     return None
 
 
-def _handle_response(url, response, decode, config):
+def _handle_response(url, response, decode, options):
     'Internal function to run safety checks on response result.'
     lentest = len(response.html or response.data or "")
     if response.status != 200:
         LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
-    elif lentest < config.getint('DEFAULT', 'MIN_FILE_SIZE'):
+    elif lentest < options.min_file_size:
         LOGGER.error('too small/incorrect for URL %s', url)
         # raise error instead?
-    elif lentest > config.getint('DEFAULT', 'MAX_FILE_SIZE'):
+    elif lentest > options.max_file_size:
         LOGGER.error('too large: length %s for URL %s', lentest, url)
         # raise error instead?
     else:
@@ -181,13 +181,14 @@
     return None
 
 
-def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
+def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG, options=None):
     """Downloads a web page and seamlessly decodes the response.
 
     Args:
         url: URL of the page to fetch.
         no_ssl: Don't try to establish a secure connection (to prevent SSLError).
         config: Pass configuration values for output control.
+        options: Extraction options (supersedes config).
 
     Returns:
         Unicode string or None in case of failed downloads and invalid results.
@@ -201,7 +202,9 @@
         )
     response = fetch_response(url, decode=decode, no_ssl=no_ssl, config=config)
     if response is not None and response != '':
-        return _handle_response(url, response, decode, config)
+        if not options:
+            options = Extractor(config=config)
+        return _handle_response(url, response, decode, options)
     # return '' (useful do discard further processing?)
     # return response
     return None
@@ -307,11 +310,12 @@
     return bufferlist, url_store
 
 
-def buffered_downloads(bufferlist, download_threads, decode=True):
+def buffered_downloads(bufferlist, download_threads, decode=True, options=None):
     '''Download queue consumer, single- or multi-threaded.'''
+    worker = partial(fetch_url, decode=decode, options=options)
     with ThreadPoolExecutor(max_workers=download_threads) as executor:
         for chunk in make_chunks(bufferlist, 10000):
-            future_to_url = {executor.submit(fetch_url, url, decode): url for url in chunk}
+            future_to_url = {executor.submit(worker, url): url for url in chunk}
             for future in as_completed(future_to_url):
                 # url and download result
                 yield future_to_url[future], future.result()
diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
index 11fb947e..eb34c0fd 100644
--- a/trafilatura/metadata.py
+++ b/trafilatura/metadata.py
@@ -7,7 +7,6 @@
 import re
 
 from copy import deepcopy
-from datetime import datetime
 
 from courlan import extract_domain, get_base_url, is_valid_url, normalize_url, validate_url
 from htmldate import find_date
@@ -16,10 +15,11 @@
 from .htmlprocessing import prune_unwanted_nodes
 from .json_metadata import (extract_json, extract_json_parse_error,
                             normalize_json)
-from .xpaths import (AUTHOR_DISCARD_XPATHS, AUTHOR_XPATHS,
-                     CATEGORIES_XPATHS, TAGS_XPATHS, TITLE_XPATHS)
+from .settings import set_date_params
 from .utils import (line_processing, load_html, normalize_authors,
                     normalize_tags, trim, unescape)
+from .xpaths import (AUTHOR_DISCARD_XPATHS, AUTHOR_XPATHS,
+                     CATEGORIES_XPATHS, TAGS_XPATHS, TITLE_XPATHS)
 
 LOGGER = logging.getLogger(__name__)
 logging.getLogger('htmldate').setLevel(logging.WARNING)
@@ -128,15 +128,6 @@ def as_dict(self):
 EXTRA_META = {'charset', 'http-equiv', 'property'}
 
 
-def set_date_params(extensive=True):
-    "Provide default parameters for date extraction."
-    return {
-        "original_date": True,
-        "extensive_search": extensive,
-        "max_date": datetime.now().strftime("%Y-%m-%d")
-    }
-
-
 def check_authors(authors, author_blacklist):
     "Check if the authors string correspond to expected values."
     author_blacklist = {a.lower() for a in author_blacklist}
diff --git a/trafilatura/settings.py b/trafilatura/settings.py
index 963896c9..a92640d2 100644
--- a/trafilatura/settings.py
+++ b/trafilatura/settings.py
@@ -4,6 +4,7 @@
 """
 
 from configparser import ConfigParser
+from datetime import datetime
 
 try:
     from os import sched_getaffinity
@@ -16,7 +17,6 @@
 from lxml.etree import XPath
 
 
-
 def use_config(filename=None, config=None):
     """
     Use configuration object or read and parse a settings file.
@@ -36,6 +36,91 @@
 DEFAULT_CONFIG = use_config()
 
+
+class Extractor:
+    "Defines a class to store all extraction options."
+    __slots__ = [
+        'config',
+        # general
+        'format', 'fast', 'precision', 'recall', 'comments',
+        'formatting', 'links', 'images', 'tables', 'dedup', 'lang',
+        # extraction size
+        'min_extracted_size', 'min_output_size',
+        'min_output_comm_size', 'min_extracted_comm_size',
+        # deduplication
+        'min_duplcheck_size', 'max_repetitions',
+        # rest
+        'max_file_size', 'min_file_size', 'max_tree_size',
+        # meta
+        'source', 'url', 'only_with_metadata', 'tei_validation',
+        'date_params',
+        'author_blacklist', 'url_blacklist'
+    ]
+    # consider dataclasses for Python 3.7+
+    def __init__(self, *, config=DEFAULT_CONFIG, output_format="txt",
+                 fast=False, precision=False, recall=False,
+                 comments=True, formatting=False, links=False, images=False,
+                 tables=True, dedup=False, lang=None, max_tree_size=None,
+                 url=None, source=None, only_with_metadata=False, tei_validation=False,
+                 author_blacklist=None, url_blacklist=None, date_params=None):
+        self._add_config(config)
+        self.format = output_format
+        self.fast = fast
+        self.precision = precision
+        self.recall = recall
+        self.comments = comments
+        self.formatting = formatting or output_format == "markdown"
+        self.links = links
+        self.images = images
+        self.tables = tables
+        self.dedup = dedup
+        self.lang = lang
+        self.max_tree_size = max_tree_size
+        self.url = url
+        self.source = url or source
+        self.only_with_metadata = only_with_metadata
+        self.tei_validation = tei_validation
+        self.author_blacklist = author_blacklist or set()
+        self.url_blacklist = url_blacklist or set()
+        self.date_params = date_params or \
+            set_date_params(self.config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'))
+
+    def _add_config(self, config):
+        "Store options loaded from config file."
+        self.min_extracted_size = config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE')
+        self.min_output_size = config.getint('DEFAULT', 'MIN_OUTPUT_SIZE')
+        self.min_output_comm_size = config.getint('DEFAULT', 'MIN_OUTPUT_COMM_SIZE')
+        self.min_extracted_comm_size = config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE')
+        self.min_duplcheck_size = config.getint('DEFAULT', 'MIN_DUPLCHECK_SIZE')
+        self.max_repetitions = config.getint('DEFAULT', 'MAX_REPETITIONS')
+        self.max_file_size = config.getint('DEFAULT', 'MAX_FILE_SIZE')
+        self.min_file_size = config.getint('DEFAULT', 'MIN_FILE_SIZE')
+        self.config = config  # todo: remove?
+
+
+def args_to_extractor(args, url=None):
+    "Derive extractor configuration from CLI args."
+    options = Extractor(
+        config=use_config(filename=args.config_file), output_format=args.output_format,
+        comments=args.no_comments, tables=args.no_tables,
+        dedup=args.deduplicate, lang=args.target_language,
+        url=url, only_with_metadata=args.only_with_metadata,
+        tei_validation=args.validate_tei
+    )
+    for attr in ("fast", "precision", "recall", "formatting", "images", "links"):
+        setattr(options, attr, getattr(args, attr))
+    return options
+
+
+def set_date_params(extensive=True):
+    "Provide default parameters for date extraction."
+    return {
+        "original_date": True,
+        "extensive_search": extensive,
+        "max_date": datetime.now().strftime("%Y-%m-%d")
+    }
+
+
 
 # Safety checks
 PARALLEL_CORES = min(len(sched_getaffinity(0)) if sched_getaffinity else cpu_count(), 16)  # 16 processes at most
 LRU_SIZE = 4096
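Usage note (not part of the patch): with the refactor above, an Extractor options object built in the settings module can be handed straight to the download helpers, which then read limits such as MIN_FILE_SIZE and MAX_FILE_SIZE from its attributes instead of querying a ConfigParser. A minimal sketch based on the new signatures, assuming the post-patch module layout; the URLs are placeholders:

    from trafilatura.downloads import buffered_downloads, fetch_url
    from trafilatura.settings import Extractor, use_config

    # build the options object once; _add_config() copies the size limits onto attributes
    options = Extractor(config=use_config())

    # single download: the options argument supersedes config inside fetch_url()
    html = fetch_url("https://example.org", options=options)

    # threaded downloads: buffered_downloads() now wraps fetch_url in functools.partial,
    # so every worker shares the same options object
    candidates = ["https://example.org", "https://httpbun.com/html"]
    for url, result in buffered_downloads(candidates, 2, options=options):
        if result is not None:
            print(url, len(result))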