Merge pull request #252 from arcolife/html_cpu_pool
fixes #172 and #169: NewsPlease.from_urls() - use multiprocessing
fhamborg committed Aug 30, 2023
2 parents 07df0b5 + 1b233aa commit 1cb8952
Showing 1 changed file with 30 additions and 5 deletions.
newsplease/__init__.py
@@ -1,3 +1,4 @@
+import concurrent.futures as cf
 import datetime
 import os
 import sys
@@ -71,6 +72,9 @@ def from_html(html, url=None, download_date=None, fetch_images=True):
         :param url:
         :return:
         """
+        if bool(html) is False:
+            return {}
+
         extractor = article_extractor.Extractor(
             (
                 ["newspaper_extractor"]
@@ -132,11 +136,32 @@ def from_urls(urls, timeout=None):
         results = {}
         download_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
-        if len(urls) > 0:
-            results = SimpleCrawler.fetch_urls(urls, timeout=timeout)
-            for url, html in results.items():
-                if html:
-                    results[url] = NewsPlease.from_html(html, url, download_date)
+        if len(urls) == 0:
+            # Nested blocks of code should not be left empty.
+            # When a block contains a comment, this block is not considered to be empty
+            pass
+        elif len(urls) == 1:
+            url = urls[0]
+            html = SimpleCrawler.fetch_url(url, timeout=timeout)
+            results[url] = NewsPlease.from_html(html, url, download_date)
+        else:
+            results = SimpleCrawler.fetch_urls(urls)
+
+            futures = {}
+            with cf.ProcessPoolExecutor() as exec:
+                for url in results:
+                    future = exec.submit(
+                        NewsPlease.from_html, results[url], url, download_date
+                    )
+                    futures[future] = url
+
+            for future in cf.as_completed(futures):
+                url = futures[future]
+                try:
+                    results[url] = future.result(timeout=timeout)
+                except Exception as err:
+                    results[url] = {}
+
         return results
 
     @staticmethod
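Taken together, from_urls now fetches all pages in a single SimpleCrawler batch and fans the per-page extraction out across a ProcessPoolExecutor, collecting each article as its worker completes; a worker that raises maps its URL to an empty dict instead of failing the whole batch. A minimal usage sketch, assuming news-please is installed; the URLs are placeholders:

    from newsplease import NewsPlease

    urls = [
        "https://www.example.com/article-1",  # placeholder URL
        "https://www.example.com/article-2",  # placeholder URL
    ]

    # With more than one URL, extraction runs in a process pool; a page
    # that fails extraction comes back as {} rather than aborting the
    # rest of the batch.
    articles = NewsPlease.from_urls(urls, timeout=10)
    for url, article in articles.items():
        if article:
            print(url, "->", getattr(article, "title", None))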
