Merge pull request #252 from arcolife/html_cpu_pool
fixes #172 and #169: NewsPlease.from_urls() - use multiprocessing
fhamborg committed Aug 30, 2023
2 parents 07df0b5 + 1b233aa commit 1cb8952
Showing 1 changed file with 30 additions and 5 deletions.
newsplease/__init__.py
@@ -1,3 +1,4 @@
+import concurrent.futures as cf
 import datetime
 import os
 import sys
@@ -71,6 +72,9 @@ def from_html(html, url=None, download_date=None, fetch_images=True):
         :param url:
         :return:
         """
+        if bool(html) is False:
+            return {}
+
         extractor = article_extractor.Extractor(
             (
                 ["newspaper_extractor"]
@@ -132,11 +136,32 @@ def from_urls(urls, timeout=None):
         results = {}
         download_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
-        if len(urls) > 0:
-            results = SimpleCrawler.fetch_urls(urls, timeout=timeout)
-            for url, html in results.items():
-                if html:
-                    results[url] = NewsPlease.from_html(html, url, download_date)
+        if len(urls) == 0:
+            # Nested blocks of code should not be left empty.
+            # When a block contains a comment, this block is not considered to be empty
+            pass
+        elif len(urls) == 1:
+            url = urls[0]
+            html = SimpleCrawler.fetch_url(url, timeout=timeout)
+            results[url] = NewsPlease.from_html(html, url, download_date)
+        else:
+            results = SimpleCrawler.fetch_urls(urls)
+
+            futures = {}
+            with cf.ProcessPoolExecutor() as exec:
+                for url in results:
+                    future = exec.submit(
+                        NewsPlease.from_html, results[url], url, download_date
+                    )
+                    futures[future] = url
+
+            for future in cf.as_completed(futures):
+                url = futures[future]
+                try:
+                    results[url] = future.result(timeout=timeout)
+                except Exception as err:
+                    results[url] = {}
+
         return results
 
     @staticmethod
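Taken together, from_urls now fetches all pages in a single SimpleCrawler batch and fans the per-page extraction out across a ProcessPoolExecutor, collecting each article as its worker completes; a worker that raises maps its URL to an empty dict instead of failing the whole batch. A minimal usage sketch, assuming news-please is installed; the URLs are placeholders:

    from newsplease import NewsPlease

    urls = [
        "https://www.example.com/article-1",  # placeholder URL
        "https://www.example.com/article-2",  # placeholder URL
    ]

    # With more than one URL, extraction runs in a process pool; a page
    # that fails extraction comes back as {} rather than aborting the
    # rest of the batch.
    articles = NewsPlease.from_urls(urls, timeout=10)
    for url, article in articles.items():
        if article:
            print(url, "->", getattr(article, "title", None))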
