Commit

fix #255
fhamborg committed Dec 27, 2023
1 parent 7fd48ad commit 1d30c5b
Showing 2 changed files with 37 additions and 18 deletions.
53 changes: 36 additions & 17 deletions newsplease/crawler/simple_crawler.py
@@ -13,10 +13,13 @@
 
 LOGGER = logging.getLogger(__name__)
 
+# user agent
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"
+
 # customize headers
 HEADERS = {
-    'Connection': 'close',
-    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
+    "Connection": "close",
+    "User-Agent": USER_AGENT,
 }
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
@@ -25,64 +28,80 @@ class SimpleCrawler(object):
     _results = {}
 
     @staticmethod
-    def fetch_url(url, timeout=None):
+    def fetch_url(url, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of the parameter url and returns the html
         :param url:
         :param timeout: in seconds, if None, the urllib default is used
         :return:
         """
-        return SimpleCrawler._fetch_url(url, False, timeout=timeout)
+        return SimpleCrawler._fetch_url(url, False, timeout=timeout, user_agent=user_agent)
 
     @staticmethod
-    def _fetch_url(url, is_threaded, timeout=None):
+    def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of the parameter url and saves the html in _results
         :param url:
         :param is_threaded: If True, results will be stored for later processing by the fetch_urls method. Else not.
         :param timeout: in seconds, if None, the urllib default is used
         :return: html of the url
         """
+        headers = HEADERS
+        if user_agent:
+            headers["User-Agent"] = user_agent
+
         html_str = None
         # send
         try:
             # read by streaming chunks (stream=True, iter_content=xx)
             # so we can stop downloading as soon as MAX_FILE_SIZE is reached
-            response = requests.get(url, timeout=timeout, verify=False, allow_redirects=True, headers=HEADERS)
+            response = requests.get(
+                url,
+                timeout=timeout,
+                verify=False,
+                allow_redirects=True,
+                headers=headers,
+            )
         except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
-            LOGGER.error('malformed URL: %s', url)
+            LOGGER.error("malformed URL: %s", url)
         except requests.exceptions.TooManyRedirects:
-            LOGGER.error('too many redirects: %s', url)
+            LOGGER.error("too many redirects: %s", url)
         except requests.exceptions.SSLError as err:
-            LOGGER.error('SSL: %s %s', url, err)
+            LOGGER.error("SSL: %s %s", url, err)
         except (
-                socket.timeout, requests.exceptions.ConnectionError,
-                requests.exceptions.Timeout, socket.error, socket.gaierror
+            socket.timeout,
+            requests.exceptions.ConnectionError,
+            requests.exceptions.Timeout,
+            socket.error,
+            socket.gaierror,
         ) as err:
-            LOGGER.error('connection/timeout error: %s %s', url, err)
+            LOGGER.error("connection/timeout error: %s %s", url, err)
         else:
             # safety checks
             if response.status_code != 200:
-                LOGGER.error('not a 200 response: %s', response.status_code)
+                LOGGER.error("not a 200 response: %s", response.status_code)
            elif response.text is None or len(response.text) < MIN_FILE_SIZE:
-                LOGGER.error('too small/incorrect: %s %s', url, len(response.text))
+                LOGGER.error("too small/incorrect: %s %s", url, len(response.text))
            elif len(response.text) > MAX_FILE_SIZE:
-                LOGGER.error('too large: %s %s', url, len(response.text))
+                LOGGER.error("too large: %s %s", url, len(response.text))
            else:
                 html_str = decode_response(response)
         if is_threaded:
             SimpleCrawler._results[url] = html_str
         return html_str
 
     @staticmethod
-    def fetch_urls(urls, timeout=None):
+    def fetch_urls(urls, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of all given urls in parallel. Returns when all requests are processed.
         :param urls:
         :param timeout: in seconds, if None, the urllib default is used
         :return:
         """
-        threads = [threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout)) for url in urls]
+        threads = [
+            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout, user_agent))
+            for url in urls
+        ]
         for thread in threads:
             thread.start()
         for thread in threads:
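
For context, a minimal usage sketch of the crawler API after this change (not part of the commit; the URL and the custom user-agent string below are placeholders):

from newsplease.crawler.simple_crawler import SimpleCrawler

# Default behavior: the request is sent with the module-level USER_AGENT.
html = SimpleCrawler.fetch_url("https://example.com/some-article", timeout=10)

# The new keyword argument overrides the user agent for this call.
html = SimpleCrawler.fetch_url(
    "https://example.com/some-article",
    timeout=10,
    user_agent="my-crawler/1.0 (+https://example.com/bot-info)",
)

Note that _fetch_url binds headers = HEADERS rather than a copy, so a user agent passed to one call is also written into the shared module-level HEADERS dict.
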
2 changes: 1 addition & 1 deletion setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='news-please',
-      version='1.5.35',
+      version='1.5.41',
       description="news-please is an open source easy-to-use news extractor that just works.",
       long_description="""\
 news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can follow recursively internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website. Furthermore, its API allows developers to access the exctraction functionality within their software. news-please also implements a workflow optimized for the news archive provided by commoncrawl.org, allowing users to efficiently crawl and extract news articles including various filter options.""",
