Commit

fix #255
fhamborg committed Dec 27, 2023
1 parent 7fd48ad commit 1d30c5b
Showing 2 changed files with 37 additions and 18 deletions.
53 changes: 36 additions & 17 deletions newsplease/crawler/simple_crawler.py
@@ -13,10 +13,13 @@
 
 LOGGER = logging.getLogger(__name__)
 
+# user agent
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"
+
 # customize headers
 HEADERS = {
-    'Connection': 'close',
-    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
+    "Connection": "close",
+    "User-Agent": USER_AGENT,
 }
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
@@ -25,64 +28,80 @@ class SimpleCrawler(object):
     _results = {}
 
     @staticmethod
-    def fetch_url(url, timeout=None):
+    def fetch_url(url, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of the parameter url and returns the html
         :param url:
         :param timeout: in seconds, if None, the urllib default is used
         :return:
         """
-        return SimpleCrawler._fetch_url(url, False, timeout=timeout)
+        return SimpleCrawler._fetch_url(url, False, timeout=timeout, user_agent=user_agent)
 
     @staticmethod
-    def _fetch_url(url, is_threaded, timeout=None):
+    def _fetch_url(url, is_threaded, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of the parameter url and saves the html in _results
         :param url:
         :param is_threaded: If True, results will be stored for later processing by the fetch_urls method. Else not.
         :param timeout: in seconds, if None, the urllib default is used
         :return: html of the url
         """
+        headers = HEADERS
+        if user_agent:
+            headers["User-Agent"] = user_agent
+
         html_str = None
         # send
         try:
             # read by streaming chunks (stream=True, iter_content=xx)
             # so we can stop downloading as soon as MAX_FILE_SIZE is reached
-            response = requests.get(url, timeout=timeout, verify=False, allow_redirects=True, headers=HEADERS)
+            response = requests.get(
+                url,
+                timeout=timeout,
+                verify=False,
+                allow_redirects=True,
+                headers=headers,
+            )
         except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
-            LOGGER.error('malformed URL: %s', url)
+            LOGGER.error("malformed URL: %s", url)
         except requests.exceptions.TooManyRedirects:
-            LOGGER.error('too many redirects: %s', url)
+            LOGGER.error("too many redirects: %s", url)
         except requests.exceptions.SSLError as err:
-            LOGGER.error('SSL: %s %s', url, err)
+            LOGGER.error("SSL: %s %s", url, err)
         except (
-                socket.timeout, requests.exceptions.ConnectionError,
-                requests.exceptions.Timeout, socket.error, socket.gaierror
+            socket.timeout,
+            requests.exceptions.ConnectionError,
+            requests.exceptions.Timeout,
+            socket.error,
+            socket.gaierror,
         ) as err:
-            LOGGER.error('connection/timeout error: %s %s', url, err)
+            LOGGER.error("connection/timeout error: %s %s", url, err)
         else:
             # safety checks
             if response.status_code != 200:
-                LOGGER.error('not a 200 response: %s', response.status_code)
+                LOGGER.error("not a 200 response: %s", response.status_code)
            elif response.text is None or len(response.text) < MIN_FILE_SIZE:
-                LOGGER.error('too small/incorrect: %s %s', url, len(response.text))
+                LOGGER.error("too small/incorrect: %s %s", url, len(response.text))
            elif len(response.text) > MAX_FILE_SIZE:
-                LOGGER.error('too large: %s %s', url, len(response.text))
+                LOGGER.error("too large: %s %s", url, len(response.text))
            else:
                 html_str = decode_response(response)
         if is_threaded:
             SimpleCrawler._results[url] = html_str
         return html_str
 
     @staticmethod
-    def fetch_urls(urls, timeout=None):
+    def fetch_urls(urls, timeout=None, user_agent=USER_AGENT):
         """
         Crawls the html content of all given urls in parallel. Returns when all requests are processed.
         :param urls:
         :param timeout: in seconds, if None, the urllib default is used
         :return:
         """
-        threads = [threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout)) for url in urls]
+        threads = [
+            threading.Thread(target=SimpleCrawler._fetch_url, args=(url, True, timeout, user_agent))
+            for url in urls
+        ]
         for thread in threads:
             thread.start()
         for thread in threads:
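
For context, a minimal usage sketch of the crawler API after this change (not part of the commit; the URL and the custom user-agent string below are placeholders):

from newsplease.crawler.simple_crawler import SimpleCrawler

# Default behavior: the request is sent with the module-level USER_AGENT.
html = SimpleCrawler.fetch_url("https://example.com/some-article", timeout=10)

# The new keyword argument overrides the user agent for this call.
html = SimpleCrawler.fetch_url(
    "https://example.com/some-article",
    timeout=10,
    user_agent="my-crawler/1.0 (+https://example.com/bot-info)",
)

Note that _fetch_url binds headers = HEADERS rather than a copy, so a user agent passed to one call is also written into the shared module-level HEADERS dict.
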
2 changes: 1 addition & 1 deletion setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='news-please',
-      version='1.5.35',
+      version='1.5.41',
       description="news-please is an open source easy-to-use news extractor that just works.",
       long_description="""\
 news-please is an open source, easy-to-use news crawler that extracts structured information from almost any news website. It can follow recursively internal hyperlinks and read RSS feeds to fetch both most recent and also old, archived articles. You only need to provide the root URL of the news website. Furthermore, its API allows developers to access the exctraction functionality within their software. news-please also implements a workflow optimized for the news archive provided by commoncrawl.org, allowing users to efficiently crawl and extract news articles including various filter options.""",
