Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework Gem2Go spider (was riskommunal) #241

Merged
merged 5 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ scraping from there.
* :ref:`spider_npr.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_riskommunal`
* :ref:`spider_gem2go`
* :ref:`spider_servustv.com`
* :ref:`spider_tuwien.ac.at`
* :ref:`spider_momoxfashion.com`
Expand Down
14 changes: 7 additions & 7 deletions docs/spiders/riskommunal.rst → docs/spiders/gem2go.rst
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
.. _spider_riskommunal:
.. _spider_gem2go:

riskommunal
-----------
gem2go
------
News from your local town or community, if their website is maintained with
`RIS Kommunal <https://info.riskommunal.net/>`_.
`GEM2GO <https://www.gem2go.at/>`_.

Configuration
~~~~~~~~~~~~~
Add ``riskommunal`` to the list of spiders:
Add ``gem2go`` to the list of spiders:

.. code-block:: ini

# List of spiders to run by default, one per line.
spiders =
riskommunal
gem2go

At least one URL is required. The local community or town website typically has
a "News" or "Neuigkeiten" URL that you may use.

.. code-block:: ini

[riskommunal]
[gem2go]
urls =
http://yourlocalcommunity.tld/News
https://mytown.tld/BUeRGERSERVICE/Neuigkeiten
2 changes: 1 addition & 1 deletion feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ useragent = feeds (+https://github.com/pyfeeds/pyfeeds)
# special-report
# leaders

#[riskommunal]
#[gem2go]
#urls =
# http://yourlocalcommunity.tld/News
# https://mytown.tld/BUeRGERSERVICE/Neuigkeiten
70 changes: 36 additions & 34 deletions feeds/spiders/riskommunal.py → feeds/spiders/gem2go.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from urllib.parse import urlparse
from urllib.parse import urljoin, urlparse

import scrapy

Expand All @@ -7,8 +7,8 @@
from feeds.utils import generate_feed_header


class RisKommunalSpider(FeedsSpider):
name = "riskommunal"
class Gem2GoSpider(FeedsSpider):
name = "gem2go"
custom_settings = {"COOKIES_ENABLED": True}
allowed_domains = []
cookies = {"ris-cookie": "g7750", "ris_cookie_setting": "g7750"}
Expand All @@ -17,20 +17,18 @@ class RisKommunalSpider(FeedsSpider):
_titles = {}
_subtitles = {}
_links = {}
_icons = {}

def feed_headers(self):
for site in self._sites:
yield generate_feed_header(
title=self._titles.get(site),
subtitle=self._subtitles.get(site),
link=self._links.get(site),
icon=self._icons.get(site),
path=site,
)

def start_requests(self):
urls = self.settings.get("FEEDS_SPIDER_RISKOMMUNAL_URLS")
urls = self.settings.get("FEEDS_SPIDER_GEM2GO_URLS")
if not urls:
self.logger.error("Please specify url(s) in the config file!")
return
Expand All @@ -52,35 +50,33 @@ def start_requests(self):
def parse(self, response):
site = response.meta["site"]

title = response.css("meta[property='og:title']::attr('content')").get()
title = response.css("meta[property='og:title']::attr(content)").get()
self._titles[site] = title
self._subtitles[site] = f"Neuigkeiten aus {title}"

icon = response.css("link[rel='icon']::attr('href')").get()
if icon and icon.startswith("/"):
icon = f"{self._links[site]}{icon}"
self._icons[site] = icon

for selector in response.css("div.newslist div[class*='float_left']"):
updated = selector.css("p.float_right::text").get()
if not updated:
# Do not care about "archived news"
continue

url = selector.css("a::attr('href')").get()
if not url:
# Ignore articles without a link
continue

if url.startswith("/"):
url = f"{self._links[site]}{url}"

yield scrapy.Request(
url,
self.parse_article,
cookies=self.cookies,
meta={"site": site, "updated": updated},
)
# Sites use different versions of the same "CMS". Extract the "news"
# container first and for each container scrape the article URL and the
# publication date.
for query_container, query_updated in [
("div.newslist div[class*='float_left']", "p.float_right::text"),
(".bemCard", ".card-footer .bemContainer--date::text"),
]:
for selector in response.css(query_container):
url = selector.css("a::attr(href)").get()
if not url:
# Ignore articles without a link
continue

# The publication date might be present only on the overview page,
# only on the article page or mix and match on both.
updated = selector.css(query_updated).get()

yield scrapy.Request(
urljoin(self._links[site], url),
self.parse_article,
cookies=self.cookies,
meta={"site": site, "updated": updated},
)

def parse_article(self, response):
site = response.meta["site"]
Expand All @@ -91,15 +87,21 @@ def parse_article(self, response):
dayfirst=True,
yearfirst=False,
remove_elems=[
"div#main-content-header",
"div.main-content h1:first-of-type",
"div.newsdatum_container",
"p#ctl00_ctl00_ctl00_cph_col_a_cph_content_cph_content_detail_p_date",
"div#main-content-header",
"span.bemContainer--publishDate",
"span.bemContainer--readingTime",
],
)
il.add_value("path", site)
il.add_value("link", response.url)
il.add_value("updated", response.meta["updated"])
il.add_css("title", "div.main-content h1:first-of-type::text")
il.add_css("updated", ".bemContainer--publishDate ::text")
il.add_css("updated", ".newsdatum_container ::text")
il.add_css("updated", "p[id$='detail_p_date'] ::text")
il.add_css("title", "div.main-content h1:first-of-type ::text")
il.add_css("content_html", "div.main-content")

yield il.load_item()
Expand Down