Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework Gem2Go spider (was riskommunal) #241

Merged
merged 5 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ scraping from there.
* :ref:`spider_npr.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_riskommunal`
* :ref:`spider_gem2go`
* :ref:`spider_servustv.com`
* :ref:`spider_tuwien.ac.at`
* :ref:`spider_momoxfashion.com`
Expand Down
14 changes: 7 additions & 7 deletions docs/spiders/riskommunal.rst → docs/spiders/gem2go.rst
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
.. _spider_riskommunal:
.. _spider_gem2go:

riskommunal
-----------
gem2go
------
News from your local town or community, if their website is maintained with
`RIS Kommunal <https://info.riskommunal.net/>`_.
`GEM2GO <https://www.gem2go.at/>`_.

Configuration
~~~~~~~~~~~~~
Add ``riskommunal`` to the list of spiders:
Add ``gem2go`` to the list of spiders:

.. code-block:: ini

# List of spiders to run by default, one per line.
spiders =
riskommunal
gem2go

At least one URL is required. The local community or town website typically has
a "News" or "Neuigkeiten" URL that you may use.

.. code-block:: ini

[riskommunal]
[gem2go]
urls =
http://yourlocalcommunity.tld/News
https://mytown.tld/BUeRGERSERVICE/Neuigkeiten
2 changes: 1 addition & 1 deletion feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ useragent = feeds (+https://github.com/pyfeeds/pyfeeds)
# special-report
# leaders

#[riskommunal]
#[gem2go]
#urls =
# http://yourlocalcommunity.tld/News
# https://mytown.tld/BUeRGERSERVICE/Neuigkeiten
70 changes: 36 additions & 34 deletions feeds/spiders/riskommunal.py → feeds/spiders/gem2go.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from urllib.parse import urlparse
from urllib.parse import urljoin, urlparse

import scrapy

Expand All @@ -7,8 +7,8 @@
from feeds.utils import generate_feed_header


class RisKommunalSpider(FeedsSpider):
name = "riskommunal"
class Gem2GoSpider(FeedsSpider):
name = "gem2go"
custom_settings = {"COOKIES_ENABLED": True}
allowed_domains = []
cookies = {"ris-cookie": "g7750", "ris_cookie_setting": "g7750"}
Expand All @@ -17,20 +17,18 @@ class RisKommunalSpider(FeedsSpider):
_titles = {}
_subtitles = {}
_links = {}
_icons = {}

def feed_headers(self):
for site in self._sites:
yield generate_feed_header(
title=self._titles.get(site),
subtitle=self._subtitles.get(site),
link=self._links.get(site),
icon=self._icons.get(site),
path=site,
)

def start_requests(self):
urls = self.settings.get("FEEDS_SPIDER_RISKOMMUNAL_URLS")
urls = self.settings.get("FEEDS_SPIDER_GEM2GO_URLS")
if not urls:
self.logger.error("Please specify url(s) in the config file!")
return
Expand All @@ -52,35 +50,33 @@ def start_requests(self):
def parse(self, response):
site = response.meta["site"]

title = response.css("meta[property='og:title']::attr('content')").get()
title = response.css("meta[property='og:title']::attr(content)").get()
self._titles[site] = title
self._subtitles[site] = f"Neuigkeiten aus {title}"

icon = response.css("link[rel='icon']::attr('href')").get()
if icon and icon.startswith("/"):
icon = f"{self._links[site]}{icon}"
self._icons[site] = icon

for selector in response.css("div.newslist div[class*='float_left']"):
updated = selector.css("p.float_right::text").get()
if not updated:
# Do not care about "archived news"
continue

url = selector.css("a::attr('href')").get()
if not url:
# Ignore articles without a link
continue

if url.startswith("/"):
url = f"{self._links[site]}{url}"

yield scrapy.Request(
url,
self.parse_article,
cookies=self.cookies,
meta={"site": site, "updated": updated},
)
# Sites use different versions of the same "CMS". Extract the "news"
# container first and for each container scrape the article URL and the
# publication date.
for query_container, query_updated in [
("div.newslist div[class*='float_left']", "p.float_right::text"),
(".bemCard", ".card-footer .bemContainer--date::text"),
]:
for selector in response.css(query_container):
url = selector.css("a::attr(href)").get()
if not url:
# Ignore articles without a link
continue

# The publication date might be present only on the overview page,
# only on the article page or mix and match on both.
updated = selector.css(query_updated).get()

yield scrapy.Request(
urljoin(self._links[site], url),
self.parse_article,
cookies=self.cookies,
meta={"site": site, "updated": updated},
)

def parse_article(self, response):
site = response.meta["site"]
Expand All @@ -91,15 +87,21 @@ def parse_article(self, response):
dayfirst=True,
yearfirst=False,
remove_elems=[
"div#main-content-header",
"div.main-content h1:first-of-type",
"div.newsdatum_container",
"p#ctl00_ctl00_ctl00_cph_col_a_cph_content_cph_content_detail_p_date",
"div#main-content-header",
"span.bemContainer--publishDate",
"span.bemContainer--readingTime",
],
)
il.add_value("path", site)
il.add_value("link", response.url)
il.add_value("updated", response.meta["updated"])
il.add_css("title", "div.main-content h1:first-of-type::text")
il.add_css("updated", ".bemContainer--publishDate ::text")
il.add_css("updated", ".newsdatum_container ::text")
il.add_css("updated", "p[id$='detail_p_date'] ::text")
il.add_css("title", "div.main-content h1:first-of-type ::text")
il.add_css("content_html", "div.main-content")

yield il.load_item()
Expand Down