From f99dd126847d8794ba2bd0e415020df504401bc5 Mon Sep 17 00:00:00 2001
From: Florian Preinstorfer <site-github@nblock.org>
Date: Sun, 28 Apr 2024 17:54:35 +0200
Subject: [PATCH] gem2go: add support for yet another version

Sites use different versions of the same "CMS". Extract the "news"
containers first, then scrape the article URL and the publication date
from each container.
---
 feeds/spiders/gem2go.py | 42 +++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/feeds/spiders/gem2go.py b/feeds/spiders/gem2go.py
index 7cfb684..030b333 100644
--- a/feeds/spiders/gem2go.py
+++ b/feeds/spiders/gem2go.py
@@ -54,22 +54,29 @@ def parse(self, response):
         self._titles[site] = title
         self._subtitles[site] = f"Neuigkeiten aus {title}"
 
-        for selector in response.css("div.newslist div[class*='float_left']"):
-            url = selector.css("a::attr(href)").get()
-            if not url:
-                # Ignore articles without a link
-                continue
-
-            # The publication date might be present only on the overview page,
-            # only on the article page or mix and match on both.
-            updated = selector.css("p.float_right::text").get()
-
-            yield scrapy.Request(
-                urljoin(self._links[site], url),
-                self.parse_article,
-                cookies=self.cookies,
-                meta={"site": site, "updated": updated},
-            )
+        # Sites use different versions of the same "CMS". Extract the "news"
+        # container first and for each container scrape the article URL and the
+        # publication date.
+        for query_container, query_updated in [
+            ("div.newslist div[class*='float_left']", "p.float_right::text"),
+            (".bemCard", ".card-footer .bemContainer--date::text"),
+        ]:
+            for selector in response.css(query_container):
+                url = selector.css("a::attr(href)").get()
+                if not url:
+                    # Ignore articles without a link
+                    continue
+
+                # The publication date might be present only on the overview page,
+                # only on the article page or mix and match on both.
+                updated = selector.css(query_updated).get()
+
+                yield scrapy.Request(
+                    urljoin(self._links[site], url),
+                    self.parse_article,
+                    cookies=self.cookies,
+                    meta={"site": site, "updated": updated},
+                )
 
     def parse_article(self, response):
         site = response.meta["site"]
@@ -84,11 +91,14 @@ def parse_article(self, response):
                 "div.main-content h1:first-of-type",
                 "div.newsdatum_container",
                 "p#ctl00_ctl00_ctl00_cph_col_a_cph_content_cph_content_detail_p_date",
+                "span.bemContainer--publishDate",
+                "span.bemContainer--readingTime",
             ],
         )
         il.add_value("path", site)
         il.add_value("link", response.url)
         il.add_value("updated", response.meta["updated"])
+        il.add_css("updated", ".bemContainer--publishDate ::text")
         il.add_css("updated", ".newsdatum_container ::text")
         il.add_css("updated", "p[id$='detail_p_date'] ::text")
         il.add_css("title", "div.main-content h1:first-of-type ::text")