Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The ;login: magazine was replaced with ;login: online. Scrape articles from the first page. The path was updated to loginonline.
- Loading branch information
Showing 2 changed files with 19 additions and 56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,69 +1,31 @@ | ||
import re | ||
from datetime import datetime | ||
|
||
import scrapy | ||
|
||
from feeds.loaders import FeedEntryItemLoader | ||
from feeds.spiders import FeedsSpider | ||
from feeds.utils import generate_feed_header | ||
|
||
|
||
class UsenixOrgSpider(FeedsSpider):
    """Scrape articles from USENIX's ;login: online publication.

    ;login: online replaced the quarterly ;login: magazine, so this spider
    crawls the new publication index page and yields one feed entry per
    article found there.
    """

    name = "usenix.org"
    start_urls = ["https://www.usenix.org/publications/loginonline"]

    # Static feed metadata; presumably consumed by the FeedsSpider base
    # class when generating the feed header — confirm against feeds.spiders.
    feed_title = ";login: online"
    feed_subtitle = "An open access publication driven by the USENIX community"
    feed_link = start_urls[0]
    feed_logo = f"https://{name}/sites/all/themes/custom/cotija/images/logo.svg"
    path = "loginonline"

    def parse(self, response):
        """Request every article linked from the first index page.

        Additional pages are deliberately ignored: only the articles on the
        first page end up in the feed.
        """
        for article in response.css(".view-content a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(article), self.parse_article)

    def parse_article(self, response):
        """Build a single feed entry from an article page.

        Title and modification time come from the page's Open Graph /
        article meta tags; authors and categories come from the Drupal
        field markup.
        """
        il = FeedEntryItemLoader(response=response, base_url=self.start_urls[0])
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
        il.add_css("author_name", ".field-pseudo-field--author-list a::text")
        il.add_css("category", ".field-type-taxonomy-term-reference .field-item::text")
        il.add_css("content_html", ".paragraphs-items-full")
        il.add_value("path", self.path)
        return il.load_item()