diff --git a/docs/spiders/usenix.org.rst b/docs/spiders/usenix.org.rst
index fd5f041..fdff014 100644
--- a/docs/spiders/usenix.org.rst
+++ b/docs/spiders/usenix.org.rst
@@ -2,7 +2,8 @@
 usenix.org
 ----------
 
-Newest issues of the Usenix Magazine ;login:.
+Newest articles from `Usenix ;login: online
+<https://www.usenix.org/publications/loginonline>`_.
 
 Configuration
 ~~~~~~~~~~~~~
diff --git a/feeds/spiders/usenix_org.py b/feeds/spiders/usenix_org.py
index e9170f9..a8d7b2c 100644
--- a/feeds/spiders/usenix_org.py
+++ b/feeds/spiders/usenix_org.py
@@ -1,69 +1,31 @@
-import re
-from datetime import datetime
-
 import scrapy
 
 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsSpider
-from feeds.utils import generate_feed_header
 
 
 class UsenixOrgSpider(FeedsSpider):
     name = "usenix.org"
-    start_urls = ["https://www.usenix.org/publications/login"]
+    start_urls = ["https://www.usenix.org/publications/loginonline"]
 
-    def feed_headers(self):
-        return []
+    feed_title = ";login: online"
+    feed_subtitle = "An open access publication driven by the USENIX community"
+    feed_link = start_urls[0]
+    feed_logo = f"https://{name}/sites/all/themes/custom/cotija/images/logo.svg"
+    path = "loginonline"
 
     def parse(self, response):
-        # Only scrape the last 8 issues.
-        issues = response.css(".issues .month a::attr(href)").extract()[:8]
-        yield generate_feed_header(
-            title=";login:",
-            subtitle="The Usenix Magazine",
-            link=response.url,
-            path="login",
-        )
-        for issue in issues:
-            yield scrapy.Request(response.urljoin(issue), self.parse_login_issue)
+        # Find articles on the first page. Ignore additional pages.
+        for article in response.css(".view-content a::attr(href)").extract():
+            yield scrapy.Request(response.urljoin(article), self.parse_article)
 
-    def parse_login_issue(self, response):
-        remove_elems = [
-            ".field-name-field-file-access",
-            ".field-name-field-login-issue-file",
-            ".field-name-field-product",
-            ".field-commerce-price",
-            ".views-field-field-file-access",
-            ".view-header",
-        ]
-        il = FeedEntryItemLoader(
-            response=response,
-            base_url=f"https://www.{self.name}",
-            remove_elems=remove_elems,
-        )
+    def parse_article(self, response):
+        il = FeedEntryItemLoader(response=response, base_url=self.start_urls[0])
         il.add_value("link", response.url)
-        title = response.css("h1::text").extract_first().strip()
-        il.add_value("title", title)
-        il.add_value("updated", self._date_from_title(title))
-        il.add_css("content_html", ".content-wrapper")
-        il.add_value("path", "login")
-        if response.css(".usenix-files-protected"):
-            il.add_value("category", "paywalled")
+        il.add_css("title", 'meta[property="og:title"]::attr(content)')
+        il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
+        il.add_css("author_name", ".field-pseudo-field--author-list a::text")
+        il.add_css("category", ".field-type-taxonomy-term-reference .field-item::text")
+        il.add_css("content_html", ".paragraphs-items-full")
+        il.add_value("path", self.path)
         return il.load_item()
-
-    def _date_from_title(self, issue):
-        """Try to guess the publication date of an issue from the title."""
-        match = re.search(
-            r"(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})", issue
-        )
-        if match:
-            seasons = {"Winter": "1", "Spring": "4", "Summer": "7", "Fall": "10"}
-            month = int(seasons[match.group("season")])
-            year = int(match.group("year"))
-            date = datetime(day=1, month=month, year=year)
-            # Issues become free after a year, which should be reflected by
-            # bumping the updated date by a year as well.
-            date_free = datetime(day=1, month=month, year=year + 1)
-            return date_free if date_free < date.utcnow() else date
-        else:
-            self.logger.warning(f'Could not extract date from title "{issue}"!')