usenix.org: scrape login online
The ;login: magazine was replaced with ;login: online. Scrape articles
from the first page. The path was updated to loginonline.
nblock committed Apr 27, 2024
1 parent 7cf62af commit 0783e38
Showing 2 changed files with 19 additions and 56 deletions.
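
The rewritten parse() no longer walks issue archives; it follows every article link on the first listing page. A minimal standalone sketch of that extraction step, using scrapy's Selector: the ".view-content" container and the CSS selector come from the diff, while the listing markup itself is a made-up stand-in for the real page.

    from urllib.parse import urljoin

    from scrapy import Selector

    # Hypothetical excerpt of the loginonline listing page; only the
    # ".view-content" container and the anchor hrefs mirror what the
    # spider's selector expects.
    listing_html = """
    <div class="view-content">
      <a href="/publications/loginonline/example-article">Example article</a>
      <a href="/publications/loginonline/another-article">Another article</a>
    </div>
    """

    page_url = "https://www.usenix.org/publications/loginonline"

    # Same CSS selector as in parse(); urljoin() stands in for
    # response.urljoin() on a live Response object.
    for href in Selector(text=listing_html).css(".view-content a::attr(href)").extract():
        print(urljoin(page_url, href))
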
docs/spiders/usenix.org.rst: 2 additions & 1 deletion
@@ -2,7 +2,8 @@

 usenix.org
 ----------
-Newest issues of the Usenix Magazine ;login:.
+Newest articles from `Usenix ;login: online
+<https://www.usenix.org/publications/loginonline>`_.
 
 Configuration
 ~~~~~~~~~~~~~
feeds/spiders/usenix_org.py: 17 additions & 55 deletions
@@ -1,69 +1,31 @@
-import re
-from datetime import datetime
-
 import scrapy
 
 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsSpider
-from feeds.utils import generate_feed_header
 
 
 class UsenixOrgSpider(FeedsSpider):
     name = "usenix.org"
-    start_urls = ["https://www.usenix.org/publications/login"]
+    start_urls = ["https://www.usenix.org/publications/loginonline"]
 
-    def feed_headers(self):
-        return []
+    feed_title = ";login: online"
+    feed_subtitle = "An open access publication driven by the USENIX community"
+    feed_link = start_urls[0]
+    feed_logo = f"https://{name}/sites/all/themes/custom/cotija/images/logo.svg"
+    path = "loginonline"
 
     def parse(self, response):
-        # Only scrape the last 8 issues.
-        issues = response.css(".issues .month a::attr(href)").extract()[:8]
-        yield generate_feed_header(
-            title=";login:",
-            subtitle="The Usenix Magazine",
-            link=response.url,
-            path="login",
-        )
-        for issue in issues:
-            yield scrapy.Request(response.urljoin(issue), self.parse_login_issue)
+        # Find articles on the first page. Ignore additional pages.
+        for article in response.css(".view-content a::attr(href)").extract():
+            yield scrapy.Request(response.urljoin(article), self.parse_article)
 
-    def parse_login_issue(self, response):
-        remove_elems = [
-            ".field-name-field-file-access",
-            ".field-name-field-login-issue-file",
-            ".field-name-field-product",
-            ".field-commerce-price",
-            ".views-field-field-file-access",
-            ".view-header",
-        ]
-        il = FeedEntryItemLoader(
-            response=response,
-            base_url=f"https://www.{self.name}",
-            remove_elems=remove_elems,
-        )
+    def parse_article(self, response):
+        il = FeedEntryItemLoader(response=response, base_url=self.start_urls[0])
         il.add_value("link", response.url)
-        title = response.css("h1::text").extract_first().strip()
-        il.add_value("title", title)
-        il.add_value("updated", self._date_from_title(title))
-        il.add_css("content_html", ".content-wrapper")
-        il.add_value("path", "login")
-        if response.css(".usenix-files-protected"):
-            il.add_value("category", "paywalled")
+        il.add_css("title", 'meta[property="og:title"]::attr(content)')
+        il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
+        il.add_css("author_name", ".field-pseudo-field--author-list a::text")
+        il.add_css("category", ".field-type-taxonomy-term-reference .field-item::text")
+        il.add_css("content_html", ".paragraphs-items-full")
+        il.add_value("path", self.path)
         return il.load_item()
-
-    def _date_from_title(self, issue):
-        """Try to guess the publication date of an issue from the title."""
-        match = re.search(
-            r"(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})", issue
-        )
-        if match:
-            seasons = {"Winter": "1", "Spring": "4", "Summer": "7", "Fall": "10"}
-            month = int(seasons[match.group("season")])
-            year = int(match.group("year"))
-            date = datetime(day=1, month=month, year=year)
-            # Issues become free after a year which should be reflected by
-            # bumping the updated date by a year as well.
-            date_free = datetime(day=1, month=month, year=year + 1)
-            return date_free if date_free < date.utcnow() else date
-        else:
-            self.logger.warning(f'Could not extract date from title "{issue}"!')

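For reference, the CSS selectors that parse_article() hands to FeedEntryItemLoader can be tried in isolation. A sketch against a hypothetical article page: the class names and meta properties come from the diff, the surrounding markup is assumed.

    from scrapy.http import HtmlResponse

    # Hypothetical article markup; class names and meta properties match
    # the selectors in parse_article(), everything else is assumed.
    article_html = """
    <html>
      <head>
        <meta property="og:title" content="An Example Article" />
        <meta property="article:modified_time" content="2024-04-01T10:00:00+00:00" />
      </head>
      <body>
        <div class="field-pseudo-field--author-list"><a href="#">Jane Doe</a></div>
        <div class="field-type-taxonomy-term-reference">
          <div class="field-item">Security</div>
        </div>
        <div class="paragraphs-items-full"><p>Article body.</p></div>
      </body>
    </html>
    """

    response = HtmlResponse(
        url="https://www.usenix.org/publications/loginonline/example-article",
        body=article_html,
        encoding="utf-8",
    )

    # The same selectors parse_article() feeds into the item loader.
    print(response.css('meta[property="og:title"]::attr(content)').extract_first())
    print(response.css('meta[property="article:modified_time"]::attr(content)').extract_first())
    print(response.css(".field-pseudo-field--author-list a::text").extract_first())
    print(response.css(".field-type-taxonomy-term-reference .field-item::text").extract_first())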