Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The ;login: magazine was replaced with ;login: online. Scrape articles from the first page. The path was updated to loginonline.
- Loading branch information
Showing 2 changed files with 19 additions and 56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,69 +1,31 @@ | ||
import re | ||
from datetime import datetime | ||
|
||
import scrapy | ||
|
||
from feeds.loaders import FeedEntryItemLoader | ||
from feeds.spiders import FeedsSpider | ||
from feeds.utils import generate_feed_header | ||
|
||
|
||
class UsenixOrgSpider(FeedsSpider):
    """Scrape articles from USENIX's ;login: online publication.

    ;login: online replaced the quarterly ;login: magazine, so this spider
    crawls the new publication index page and yields one feed entry per
    article found there.
    """

    name = "usenix.org"
    start_urls = ["https://www.usenix.org/publications/loginonline"]

    # Static feed metadata; presumably consumed by the FeedsSpider base
    # class when generating the feed header — confirm against feeds.spiders.
    feed_title = ";login: online"
    feed_subtitle = "An open access publication driven by the USENIX community"
    feed_link = start_urls[0]
    feed_logo = f"https://{name}/sites/all/themes/custom/cotija/images/logo.svg"
    path = "loginonline"

    def parse(self, response):
        """Request every article linked from the first index page.

        Additional pages are deliberately ignored: only the articles on the
        first page end up in the feed.
        """
        for article in response.css(".view-content a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(article), self.parse_article)

    def parse_article(self, response):
        """Build a single feed entry from an article page.

        Title and modification time come from the page's Open Graph /
        article meta tags; authors and categories come from the Drupal
        field markup.
        """
        il = FeedEntryItemLoader(response=response, base_url=self.start_urls[0])
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
        il.add_css("author_name", ".field-pseudo-field--author-list a::text")
        il.add_css("category", ".field-type-taxonomy-term-reference .field-item::text")
        il.add_css("content_html", ".paragraphs-items-full")
        il.add_value("path", self.path)
        return il.load_item()