-
Notifications
You must be signed in to change notification settings - Fork 11
/
puls4_com.py
82 lines (72 loc) · 2.59 KB
/
puls4_com.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
import re
import scrapy
from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
class Puls4ComSpider(FeedsSpider):
    """Spider producing feed entries for recent full episodes on puls4.com.

    Request chain, as implemented below:

    1. ``parse``: the start URL returns JSON; the URL of its first
       ``content`` entry is requested.
    2. ``_parse_shows_list``: that response lists shows under
       ``formatOverviewItems``; each show's "Ganze-Folgen" video page is
       requested.
    3. ``_parse_show``: the first few episode links of a show are followed.
    4. ``_parse_episode``: one feed entry is built per episode page.
    """

    name = "puls4.com"
    start_urls = ["https://www.puls4.com/api/json-fe/page/sendungen"]
    feed_icon = (
        "https://www.puls4.com/bundles/wundermanpuls4/images/"
        "favicon/favicon.png"
    )

    # Number of episode links followed per show page.
    _MAX_EPISODES_PER_SHOW = 3

    def parse(self, response):
        """Request the shows list referenced by the overview page.

        :param response: JSON response for the start URL; must contain
            ``content[0]["url"]``.
        :returns: a request for that URL, handled by ``_parse_shows_list``.
        :raises KeyError, IndexError: if the JSON lacks the expected entry.
        """
        path = json.loads(response.text)["content"][0]["url"]
        return scrapy.Request(
            response.urljoin(path),
            self._parse_shows_list,
            meta={"dont_cache": True},
        )

    def _parse_shows_list(self, response):
        """Yield one request per show for its "Ganze-Folgen" video page.

        The first ``HH:MM`` found in a show's ``announcement`` text is passed
        along in ``meta["time"]`` (``None`` if there is none) so that
        ``_parse_episode`` can combine it with the episode date.

        :param response: JSON response containing ``formatOverviewItems``.
        """
        shows = json.loads(response.text)["formatOverviewItems"]
        for show in shows:
            # The time is optional downstream (falls back to "00:00"), so a
            # missing or null announcement is treated like "no time found"
            # instead of aborting the whole shows list.
            announcement = show.get("announcement") or ""
            match = re.search(r"(\d{2}:\d{2})", announcement)
            air_time = match.group(1) if match else None
            yield scrapy.Request(
                response.urljoin(show["channelUrl"] + "/videos/Ganze-Folgen"),
                self._parse_show,
                meta={
                    "dont_cache": True,
                    "time": air_time,
                    # Let 404 responses reach the callback; _parse_show
                    # checks the status itself and skips them.
                    "handle_httpstatus_list": [404],
                },
            )

    def _parse_show(self, response):
        """Yield requests for the first episodes linked on a show page.

        Responses with a status other than 200 are ignored (shows without a
        "Ganze-Folgen" page presumably answer with 404).

        :param response: HTML response of a show's video listing; its
            ``meta["time"]`` is forwarded to ``_parse_episode``.
        """
        if response.status != 200:
            self.logger.debug("Ignoring response with status code != 200")
            return

        links = response.css("a.media-preview-link::attr(href)").extract()
        for url in links[: self._MAX_EPISODES_PER_SHOW]:
            yield scrapy.Request(
                response.urljoin(url),
                self._parse_episode,
                meta={"time": response.meta["time"]},
            )

    def _parse_episode(self, response):
        """Build the feed entry for a single episode page.

        Title and date are both taken from the ``<meta name="title">`` tag,
        whose content looks like ``"<title> vom DD.MM.YYYY - puls4.com"``
        according to the patterns used here. The date is combined with the
        show's broadcast time from ``meta["time"]`` (``"00:00"`` if unknown).

        ``updated`` and the preview image are only added when the page
        actually provides them; otherwise the literal string ``"None"`` would
        end up in the date string or in the ``<img>`` tag.

        :param response: HTML response of an episode page.
        :returns: the loaded feed entry item.
        """
        il = FeedEntryItemLoader(
            response=response,
            base_url=f"https://{self.name}",
            timezone="Europe/Vienna",
            dayfirst=True,
        )
        il.add_value("link", response.url)
        il.add_xpath(
            "title",
            '//meta[@name="title"]/@content',
            re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
        )

        # re_first() returns None when the title carries no "vom <date>".
        air_date = response.xpath('//meta[@name="title"]/@content').re_first(
            r".*vom (\d{2}\.\d{2}\.\d{4}).*"
        )
        if air_date:
            air_time = response.meta.get("time") or "00:00"
            il.add_value("updated", f"{air_date} {air_time}")

        # extract_first() returns None when there is no og:image tag.
        image_url = response.xpath(
            '//meta[@property="og:image"]/@content'
        ).extract_first()
        if image_url:
            il.add_value("content_html", '<img src="{}">'.format(image_url))

        il.add_css("content_html", ".player-video-description-intro::text")
        return il.load_item()