/
saugdat
executable file
·128 lines (106 loc) · 4.18 KB
/
saugdat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
# Download all videos from 34c3
#
# Before you can use it you need to get your environment up and running
# and install some packages via pip. I used a virtual environment for
# that (requires you to install virtualenv on your computer)
# 1. virtualenv -p python3 ~/.virtualenvs/saugdat/
# 2. source ~/.virtualenvs/saugdat/bin/activate
# 3. pip install requests bs4
# 4. chmod +x saugdat
# 5. ./saugdat > urls.txt
#
# If you want to fetch videos from other URLs or need to change the
# delimiter have a look at the last line of this script.
# Has only been tested with 34c3 video footage but may work with other
# CCC events as well.
# Imports we need
import sys
import requests
from bs4 import BeautifulSoup
# Crawl configuration. BASE_URL is prefixed to START_PAGE and to every
# talk link found on that page to form the URLs that get fetched; both
# values are handed to Linkgetter at the bottom of this script.
BASE_URL = "https://media.ccc.de/"
START_PAGE = "/c/34c3"
###
### Here be dragons
###
class Linkgetter:
    """
    Crawl an event page and print the HQ video links of every talk.

    The event's start page is fetched, every link to a talk page is
    followed, and the h264-hd download links found on each talk page
    are written to stdout, so the output can be redirected to a file.
    Diagnostics go to stderr to keep that output clean.
    """

    base_url = ""
    start_page = ""
    delimiter = " "
    # Seconds to wait for the server before a request is abandoned;
    # without a timeout a stalled connection would hang the script.
    request_timeout = 30

    def __init__(self, base_url, start_page="", delimiter=" "):
        """
        Store the configuration, run the crawl and exit the process.

        Note that constructing an instance never returns: the
        interpreter exits with status 0 once the crawl has finished or
        was interrupted with Ctrl-C.

        :param base_url: Root URL that relative links are appended to
        :param start_page: Path of the event's overview page
        :param delimiter: Char/String used to separate URLs in output
        """
        self.base_url = base_url
        self.start_page = start_page
        self.delimiter = delimiter
        # enter main run loop
        try:
            self.main()
            sys.exit(0)
        except KeyboardInterrupt:
            # Ctrl-C is treated as a normal way to stop the crawl
            sys.exit(0)

    def main(self):
        """
        Main run loop of the script, coordinates actions
        """
        # Get links from the event's starting page
        links_from_main_page = self.get_links_from_page(
            self.start_page, base_url=self.base_url
        )
        links_to_talks = self.filter_video_links(links_from_main_page)
        # Iterate the links to the talks and get the HQ links from there.
        # Sets have no stable order, so sort to make the output
        # reproducible between runs.
        for talk_link in sorted(links_to_talks):
            links_from_talk = self.get_links_from_page(
                talk_link, base_url=self.base_url
            )
            hq_links = self.filter_hq_video_links(links_from_talk)
            if hq_links:
                # Skip talks without HQ links instead of emitting an
                # empty line into the URL list
                print(self.delimiter.join(sorted(hq_links)))

    @staticmethod
    def _build_url(base_url, url):
        """
        Join base_url and url with exactly one slash between them.

        :param base_url: Leading part of the URL, may be empty
        :param url: Trailing part of the URL, may be empty
        :return: The combined URL
        """
        if not base_url or not url:
            # Nothing to join, plain concatenation is already correct
            return "{}{}".format(base_url, url)
        return "{}/{}".format(base_url.rstrip("/"), url.lstrip("/"))

    def get_links_from_page(self, url, base_url=""):
        """
        Get all links from a document referred to by a given URL.

        Failures (network errors, HTTP error statuses, timeouts) are
        reported on stderr and result in the links collected so far
        being returned, so one bad page does not abort the whole crawl.

        :param url: URL to fetch, might be relative if base_url is set
        :param base_url: Base url, "starting part" of URL (optional)
        :return: set with the href values of all links on the page
        """
        full_url = self._build_url(base_url, url)
        result = set()
        try:
            response = requests.get(full_url, timeout=self.request_timeout)
            # Do not harvest links from an HTTP error page
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # href=True skips anchors without an href attribute, which
            # would otherwise put None into the result
            for link in soup.find_all("a", href=True):
                result.add(link["href"])
        except Exception as e:
            # Deliberately broad: the crawl is best effort per page
            print(
                "Could not get links from {}: {}".format(full_url, e),
                file=sys.stderr,
            )
        return result

    def filter_video_links(self, set_of_links):
        """
        Returns the links to talk pages that are contained in the
        set of links provided to this function.

        The pattern which is used to identify video links needs
        to be changed if you are not using media.ccc.de

        :param set_of_links: Set of links to process, None and empty
                             entries are ignored
        :return: Set of links pointing to videos
        """
        # If href starts with "/v/" we found a video link
        return {
            link for link in set_of_links
            if link and link.startswith("/v/")
        }

    def filter_hq_video_links(self, set_of_links):
        """
        Returns a set of links pointing to HQ videos.

        The pattern which is used to identify video links needs
        to be changed if you are not using media.ccc.de or if
        you want to download the videos in another format

        :param set_of_links: Set of links to process, None and empty
                             entries are ignored
        :return: Set of links pointing to HQ videos
        """
        # We are interested in h264-hd videos but not
        # in videos having "slides" in their name
        return {
            link for link in set_of_links
            if link and "h264-hd" in link and "slides" not in link
        }
if __name__ == "__main__":
    # Script entry point. Constructing the Linkgetter runs the whole
    # crawl and exits the process; one URL is printed per line.
    Linkgetter(BASE_URL, START_PAGE, "\n")