/
coursera-download.py
210 lines (171 loc) · 8.13 KB
/
coursera-download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os
import re
import time
import getpass
import requests
import argparse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException
def downloadfile(topic, topic_count, title, url):
    """Download one lecture video into its topic folder as an ``.mp4`` file.

    The target path is ``<coursename>/<topic_count> <topic>/<title>.mp4``,
    where ``coursename`` is the module-level name the script sets before the
    first download. The topic folder is expected to exist already.

    Args:
        topic: Topic name, used as part of the folder name.
        topic_count: Running number of the topic folder.
        title: Video title, used as the file name (without extension).
        url: URL the video is fetched from.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never stored under an ``.mp4`` name.
    """
    filename = coursename + "/" + str(topic_count) + " " + topic + "/" + title + ".mp4"
    # stream=True keeps the video out of memory; it is written to disk in
    # 1 MiB pieces instead of being fetched completely up front.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    # Report the title that was actually passed in, not a module-level global.
    print(title + ".mp4 download successfully!")
# Command-line interface: credentials, the selenium wait time and a switch
# that turns the default headless mode off.
parser = argparse.ArgumentParser()
parser.add_argument("-u", "--username", type=str, default=None,
                    help="Your coursera account username or email")
parser.add_argument("-p", "--password", type=str, default=None,
                    help="Your coursera account password")
parser.add_argument("-t", "--time", type=float, default=10,
                    help="Time for selenium web driver to wait for missing element(s) implicitly")
# The flag switches headless mode OFF, so the help text has to say so.
parser.add_argument("--headless-mode-off", action="store_true",
                    help="Turn headless mode off (open a visible browser window instead of downloading in the background)")
args = parser.parse_args()

# The wait time is later passed to time.sleep(), which rejects negative
# values; fail here instead of in the middle of a download run.
if args.time < 0:
    parser.error("--time must not be negative")

# Ask interactively for whatever was not given on the command line.
if args.username is None:
    args.username = input('Please enter your email: ')
if args.password is None:
    print("\nPassword input will be hidden from terminal, press ENTER after enter the password")
    args.password = getpass.getpass('Please enter your password: ')

count = 1  # running number that is prefixed to every downloaded video title
waiting_time = args.time
username = args.username
password = args.password
# Firefox profile with the "media.volume_scale" preference set to "0.0"
# (presumably so that lecture videos play silently).
profile = webdriver.FirefoxProfile()
profile.set_preference("media.volume_scale", "0.0")
profile.update_preferences()

# Collect the driver arguments first, then create the browser in one place.
driver_kwargs = {"firefox_profile": profile}
if not args.headless_mode_off:
    #Operating in headless mode
    opts = Options()
    opts.set_headless()
    assert opts.headless
    driver_kwargs["options"] = opts
browser = webdriver.Firefox(**driver_kwargs)

# Every element lookup waits up to `waiting_time` seconds for the element.
browser.implicitly_wait(waiting_time)
# Open the login page, fill in the credentials and submit the form.
browser.get('https://www.coursera.org/?authMode=login')
browser.find_element_by_name('email').send_keys(username)
browser.find_element_by_name('password').send_keys(password)
browser.find_element_by_xpath("//form[@name='login']/div/button").click()

# Course headings shown after login; the course to download is picked from
# this list.
courses = browser.find_elements_by_xpath("//h4[contains(@class,'headline-1-text')]")
#exit if login failed
#can be no courses found in the Last Active list also
if not courses:
    print("Wrong email or password. Please try again!")
    browser.quit()
    # exit() is an interactive-shell helper and would report success (0);
    # leave with a non-zero status instead.
    raise SystemExit(1)
# Ask which course to download. Surrounding whitespace is dropped because the
# name has to match the heading text exactly (apart from letter case).
course = input("Please enter the coursename you want to download exactly as shown in the website: ").strip()

# Folder name for the course: drop "/" and "?", turn ":" into "-" and '"'
# into "'" -- the same substitutions that are applied to topic and video names.
coursename = course.replace("/", "").replace("?", "").replace(":", "-").replace('"', "'")
print("Searching for " + course + " ...")

# Case-insensitive search for the heading with the requested name.
wanted = course.lower()
matching_course = next((c for c in courses if c.text.lower() == wanted), None)
found = matching_course is not None

#exit program if the course doesn't exist in last active list
if not found:
    print("Sorry, the course is not found in your Last Active list.")
    browser.quit()
    # Non-zero status: the requested download did not happen.
    raise SystemExit(1)

#click on course name
matching_course.click()
# Characters replaced in topic and video names before they are used in paths.
_NAME_TRANSLATION = str.maketrans({"/": None, "?": None, ":": "-", '"': "'"})


def _clean_name(text):
    """Return *text* with "/" and "?" dropped, ":" -> "-" and '"' -> "'"."""
    return text.translate(_NAME_TRANSLATION)


def _count_lectures(elements):
    """Return how many of the given elements carry the text "Lecture"."""
    return sum(1 for element in elements if element.text == "Lecture")


def _scroll_to_top():
    """Scroll to the top of the page so the next click target is not covered."""
    browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
    time.sleep(0.5)


weeks = browser.find_elements_by_xpath("//div[contains(@class,'rc-NavigationDrawer')]/a")
print("Total week for this course: " + str(len(weeks)))

topic_count = 1  # number of the next topic folder to create
try:
    #loop through all weeks
    # Index based on purpose: `weeks` is looked up again at the end of every
    # iteration, so the elements of the initial list must not be iterated.
    for i in range(len(weeks)):
        weeks[i].click()
        time.sleep(waiting_time) #longer wait time
        video_elem = browser.find_elements_by_xpath("//ul/li/a/div/div/div/div[contains(@class,'rc-WeekItemName')]/span")
        video = _count_lectures(video_elem)
        print("Total video(s) in week " + str(i+1) + ": " + str(video))

        #navigate to video page by click on the 1st video
        browser.find_element_by_xpath("//ul/li/a/div").click()
        topic_elem = browser.find_elements_by_xpath("//h3[contains(@class,'lesson-name')]")

        #expand all topic
        # Starts at index 1 -- presumably the first topic is already expanded.
        for j in range(1, len(topic_elem)):
            topic_elem[j].click()
            time.sleep(0.5)

        #browse topic
        for k in range(len(topic_elem)):
            try:
                topic = topic_elem[k].text
            except StaleElementReferenceException:
                # The headings went stale: look them up again and re-expand
                # the topics before carrying on.
                topic_elem = browser.find_elements_by_xpath("//h3[contains(@class,'lesson-name')]")
                topic = topic_elem[k].text
                for j in range(1, len(topic_elem)):
                    topic_elem[j].click()
            topic = _clean_name(topic)
            print("Browsing topic: " + topic)

            #count number of video in a topic
            video_elem2 = browser.find_elements_by_xpath("(//div[contains(@class,'item-list')])[" + str(k+1) + "]/ul/li/a/div/div/div[contains(@class,'rc-NavItemName')]/span")
            video2 = _count_lectures(video_elem2)
            print("Number of video(s) under this topic: " + str(video2))

            if video2 > 0:
                path = coursename + "/" + str(topic_count) + " " + topic
                #check if the directory exist
                if not os.path.exists(path):
                    os.makedirs(path)
                    topic_count += 1
                else:
                    print("The folder already exist, download process terminated!")
                    # The browser is closed by the `finally` clause below.
                    raise SystemExit(1)

            # `counter` walks through every item of the topic, `v` counts only
            # the items that turned out to contain a video.
            v = 0
            counter = 1
            while v < video2:
                #navigate to video page
                _scroll_to_top() #prevent element not clickable (blocked by other element)
                browser.find_element_by_xpath("((//div[contains(@class,'rc-CollapsibleLesson')])[" + str(k+1) + "]/div/ul/li)[" + str(counter) + "]").click()
                counter += 1
                #prevent browser remains at the previous page (due to slow internet speed) and get the count of video as 1
                time.sleep(1)

                #check if a video exist in the page
                video_list = browser.find_elements_by_tag_name('video')
                if video_list:
                    if not (k == 0 and v == 0):
                        _scroll_to_top() #prevent element not clickable (blocked by navbar)
                    src = browser.find_element_by_xpath("//ul/span/li[contains(@class,'rc-LectureDownloadItem')]/a")
                    src_link = src.get_attribute("href")
                    title = browser.find_element_by_xpath("(//h4)")
                    # Prefix the running video number so files sort in course order.
                    clean_title = str(count) + " " + _clean_name(title.text)
                    # topic_count was already advanced when the folder was created.
                    downloadfile(topic, topic_count-1, clean_title, src_link)
                    count += 1
                    v += 1

                #close the modal if pop up
                modal = browser.find_elements_by_class_name("c-modal-overlay")
                if modal:
                    browser.find_element_by_xpath("//div[contains(@class,'c-modal-x-out')]/a").click()
                    time.sleep(0.5)

        _scroll_to_top() #prevent element not clickable (blocked by navbar)
        #back to all week navigation
        browser.find_element_by_class_name("rc-BackToWeekButton").click()
        weeks = browser.find_elements_by_xpath("//div[contains(@class,'rc-NavigationDrawer')]/a")
        print("")

    print("All videos for " + course + " have been downloaded successfully!")
finally:
    # Close the browser on every way out (success, early exit, error, Ctrl-C)
    # so that no Firefox process -- possibly headless -- is left behind.
    browser.quit()