/
scraper.py
177 lines (133 loc) · 5.56 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from playwright.sync_api import sync_playwright, Page
from configparser import ConfigParser
# Parse pdf_config.ini at import time; login() takes its default username and
# password from the [LOGIN] section of this parser.
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a
# missing pdf_config.ini would only surface later as a KeyError on
# config['LOGIN'] -- confirm the file is always present next to the script.
config = ConfigParser()
config.read('pdf_config.ini')
# A parent function that calls all the other functions and logs into the website
def login(page: Page, username: str = config['LOGIN']['username'], password: str = config['LOGIN']['password']):
    """Sign in to the school portal, open the Progress view and return the scraped data.

    Args:
        page: Playwright page that performs every navigation and interaction.
        username: Value typed into both username prompts. Defaults to the
            ``username`` entry of the ``LOGIN`` config section.
        password: Value typed into the password prompt. Defaults to the
            ``password`` entry of the ``LOGIN`` config section.

    Returns:
        Whatever ``final_joining(page)`` returns once the Progress link has
        been clicked.
    """
    page.goto("https://stjohnsprep.myschoolapp.com/app/student#studentmyday/progress")
    page.goto("https://stjohnsprep.myschoolapp.com/app/student#login")
    print('Navigated to login page... \n')

    # First prompt: the field labelled "Username or Email", then "Next".
    portal_username = page.get_by_label("Username or Email")
    portal_username.click()
    portal_username.fill(username)
    page.get_by_role("button", name="Next").click()

    # Second prompt asks for the username again and is submitted with Enter.
    # NOTE(review): presumably a separate identity-provider page -- confirm.
    provider_username = page.get_by_placeholder("username@stjohnsprep.org")
    provider_username.click()
    provider_username.fill(username)
    provider_username.press("Enter")

    password_box = page.get_by_placeholder("Password")
    password_box.click()
    password_box.fill(password)
    password_box.press("Enter")

    # NOTE(review): the "No" button presumably dismisses a "stay signed in?"
    # style dialog -- confirm against the live site.
    page.get_by_role("button", name="No").click()
    page.get_by_role("link", name="Progress").click()
    print('LOGGED IN! \n')
    return final_joining(page)
# Scrape the class number
def get_class_number(page: Page):
    """Collect the class-number text shown for each row on the page.

    Waits for ``#coursesContainer``, then visits every ``.row`` element. For
    rows that contain at least one ``.col-md-3`` element, the ``h5`` text of
    the first such column is read and everything from the first ``' | '``
    onward is discarded.

    Args:
        page: Playwright page already showing the course list.

    Returns:
        The collected strings with the first two entries dropped.
        NOTE(review): presumably the first two matches are not real courses --
        confirm against the page markup.
    """
    print('Scraping class numbers...')
    page.wait_for_selector('#coursesContainer')

    numbers = []
    for row in page.query_selector_all('.row'):
        columns = row.query_selector_all('.col-md-3')
        if not columns:
            continue
        heading_text = columns[0].query_selector('h5').inner_text()
        numbers.append(heading_text.split(' | ')[0])
    return numbers[2:]
# Scrape the instructor names
def get_instructor_names(page: Page):
    """Collect the instructor name shown for each row on the page.

    Waits for ``#coursesContainer``, then reads the text of the
    ``.group-owner-name`` element of every ``.row`` that has one.

    Args:
        page: Playwright page already showing the course list.

    Returns:
        The collected names with the first two entries dropped.
        NOTE(review): presumably the first two matches are not real courses --
        confirm against the page markup.
    """
    print('Scraping instructor names...')
    page.wait_for_selector('#coursesContainer')

    # Lazy generator: each row is queried right before its text is read.
    owners = (
        row.query_selector('.group-owner-name')
        for row in page.query_selector_all('.row')
    )
    names = [owner.inner_text() for owner in owners if owner]
    return names[2:]
# Join the course names, instructor names, and class numbers
def join_elements(page: "Page", course_names: dict, instructor_names: list, class_numbers: list):
    """Attach an instructor and a class number to every course name.

    Each key of ``course_names`` has the form ``'temp<N>'``; the integer after
    the four-character prefix is used as the index into both
    ``instructor_names`` and ``class_numbers``.

    Args:
        page: Unused; kept so existing callers keep working.
        course_names: Mapping of ``'temp<N>'`` keys to course-name strings.
        instructor_names: Instructor names, indexed by ``N``.
        class_numbers: Class numbers, indexed by ``N``.

    Returns:
        A dict with the same keys whose values are
        ``[course_name, instructor_name, class_number]`` lists.

    Raises:
        ValueError: If the text after the first four characters of a key is
            not an integer.
        IndexError: If ``N`` is out of range for either list.
    """
    # Debug output of the raw inputs, kept exactly as emitted before.
    print(course_names)
    print(instructor_names)
    print(class_numbers)
    print('---')

    joined_elements = {}
    for key, course_name in course_names.items():
        # Parse the numeric suffix once and reuse it for both lookups.
        index = int(key[4:])
        joined_elements[key] = [course_name, instructor_names[index], class_numbers[index]]

    print(joined_elements)
    return joined_elements
# Scrape the unedited course names
def course_names(page: Page):
    """Scrape the raw course titles and split them with ``delete_values``.

    Waits for ``#coursesContainer``, then for every ``.row`` element reads the
    text of the ``h3`` inside its first ``a`` element. Rows lacking either
    element are skipped.

    Args:
        page: Playwright page already showing the course list.

    Returns:
        The ``(filtered, removed)`` pair produced by ``delete_values`` for the
        scraped titles.
    """
    print('Scraping course names...')
    page.wait_for_selector('#coursesContainer')

    titles = []
    for row in page.query_selector_all('.row'):
        link = row.query_selector('a')
        if not link:
            continue
        heading = link.query_selector('h3')
        if heading:
            titles.append(heading.inner_text())
    return delete_values(titles)
# The final course names that are fully edited
def final_course_names(page: Page):
    """Build a key-sorted mapping from each course's bracketed tag to its title.

    Uses the first element of ``course_names(page)`` (the entries kept by the
    filter). For every kept name, the key is the text between the first ``(``
    and the first ``)``, and the value is a one-element list holding the part
    of the name before the first ``" - "``.

    Args:
        page: Playwright page already showing the course list.

    Returns:
        A dict ordered by key; a later name with the same key overwrites an
        earlier one.
    """
    kept_names = course_names(page)[0]

    by_tag = {}
    for full_name in kept_names.values():
        open_paren = full_name.find('(')
        close_paren = full_name.find(')')
        tag = full_name[open_paren + 1:close_paren]
        title = full_name.split(" - ")[0]
        by_tag[tag] = [title]

    # Keys are unique, so ordering by key matches ordering the items.
    return {tag: by_tag[tag] for tag in sorted(by_tag)}
# Final joining of values
def final_joining(page: Page):
    """Combine titles, instructors and class numbers into one mapping.

    Scrapes the three lists, joins them with ``join_elements``, then appends
    each joined instructor and class number to the matching entry of
    ``final_course_names(page)``. Entries are matched on the text between the
    first ``(`` and the first ``)`` of the course name.

    Args:
        page: Playwright page already showing the course list.

    Returns:
        The dict from ``final_course_names`` with each matched value extended
        to ``[title, instructor, class_number]``.
    """
    print('Finalizing values...')

    # Scrape in the same order as before: names, instructors, class numbers.
    kept_names = course_names(page)[0]
    instructors = get_instructor_names(page)
    numbers = get_class_number(page)
    joined = join_elements(page, kept_names, instructors, numbers)

    courses_final = final_course_names(page)
    for full_name, instructor, number in joined.values():
        tag = full_name[full_name.find('(') + 1:full_name.find(')')]
        if tag not in courses_final:
            continue
        courses_final[tag].extend((instructor, number))
    return courses_final
# Delete values that are not classes (ambiguous values)
def delete_values(group: list):
    """Split scraped names into class entries and everything else.

    Duplicates are removed first via ``delete_duplicates``. A name counts as a
    class when the text between its first ``(`` and its first ``)`` is exactly
    one alphabetic character; every other name is treated as a non-class.

    Args:
        group: Scraped name strings, possibly containing duplicates.

    Returns:
        A ``(filtered, removed)`` tuple of dicts. Both are keyed by
        ``'temp<N>'``, where ``N`` is the name's position in the de-duplicated
        list, so the two dicts never share a key.
    """
    filtered = {}
    removed = {}
    for position, value in enumerate(delete_duplicates(group)):
        start_index = value.find('(')
        end_index = value.find(')')
        if start_index != -1 and end_index != -1:
            tag = value[start_index + 1:end_index]
        else:
            # No complete bracket pair: the name cannot be a class entry.
            tag = ''
        is_class = len(tag) == 1 and tag.isalpha()
        target = filtered if is_class else removed
        target['temp' + str(position)] = value
    return filtered, removed
# Delete duplicate values
def delete_duplicates(group: list):
    """Return the items of ``group`` without repeats, keeping first-seen order.

    Args:
        group: Iterable of items to de-duplicate.

    Returns:
        A new list holding the first occurrence of every distinct item.
    """
    # Materialise once so the fallback below can re-iterate safely.
    items = list(group)
    try:
        # dict keys keep insertion order, giving an O(n) ordered de-dup.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists): fall back to an equality scan.
        new_group = []
        for value in items:
            if value not in new_group:
                new_group.append(value)
        return new_group
def caller():
    """Launch a visible Chromium browser, run the login-and-scrape flow, return its result.

    Returns:
        The value returned by ``login(page)``.
    """
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()
            return login(page)
        finally:
            # Close explicitly so the browser shuts down even when the scrape
            # raises part-way through.
            browser.close()