/
gw.py
230 lines (181 loc) · 9.42 KB
/
gw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# Games Workshop Store Paint Scraper
# Dry, Base, Layer, Wash, Technical, Texture, etc
# Full Imports
import json
import os
import pprint # just for testing and viewing objects in a legible format
import re
import sys
import time

# Selective Imports
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from paint_manifest import paint_manifest as COLOR_LINK_MANIFEST
'''
Selenium is used instead of traditional, headless scraping techniques due to GW's use of Endeca (e-commerce toolkit from Oracle). They inject their inventory data into the window after it has finished loading. From my experimentation, I could technically parse the document without BS4, but that would take a long time to figure out and overly complicate the code.
Selenium is used as a driver for navigating to a specific URL, and quickly pulling the source after they inject it into the browser window. A headless browser window can be used but I would need to overcomplicate the code with overrides to hide the fact that it's a bot scraping their site. Recaptcha will otherwise block access.
A viewable window gets around both of the above issues and is a lot simpler for my needs. One of those cases where testing modules are useful for gathering data.
'''
# Structure of the Inventory Object produced by the GamesWorkshopInventory class
'''
inventory = {
    '<type>': {
        '<paint_name>': {
            'colour': '<colour>',
            'price': '<price>',
            'type': '<type>',
            'size': '<size>',
        }
    }
}
'''
class GamesWorkshopInventory:
    """Scrape the Games Workshop store's paint pages into an inventory dict.

    A Selenium-driven Chrome window visits the "paint by colour" filter
    pages one after another. Each page's product records are parsed with
    BeautifulSoup and stored in ``self.inventory`` as
    ``{<type>: {<paint_name>: {'price', 'colour', 'type', 'size'}}}``.
    """

    # Splits a product name shaped like "<Type>: <Name> (<extra>)" into the
    # type (group 1) and everything up to the first "(" (group 2).
    _NAME_PATTERN = re.compile(r"(\w*):\s([^\(]*)")

    # Words stripped from scraped names so old and new naming conventions
    # ("Citadel Death Guard Spray" vs. "Chaos Black Spray") end up alike.
    _WORDS_TO_FILTER = (
        'Citadel',
        'Global',
        'Spray',
        '12ml',
        '18ml',
        '24ml',
        '400ml',
    )

    # Pot/can size per paint type. 'technical_2' is the size used for the
    # technical paints listed in _SIZE_OVERRIDES.
    _PAINT_SIZES = {
        'base': '12ml',
        'layer': '12ml',
        'dry': '12ml',
        'contrast': '18ml',
        'air': '24ml',
        'technical': '24ml',
        'technical_2': '12ml',
        'shade': '24ml',
        'spray': '400ml',
    }

    # Technical paints that do not share the usual technical size.
    # Manually checked against the GW website.
    _SIZE_OVERRIDES = (
        'Nurgles Rot',
        'Spiritstone Red',
        'Soulstone Blue',
        'Waystone Green',
        'Nihilakh Oxide',
        'Blood For The Blood God',
        'Typhus Corrosion',
    )

    def __init__(self):
        """Set up empty state; the browser is only started by initialize_driver()."""
        self.driver = None
        self.initial_url = "https://www.games-workshop.com/en-US/Black-Templar-2019?Ndn=GW_Painting_and_Modelling_Browse_Paint_by_Colour.dyn.dimensionPaintColour&Ndi=3184893862"
        self.fragment_url = "https://www.games-workshop.com/en-US/detail"
        # self.links = COLOR_LINK_MANIFEST
        self.colour_keys = []
        self.inventory = {}  # Storage

    def generate_colour_key(self):
        """Collect the colour names shown on the current page's filter buttons.

        Each name is lower-cased and appended to ``self.colour_keys``; the
        keys decide which filter button Selenium clicks next. 'black' is
        appended last so the loop finishes on the black colours page.
        """
        items = bs(self.driver.page_source, 'html.parser').find_all(
            'div', class_='hgn__filterButtonName')
        self.colour_keys.extend(item.text.strip().lower() for item in items)
        self.colour_keys.append('black')

    def navigator(self, paint_colour):
        """Click the filter button for ``paint_colour`` and parse the resulting page.

        Exits the process with status 1 when the button does not appear
        within 10 seconds.
        """
        colour_title = paint_colour.title()
        # Syntax - select the div that has the class and contains the text
        button_locator = (
            By.XPATH,
            f"//div[contains(@class, 'hgn__filterButtonName') and contains(text(), '{colour_title}')]")
        breadcrumb_locator = (
            By.XPATH,
            f"//span[contains(@class, 'ics__breadcrumb') and contains(text(), '{colour_title}')]")
        try:
            target = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(button_locator))
        except (NoSuchElementException, TimeoutException) as error:
            # WebDriverWait signals "never appeared" with TimeoutException,
            # so it has to be caught alongside NoSuchElementException.
            print(f"Exception caught in <fn navigator>: {error}")
            sys.exit(1)
        # Only reached when the button was found, so `target` is always bound.
        target.click()
        # After clicking, wait for the colour title to appear in the
        # breadcrumb before parsing the page.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(breadcrumb_locator))
        self.get_paints(paint_colour)

    def get_paints(self, paint_colour):
        """Parse every product record on the current page into ``self.inventory``.

        Records without the product data attribute, or whose JSON cannot be
        decoded, are skipped instead of aborting the whole scrape.
        """
        soup = bs(self.driver.page_source, 'html.parser')
        for record in soup.find_all('span', class_='recordItem'):
            raw_product = record.get('data-gtm-productfieldobject')
            if not raw_product:
                continue
            try:
                # This is an object produced from the json
                product = json.loads(raw_product)
            except ValueError as error:
                print(f'Skipping unparsable product record: {error}')
                continue
            self._store_paint(product, paint_colour)
        print(f'Done parsing {paint_colour.title()} colours page.')

    def _store_paint(self, product, paint_colour):
        """Add one decoded product dict to the inventory; return True if stored."""
        raw_name = str(product.get('name') or '')
        match = self._NAME_PATTERN.search(raw_name)
        if match is None:
            print(f'Skipping product with unexpected name: {raw_name!r}')
            return False
        paint_type = match.group(1)  # type -> used as the inventory key
        paint_name, paint_size = self.normalize_colors(
            paint_type, match.group(2))
        # Paints are classified by type; create the type bucket on first use.
        self.inventory.setdefault(paint_type, {})[paint_name] = {
            'price': product.get('price'),
            'colour': paint_colour,
            'type': paint_type,
            'size': paint_size,
        }
        return True

    def normalize_colors(self, paint_type, paint_name) -> tuple:
        """Return ``(normalized_name, size)`` for a scraped paint.

        The scraped data is inconsistent: over the years GW has added newer
        products with slightly different naming systems, for example
        "Citadel Death Guard Spray" vs. "Chaos Black Spray". Words such as
        "Citadel", "Global", "Spray" and the size tokens are removed from
        the name, and the size is assigned from the paint type instead.

        A few technicals are not 24ml; they are listed in
        ``_SIZE_OVERRIDES`` and get the 'technical_2' size. The override
        check uses the normalized name and ignores case and apostrophes, so
        "Nurgle's Rot " still matches the 'Nurgles Rot' entry.

        The size is ``None`` for a paint type missing from the size table.
        """
        new_name = ' '.join(
            word for word in paint_name.split()
            if word not in self._WORDS_TO_FILTER)
        type_key = paint_type.lower()
        new_size = self._PAINT_SIZES.get(type_key)
        if type_key == 'technical':
            lookup = new_name.replace("'", '').replace('\u2019', '').casefold()
            if any(lookup == name.casefold() for name in self._SIZE_OVERRIDES):
                new_size = self._PAINT_SIZES['technical_2']
        return new_name, new_size

    def navigator_bak(self, colour):
        """Backup navigator: open a colour page directly by URL.

        Switch to this method in case the captcha gets flagged.
        NOTE(review): ``self.colour_keys`` is a list, so ``colour`` must be
        an index here, not a colour name - confirm before using this path.
        """
        print(f"{colour}...")
        print(f"{self.fragment_url}{self.colour_keys[colour]}")
        self.driver.get(f"{self.fragment_url}{self.colour_keys[colour]}")

    def initialize_driver(self):
        """Start a visible Chrome window with the automation flags disabled."""
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option(
            "excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        # NOTE(review): `executable_path` is deprecated in Selenium 4 -
        # confirm against the installed Selenium version.
        self.driver = webdriver.Chrome(
            executable_path="C:\\Apps\\Geckodriver\\chromedriver.exe", options=options)

    def engage_driver(self):
        """Run the whole scrape: open the site, visit every colour, write the JSON."""
        self.initialize_driver()
        try:
            # Opens the initial page, which shows the 'black' colours.
            self.driver.get(self.initial_url)
            # Generate the colour keys used to navigate to the next page
            self.generate_colour_key()
            for colour in self.colour_keys:
                self.navigator(colour)
        finally:
            # Shut the browser down even when a page fails part-way through.
            self.disengage_driver()
        pprint.pprint(self.inventory)
        self.produce_json()

    def disengage_driver(self):
        """Quit the browser session, if one was started."""
        if self.driver is None:
            return None
        # quit() ends the whole driver session; close() only closes the window.
        self.driver.quit()
        self.driver = None
        return None

    def produce_json(self):
        """Write the inventory to ``gw_paint_inventory.json`` in the working directory."""
        with open('gw_paint_inventory.json', 'w', encoding='utf-8') as file:
            json.dump(self.inventory, file, indent=4)
# Script entry point: build the scraper and run the full scrape.
if __name__ == "__main__":
    scraper = GamesWorkshopInventory()
    scraper.engage_driver()