-
Notifications
You must be signed in to change notification settings - Fork 18
/
parsers.py
90 lines (71 loc) · 2.73 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import collections
import string
from typing import Dict, List

from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
def data_from_row(row: WebElement, cell_tag="td") -> List[str]:
    """Extract the text of every cell in a table row.

    Args:
        row (WebElement): The row element.
        cell_tag (str, optional): The HTML tag associated with the row cells.
            Defaults to "td".

    Returns:
        list: The ``text`` of each element found under ``row`` with tag
            ``cell_tag``, in the order ``find_elements`` returns them.
    """
    # The find_elements_by_* helpers were removed in Selenium 4.3; the
    # generic find_elements(by, value) form works on both Selenium 3 and 4.
    return [cell.text for cell in row.find_elements(By.TAG_NAME, cell_tag)]
def sanitize_header(labels: List[str]) -> List[str]:
    """Sanitize header labels so they can be used as dictionary keys.

    Each label is lowercased, stripped of every character in
    ``string.punctuation`` (this includes ``_``), and has its spaces
    replaced with underscores. Labels with no usable content (empty,
    whitespace-only, or made up solely of punctuation) become
    ``col_<index>``. Labels that end up repeated are suffixed with
    ``_1``, ``_2``, ... in order of appearance.

    Args:
        labels (List[str]): Raw header labels. The list is updated in place.

    Returns:
        list: The same list object, now holding the sanitized labels.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    for idx, label in enumerate(labels):
        cleaned = label.lower().translate(strip_punctuation)
        if cleaned.strip():
            labels[idx] = cleaned.replace(" ", "_")
        else:
            # Nothing usable is left (e.g. "" or "#"): fall back to a
            # positional name instead of emitting an empty key.
            labels[idx] = f"col_{idx}"

    # Deduplicate by adding _1, _2, _3 to repeated labels. Totals are taken
    # before any renaming so only labels that collide get a suffix.
    totals = collections.Counter(labels)
    seen = collections.Counter()
    for idx, label in enumerate(labels):
        if totals[label] > 1:
            seen[label] += 1
            labels[idx] = f"{label}_{seen[label]}"
    return labels
def table_to_dict(table: WebElement, has_header: bool = True,
                  skip_rows: int = 0, header_tag: str = "th") -> List[Dict]:
    """Convert a table WebElement to a list of dicts, one per data row.

    Args:
        table (WebElement): The table element.
        has_header (bool, optional): Whether or not to parse a header.
            Defaults to True.
        skip_rows (int, optional): Number of rows to skip from the top.
            Defaults to 0.
        header_tag (str, optional): The HTML tag associated with the header
            cell. Defaults to "th".

    Returns:
        list: List with a dict for each row, mapping column label to cell
            text. Empty when no rows remain after skipping.
    """
    # Collect all rows from the table. find_elements(by, value) is used
    # because the find_elements_by_* helpers were removed in Selenium 4.3.
    rows = table.find_elements(By.TAG_NAME, "tr")

    # Skip rows if informed
    if skip_rows:
        rows = rows[skip_rows:]

    # Nothing left to parse: avoid indexing rows[0] on an empty list.
    if not rows:
        return []

    if has_header:
        # First remaining row is the header: read and sanitize its labels,
        # then drop it from the data rows.
        labels = sanitize_header(data_from_row(rows[0], cell_tag=header_tag))
        rows = rows[1:]
    else:
        # Make up header labels from the cell count of the first row
        num_cols = len(rows[0].find_elements(By.TAG_NAME, "td"))
        labels = [f"col_{i}" for i in range(num_cols)]

    # zip() stops at the shorter sequence, so a row with more cells than
    # labels is truncated and a shorter row yields fewer keys.
    return [dict(zip(labels, data_from_row(row))) for row in rows]