/
reformat_drawtimes.py
48 lines (34 loc) · 1.52 KB
/
reformat_drawtimes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import json
import pandas as pd
import urllib.request as request
def reformat_drawtimes(in_name: str, out_name: str, n_cols=6):
    """Convert a one-cell-per-line draw-times text dump into a TSV file.

    The text in ``in_name`` is split into lines, the final line is
    discarded, and the remaining lines are grouped into consecutive rows of
    ``n_cols`` cells each. The rows are written to ``out_name`` as
    tab-separated values with a header row; the DataFrame index is written
    as the first column.

    Args:
        in_name: Path of the text file to read.
        out_name: Path of the tab-separated file to write.
        n_cols: Cells per row. 6 includes a trailing ``class_year`` column,
            5 omits it.

    Raises:
        NotImplementedError: If ``n_cols`` is neither 5 nor 6.
    """
    with open(in_name) as src:
        text = src.read()
    # The last line of the dump is always dropped before grouping.
    cells = text.splitlines()[:-1]

    if n_cols != 5 and n_cols != 6:
        raise NotImplementedError("n_cols not 5 or 6???")
    headers = ["PUID", "draw_time", "group_number", "last_name", "first_name"]
    if n_cols == 6:
        headers.append("class_year")

    # Chunk the flat list of cells into rows of n_cols cells each.
    rows = []
    for start in range(0, len(cells), n_cols):
        rows.append(cells[start:start + n_cols])

    table = pd.DataFrame(rows, columns=headers)
    table.to_csv(out_name, sep="\t", header=True)
def puid_to_netid(puid: str) -> str:
    """Look up the netid for a PUID via the edutools web service.

    Args:
        puid: The PUID to look up.

    Returns:
        The value stored under the ``"netid"`` key of the service's JSON
        response.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If the JSON response has no ``"netid"`` key.
    """
    # Local import keeps this block self-contained; the file only binds
    # urllib.request (as ``request``) at module level.
    from urllib.parse import urlencode

    # URL-encode the value so unexpected characters cannot corrupt the query.
    query = urlencode({"puid": puid})
    url = f"https://edutools.cs.princeton.edu/integration/puid.cgi?{query}"
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        payload = json.loads(response.read())
    return payload["netid"]
def _add_netid_column(df: pd.DataFrame):
    """Add a ``netid`` column to ``df`` in place.

    Each value in the ``PUID`` column is passed to ``puid_to_netid`` and the
    results are stored under ``df["netid"]``. Nothing is returned.
    """
    puids = df["PUID"]
    netids = puids.apply(puid_to_netid)
    df["netid"] = netids
# Command-line entry point:
#   python reformat_drawtimes.py <in_name> <n_cols> <out_name>
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 4:
        print("Usage: python reformat_drawtimes.py <in_name> <n_cols> <out_name>")
        print("This script takes the text copy/pasted out of the draw times"
              " pdfs and converts it to a tab separated value file.")
        print("Note: you must remove the extra headers in the txt file. "
              "Assumes that each cell is on separate line.")
        # Use sys.exit: the bare exit() builtin is added by the site module
        # for interactive use and is not guaranteed to exist in every run.
        sys.exit(1)
    in_name, n_cols, out_name = sys.argv[1:]
    # n_cols arrives as a string on the command line; convert before use.
    reformat_drawtimes(in_name, out_name, n_cols=int(n_cols))