-
Notifications
You must be signed in to change notification settings - Fork 1
/
search.py
83 lines (75 loc) · 3.12 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import tqdm
from ops import *
from tree_pre_select import *
def init_ops(ds):
    """Assemble the flat list of candidate operations for dataset ``ds``.

    The list is built in a fixed order:

    1. one ``SelectOp`` per categorical column, then one per numeric column;
    2. one instance of every class in ``terminal_ops + tsfm_op + top_ops``;
    3. for each categorical column, a ``GroupBy`` followed by a
       ``FilterBy``/``RetainBy`` pair for every integer in
       ``range(vocab_size)``;
    4. for each direction flag (``True`` then ``False``), a ``SortBy`` on
       ``'__self__'`` followed by a ``SortBy`` on every numeric column.

    Args:
        ds: Object exposing ``cat_cols``, ``num_cols`` and ``vocab_sizes``
            (``vocab_sizes`` is zipped against ``cat_cols``).

    Returns:
        list: The operation instances in the order described above.
    """
    candidates = [SelectOp(name, 'cat') for name in ds.cat_cols]
    candidates.extend(SelectOp(name, 'num') for name in ds.num_cols)
    candidates.extend(op_cls() for op_cls in terminal_ops + tsfm_op + top_ops)

    for cat_col, vocab_size in zip(ds.cat_cols, ds.vocab_sizes):
        candidates.append(GroupBy(cat_col))
        for value in range(vocab_size):
            candidates += [FilterBy(cat_col, value), RetainBy(cat_col, value)]

    for flag in (True, False):
        candidates.append(SortBy('__self__', flag))
        candidates.extend(SortBy(num_col, flag) for num_col in ds.num_cols)

    return candidates
def get_best_action(root, it, iterations, cpucore, depth):
    """Run ``iterations`` search rounds below ``root`` and return its best child.

    Each round pulls one ``(dfs, ys)`` batch from ``it``, picks a node with
    ``tree_policy``, scores it with ``default_policy`` and propagates the score
    with ``backup``.

    Args:
        root: Node the rounds start from.
        it: Iterator yielding ``(dfs, ys)`` pairs; advanced once per round.
        iterations: Number of rounds to run.
        cpucore: Forwarded to ``default_policy``.
        depth: Only used in the progress-bar label.

    Returns:
        The result of ``root.best_child(c=0)``.
    """
    rounds = tqdm.trange(iterations, desc=f'depth {depth}', leave=False)
    for _ in rounds:
        batch_dfs, batch_ys = next(it)
        picked = tree_policy(root)
        # Silence numpy divide/invalid warnings raised while scoring.
        with np.errstate(divide='ignore', invalid='ignore'):
            score = default_policy(picked, batch_dfs, batch_ys, cpucore)
        backup(picked, score)
    # NOTE(review): c=0 presumably disables the exploration term — confirm
    # against Node.best_child.
    return root.best_child(c=0)
def get_best_path(root, it, iterations, cpucore):
    """Descend from ``root`` one best action at a time until a terminal node.

    Args:
        root: Starting node; returned unchanged if it is already terminal.
        it: Batch iterator forwarded to ``get_best_action``.
        iterations: Search rounds per level, forwarded to ``get_best_action``.
        cpucore: Forwarded to ``get_best_action``.

    Returns:
        The first node reached whose ``terminal`` attribute is truthy.
    """
    current = root
    level = 1  # 1-based level counter, passed on for the progress-bar label
    while True:
        if current.terminal:
            return current
        current = get_best_action(current, it, iterations, cpucore, level)
        level += 1
def get_topk_stats_sequential(ops, max_depth, dl, iterations=5, k=3, cpucore=1):
    """Select ``k`` operation paths one after another and collect their stats.

    Every round builds a fresh root ``Node`` over ``ops``, takes the terminal
    node returned by ``get_best_path``, evaluates that node's path on each item
    of ``dl.dataset`` and appends the result as new column(s) of a design
    matrix. All columns gathered so far are then least-squares fitted against
    the original targets, and ``dl.dataset.ys`` is replaced by the fit
    residuals before the next round starts.

    Side effects: stores the original targets in ``dl.dataset._backup_ys`` and
    temporarily overwrites ``dl.dataset.ys``; the original ``ys`` is restored
    before returning, including when an exception propagates.

    Args:
        ops: Candidate operations handed to every new root ``Node``.
        max_depth: Depth limit handed to every new root ``Node``.
        dl: Loader exposing ``get_infinite_iter()`` and a ``dataset`` that
            supports ``len()``, iteration (first element of each item is fed to
            the pipeline), and the attributes ``ys`` and ``index``.
        iterations: Search rounds per tree level.
        k: Number of paths to select.
        cpucore: Forwarded to the search and to ``pipe_pre_select_parallel``.

    Returns:
        tuple: ``(results, stats)`` where ``results`` is the list of selected
        terminal nodes and ``stats`` is a ``DataFrame`` indexed by
        ``dl.dataset.index`` whose columns are named ``R{i}`` (one-dimensional
        stat) or ``R{i}D{j}`` (multi-dimensional stat). With ``k == 0`` the
        list is empty and the frame has no columns.
    """
    it = dl.get_infinite_iter()
    dataset = dl.dataset
    results = []
    stat_blocks = []
    header = []
    popped_paths = set()

    # Zero-column matrix so that k == 0 still produces a well-formed frame
    # instead of failing on an unbound name after the loop.
    A = np.empty((len(dataset), 0))

    dataset._backup_ys = dataset.ys
    targets = dataset._backup_ys
    try:
        for i in tqdm.trange(k, desc='Stats'):
            root = Node(candidates=ops, max_depth=max_depth, popped_paths=popped_paths)
            node = get_best_path(root, it, iterations, cpucore)
            results.append(node)
            popped_paths.add(repr_path(*node.path))

            # Compute the stats of the selected path on every dataset item.
            current_stats = pipe_pre_select_parallel(
                node.path, (item[0] for item in dataset), cpucore
            )
            current_stats = np.asarray(
                [s.values if isinstance(s, pd.Series) else s for s in current_stats]
            )
            # One row per dataset item; NaNs are replaced before fitting.
            current_stats = np.nan_to_num(current_stats).reshape(len(dataset), -1)

            # Column names of the generated stats table.
            dim = current_stats.shape[1]
            if dim == 1:
                header.append(f'R{i}')
            else:
                header.extend(f'R{i}D{j}' for j in range(dim))
            stat_blocks.append(current_stats)

            # Refit all stats so far on the original targets; the residuals
            # become the targets for the next search round.
            A = np.concatenate(stat_blocks, 1)
            beta, *_ = np.linalg.lstsq(A.astype(float), targets, rcond=None)
            dataset.ys = A @ beta - targets

            # Mark the parent and further ancestors as popped while they have
            # no valid operations left.
            # NOTE(review): the None guard assumes the root's parent is None;
            # without it an exhausted root would raise AttributeError here.
            parent = node.parent
            while parent is not None:
                parent_valid_ops = get_valid_ops_pre_select(
                    parent.path, parent.candidates, parent.max_depth, popped_paths
                )
                if len(parent_valid_ops) != 0:
                    break
                popped_paths.add(repr_path(*parent.path))
                parent = parent.parent
    finally:
        # Always restore the original targets, even if a round failed.
        dataset.ys = dataset._backup_ys

    stats = pd.DataFrame(A, columns=header, index=dataset.index)
    return results, stats