from math import exp, log, sqrt

# implementation taken from a Kaggle script:
# https://www.kaggle.com/sudalairajkumar/outbrain-click-prediction/ftrl-starter-with-leakage-vars/code


def hash_element(el, D):
    # note: Python 3 randomizes str hashes per process (PYTHONHASHSEED), so
    # hashed indices are not stable across runs unless the seed is fixed
    h = hash(el) % D
    # defensive check kept from the original script; in Python, `%` with a
    # positive D already returns a non-negative result
    if h < 0:
        h = h + D
    return h


def hash_elements(elements, D):
    return [hash_element(el, D) for el in elements]
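
# For example, hash_elements(['ad=1', 'platform=web'], 2 ** 20) maps each raw
# feature string to a bucket index in [0, D) -- the "hashing trick", which
# avoids storing an explicit feature-to-index dictionary (the feature strings
# here are made up for illustration).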


class FtrlProximal(object):
    ''' Our main algorithm: Follow The Regularized Leader - Proximal

    In short, this is sparse logistic regression with adaptive
    per-coordinate learning rates and efficient L1 and L2 regularization.

    Reference:
    http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf
    '''
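
    # The lazy weights computed in predict_hashed below have the closed form
    # given in the reference paper:
    #   w_i = 0                                          if |z_i| <= L1
    #   w_i = -(z_i - sign(z_i) * L1)
    #         / ((beta + sqrt(n_i)) / alpha + L2)        otherwise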

    def __init__(self, alpha, beta, L1, L2, D, interactions):
        # learning-rate and regularization parameters
        self.alpha = alpha
        self.beta = beta
        self.L1 = L1
        self.L2 = L2

        # feature-related parameters
        self.D = D
        self.interactions = interactions

        # model state; index 0 is reserved for the bias term, hence D + 1
        # n: sum of squared past gradients (per coordinate)
        # z: per-coordinate state from which the weights are derived
        # w: lazy weights, rebuilt at prediction time
        self.n = [0.0] * (D + 1)
        self.z = [0.0] * (D + 1)
        self.w = {}

    def to_indices(self, x):
        res = hash_elements(x, self.D)

        # optionally add hashed pairwise feature interactions
        if self.interactions:
            sorted_x = sorted(x)
            len_x = len(sorted_x)
            for i in range(len_x):
                for j in range(i + 1, len_x):
                    h = hash_element(sorted_x[i] + '_' + sorted_x[j], self.D)
                    res.append(h)

        return res

    def predict(self, x):
        x_hashed = self.to_indices(x)
        return self.predict_hashed(x_hashed)

    def predict_hashed(self, x):
        ''' Get a probability estimate for x

        INPUT:
            x: a list of hashed feature indices

        OUTPUT:
            probability p(y = 1 | x; w)
        '''
        # parameters
        alpha = self.alpha
        beta = self.beta
        L1 = self.L1
        L2 = self.L2

        # model
        n = self.n
        z = self.z
        w = {}

        # wTx is the inner product of w and x; since every active feature
        # has value 1, it is simply the sum of the active weights
        wTx = 0.

        # index 0 is the bias term, so feature indices are shifted by 1
        indices = [0]
        for i in x:
            indices.append(i + 1)

        for i in indices:
            sign = -1. if z[i] < 0 else 1.  # get the sign of z[i]

            # build w on the fly using z and n, hence the name - lazy weights;
            # doing this at prediction rather than update time means the
            # complete w never has to be stored
            if sign * z[i] <= L1:
                # w[i] vanishes due to L1 regularization
                w[i] = 0.0
            else:
                # apply prediction-time L1 and L2 regularization to z
                # to get w
                w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2)

            wTx += w[i]

        # cache the current w for the update stage
        self.w = w

        # bounded sigmoid function, this is the probability estimate
        return 1.0 / (1.0 + exp(-max(min(wTx, 35.0), -35.0)))
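
    # The per-coordinate learning rate implied by the update below is
    #   eta_i = alpha / (beta + sqrt(n_i)),
    # so sigma = 1/eta_t - 1/eta_{t-1} = (sqrt(n_i + g^2) - sqrt(n_i)) / alpha.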

    def update(self, x, p, y):
        ''' Update the model using x, p, y

        Note: this relies on self.w cached by a preceding predict_hashed
        call on the same x (see fit).

        INPUT:
            x: a list of hashed feature indices
            p: the probability predicted by the model for x
            y: the true label (0 or 1)

        MODIFIES:
            self.n: increased by the squared gradient
            self.z: per-coordinate state updated
        '''
        # parameter
        alpha = self.alpha

        # model
        n = self.n
        z = self.z
        w = self.w

        # gradient of logloss with respect to the raw score
        g = p - y

        # index 0 is the bias term, so feature indices are shifted by 1
        indices = [0]
        for i in x:
            indices.append(i + 1)

        # update z and n
        for i in indices:
            sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha
            z[i] += g - sigma * w[i]
            n[i] += g * g

    def fit(self, x, y):
        # one online pass over a single example: predict, then update
        x_hashed = self.to_indices(x)
        p = self.predict_hashed(x_hashed)
        self.update(x_hashed, p, y)
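

# A minimal usage sketch (assumption: the hyperparameter values and toy
# feature strings below are illustrative, not part of the original script):
if __name__ == '__main__':
    model = FtrlProximal(alpha=0.1, beta=1.0, L1=1.0, L2=1.0,
                         D=2 ** 20, interactions=True)

    # each training example is a list of raw categorical feature strings
    # plus a binary label
    train = [
        (['ad=1', 'platform=web'], 1),
        (['ad=2', 'platform=mobile'], 0),
    ]

    # online learning: one predict/update pass per example
    for features, label in train:
        model.fit(features, label)

    # note: Python's per-process str-hash randomization means the exact
    # value printed here can differ between runs
    print(model.predict(['ad=1', 'platform=web']))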