# e_greedy.py
import numpy as np
import scipy.sparse
import scipy.stats

from .base import ValueFunction
class EGreedy(ValueFunction):
    """ε-Greedy value function.

    Scores each candidate item by its empirical mean reward observed so far.
    In general, ε-Greedy models the problem based on an ε diversification
    parameter to perform random actions [1]_.

    NOTE(review): this class only maintains the greedy estimates; the
    ε-random exploration itself is presumably applied by the surrounding
    agent/policy — confirm against the caller.

    References
    ----------
    .. [1] Auer, P., Cesa-Bianchi, N. & Fischer, P. Finite-time Analysis of the
        Multiarmed Bandit Problem. Machine Learning 47, 235–256 (2002).
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to :class:`ValueFunction`.

        Args:
            args: positional arguments for the base class.
            kwargs: keyword arguments for the base class.
        """
        super().__init__(*args, **kwargs)

    def reset(self, observation):
        """Initialize per-item reward statistics from a training dataset.

        Args:
            observation: training dataset whose ``data`` array has columns
                (user id, item id, reward) and which exposes
                ``num_total_users`` and ``num_total_items``.
        """
        train_dataset = observation
        super().reset(train_dataset)
        self.train_dataset = train_dataset
        # Sparse users x items matrix assembled from the (uid, iid, reward) triples.
        self.train_consumption_matrix = scipy.sparse.csr_matrix(
            (self.train_dataset.data[:, 2],
             (self.train_dataset.data[:, 0], self.train_dataset.data[:, 1])),
            (self.train_dataset.num_total_users,
             self.train_dataset.num_total_items))
        self.num_total_items = self.train_consumption_matrix.shape[1]
        # Running mean reward and observation count per item.
        self.items_mean_values = np.zeros(self.num_total_items)
        self.items_count = np.zeros(self.num_total_items, dtype=int)
        # Replay every training interaction through update() to warm-start
        # the per-item means before any online interaction happens.
        for row in self.train_dataset.data:
            self.update(None, (int(row[0]), int(row[1])), row[2], None)

    def actions_estimate(self, candidate_actions):
        """Score candidate items by their empirical mean reward.

        Args:
            candidate_actions: tuple ``(user id, candidate_items)``; the user
                id is not used — estimates are global per item.

        Returns:
            tuple: ``(numpy.ndarray of item scores, None)``.
        """
        _, candidate_items = candidate_actions
        items_score = self.items_mean_values[candidate_items]
        return items_score, None

    def update(self, observation, action, reward, info):
        """Fold one observed reward into an item's running mean.

        Args:
            observation: unused.
            action: tuple ``(user id, item)``; only the item is used.
            reward (float): observed reward for the item.
            info: unused.
        """
        item = action[1]
        count = self.items_count[item]
        # Incremental mean: new_mean = (old_mean * n + reward) / (n + 1).
        self.items_mean_values[item] = (
            self.items_mean_values[item] * count + reward) / (count + 1)
        self.items_count[item] = count + 1