Agent.py
import numpy as np
import tensorflow as tf

# Only enable memory growth when a GPU is actually present; on a
# CPU-only machine list_physical_devices returns an empty list and
# indexing gpus[0] would raise an IndexError.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], True)

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
class Agent(object):
    def __init__(self, ALPHA, GAMMA=0.99, n_actions=5,
                 layer1_size=16, layer2_size=16, layer3_size=16,
                 input_dims=9, fname='model.h5'):
        self.GAMMA = GAMMA
        self.lr = ALPHA
        self.G = 0
        self.input_dims = input_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.fc3_dims = layer3_size
        self.n_actions = n_actions
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.policy, self.predict = self.build_policy_network()
        self.action_space = [i for i in range(n_actions)]
        self.model_file = fname
    # This function defines the policy network used for training and a
    # separate prediction-only model that shares the same weights.
    def build_policy_network(self):
        inputs = Input(shape=(self.input_dims,))
        advantages = Input(shape=[1])
        dense1 = Dense(self.fc1_dims, activation='relu')(inputs)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        dense3 = Dense(self.fc3_dims, activation='relu')(dense2)
        probs = Dense(self.n_actions, activation='softmax')(dense3)

        # REINFORCE loss: -sum(log pi(a|s) * G_t), where y_true is the
        # one-hot encoding of the chosen action and `advantages` carries
        # the discounted, normalized returns. Clipping keeps the
        # probabilities away from 0 so K.log stays finite.
        def custom_loss(y_true, y_predict):
            out = K.clip(y_predict, 1e-8, 1 - 1e-8)
            log_lik = y_true * K.log(out)
            return K.sum(-log_lik * advantages)

        policy = Model(inputs=[inputs, advantages], outputs=[probs])
        policy.compile(optimizer=Adam(learning_rate=self.lr), loss=custom_loss)

        predict = Model(inputs=[inputs], outputs=[probs])
        return policy, predict
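
    # Note: `policy` and `predict` are built from the same layer objects,
    # so every train_on_batch call on `policy` also updates the weights
    # that `predict` uses for action selection; no manual weight copy is
    # needed between training and inference.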
    # Choose an action for the given observation by sampling from the
    # probability distribution predicted by the policy network.
    def choose_action(self, observation):
        state = observation[np.newaxis, :]
        probabilities = self.predict.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)
        return action
    # Helper that records one (state, action, reward) transition from the
    # current episode; learn() consumes and then clears this trajectory.
    def store_transition(self, observation, action, reward):
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)
    # Main driver: computes discounted returns for the stored episode,
    # normalizes them, and performs one policy-gradient update.
    def learn(self):
        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        # One-hot encode the actions taken during the episode.
        actions = np.zeros([len(action_memory), self.n_actions])
        actions[np.arange(len(action_memory)), action_memory] = 1

        # Calculate the discounted return G_t for every timestep. A float
        # dtype is forced explicitly: plain zeros_like would inherit an
        # integer dtype from integer rewards and silently truncate G.
        G = np.zeros_like(reward_memory, dtype=np.float64)
        for t in range(len(reward_memory)):
            G_sum = 0
            discount = 1
            for k in range(t, len(reward_memory)):
                G_sum += reward_memory[k] * discount
                discount *= self.GAMMA
            G[t] = G_sum

        # Normalize the returns to reduce the variance of the gradient.
        mean = np.mean(G)
        std = np.std(G) if np.std(G) > 0 else 1
        self.G = (G - mean) / std

        # One gradient step over the whole episode.
        cost = self.policy.train_on_batch([state_memory, self.G], actions)

        # Reset the episode memory.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        return cost
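
    # Optional note (not in the original file): the nested loop in learn()
    # is O(n^2) in the episode length. An equivalent O(n) backward
    # recursion over the rewards would be:
    #
    #     G = np.zeros_like(reward_memory, dtype=np.float64)
    #     running = 0.0
    #     for t in reversed(range(len(reward_memory))):
    #         running = reward_memory[t] + self.GAMMA * running
    #         G[t] = running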
    # Helpers to save and load the policy network. Because the policy was
    # compiled with a closure-based custom loss that Keras cannot
    # deserialize by name, it is reloaded with compile=False; re-compile
    # (or rebuild via build_policy_network) before further training.
    def save_model(self):
        self.policy.save(self.model_file)

    def load_model(self):
        self.policy = load_model(self.model_file, compile=False)
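

# Usage sketch (not part of the original file): drives one dummy episode
# with random 9-dimensional observations and random rewards so the class
# can be smoke-tested without an environment. In practice the observations
# and rewards would come from a Gym-style environment loop instead, and
# this assumes a TensorFlow version that still supports the closure-based
# custom-loss pattern used above.
if __name__ == '__main__':
    agent = Agent(ALPHA=0.0005, GAMMA=0.99, n_actions=5, input_dims=9)

    for _ in range(20):  # one short fake episode
        observation = np.random.rand(9).astype(np.float32)
        action = agent.choose_action(observation)
        reward = np.random.rand()  # placeholder reward signal
        agent.store_transition(observation, action, reward)

    cost = agent.learn()  # one policy-gradient update over the episode
    print('training cost:', cost)
    agent.save_model()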