"""DQN agent that learns Super Mario Bros from a small window of tiles in
front of Mario. Each state is a stack of 4 consecutive 8x4 tile windows."""
import time
import random
from collections import deque

import numpy as np
import tensorflow as tf
import gym
import ppaquette_gym_super_mario  # noqa: F401 -- registers the Mario environments
import model

# The environment's full tile grid is 13 (height) x 16 (width). We stack
# stack_size = 4 frames of the 8x4 window in front of Mario into one state.
def stack_frames(stacked_frames, frame):
    # Append the frame to the deque; the oldest frame is dropped automatically
    stacked_frames.append(frame)
    # Build the stacked state (the last dimension indexes the frames)
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames
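# Illustrative usage (hypothetical values): stacking four 8x4 windows yields an
# (8, 4, 4) state tensor with the newest frame last along axis 2, e.g.
#   frames = deque([np.zeros((8, 4), dtype=int)] * 4, maxlen=4)
#   state, frames = stack_frames(frames, np.ones((8, 4), dtype=int))
#   assert state.shape == (8, 4, 4)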
class MarioPositionNotFound(Exception):
    pass

def get_mario_position(state):
    for i in range(state.shape[0]):
        for j in range(state.shape[1]):
            if state[i, j] == 3:
                # Sometimes Mario occupies two points of the state matrix while
                # he is moving; in that case return the point on the right.
                if j + 1 < state.shape[1] and state[i, j + 1] == 3:
                    return i, j + 1
                return i, j
    raise MarioPositionNotFound
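# Illustrative example (hypothetical values): in the Tiles observation the
# value 3 marks Mario's cell, so
#   state = np.zeros((13, 16)); state[10, 2] = 3
#   get_mario_position(state)  # -> (10, 2)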
class AgentDQN:
    def __init__(self, possible_actions, lr, tot_episodes, max_steps, batch_size, explore_start,
                 explore_stop, decay_rate, gamma, memory_size, env, stack_size):
        self.action_size = len(possible_actions)
        self.learning_rate = lr
        self.total_episodes = tot_episodes
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.window_size = (8, 4)  # height 8 = 5 rows above Mario, 1 at his level, 2 below
        # Exploration parameters for the epsilon-greedy strategy
        self.explore_start = explore_start  # exploration probability at start
        self.explore_stop = explore_stop    # minimum exploration probability
        self.decay_rate = decay_rate        # exponential decay rate for the exploration probability
        self.gamma = gamma                  # discount rate
        self.pretrain_length = self.batch_size
        self.memory_size = memory_size
        self.possible_actions = possible_actions
        # One-hot encoding of the actions, used as network targets
        self.actions_for_nn = np.eye(self.action_size)
        self.env = env
        self.state_size = (*self.window_size, stack_size)
        self.memory = model.Memory(max_size=self.memory_size)
        self.DQNetwork = model.DQNetwork(self.state_size, self.action_size, self.learning_rate)
        self.sess = tf.Session()
        self.saver = tf.train.Saver()
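        # With the default (8, 4) window and stack_size = 4, the network input
        # shape self.state_size is (8, 4, 4).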
    def get_state_window(self, state, window_size=None):
        '''
        Return the window of tiles in front of Mario.
        :param state: the full tile matrix from the environment
        :param window_size: (height, width) of the window; defaults to self.window_size
        :return: the window in front of Mario (all zeros if Mario is not found)
        '''
        if window_size is None:
            window_size = self.window_size
        try:
            y, x = get_mario_position(state)
        except MarioPositionNotFound:
            return np.zeros(window_size)
        # The following condition block handles the situations where Mario is so
        # high or so low that the window would reach past the border of the state.
        if y - (window_size[0] - 2) >= 0 and y <= 10:  # the normal region
            front_window = state[y - (window_size[0] - 2) + 1:y + 3, x + 1:x + window_size[1] + 1]
        elif y - (window_size[0] - 2) < 0:  # too high: pad zero rows on top of the state
            pad = window_size[0] - 3
            zero_rows = np.zeros((pad, state.shape[1]), dtype=state.dtype)
            state = np.concatenate((zero_rows, state), axis=0)
            front_window = state[y - (window_size[0] - 2) + 1 + pad:y + 3 + pad, x + 1:x + window_size[1] + 1]
        else:  # y > 10: too low, pad zero rows below the state
            zero_rows = np.zeros((2, state.shape[1]), dtype=state.dtype)
            state = np.concatenate((state, zero_rows), axis=0)
            front_window = state[y - (window_size[0] - 2) + 1:y + 3, x + 1:x + window_size[1] + 1]
        return front_window
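    # Window geometry for the default (8, 4) window: rows y-5 .. y+2 (5 rows
    # above Mario, his own row, 2 below) and columns x+1 .. x+4 (the 4 tiles
    # immediately to his right), where (y, x) is Mario's position.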
    def init_memory(self):
        # Fill the replay memory with random-play experiences before training
        print("Initialize memory !")
        for i in range(self.pretrain_length):
            # If it's the first step
            if i == 0:
                frame = self.get_state_window(self.env.reset())
                stacked_frames = deque([np.zeros_like(frame, dtype=int) for _ in range(self.state_size[-1])],
                                       maxlen=self.state_size[-1])
                # Fill the whole stack with copies of the first frame
                for _ in range(self.state_size[-1]):
                    stacked_frames.append(frame)
                # Stack the frames
                state = np.stack(stacked_frames, axis=2)
            # Get the next_state, the reward and done by taking a random action
            choice = random.randrange(len(self.possible_actions))
            action = self.possible_actions[choice]
            ac_for_nn = self.actions_for_nn[choice]
            next_frame, reward, done, info = self.env.step(action)
            # Stack the frames
            next_frame = self.get_state_window(next_frame)
            next_state, stacked_frames = stack_frames(stacked_frames, next_frame)
            # If the episode is finished (we're dead 3x)
            if done:
                # The episode ends, so there is no real next state
                next_state = np.zeros(self.state_size)
                # Add the experience to memory
                self.memory.add((state, ac_for_nn, reward, next_state, done))
                # Start a new episode
                frame = self.get_state_window(self.env.reset())
                stacked_frames = deque([np.zeros_like(frame, dtype=int) for _ in range(self.state_size[-1])],
                                       maxlen=self.state_size[-1])
                for _ in range(self.state_size[-1]):
                    stacked_frames.append(frame)
                state = np.stack(stacked_frames, axis=2)
            else:
                # Add the experience to memory
                self.memory.add((state, ac_for_nn, reward, next_state, done))
                # Our new state is now the next_state
                state = next_state
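    # After init_memory the replay buffer holds pretrain_length (= batch_size)
    # random-play transitions, enough to sample the first training mini-batch.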
"""
This function will do the part
With ϵϵ select a random action atat, otherwise select at=argmaxaQ(st,a)
"""
def predict_action(self, decay_step, state):
# EPSILON GREEDY STRATEGY
# Choose action a from state s using epsilon greedy.
# First we randomize a number
exp_exp_tradeoff = np.random.rand()
# Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
explore_probability = self.explore_stop + (self.explore_start - self.explore_stop) * np.exp(
-self.decay_rate * decay_step)
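        # With the defaults from main() (explore_start=1.0, explore_stop=0.01,
        # decay_rate=1e-5), epsilon is ~0.99 after 1,000 steps, ~0.37 after
        # 100,000 steps, and ~0.01 after roughly 700,000 steps.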
        if explore_probability > exp_exp_tradeoff:
            # Take a random action (exploration)
            choice = random.randrange(len(self.possible_actions))
            action = self.possible_actions[choice]
        else:
            # Get the action from the Q-network (exploitation):
            # estimate the Q values of the current state
            Qs = self.sess.run(self.DQNetwork.output,
                               feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))})
            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]
        return self.actions_for_nn[choice], action, explore_probability
    def train(self):
        print("Begin Training ..")
        rewards_list = []
        # Initialize the variables
        self.sess.run(tf.global_variables_initializer())
        # self.saver.restore(self.sess, "./model.ckpt")
        # Initialize the decay step (used to reduce epsilon) and the loss, so
        # the episode summary can be printed even before the first update
        decay_step = 0
        loss = 0.0
        for episode in range(self.total_episodes):
            init_time = time.time()
            # Set step to 0
            step = 0
            # Initialize the rewards of the episode
            episode_rewards = []
            # Start a new episode and observe the first state
            frame = self.get_state_window(self.env.reset())
            stacked_frames = deque([np.zeros_like(frame, dtype=int) for _ in range(self.state_size[-1])],
                                   maxlen=self.state_size[-1])
            for _ in range(self.state_size[-1]):
                stacked_frames.append(frame)
            # Stack the frames
            state = np.stack(stacked_frames, axis=2)
            while step < self.max_steps:
                step += 1
                # Increase decay_step
                decay_step += 1
                # Predict the action to take and take it
                ac_for_nn, action, explore_probability = self.predict_action(decay_step, state)
                # Perform the action and get the next_state, reward and done flag
                next_frame, reward, done, info = self.env.step(action)
                next_frame = self.get_state_window(next_frame)
                if action == [0, 0, 0, 1, 1, 0]:  # reward shaping: bonus if the action is jump
                    reward += 4
                # Add the reward to the episode total
                episode_rewards.append(reward)
                # If the game is finished
                if done:
                    # The episode ends, so there is no next state
                    next_frame = np.zeros((self.window_size[0], self.window_size[1]), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_frame)
                    # Set step = max_steps to end the episode
                    step = self.max_steps
                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)
                    finish_time = time.time()
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Explore P: {:.4f}'.format(explore_probability),
                          'Training Loss {:.4f}'.format(loss),
                          'Duration {:.4f}'.format(finish_time - init_time))
                    rewards_list.append((episode, total_reward))
                    # Store the transition <s_t, a_t, r_{t+1}, s_{t+1}> in memory D
                    self.memory.add((state, ac_for_nn, reward, next_state, done))
                else:
                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_frame)
                    # Add the experience to memory
                    self.memory.add((state, ac_for_nn, reward, next_state, done))
                    # s_{t+1} is now our current state
                    state = next_state
                # LEARNING PART
                # Sample a random mini-batch from memory
                batch = self.memory.sample(self.batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])
                target_Qs_batch = []
                # Get the Q values of the next states
                Qs_next_state = self.sess.run(self.DQNetwork.output,
                                              feed_dict={self.DQNetwork.inputs_: next_states_mb})
                # Q_target = r if the episode ends at s', otherwise
                # Q_target = r + gamma * max_a' Q(s', a')
                for i in range(len(batch)):
                    terminal = dones_mb[i]
                    if terminal:
                        # In a terminal state the target is just the reward
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        target = rewards_mb[i] + self.gamma * np.max(Qs_next_state[i])
                        target_Qs_batch.append(target)
                targets_mb = np.array(target_Qs_batch)
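                # For example, with gamma = 0.9, a reward of 1 and
                # max_a' Q(s', a') = 2, the target is 1 + 0.9 * 2 = 2.8.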
                loss, _ = self.sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                        feed_dict={self.DQNetwork.inputs_: states_mb,
                                                   self.DQNetwork.target_Q: targets_mb,
                                                   self.DQNetwork.actions_: actions_mb})
            # Save the model every 5 episodes
            if episode % 5 == 0:
                save_path = self.saver.save(self.sess, "./model.ckpt")
                print("Model Saved")
def main():
    level = "1-1"
    env = gym.make('ppaquette/SuperMarioBros-' + level + '-Tiles-v0')
    # Each action is a 6-bit button combination, presumably
    # [up, left, down, right, A, B] in ppaquette's controller format
    possible_actions = [[0, 0, 0, 0, 0, 0],  # no-op
                        [0, 0, 0, 1, 0, 1],  # right + B (run)
                        [0, 0, 0, 1, 1, 0],  # right + A (jump)
                        [0, 0, 0, 1, 0, 0],  # right
                        [0, 0, 0, 0, 1, 0]]  # A (jump)
    learning_rate = 0.00025  # Alpha (aka learning rate)
    stack_size = 4  # Number of consecutive frames stacked into one state
    # TRAINING HYPERPARAMETERS
    total_episodes = 100  # Total episodes for training
    max_steps = 3000  # Max possible steps in an episode
    batch_size = 64  # Mini-batch size
    # Exploration parameters for the epsilon-greedy strategy
    explore_start = 1.0  # Exploration probability at start
    explore_stop = 0.01  # Minimum exploration probability
    decay_rate = 0.00001  # Exponential decay rate for the exploration probability
    # Q-LEARNING HYPERPARAMETERS
    gamma = 0.9  # Discount rate
    # MEMORY HYPERPARAMETERS
    memory_size = 1000000  # Number of experiences the memory can keep
    mario_agent = AgentDQN(possible_actions, learning_rate, total_episodes, max_steps, batch_size, explore_start,
                           explore_stop, decay_rate, gamma, memory_size, env, stack_size)
    mario_agent.init_memory()
    mario_agent.train()

if __name__ == '__main__':
    main()