## AtariTrainer.py		Dana Hughes		21-Sept-2017
##
## Class to perform training and evaluation on Atari agents.

import numpy as np


class AtariTrainer:
    """
    Runs training and evaluation episodes for an agent on an Atari environment,
    notifying any registered listeners of per-step and per-episode statistics.
    """
    def __init__(self, environment, agent, counter, **kwargs):
        """
        Store the environment, agents and step counter, and configure training
        options from the provided keyword arguments.
        """

        self.environment = environment

        # Hang on to the provided agents; fall back to the training agent
        # for evaluation if no separate eval_agent is given
        self.agent = agent
        self.eval_agent = kwargs.get('eval_agent', agent)

        self.evaluate = False

        # Maximum number of no-ops that can be performed at the start of an episode
        self.noop_max = kwargs.get('noop_max', 30)
        self.action_repeat = kwargs.get('action_repeat', 4)

        self.counter = counter

        # Listeners for storing parameters, tensorboard, etc.
        self.listeners = []
    def add_listener(self, listener):
        """
        Register a listener to be notified of episode and step events.
        """

        self.listeners.append(listener)
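    # NOTE: Listeners are duck-typed; judging from the calls in learn_episode,
    # a listener is expected to provide three callbacks:
    #
    #   start_episode(info) -- called with an empty dict when an episode begins
    #   record(info)        -- called once per step with the keys 'Q',
    #                          'reward' and 'action'
    #   end_episode(info)   -- called with the key 'score' when the episode ends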
    def learn_episode(self):
        """
        Allow the agent to learn while playing the game.
        """

        # Reset the game to start a new episode, and let the agent know
        self.environment.reset_game()
        self.agent.start_episode()

        num_lives = self.environment.lives()
        score = 0

        for listener in self.listeners:
            listener.start_episode({})

        # Wait a random number of no-op frames before starting
        for i in range(np.random.randint(self.noop_max)):
            self.environment.act(0)

        while not self.environment.terminal():
            state = self.environment.get_state()

            # Have the agent observe the environment, then act
            self.agent.observe(state)
            action, Q = self.agent.act()

            # Repeat the action for action_repeat frames, accumulating the reward
            reward = 0.0
            for i in range(self.action_repeat):
                reward += self.environment.act(action)

            score += reward
            self.counter.step()

            # Cap the reward to be between -1 and 1
            reward = min(max(reward, -1.0), 1.0)

            for listener in self.listeners:
                listener.record({'Q': np.max(Q), 'reward': reward, 'action': action})

            # Treat losing a life as the end of an episode for learning purposes
            is_terminal = self.environment.terminal() or self.environment.lives() != num_lives
            num_lives = self.environment.lives()

            self.agent.learn(action, reward, is_terminal)

        for listener in self.listeners:
            listener.end_episode({'score': score})

        return score
    def play_episode(self, num_noop=0):
        """
        Allow the evaluation agent to play an episode without learning.
        """

        total_score = 0

        # Reset the game to start a new episode
        self.environment.reset_game()
        # self.environment.display()

        # Perform the requested number of no-ops before playing
        for i in range(num_noop):
            _ = self.environment.act(0)

        while not self.environment.terminal():
            state = self.environment.get_state()
            # self.environment.display()

            # Have the evaluation agent observe the environment, then act
            self.eval_agent.observe(state)
            action, Q = self.eval_agent.act()

            # Repeat the action, accumulating each frame's reward into the score
            for i in range(self.action_repeat):
                total_score += self.environment.act(action)

        return total_score