evaluation_script.py
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import argparse
import os
import time
from os.path import expanduser

import gym
import numpy as np
from keras.models import model_from_json

import envs.pacman_envs  # imported for its side effect: registers the custom envs with gym
from modules.dqn_agent import DQNAgent
from modules.policy import GreedyEpsilonPolicy
from modules.preprocessors import HistoryPreprocessor
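
# Evaluation entry point: loads one trained Q-network per predator from disk
# and rolls out greedy episodes in a multi-agent Pacman-style Gym environment,
# reporting reward and Q-value statistics.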

def calc_q_values(model, state, num_actions, expand_dims=False):
    """Return the flattened Q-values for `state` under `model`."""
    if expand_dims:
        # Promote a single state to a batch of size one.
        state = np.expand_dims(state, axis=0)
    # An all-ones mask asks the network for the Q-values of every action.
    action_mask = np.ones([1, num_actions])
    q_values = model.predict_on_batch([state, action_mask])
    return q_values.flatten()
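
# Example use (illustrative values): a single unbatched state must be
# expanded to a batch of one before prediction.
#   q = calc_q_values(model, state, num_actions=4, expand_dims=True)
#   best_action = int(np.argmax(q))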

def run_random_policy(env, pred_model, args):
    """Evaluate the loaded predator models on the given environment.

    Despite the name, this runs a greedy (epsilon = 0) policy over the
    trained Q-networks and tracks reward and Q-value statistics until
    the terminal state is reached or the episode length cap is hit.

    Parameters
    ----------
    env: gym.Env
        Instance of an OpenAI Gym environment.
    pred_model: dict
        Maps each predator index to its loaded Keras model.
    args: argparse.Namespace
        Parsed command-line arguments.

    Returns
    -------
    (float, float, float, float, float)
        Average discounted reward per episode, average max Q-value,
        average steps per episode, best episode reward, and the
        standard deviation of episode rewards.
    """
    gamma = 0.99
    preprocessor = HistoryPreprocessor((args.size, args.size), args.network_name,
                                       args.number_pred, args.coop, args.history)
    env.reset()
    env.render()
    total_reward = 0.0
    average_q_values = [0.0] * args.number_pred
    rewards = []
    # Evaluation always uses a greedy policy.
    greedy_policy = GreedyEpsilonPolicy(0.0)
    total_steps = 0
    max_episode_length = 1000
    for i in range(args.num_episodes):
        reward = 0.0
        df = 1.0
        env.random_start = False
        s = env.reset()
        preprocessor.reset()
        preprocessor.add_state(s)
        steps = 0
        max_q_val_sum = [0.0] * args.number_pred
        is_terminal = False
        while not is_terminal and steps < max_episode_length:
            S = preprocessor.get_state()
            steps += 1
            total_steps += 1
            A = {}
            # The environment takes one joint action string, one digit
            # per predator.
            action_string = ""
            for j in range(args.number_pred):
                model = pred_model[j]
                q_values = calc_q_values(model, S[j], args.num_actions)
                A[j] = greedy_policy.select_action(q_values)
                action_string += str(A[j])
                max_q_val_sum[j] += np.max(q_values)
            s_prime, R, is_terminal, debug_info = env.step(action_string)
            env.render()
            R = preprocessor.process_reward(R)
            # The reward is the same for each predator because the game
            # is cooperative, so the first entry suffices.
            reward += R[0] * df
            preprocessor.add_state(s_prime)
            df *= gamma
        total_reward += reward
        rewards.append(reward)
        # Use j here so the episode index i is not shadowed.
        for j in range(args.number_pred):
            average_q_values[j] += max_q_val_sum[j] / steps
    avg_q = np.sum(np.array(average_q_values)) / (args.num_episodes *
                                                  args.number_pred)
    avg_reward = total_reward / args.num_episodes
    avg_steps = total_steps / args.num_episodes
    return avg_reward, avg_q, avg_steps, np.max(rewards), np.std(rewards)

def print_env_info(env):
    # Unused helper: summarizes a discrete environment's state/action spaces.
    print('Environment has %d states and %d actions.' % (env.nS, env.nA))

def main():
    parser = argparse.ArgumentParser(description='Run DQN on Pacman!')
    parser.add_argument('--algorithm', default='replay_target', help='One of basic, replay_target, double.')
    # store_true avoids the argparse pitfall where type=bool treats any
    # non-empty string (including "False") as True.
    parser.add_argument('--compet', action='store_true', help='Compete instead of cooperate.')
    parser.add_argument('--debug_mode', action='store_true', help='Whether or not to save states as images.')
    parser.add_argument('--env', default='PacmanEnvSmartPrey-v0', help='Env name.')
    parser.add_argument('--num_episodes', default=25, type=int, help='Number of episodes to evaluate on.')
    parser.add_argument('--gamma', default=0.99, type=float, help='Discount factor in (0, 1).')
    parser.add_argument('--history', default=1, type=int, help='Number of frames that make up a state.')
    parser.add_argument('--max_episode_length', default=500, type=int,
                        help='Max episode length (for training, not eval).')
    parser.add_argument('--network_name', default='deep',
                        help='Model name: deep, stanford, linear, dueling, dueling_av, or dueling_max.')
    parser.add_argument('--weight_path', default='~/weights/', type=str,
                        help='Directory the training run saved weights to.')
    parser.add_argument('--v', default='def', type=str, help='Experiment name, used for loading weights.')
    parser.add_argument('--iter', default=0, type=int, help='Training iteration of the weights to load.')
    args = parser.parse_args()
    args.coop = not args.compet
    # Create the environment and derive sizes from it.
    env = gym.make(args.env)
    args.number_pred = env.num_agents // 2
    args.size = env.grid_size
    if 'Pacman' in args.env:
        args.num_actions = 4
    elif 'Warehouse' in args.env:
        args.num_actions = 6
    else:
        args.num_actions = env.action_space.n
    pred_model = {}
    args.weight_path = expanduser(args.weight_path)
    mypath = os.path.join(args.weight_path, args.v)
    if not os.path.isdir(mypath):
        os.makedirs(mypath)
    # Load one serialized model (architecture + weights) per predator.
    for i in range(args.number_pred):
        with open(os.path.join(mypath, 'model' + str(i) + '.json'), 'r') as json_file:
            loaded_model_json = json_file.read()
        loaded_model = model_from_json(loaded_model_json)
        # Load the weights saved at the requested training iteration.
        loaded_model.load_weights(os.path.join(mypath, str(args.iter) + '_' + str(i) + '.hd5'))
        pred_model[i] = loaded_model
        print('Loaded model %d from disk' % i)
    # Repeat the evaluation; each call averages over args.num_episodes
    # episodes and returns five summary statistics.
    for i in range(10):
        avg_reward, avg_q, avg_steps, max_reward, reward_std = run_random_policy(
            env, pred_model, args)
        print('Agent received average total reward of: %f' % avg_reward)
        print('Agent took %.1f steps on average' % avg_steps)


if __name__ == '__main__':
    main()
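
# Illustrative invocation (flag values below are just the defaults):
#   python evaluation_script.py --env PacmanEnvSmartPrey-v0 --network_name deep \
#       --weight_path ~/weights/ --v def --iter 0 --num_episodes 25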