# Implementation of Deep Deterministic Policy Gradient (DDPG) with TensorFlow
# Author: Steven Spielberg Pon Kumar (github.com/stevenpjg)
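"""Train a DDPG agent on the custom ControlSystem environment.

Each episode the agent interacts with the plant for a fixed number of
steps, stores transitions in a replay buffer, and trains the actor and
critic networks. Per-episode actions, plant outputs and rewards are
logged to timestamped folders under ./logs.
"""
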
import os
from datetime import datetime

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Fall back to the TkAgg backend if the default backend is unavailable
    import matplotlib
    matplotlib.use('TkAgg')
    import matplotlib.pyplot as plt
import numpy as np
from gym.spaces import Box

from ddpg import DDPG
from ou_noise import OUNoise
from system import ControlSystem

# Specify parameters here:
episodes = 100000
is_batch_norm = False  # batch normalization switch


def main():
    enable_actuator_dynamics = True
    env = ControlSystem(enable_actuator_dynamics=enable_actuator_dynamics)
    steps = env.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize the critic, actor, target critic and target actor
    # networks and the replay buffer
    agent = DDPG(env, is_batch_norm)
    # agent.load_model()
    exploration_noise = OUNoise(env.action_space.shape[0])
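    # Note: OUNoise implements an Ornstein-Uhlenbeck process, which yields
    # temporally correlated exploration noise well suited to continuous
    # control (the exploration strategy used in the original DDPG paper).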
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # Saving rewards:
    reward_st = np.array([0])

    # Use one timestamp for the whole run so every log folder shares it;
    # calling datetime.now() separately for each folder could straddle a
    # second boundary and scatter the logs across different directories.
    run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'action')
    if enable_actuator_dynamics:
        filtered_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'filtered_action')
    y_hat_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'y_hat')
    y_ref_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'y_ref')
    gen_function_log_dir = os.path.join(os.getcwd(), 'logs', run_stamp, 'function')

    os.makedirs(log_dir)
    if enable_actuator_dynamics:
        os.makedirs(filtered_log_dir)
    os.makedirs(y_hat_log_dir)
    os.makedirs(y_ref_log_dir)
    os.makedirs(gen_function_log_dir)
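
    # Main training loop: one episode per iteration, each episode running
    # for at most `steps` environment steps.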
    for i in range(episodes):
        print("==== Starting episode no:", i, "====")
        observation = env.reset()
        reward_per_episode = 0
        actions_per_episode = []
        if enable_actuator_dynamics:
            filtered_action_per_episode = []
        for t in range(steps):
            # Rendering the environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            # Select the action according to the current policy plus exploration noise
            action = action[0] + noise
            actions_per_episode.append(action)
            # if i % 100 == 0:
            #     print("Action at step", t, ":", action, "\n")
            # Clip the action to the valid actuator range [0, 1]
            # (np.clip keeps the array type consistent instead of swapping in a list)
            action = np.clip(action, 0.0, 1.0)
# print("Step", t, 'action', action)
if enable_actuator_dynamics == False:
observation,reward,Y_plot,t_plot,y_ref,random_function=env.step(action,t)
elif enable_actuator_dynamics == True:
observation,reward,filtered_action,Y_plot,t_plot,y_ref,random_function=env.step(action,t)
filtered_action_per_episode.append(filtered_action)
# print ("Reward at step", t ," :",reward,"\n")
#add y_t,y_t-1,action,reward,timestep to experience memory
agent.add_experience(x,observation,action,reward,t)
#train critic and actor network
if counter > 64:
agent.train()
reward_per_episode+=reward
counter+=1
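
            # At the last step of the episode: log the traces, reset the
            # exploration noise and periodically checkpoint the model.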
            # Check whether the episode ends:
            if t == steps - 1:
                print('EPISODE:', i, 'Steps:', t, 'Total Reward:', reward_per_episode)
                # Reinitialize the random noise for action exploration
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                # plt.step(t_plot, Y_plot)
                # plt.grid()
                # plt.xlabel('t')
                # plt.ylabel('y')
                # plt.show()
                # Save the per-episode traces, one zero-padded file per episode
                np.savetxt(os.path.join(log_dir, str(i).zfill(7) + '.txt'), actions_per_episode)
                if enable_actuator_dynamics:
                    np.savetxt(os.path.join(filtered_log_dir, str(i).zfill(7) + '.txt'),
                               filtered_action_per_episode)
                np.savetxt(os.path.join(y_hat_log_dir, str(i).zfill(7) + '.txt'), Y_plot)
                np.savetxt(os.path.join(y_ref_log_dir, str(i).zfill(7) + '.txt'), y_ref)
                # np.savetxt(os.path.join(gen_function_log_dir, str(i).zfill(7) + '.txt'), random_function)
                # Save the model every 100 episodes
                if i % 100 == 0:
                    print('save')
                    agent.save_model()
                break
        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / episodes))

if __name__ == '__main__':
main()
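
# A minimal sketch for inspecting training progress afterwards, assuming this
# script has already written `episode_reward.txt` to the working directory:
#
#     import numpy as np
#     import matplotlib.pyplot as plt
#
#     rewards = np.loadtxt('episode_reward.txt')
#     plt.plot(rewards)
#     plt.xlabel('episode')
#     plt.ylabel('total reward')
#     plt.show()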