GPOMDP_hopper_mj.py
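# GPOMDP policy-gradient training with rllab + Theano. Note that, despite the
# file name, the environment instantiated below is HalfCheetahEnv, not Hopper.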
from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.envs.normalized_env import normalize
import numpy as np
import theano
import theano.tensor as TT
from rllab.sampler import parallel_sampler
from lasagne.updates import sgd
from lasagne.updates import adam
from rllab.misc import ext
from rllab.envs.gym_env import GymEnv
import pandas as pd
np.set_printoptions(threshold=np.inf)
import matplotlib.pyplot as plt

load_policy = True
# normalize() makes sure that the actions for the environment lie
# within the range [-1, 1] (only works for environments with continuous actions)
env = normalize(HalfCheetahEnv())
# Initialize a neural network policy with two hidden layers of 32 units each
policy = GaussianMLPPolicy(env.spec, hidden_sizes=(32,32))
parallel_sampler.populate_task(env, policy)
# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution
# We will collect N = 10 trajectories per iteration
N = 10
# Each trajectory will have at most T = 500 time steps
T = 500
# Number of iterations
n_itr = 1000
# Set the discount factor for the problem
discount = 0.99
# Learning rate for the gradient update
learning_rate = 0.001
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of observations
    extra_dims=1
)
actions_var = env.action_space.new_tensor_variable(
    'actions',
    extra_dims=1
)
d_rewards_var = TT.vector('d_rewards')
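# Placeholder for one trajectory's per-step discounted rewards (computed below in the training loop)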
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
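# GPOMDP-style surrogate for a single trajectory. log_likelihood_sym_1traj_GPOMDP
# is a custom helper (not part of stock rllab) that weights each step's discounted
# reward by the log-likelihood of the actions taken up to that step, so
# differentiating surr yields the (negated) GPOMDP gradient estimate.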
surr = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)
params = policy.get_params(trainable=True)
grad = theano.grad(surr, params)
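# Placeholders mirroring the policy's seven trainable parameter tensors:
# presumably W/b for the two hidden layers and the output mean layer, plus
# the action log-std vector of the Gaussian MLP policy.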
eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.matrix('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
eval_grad5 = TT.matrix('eval_grad5', dtype=grad[4].dtype)
eval_grad6 = TT.vector('eval_grad6', dtype=grad[5].dtype)
eval_grad7 = TT.vector('eval_grad7', dtype=grad[6].dtype)
f_train = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var],
    outputs=grad
)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5, eval_grad6, eval_grad7],
    outputs=None,
    updates=adam([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5, eval_grad6, eval_grad7],
                 params, learning_rate=learning_rate)
)
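# lasagne's adam() accepts a list of gradient expressions in place of a loss,
# so f_update applies one Adam step using gradients computed externally by f_train.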
alla = {}
rewards_snapshot_data = {}
all_policy_param_data = {}
parallel_sampler.initialize(10)
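# Run 10 independent training trials; with load_policy=True each trial k is
# warm-started from the saved parameter file 'phc<k+1>.txt'.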
for k in range(10):
    if load_policy:
        # policy.set_param_values(np.loadtxt('half_policy.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('phc' + str(k + 1) + '.txt'), trainable=True)
    avg_return = np.zeros(n_itr)
    rewards_snapshot = []
    all_policy_param = []
    # np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        if j % 10 == 0:
            all_policy_param.append(policy.get_param_values())
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(), N, T, show_bar=False)
        paths = paths[:N]
        # baseline.fit(paths)
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
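        # Discount each trajectory's rewards: r_t -> discount^t * r_t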
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp
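        # Average the per-trajectory gradient estimates over the N sampled paths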
        s_g = f_train(observations[0], actions[0], d_rewards[0])
        for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
            s_g = [sum(x) for x in zip(s_g, f_train(ob, ac, rw))]
        s_g = [x / len(paths) for x in s_g]
        f_update(s_g[0], s_g[1], s_g[2], s_g[3], s_g[4], s_g[5], s_g[6])
        avg_return[j] = np.mean([sum(p["rewards"]) for p in paths])
        rewards_snapshot.append(np.array([sum(p["rewards"]) for p in paths]))
        print(str(j) + ' Average Return:', avg_return[j])
    rewards_snapshot_data["rewardsSnapshot" + str(k)] = rewards_snapshot
    all_policy_param_data["policyParams" + str(k)] = all_policy_param
    # plt.plot(avg_return[::10])
    # plt.show()
    alla["avgReturn" + str(k)] = avg_return
alla = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in alla.items()]))
rewards_snapshot_data = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in rewards_snapshot_data.items()]))
all_policy_param_data = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in all_policy_param_data.items()]))
rewards_snapshot_data.to_csv("rewards_snapshot_GPOMDP_hc.csv", index=False)
all_policy_param_data.to_csv("param_policy_hc_sgd.csv", index=False)
alla.to_csv("GPOMDP_SVRG_adaptive_GPOMDP_hc.csv", index=False)
#
# obs = env.reset()
# env.render()
# done = False
# i = 0
# while not done and i < 100:
#     i += 1
#     act = policy.get_action(obs)
#     obs, rw, done, info = env.step(act[0])
#     env.render()