-
Notifications
You must be signed in to change notification settings - Fork 1
/
humanoid_model_free_ddp.py
128 lines (89 loc) · 3.57 KB
/
humanoid_model_free_ddp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
'''
copyright @ Karthikeya S Parunandi - karthikeyasharma91@gmail.com
Model free DDP method with a 6-link humanoid experiment in MuJoCo simulator.
Date: July 6, 2019
'''
#!/usr/bin/env python
import numpy as np
from model_free_DDP import DDP
import time
from mujoco_py import load_model_from_path, MjSim, MjViewer
from ltv_sys_id import ltv_sys_id_class
class model_free_humanoid_DDP(DDP, ltv_sys_id_class):
def __init__(self, initial_state, final_state, MODEL_XML, alpha, horizon, state_dimemsion, control_dimension, Q, Q_final, R):
'''
Declare the matrices associated with the cost function
'''
self.Q = Q
self.Q_final = Q_final
self.R = R
n_substeps = 5
DDP.__init__(self, MODEL_XML, state_dimemsion, control_dimension, alpha, horizon, initial_state, final_state)
ltv_sys_id_class.__init__(self, MODEL_XML, state_dimemsion, control_dimension, n_substeps, n_samples=40)
def state_output(self, state):
'''
Given a state in terms of Mujoco's MjSimState format, extract the state as a numpy array, after some preprocessing
'''
return np.concatenate([state.qpos, state.qvel]).reshape(-1, 1)
def cost(self, x, u):
'''
Incremental cost in terms of state and controls
'''
return (((x - self.X_g).T @ self.Q) @ (x - self.X_g)) + (((u.T) @ self.R) @ u)
def cost_final(self, x):
'''
Cost in terms of state at the terminal time-step
'''
return (((x - self.X_g).T @ self.Q_final) @ (x - self.X_g))
def initialize_traj(self, path=None):
'''
Initial guess for the nominal trajectory by default is produced by zero controls
'''
if path is None:
for t in range(0, self.N):
self.U_p[t] = np.random.normal(0, 0.0, (self.n_u, 1)) #np.random.normal(0, 0.01, self.n_u).reshape(self.n_u,1)#DM(array[t, 4:6])
np.copyto(self.U_p_temp, self.U_p)
self.forward_pass_sim()
np.copyto(self.X_p_temp, self.X_p)
else:
array = np.loadtxt('../rrt_path.txt')
### INCOMPLETE
if __name__=="__main__":
# Path of the model file
path_to_model_free_DDP = "/home/karthikeya/Documents/research/model_free_DDP"
MODEL_XML = path_to_model_free_DDP+"/models/humanoid.xml"
path_to_file = path_to_model_free_DDP+"/experiments/humanoid/exp_1/humanoid_policy.txt"
training_cost_data_file = path_to_model_free_DDP+"/experiments/humanoid/exp_2/training_cost_data.txt"
# Declare other parameters associated with the problem statement
horizon = 600
state_dimemsion = 47
control_dimension = 17
Q = 9*np.diag(np.concatenate([[1, 1, 1, 1], np.zeros((43,))]))
Q_final = 100*np.diag(np.concatenate([[1, 1, 1, 1], np.zeros((43,))]))
R = .1*np.diag(np.ones((control_dimension, )))
alpha = 1
# Declare the initial state and the final state in the problem
initial_state = np.zeros((47,1))
final_state = np.zeros((47,1))
initial_state[2] = 0
initial_state[6] = 1
final_state[1] = 2
final_state[2] = 1.4
final_state[3] = 1
n_iterations = 50
# Initiate the above class that contains objects specific to this problem
humanoid = model_free_humanoid_DDP(initial_state, final_state, MODEL_XML, alpha, horizon, state_dimemsion, control_dimension, Q, Q_final, R)
start_time = time.time()
# Run the DDP algorithm
humanoid.iterate_ddp(n_iterations)
print("Time taken: ", time.time() - start_time)
# Save the episodic cost
with open(training_cost_data_file, 'w') as f:
for cost in humanoid.episodic_cost_history:
f.write("%s\n" % cost)
# Test the obtained policy
humanoid.save_policy(path_to_file)
humanoid.test_episode(1, path_to_file)
print(humanoid.X_p[-1])
# Plot the episodic cost during the training
humanoid.plot_episodic_cost_history()