/
fully_connected.py
246 lines (208 loc) · 9.33 KB
/
fully_connected.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# Copyright (c) 2021, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root
# or https://opensource.org/licenses/BSD-3-Clause
#
"""
The Fully Connected Network class
"""
import numpy as np
import torch
import torch.nn.functional as func
from gym.spaces import Box, Dict, Discrete, MultiDiscrete
from torch import nn
from warp_drive.utils.constants import Constants
from warp_drive.utils.data_feed import DataFeed
from warp_drive.training.utils.data_loader import get_flattened_obs_size
# Name fragments taken from the project-wide Constants. In this file they are
# used to build the names under which observation tensors are fetched from the
# CUDA data manager (_OBSERVATIONS, _PROCESSED_OBSERVATIONS) and to recognise
# the action-mask entry of a Dict observation space (_ACTION_MASK).
_OBSERVATIONS = Constants.OBSERVATIONS
_PROCESSED_OBSERVATIONS = Constants.PROCESSED_OBSERVATIONS
_ACTION_MASK = Constants.ACTION_MASK
# Penalty added to the logits of masked-out (invalid) actions.
_LARGE_NEG_NUM = -1e20


def apply_logit_mask(logits, mask=None):
    """
    Mask values of 1 are valid actions.
    Add huge negative values to logits with 0 mask values.

    Args:
        logits: tensor of unnormalized action scores.
        mask: optional tensor broadcastable against ``logits``; 1 (or True)
            marks a valid action and 0 (or False) an invalid one.
            If None, ``logits`` is returned unchanged.

    Returns:
        ``logits`` plus ``large_negative * (1 - mask)``.
    """
    if mask is None:
        return logits

    if isinstance(mask, torch.Tensor) and mask.dtype == torch.bool:
        # `1 - mask` is not supported for boolean tensors; use a numeric mask.
        mask = mask.to(logits.dtype)

    large_neg = _LARGE_NEG_NUM
    if logits.is_floating_point():
        # In low-precision dtypes (e.g. float16) -1e20 overflows to -inf and
        # -inf * 0 is NaN, which would corrupt the logits of *valid* actions.
        # Clamp the penalty to the most negative finite value of the dtype.
        large_neg = max(large_neg, torch.finfo(logits.dtype).min)

    logit_mask = torch.ones_like(logits) * large_neg
    logit_mask = logit_mask * (1 - mask)
    return logits + logit_mask
# Policy networks
# ---------------
class FullyConnected(nn.Module):
    """
    Fully connected network implementation in Pytorch

    The network is a stack of Linear+ReLU layers followed by one linear policy
    head per action dimension and a single linear value-function head.
    """

    name = "torch_fully_connected"

    def __init__(
        self,
        env,
        model_config,
        policy,
        policy_tag_to_agent_id_map,
        create_separate_placeholders_for_each_policy=False,
        obs_dim_corresponding_to_num_agents="first",
    ):
        """
        Build the layers and read the maximum batch size from the data manager.

        Args:
            env: environment wrapper; this class reads
                ``env.env.observation_space``, ``env.env.action_space``,
                ``env.n_agents`` and ``env.cuda_data_manager``.
            model_config: mapping with a ``"fc_dims"`` entry, a list holding
                the output size of each fully connected layer.
            policy: policy tag; used as a key into
                ``policy_tag_to_agent_id_map`` and in data-manager names.
            policy_tag_to_agent_id_map: mapping from policy tag to the
                sequence of agent ids using that policy.
            create_separate_placeholders_for_each_policy: if True,
                observations are fetched from per-policy placeholders and are
                not indexed by agent id in ``forward``.
            obs_dim_corresponding_to_num_agents: ``"first"`` or ``"last"``.

        Raises:
            NotImplementedError: if the sample agent's action space is neither
                Discrete nor MultiDiscrete.
        """
        super().__init__()

        self.env = env
        fc_dims = model_config["fc_dims"]
        assert isinstance(fc_dims, list)
        num_fc_layers = len(fc_dims)

        self.policy = policy
        self.policy_tag_to_agent_id_map = policy_tag_to_agent_id_map
        self.create_separate_placeholders_for_each_policy = (
            create_separate_placeholders_for_each_policy
        )
        assert obs_dim_corresponding_to_num_agents in ["first", "last"]
        self.obs_dim_corresponding_to_num_agents = obs_dim_corresponding_to_num_agents

        # The first agent mapped to this policy provides the spaces.
        sample_agent_id = self.policy_tag_to_agent_id_map[self.policy][0]

        # Flatten obs space
        self.observation_space = self.env.env.observation_space[sample_agent_id]
        self.flattened_obs_size = self.get_flattened_obs_size(self.observation_space)

        sample_action_space = self.env.env.action_space[sample_agent_id]
        if isinstance(sample_action_space, Discrete):
            action_space = [sample_action_space.n]
        elif isinstance(sample_action_space, MultiDiscrete):
            action_space = sample_action_space.nvec
        else:
            raise NotImplementedError(
                "Action space must be of Discrete or MultiDiscrete type"
            )

        # Layer i maps input_dims[i] -> output_dims[i].
        input_dims = [self.flattened_obs_size] + fc_dims[:-1]
        output_dims = fc_dims

        self.fc = nn.ModuleDict()
        for fc_layer in range(num_fc_layers):
            self.fc[str(fc_layer)] = nn.Sequential(
                nn.Linear(input_dims[fc_layer], output_dims[fc_layer]),
                nn.ReLU(),
            )

        # policy network (list of heads), one head per action dimension
        policy_heads = []
        self.output_dims = []  # Network output dimension(s)
        for act_space in action_space:
            self.output_dims.append(act_space)
            policy_heads.append(nn.Linear(fc_dims[-1], act_space))
        self.policy_head = nn.ModuleList(policy_heads)

        # value-function network head
        self.vf_head = nn.Linear(fc_dims[-1], 1)

        # used for action masking
        self.action_mask = None

        # max batch size allowed
        name = f"{_PROCESSED_OBSERVATIONS}_batch_{self.policy}"
        self.batch_size = self.env.cuda_data_manager.get_shape(name=name)[0]

    def get_flattened_obs_size(self, observation_space):
        """Get the total size of the observations after flattening"""
        return get_flattened_obs_size(observation_space)

    def reshape_and_flatten_obs(self, obs):
        """
        Reshape ``obs`` to ``(num_envs, num_agents, -1)``.

        Note: WarpDrive assumes that all the observation are shaped
        (num_agents, *feature_dim), i.e., the observation dimension
        corresponding to 'num_agents' is the first one. If the observation
        dimension corresponding to num_agents is last, we will need to
        permute the axes to align with WarpDrive's assumption.

        Args:
            obs: tensor whose leading dimension is taken as the number of
                environments.

        Returns:
            The tensor reshaped to ``(num_envs, num_agents, -1)``.

        Raises:
            ValueError: if ``obs_dim_corresponding_to_num_agents`` is neither
                ``"first"`` nor ``"last"``.
        """
        num_envs = obs.shape[0]
        if self.create_separate_placeholders_for_each_policy:
            num_agents = len(self.policy_tag_to_agent_id_map[self.policy])
        else:
            num_agents = self.env.n_agents

        if self.obs_dim_corresponding_to_num_agents == "first":
            pass
        elif self.obs_dim_corresponding_to_num_agents == "last":
            shape_len = len(obs.shape)
            if shape_len == 1:
                obs = obs.reshape(-1, num_agents)  # valid only when num_agents = 1
            # Move the trailing (agents) axis to position 1.
            obs = obs.permute(0, -1, *range(1, shape_len - 1))
        else:
            raise ValueError(
                "num_agents can only be the first "
                "or the last dimension in the observations."
            )
        return obs.reshape(num_envs, num_agents, -1)

    def get_flattened_obs(self):
        """
        If the obs is of Box type, it will already be flattened.
        If the obs is of Dict type, then concatenate all the
        obs values and flatten them out.
        Returns the concatenated and flattened obs.

        For Dict observation spaces, the entry whose key equals the
        action-mask constant is not part of the returned observation; it is
        reshaped and stored in ``self.action_mask`` instead.

        Raises:
            NotImplementedError: if the observation space is neither Box
                nor Dict.
        """
        data_manager = self.env.cuda_data_manager

        if isinstance(self.observation_space, Box):
            if self.create_separate_placeholders_for_each_policy:
                obs = data_manager.data_on_device_via_torch(
                    f"{_OBSERVATIONS}_{self.policy}"
                )
            else:
                obs = data_manager.data_on_device_via_torch(_OBSERVATIONS)
            flattened_obs = self.reshape_and_flatten_obs(obs)
        elif isinstance(self.observation_space, Dict):
            flattened_pieces = []
            for key in self.observation_space:
                if self.create_separate_placeholders_for_each_policy:
                    obs = data_manager.data_on_device_via_torch(
                        f"{_OBSERVATIONS}_{self.policy}_{key}"
                    )
                else:
                    obs = data_manager.data_on_device_via_torch(
                        f"{_OBSERVATIONS}_{key}"
                    )

                if key == _ACTION_MASK:
                    # The mask covers all policy heads, laid out back to back.
                    self.action_mask = self.reshape_and_flatten_obs(obs)
                    assert self.action_mask.shape[-1] == sum(self.output_dims)
                else:
                    flattened_pieces.append(self.reshape_and_flatten_obs(obs))
            flattened_obs = torch.cat(flattened_pieces, dim=-1)
        else:
            raise NotImplementedError("Observation space must be of Box or Dict type")

        assert flattened_obs.shape[-1] == self.flattened_obs_size, (
            f"The flattened observation size of {flattened_obs.shape[-1]} is different "
            f"from the designated size of {self.flattened_obs_size} "
        )
        return flattened_obs

    def forward(self, obs=None, batch_index=None):
        """
        Forward pass through the model.
        Returns action probabilities and value functions.

        Args:
            obs: optional, already processed (flattened) observations. If
                None, observations are read via ``get_flattened_obs``.
            batch_index: only used when ``obs`` is None. If it is a
                non-negative integer, the processed observations are written
                to that index of the processed-observations batch; it must be
                smaller than ``self.batch_size``. A negative value or None
                skips the write.

        Returns:
            A tuple ``(action_probs, vals)``: ``action_probs`` is a list with
            one softmax output per policy head, ``vals`` is the value head
            output with its trailing dimension of size 1 removed.
        """
        if obs is None:
            # batch_index defaults to None; comparing None with an int raises
            # a TypeError, so only validate when an index was provided.
            if batch_index is not None:
                assert (
                    batch_index < self.batch_size
                ), f"batch_index: {batch_index}, self.batch_size: {self.batch_size}"

            # Read in observation from the placeholders and flatten them
            # before passing through the fully connected layers.
            # This is particularly relevant if the observations space is a Dict.
            obs = self.get_flattened_obs()

            if self.create_separate_placeholders_for_each_policy:
                ip = obs
            else:
                agent_ids_for_policy = self.policy_tag_to_agent_id_map[self.policy]
                ip = obs[:, agent_ids_for_policy]

            # Push the processed (in this case, flattened) obs to the GPU (device).
            # The writing happens to a specific batch index in the processed obs batch.
            # The processed obs batch is required for training.
            if batch_index is not None and batch_index >= 0:
                self.push_processed_obs_to_batch(batch_index, ip)
        else:
            ip = obs

        # Feed through the FC layers
        for layer in range(len(self.fc)):
            op = self.fc[str(layer)](ip)
            ip = op
        logits = op

        # Compute the action probabilities and the value function estimate
        # Apply action mask to the logits as well.
        action_masks = [None for _ in range(len(self.output_dims))]
        if self.action_mask is not None:
            # Slice the concatenated mask into one segment per policy head.
            start = 0
            for idx, dim in enumerate(self.output_dims):
                action_masks[idx] = self.action_mask[..., start : start + dim]
                start = start + dim

        action_probs = [
            func.softmax(apply_logit_mask(ph(logits), action_masks[idx]), dim=-1)
            for idx, ph in enumerate(self.policy_head)
        ]
        vals = self.vf_head(logits)[..., 0]

        return action_probs, vals

    def push_processed_obs_to_batch(self, batch_index, processed_obs):
        """
        Write ``processed_obs`` into slot ``batch_index`` of the tensor that
        the data manager returns for this policy's processed-observations batch.
        """
        name = f"{_PROCESSED_OBSERVATIONS}_batch_{self.policy}"
        self.env.cuda_data_manager.data_on_device_via_torch(name=name)[
            batch_index
        ] = processed_obs