"""
!!! Uses a modified MineRL library: do not change or update the MineRL library !!!
"""
# Imports
import gym
import minerl
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict
import torchvision.transforms as transforms
# GPU acceleration: prefer an Nvidia GPU (CUDA), then Apple Silicon (MPS), otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print('Device: {}'.format(device))
# Logging
# import logging
# logging.basicConfig(level=logging.DEBUG)
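# Policy network: a small CNN over the resized 'pov' image with two output heads,
# one producing per-action probabilities for the 22 discrete buttons and one
# producing two floats that parameterise the camera (pitch/yaw) distribution.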
class CNN(nn.Module):
def __init__(self, input_shape, num_actions):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(input_shape[2], 16, kernel_size=3, stride=1, padding=1)
self.relu1 = nn.ReLU()
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
self.relu2 = nn.ReLU()
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
self.relu3 = nn.ReLU()
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv4 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        # 56320 = 64 * 22 * 40: flattened conv-feature size, assuming the 360x640 pov frame is halved to 180x320 by the resize below
        self.fc1_float = nn.Linear(56320, 32)
        self.relu3_float = nn.ReLU()
        self.fc2_float = nn.Linear(32, 2)
        self.fc1 = nn.Linear(56320, 32)
        self.relu_fc = nn.ReLU()
        self.fc2 = nn.Linear(32, num_actions)
def forward(self, x):
x = self.conv1(x)
x = self.relu1(x)
x = self.pool1(x)
x = self.conv2(x)
x = self.relu2(x)
x = self.pool2(x)
x = self.conv3(x)
x = self.relu3(x)
x = self.pool3(x)
x = self.conv4(x)
x = x.view(x.size(0), -1)
        x_float = self.fc1_float(x)
        x_float = self.relu3_float(x_float)
        logits_float = self.fc2_float(x_float)
        x = self.fc1(x)
        x = self.relu_fc(x)
        logits = self.fc2(x)
        # Per-action probabilities over the 22 discrete buttons; used below to build a Bernoulli distribution
        action_probs = torch.softmax(logits, dim=-1)
        return action_probs, logits_float
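# Normalize an observation tensor to zero mean and unit variance before it is fed to the CNN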
def normalize(x):
    mean = torch.mean(x)
    x -= mean
    std = torch.std(x)
    x /= (std + 1e-8)  # Small epsilon guards against division by zero on a uniform frame
    return x
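# Environment, policy network, and optimizer setup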
env = gym.make("MineRLObtainDiamondShovel-v0")
# env = gym.make('MineRLBasaltFindCave-v0')
input_shape = env.observation_space['pov'].shape
num_actions = 22
cnn = CNN(input_shape, num_actions).to(device)
optimizer = optim.Adam(cnn.parameters(), lr=0.00005) # Set up optimizer
# env.seed(21)
state = env.reset()
done = False
num_episodes = 50 # Number of episodes to run
gamma = 0.99 # Discount factor for cumulative rewards
max_episode_steps = 200
progress_bar = tqdm(total=num_episodes, desc="Episodes", unit="episode", ncols=80) # Set up progress bar for episodes
cumulativeRewards = [] # Store cumulative rewards per episode
losses = [] # Store loss per episode
episode_lengths = [] # Store number of actions/steps per episode
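# Training loop: roll out each episode with the current policy, then apply a
# REINFORCE (policy-gradient) update using the discounted episode returns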
for episode in range(num_episodes):
    # Option to render the last episode
    if episode == num_episodes - 1:
        user_input = input("\nPress return/enter to render the last episode, or 'q' to skip rendering: ")
        if user_input.lower() == 'q':
            print("Skipping render of the last episode")
        else:
            ui_steps = int(input("Input number of steps for the last episode: "))
            max_episode_steps = ui_steps
            print("Rendering last episode")
            env.render()
episode_rewards = [] # Store episode rewards
episode_log_probs = []
steps = 0
state = env.reset() # Reset the environment and get initial observation
while True:
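        # Preprocess the observation: HWC image -> CHW float tensor, downscale by 2, normalize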
state = torch.tensor(state['pov'].copy(), dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
# Resize the image
resize = transforms.Resize((state.shape[2] // 2, state.shape[3] // 2), antialias=False)
state = resize(state)
state = normalize(state)
state = state.to(device)
        discreteActions, camera = cnn(state)
        # Keep the camera head's output as tensors so gradients can flow back to it through the log probabilities
        camera_mean = camera[0][0]
        camera_std = torch.sigmoid(camera[0][1])  # Sigmoid keeps the standard deviation positive
        d_action_dist = torch.distributions.Bernoulli(discreteActions)
        camera_dist = torch.distributions.Normal(camera_mean, camera_std)
        action = d_action_dist.sample() # Sample an action from the distribution
        cameraX = camera_dist.sample()
        cameraY = camera_dist.sample()
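        # Assemble the full MineRL action dictionary from the sampled button and camera values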
action_dict = OrderedDict([
("ESC", np.array([0])),
("attack", np.array([int(action[0][0].item())])),
("back", np.array([int(action[0][1].item())])),
("camera", np.array([cameraX.item(), cameraY.item()], dtype=np.float32)),
("drop", np.array([int(action[0][2].item())])),
("forward", np.array([int(action[0][3].item())])),
("hotbar.1", np.array([int(action[0][4].item())])),
("hotbar.2", np.array([int(action[0][5].item())])),
("hotbar.3", np.array([int(action[0][6].item())])),
("hotbar.4", np.array([int(action[0][7].item())])),
("hotbar.5", np.array([int(action[0][8].item())])),
("hotbar.6", np.array([int(action[0][9].item())])),
("hotbar.7", np.array([int(action[0][10].item())])),
("hotbar.8", np.array([int(action[0][11].item())])),
("hotbar.9", np.array([int(action[0][12].item())])),
("inventory", np.array([int(action[0][13].item())])),
("jump", np.array([int(action[0][14].item())])),
("left", np.array([int(action[0][15].item())])),
("pickItem", np.array([int(action[0][16].item())])),
("right", np.array([int(action[0][17].item())])),
("sneak", np.array([int(action[0][18].item())])),
("sprint", np.array([int(action[0][19].item())])),
("swapHands", np.array([int(action[0][20].item())])),
("use", np.array([int(action[0][21].item())]))
])
log_prob = d_action_dist.log_prob(action[0]) # Calculate the log probability of the chosen action
log_prob_cameraX = camera_dist.log_prob(cameraX) # Log probability of the chosen cameraX action
log_prob_cameraY = camera_dist.log_prob(cameraY) # Log probability of the chosen cameraY action
next_state, reward, done, info = env.step(action_dict) # Take action in the environment
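        # Reward shaping: small bonuses for movement and attacking to encourage exploration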
if action_dict['sprint'] == 1:
reward += 1.0
if action_dict['forward'] == 1:
reward += 0.02
if action_dict['right'] == 1:
reward += 0.02
if action_dict['left'] == 1:
reward += 0.02
if action_dict['jump'] == 1:
reward += 0.015
if action_dict['attack'] == 1:
reward += 0.035
episode_rewards.append(reward) # Store the reward
log_probs = log_prob[0] + log_prob_cameraX + log_prob_cameraY
episode_log_probs.append(log_probs)
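        # Episode finished (or step cap reached): compute discounted returns and update the policy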
if done or steps >= max_episode_steps:
discounted_rewards = np.zeros_like(episode_rewards) # Array to store discounted rewards
cumulative_rewards = 0
# Calculate cumulative rewards using discounting
for t in reversed(range(len(episode_rewards))):
# Update cumulative rewards by discounting previous cumulative rewards and adding the current reward
cumulative_rewards = cumulative_rewards * gamma + episode_rewards[t]
# Store the calculated cumulative rewards in the discounted_rewards array
discounted_rewards[t] = cumulative_rewards
discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32).to(device)
episode_log_probs = torch.stack(episode_log_probs, dim = -1).to(device)
loss = -torch.sum(episode_log_probs * discounted_rewards)
optimizer.zero_grad() # Clear gradients - reset the gradients of the optimizer
loss.backward() # Backpropagate - calculate gradients of the loss with respect to model parameters
nn.utils.clip_grad_norm_(cnn.parameters(), max_norm=100)
optimizer.step() # Update parameters - adjust model parameters based on gradients using optimizer
print(f"Cumulative Rewards: {cumulative_rewards}")
print(f"Loss: {loss.detach().item()}")
cumulativeRewards.append(cumulative_rewards)
episode_lengths.append(steps)
losses.append(loss.detach().item())
break
state = next_state
steps+=1
if steps % 100 == 0:
print(f"Steps: {steps}")
env.render()
progress_bar.update(1)
progress_bar.close()
env.close()
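# Post-training analysis: plot per-episode rewards (with rolling average), losses, and episode lengths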
rollingAverageWindow = int(input("Input rolling average window size: "))
# Calculate rolling average of rewards ('valid' mode avoids the misleading partial-window values at the edges)
rewardsRollingAverage = np.convolve(cumulativeRewards, np.ones(rollingAverageWindow)/rollingAverageWindow, mode='valid')
# Plot episode rewards
plt.plot(cumulativeRewards, label="Rewards")
plt.plot(range(rollingAverageWindow - 1, rollingAverageWindow - 1 + len(rewardsRollingAverage)), rewardsRollingAverage, label="Rolling Average")
plt.xlim(0, num_episodes)
plt.xlabel("Episode Number")
plt.ylabel("Reward Value")
plt.title("Episode Rewards and Running Average")
plt.legend()
plt.savefig("episode_rewards.png")
plt.close()
# Plot losses
losses_array = np.array(losses)
plt.plot(losses_array)
plt.xlim(0, num_episodes)
plt.xlabel("Episode Number")
plt.ylabel("Loss Value")
plt.title("Episode Losses")
plt.savefig("episode_losses.png")
plt.close()
# Plot episode lengths
plt.plot(episode_lengths)
plt.xlim(0, num_episodes)
plt.xlabel("Episode Number")
plt.ylabel("Amount of Steps/Actions")
plt.title("Episode Lengths")
plt.savefig("episode_lengths.png")
plt.close()