-
Notifications
You must be signed in to change notification settings - Fork 0
/
planning.py
90 lines (64 loc) · 2.21 KB
/
planning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
import networkx as nx
import warnings
def offline_SP_planning(G,means):
# G = G_cyc.copy()
# G.remove_edges_from(nx.selfloop_edges(G_cyc))
# The above was removed because it causes unnecessary edge removal operation in each planning call.
mu_star = np.max(means)
s_star = np.argmax(means)
c = mu_star - means
n_nodes = G.number_of_nodes()
distance = np.ones(n_nodes)*np.inf
distance[s_star] = 0
policy = {s_star:s_star}
# Value iteration for acyclic all-to-all weighted shortest path.
n_calls = 0
n_iter = 0
for _ in range(n_nodes):
n_iter+=1
updated = False
# Bellman-Ford
for s in G:
best_nb_d = distance[s]
for w in G.neighbors(s):
if w == s:
continue
if best_nb_d>distance[w]+c[w]:
best_nb_d=distance[w]+c[w]
policy[s] = w
updated = True
distance[s] = best_nb_d
# Terminate early if no update is made.
if not updated:
break
# print('n_iter',n_iter)
return policy,n_calls,n_iter
def EVI_known_transition_planning(G,means,epsilon = 0.0001):
'''
epsilon: The stopping condition.
'''
assert(epsilon>0)
u = np.zeros(G.number_of_nodes())
policy = {}
iter_count = 0
max_iter = np.min([1e4,1/epsilon])
# max_iter: Hard upper-bound of the number of iterations, to ensure the algorithm does not fall into infinite loop
while iter_count<=max_iter:
iter_count+=1
u_old = np.array(u)
for s in G:
best_nb_u = 0
for w in G[s]:
if u_old[w]>best_nb_u:
best_nb_u = u_old[w]
policy[s] = w
u[s] = means[s]+best_nb_u
# print(iter_count, u)
if np.max(u-u_old) - np.min(u-u_old)<epsilon:
# print('Gap',np.max(u-u_old) - np.min(u-u_old))
break
# print('iter_count',iter_count)
# The following operation is slow. Don't use it!
# policy = {s:list(G[s])[np.argmax(u[G[s]])] for s in G}
return policy, u