/
goal_generator.py
1065 lines (934 loc) · 56.5 KB
/
goal_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
@author: ryuichi takanobu
@modified: anubhav sachan
"""
import os
import json
import random
from collections import Counter
from copy import deepcopy
import numpy as np
import pickle
import logging
from tqdm import tqdm
# from dbquery import DBQuery
# Domain names handled by the goal model.
domains = {
    'attraction',
    'hotel',
    'restaurant',
    'train',
    'taxi',
    'hospital',
    'police',
}

# Week days, Monday first; day arithmetic elsewhere in this module indexes
# into this list modulo 7.
days = [
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]

# Phrase searched for in a goal message (next to the domain name itself) to
# find the sentence in which a domain is first mentioned.
domain_keywords = {
    'restaurant': 'place to dine',
    'train': 'train',
    'hotel': 'place to stay',
    'attraction': 'places to go',
    'police': 'help',
    'taxi': 'taxi',
    'hospital': 'hospital',
}

# Human-readable wording for requestable slots; slots that are not listed
# here are printed under their own name.
request_slot_string_map = {
    'phone': 'phone number',
    'pricerange': 'price range',
    'duration': 'travel time',
    'arriveBy': 'arrival time',
    'leaveAt': 'departure time',
    'trainID': 'train ID',
}
# Sentence templates used to verbalise a user goal.
#
# Top level: a general intro plus one sub-dictionary per domain. Inside a
# domain, keys are either a section name ('intro', 'request', 'book'), a slot
# name, or a slot name prefixed with 'fail_info ' / 'fail_book ' for the
# sentence that offers the fallback value. Boolean hotel slots carry the value
# in the key ('parking yes', 'internet no', ...). '{}' is filled with the
# (optionally boldified) slot value.
templates = {
    'intro': 'You are looking for information in Cambridge.',
    'restaurant': {
        'intro': 'You are looking forward to trying local restaurants.',
        'request': 'Once you find a restaurant, make sure you get {}.',
        'area': 'The restaurant should be in the {}.',
        'food': 'The restaurant should serve {} food.',
        'name': 'You are looking for a particular restaurant. Its name is called {}.',
        'pricerange': 'The restaurant should be in the {} price range.',
        'book': 'Once you find the restaurant you want to book a table {}.',
        'fail_info food': 'If there is no such restaurant, how about one that serves {} food.',
        'fail_info area': 'If there is no such restaurant, how about one in the {} area.',
        'fail_info pricerange': 'If there is no such restaurant, how about one in the {} price range.',
        'fail_book time': 'If the booking fails how about {}.',
        'fail_book day': 'If the booking fails how about {}.',
    },
    'hotel': {
        'intro': 'You are looking for a place to stay.',
        'request': 'Once you find a hotel, make sure you get {}.',
        'stars': 'The hotel should have a star of {}.',
        'area': 'The hotel should be in the {}.',
        'type': 'The hotel should be in the type of {}.',
        'pricerange': 'The hotel should be in the {} price range.',
        'name': 'You are looking for a particular hotel. Its name is called {}.',
        'internet yes': 'The hotel should include free wifi.',
        'internet no': 'The hotel does not need to include free wifi.',
        'parking yes': 'The hotel should include free parking.',
        'parking no': 'The hotel does not need to include free parking.',
        'book': 'Once you find the hotel you want to book it {}.',
        'fail_info type': 'If there is no such hotel, how about one that is in the type of {}.',
        'fail_info area': 'If there is no such hotel, how about one that is in the {} area.',
        'fail_info stars': 'If there is no such hotel, how about one that has a star of {}.',
        'fail_info pricerange': 'If there is no such hotel, how about one that is in the {} price range.',
        'fail_info parking yes': 'If there is no such hotel, how about one that has free parking.',
        'fail_info parking no': 'If there is no such hotel, how about one that does not have free parking.',
        'fail_info internet yes': 'If there is no such hotel, how about one that has free wifi.',
        'fail_info internet no': 'If there is no such hotel, how about one that does not have free wifi.',
        'fail_book stay': 'If the booking fails how about {} nights.',
        'fail_book day': 'If the booking fails how about {}.',
    },
    'attraction': {
        'intro': 'You are excited about seeing local tourist attractions.',
        'request': 'Once you find an attraction, make sure you get {}.',
        'area': 'The attraction should be in the {}.',
        'type': 'The attraction should be in the type of {}.',
        'name': 'You are looking for a particular attraction. Its name is called {}.',
        'fail_info type': 'If there is no such attraction, how about one that is in the type of {}.',
        'fail_info area': 'If there is no such attraction, how about one in the {} area.',
    },
    'taxi': {
        'intro': 'You are also looking for a taxi.',
        'commute': 'You also want to book a taxi to commute between the two places.',
        'restaurant': 'You want to make sure it arrives at the restaurant by the booked time.',
        'request': 'Once you find a taxi, make sure you get {}.',
        'departure': 'The taxi should depart from {}.',
        'destination': 'The taxi should go to {}.',
        'leaveAt': 'The taxi should leave after {}.',
        'arriveBy': 'The taxi should arrive by {}.',
    },
    'train': {
        'intro': 'You are also looking for a train.',
        'request': 'Once you find a train, make sure you get {}.',
        'departure': 'The train should depart from {}.',
        'destination': 'The train should go to {}.',
        'day': 'The train should leave on {}.',
        'leaveAt': 'The train should leave after {}.',
        'arriveBy': 'The train should arrive by {}.',
        'book': 'Once you find the train you want to make a booking {}.',
    },
    'police': {
        'intro': 'You were robbed and are looking for help.',
        'request': 'Make sure you get {}.',
    },
    'hospital': {
        # Sentences are joined with a single space, so every template must end
        # with its own full stop.
        'intro': 'You got injured and are looking for a hospital nearby.',
        'request': 'Make sure you get {}.',
        'department': 'The hospital should have the {} department.',
    },
}
# Constant added to the estimated probabilities before they are compared with
# a uniform random draw, one entry per goal section.
pro_correction = {
    "info": 0.2,
    "reqt": 0.2,
    "book": 0.2,
}

# Service names iterated by DSTCGoalGenerator when it builds its statistics.
services = {
    'RentalCars_2', 'Movies_3', 'Services_2', 'Media_2', 'Restaurants_1',
    'Homes_2', 'Payment_1', 'Calendar_1', 'Flights_1', 'Weather_1',
    'Trains_1', 'Messaging_1', 'RentalCars_3', 'Music_2', 'Hotels_3',
    'Buses_3', 'Music_3', 'Travel_1', 'Flights_3', 'Homes_1',
    'Music_1', 'Alarm_1', 'Movies_2', 'Events_2', 'Media_1',
    'RentalCars_1', 'Services_3', 'Buses_2', 'Flights_2', 'Hotels_4',
    'Hotels_2', 'RideSharing_2', 'Events_3', 'Media_3', 'Services_4',
    'Restaurants_2', 'Movies_1', 'Hotels_1', 'Buses_1', 'Banks_1',
    'Events_1', 'Services_1', 'Flights_4', 'RideSharing_1', 'Banks_2',
}

# Names of the goal sections.
typs = ['info', 'reqt', 'book']
def null_boldify(content):
    """Return ``content`` untouched (the no-op counterpart of ``do_boldify``)."""
    return content
def do_boldify(content):
    """Return ``content`` wrapped in an HTML ``<b>...</b>`` pair."""
    return ''.join(('<b>', content, '</b>'))
def nomial_sample(counter: Counter):
    """Draw one key of ``counter``, using its values as outcome probabilities.

    A single multinomial trial is run over ``counter.values()``; the key that
    sits at the position of the drawn outcome is returned.
    """
    outcomes = list(counter.keys())
    probabilities = list(counter.values())
    one_hot = np.random.multinomial(1, probabilities)
    return outcomes[np.argmax(one_hot)]
class GoalGenerator:
    """User goal generator.

    Goals are sampled from a "goal model" made of four tables: the probability
    of each slot per domain (``ind_slot_dist``), the value distribution of each
    slot (``ind_slot_value_dist``), the distribution over domain combinations
    (``domain_ordering_dist``) and the booking probability per domain
    (``book_dist``). The model is loaded from a pickle file when that file
    exists and is otherwise estimated from a dialog corpus.
    """

    # No database backend is created by default: the DBQuery import and its
    # construction are commented out in this module. While this stays None,
    # every constraint set is treated as satisfiable. Assign an object exposing
    # ``query(domain, constraints)`` to validate sampled goals against a DB.
    dbquery = None

    def __init__(self, data_dir, cfg,
                 goal_model_path,
                 corpus_path=None,
                 boldify=False):
        """
        Args:
            data_dir: directory the two paths below are relative to
            cfg: not used by this class (it was only passed to the disabled DBQuery)
            goal_model_path: path to a goal model
            corpus_path: path to a dialog corpus to build a goal model
            boldify: when true, slot values in generated messages are wrapped in <b> tags
        """
        # self.dbquery = DBQuery(data_dir, cfg)
        self.goal_model_path = data_dir + '/' + goal_model_path
        self.corpus_path = data_dir + '/' + corpus_path if corpus_path is not None else None
        self.boldify = do_boldify if boldify else null_boldify
        if os.path.exists(self.goal_model_path):
            # NOTE: unpickling can execute arbitrary code; only load goal
            # models that come from a trusted source.
            with open(self.goal_model_path, 'rb') as f:
                (self.ind_slot_dist, self.ind_slot_value_dist,
                 self.domain_ordering_dist, self.book_dist) = pickle.load(f)
            logging.info('Loading goal model is done')
        else:
            self._build_goal_model()
            logging.info('Building goal model is done')

        # remove some slot (tolerating models that do not contain it)
        removed_requests = (('police', 'postcode'), ('hospital', 'postcode'), ('hospital', 'address'))
        for dist in (self.ind_slot_dist, self.ind_slot_value_dist):
            for domain, slot in removed_requests:
                dist.get(domain, {}).get('reqt', {}).pop(slot, None)

    # ------------------------------------------------------------------
    # goal model estimation
    # ------------------------------------------------------------------
    def _build_goal_model(self):
        """Estimate the goal model from the corpus and pickle it to ``goal_model_path``."""
        with open(self.corpus_path) as f:
            dialogs = json.load(f)

        # domain ordering
        domain_orderings = []
        for dialog in dialogs.values():
            goal = dialog['goal']
            d_domains = [name for name in goal if name in domains and len(goal[name]) > 0]
            sentences = []
            if d_domains:
                message = goal['message']
                if isinstance(message, str):
                    message = [message]
                sentences = [m.lower() for m in message]
            first_index = []
            for domain in d_domains:
                keyword = domain_keywords[domain].lower()
                for i, sentence in enumerate(sentences):
                    if keyword in sentence or domain.lower() in sentence:
                        first_index.append(i)
                        break
                else:
                    # Never mentioned: rank it after all mentioned domains. An
                    # entry is needed for every domain, otherwise the zip below
                    # pairs indices with the wrong domains and drops some.
                    first_index.append(len(sentences))
            ranked = sorted(zip(first_index, d_domains), key=lambda pair: pair[0])
            domain_orderings.append(tuple(domain for _, domain in ranked))
        domain_ordering_cnt = Counter(domain_orderings)
        total_orderings = sum(domain_ordering_cnt.values())
        self.domain_ordering_dist = deepcopy(domain_ordering_cnt)
        for order in domain_ordering_cnt:
            self.domain_ordering_dist[order] = domain_ordering_cnt[order] / total_orderings

        # independent goal slot distribution
        ind_slot_value_cnt = {domain: {} for domain in domains}
        domain_cnt = Counter()
        book_cnt = Counter()
        for dialog in dialogs.values():
            for domain in domains:
                domain_goal = dialog['goal'][domain]
                if domain_goal != {}:
                    domain_cnt[domain] += 1
                if 'info' in domain_goal:
                    self._count_slot_values(ind_slot_value_cnt[domain], 'info', domain_goal['info'])
                if 'reqt' in domain_goal:
                    for slot in domain_goal['reqt']:
                        ind_slot_value_cnt[domain].setdefault('reqt', Counter())[slot] += 1
                if 'book' in domain_goal:
                    book_cnt[domain] += 1
                    self._count_slot_values(ind_slot_value_cnt[domain], 'book', domain_goal['book'])

        self.ind_slot_value_dist = deepcopy(ind_slot_value_cnt)
        self.ind_slot_dist = {domain: {} for domain in domains}
        self.book_dist = {}
        for domain in domains:
            counts = ind_slot_value_cnt[domain]
            if 'info' in counts:
                self._normalise_slot_values(domain, 'info', counts['info'], domain_cnt[domain])
            if 'reqt' in counts:
                for slot in counts['reqt']:
                    probability = counts['reqt'][slot] / domain_cnt[domain]
                    self.ind_slot_dist[domain].setdefault('reqt', {})[slot] = probability
                    self.ind_slot_value_dist[domain]['reqt'][slot] = probability
            if 'book' in counts:
                self._normalise_slot_values(domain, 'book', counts['book'], domain_cnt[domain])
            # set for every domain: _get_domain_goal reads it unconditionally
            self.book_dist[domain] = book_cnt[domain] / len(dialogs)

        with open(self.goal_model_path, 'wb') as f:
            pickle.dump((self.ind_slot_dist, self.ind_slot_value_dist, self.domain_ordering_dist, self.book_dist), f)

    @staticmethod
    def _count_slot_values(domain_counts, typ, slot_values):
        """Add the slot values of one goal section ('info' or 'book') to ``domain_counts``."""
        for slot, value in slot_values.items():
            if 'invalid' in slot:
                continue
            value_counts = domain_counts.setdefault(typ, {}).setdefault(slot, Counter())
            # values containing 'care' register the slot but are not counted
            if 'care' in value:
                continue
            value_counts[value] += 1

    def _normalise_slot_values(self, domain, typ, slot_counts, n_domain):
        """Turn the value counts of one section into slot and slot-value probabilities."""
        for slot, value_counts in slot_counts.items():
            slot_total = sum(value_counts.values())
            self.ind_slot_dist[domain].setdefault(typ, {})[slot] = slot_total / n_domain
            value_dist = self.ind_slot_value_dist[domain][typ][slot]
            for val in value_dist:
                value_dist[val] = value_counts[val] / slot_total

    # ------------------------------------------------------------------
    # goal sampling
    # ------------------------------------------------------------------
    def _has_db_match(self, domain, info):
        """Return whether the database holds an entity satisfying ``info`` (True without a database)."""
        if self.dbquery is None:
            return True
        return len(self.dbquery.query(domain, info.items())) > 0

    def _sample_info(self, domain, slot_prob, value_dist):
        """Sample the inform constraints of one domain."""
        info = {}
        for slot in slot_prob:
            if random.random() < slot_prob[slot] + pro_correction['info']:
                info[slot] = nomial_sample(value_dist[slot])

        if domain in ['hotel', 'restaurant', 'attraction'] and 'name' in info and len(info) > 1:
            # a name is never combined with other constraints: keep either
            # the name alone or everything but the name
            if random.random() < slot_prob['name']:
                info = {'name': info['name']}
            else:
                del info['name']

        if domain in ['taxi', 'train']:
            # exactly one of arriveBy / leaveAt
            if 'arriveBy' in info and 'leaveAt' in info:
                if random.random() < (slot_prob['leaveAt'] / (slot_prob['arriveBy'] + slot_prob['leaveAt'])):
                    del info['arriveBy']
                else:
                    del info['leaveAt']
            if 'arriveBy' not in info and 'leaveAt' not in info:
                if random.random() < (slot_prob['arriveBy'] / (slot_prob['arriveBy'] + slot_prob['leaveAt'])):
                    info['arriveBy'] = nomial_sample(value_dist['arriveBy'])
                else:
                    info['leaveAt'] = nomial_sample(value_dist['leaveAt'])
            # both end points are mandatory
            if 'departure' not in info:
                info['departure'] = nomial_sample(value_dist['departure'])
            if 'destination' not in info:
                info['destination'] = nomial_sample(value_dist['destination'])
            # identical end points: re-draw one of them (once)
            if 'departure' in info and 'destination' in info and info['departure'] == info['destination']:
                if random.random() < (slot_prob['departure'] / (slot_prob['departure'] + slot_prob['destination'])):
                    info['departure'] = nomial_sample(value_dist['departure'])
                else:
                    info['destination'] = nomial_sample(value_dist['destination'])
        return info

    @staticmethod
    def _sample_book(domain, slot_prob, value_dist):
        """Sample the booking constraints of one domain."""
        book = {}
        for slot in slot_prob:
            if random.random() < slot_prob[slot] + pro_correction['book']:
                book[slot] = nomial_sample(value_dist[slot])

        # makes sure that there are all necessary slots for booking
        if domain == 'restaurant' and 'time' not in book:
            book['time'] = nomial_sample(value_dist['time'])
        if domain == 'hotel' and 'stay' not in book:
            book['stay'] = nomial_sample(value_dist['stay'])
        if domain in ['hotel', 'restaurant'] and 'day' not in book:
            book['day'] = nomial_sample(value_dist['day'])
        if domain in ['hotel', 'restaurant'] and 'people' not in book:
            book['people'] = nomial_sample(value_dist['people'])
        if domain == 'train' and len(book) <= 0:
            book['people'] = nomial_sample(value_dist['people'])
        return book

    @staticmethod
    def _make_fail_book(domain, book):
        """Derive a ``fail_book`` variant of ``book`` (None for domains without one)."""
        if domain == 'hotel':
            fail_book = deepcopy(book)
            if 'stay' in book and random.random() < 0.5:
                # increase hotel-stay
                fail_book['stay'] = str(int(book['stay']) + 1)
            elif 'day' in book:
                # use the previous week day
                fail_book['day'] = days[(days.index(book['day']) - 1) % 7]
            return fail_book
        if domain == 'restaurant':
            fail_book = deepcopy(book)
            if 'time' in book and random.random() < 0.5:
                hour, minute = book['time'].split(':')
                fail_book['time'] = str((int(hour) + 1) % 24) + ':' + minute
            elif 'day' in book:
                if random.random() < 0.5:
                    fail_book['day'] = days[(days.index(book['day']) - 1) % 7]
                else:
                    fail_book['day'] = days[(days.index(book['day']) + 1) % 7]
            return fail_book
        return None

    def _get_domain_goal(self, domain):
        """Sample the goal of one domain.

        Returns:
            a dict with an 'info' section and, depending on the draws, 'reqt',
            'book', 'fail_book' and 'fail_info' sections. Sampling is repeated
            until the goal has at least a 'reqt' or a 'book' section.
        """
        cnt_slot = self.ind_slot_dist[domain]
        cnt_slot_value = self.ind_slot_value_dist[domain]
        pro_book = self.book_dist[domain]

        while True:
            domain_goal = {'info': {}}

            # inform
            if 'info' in cnt_slot:
                domain_goal['info'] = self._sample_info(domain, cnt_slot['info'], cnt_slot_value['info'])
                # only domains that have info statistics must end up with a constraint
                if domain_goal['info'] == {}:
                    continue

            # request
            if 'reqt' in cnt_slot:
                reqt = [slot for slot in cnt_slot['reqt']
                        if random.random() < cnt_slot['reqt'][slot] + pro_correction['reqt']
                        and slot not in domain_goal['info']]
                if len(reqt) > 0:
                    domain_goal['reqt'] = reqt

            # book
            if 'book' in cnt_slot and random.random() < pro_book + pro_correction['book']:
                domain_goal['book'] = self._sample_book(domain, cnt_slot['book'], cnt_slot_value['book'])

            # fail_book
            if 'book' in domain_goal and random.random() < 0.5:
                fail_book = self._make_fail_book(domain, domain_goal['book'])
                if fail_book is not None:
                    domain_goal['fail_book'] = fail_book

            # fail_info: when nothing in the database matches, change one slot
            # at a time (up to 100 attempts) until something does
            if 'info' in domain_goal and not self._has_db_match(domain, domain_goal['info']):
                for _ in range(100):
                    adjusted_info = self._adjust_info(domain, domain_goal['info'])
                    if self._has_db_match(domain, adjusted_info):
                        if domain == 'train':
                            domain_goal['info'] = adjusted_info
                        else:
                            domain_goal['fail_info'] = domain_goal['info']
                            domain_goal['info'] = adjusted_info
                        break
                else:
                    # no satisfiable variant found: sample a new goal
                    continue

            # at least there is one request and book
            if 'reqt' in domain_goal or 'book' in domain_goal:
                break

        return domain_goal

    @staticmethod
    def _request_address(domain_goal):
        """Make sure 'address' is requested, keeping the existing request order."""
        # order-preserving (a set would make seeded goals depend on string hashing)
        domain_goal['reqt'] = list(dict.fromkeys([*domain_goal.get('reqt', []), 'address']))

    def get_user_goal(self, seed=None):
        """Sample a complete multi-domain user goal.

        Args:
            seed: when given, seeds both ``random`` and ``numpy.random`` first

        Returns:
            a dict mapping each sampled domain to its goal, plus the key
            'domain_ordering' holding the list of domains in dialog order.
        """
        # seed the generator to get fixed goal
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        domain_ordering = []
        while not domain_ordering:
            domain_ordering = list(nomial_sample(self.domain_ordering_dist))
        np.random.shuffle(domain_ordering)

        user_goal = {dom: self._get_domain_goal(dom) for dom in domain_ordering}
        assert len(user_goal.keys()) > 0

        # using taxi to commute between places, removing destination and departure.
        if 'taxi' in domain_ordering:
            places = [dom for dom in domain_ordering[: domain_ordering.index('taxi')]
                      if 'address' in self.ind_slot_dist[dom].get('reqt', {})]
            if len(places) >= 1:
                del user_goal['taxi']['info']['destination']
                self._request_address(user_goal[places[-1]])
                if places[-1] == 'restaurant' and 'book' in user_goal['restaurant']:
                    user_goal['taxi']['info']['arriveBy'] = user_goal['restaurant']['book']['time']
                    if 'leaveAt' in user_goal['taxi']['info']:
                        del user_goal['taxi']['info']['leaveAt']
            if len(places) >= 2:
                del user_goal['taxi']['info']['departure']
                self._request_address(user_goal[places[-2]])

        # match area of attraction and restaurant
        if 'restaurant' in domain_ordering and \
                'attraction' in domain_ordering and \
                'fail_info' not in user_goal['restaurant'] and \
                domain_ordering.index('restaurant') > domain_ordering.index('attraction') and \
                'area' in user_goal['restaurant']['info'] and 'area' in user_goal['attraction']['info']:
            adjusted_restaurant_goal = deepcopy(user_goal['restaurant']['info'])
            adjusted_restaurant_goal['area'] = user_goal['attraction']['info']['area']
            if self._has_db_match('restaurant', adjusted_restaurant_goal) and random.random() < 0.5:
                user_goal['restaurant']['info']['area'] = user_goal['attraction']['info']['area']

        # match day and people of restaurant and hotel
        if 'restaurant' in domain_ordering and 'hotel' in domain_ordering and \
                'book' in user_goal['restaurant'] and 'book' in user_goal['hotel']:
            restaurant = user_goal['restaurant']
            hotel_book = user_goal['hotel']['book']
            if random.random() < 0.5:
                restaurant['book']['people'] = hotel_book['people']
                if 'fail_book' in restaurant:
                    restaurant['fail_book']['people'] = hotel_book['people']
            # always true; the draw is kept so the random stream is unchanged
            if random.random() < 1.0:
                restaurant['book']['day'] = hotel_book['day']
                if 'fail_book' in restaurant:
                    restaurant['fail_book']['day'] = hotel_book['day']
                    # the alignment may have erased the only difference
                    if all(restaurant['book'][slot] == restaurant['fail_book'][slot]
                           for slot in ('day', 'time', 'people')):
                        del restaurant['fail_book']

        # match day and people of hotel and train
        if 'hotel' in domain_ordering and 'train' in domain_ordering and \
                'book' in user_goal['hotel'] and 'info' in user_goal['train']:
            train_info = user_goal['train']['info']
            hotel_book = user_goal['hotel']['book']
            if train_info['destination'] == 'cambridge' and 'day' in hotel_book:
                train_info['day'] = hotel_book['day']
            elif train_info['departure'] == 'cambridge' and 'day' in hotel_book and 'stay' in hotel_book:
                train_info['day'] = days[(days.index(hotel_book['day']) + int(hotel_book['stay'])) % 7]
            # In case, we have no query results with adjusted train goal, we simply drop the train goal.
            if not self._has_db_match('train', train_info):
                del user_goal['train']
                domain_ordering.remove('train')

        user_goal['domain_ordering'] = domain_ordering
        return user_goal

    def _adjust_info(self, domain, info):
        """Return a copy of ``info`` in which one randomly chosen slot got a random known value."""
        # adjust one of the slots of the info
        adjusted_info = deepcopy(info)
        slot = random.choice(list(info.keys()))
        adjusted_info[slot] = random.choice(list(self.ind_slot_value_dist[domain]['info'][slot].keys()))
        return adjusted_info

    # ------------------------------------------------------------------
    # goal verbalisation
    # ------------------------------------------------------------------
    def _fill_info_template(self, user_goal, domain, slot, info):
        """Render the sentence for one inform slot; ``info`` is 'info' or 'fail_info'."""
        shares_area = (
            slot == 'area'
            and 'restaurant' in user_goal
            and 'attraction' in user_goal
            and info in user_goal['restaurant']
            and info in user_goal['attraction']
            and 'area' in user_goal['restaurant'][info]
            and 'area' in user_goal['attraction'][info]
            and user_goal['restaurant'][info]['area'] == user_goal['attraction'][info]['area']
        )
        if shares_area:
            # the domain that comes second refers back to the first one
            restaurant_index = user_goal['domain_ordering'].index('restaurant')
            attraction_index = user_goal['domain_ordering'].index('attraction')
            if restaurant_index > attraction_index and domain == 'restaurant':
                return templates[domain][slot].format(self.boldify('same area as the attraction'))
            if attraction_index > restaurant_index and domain == 'attraction':
                return templates[domain][slot].format(self.boldify('same area as the restaurant'))
        return templates[domain][slot].format(self.boldify(user_goal[domain][info][slot]))

    @staticmethod
    def _get_same_people_domain(user_goal, domain, slot):
        """Return the first earlier booking domain with the same 'day'/'people' value, else None."""
        if slot not in ['day', 'people']:
            return None
        domain_index = user_goal['domain_ordering'].index(domain)
        for prev in user_goal['domain_ordering'][:domain_index]:
            if prev in ['restaurant', 'hotel', 'train'] and 'book' in user_goal[prev] and \
                    slot in user_goal[prev]['book'] and \
                    user_goal[prev]['book'][slot] == user_goal[domain]['book'][slot]:
                return prev
        return None

    @staticmethod
    def _changed_slot(current, failed):
        """Return the first slot whose value differs between two parallel sections."""
        # raises IndexError when the two sections do not differ
        return [slot for (slot, value), (_, failed_value) in zip(current.items(), failed.items())
                if value != failed_value][0]

    def build_message(self, user_goal, boldify=null_boldify):
        """Verbalise ``user_goal`` as a list of message strings.

        Args:
            user_goal: a goal as returned by ``get_user_goal``
            boldify: only compared with ``do_boldify`` to decide whether the
                words 'wifi', 'internet' and 'parking' are additionally put in
                bold; slot values are wrapped by ``self.boldify``

        Returns:
            the list of generated message strings
        """
        message = []

        for dom in user_goal['domain_ordering']:
            state = deepcopy(user_goal[dom])
            # when a fail_* section exists it is verbalised first and the
            # regular section is offered as the fallback
            info = 'fail_info' if 'fail_info' in user_goal[dom] else 'info'
            book = 'fail_book' if 'fail_book' in user_goal[dom] else 'book'

            # intro (m always starts fresh so nothing leaks in from the previous domain)
            m = []
            if not (dom == 'taxi' and len(state['info']) == 1):
                m = [templates[dom]['intro']]

            # info
            if dom == 'taxi' and len(state[info]) == 1:
                taxi_index = user_goal['domain_ordering'].index('taxi')
                places = [place for place in user_goal['domain_ordering'][: taxi_index]
                          if place in ['attraction', 'hotel', 'restaurant']]
                if len(places) >= 2:
                    random.shuffle(places)
                    m.append(templates['taxi']['commute'])
                    if 'arriveBy' in state[info]:
                        m.append('The taxi should arrive at the {} from the {} by {}.'.format(
                            self.boldify(places[0]),
                            self.boldify(places[1]),
                            self.boldify(state[info]['arriveBy'])))
                    elif 'leaveAt' in state[info]:
                        m.append('The taxi should leave from the {} to the {} after {}.'.format(
                            self.boldify(places[0]),
                            self.boldify(places[1]),
                            self.boldify(state[info]['leaveAt'])))
                    message.append(' '.join(m))
                    m = []
            else:
                while len(state[info]) > 0:
                    num_acts = random.randint(1, min(len(state[info]), 3))
                    slots = random.sample(list(state[info].keys()), num_acts)
                    sents = [self._fill_info_template(user_goal, dom, slot, info)
                             for slot in slots if slot not in ['parking', 'internet']]
                    if 'parking' in slots:
                        sents.append(templates[dom]['parking ' + state[info]['parking']])
                    if 'internet' in slots:
                        sents.append(templates[dom]['internet ' + state[info]['internet']])
                    m.extend(sents)
                    message.append(' '.join(m))
                    m = []
                    for slot in slots:
                        del state[info][slot]
            # a domain without info constraints (e.g. police) still gets its intro
            if m:
                message.append(' '.join(m))

            # fail_info
            if 'fail_info' in user_goal[dom]:
                adjusted_slot = self._changed_slot(user_goal[dom]['info'], user_goal[dom]['fail_info'])
                if adjusted_slot in ['internet', 'parking']:
                    message.append(
                        templates[dom]['fail_info ' + adjusted_slot + ' ' + user_goal[dom]['info'][adjusted_slot]])
                else:
                    message.append(templates[dom]['fail_info ' + adjusted_slot].format(
                        self.boldify(user_goal[dom]['info'][adjusted_slot])))

            # reqt
            if 'reqt' in state:
                slot_strings = [request_slot_string_map.get(slot, slot) for slot in state['reqt']
                                if slot not in ['internet', 'parking', 'food']]
                if len(slot_strings) > 0:
                    message.append(templates[dom]['request'].format(self.boldify(', '.join(slot_strings))))
                if 'internet' in state['reqt']:
                    message.append('Make sure to ask if the hotel includes free wifi.')
                if 'parking' in state['reqt']:
                    message.append('Make sure to ask if the hotel includes free parking.')
                if 'food' in state['reqt']:
                    message.append('Make sure to ask about what food it serves.')

            # book
            if 'book' in state:
                slot_strings = []
                for slot in ['people', 'time', 'day', 'stay']:
                    if slot not in state[book]:
                        continue
                    if slot == 'people':
                        same_people_domain = self._get_same_people_domain(user_goal, dom, slot)
                        if same_people_domain is None:
                            slot_strings.append('for {} people'.format(self.boldify(state[book][slot])))
                        else:
                            slot_strings.append(self.boldify(
                                'for the same group of people as the {} booking'.format(same_people_domain)))
                    elif slot == 'time':
                        slot_strings.append('at {}'.format(self.boldify(state[book][slot])))
                    elif slot == 'day':
                        same_people_domain = self._get_same_people_domain(user_goal, dom, slot)
                        if same_people_domain is None:
                            slot_strings.append('on {}'.format(self.boldify(state[book][slot])))
                        else:
                            slot_strings.append(
                                self.boldify('on the same day as the {} booking'.format(same_people_domain)))
                    elif slot == 'stay':
                        slot_strings.append('for {} nights'.format(self.boldify(state[book][slot])))
                    del state[book][slot]

                # every booking slot must have been verbalised
                assert len(state[book]) <= 0, state[book]

                if len(slot_strings) > 0:
                    message.append(templates[dom]['book'].format(' '.join(slot_strings)))

            # fail_book
            if 'fail_book' in user_goal[dom]:
                adjusted_slot = self._changed_slot(user_goal[dom]['book'], user_goal[dom]['fail_book'])
                if adjusted_slot in ['internet', 'parking']:
                    message.append(
                        templates[dom]['fail_book ' + adjusted_slot + ' ' + user_goal[dom]['book'][adjusted_slot]])
                else:
                    message.append(templates[dom]['fail_book ' + adjusted_slot].format(
                        self.boldify(user_goal[dom]['book'][adjusted_slot])))

        if boldify == do_boldify:
            for i in range(len(message)):
                message[i] = message[i].replace('wifi', "<b>wifi</b>")
                message[i] = message[i].replace('internet', "<b>internet</b>")
                message[i] = message[i].replace('parking', "<b>parking</b>")
        return message
class DSTCGoalGenerator:
"""User goal generator"""
def __init__(self, data_dir, cfg,
             goal_model_path,
             boldify=False):
    """Set up the data paths and load the goal model, building it when the file is missing.

    Args:
        data_dir: directory holding the goal model and the dstc_goals_*.json files
        cfg: not used by this method (it was only passed to the disabled DBQuery)
        goal_model_path: path to a goal model, relative to data_dir
        boldify: selects do_boldify (true) or null_boldify (false) as self.boldify
    """
    # self.dbquery = DBQuery(data_dir, cfg)
    self.goal_model_path = data_dir + '/' + goal_model_path
    # Strip only the final extension: cutting at the first dot breaks on
    # paths such as './goal_model.pkl' or 'v1.2/goal_model.pkl'.
    self.goal_model_path_m = data_dir + '/' + os.path.splitext(goal_model_path)[0] + "_m_.pkl"
    self.goal_filename_train = 'dstc_goals_train.json'
    self.goal_path_train = data_dir + '/' + self.goal_filename_train
    self.goal_filename_dev = 'dstc_goals_dev.json'
    self.goal_path_dev = data_dir + '/' + self.goal_filename_dev
    self.goal_filename_test = 'dstc_goals_test.json'
    self.goal_path_test = data_dir + '/' + self.goal_filename_test
    self.boldify = do_boldify if boldify else null_boldify
    if os.path.exists(self.goal_model_path):
        # NOTE: unpickling can execute arbitrary code; only load goal models
        # that come from a trusted source.
        with open(self.goal_model_path, 'rb') as f:
            self.ind_slot_dist, self.ind_slot_value_dist, self.domain_ordering_dist = pickle.load(f)
        logging.info('Loading goal model is done')
    else:
        logging.info('Building goal_model.pkl')
        self._build_goal_model()
        logging.info('Building goal_model is done')
def _build_goal_model(self):
    """Estimate the goal model from the DSTC goal files and pickle it.

    The train, test and dev goal files are read and concatenated (in that
    order). Three distributions are estimated from the combined goals:

    * slot value distribution: for each domain and slot, the relative
      frequency of every value seen for that slot across all goals;
    * domain ordering distribution: the relative frequency of each tuple
      of domains that have a non-empty goal;
    * slot distribution: for each domain, every slot's share of all slot
      occurrences in that domain.

    The flat (domain -> slot) model is pickled to ``self.goal_model_path_m``.
    It is then regrouped as domain -> slot type -> slot, using the slot
    lists in ``dstc-data/dstc_slot_types.json`` (a path relative to the
    current working directory, not to the data directory), stored on
    ``self`` and pickled to ``self.goal_model_path``.

    Raises:
        OSError: if one of the input files cannot be opened.
        KeyError: if a goal lacks one of the domains in ``services``, or the
            slot types file names a domain/type outside ``services``/``typs``.
    """
    with open(self.goal_path_train) as fgt:
        goals_train = json.load(fgt)
    with open(self.goal_path_test) as fgte:
        goals_test = json.load(fgte)
    with open(self.goal_path_dev) as fgd:
        goals_dev = json.load(fgd)

    print('Combining Data')
    goals = []
    for split_goals in (goals_train, goals_test, goals_dev):
        goals.extend(split_goals)

    logging.info('Calculating Individual Slot Value Distribution')
    ind_slot_value_cnt = {domain: {} for domain in services}
    for g in goals:
        for domain in services:
            domain_goal = g['goals'][domain]
            if domain_goal == {}:
                continue
            for slot, values in domain_goal.items():
                if values == []:
                    continue
                # Accumulate across goals: the per-slot counter is created
                # once and never reset, and membership is tracked per value
                # of this slot (not per slot name of the domain).
                value_cnt = ind_slot_value_cnt[domain].setdefault(slot, {})
                for item in values:
                    value_cnt[item] = value_cnt.get(item, 0) + 1

    self.ind_slot_value_dist = {}
    for domain, slot_cnts in ind_slot_value_cnt.items():
        self.ind_slot_value_dist[domain] = {}
        for slot, value_cnt in slot_cnts.items():
            # A slot only gets an entry once it has at least one value,
            # so the total is never zero here.
            total = sum(value_cnt.values())
            self.ind_slot_value_dist[domain][slot] = {
                value: cnt / total for value, cnt in value_cnt.items()
            }

    logging.info('Calculating Domain Distribution')
    domain_orderings = [
        tuple(dom for dom, dom_goal in g['goals'].items() if dom_goal != {})
        for g in goals
    ]
    domain_ordering_cnt = Counter(domain_orderings)
    ordering_total = sum(domain_ordering_cnt.values())
    # Keep the Counter type for the stored distribution, as before.
    self.domain_ordering_dist = deepcopy(domain_ordering_cnt)
    for order, cnt in domain_ordering_cnt.items():
        self.domain_ordering_dist[order] = cnt / ordering_total

    logging.info('Calculating Individual Slot Distribution')
    ind_slot_cnt = {domain: {} for domain in services}
    for g in goals:
        for domain in services:
            domain_goal = g['goals'][domain]
            if domain_goal == {}:
                continue
            for slot in domain_goal:
                ind_slot_cnt[domain][slot] = ind_slot_cnt[domain].get(slot, 0) + 1

    self.ind_slot_dist = {}
    for domain, slot_cnts in ind_slot_cnt.items():
        slot_total = sum(slot_cnts.values())
        self.ind_slot_dist[domain] = {
            slot: cnt / slot_total for slot, cnt in slot_cnts.items()
        }

    flat_slot_dist = self.ind_slot_dist
    flat_slot_value_dist = self.ind_slot_value_dist
    with open(self.goal_model_path_m, 'wb') as f:
        pickle.dump((flat_slot_dist, flat_slot_value_dist, self.domain_ordering_dist), f)

    logging.info('Saving in a different format (info, reqt, book)')
    # The flat model is still in memory, so it is regrouped directly instead
    # of being read back from the file that was just written.
    with open('dstc-data/dstc_slot_types.json', 'rb') as f:
        data = json.load(f)

    slot_cnt = {dom: {ty: {} for ty in typs} for dom in services}
    slot_value = {dom: {ty: {} for ty in typs} for dom in services}
    for dom, slots_by_type in data[0].items():
        for typ, type_slots in slots_by_type.items():
            known_slots = flat_slot_dist[dom]
            known_slot_values = flat_slot_value_dist[dom]
            for slot in type_slots:
                # Only slots actually observed in the goals are carried over.
                if slot in known_slots:
                    slot_cnt[dom][typ][slot] = known_slots[slot]
                if slot in known_slot_values:
                    slot_value[dom][typ][slot] = deepcopy(known_slot_values[slot])

    self.ind_slot_dist = slot_cnt
    self.ind_slot_value_dist = slot_value
    with open(self.goal_model_path, 'wb') as f:
        pickle.dump((self.ind_slot_dist, self.ind_slot_value_dist, self.domain_ordering_dist), f)
def _get_domain_goal(self, domain):
    """Sample a goal for a single domain.

    Inform slots are drawn independently: a slot is kept when a uniform
    random draw falls below its stored probability plus
    ``pro_correction['info']``, and its value is then sampled with
    ``nomial_sample`` from the slot's value distribution. Sampling is
    repeated until at least one inform slot has been drawn, so this does
    not terminate for a domain from which no inform slot can ever be drawn.

    Args:
        domain: key into ``self.ind_slot_dist`` / ``self.ind_slot_value_dist``

    Returns:
        dict with a non-empty ``'info'`` mapping (slot -> value) and, when
        applicable, a ``'reqt'`` list of slot names.
    """
    slot_probs = self.ind_slot_dist[domain]
    value_dists = self.ind_slot_value_dist[domain]

    while True:
        sampled_info = {}
        if 'info' in slot_probs:
            for slot, prob in slot_probs['info'].items():
                # The random draw happens first; the value lookup only
                # runs for slots that passed it.
                if random.random() < prob + pro_correction['info'] and slot in value_dists['info']:
                    sampled_info[slot] = nomial_sample(value_dists['info'][slot])
        if sampled_info:
            break

    domain_goal = {'info': sampled_info}

    if 'reqt' in slot_probs:
        # NOTE(review): the requested slots are the domain's *info* slots
        # that are not listed under 'reqt' -- confirm this is intended.
        requested = [slot for slot in slot_probs['info'] if slot not in slot_probs['reqt']]
        if requested:
            domain_goal['reqt'] = requested

    return domain_goal
def get_user_goal(self, seed=None):
    """Sample a complete user goal.

    A domain ordering is drawn from ``self.domain_ordering_dist`` (redrawn
    while it is empty), shuffled, and a goal is sampled for each domain
    with ``_get_domain_goal``.

    Args:
        seed: when not None, seeds both ``random`` and ``np.random`` so the
            same goal is produced for the same seed.

    Returns:
        dict mapping each sampled domain to its goal, plus the key
        ``'domain_ordering'`` holding the shuffled list of domains.
    """
    if seed is not None:
        # Both generators are used below, so both are seeded.
        random.seed(seed)
        np.random.seed(seed)

    while True:
        ordering = list(nomial_sample(self.domain_ordering_dist))
        if ordering:
            break
    np.random.shuffle(ordering)

    user_goal = {}
    for dom in ordering:
        user_goal[dom] = self._get_domain_goal(dom)
    assert len(user_goal) > 0

    # No cross-domain adjustments are applied here (taxi commute between
    # places, matching area / day / people across domains, dropping a train
    # goal without query results). Earlier commented-out code for them
    # relied on self.dbquery, which this class does not create.

    user_goal['domain_ordering'] = ordering
    return user_goal
def _adjust_info(self, domain, info):
# adjust one of the slots of the info
adjusted_info = deepcopy(info)
slot = random.choice(list(info.keys()))
adjusted_info[slot] = random.choice(list(self.ind_slot_value_dist[domain]['info'][slot].keys()))
return adjusted_info
def build_message(self, user_goal, boldify=null_boldify):
message = []
state = deepcopy(user_goal)
for dom in user_goal['domain_ordering']:
state = deepcopy(user_goal[dom])
if not (dom == 'taxi' and len(state['info']) == 1):
# intro
m = [templates[dom]['intro']]
# info
def fill_info_template(user_goal, domain, slot, info):
if slot != 'area' or not ('restaurant' in user_goal and
'attraction' in user_goal and
info in user_goal['restaurant'].keys() and
info in user_goal['attraction'].keys() and
'area' in user_goal['restaurant'][info] and
'area' in user_goal['attraction'][info] and
user_goal['restaurant'][info]['area'] == user_goal['attraction'][info]['area']):
return templates[domain][slot].format(self.boldify(user_goal[domain][info][slot]))
else:
restaurant_index = user_goal['domain_ordering'].index('restaurant')
attraction_index = user_goal['domain_ordering'].index('attraction')
if restaurant_index > attraction_index and domain == 'restaurant':
return templates[domain][slot].format(self.boldify('same area as the attraction'))
elif attraction_index > restaurant_index and domain == 'attraction':
return templates[domain][slot].format(self.boldify('same area as the restaurant'))
return templates[domain][slot].format(self.boldify(user_goal[domain][info][slot]))
info = 'info'
if 'fail_info' in user_goal[dom]:
info = 'fail_info'
if dom == 'taxi' and len(state[info]) == 1:
taxi_index = user_goal['domain_ordering'].index('taxi')
places = [dom for dom in user_goal['domain_ordering'][: taxi_index] if
dom in ['attraction', 'hotel', 'restaurant']]
if len(places) >= 2:
random.shuffle(places)
m.append(templates['taxi']['commute'])
if 'arriveBy' in state[info]:
m.append('The taxi should arrive at the {} from the {} by {}.'.format(self.boldify(places[0]),
self.boldify(places[1]),
self.boldify(state[info]['arriveBy'])))
elif 'leaveAt' in state[info]:
m.append('The taxi should leave from the {} to the {} after {}.'.format(self.boldify(places[0]),
self.boldify(places[1]),
self.boldify(state[info]['leaveAt'])))
message.append(' '.join(m))
else:
while len(state[info]) > 0:
num_acts = random.randint(1, min(len(state[info]), 3))
slots = random.sample(list(state[info].keys()), num_acts)
sents = [fill_info_template(user_goal, dom, slot, info) for slot in slots if slot not in ['parking', 'internet']]
if 'parking' in slots:
sents.append(templates[dom]['parking ' + state[info]['parking']])
if 'internet' in slots:
sents.append(templates[dom]['internet ' + state[info]['internet']])
m.extend(sents)
message.append(' '.join(m))
m = []
for slot in slots:
del state[info][slot]
# fail_info
if 'fail_info' in user_goal[dom]:
adjusted_slot = list(filter(lambda x: x[0][1] != x[1][1],
zip(user_goal[dom]['info'].items(), user_goal[dom]['fail_info'].items())))[0][0][0]
if adjusted_slot in ['internet', 'parking']:
message.append(templates[dom]['fail_info ' + adjusted_slot + ' ' + user_goal[dom]['info'][adjusted_slot]])
else:
message.append(templates[dom]['fail_info ' + adjusted_slot].format(self.boldify(user_goal[dom]['info'][adjusted_slot])))
# reqt
if 'reqt' in state:
slot_strings = []
for slot in state['reqt']:
if slot in ['internet', 'parking', 'food']:
continue
slot_strings.append(slot if slot not in request_slot_string_map else request_slot_string_map[slot])
if len(slot_strings) > 0:
message.append(templates[dom]['request'].format(self.boldify(', '.join(slot_strings))))
if 'internet' in state['reqt']:
message.append('Make sure to ask if the hotel includes free wifi.')
if 'parking' in state['reqt']:
message.append('Make sure to ask if the hotel includes free parking.')
if 'food' in state['reqt']:
message.append('Make sure to ask about what food it serves.')