-
Notifications
You must be signed in to change notification settings - Fork 24
/
prepare_data.py
107 lines (84 loc) · 3.75 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- coding: utf-8 -*-
"""
Created on Tue May 15 23:27:42 2018
@author: batman
"""
import json
import re
import numpy as np
# The six intent classes (apparently the SNIPS NLU benchmark — TODO confirm);
# one raw JSON file per intent is expected under data/raw_json_data/<intent>/.
intent_types = [
'AddToPlaylist','BookRestaurant','GetWeather','RateBook','SearchCreativeWork','SearchScreeningEvent'
]
# Function for text preprocessing
def clean_str(string):
    """Tokenize and normalize a raw utterance string.

    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps: drop characters outside [A-Za-z0-9(),!?'`], split off English
    clitics ('s, 've, n't, 're, 'd, 'll) as separate tokens, pad
    punctuation with spaces, collapse whitespace, and lowercase.

    Parameters
    ----------
    string : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lowercased, single-space-separated token string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # BUGFIX: the replacements were " \( ", " \) ", " \? ".  In a re.sub
    # replacement template "\(" is not a valid escape, so the backslash was
    # passed through literally and leaked into the cleaned text
    # (e.g. "(a)" -> "\( a \)").  Plain characters are the intended output.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
### Process training and test data
def process_data():
    """Build the processed intent-classification dataset.

    For each intent in ``intent_types``, reads the raw JSON split files
    under ``data/raw_json_data/<intent>/``, concatenates each utterance's
    text chunks, cleans them with :func:`clean_str`, writes one text file
    per intent under ``data/processed_data/``, and finally pickles the
    combined train/test text and label arrays to ``data/*.npy`` via
    ``numpy.ndarray.dump``.

    Side effects only; returns None.
    """

    def _process_split(in_template, out_suffix):
        # Process one split (train or validate) for every intent.
        # ``in_template`` is formatted with the intent name to form the raw
        # JSON filename; ``out_suffix`` names the per-intent output file.
        # Returns (sentences, labels) accumulated across all intents.
        texts = []
        labels = []
        for intent in intent_types:
            path = "data/raw_json_data/" + intent + "/" + in_template.format(intent)
            with open(path, encoding='utf-8') as f:
                data = json.load(f)
            sent_list = []
            for entry in data[intent]:
                # Each utterance is stored as a list of text chunks
                # (slot / non-slot pieces); join them into one string.
                txt = ''.join(chunk['text'] for chunk in entry['data'])
                sent_list.append(clean_str(txt))
            with open("data/processed_data/" + intent + out_suffix,
                      mode="w", encoding='utf-8') as outfile:
                for s in sent_list:
                    outfile.write("%s\n" % s)
            texts.extend(sent_list)
            labels.extend([intent] * len(sent_list))
        return texts, labels

    train_list, train_label_list = _process_split("train_{}_full.json", "_train.txt")
    test_list, test_label_list = _process_split("validate_{}.json", "_test.txt")

    # how to save list as array: np.array(myList).dump(open('array.npy', 'wb'))
    # how to load an array: myArray = np.load(open('array.npy', 'rb'))
    # NOTE: ndarray.dump pickles the array.  The original code leaked the
    # file handles; use `with` so each file is flushed and closed.
    dumps = (
        ('data/train_text.npy', train_list),
        ('data/train_label.npy', train_label_list),
        ('data/test_text.npy', test_list),
        ('data/test_label.npy', test_label_list),
    )
    for path, payload in dumps:
        with open(path, 'wb') as out:
            np.array(payload).dump(out)
# Script entry point: regenerate all processed data files.
if __name__ == '__main__':
    process_data()