-
Notifications
You must be signed in to change notification settings - Fork 1
/
cloze_preprocessing.py
105 lines (87 loc) · 4.12 KB
/
cloze_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
import gzip
import json
import pdb
import operator
import string
import re
# Immutable set of lowercase tokens treated as stop words: common English
# function words plus a few extra tokens ('tbd', 'inc', 'll').
# NOTE(review): not referenced by the code visible in this module; presumably
# consumed by importers -- confirm before removing.
STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'by',
    'can',
    'for', 'from',
    'have',
    'if', 'in', 'is', 'it',
    'may',
    'not',
    'of', 'on', 'or',
    'tbd', 'that', 'the', 'this', 'to',
    'us',
    'we', 'when', 'will', 'with',
    'yet', 'you', 'your',
    'inc', 'll', 'i',
])
def normalize_answer(s):
    """
    Normalize an answer string for comparison.

    The steps run in this order: lowercase, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs to single spaces and
    strip the ends.

    :param s: input string
    :return: normalized string
    """
    punctuation = set(string.punctuation)

    lowered = s.lower()
    # Punctuation goes before article removal, so "the-end" becomes "theend"
    # and its "the" is no longer a standalone word.
    unpunctuated = ''.join([ch for ch in lowered if ch not in punctuation])
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)
    tokens = without_articles.split()
    return ' '.join(tokens)
def cleanText(text):
    """
    Map a fixed list of higher frequency category names to names that
    will not be split by a tokenizer, then replace every run of non-word
    characters with a single space.

    The input is lowercased first, and possessive ``'s`` suffixes are
    replaced by a space. A term is only rewritten when it is delimited on
    both sides by a non-word character or by the start/end of the string.

    This is a plain module-level function: a bare ``@staticmethod`` object
    at module level is not callable before Python 3.10, so no decorator is
    applied.

    :param text: input string
    :return: cleaned string
    """
    # (regex for the term itself, tokenizer-safe replacement); applied in
    # this order.
    protected_terms = [
        (r"[.]net", "dotnet"),
        (r"c[+][+]", "cplusplus"),
        (r"objective[-]c", "objectivec"),
        (r"node[.]js", "nodedotjs"),
        (r"asp[.]net", "aspdotnet"),
        (r"e[-]commerce", "ecommerce"),
        (r"java[-]ee", "javaee"),
        (r"32[-]bit", "32bit"),
        (r"kendo[-]ui", "kendoui"),
        (r"jquery[-]ui", "jqueryui"),
        (r"c[+][+]11", "cplusplus11"),
        (r"windows[-]8", "windows8"),
        (r"ip[-]address", "ipaddress"),
        (r"backbone[.]js", "backbonedotjs"),
        (r"angular[.]js", "angulardotjs"),
        (r"as[-]if", "asif"),
        (r"actionscript[-]3", "actionscript3"),
        (r"[@]placeholder", "atsymbolplaceholder"),
    ]

    return_text = re.sub(r"'s\b", ' ', text.lower())
    for term, safe_name in protected_terms:
        # The right-hand delimiter is a lookahead rather than a consumed
        # group, so one separator can close this match and open the next
        # ("c++ c++" rewrites both occurrences). The explicit \g<1> form
        # keeps the group reference unambiguous before digits ("32bit").
        return_text = re.sub(r"(^|\W)" + term + r"(?=\W|$)",
                             r"\g<1>" + safe_name,
                             return_text)
    # Whatever punctuation/whitespace is left collapses to single spaces.
    return re.sub(r"\W+", " ", return_text)
def _convert_split(contexts_path, questions_path, output_path, testing_path=None):
    """
    Flatten one gzipped contexts/questions pair into a tab-separated file.

    Both inputs are read line by line in lockstep, each line holding one
    JSON object. For every pair the output receives a
    ``question<TAB>uid<TAB>answer`` line followed by one line per entry of
    the context's ``"contexts"`` list, written as ``entry[1]<TAB>str(entry[0])``.
    When ``testing_path`` is given, a ``uid<TAB>answer`` line is also written
    there for every pair.

    :param contexts_path: gzipped JSON-lines file with "uid" and "contexts"
    :param questions_path: gzipped JSON-lines file with "uid", "question"
        and "answer"
    :param output_path: text file to create (UTF-8)
    :param testing_path: optional second text file to create (UTF-8)
    :raises ValueError: if the questions file runs out of lines before the
        contexts file, or if the uids of a pair differ
    """
    import io  # function-scope import: io.open takes an explicit encoding

    testing_out = None
    with gzip.open(contexts_path, 'rb') as contexts_in, \
            gzip.open(questions_path, 'rb') as questions_in, \
            io.open(output_path, 'w', encoding='utf-8') as out:
        try:
            if testing_path is not None:
                testing_out = io.open(testing_path, 'w', encoding='utf-8')
            for raw_context in contexts_in:
                raw_question = questions_in.readline()
                if not raw_question:
                    raise ValueError('%s has fewer lines than %s'
                                     % (questions_path, contexts_path))
                context = json.loads(raw_context.decode('utf-8'))
                question = json.loads(raw_question.decode('utf-8'))
                # Explicit check instead of assert, which is stripped by -O.
                if context["uid"] != question["uid"]:
                    raise ValueError('uid mismatch between %s and %s: %r != %r'
                                     % (contexts_path, questions_path,
                                        context["uid"], question["uid"]))
                out.write(question["question"] + '\t' + question["uid"]
                          + '\t' + question["answer"] + '\n')
                for entry in context['contexts']:
                    out.write(entry[1] + '\t' + str(entry[0]) + '\n')
                if testing_out is not None:
                    testing_out.write(question["uid"] + '\t'
                                      + question["answer"] + '\n')
        finally:
            if testing_out is not None:
                testing_out.close()


def prepCloze():
    """
    Convert the dev, test and train splits under ``data/`` to flat text.

    Reads ``data/<split>_contexts.json.gz`` and
    ``data/<split>_questions.json.gz`` and writes ``data/<split>.json``
    (tab-separated text despite the extension). The dev and test splits
    additionally get ``data/<split>_testing.txt`` with one
    ``uid<TAB>answer`` line per question; the train split does not.

    :raises ValueError: if a contexts/questions pair is misaligned
    """
    file_contexts = ['data/dev_contexts.json.gz', 'data/test_contexts.json.gz', 'data/train_contexts.json.gz']
    file_questions = ['data/dev_questions.json.gz', 'data/test_questions.json.gz', 'data/train_questions.json.gz']
    filenames_w = ['data/dev.json', 'data/test.json', 'data/train.json']
    # This list must exist: the dev/test branches write to it. None marks
    # the train split, which has no testing file.
    file_testing = ['data/dev_testing.txt', 'data/test_testing.txt', None]

    for paths in zip(file_contexts, file_questions, filenames_w, file_testing):
        _convert_split(*paths)
    print('preprossing finished!')


if __name__ == "__main__":
    prepCloze()