# filter.py
import json
# Phrases looked up by check_filter_words(); the main loop drops a sentence
# containing one of these unless it also mentions both "left image" and
# "right image".
filter_words = [
    "each image",
    "both images",
    "at least",
    "at most",
    "all of the animals",
    "every of the animals",
    "all of the objects",
    "each of the objects",
]

# Tokens accepted as a count word by check_there_be() and
# check_number_start(); the articles "an" and "a" are included.
number_words = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "an",
    "a",
]

# Phrases looked up by check_good_words(); a sentence containing any of them
# satisfies the main loop's keep condition.
good_words = [
    "left image",
    "right image",
    "lefthand image",
    "righthand image",
    "image on the right",
    "image on the left",
    "every photo",
    "leftmost",
    "rightmost",
    "image to the left",
    "on the right",
    "on the left",
    "left photo",
    "right photo",
    "left pic",
    "right pic",
    "an image",
    "one of the images",
    "one image",
]

# Adverbs allowed directly in front of a count word ("exactly two ...").
advs = [
    "exactly",
    "only",
    "strictly",
]

# Count words accepted after "there is" by check_there_be().
singlar_number_words = [
    "a",
    "an",
    "one",
]
def check_filter_words(sent: str) -> bool:
    """Return True when `sent` contains at least one `filter_words` phrase.

    Matching is a plain substring test, so a phrase also matches inside a
    longer word or across punctuation.
    """
    return any(phrase in sent for phrase in filter_words)
def check_good_words(sent: str) -> bool:
    """Tell whether any phrase from `good_words` occurs in `sent`.

    Uses substring containment (`phrase in sent`), not whole-word matching.
    """
    no_phrase_found = all(phrase not in sent for phrase in good_words)
    return not no_phrase_found
def check_there_be(sent: str) -> bool:
    """Detect "there are/is <count> ..." openings.

    Returns True when:
      * `sent` starts with "there are" and its third whitespace-separated
        token is in `number_words`, or the third token is in `advs` and the
        fourth is in `number_words`;
      * `sent` starts with "there is" and its third token is in
        `singlar_number_words`.

    Sentences with fewer than four tokens are always rejected.
    """
    tokens = sent.split()
    if len(tokens) < 4:
        return False

    third, fourth = tokens[2], tokens[3]

    if sent.startswith("there are"):
        if third in number_words:
            return True
        # Otherwise allow one adverb between "there are" and the count word.
        return third in advs and fourth in number_words

    if sent.startswith("there is"):
        return third in singlar_number_words

    return False
def check_number_start(sent: str) -> bool:
    """Report whether `sent` opens with a count word.

    Accepted openings, based on whitespace-separated tokens:
      * "<number> ..."          - first token is in `number_words`;
      * "<adverb> <number> ..." - first token is in `advs` and the second is
        in `number_words`.

    Sentences with fewer than two tokens are rejected.

    The previous version tested the *second* token in the fallback branch,
    which made the adverb branch redundant and accepted any sentence whose
    second word was a count word; the fallback now tests the first token,
    matching the adverb/number pattern used by check_there_be().
    """
    words = sent.split()
    if len(words) < 2:
        return False

    first, second = words[0], words[1]
    if first in number_words:
        return True
    return first in advs and second in number_words
# Filter the line-delimited JSON in train.json. Every kept record is written
# verbatim to train_out.json and as one tab-separated row to out.tsv.
# The files are opened in a single `with` so all three are flushed and closed
# even if a record fails to parse part-way through.
with open("train.json", "r") as file, \
        open("train_out.json", "w") as outfile, \
        open("out.tsv", "w") as outtsv:
    for line in file:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON; skip it instead of crashing in json.loads().
        if not line.strip():
            continue
        datum = json.loads(line)
        # All phrase lists are lower-case, so match against a lowered copy.
        sent = datum["sentence"].lower()
        # Drop records whose label is the string "False".
        if datum["label"] == "False":
            continue
        # A filter phrase drops the sentence unless it mentions both
        # "left image" and "right image".
        if check_filter_words(sent):
            if not ("left image" in sent and "right image" in sent):
                continue
        # Keep only sentences that match at least one of the three patterns.
        if not (check_good_words(sent) or check_there_be(sent) or check_number_start(sent)):
            continue
        outfile.write(line)
        outtsv.write(f"{datum['identifier']}\t{datum['sentence']}\t{datum['left_url']}\t{datum['right_url']}\t{datum['label']}\n")