/
negation_finder_flickr30k.py
55 lines (46 loc) · 1.88 KB
/
negation_finder_flickr30k.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import csv
import glob
import json
from collections import defaultdict
# Negation cues that are matched as whole tokens.
# Cue classes that are currently switched off are kept below as comments.
# ADJECTIVES = {"absent", "away", "clear", "deprived", "devoid", "free", "removed", "stripped", "vanished"}
# ADVERBS = {"barely", "hardly", "scarcely"}
FREE_NEG = {
    "not",
    "n't",
}
NO_NEG = {
    "never",
    "no",
    "none",
    "nothing",
    "nobody",
    "nowhere",
    "nor",
    "neither",
}
# Left out of the prepositions on purpose: "except", "from", "out", "off".
PREPOSITIONS = {
    "without",
    "sans",
    "minus",
}
# NPIS = {"any","anything"} # to make sure we don't miss anything.
# Switched off: affixed negations loaded from a yes/no annotated word list.
# with open('./negations.csv') as f:
#     reader = csv.reader(f)
#     AFFIXED = {word for word, yes_no in reader if yes_no == 'yes'}

# Negation cues that are special cases.
# PREFIXES = {"a", "dis", "in", "im", "non", "un"}
# SUFFIXES = {"less"}
# Verb stems; kept out of TO_MATCH because they are not whole-token cues.
VERBS = {
    "lack",
    "omit",
    "miss",
    "fail",
}

# Every cue that is matched as a whole token.
TO_MATCH = set().union(FREE_NEG, NO_NEG, PREPOSITIONS)

# Translation table that deletes the closing bracket of an annotation.
TABLE = str.maketrans({"]": None})
def lines_in_doc(doc):
    """Yield every line of an annotated caption file as a list of tokens.

    For each line, the characters listed in ``TABLE`` are deleted, the line
    is split on whitespace, tokens starting with ``[/EN`` (the opening tag
    of an annotation) are dropped, and the remaining tokens are lowercased.

    Args:
        doc: Path of the text file to read.

    Yields:
        One list of lowercased tokens per line of the file; a blank line
        yields an empty list.
    """
    # Decode explicitly instead of relying on the platform's locale default,
    # so the tokens are the same on every machine.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(doc, encoding="utf-8") as f:
        for line in f:
            yield [
                word.lower()
                for word in line.translate(TABLE).split()
                if not word.startswith('[/EN')
            ]
def lines_containing_negation():
    """Yield each caption that contains a negation cue, as a single string.

    Every file matching ``Flickr30k/*.txt`` is read with ``lines_in_doc``.
    A line qualifies when at least one of its tokens is in ``TO_MATCH``, or
    when at least one token starts with one of the stems in ``VERBS``.

    Each qualifying line is yielded exactly once per occurrence in the
    corpus, no matter how many cues it contains, so the number of yielded
    items equals the number of matching captions.

    Yields:
        The tokens of a matching line joined with single spaces.
    """
    # str.startswith accepts a tuple, which tests all stems in one call.
    # The prefix test is deliberately loose and therefore also fires on
    # unrelated words sharing a stem (e.g. "mission" starts with "miss").
    verb_stems = tuple(VERBS)
    for doc in glob.glob('Flickr30k/*.txt'):
        for line in lines_in_doc(doc):
            bag_of_words = set(line)
            has_cue_word = bool(TO_MATCH & bag_of_words)
            if has_cue_word or any(
                word.startswith(verb_stems) for word in bag_of_words
            ):
                yield ' '.join(line)
# Scan the corpus before touching the output file: opening it for writing
# truncates it, so a failure during the scan would otherwise wipe the
# result of a previous run.
negation_sents = list(lines_containing_negation())
unique_sents = set(negation_sents)
# "Tokens" counts every matching caption, "Types" the distinct ones.
print("Tokens:", len(negation_sents))
print("Types:", len(unique_sents))
# Write in sorted order so the file is identical from run to run (set
# iteration order is not), with an explicit encoding so the bytes written
# do not depend on the platform default.
with open('captions_flickr30k.txt', 'w', encoding='utf-8') as f:
    for line in sorted(unique_sents):
        f.write(line + '\n')