# make_semeval_answer_file.py
import pandas as pd
import argparse
import os
import sys
# Per-language cut-off applied to a word's score when producing the binary
# task1 label: scores below the cut-off are written as 0, all others as 1.
THRESHOLDS = {
    'english': 0.3,
    'german': 0.43,
    'latin': 0.4,
    'swedish': 0.4,
}
LANGUAGES = ['english', 'latin', 'swedish', 'german']
METHODS = ['aff_prop', 'kmeans_5', 'kmeans_7', 'averaging']


def _read_target_words(target_file):
    """Return the whitespace-stripped, non-empty lines of *target_file*.

    Blank lines (for example a trailing empty line) are skipped so they are
    not treated as target words.
    """
    with open(target_file, 'r', encoding='utf-8') as handle:
        stripped = (line.strip() for line in handle)
        return [word for word in stripped if word]


def _score_for(results, word, method_name):
    """Return the score in column *method_name* for the row whose 'word' is *word*.

    Raises:
        ValueError: if *word* matches zero rows or more than one row, since
            no single score can then be chosen for it.
        KeyError: if the results table has no 'word' or *method_name* column.
    """
    matches = results.loc[results['word'] == word, method_name]
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one row for target word {word!r} in the "
            f"results file, found {len(matches)}"
        )
    # Take the single element explicitly; calling float() on a Series is
    # deprecated in pandas.
    return float(matches.iloc[0])


def _write_rows(path, rows):
    """Write each (word, value) pair to *path* as 'word<TAB>value' lines.

    The parent directory is created if needed. Every row, including the
    last one, is terminated by a newline.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(f"{word}\t{value}\n" for word, value in rows)
    print("Done writing", path, "!")


def main(argv=None):
    """Write the task1 and task2 answer files for one language.

    Reads a tab-separated results file and a list of target words, then
    writes ``answer/task1/<language>.txt`` (0/1 label obtained by comparing
    the score with the language threshold) and
    ``answer/task2/<language>.txt`` (the raw score).

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: with status 1 when the method or language is not valid.
        ValueError: when a target word does not match exactly one result row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", default='english', const='all', nargs='?',
                        help="Choose a language", choices=['english', 'latin', 'swedish', 'german'])
    parser.add_argument("--results_file", default="semeval_results/results_english.csv", type=str,
                        help="Path to csv file with results")
    parser.add_argument("--method", default='aff_prop', type=str,
                        help="CLustering method for which to write results-should match a column in the results file")
    parser.add_argument("--target_path", default='data/english/targets.txt', type=str,
                        help="Path to target files")
    args = parser.parse_args(argv)

    method_name = args.method
    lang = args.language

    # Exit with a non-zero status so calling scripts can detect the failure.
    if method_name not in METHODS:
        print("Method not valid, valid choices are: ", ", ".join(METHODS))
        sys.exit(1)
    if lang not in LANGUAGES:
        print("Language not valid, valid choices are: ", ", ".join(LANGUAGES))
        sys.exit(1)

    print("Language:", lang.upper())

    clustering_df = pd.read_csv(args.results_file, sep="\t")
    target_words = _read_target_words(args.target_path)
    thresh = THRESHOLDS[lang]

    # Resolve every score once, before any output file is opened, so a
    # missing word cannot leave a half-written answer file behind.
    scores = [(word, _score_for(clustering_df, word, method_name))
              for word in target_words]

    labels = [(word, 0 if score < thresh else 1) for word, score in scores]
    _write_rows(f"answer/task1/{lang}.txt", labels)
    _write_rows(f"answer/task2/{lang}.txt", scores)


if __name__ == '__main__':
    main()