-
Notifications
You must be signed in to change notification settings - Fork 1
/
task_predict.py
122 lines (108 loc) · 4.07 KB
/
task_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
'''
Created on 8 Feb 2010
@author: paul
'''
import copy
import cPickle
import random
import parse_semeval
import math
from parse_semeval import Paraphrase
def make_priors_freq(all_pairs):
"""
computes the prior probability of a given paraphrase occuring. This is simply
a count of the number of times it occurs in the whole dataset. The
paraphrase must have been produced by at least n annotators to count as a
valid occurence for a given compound.
"""
priors={}
for pair in all_pairs:
for p in pair.paraphrases:
if p.name in priors:
priors[p.name]+=(p.freq)
else:
priors[p.name]=(p.freq)
return priors
def make_priors(all_pairs):
"""
computes the prior probability of a given paraphrase occuring. This is simply
a count of the number of times it occurs in the whole dataset. The
paraphrase must have been produced by at least n annotators to count as a
valid occurence for a given compound.
"""
#parameter: paraphrase must have been mentioned by at least n annotators
priors={}
for pair in all_pairs:
for p in pair.paraphrases:
if p.name in priors:
priors[p.name]+=(1.0)
else:
priors[p.name]=(1.0)
return priors
def make_prob_table(all_pairs,priors):
"""
computes the conditional probability of a paraphrase, i.e. the probability
of one paraphrase occurring in the same compound as another paraphrase.
For two paraphrases A and B, the probability of A occurring given that B
occurs for the same compound is a count of the number of times that A has
occurred with B in all other compounds, divided by the number of times that
B has occurred overall
"""
#parameter: paraphrase must have been mentioned by at least n annotators
cooc={}
#initialize cooccurrence dictionary
for x in priors.keys(): cooc[x]={}
counter=0
#for each paraphrase, count its cooccurrences with all other paraphrases
for compound in all_pairs:
counter+=1
#make a list of paraphrases for this compound
currentParas=[]
for x in compound.paraphrases:
currentParas.append(x)
i=0
while(i<len(currentParas)):
j=0
a=currentParas[i].name
while(j<len(currentParas)):
#don't count co-occurrence of paraphrase with itself
if j==i:
j+=1
continue
b=currentParas[j]
if b.name in cooc[a]: cooc[a][b.name]+=(1)
else: cooc[a][b.name]=(1.0)
j+=1
i+=1
#probabilities are coocurrences divided by prior probability
probs={}
for x in cooc.keys(): probs[x]={}
for a in cooc.keys():
for b in cooc.keys():
if b in cooc[a]:
probs[a][b]=(cooc[a][b]) / ( (priors[b]) * (priors[a]**0) )
#print probs[a][b]
else:
probs[a][b]=0.0
return probs
if __name__=="__main__":
n=2
out_file=open("/home/paul/mayThesis/semEvalTask9/SemEval2_task9_all_data_final/SemEval2_task9_scorer/out.txt",'w')
train_file=open("/home/paul/mayThesis/semEvalTask9/combined.txt")
test_file=open("/home/paul/mayThesis/semEvalTask9/testing.txt")
test_file=open("/home/paul/mayThesis/semEvalTask9/testing.txt")
test_pairs=parse_semeval.parse_file(test_file, n)
all_pairs=parse_semeval.parse_file(train_file, n)
priors=make_priors(all_pairs)
probs=make_prob_table(all_pairs,priors)
for pair in test_pairs:
candidates=copy.copy(pair.paraphrases)
for para in pair.paraphrases:
for c in candidates:
if para==c:continue
para.score+=probs[para.name][c.name]
pair.paraphrases=sorted(pair.paraphrases, key=lambda x: x.score, reverse=True)
i=0
for p in pair.paraphrases:
i+=1
out_file.write(str(i)+ " "+pair.n1 + " " + pair.n2 + " "+ p.name + " "+str(p.score) + "\n")