#!/eecs/research/asr/mingbin/python-workspace/hopeless/bin/python
# -*- coding: utf-8 -*-
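"""Convert TAC KBP source documents (.xml / .df / .nw) into tokenized sentences.

Each document is scanned tag by tag; the text between tags is sent to a
Stanford CoreNLP server for tokenization and sentence splitting.  For every
sentence the output records the tokens, their character offsets into the
original file and, when a gold-standard mention file is supplied, the gold
entity labels whose offsets fall on token boundaries.
"""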
import sys, codecs, logging, json, os, argparse, urllib, time, HTMLParser
from pycorenlp import StanfordCoreNLP
from pprint import pprint, pformat
from copy import deepcopy
from hanziconv import HanziConv
logger = logging.getLogger(__name__)
def process_one_file( input_dir, output_dir, filename, solution, language, sentence_only = False ):
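    """Parse one KBP source file and write the tokenized result to output_dir.

    Returns the number of gold mentions in `solution` that could not be
    matched to any token span of this file (0 if the file has no gold
    mentions).  Relies on the module-level names `nlp`, `html_parser` and
    `quote` that are set up in __main__.
    """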
# send the text to CoreNLP for annotation
properties = { 'annotators': 'tokenize,ssplit',
'outputFormat': 'json' }
if language == 'cmn':
properties['customAnnotatorClass.tokenize'] = 'edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator'
properties['tokenize.model'] = 'edu/stanford/nlp/models/segmenter/chinese/ctb.gz'
properties['tokenize.sighanCorporaDict'] = 'edu/stanford/nlp/models/segmenter/chinese'
properties['tokenize.serDictionary'] = 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz'
properties['tokenize.sighanPostProcessing'] = 'true'
        properties['ssplit.boundaryTokenRegex'] = urllib.quote_plus( '[!?]+|[。]|[!?]+' )
if language == 'spa':
properties['tokenize.language'] = 'es'
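    # bracket escapes emitted by the Penn-Treebank-style CoreNLP tokenizer;
    # they contain dashes but must never be split at them (see below)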
parsed_bracket = { '-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-' }
with codecs.open( os.path.join( input_dir, filename),
'rb', 'utf-8' ) as fp:
data = fp.read()
# filename = filename.rsplit('.', 1)[0]
# KBP2015
if filename[-7:] == '.df.xml' or filename[-7:] == '.nw.xml':
filename = filename[:-7]
elif filename[-4:] == '.xml':
filename = filename[:-4]
# KBP2016
elif filename[-3:] == '.df' or filename[-3:] == '.nw':
filename = filename[:-3]
out_file = codecs.open( os.path.join( output_dir, filename ),
'wb', 'utf-8' )
lsb = data.find( u'<DOC' )
if lsb == -1:
lsb = data.find( u'<doc' )
assert lsb >= 0, 'kbp files should start with <DOC> tag'
data = data[lsb:]
logger.debug( '<DOC> or <doc> starts at index %d' % lsb )
assert data.find( u'<' ) == 0, 'Begin of document is incorrect'
lsb = 0
rsb = data.find( u'>', lsb )
    assert rsb != -1, 'xml tag does not match'
tags = u'\n'
in_quote = False
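    # scan the document tag by tag: record every tag span, track <quote>
    # regions, pull post authors out of <post ...> tags, and send the text
    # between consecutive tags to CoreNLP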
while True:
assert data[lsb] == u'<'
assert data[rsb] == u'>'
        tags += (u'(%d,%d)' % (lsb, rsb + 1)).ljust(16) + data[lsb: rsb + 1] + u'\n'
# look for post author
tag = data[lsb: rsb + 1]
if tag.find( u'<quote' ) != -1:
in_quote = True
if tag.find( u'</quote' ) != -1:
in_quote = False
        post_offset = tag.find( u'<post' )
        if post_offset != -1:
            author_offset = tag.find( u'author', post_offset )
            if author_offset != -1:
begin_offset = tag.find( u'"', author_offset ) + 1
end_offset = tag.find( u'"', begin_offset )
post_au = tag[begin_offset: end_offset]
n_leading = len(post_au) - len(post_au.lstrip())
candidate = ( begin_offset + lsb + n_leading,
begin_offset + lsb + \
len(tag[begin_offset: end_offset].rstrip()) )
logger.debug( data[candidate[0]:candidate[1]] )
if filename in solution:
if candidate in solution[filename]:
tags += u'(%d,%d)' % candidate + \
u'(%s,%s,%s)\n' % tuple(solution[filename][candidate][1:])
solution[filename].pop( candidate )
                    else: # deal with mislabeled offsets
for low, high in solution[filename].keys():
if tag[begin_offset: end_offset] == \
solution[filename][(low,high)][0] \
and candidate[0] <= high and low <= candidate[1]:
tags += u'(%d,%d)' % (low, high) + \
u'(%s,%s,%s)\n' % tuple(solution[filename][(low,high)][1:])
solution[filename].pop( (low,high) )
                # post authors are detected during preprocessing;
                # this line is part of the final output
tags += u'(%d,%d) ' % candidate + data[candidate[0]:candidate[1]] + u'\n'
tags += u'\n'
        # advance to the next tag
next_lsb = data.find( u'<', rsb + 1 )
if next_lsb == -1:
break
next_rsb = data.find( u'>', next_lsb )
position = rsb + 1
text = data[position: next_lsb]
text = text.replace(u'/', u' ')#.replace(u'-', u' ')
# escape %, CoreNLP might complain
text = text.replace( u'%', u'%25' )
# text = urllib.quote( text )
if len( text.strip() ) > 0:
# check whether the text under consideration is in quote region
is_in_quote = False
if filename in quote:
for begin, end in quote[filename]:
if begin <= position < next_lsb <= end:
is_in_quote = True
break
if is_in_quote or in_quote:
lsb, rsb = next_lsb, next_rsb
continue
n_leading_whitespace = len(text) - len(text.lstrip())
text = text[n_leading_whitespace:]
position += n_leading_whitespace
logger.debug( text )
try:
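                # brief pause so the CoreNLP server is not flooded with requests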
time.sleep( 0.0128 )
if language == 'cmn':
text = HanziConv.toSimplified( text )
parsed = nlp.annotate( text, properties = properties )
assert isinstance( parsed, dict ), 'CoreNLP does not return well-formed json'
for sentence in parsed[u'sentences']:
# words = [ tokens[u'word'] for tokens in sentence[u'tokens'] ]
# offsets = [ ( tokens[u'characterOffsetBegin'] + position, \
# tokens[u'characterOffsetEnd'] + position ) \
# for tokens in sentence[u'tokens'] ]
extra = u''
words, offsets = [], []
for tokens in sentence[u'tokens']:
if language != 'cmn':
word = tokens[u'word']
begin_offset = tokens[u'characterOffsetBegin']
# print tokens[u'word'], tokens[u'characterOffsetBegin'], tokens[u'characterOffsetEnd']
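                            # CoreNLP keeps hyphenated words as single tokens;
                            # split them on '-' so offsets line up with gold
                            # mention boundaries (bracket escapes are exempt)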
while True:
next_dash = word.find( u'-' )
if word in parsed_bracket:
next_dash = -1
if next_dash != -1:
# print word, next_dash
if next_dash != 0:
words.append( html_parser.unescape( word[:next_dash] ) )
words.append( u'-' )
end_offset = begin_offset + next_dash
if next_dash != 0:
offsets.append( ( begin_offset + position,
end_offset + position) )
offsets.append( ( end_offset + position,
end_offset + 1 + position) )
word = word[next_dash + 1:]
begin_offset += next_dash + 1
else:
                                    if len(word) > 0: # skip the empty remainder when a dash ends the token
words.append( html_parser.unescape( word ) )
offsets.append( ( begin_offset + position,
tokens[u'characterOffsetEnd'] + position ) )
break
else:
# extra += u'(%d,%d,%s) ' % ( tokens[u'characterOffsetBegin'] + position,
# tokens[u'characterOffsetEnd'] + position,
# tokens[u'word'] )
word = tokens[u'word']
if not sentence_only:
begin_offset = tokens[u'characterOffsetBegin']
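                                # emit one token per CJK character, keeping the
                                # original segmenter word after the '|iNCML|' marker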
has_chinese = any( u'\u4e00' <= c <= u'\u9fff' for c in word )
if has_chinese:
for i,c in enumerate( word ):
words.append( u'%s|iNCML|%s' % (c, html_parser.unescape(word)) )
offsets.append( ( begin_offset + position + i,
begin_offset + position + i + 1 ) )
else:
words.append( u'%s|iNCML|%s' % (word, html_parser.unescape(word)) )
offsets.append( ( begin_offset + position,
tokens[u'characterOffsetEnd'] + position ) )
else:
words.append( html_parser.unescape(word) )
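                    # tokens are written space-separated, so any space inside a
                    # token is temporarily replaced with NUL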
for i in xrange(len(words)):
words[i] = words[i].replace( u' ', u'\0' )
assert sentence_only or len(words) == len(offsets)
for word, offset in zip( words, offsets ):
if word != data[offset[0]:offset[1]]:
logger.debug( u'%s, %s, (%d,%d)' % \
( word, data[offset[0]:offset[1]],
offset[0], offset[1] ) )
label = []
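                    # try every token span (i, j) against the gold mentions;
                    # the comparison ignores case and internal whitespace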
if filename in solution:
for i in xrange(len(offsets)):
for j in xrange(i, len(offsets)):
candidate = (offsets[i][0], offsets[j][1])
joint_str = u''.join( data[candidate[0]:candidate[1]].split() ).lower()
# logger.debug( u'%s, %d, %d' % (joint_str, candidate[0], candidate[1]) )
if candidate in solution[filename]:
sol_str = u''.join(solution[filename][candidate][0].split() ).lower()
if joint_str == sol_str:
label.append( u'(%d,%d,%s,%s,%s)' % \
tuple([i, j + 1] + solution[filename][candidate][1:]) )
solution[filename].pop( candidate )
                                else: # deal with mislabeled offsets
for low, high in solution[filename].keys():
sol_str = u''.join(solution[filename][(low,high)][0].split()).lower()
if joint_str == sol_str and \
candidate[0] <= high and low <= candidate[1]:
label.append( u'(%d,%d,%s,%s,%s)' % \
tuple([i, j + 1] + solution[filename][(low,high)][1:]) )
solution[filename].pop( (low,high) )
if sentence_only:
out_file.write( u' '.join(words) + u'\n' )
else:
out_file.write( u' '.join( words ) + u'\n' + \
u' '.join( u'(%d,%d)' % (b,e) for b,e in offsets ) + u'\n' )
# if len(extra) > 0:
# out_file.write( extra + u'\n' )
if len(label) > 0:
out_file.write( u' '.join( label ) + u'\n' )
out_file.write( u'\n' )
            except Exception:
                logger.exception( filename )
                logger.error( text )
lsb, rsb = next_lsb, next_rsb
if not sentence_only:
out_file.write( tags )
non_match = u'\n'
if filename in solution:
for key,value in sorted(solution[filename].iteritems(), key = lambda x: x[0] ):
non_match += u'%s, %s %s %s %s\n' % ( unicode(key),
value[0], value[1], value[2], value[3] )
out_file.write( non_match )
if len(solution[filename]) > 0:
# logger.info( u'%s non-match ones: \n' % filename + \
# pformat( solution[filename], width = 128 ) )
            logger.info( u'%s unmatched gold mentions: ' % filename + non_match )
if not sentence_only:
out_file.write( u'\n\n' + u'=' * 128 + u'\n\n\n' + data )
out_file.close()
logger.info( '%s processed\n' % filename )
# if len( solution[filename] ) > 0:
# exit(0)
return len( solution[filename] ) if filename in solution else 0
def process_all_files( input_dir, output_dir, solution, language, sentence_only = False ):
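    """Recursively process every file under input_dir.

    Returns the total number of gold mentions that could not be matched.
    """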
n_fail = 0
files = os.listdir( input_dir )
for filename in files:
full_name = os.path.join( input_dir, filename )
if os.path.isdir( full_name ):
n_fail += process_all_files( full_name, output_dir, solution, language, sentence_only )
else:
n_fail += process_one_file( input_dir, output_dir, filename, solution, language, sentence_only )
return n_fail
if __name__ == '__main__':
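    # example invocation (paths are hypothetical):
    #   ./kbp-xml-parser.py ./kbp-source ./kbp-parsed \
    #       --mention tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab \
    #       --quote quote_regions.tab --language cmn --url http://localhost:2054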
parser = argparse.ArgumentParser()
parser.add_argument( 'input_dir', type = str,
help = 'directory containing kbp source xml' )
parser.add_argument( 'output_dir', type = str,
help = 'directory containing parsed sentences' )
parser.add_argument( '--mention', type = str, default = None,
help = 'tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab' )
parser.add_argument( '--quote', type = str, default = None,
help = 'quote_regions.tab' )
parser.add_argument( '--verbose', action = 'store_true', default = False )
parser.add_argument( '--language', type = str, default = 'eng',
choices = [ 'eng', 'cmn', 'spa' ] )
parser.add_argument( '--url', type = str, default = 'http://localhost:2054' )
parser.add_argument( '--sentence_only', action = 'store_true', default = False )
args = parser.parse_args()
logging.basicConfig( format = '%(asctime)s : %(levelname)s : %(message)s',
level = logging.DEBUG if args.verbose else logging.INFO )
for handler in logging.root.handlers:
handler.addFilter( logging.Filter(__name__) )
logger.info( args )
# load the solutions
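    # the gold file is tab-separated; as parsed below, column 2 holds the
    # mention string, column 3 holds 'docid:begin-end' (inclusive end), and
    # columns 4-6 are carried through verbatim into the output labels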
solution, correct, incorrect = {}, 0, 0
gold_tab = args.mention
if gold_tab is not None:
incorrect_info = u'\n'
with codecs.open( gold_tab, 'rb', 'utf-8' ) as fp:
for line in fp:
# traditional chinese is converted to simplified chinese
# CJK space is converted to ascii space
tokens = HanziConv.toSimplified( line ).split( u'\t' )
if len(tokens) > 6:
filename, offset = tokens[3].split(u':')
begin_offset, end_offset = [ int(x) for x in offset.split(u'-') ]
if filename not in solution:
solution[filename] = {}
solution[filename][(begin_offset, end_offset + 1)] = [tokens[2], tokens[4], tokens[5], tokens[6]]
if end_offset + 1 - begin_offset != len(tokens[2]):
incorrect_info += u'%s %s %d %d\n' % \
( filename, tokens[2], begin_offset, end_offset + 1 )
incorrect += 1
else:
correct += 1
logger.debug( incorrect_info )
logger.info( '%d correct, %d incorrect\n' % (correct, incorrect) )
# load the quote regions
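    # each line of the quote file: docid<TAB>begin<TAB>end, with an inclusive
    # end offset that is converted to exclusive below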
quote = {}
quote_tab = args.quote
if quote_tab is not None:
with codecs.open( quote_tab, 'rb', 'utf-8' ) as fp:
for line in fp:
filename, begin_offset, end_offset = line.strip().split(u'\t')
filename = filename.split(u'.')[0]
begin_offset, end_offset = int(begin_offset), int(end_offset) + 1
if filename not in quote:
quote[filename] = set()
quote[filename].add( (begin_offset,end_offset) )
# setup CoreNLP
nlp = StanfordCoreNLP( args.url )
html_parser = HTMLParser.HTMLParser()
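    # nlp, html_parser, quote and solution are module-level names referenced
    # inside process_one_file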
n_fail = process_all_files( args.input_dir, args.output_dir, solution, args.language, args.sentence_only )
    logger.info( u'unmatched gold mentions: %d' % n_fail )