forked from jsfenfen/whatwordwhere
-
Notifications
You must be signed in to change notification settings - Fork 5
/
parse_utils.py
117 lines (95 loc) · 5.92 KB
/
parse_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
""" Utility functions to help parse hocr data. Django-independent. """
from lxml import etree, objectify
from lxml.etree import tostring
from StringIO import StringIO
from parse_errors import PageCountError
# The source data can be amazingly awful. Try to recover as much as possible.
# We could do a better job of this, maybe, by assuming only certain input characters were legal.
# Crazy characters are often not used in tax returns, for instance.
flexible_parser = etree.XMLParser(encoding='utf-8', recover=True)
def simple_clean(word):
"""
Placeholder for text processing that needs to take place at the very start of the process.
Stuff here gets applied before it's returned in any form.
"""
# it seems like words often have leading / trailing spaces
return word.strip()
def get_annotated_bbox(raw_bbox_string):
""" Convenience method for naming which coordinates is which in the bbox. I always forget which is which.
See: https://docs.google.com/document/d/1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0/preview
x0 y0 x1 y1
"""
coords = raw_bbox_string.split()
return {'xmin':coords[0], 'ymin':coords[1], 'xmax':coords[2], 'ymax':coords[3]}
def get_page_details(page_tree):
ocr_page = page_tree.xpath("//body/div[@class='ocr_page']")
# Raise an error if we have more than one page -- this happens occasionally, not sure why yet.
num_pages = len(ocr_page)
if num_pages != 1:
print "%s" % (tostring(page_tree))
raise PageCountError ("Pages count mismatch: %s pages found" % (num_pages))
return ocr_page[0].attrib
def get_bbox_from_title(title):
## This is brittle, but seems to work
## There seem to be to variants of the hocr -- one has span class of 'ocr_word' the other users 'ocrx_word'. We're just looking for a class that starts with ocr_word, but I dunno if that's good, or if we'll find other problems in later dox.
return title.lstrip('bbox ')
def get_words_only(word_tree):
""" Return a python dict by flattening the entire page into just an array of words.
This discards line number information, and is really just here as a historical thing.
"""
word_array = []
xpath_query = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[starts-with(@class, 'ocr')]"
#xpath_query1 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocr_word']"
#xpath_query2 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocrx_word']"
hocr_words = etree.xpath(xpath_query)
#if not hocr_words:
# hocr_words = etree.xpath(xpath_query2)
for word in hocr_words:
#print word.attrib
# the text may be contained in this span, but it may also be contained in a child element.
word_array.append({'bbox':get_bbox_from_title(word.attrib['title']), 'text':simple_clean(tostring(word, method="text", encoding='UTF-8')), 'word_num':word.attrib['id'].replace("word_","")})
return word_array
def get_words_from_line(etree, line_id, line_num):
""" Return a python dict by flattening the entire page into just an array of words.
This discards line number information, and is really just here as a historical thing.
"""
#print "line id: %s" % line_id
word_array = []
xpath_query = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[starts-with(@class, 'ocr')]"
## Have noodled with this some--these used to work. It's not clear that xpath really can handle the range of hocr formats that will be thrown at us, but it works when formats are fairly uniform.
#xpath_query1 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocr_word']"
#xpath_query2 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocrx_word']"
hocr_words = etree.xpath(xpath_query)
# hocr_words = etree.xpath(xpath_query1)
#if not hocr_words:
# hocr_words = etree.xpath(xpath_query2)
for word in hocr_words:
# the text may be contained in this span, but it may also be contained in a child element.
word_array.append({'bbox':get_bbox_from_title(word.attrib['title']), 'text':simple_clean(tostring(word, method="text", encoding='UTF-8')), 'word_num':word.attrib['id'].replace("word_",""), 'line_num':line_num})
return word_array
def get_lines_with_words(line_tree):
""" Flatten the page into a two-level hierarchy: lines, which contain words. """
#print tostring(line_tree)
word_array = []
hocr_lines = line_tree.xpath("//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line']")
for line in hocr_lines:
#print "line attribs is %s" % (line.attrib)
this_line = {}
this_line['attrib'] = line.attrib
line_id = line.attrib['id']
line_num = line_id.replace("line_","")
this_line_words = get_words_from_line(line, line_id, line_num)
word_array = word_array + this_line_words
return word_array
def get_words_from_page(hocr_page):
""" Return just the words from an hocr page. """
tree = etree.parse(StringIO(hocr_page), flexible_parser)
page_details = get_page_details(tree)
words = get_words_only(tree)
return {'attrib': page_details, 'words':words}
def get_words_with_lines_from_page(hocr_page):
""" Return words with lines from an hocr page. """
tree = etree.parse(StringIO(hocr_page), flexible_parser)
page_details = get_page_details(tree)
words = get_lines_with_words(tree)
return {'attrib': page_details, 'words':words}