-
Notifications
You must be signed in to change notification settings - Fork 6
/
political-contributions.py
57 lines (37 loc) · 1.51 KB
/
political-contributions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# -*- coding: utf-8 -*-
# Iceland - political contributions - WORK IN PROGRESS
import scraperwiki
import urllib
import lxml.etree, lxml.html
import re
pdfurl = "http://www.rikisendurskodun.is/utgefid-efni/fjarmal-stjornmalastarfsemi/utdrattur-ur-uppgjorum-stjornmalasamtaka/?eID=dam_frontend_push&docID=1170"
# (harder example to work on: http://www.nihe.gov.uk/schemes_accepted_010109_to_310309.pdf )
pdfdata = urllib.urlopen(pdfurl).read()
pdfxml = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(pdfxml)
print "The pdf file has %d bytes" % len(pdfdata)
print "After converting to xml it has %d bytes" % len(pdfxml)
print "The XML: ", pdfxml
pages = list(root)
print "The pages are numbered:", [ page.attrib.get("number") for page in pages ]
fontspecs = { }
pagetexts = [ ]
for page in root:
assert page.tag == 'page'
pagelines = { }
for v in page:
if v.tag == 'fontspec':
fontspecs[v.attrib.get('id')] = v
else:
assert v.tag == 'text'
text = re.match('(?s)<text.*?>(.*?)</text>', lxml.etree.tostring(v)).group(1) # there has to be a better way here to get the contents
top = int(v.attrib.get('left'))
if top not in pagelines:
pagelines[top] = [ ]
pagelines[top].append((int(v.attrib.get('left')), text))
lpagelines = pagelines.items()
lpagelines.sort()
for top, line in lpagelines[:20]:
line.sort()
print top, line
pagetexts.append(pagelines)