-
Notifications
You must be signed in to change notification settings - Fork 6
/
logbylaskra.py
63 lines (42 loc) · 1.62 KB
/
logbylaskra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import scraperwiki
import urllib2
import lxml.etree, lxml.html
url = 'http://dl.dropbox.com/u/2192482/Logbylaskra_31_des_2009.pdf'
'''
pdfdata = urllib2.urlopen('http://dl.dropbox.com/u/2192482/Logbylaskra_31_des_2009.pdf').read()
pdfxml = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(pdfxml)
print pdfxml
#rows = root.xpath('//text')
#for all in rows[3:]:
# print all.text
'''
pdfdata = urllib2.urlopen(url).read()
print "The pdf file has %d bytes" % len(pdfdata)
xmldata = scraperwiki.pdftoxml(pdfdata)
print "After converting to xml it has %d bytes" % len(xmldata)
print xmldata
root = lxml.etree.fromstring(xmldata)
pages = list(root)
print "The pages are numbered:", [ page.attrib.get("number") for page in pages ]
# this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
def gettext_with_bi_tags(el):
    """Return the text content of *el* with inline child tags (e.g. <b>, <i>)
    reconstructed around their own text.

    el -- an lxml/ElementTree element (anything with .text, .tail, .tag
          and child iteration).
    Returns a single string.
    """
    res = [ ]
    if el.text:
        res.append(el.text)
    for child in el:
        res.append("<%s>" % child.tag)
        res.append(gettext_with_bi_tags(child))
        res.append("</%s>" % child.tag)
        # BUGFIX: a child's .tail is the text that follows its closing tag,
        # so it must be appended here, after "</tag>".  The original appended
        # el.tail inside the recursive result, which placed trailing text
        # *inside* the child's closing tag ("<b>bold tail</b>").
        if child.tail:
            res.append(child.tail)
    return "".join(res)
# print the first hundred text elements from the first page
page0 = pages[0]
for el in list(page)[:100]:
if el.tag == "text":
print el.attrib, gettext_with_bi_tags(el)
# If you have many PDF documents to extract data from, the trick is to find what's similar
# in the way that the information is presented in them in terms of the top left bottom right
# pixel locations. It's real work, but you can use the position visualizer here:
# http://scraperwikiviews.com/run/pdf-to-html-preview-1/