/
UKImport.py
714 lines (589 loc) · 34.1 KB
/
UKImport.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
#####
## Hansard Analysis Package
## Requires Python 3.2
## tanya.whyte@mail.utoronto.ca
## Current 2013-07-23
##
## UKImport.py
## Converts UK Commons XML archives to HansardIO standard
#####
import os
from bs4 import BeautifulSoup
import json
import pickle
import datetime
import copy
import time
import datetime
import re
from toJSON import parseXML
# Error definitions
class HansardImportError(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class HansardImportDateError(HansardImportError):
pass
# UK Translation class - writes and loads dictionaries necessary for translating UK Hansard
# data to the standard imposed by Canadian Hansard data from openparliament.ca
class UKTranslator (object):
def __init__(self):
pass
def dateObjectifier(self, dateStr):
'''Converts a yyyy-mm-dd to a datetime.date object'''
workingYear = dateStr[0:4]
if dateStr[5] == "0":
workingMonth = dateStr[6]
else:
workingMonth = dateStr[5:7]
if dateStr[8] == '0':
workingDay = dateStr[9]
else:
workingDay = dateStr[8:10]
workingDateObject = datetime.date(int(workingYear), int(workingMonth), int(workingDay))
return workingDateObject
def makeUKTranslationFile(self):
'''Creates and writes a file UKTranslationData.pkl
Contains parliament and session data, politicians, etc.
'''
# For the moment, we only go back as far as 1994 here for simplicity's sake
# eg. Old records have nasty duplicates that would need to be dealt with
# "dates" provides information about parliament/session beginning and end dates
# source: http://www.parliament.uk/about/faqs/house-of-commons-faqs/business-faq-page/recess-dates/recess/
datesDict = {'1994':[{'51':{'2':[datetime.date(1993,11,18), datetime.date(1994,11,3)], '3':[datetime.date(1994,11,16), datetime.date(1995,11,8)]}}], \
'1995':[{'51':{'3':[datetime.date(1994,11,16), datetime.date(1995,11,8)], '4':[datetime.date(1995,11,15), datetime.date(1996,10,7)]}}], \
'1996':[{'51':{'4':[datetime.date(1995,11,15), datetime.date(1996,10,7)], '5':[datetime.date(1996,10,14), datetime.date(1997,3,21)]}}], \
'1997':[{'51':{'5':[datetime.date(1996,10,23), datetime.date(1997,3,21)]}}, {'52':{'1':[datetime.date(1997,5,1), datetime.date(1998,11,19)]}}], \
'1998':[{'52':{'1':[datetime.date(1997,5,1), datetime.date(1998,11,19)], '2':[datetime.date(1998,11,24), datetime.date(1999,11,11)]}}], \
'1999':[{'52':{'2':[datetime.date(1998,11,24), datetime.date(1999,11,11)], '3':[datetime.date(1999,11,17), datetime.date(2000,11,30)]}}], \
'2000':[{'52':{'3':[datetime.date(1999,11,17), datetime.date(2000,11,30)], '4':[datetime.date(2000,12,6), datetime.date(2001,5,11)]}}], \
'2001':[{'52':{'4':[datetime.date(2000,12,6), datetime.date(2001,5,11)]}}, {'53':{'1':[datetime.date(2001,6,13), datetime.date(2002,11,7)]}}], \
'2002':[{'53':{'1':[datetime.date(2001,6,13), datetime.date(2002,11,7)], '2':[datetime.date(2002,11,13), datetime.date(2003,11,20)]}}], \
'2003':[{'53':{'2':[datetime.date(2002,11,13), datetime.date(2003,11,20)], '3':[datetime.date(2003,11,26), datetime.date(2004,11,18)]}}], \
'2004':[{'53':{'3':[datetime.date(2003,11,26), datetime.date(2004,11,18)], '4':[datetime.date(2004,11,23), datetime.date(2005,4,7)]}}], \
'2005':[{'53':{'4':[datetime.date(2004,11,23), datetime.date(2005,4,7)]}}, {'54':{'1':[datetime.date(2005,5,11), datetime.date(2006,11,8)]}}], \
'2006':[{'54':{'1':[datetime.date(2005,5,11), datetime.date(2006,11,8)], '2':[datetime.date(2006,11,15), datetime.date(2007,10,30)]}}], \
'2007':[{'54':{'2':[datetime.date(2006,11,15), datetime.date(2007,10,30)], '3':[datetime.date(2007,11,6), datetime.date(2008,11,26)]}}], \
'2008':[{'54':{'3':[datetime.date(2007,11,6), datetime.date(2008,11,26)], '4':[datetime.date(2008,12,3), datetime.date(2009,11,12)]}}], \
'2009':[{'54':{'4':[datetime.date(2008,12,3), datetime.date(2009,11,12)], '5':[datetime.date(2009,11,18), datetime.date(2010,4,12)]}}], \
'2010':[{'54':{'5':[datetime.date(2009,11,18), datetime.date(2010,4,12)]}}, {'55':{'1':[datetime.date(2010,5,25), datetime.date(2012,5,1)]}}], \
'2011':[{'55':{'1':[datetime.date(2010,5,25), datetime.date(2012,5,1)]}}], \
'2012':[{'55':{'1':[datetime.date(2010,5,25), datetime.date(2012,5,1)], '2':[datetime.date(2012,5,9), datetime.date(2013,4,25)]}}]}
# "names" relies on a copy of the file "all-members.xml" in working directory from http://ukparse.kforge.net/svn/parlparse/members/all-members.xml
namesDict = {}
try:
f = "all-members.xml"
j = parseXML(f)
k = json.loads(j)
except:
raise HansardImportError("No all-members.xml file present.")
working=k["publicwhip"]["member"]
print(len(working))
for member in working:
endDate = member["todate"]
endDateObject = datetime.date(int(endDate[0:4]), 1, 1)
if endDateObject.year < 1994:
pass
# Ignoring this person because they left Commons before 1994
else:
firstName = member["firstname"]
lastName = member["lastname"]
fullName = firstName + " " + lastName
riding = member["constituency"]
memberID = member["id"]
party = member["party"]
startDate = member["fromdate"]
if memberID in namesDict:
# A person with this name already exists--check to see if we have a duplicate
# We assume that there will be no two people with the same name in the same riding who aren't duplicates
# This doesn't work properly, but since these only show up in very old data it's fine for now
for listMember in namesDict[memberID]:
if riding in listMember:
pass
# this person seems to be a duplicate
else:
# this person is a new member with a shared name but different riding -- add them
ridingDict = {"fullName":fullName, "memberID": memberID, "firstName":firstName, "lastName":lastName, "riding":riding, "party":party, "startDate":startDate, "endDate":endDate}
namesDict[memberID].append({riding:copy.deepcopy(ridingDict)})
print (fullName+" has another instance")
else:
# This fullname doesn't exist yet, create it
ridingDict = {"fullName":fullName, "firstName":firstName, "memberID": memberID, "lastName":lastName, "riding":riding, "party":party, "startDate":startDate, "endDate":endDate}
print ("Added" + " " + memberID)
namesDict[memberID] = [{riding:copy.deepcopy(ridingDict)}]
# "parliamentData" is a placeholder for anything else that needs to be added in future
#names2010 dict--unfortunately, there are two allmembers files!!!
names2010Dict = {}
try:
f = "all-members-2010.xml"
j = parseXML(f)
k = json.loads(j)
except:
raise HansardImportError("No all-members-2010.xml file present.")
working=k["publicwhip"]["member"]
print(len(working))
for member in working:
endDate = member["todate"]
endDateObject = datetime.date(2013, 1, 1)
firstName = member["firstname"]
lastName = member["lastname"]
fullName = firstName + " " + lastName
riding = member["constituency"]
memberID = member["id"]
party = member["party"]
startDate = member["fromdate"]
ridingDict = {"fullName":fullName, "firstName":firstName, "memberID": memberID, "lastName":lastName, "riding":riding, "party":party, "startDate":startDate, "endDate":endDate}
print ("Added" + " " + memberID)
names2010Dict[memberID] = [{riding:copy.deepcopy(ridingDict)}]
# now we write the translation file
final = {"names":(copy.deepcopy(namesDict)), "dates":(copy.deepcopy(datesDict)), "names2010":(copy.deepcopy(names2010Dict))}
output = open(('UKTranslationData.pkl'), 'wb')
pickle.dump(final, output)
def loadUKTranslationFile(self, filename="UKTranslationData.pkl"):
'''Loads a file UKSessionData.pkl
Contains parliament and session data, politicians, etc.
Needed for convertUKHansardFile()
'''
try:
pkl_file = open((filename), 'rb')
myfile = pickle.load(pkl_file)
pkl_file.close()
return myfile
except (OSError, IOError):
raise HansardImportError("No .pkl file of that name.")
def convertDateToUKParlSess(self, dateStr, parlsess):
'''Returns either the parliament or the session information for a given date
dateStr == yyyy-mm-dd string format, parlsess = Parliament or Session in str format
Note that we return null if parliament isn't in session at the time (this problem is not handled here)'''
TransFile = self.loadUKTranslationFile()
workingYear = dateStr[0:4]
if dateStr[5] == "0":
workingMonth = dateStr[6]
else:
workingMonth = dateStr[5:7]
if dateStr[8] == '0':
workingDay = dateStr[9]
else:
workingDay = dateStr[8:10]
workingDateObject = datetime.date(int(workingYear), int(workingMonth), int(workingDay))
if parlsess == "Parliament":
workingYearList = TransFile['dates'][workingYear]
if len(workingYearList) == 1: # there is only one parliament in this year, so return it
return list(workingYearList[0].keys())[0]
else:
for yearParl in workingYearList: # for dict in list of dicts, each dict has a key of the parliament
for workingParliament in list(yearParl.keys()):
for workingSession in list(yearParl[workingParliament].keys()):
testStart = yearParl[workingParliament][workingSession][0]
testEnd = yearParl[workingParliament][workingSession][1]
if testStart <= workingDateObject <= testEnd:
return workingParliament
else:
pass
elif parlsess == "Session":
workingYearList = TransFile['dates'][workingYear]
for yearParl in workingYearList: # for dict in list of dicts, each dict has a key of the parliament
for workingParliament in list(yearParl.keys()):
for workingSession in list(yearParl[workingParliament].keys()):
testStart = yearParl[workingParliament][workingSession][0]
testEnd = yearParl[workingParliament][workingSession][1]
if testStart <= workingDateObject <= testEnd:
return workingSession
else:
raise HansardImportError("Invalid--Choose Parliament or Session")
def convertUKName(self, dateStr, memberID, parrid):
'''Returns either the party or riding information for a given date
memberID is a string, dateStr == yyyy-mm-dd string format, parrid = Party or Riding or Fullname in str format
Returns None if they weren't in Parliament on dateStr
'''
TransFile = self.loadUKTranslationFile()
workingYear = dateStr[0:4]
if dateStr[5] == "0":
workingMonth = dateStr[6]
else:
workingMonth = dateStr[5:7]
if dateStr[8] == '0':
workingDay = dateStr[9]
else:
workingDay = dateStr[8:10]
workingDateObject = datetime.date(int(workingYear), int(workingMonth), int(workingDay))
if memberID == "unknown":
return "unknown"
if memberID =="uk.org.publicwhip/royal/-1": #The Queen!
if parrid == "Party":
return ""
elif parrid == "Riding":
return ""
elif parrid == "Fullname":
return "The Queen"
if parrid == "Party":
if memberID[25]=="4" and len(memberID)==30: ##it's a post-2010 instance
workingNameDict = TransFile['names2010']
else:
workingNameDict = TransFile['names']
try:
memberInfo = workingNameDict[memberID]
if len(memberInfo) == 1: # only one person with this ID; return their info
return memberInfo[0][(list(memberInfo[0].keys())[0])]["party"]
else: # more than one person with this name (or some kind of duplicate record exists) so check date
for personInstance in range(len(memberInfo)):
testStart = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["startDate"])
testEnd = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["endDate"])
if testStart <= workingDateObject <= testEnd:
return memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["party"]
except KeyError:
raise HansardImportError("Invalid--ID Doesn't Exist in UK Names")
elif parrid == "Riding":
if memberID[25]=="4" and len(memberID)==30: #it's a post-2010 instance
workingNameDict = TransFile['names2010']
else:
workingNameDict = TransFile['names']
try:
memberInfo = workingNameDict[memberID]
if len(memberInfo) == 1: # only one person with this name; return their info
return memberInfo[0][(list(memberInfo[0].keys())[0])]["riding"]
else: # more than one person with this name (or some kind of duplicate record exists) so check date
for personInstance in range(len(memberInfo)):
testStart = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["startDate"])
testEnd = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["endDate"])
if testStart <= workingDateObject <= testEnd:
return memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["riding"]
except KeyError:
raise HansardImportError("Invalid--ID Doesn't Exist in UK Names")
elif parrid == "Fullname":
if memberID[25]=="4" and len(memberID)==30: #it's a post-2010 instance
workingNameDict = TransFile['names2010']
else:
workingNameDict = TransFile['names']
try:
memberInfo = workingNameDict[memberID]
if len(memberInfo) == 1: # only one person with this name; return their info
return memberInfo[0][(list(memberInfo[0].keys())[0])]["fullName"]
else: # more than one person with this name (or some kind of duplicate record exists) so check date
for personInstance in range(len(memberInfo)):
testStart = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["startDate"])
testEnd = self.dateObjectifier(memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["endDate"])
if testStart <= workingDateObject <= testEnd:
return memberInfo[personInstance][(list(memberInfo[personInstance].keys())[0])]["fullName"]
except KeyError:
raise HansardImportError("Invalid--ID Doesn't Exist in UK Names")
else:
raise HansardImportError("Invalid--Choose Party or Riding or Fullname")
# HansardImport class
class HansardImport(object):
def __init__(self):
if os.path.exists(os.path.join(os.getcwd(), "data_uk"))==False:
os.mkdir(os.path.join(os.getcwd(), "data_uk"))
else:
pass
self.translator = UKTranslator()
def createDir(self, dirName, path=os.getcwd()): # Helper function for creating directories
dirPath = os.path.join(path, dirName)
os.mkdir(dirPath)
def strip(self,untrusted_html):
"""Strips out all tags"""
soup = BeautifulSoup(untrusted_html)
safe_html = ''.join(soup.find_all(text=True))
return safe_html
def strip_friend(self, text):
'''Quick regex hack to remove honfriend class phrase tags so that all speech text is properly processed'''
p = re.compile(r'<phrase class="honfriend".*?>.*?>')
return p.sub('', text)
def strip_date(self, text):
'''Quick regex hack to remove date class phrase tags so that all speech text is properly processed'''
p = re.compile(r'<phrase class="date".*?>.*?>')
return p.sub('', text)
def dictify(self, dictString):
'''Turn UK hansard stripped string into a HansardDict-style dict object'''
doneYet = False
dicted = []
datesList = []
while not doneYet:
n = dictString.find('.xml')
if n == -1:
doneYet = True
else:
if 's' not in dictString[(n-11):(n-1)]:
datesList.append(dictString[(n-11):(n-1)])
print((dictString[(n-11):(n-1)]))
else:
datesList.append(dictString[(n-10):(n)])
print((dictString[(n-10):(n)]))
dictString = dictString.replace('.xml', '', 1)
datesList = set(datesList)
for date in datesList:
dicted.append(dict([('date', str(date))]))
return dicted
def scrapeHansardDict(self, dictfilename="UKDict"):
'''([filename="UKDict"])
Scrapes the UKDict from ukparse.kforge.net
Writes HansardDict-type file containing a dict of valid UK files by date
Also creates a data directory if not already present
'''
try:
self.createDir("data_uk")
except OSError:
pass
if os.path.isfile((dictfilename + '.pkl')): # Checks for existing .pkl file
a = ""
while True:
a = input("File already exists. Replace? y/n ")
break
if a == "y": # Overwrites existing .pkl file
os.remove(dictfilename + '.pkl')
WORDS = []
for word in urllib.request.urlopen("http://ukparse.kforge.net/parldata/scrapedxml/debates/").readlines():
WORDS.append(word.strip().decode('utf-8'))
raw = ''.join(WORDS)
stripped = self.strip(raw)
final = self.dictify(stripped)
output = open((dictfilename + '.pkl'), 'wb')
pickle.dump(final, output)
output.close()
return None
if a == "n":
print("No UKDict scraped.")
return None
else: # Creates new .pkl file
WORDS = []
for word in urllib.request.urlopen("http://ukparse.kforge.net/parldata/scrapedxml/debates/").readlines():
WORDS.append(word.strip().decode('utf-8'))
raw = ''.join(WORDS)
stripped = self.strip(raw)
final = self.dictify(stripped)
output = open((dictfilename + '.pkl'), 'wb')
pickle.dump(final, output)
output.close()
return None
def loadHansardDict(self, dictfilename="UKDict"):
'''([filename="UKDict"])
Returns the UKDict in filename.pkl
'''
try:
pkl_file = open((dictfilename + ".pkl"), 'rb')
myfile = pickle.load(pkl_file)
pkl_file.close()
return myfile
except:
raise HansardImportError("No .pkl file of that name.")
def convertUKHansardFile(self, dateH, dictfilename="UKDict"):
''' dateH = str format yyyy-mm-dd, dictfilename = str local dict file
Writes the .pkl file for a valid Hansard date in a folder path data_uk/yyyy-mm-dd/
Works with raw xml dump downloaded from ukparse.kforge.net/parldata/scrapedxml/debates/
Skips over existing files with same path and filename.
Note that only select data is converted: speech text, politician, party, riding, time
'''
try:
activeDict # Test if there is already an activeDict loaded
except:
if os.path.isfile((dictfilename + '.pkl')):
activeDict = self.loadHansardDict(dictfilename) # If not, load a UKDict if it exists
else:
raise HansardImportError("No dictionary .pkl file of that name.")
validFiles = []
for i in activeDict:
if i["date"] == dateH:
validFiles.append(i["date"]) # Creates a working list of all UK hansard dates
if validFiles == []: # Checks to see if we are looking at a valid date
raise HansardImportDateError("Not a valid date.")
try: # Checks for existing directory for date
self.createDir(str(dateH), os.path.join(os.getcwd(), "data_uk"))
except OSError:
pass
for validDate in validFiles: # Converts the most recent (ie. correct and up to date) Hansard file to standard format
# Also organizes along same filesystem standard
if os.path.isfile(os.path.join(os.getcwd(), "data_uk", str(dateH), (str(dateH) + '.pkl'))):
# Checks for existing .pkl file
print ("Skipped existing file.")
else: # Creates new .pkl file
# Convert here!
if os.path.isfile((os.path.join(os.getcwd(), "data_uk_raw", "debates"+validDate+".xml"))):
f = (os.path.join(os.getcwd(), "data_uk_raw", "debates"+validDate+".xml"))
else:
f = (os.path.join(os.getcwd(), "data_uk_raw", "debates"+validDate+"a"+".xml"))
# strip encoding information to make lxml happy
lines = open(f, 'r', encoding = "iso-8859-1").readlines()
lines[0] = '<?xml version="1.0"?>\n'
file = open(f, 'w', encoding = "iso-8859-1")
for line in lines:
file.write(line)
file.close()
## lines = open(f, 'r').readlines()
## stripHolder = []
## for q in lines:
## r = q.replace("<i>", "") # strips italics tags
## s = r.replace("</i>", "")
## t = self.strip_friend(s) # strips honfriend class phrases
## stripHolder.append(t)
## file = open(f, 'w')
## for line in stripHolder:
## file.write(line)
# checks if we read the most current file; if not, finds it
j = parseXML(f)
k = json.loads(j)
for scrapeversion in ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']:
try:
f = (os.path.join(os.getcwd(), "data_uk_raw", "debates"+validDate+scrapeversion+".xml"))
lines = open(f, 'r', encoding = "iso-8859-1").readlines()
lines[0] = '<?xml version="1.0"?>\n'
stripHolder = []
for q in lines:
n = q.replace("</i >", "")
r = n.replace("<i>", "") # strips italics tags
s = r.replace("</i>", "")
t = self.strip_friend(s)
u = self.strip_date(t) # strips bad class phrases
stripHolder.append(u)
file = open(f, 'w', encoding = "iso-8859-1")
for line in stripHolder:
file.write(line)
file.close()
j = parseXML(f)
k = json.loads(j)
if (k["publicwhip"]["latest"]) == "no":
pass
else:
print ("Loaded scrapeversion "+scrapeversion)
break
except IOError:
pass
# constructs Hansard JSON information
workingHansardLog = {}
workingHansardLog["parliament"] = self.translator.convertDateToUKParlSess(dateH, "Parliament")
workingHansardLog["date"] = dateH
workingHansardLog["id"] = ""
workingHansardLog["session"] = self.translator.convertDateToUKParlSess(dateH, "Session")
workingHansardLog["url"] = ""
workingHansardLog["original_url"] = ""
workingStatements = []
if k["publicwhip"]["latest"]=="ignore": # some procedural recordings in the data (eg. swearing in of a new government) must be handled here with a manual flag
workingSpeech = {}
workingSpeech["time"] = ""
workingSpeech["heading"]= ""
workingSpeech["topic"]= ""
workingSpeech["url"]=""
workingSpeech["attribution"]=""
workingSpeech["text"]=""
workingStatements=[workingSpeech]
workingHansardLog["statements"] = copy.deepcopy(workingStatements)
output = open((os.path.join(os.getcwd(), "data_uk", str(dateH), (str(dateH) + '.pkl'))), 'wb')
pickle.dump(workingHansardLog, output)
output.close() # writes a dummy file
return None
for speechNumber in range(len(k["publicwhip"]["speech"])):
workingSpeech = {}
if "nospeaker" in list(k["publicwhip"]["speech"][speechNumber].keys()) or "speakername" not in list(k["publicwhip"]["speech"][speechNumber].keys()):
pass
else:
speechTime = copy.deepcopy(k["publicwhip"]["speech"][speechNumber]["time"])
workingSpeech["time"] = speechTime
workingSpeech["heading"]= ""
workingSpeech["topic"]= ""
try:
workingSpeech["url"]= copy.deepcopy(k["publicwhip"]["speech"][speechNumber]["url"])
except KeyError:
workingSpeech["url"]=""
# politician data
speakerName = copy.deepcopy(k["publicwhip"]["speech"][speechNumber]["speakername"])
speakerID = copy.deepcopy(k["publicwhip"]["speech"][speechNumber]["speakerid"])
workingSpeech["attribution"]=speakerName
if speakerName in ["Several hon. Members", "Hon. Members", "Speaker", "Mr. Speaker", "Madam Speaker", "Mr. Deputy Speaker", "Madam Deputy Speaker", "unknown"]:
pass
else:
workingSpeech["politician"] = {}
workingSpeech["politician"]["name"]=self.translator.convertUKName(str(dateH), speakerID, "Fullname")
workingSpeech['politician']['riding']=self.translator.convertUKName(str(dateH), speakerID, "Riding")
workingSpeech['politician']['party']=self.translator.convertUKName(str(dateH), speakerID, "Party")
workingSpeech['politician']['member_id']=""
workingSpeech['politician']['id']=speakerID
workingSpeech["politician"]["url"] = ""
# text of speech
# note that using toJSON as a workaround for lxml/unicode issues (bug as of July 2013) creates problems here when more than one
# "honourable friend" phrase is referenced in a given speech
# xml parsing of speeches should be rewritten once this bug is addressed
# for the moment, honfriend class phrase tags are simply stripped before processing
gather = []
try:
speechRaw = copy.deepcopy(k["publicwhip"]["speech"][speechNumber]["p"])
except KeyError:
print (k["publicwhip"]["speech"][speechNumber])
if isinstance(speechRaw, list):
for speechComponentNumber in range(len(speechRaw)):
try:
gather.append(speechRaw[speechComponentNumber]["$t"])
except KeyError:
try:
gather.append(speechRaw[speechComponentNumber]["phrase"]["$t"])
except TypeError:
try:
gather.append(speechRaw[speechComponentNumber]['phrase'][0]["$t"])
gather.append(speechRaw[speechComponentNumber]['phrase'][1]["$t"])
except:
print ("KeyError 3")
except KeyError:
print (speechRaw[speechComponentNumber])
print("KeyError4")
elif isinstance(speechRaw, dict):
try:
gather.append(speechRaw["$t"])
except KeyError:
try:
gather.append(speechRaw["phrase"]["$t"])
except KeyError:
print (speechRaw)
print(speechRaw.keys())
workingSpeech["text"] = "".join(gather)
workingStatements.append(copy.deepcopy(workingSpeech))
workingHansardLog["statements"] = copy.deepcopy(workingStatements)
# return workingHansardLog # for testing
output = open((os.path.join(os.getcwd(), "data_uk", str(dateH), (str(dateH) + '.pkl'))), 'wb')
pickle.dump(workingHansardLog, output)
output.close()
return None
def convertUKHansardRange(self, dateStart, dateEnd, dictfilename="UKDict"):
'''dateStart = str format yyyy-mm-dd, dateEnd = str format yyyy-mm-dd, dictfilename = str local dict file
Converts to JSON HansardRaw standard and writes the .pkl files between start and end in a folder path Data/yyyy-mm-dd/
Works with raw xml dump downloaded from ukparse.kforge.net/parldata/scrapedxml/debates/
Skips over existing files with same path and filename.
'''
dateS = datetime.date
dateE = datetime.date
try:
dateS = datetime.date(int(dateStart[0:4]), int(dateStart[5:7]), int(dateStart[8:10])) # Convert str to datetime.date objects
dateE = datetime.date(int(dateEnd[0:4]), int(dateEnd[5:7]), int(dateEnd[8:10]))
except:
raise HansardImportDateError("Invalid date.")
loopDate = copy.deepcopy(dateS)
dateStepInt = (dateE - dateS).days
for i in range(dateStepInt+1): # Convert UKHansard files for range of dates
print((loopDate.isoformat()))
try:
self.convertUKHansardFile(loopDate.isoformat(), dictfilename)
except HansardImportDateError: # No Hansard on this date!
print ("No data here!")
pass
dateStepInt += 1
loopDate += datetime.timedelta(days=1)
def loadHansardFile(self, dateH, trueFilename):
'''dateH = "yyyy-mm-dd", trueFilename = str real file name of HansardFile
eg. 2012-12-03.pkl
Returns a dict of JSON hansard file.
'''
try:
pkl_file = open((os.path.join(os.getcwd(), "data_uk", dateH, trueFilename)), 'rb')
myfile = pickle.load(pkl_file)
pkl_file.close()
return myfile
except (OSError, IOError):
raise HansardImportError("No .pkl file of that name.")
def getTrueFilenames(self, dateH, hansardType=""):
'''dateH = "yyyy-mm-dd"
Returns list of filenames str for use in loadHansardFile
'''
try:
originalList = os.listdir((os.path.join(os.getcwd(), "data_uk", str(dateH))))
# Lists files in the date directory
except OSError:
raise HansardImportDateError("Invalid date.")
return originalList