-
Notifications
You must be signed in to change notification settings - Fork 27
/
corpus.py
722 lines (582 loc) · 26.6 KB
/
corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
from __future__ import print_function
import corpkit
from functools import wraps
def lazyprop(fn):
    """Lazy loading class attributes, with hard-coded docstrings for now...

    Wrap a method taking only ``self`` in a read-only property. The method is
    run the first time the attribute is read; its result is stored on the
    instance under ``_lazy_<method name>`` and returned from there on every
    later access.

    The properties named ``subcorpora``, ``files``, ``features`` and
    ``document`` get the hard-coded docstrings below. Any other method keeps
    its own docstring (previously any other name raised UnboundLocalError).

    :param fn: the method to wrap; it is called as ``fn(self)``
    :type fn: function
    :returns: a ``property`` whose getter caches the result of ``fn``
    """
    hardcoded_docs = {
        'subcorpora': """A list-like object containing a corpus' subcorpora.

        :Example:

        >>> corpus.subcorpora
        <corpkit.corpus.Datalist instance: 12 items>
        """,
        'files': """A list-like object containing the files in a folder.

        :Example:

        >>> corpus.subcorpora[0].files
        <corpkit.corpus.Datalist instance: 240 items>
        """,
        'features': """
        Generate and show basic stats from the corpus, including number of sentences, clauses, process types, etc.

        :Example:

        >>> corpus.features
            ..  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775
        """,
        'document': """Return the parsed XML of a parsed file""",
    }
    attr_name = '_lazy_' + fn.__name__

    @wraps(fn)
    def _lazyprop(self):
        # Compute once, then serve the value cached on the instance.
        if not hasattr(self, attr_name):
            setattr(self, attr_name, fn(self))
        return getattr(self, attr_name)

    # property() copies the getter's __doc__, so set it before wrapping.
    _lazyprop.__doc__ = hardcoded_docs.get(fn.__name__, fn.__doc__)
    return property(_lazyprop)
class Corpus:
    """
    A class representing a linguistic text corpus, which contains files,
    optionally within subcorpus folders.

    Methods for concordancing, interrogating, getting general stats, getting
    behaviour of particular word, etc.
    """

    def __init__(self, path, **kwargs):
        """
        Work out where the corpus lives and what kind of data it holds.

        :param path: path to a corpus folder or file, or a list / Datalist of
                     objects having a `.path` attribute
        :type path: str/list/Datalist
        :param level: `'c'` for corpus (default), `'s'` for subcorpus, `'f'` for file
        :type level: str
        :param datatype: data type of the corpus, if already known
        :type datatype: str
        :param print_info: print the corpus location on creation (default True)
        :type print_info: bool
        """
        import os
        from os.path import join, isdir, abspath, dirname, basename
        from process import determine_datatype
        from corpus import Datalist

        def _has_subdirectories(folder):
            # True when at least one entry of `folder` is itself a directory
            return any(isdir(join(folder, d)) for d in os.listdir(folder))

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one is determined automatically below, and processed accordingly. We
        # assume it is a full corpus to begin with.
        self.data = None
        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        if path.__class__ == Datalist or type(path) == list:
            # a collection was passed in: the corpus is the folder that
            # contains the first item
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
        else:
            self.path = abspath(path)
            # take the name from the normalised path, so that a trailing
            # slash in `path` does not produce an empty name
            self.name = basename(self.path)

        # this code figures out as quickly as possible what the datatype
        # and singlefile status of the path is, shortcutting full checking
        # where possible. some of the shortcutting could maybe be moved into
        # the determine_datatype() function.
        # NOTE(review): the nesting of the branches below was reconstructed
        # from a copy of the file that had lost its indentation -- confirm.
        if print_info:
            print('\nCorpus at: %s' % self.path)

        self.singlefile = False
        if os.path.isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        elif self.path.endswith('-parsed'):
            self.datatype = 'parse'
            # no subfolders: treat the folder as a subcorpus
            if not _has_subdirectories(self.path):
                level = 's'
        else:
            if level == 'c' and not self.datatype:
                self.datatype, self.singlefile = determine_datatype(self.path)
            if not _has_subdirectories(self.path):
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level

    @lazyprop
    def subcorpora(self):
        """A list-like object containing a corpus' subcorpora.

        Returns the data passed to the constructor when that was a list or
        Datalist. Otherwise, for a corpus-level object, returns a Datalist of
        one Subcorpus per subfolder, sorted by name, and also attaches each
        one to this object as an attribute. Returns None for other levels.
        """
        import re, os, operator
        from os.path import join, isdir
        if self.data.__class__ == Datalist or type(self.data) == list:
            return self.data
        if self.level == 'c':
            # raw string: '\W' is not a valid escape in a normal string literal
            variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
            sbs = Datalist(sorted([Subcorpus(join(self.path, d), self.datatype)
                                   for d in os.listdir(self.path)
                                   if isdir(join(self.path, d))],
                                  key=operator.attrgetter('name')))
            for subcorpus in sbs:
                # attribute name: lowercased text before the first comma, with
                # non-word characters, digits and underscores removed
                variable_safe = re.sub(variable_safe_r, '',
                                       subcorpus.name.lower().split(',')[0])
                setattr(self, variable_safe, subcorpus)
            return sbs

    @lazyprop
    def files(self):
        """A list-like object containing the files in a folder

        Only built for subcorpus-level objects; entries whose name starts
        with '.' are skipped. Returns None for other levels.

        >>> corpus.subcorpora[0].files
        """
        import os, operator
        if self.level == 's':
            fs = sorted([File(f, self.path, self.datatype)
                         for f in os.listdir(self.path)
                         if not f.startswith('.')],
                        key=operator.attrgetter('name'))
            return Datalist(fs)

    def __str__(self):
        """String representation of corpus"""
        # subcorpora is None for anything that is not a full corpus
        subcorpora = self.subcorpora
        n_subcorpora = len(subcorpora) if subcorpora else 0
        st = 'Corpus at %s:\n\nData type: %s\nNumber of subcorpora: %d\n' % (self.path, self.datatype, n_subcorpora)
        if self.singlefile:
            st = st + '\nCorpus is a single file.\n'
        # lazyprop caches the computed value under '_lazy_features'; the name
        # 'features' itself never appears in the instance dict
        if '_lazy_features' in self.__dict__:
            if not self.singlefile:
                cols = list(self.features.columns)[:10]
                st = st + '\nFeatures:\n\n' + self.features.head(10).to_string(columns = cols)
            else:
                st = st + '\nFeatures:\n\n' + self.features.head(10).to_string()
        else:
            st = st + '\nFeatures not analysed yet. Use .features to calculate them.\n'
        return st

    def __repr__(self):
        """object representation of corpus"""
        import os
        if not self.subcorpora:
            ssubcorpora = ''
        else:
            ssubcorpora = self.subcorpora
        return "<corpkit.corpus.Corpus instance: %s; %d subcorpora>" % (os.path.basename(self.path), len(ssubcorpora))

    def __getitem__(self, key):
        """Get subcorpora by slice, integer index or name.

        A name that cannot be found and is not number-like gives None.
        """
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in range(*key.indices(len(self.subcorpora)))])
        elif type(key) == int:
            from process import makesafe
            return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
        else:
            try:
                return self.subcorpora.__getattribute__(key)
            except Exception:
                # fall back to the 'c'-prefixed name for number-like keys
                from process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    # METHODS
    @lazyprop
    def features(self):
        """
        Generate and show basic stats from the corpus, including number of sentences, clauses, process types, etc.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775
        """
        from interrogator import interrogator
        return interrogator(self, 's', 'any').results

    def configurations(self, search, **kwargs):
        """
        Get the overall behaviour of tokens or lemmas matching a regular expression. The search below makes DataFrames containing the most common subjects, objects, modifiers (etc.) of 'see':

        :param search: Similar to `search` in the `interrogate()` / `concordance()` methods. `W`/`L` keys match word or lemma; `F`: key specifies semantic role (`'participant'`, `'process'` or `'modifier'`). If `F` not specified, each role will be searched for.
        :type search: dict

        :Example:

        >>> see = corpus.configurations({L: 'see', F: 'process'}, show = L)
        >>> see.has_subject.results.sum()
            i           452
            it          227
            you         162
            we          111
            he          94

        :returns: :class:`corpkit.interrogation.Interrodict`
        """
        from configurations import configurations
        return configurations(self, search, **kwargs)

    def interrogate(self, search, *args, **kwargs):
        """Interrogate a corpus of texts for a lexicogrammatical phenomenon

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show = L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1
            02         10        20      4        4           8        3        0
            03         14         5      5        3           1        0        0
            ...                                                               ...

        :param search: What query should be matching
           - t/tregex
           - w/word
           - l/lemma
           - f/function
           - g/governor
           - d/dependent
           - p/pos
           - i/index
           - n/ngrams
           - s/general stats
        :type search: str, or, for dependencies, a dict like `{W: 'help', P: r'^V'}`

        :param searchmode: Return results matching any/all criteria
        :type searchmode: str -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: dict -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: str -- `'any'`/`'all'`

        :param query: A search query for the interrogation
        :type query:
           - str -- regex/Tregex pattern
           - dict -- `{name: pattern}`
           - list -- word list to match

        :param show: What to output. If multiple strings are passed, results will be colon-separated, in order
           - t/tree
           - w/word
           - l/lemma
           - g/governor
           - d/dependent
           - f/function
           - p/pos
           - i/index
           - a/distance from root
        :type show: list of strings

        :param lemmatise: Force lemmatisation on results
        :type lemmatise: bool

        :param lemmatag: Explicitly pass a pos to lemmatiser (generally when data is unparsed)
        :type lemmatag: False/'n'/'v'/'a'/'r'

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: False/'US'/'UK'

        :param dep_type: The kind of Stanford CoreNLP dependency parses you want to use
        :type dep_type: str -- 'basic-dependencies'/'a', 'collapsed-dependencies'/'b', 'collapsed-ccprocessed-dependencies'/'c'

        :param quicksave: Save result as pickle to `saved_interrogations/str` on completion
        :type quicksave: str

        :param gramsize: size of ngrams (default 2)
        :type gramsize: int

        :param split_contractions: make `"don't"` et al into two tokens
        :type split_contractions: bool

        :param multiprocess: how many parallel processes to run
        :type multiprocess: int / bool (to determine automatically)

        :param files_as_subcorpora: treat each file as a subcorpus
        :type files_as_subcorpora: bool

        :param do_concordancing: Concordance while interrogating, store as `.concordance` attribute
        :type do_concordancing: bool/'only'

        :param maxconc: Maximum number of concordance lines
        :type maxconc: int

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with `.query`, `.results`, `.totals` attributes. If multiprocessing is invoked, result may be a :class:`corpkit.interrogation.Interrodict` containing corpus names, queries or speakers as keys.
        """
        from interrogator import interrogator
        par = kwargs.pop('multiprocess', None)
        if par and self.subcorpora:
            # with multiprocessing requested, hand over the subcorpora rather
            # than the corpus; only an int value is forwarded as `multiprocess`
            if type(par) == int:
                kwargs['multiprocess'] = par
            return interrogator(self.subcorpora, search, *args, **kwargs)
        else:
            return interrogator(self, search, *args, **kwargs)

    def parse(self, corenlppath = False, operations = False, copula_head = True,
              speaker_segmentation = False, memory_mb = False, *args, **kwargs):
        """
        Parse an unparsed corpus, saving to disk

        :param corenlppath: folder containing corenlp jar files
        :type corenlppath: str

        :param operations: which kinds of annotations to do
        :type operations: str

        :param speaker_segmentation: add speaker name to parser output if your corpus is script-like:
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :Example:

        >>> parsed = corpus.parse(speaker_segmentation = True)
        >>> parsed
        <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

        :raises ValueError: if the corpus datatype is not `'plaintext'`
        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from make import make_corpus
        from corpus import Corpus
        if self.datatype != 'plaintext':
            raise ValueError('parse method can only be used on plaintext corpora.')
        # these two are fixed below, so drop any caller-supplied values
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(make_corpus(self.path, parse = True, tokenise = False,
                                  corenlppath = corenlppath, operations = operations, copula_head = copula_head,
                                  speaker_segmentation = speaker_segmentation, memory_mb = memory_mb, *args, **kwargs))

    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :Example:

        >>> tok = corpus.tokenise()
        >>> tok
        <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

        :raises ValueError: if the corpus datatype is not `'plaintext'`
        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from corpkit import make_corpus
        from corpus import Corpus
        if self.datatype != 'plaintext':
            raise ValueError('tokenise method can only be used on plaintext corpora.')
        # these two are fixed below, so drop any caller-supplied values
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)
        return Corpus(make_corpus(self.path, parse = False, tokenise = True, *args, **kwargs))

    def concordance(self, *args, **kwargs):
        """
        A concordance method for Tregex queries, CoreNLP dependencies,
        tokenised data or plaintext.

        :Example:

        >>> wv = ['want', 'need', 'feel', 'desire']
        >>> corpus.concordance({L: wv, F: 'root'})
        0   01  1-01.txt.xml                But , so I  feel     like i do that for w
        1   01  1-01.txt.xml                         I  felt     a little like oh , i
        2   01  1-01.txt.xml   he 's a difficult man I  feel     like his work ethic
        3   01  1-01.txt.xml                      So I  felt     like i recognized li
        ...                                                                       ...

        Arguments are the same as :func:`~corpkit.interrogation.Interrogation.interrogate`, plus:

        :param only_format_match: if True, left and right window will just be words, regardless of what is in 'show'
        :type only_format_match: bool

        :param random: randomise lines
        :type random: bool

        :param only_unique: only unique lines
        :type only_unique: bool

        :returns: A :class:`corpkit.interrogation.Concordance` instance
        """
        from interrogator import interrogator
        # do_concordancing is forced to 'only' below, so discard caller values
        kwargs.pop('do_concordancing', None)
        kwargs.pop('conc', None)
        return interrogator(self, do_concordancing = 'only', *args, **kwargs)

    def interroplot(self, search, **kwargs):
        """Interrogate, relativise, then plot, with very little customisability. A demo function.

        :Example:

        >>> corpus.interroplot(r'/NN.?/ >># NP')
        <matplotlib figure>

        :param search: search as per :func:`~corpkit.corpus.Corpus.interrogate`; a plain string is wrapped as `{'t': search}`
        :type search: dict/str
        :param kwargs: extra arguments to pass to :func:`~corpkit.corpus.Corpus.interrogate`/:func:`~corpkit.corpus.Corpus.plot`
        :type kwargs: keyword arguments
        :returns: None (but show a plot)
        """
        if type(search) == str:
            search = {'t': search}
        quickstart = self.interrogate(search = search, **kwargs)
        edited = quickstart.edit('%', 'self', print_info = False)
        edited.plot(str(self.path), **kwargs)

    def save(self, savename = False, **kwargs):
        """Save corpus class to file

        >>> corpus.save(filename)

        :param savename: name for the file; defaults to the corpus name
        :type savename: str
        :returns: None
        """
        from corpkit import save
        if not savename:
            savename = self.name
        save(self, savename, savedir = 'data', **kwargs)
from corpus import Corpus
class Subcorpus(Corpus):
    """Model a subcorpus, containing files but no subdirectories.

    Methods for interrogating, concordancing and configurations are the same as
    :class:`corpkit.corpus.Corpus`."""

    def __init__(self, path, datatype):
        """
        :param path: path to the subcorpus folder
        :type path: str
        :param datatype: data type, passed through to :class:`corpkit.corpus.Corpus`
        :type datatype: str
        """
        self.path = path
        kwargs = {'print_info': False, 'level': 's', 'datatype': datatype}
        Corpus.__init__(self, self.path, **kwargs)

    def __str__(self):
        return self.path

    # The class used to define __repr__ twice; the earlier definition (which
    # also reported the number of files) was always overridden by this one,
    # so only the definition that was actually in effect is kept.
    def __repr__(self):
        return "<corpkit.corpus.Subcorpus instance: %s>" % self.name

    def __getitem__(self, key):
        """Get files by slice, integer index or name.

        A name that cannot be found and is not number-like gives None.
        """
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in range(*key.indices(len(self.files)))])
        elif type(key) == int:
            from process import makesafe
            return self.files.__getitem__(makesafe(self.files[key]))
        else:
            try:
                return self.files.__getattribute__(key)
            except Exception:
                # fall back to the 'c'-prefixed name for number-like keys
                from process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)
class File(Corpus):
    """Models a corpus file for reading, interrogating, concordancing"""

    def __init__(self, path, dirname, datatype):
        """
        :param path: name of the file, joined onto `dirname`
        :type path: str
        :param dirname: folder containing the file
        :type dirname: str
        :param datatype: data type handed to :class:`corpkit.corpus.Corpus`;
                         afterwards replaced by one derived from the extension
        :type datatype: str
        """
        from os.path import join
        self.path = join(dirname, path)
        Corpus.__init__(self, self.path, print_info=False, level='f', datatype=datatype)
        # the file extension has the final say on the data type
        detected = 'plaintext'
        for suffix, kind in (('.p', 'tokens'), ('.xml', 'parse')):
            if self.path.endswith(suffix):
                detected = kind
                break
        self.datatype = detected

    def __repr__(self):
        return "<corpkit.corpus.File instance: %s>" % self.name

    def __str__(self):
        return self.path

    @lazyprop
    def document(self):
        """Return the parsed XML of a parsed file"""
        from corenlp_xml.document import Document
        contents = self.read()
        return Document(contents)

    def __getitem__(self, key):
        # NOTE(review): this reads self.subcorpora, exactly as
        # Corpus.__getitem__ does -- confirm that is intended for a file.
        from process import makesafe
        if isinstance(key, slice):
            # expand the slice into explicit indices and look each one up
            indices = range(*key.indices(len(self.subcorpora)))
            return Datalist([self[index] for index in indices])
        if type(key) == int:
            safe_name = makesafe(self.subcorpora[key])
            return self.subcorpora.__getitem__(safe_name)
        try:
            return self.subcorpora.__getattribute__(key)
        except:
            from process import is_number
            if is_number(key):
                return self.__getattribute__('c' + key)

    def read(self, *args, **kwargs):
        """Read file data. If data is pickled, unpickle first

        :returns: str/unpickled data
        """
        if self.datatype != 'tokens':
            with open(self.path, 'r') as handle:
                return handle.read()
        # NOTE: unpickling can execute arbitrary code; only read trusted files
        import pickle
        with open(self.path, "rb") as handle:
            return pickle.load(handle)
class Datalist(object):
    """
    A list-like object containing subcorpora or corpus files.

    Objects can be accessed as attributes, dict keys or by indexing/slicing.

    Methods for interrogating, concordancing and getting configurations are the same as for :class:`corpkit.corpus.Corpus`
    """

    def __init__(self, data):
        """
        :param data: the objects to hold; each one is also attached to this
                     object as an attribute named by `makesafe(item)`
        :type data: list
        """
        self.current = 0
        self.high = len(data) if data else 0
        self.data = data
        if data:
            from process import makesafe
            for subcorpus in data:
                safe_var = makesafe(subcorpus)
                setattr(self, safe_var, subcorpus)

    def __str__(self):
        # one item name per line
        return '\n'.join([i.name for i in self.data])

    def __repr__(self):
        return "<corpkit.corpus.Datalist instance: %d items>" % len(self)

    def __delitem__(self, key):
        # NOTE(review): only the attribute is removed; the object stays in
        # self.data, so len() and iteration are unaffected -- confirm intent.
        self.__delattr__(key)

    def __getitem__(self, key):
        """Get items by slice, integer index or name.

        A name that cannot be found and is not number-like gives None.
        """
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Datalist([self[ii] for ii in range(*key.indices(len(self)))])
        elif type(key) == int:
            # only this branch needs makesafe, so import it here
            from process import makesafe
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except Exception:
                # fall back to the 'c'-prefixed name for number-like keys
                from process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)

    def __setitem__(self, key, value):
        self.__setattr__(key, value)
        # NOTE(review): lstrip removes every leading 'c', and does so for any
        # key starting with 'c', not only 'c'-prefixed numbers -- confirm.
        if key.startswith('c'):
            self.__setattr__(key.lstrip('c'), value)

    def __iter__(self):
        for datum in self.data:
            yield datum

    def __len__(self):
        return len(self.data)

    def __next__(self):
        """Return the next position (an int), counting up from 0.

        :raises StopIteration: once every position below `self.high` (the
                               number of items at construction) has been returned
        """
        # `high` is a length, so the last valid position is high - 1; the
        # earlier `>` test handed out one position too many
        if self.current >= self.high:
            raise StopIteration
        else:
            self.current += 1
            return self.current - 1

    def interrogate(self, *args, **kwargs):
        """Interrogate the corpus using :func:`~corpkit.corpus.Corpus.interrogate`"""
        from interrogator import interrogator
        return interrogator(self, *args, **kwargs)

    def concordance(self, *args, **kwargs):
        """Concordance the corpus using :func:`~corpkit.corpus.Corpus.concordance`"""
        from interrogator import interrogator
        # as in Corpus.concordance: do_concordancing is forced to 'only', so a
        # caller-supplied value must be dropped to avoid a duplicate keyword
        kwargs.pop('do_concordancing', None)
        kwargs.pop('conc', None)
        return interrogator(self, do_concordancing = 'only', *args, **kwargs)

    def configurations(self, search, **kwargs):
        """Get a configuration using :func:`~corpkit.corpus.Corpus.configurations`"""
        from configurations import configurations
        return configurations(self, search, **kwargs)
from corpus import Datalist
class Corpora(Datalist):
    """
    Models a collection of Corpus objects. Methods are available for interrogating and plotting the entire collection. This is the highest level of abstraction available.

    :param data: Corpora to model
    :type data: str (path containing corpora), list (of corpus paths/:class:`corpkit.corpus.Corpus` objects)
    :raises ValueError: if `data` is a string that is not a directory
    """

    def __init__(self, data):
        from corpus import Corpus

        # accept both byte and unicode strings on Python 2, str on Python 3
        try:
            string_types = (basestring,)
        except NameError:
            string_types = (str,)

        # handle a folder containing corpora
        if isinstance(data, string_types):
            import os
            from os.path import join, isdir
            if not os.path.isdir(data):
                raise ValueError('Corpora(str) needs to point to a directory.')
            data = sorted([join(data, d) for d in os.listdir(data)
                           if isdir(join(data, d))])

        # otherwise, make a list of Corpus objects; a new list is built so
        # that the caller's own list is not modified
        data = [Corpus(i) if isinstance(i, string_types) else i for i in data]

        # now turn it into a Datalist
        Datalist.__init__(self, data)

    def __repr__(self):
        return "<corpkit.corpus.Corpora instance: %d items>" % len(self)

    def __getitem__(self, key):
        """allow slicing, indexing

        A name that cannot be found and is not number-like gives None.
        """
        if isinstance(key, slice):
            # Get the start, stop, and step from the slice
            return Corpora([self[ii] for ii in range(*key.indices(len(self)))])
        elif type(key) == int:
            from process import makesafe
            return self.__getitem__(makesafe(self.data[key]))
        else:
            try:
                return self.__getattribute__(key)
            except Exception:
                # fall back to the 'c'-prefixed name for number-like keys
                from process import is_number
                if is_number(key):
                    return self.__getattribute__('c' + key)