-
Notifications
You must be signed in to change notification settings - Fork 2
/
quanteda.py
99 lines (85 loc) · 3.59 KB
/
quanteda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import codecs
import collections
import os
import re
import string
#import nltk
#from pandas import Series, DataFrame
#import pandas as pd
class Corpus(object):
    """A grouping of documents and associated data.

    Attributes:
        documents: list of the document objects held by the corpus.
        words: list initialised empty; no method of this class fills it.
        vocab: list of the distinct word types seen so far by
            make_fdist() / make_dfm().
        fdist: collections.Counter of word frequencies produced by the
            most recent make_fdist() / make_dfm() call.
    """

    def __init__(self, path=None):
        """Create a corpus, optionally loading the files found in *path*.

        Args:
            path: directory to read documents from; None gives an empty
                corpus.
        """
        # Create every attribute up front so they exist regardless of how
        # ``documents`` gets populated.
        self.words = []
        self.vocab = []
        self.fdist = collections.Counter()
        if path is not None:
            self.documents = self.read_docs(path)
        else:
            self.documents = []

    def read_docs(self, path, variables=None):
        """Read every regular file in directory *path* as a Document.

        The documents are returned, not added to the corpus; the caller
        decides what to do with them (``__init__`` stores them in
        ``documents``).

        Args:
            path: directory containing UTF-8 encoded text files.
            variables: optional dict of variables attached to each
                document; every document receives its own copy.

        Returns:
            A list of Document objects, ordered by file name.
        """
        docs = []
        # os.listdir() order is arbitrary; sort so results are reproducible.
        for fname in sorted(os.listdir(path)):
            full_path = os.path.join(path, fname)
            # Sub-directories cannot be opened as text files; skip them.
            if not os.path.isfile(full_path):
                continue
            # The with-block closes the handle even if decoding fails.
            with codecs.open(full_path, encoding='utf-8', mode='r') as f:
                text = f.read()
            # Pass a fresh dict explicitly so documents never share one
            # variables mapping with each other or with the caller.
            doc_vars = {} if variables is None else dict(variables)
            docs.append(Document(text, fname, doc_vars))
        return docs

    def add_docs(self, *args):
        """Append documents to the corpus.

        Each positional argument may be a single document or a list/tuple
        of documents, so ``add_docs(d1, d2)`` and ``add_docs([d1, d2])``
        both add two documents.
        """
        for item in args:
            if isinstance(item, (list, tuple)):
                self.documents.extend(item)
            else:
                self.documents.append(item)

    def preprocess(self):
        """Preprocess every document in the corpus."""
        for doc in self.documents:
            doc.preprocess()

    def _store_fdist(self, fdist):
        """Remember *fdist* and add its not-yet-known word types to vocab."""
        self.fdist = fdist
        known = set(self.vocab)
        self.vocab.extend(w for w in fdist if w not in known)

    def make_fdist(self):
        """Tokenize every document and count word frequencies.

        Uses nltk's ``word_tokenize`` when nltk can be imported and falls
        back to whitespace splitting otherwise. Each document's token list
        is stored on the document as ``words``.

        Returns:
            A collections.Counter mapping word type to frequency across
            the whole corpus (also stored as ``self.fdist``).
        """
        try:
            # Imported lazily: nltk is optional and not imported at module
            # level.
            from nltk.tokenize import word_tokenize as tokenize
        except ImportError:
            def tokenize(text):
                return text.split()
        fdist = collections.Counter()
        for doc in self.documents:
            toks = tokenize(doc.text)
            doc.words = toks
            fdist.update(toks)
        self._store_fdist(fdist)
        return fdist

    def make_dfm(self):
        """Count word frequencies across the corpus.

        nltk.tokenize is quite slow, so this uses ordinary .split().

        Returns:
            A collections.Counter mapping word type to frequency across
            the whole corpus (also stored as ``self.fdist``).
        """
        fdist = collections.Counter()
        for doc in self.documents:
            fdist.update(doc.text.split())
        self._store_fdist(fdist)
        return fdist

    def __str__(self):
        """Return one ``fname``/``variables`` line per document."""
        return "".join(
            "fname: %s variables: %s \n" % (d.fname, d.variables)
            for d in self.documents
        )
class Document(object):
    """A document associated with a dictionary of variables.

    Attributes:
        text: the document's text.
        fname: the name of the file the text came from.
        variables: dict of variables attached to the document.
    """

    # Characters deleted by preprocess(): . , : ; ( ) ? " ' and TAB.
    # NOTE(review): tabs are deleted rather than replaced by a space, so
    # tab-separated words are joined together -- confirm this is intended.
    _PUNCT_RE = re.compile(r"""[.\t,:;()?"']""")

    def __init__(self, text, fname, variables=None):
        """Store the text, its file name and its variables.

        Args:
            text: the document's text.
            fname: name of the source file.
            variables: optional dict of variables; it is stored by
                reference. When omitted the document gets its own new
                empty dict.
        """
        self.text = text
        self.fname = fname
        # A ``{}`` default would be a single dict shared by every Document
        # created without variables, and add_variables() would then leak
        # values between documents.
        self.variables = {} if variables is None else variables

    def __str__(self):
        """Return a one-line ``fname``/``variables`` summary."""
        return "fname: %s variables: %s " % (self.fname, self.variables)

    def preprocess(self):
        """Downcase, remove punctuation and trim surrounding whitespace."""
        lowered = self.text.lower()
        cleaned = self._PUNCT_RE.sub("", lowered)
        # str.strip() returns a new string; the result must be kept.
        self.text = cleaned.strip()

    def add_variables(self, new_vars):
        """Merge the dict *new_vars* into this document's variables."""
        self.variables.update(new_vars)

    def make_dfm(self, target="missing"):
        """Make a word frequency matrix (wordtype:frequency dict).

        Args:
            target: name of the variable returned alongside the counts.

        Returns:
            A tuple ``(feat_dict, value)`` where ``feat_dict`` maps each
            whitespace-separated word to its number of occurrences and
            ``value`` is ``self.variables[target]``.

        Raises:
            KeyError: if the document has no variable named *target*.
        """
        feat_dict = {}
        for w in self.text.split():
            # The first occurrence counts as 1, not 0.
            feat_dict[w] = feat_dict.get(w, 0) + 1
        return (feat_dict, self.variables[target])