/
create-corpora.py
executable file
·37 lines (28 loc) · 1.16 KB
/
create-corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
import numpy as np
import os
# Build one "corpus.txt" per author: every statement becomes a line of
# vocabulary indices. Written so it runs unchanged on Python 2 and Python 3
# (single-argument print calls, text-mode file handling).

STATEMENTS_CSV = 'static/fun_statements_subset.csv'
VOCAB_PATH = 'static/vocab.txt'
CORPORA_ROOT = 'corpora'
PROGRESS_EVERY = 10  # print a progress line every N statements


def load_vocab(path=VOCAB_PATH):
    """Return the whitespace-separated tokens found on the first line of *path*.

    Only the first line is read, matching the layout the script has always
    expected (the whole vocabulary on a single line).
    """
    with open(path) as vocab_file:
        return vocab_file.readline().split()


def build_vocab_index(vocab):
    """Return a dict mapping each word to its first position in *vocab*.

    ``setdefault`` keeps the first occurrence of a duplicated word, which is
    the same index ``list.index`` would report, while making each lookup a
    constant-time dict access instead of a linear scan.
    """
    word_to_index = {}
    for position, word in enumerate(vocab):
        word_to_index.setdefault(word, position)
    return word_to_index


def corpus_dir_for(author_id):
    """Return the output directory for *author_id*.

    Everything from the first '.' of ``str(author_id)`` onwards is dropped,
    so an id that pandas parsed as ``12.0`` maps to ``corpora/12``.
    """
    return os.path.join(CORPORA_ROOT, str(author_id).split('.')[0])


def encode_statement(text, word_to_index):
    """Return the corpus line for *text*.

    Each whitespace-separated word is replaced by its vocabulary index
    followed by a single space; the line ends with a newline. An empty
    statement therefore yields just ``'\\n'``.

    Raises:
        ValueError: if a word is missing from *word_to_index*.
    """
    try:
        indices = [str(word_to_index[word]) for word in text.split()]
    except KeyError as err:
        raise ValueError('%r is not in the vocabulary' % (err.args[0],))
    return ''.join(index + ' ' for index in indices) + '\n'


def main():
    """Append every author's encoded statements to corpora/<author>/corpus.txt."""
    statements = pd.read_csv(STATEMENTS_CSV)
    word_to_index = build_vocab_index(load_vocab())
    by_author = statements.groupby('author_id')
    num_authors = len(by_author.groups)

    for count, (author_id, group) in enumerate(by_author, 1):
        print("processing text for author id # %s" % (author_id,))
        print("author # %s  / %s" % (count, num_authors))

        corpus_dir = corpus_dir_for(author_id)
        if not os.path.exists(corpus_dir):
            print("path does not exist for author, creating path...")
            os.makedirs(corpus_dir)

        total = len(group)
        # Open the output once per author (append mode, as before) and let
        # the context manager close it even if a lookup fails mid-way.
        with open(os.path.join(corpus_dir, 'corpus.txt'), 'a') as out:
            for seen, text in enumerate(group['cleaned_statement'], 1):
                if seen % PROGRESS_EVERY == 0:
                    print("analyzed  %s %% of statements for author # %s"
                          % (float(seen) / total * 100, author_id))
                out.write(encode_statement(text, word_to_index))


if __name__ == '__main__':
    main()