# -*- coding: utf-8 -*-
import unicodecsv
import argparse
import os
from nltk import RegexpTokenizer
from lib.utils import *
from itertools import chain
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import dateutil.relativedelta
import sys
list_of_stemmer_choices = ["none", "porter", "porter2", "lemma"]
list_of_model_choices = ["lda", "dtm"]
# list_of_dictionary_choices = ["none", "technology", "automotive", "english", "extended-technology", "extended-automotive"]
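# Command-line interface. Numeric options are kept as strings: they are passed
# through to the lib.utils helpers and concatenated into output filenames below.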
parser = argparse.ArgumentParser(description='run LDA on an input csv file.')
parser.add_argument('-i','--input',dest="filename", help='input CSV file', required=True)
parser.add_argument('-s','--stemmer', help='pick stemmer', default="lemma", choices=list_of_stemmer_choices)
parser.add_argument('-ni','--num_iter', help='number of iterations', default="50")
parser.add_argument('-ntw','--num_top_words', help='number of top_words', default="8")
parser.add_argument('-nt','--num_topics', help='number of topics', default="10")
parser.add_argument('-m', '--model', help='model used', default="lda", choices=list_of_model_choices)
parser.add_argument('-d', '--dictionary', help='dictionary used', nargs='+', default=['english'])
parser.add_argument('-ip', '--input_field', help='field_used_to_perform_lda', default='contents')
parser.add_argument('-o', '--override', action='store_true')
parser.add_argument('-as', '--add_to_stopwords', nargs='+', help='input file to add to stopwords')
parser.add_argument('-f', '--frequency', default="1Y")
parser.add_argument('-l', '--logging', action='store_true')
# parser.add_argument('-ad', '--add_to_dictionary', help='input file to add to dictionary')
# parser.add_argument('-sl', '--slice', help='slice of data', default=float("inf"))
args = parser.parse_args()
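# All inputs and outputs live in fixed subdirectories of the working directory.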
base_dir = os.getcwd()
model_dir = os.path.join(base_dir, 'models/')
dataset_dir = os.path.join(base_dir, 'datasets/')
dictionary_dir = os.path.join(base_dir, 'dictionaries/')
executable_dir = os.path.join(base_dir, 'executables/')
tagged_dataset_dir = os.path.join(base_dir, 'tagged_datasets/')
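# Helpers from lib.utils: derive the model filename from the argument
# combination, load the word dictionary, and extend the stopword list.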
model_filename = os.path.join(model_dir, get_model_with_arguments_filename(args))
is_english_word = load_from_dictionary(args.dictionary)
add_to_stopwords(args.add_to_stopwords)
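# Optionally tee stdout to a log file (the log filename is hardcoded below).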
if args.logging:
    class Logger(object):
        def __init__(self, filename="Default.log"):
            self.terminal = sys.stdout
            self.log = open(filename, "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message.encode("utf-8"))

        def flush(self):
            pass

    sys.stdout = Logger("log_soya_6months.txt")
print(model_filename)
print "[INFO] Input File :", args.filename
print "[INFO] Stemmer :", args.stemmer
print "[INFO] Number of iterations :", args.num_iter
print "[INFO] Number of topics :", args.num_topics
print "[INFO] Number of top words :", args.num_top_words
print "[INFO] Model used :", args.model
print "[INFO] Dictionary used :", (',').join(args.dictionary)
print "[INFO] Input Field Used :", args.input_field
if args.add_to_stopwords:
print "[INFO] Added to stopwords :", (',').join(args.add_to_stopwords)
# if args.add_to_dictionary:
# print "[INFO] Added to dictionary :", args.add_to_dictionary
dataset_filepath = os.path.join(dataset_dir, args.filename)
csv = pd.read_csv(dataset_filepath, encoding='utf-8', parse_dates=['date'])
csv = csv.set_index("date").sort_index()
temp = csv.groupby(pd.Grouper(freq=args.frequency))
weeksList = [temp.get_group(x) for x in temp.groups]
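# Fit one topic model per time slice.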
for csv in weeksList:
    # A slice needs at least two documents to be worth modelling.
    if len(csv) <= 1:
        continue
    starting_date = str(csv.iloc[0].name.date())
    print "Current Starting Date: " + starting_date
    # print "Started training at " + str(datetime.now())
    contents = csv[args.input_field]
    titles = csv["title"]
    dates = csv.index
    # texts = preprocess(contents, args.stemmer, is_english_word)
    (texts, tokens) = preprocess(contents, args.stemmer, is_english_word)
    generate_allwords(texts, args)
    dictionary = corpora.Dictionary(texts)
    # remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
    dictionary.filter_extremes(no_below=1, no_above=0.8)
    my_corpus = [dictionary.doc2bow(text) for text in texts]
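    # Reuse a previously saved model unless --override forces regeneration.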
    try:
        model = load_model(model_filename, args.model)
        if args.override:
            raise IOError("Override flag set")
    except IOError:
        print "Generating model ..."
        model = generate_model(args.model, my_corpus, dictionary, args.num_topics, args.num_iter, dates, timedelta(days=7))
        model.save(model_filename)
    show_topics(args.model, model, args.num_topics, args.num_top_words, titles, my_corpus)
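    # Encode the run configuration into the output filename.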
    output_csv_filename = args.num_iter + "_iter_" + args.num_topics + "_topics_" + ','.join(args.dictionary) + "_dictionary_" + args.filename
    if args.add_to_stopwords:
        output_csv_filename = "stopwords-" + ','.join(args.add_to_stopwords) + "_" + output_csv_filename
    if args.frequency != '1Y':
        output_csv_filename = starting_date + "_" + args.frequency + "_" + output_csv_filename
    # TODO: implement a proper save function
    output_dataset_path = os.path.join(tagged_dataset_dir, output_csv_filename)
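    # Only the plain LDA model writes a tagged csv.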
    if args.model == "lda":
        print "Saving changes to csv ... ",
        try:
            # An existing tagged csv means this slice was already processed.
            f = open(output_dataset_path)
            f.close()
            if args.override:
                raise IOError("Override flag set")
            print "found, skipped"
        except IOError:
            save(model, my_corpus, csv, output_dataset_path)
            print "Done"