This repository has been archived by the owner on Oct 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
lda.py
69 lines (56 loc) · 1.89 KB
/
lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#lda technique to extract keywords
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import nltk
#nltk.download('wordnet')
stemmer = SnowballStemmer('english')
def lemmatize(text):
    """Normalize a single token: verb-lemmatize it, then Snowball-stem the lemma."""
    # pos='v' asks WordNet to treat the token as a verb when lemmatizing.
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
def preprocess(text):
    """Tokenize *text* and return a list of normalized (lemmatized + stemmed) tokens.

    Tokens are lowercased by simple_preprocess; stopwords and tokens of
    length <= 3 are discarded before normalization via lemmatize().
    """
    # Fix: the original appended with a stray trailing comma, building and
    # discarding a one-element tuple each iteration. Use the names imported
    # at the top of the file instead of re-spelling the full module paths.
    return [
        lemmatize(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]
#print(lemmatize('missing'))
# example = 'I have to write this paper for this class'
# words = []
# for word in example.split(' '):
# words.append(word)
# print(words)
# print(preprocess(example))
# Read the sample document and normalize it into a single token list.
with open('sample4.txt', 'r') as file:
    data = file.read()
data = preprocess(data)
data = [data]  # gensim expects a corpus: a list of documents (token lists)

# Map each unique token to an integer id.
dictionary = gensim.corpora.Dictionary(data)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
bow_corpus = [dictionary.doc2bow(text) for text in data]

# Fix: the original rebuilt the identical corpus a second time as
# `hashed_corpus` before training; reuse bow_corpus instead. The unused
# document_num / bow_doc_x locals are dropped.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=1,
                                       id2word=dictionary, passes=1,
                                       workers=1)

# Print the words (and weights) that make up each discovered topic.
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")