# Topic model script
# Much of this initial workflow is adapted from Matthew Jockers' Text Analysis with R for Students of Literature
# and Shawn Graham's Ferguson grand jury corpus topic model at https://github.com/shawngraham/ferguson
library(mallet)
library(wordcloud)  # base-graphics word clouds; wordcloud2 builds htmlwidgets, which the jpeg device below cannot render
library(dplyr)
library(tm)
library(Hmisc)
# Import the OCR'd ICC decisions from the text folder;
# each decision is its own text file
decisions <- mallet.read.dir("text")
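# mallet.read.dir() returns a data frame with an id column (the file path) and a text column (the file contents)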
# A better approach is to use the load.r file to provide cleaner texts via the get_real_words function, etc.
# Update: OKstate has changed its ICC file structure and no longer allows wget downloading.
# Use the raw txt files of the decisions and skip load.r.
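# The third argument to mallet.import() is the stopword file; icc.txt holds the project-specific stoplist.
# The token.regexp keeps tokens of three or more characters that begin and end with a letter
# (internal punctuation such as hyphens and apostrophes is allowed).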
mallet.instances <- mallet.import(decisions$id, decisions$text, "icc.txt", token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")
#' create topic trainer object
n.topics <- 100
topic.model <- MalletLDA(n.topics)
#' load documents
topic.model$loadDocuments(mallet.instances)
## Get the vocabulary, and some statistics about word frequencies.
## These may be useful in further curating the stopword list.
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
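# mallet.word.freqs() gives one row per vocabulary word with its corpus-wide term frequency and document frequency
# (column names assumed here, following the code below: words, term.freq, doc.freq; check names(word.freqs) if your mallet version differs)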
# Get additional candidate words for the stoplist by sorting on frequency
# stopwords <- arrange(word.freqs, desc(term.freq))
# head(stopwords, 25)
## Optimize hyperparameters every 20 iterations,
## after 50 burn-in iterations.
topic.model$setAlphaOptimization(20, 50)
## Now train a model. Hyperparameter optimization was enabled above with setAlphaOptimization().
## We can specify the number of iterations; here we use a large-ish round number.
topic.model$train(400)
## Run a few final iterations that pick the single best topic for each token,
## rather than sampling from the posterior distribution.
topic.model$maximize(40)
#' Get the probability of topics in documents and the probability of words in topics.
#' By default, these functions return raw word counts. Here we want probabilities,
#' so we normalize, and add "smoothing" so that nothing has exactly 0 probability.
doc.topics <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
topic.words <- mallet.topic.words(topic.model, smoothed=T, normalized=T) ## adapt Jockers' wordcloud script to use this variable
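# doc.topics is a documents-by-topics matrix of topic proportions;
# topic.words is a topics-by-vocabulary matrix of word probabilities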
#' from http://www.cs.princeton.edu/~mimno/R/clustertrees.R
#' transpose and normalize the doc topics
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
# Build short labels for each topic from its top 7 words
topics.labels <- rep("", n.topics)
for (topic in 1:n.topics) topics.labels[topic] <- paste(mallet.top.words(topic.model, topic.words[topic,], num.top.words=7)$words, collapse=" ")
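# Quick sanity check of the first few labels, e.g.:
# head(topics.labels)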
#output some of the topic model raw data
write.csv(topic.docs, "out/icc-topic-docs.csv")
write.csv(topics.labels, "out/icc-topic-labels.csv")
# Create a data frame with one column per decision (named by id) and one row per topic
topic_docs <- data.frame(topic.docs)
plotdocs <- names(topic_docs)  # default column names (X1, X2, ...); not used again below
names(topic_docs) <- decisions$id
## Cluster topics based on shared words
jpeg(filename="topic-hcluster.jpeg", width = 960, height = 600)
plot(hclust(dist(topic.words)), labels=topics.labels)
dev.off()
#' Calculate similarity matrix
#' Shows which documents are similar to each other
#' by their proportions of topics. Based on Matt Jockers' method
library(cluster)
topic_df_dist <- as.matrix(daisy(t(topic_docs), metric = "euclidean", stand = TRUE))
# Set a row's values to zero if they are greater than the row minimum plus the row standard deviation;
# this keeps only closely related documents and avoids a dense spaghetti diagram
# that's difficult to interpret (hat-tip: http://stackoverflow.com/a/16047196/1036500)
topic_df_dist[ sweep(topic_df_dist, 1, (apply(topic_df_dist,1,min) + apply(topic_df_dist,1,sd) )) > 0 ] <- 0
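# If a network diagram is wanted, the thresholded matrix can be fed to igraph, e.g. (a sketch, not run here):
# library(igraph)
# g <- graph.adjacency(topic_df_dist, weighted = TRUE, mode = "undirected", diag = FALSE)
# plot(g, vertex.size = 3, edge.arrow.size = 0.2)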
# Shawn Graham's topics as wordclouds, with some of Jockers' print-to-file code
jpeg(filename="out/images/icc-topic-%03d.jpeg")
for (i in 1:n.topics) {
  topic.top.words <- mallet.top.words(topic.model, topic.words[i,], 15)
  wordcloud(topic.top.words$words, topic.top.words$weights,
            scale = c(4, .8), rot.per = 0, random.order = FALSE)
}
dev.off()
#create some plots of topics across the range of decisions
library(ggplot2)
library(tidyr)
jpeg(filename="out/images/icc-topics-across-docs-%03d.jpeg")
n.decisions <- nrow(decisions)
cols <- 1:n.decisions
doc_topics <- data.frame(doc.topics, row.names = decisions$id, stringsAsFactors = FALSE)
doc_topics$docs <- cols
topic.cols <- names(doc_topics)
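# doc_topics has one row per decision (in filename order), topic-proportion columns X1..X100, and a docs index column;
# the loop below plots a smoothed trend of each topic's proportion across the decisions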
for (i in 1:n.topics) {
  print(ggplot(doc_topics) + geom_smooth(aes_string(x = "docs", y = topic.cols[i])) + xlab("ICC Decisions in Order"))
}
dev.off()
# print(ggplot(doc_topics, aes(x = docs, y = X30)) + geom_smooth() + ggtitle("Topic 30"))
##Create a term document matrix to plot word similarities
# library(tm)
# library(igraph)
# corpus<- Corpus(VectorSource(decisions$text))
# corpus<- tm_map(corpus, removeWords, stopwords("english"))
# icc.tdm <- TermDocumentMatrix(corpus)
# clean.tdm <- removeSparseTerms(icc.tdm, .995)
# plot(clean.tdm, terms = findFreqTerms(clean.tdm, lowfreq = 6)[1:25], corThreshold = 0.5) # needs Rgraphviz, only for R 3.1
# toi <- "history" # term of interest
# corlimit <- 0.44 # lower correlation bound limit
# assocs <- findAssocs(icc.tdm, toi, corlimit)[[toi]]  # findAssocs() returns a named list in tm >= 0.6
# expert_0.7 <- data.frame(corr = assocs, terms = names(assocs))
# expert_0.7$terms <- factor(expert_0.7$terms, levels = expert_0.7$terms)
# jpeg(filename="wordsim-history.jpeg", width=900, height = 800)
# print(ggplot(expert_0.7, aes(y = terms)) +
#   geom_point(aes(x = corr), data = expert_0.7) +
#   xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
# dev.off()
# toi <- "award" # term of interest
# corlimit <- 0.36 # lower correlation bound limit
# assocs <- findAssocs(icc.tdm, toi, corlimit)[[toi]]  # findAssocs() returns a named list in tm >= 0.6
# expert_0.7 <- data.frame(corr = assocs, terms = names(assocs))
# expert_0.7$terms <- factor(expert_0.7$terms, levels = expert_0.7$terms)
# jpeg(filename="wordsim-award.jpeg", width=900, height = 800)
# print(ggplot(expert_0.7, aes(y = terms)) +
#   geom_point(aes(x = corr), data = expert_0.7) +
#   xlab(paste0("Correlation with the term ", "\"", toi, "\"")))
# dev.off()