/
Scrape Descriptions.R
79 lines (61 loc) · 2.25 KB
/
Scrape Descriptions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
require(stringr)
require(rvest)
require(magrittr)
require(koRpus)
require(openNLP)
require(tm)
files <- dir("data/indeed/", pattern = "*.html", full.names = TRUE)
descriptions <-c() #create a vector to store the results
for (i in 1:length(files)){
new <- read_xml(files[i]) %>%
html_nodes("description") %>% html_text()
descriptions <- c(descriptions,new)
}
pos <- function(ele){
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
annotations <- annotate(ele, list(sent_token_annotator, word_token_annotator))
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
pos_tag_annotator
pos <- annotate(ele, pos_tag_annotator, annotations)
return (pos$features[[2]]$POS == "NN")
}
#get rid of the tags#
for (i in 1:length(descriptions)){
if (!is.na(str_extract(descriptions[i],".*\\."))){
descriptions[i] = str_extract(descriptions[i],".*\\.")
}
}
#remove duplicate entries
descriptions <- unique(descriptions)
corp <- Corpus(VectorSource(descriptions))
#dtm = DocumentTermMatrix(corp,control=list(removePunctuation=TRUE,removeNumbers=TRUE,stemming=TRUE))
corp<-tm_map(corp,content_transformer(removeNumbers)) #convert to lower case#
corp<-tm_map(corp,removePunctuation) #remove punctuation#
#corp <- tm_map(corp, stemDocument)
corp <-tm_map(corp, removeWords, stopwords('english'))
dtm <- as.matrix(DocumentTermMatrix(corp))
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
annotations <- annotate(colnames(dtm), list(sent_token_annotator, word_token_annotator))
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
pos_tag_annotator
pos <- annotate(colnames(dtm), pos_tag_annotator, annotations)
non <- c()
for (i in 2:length(pos)){
if (pos$features[[i]]$POS != "NN"){
non <- c(non,i-1)
}
}
irrelevant <- c()
for (i in 1:dim(dtm)[2]){
if (str_detect(colnames(dtm)[i],"job")||str_detect(colnames(dtm)[i],"http")||str_detect(colnames(dtm)[i],"indeed")){
irrelevant <- c(irrelevant,i)
}
}
dtm <- dtm[,-c(non,irrelevant)]
#Source: http://www.r-bloggers.com/text-mining/#
v = sort(colSums(dtm), decreasing=TRUE);
myNames = names(v);
d = data.frame(word=myNames, freq=v)
wordcloud(d$word, colors=c(3,4), random.color=FALSE, d$freq,max.words=200)