# autoWordCloud_updated.py
"""
Word Cloud Generator
=======================================
---Objective --------------------------
Create a web-crawler that ingests a url,
retrieves the words from said url
and generates a word cloud image
---------------------------------------
The Process
1. Create a simple web crawler
2. Save words to a file
3. Process a colored input image
4. Output a wordCloud
---------------------------------------
"""
import os
import pandas as pd
import numpy as np
#imports for web crawling
import urllib2
from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
#imports for manipulating words
import nltk
from nltk.tokenize import sent_tokenize
#imaging stuff
import cv2
import imutils
##imports for wordcloud
from os import path
from PIL import Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
#initialize some arrays you will use
# URLs already fetched; mineWebsite appends to this and checks membership
# so the same article is not crawled twice.
visitedLinks = []
# Links/tokens to skip; checked in mineWebsite but never populated in this
# file — presumably filled in by hand when bad data shows up (TODO confirm).
badList = []
def openPageGetSource(url):
    """Fetch *url* and return the raw page source.

    The body is also cached to 'source.txt' so a later run can work
    offline.  Returns None when the request fails (the original code
    crashed with a NameError in that case because the response variable
    was never assigned).
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking;
        # kept to preserve the original behavior against this site.
        response = requests.get(url, verify=False)
    except Exception as e:
        print(str(e))
        return None
    # save the source code just in case you want to run offline;
    # 'with' guarantees the handle is closed even if a write fails
    with open('source.txt', 'w') as saveFile:
        saveFile.write(response.content)
        saveFile.write('\n')
    return response.content
def getGrayImage(imageFname):
    """Convert a color image under ./imgs into a black-and-white mask.

    Writes two files alongside the input:
      imgs/gray_<name> -- resized grayscale version
      imgs/bw_<name>   -- inverted binary mask for wordcloud (wordcloud
                          places words on black pixels, so the threshold
                          result must be inverted)

    Raises IOError if the image cannot be read.
    """
    imagePath = os.path.join(os.getcwd(), 'imgs', imageFname)
    im_gray = cv2.imread(imagePath, cv2.IMREAD_GRAYSCALE)
    if im_gray is None:
        # cv2.imread signals failure by returning None, not by raising —
        # fail loudly here instead of crashing later in imutils.resize
        raise IOError("could not read image: " + imagePath)
    im_gray = imutils.resize(im_gray, width=1000)  # resize so it's easier to work with
    # Otsu's method picks the threshold automatically; see
    # http://docs.opencv.org/2.4/doc/tutorials/imgproc/threshold/threshold.html
    (thresh, im_bw) = cv2.threshold(im_gray, 50, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # wordcloud assigns words to black space — you need the inverse mask
    mask_inv = cv2.bitwise_not(im_bw)
    # save both stages so they can be inspected / reused
    outF1 = os.path.join(os.getcwd(), "imgs", "gray_" + imageFname)
    outF2 = os.path.join(os.getcwd(), "imgs", "bw_" + imageFname)
    cv2.imwrite(outF1, im_gray)
    cv2.imwrite(outF2, mask_inv)
def getWordCloud(textInput, imageFname):
    """Render a word cloud shaped by an image mask.

    textInput  -- path (relative to cwd) of the text/csv file of words
    imageFname -- image file under ./imgs used as the cloud shape

    Saves the cloud to imgs/wc_<imageFname> and shows both the cloud and
    the mask with matplotlib (plt.show() blocks until the windows close).
    """
    d = os.getcwd()
    # Read the whole text; 'with' closes the handle promptly instead of
    # leaking it as the original open(...).read() did.
    with open(path.join(d, textInput)) as f:
        text = f.read()
    # build the black-and-white mask image from the input picture
    getGrayImage(imageFname)
    img_mask = np.array(Image.open(path.join(d, 'imgs', 'bw_' + imageFname)))
    stopwords = set(STOPWORDS)
    stopwords.add("said")  # add any extra stopwords
    stopwords.add("repositories")
    # inputs for wordcloud
    wc = WordCloud(background_color="white", max_words=20000, mask=img_mask,
                   stopwords=stopwords)
    # generate word cloud and store it to a file
    wc.generate(text)
    wc.to_file(path.join(d, 'imgs', 'wc_' + imageFname))
    # show the cloud, then the mask, in separate figures
    plt.imshow(wc)
    plt.axis("off")
    plt.figure()
    plt.imshow(img_mask, cmap=plt.cm.gray)
    plt.axis("off")
    plt.show()
def mineWebsite(randomURL, imageFname):
    """Crawl a start URL and build a word cloud per followed link.

    For each qualifying link on the start page (same site, hyphenated
    article-style URL, not a category or anchor link, not already
    visited), fetch the page, tokenize the text of its <p> tags with
    nltk, dump the accumulated tokens to outputTest.csv, and render a
    word cloud shaped by *imageFname*.  At most stopNum + 1 links are
    followed; the csv is deleted after each cloud is generated.
    """
    wordArray = []  # running list of tokens harvested from followed pages
    c = 0           # how many links have been followed so far
    # get the start page and parse it
    # (the original wrapped everything in a dead `if c <= 1:` guard —
    # c was just set to 0, so the guard was always true and its else
    # branch unreachable; removed along with the unused aLinks/pContent
    # locals and the commented-out offline stanza)
    sourceCode = openPageGetSource(randomURL)
    soup = BeautifulSoup(sourceCode)
    for a in soup.body.find_all('a', href=True):
        link = a['href']
        # data check: unvisited, on-site, article-looking, no category
        # pages, no in-page anchors (the '#' test was a redundant nested
        # second-level check in the original — folded in here)
        if (link not in visitedLinks and link not in badList
                and "automatemylife.org" in link and "-" in link
                and "category" not in link and "#" not in link):
            print("Found --> " + link)
            # limit switch if you only want a few pages; otherwise this
            # would take every qualifying href from the start page
            stopNum = 3
            if c <= stopNum:
                c += 1
                # follow link to get new source code
                linkContent = openPageGetSource(link)
                linkSC = BeautifulSoup(linkContent)
                print("Fetched source from --> " + link)
                # remember this link so it is never crawled twice
                visitedLinks.append(link)
                print("\n------------- " + link)
                for p in linkSC.find_all('p'):
                    # p.string is None for tags with mixed children;
                    # only plain-text paragraphs are tokenized
                    if p.string is not None:
                        print("Tokenizing....")
                        print(p.string)
                        tokens = nltk.word_tokenize(p.string)
                        for t in tokens:
                            if t not in badList:
                                try:
                                    str(t)  # weeds out ascii failures (Py2)
                                    wordArray.append(t)
                                except Exception as e:
                                    print(str(e))  # ascii fail
                # dump the tokens so getWordCloud can read them back
                # -- this could also be a DB
                DF = pd.DataFrame(wordArray, columns=['Tokens'])
                print(DF)
                DF.to_csv('outputTest.csv')
                # go get the wordCloud based on the input image
                print("Getting wordCloud....")
                getWordCloud("outputTest.csv", imageFname)
                del DF  # release the frame before the next link rewrites it
                os.remove('outputTest.csv')
#inputs to start wordcloud
# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    randomURL = "https://automatemylife.org/how-to-crawl-your-website-and-extract-key-words/"
    imageFname = "cloud.png"
    mineWebsite(randomURL, imageFname)