/
main.py
172 lines (123 loc) · 4.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
'''
main.py
Casey Burns
6/30/2016
- This program searches Wikipedia pages for the first clickable
link in the body of each page. It follows this hyperlink until
either diverging or reaching the Philosophy page.
- To run:
$ python main.py [max path length] [iterations] [-vo]
(the three arguments are optional)'''
import requests as reqs
import time
import sys
import common
# Wall-clock reference taken at start-up; used to report total runtime
startTime = time.time()

# Maps each initial random article title to its path length
randomArticles = dict()

# Maps each visited article URL title to its stored path length; lets
# later searches stop early instead of requesting the same pages again.
# A value of -1 marks an article on a divergent path.
visitedArticles = dict()

# URL titles followed during the search currently in progress; reset
# once that search has been resolved
currentPath = list()

# Visual (verbose) output switch; off by default
vo = False
def searchForHyperlink(respTxt, idx, mpl, randomTitle):
'''The searchForHyperlink function is a recursive function that
takes a string argument respTxt, an integer argument idx, the max
path length mpl, and the random article title that we are trying to
find the path for. It calls getURLTitle to retrieve the next URL,
HTML requests this URL, and recursively calls itself if need be. Upon
completion, it decides if the path converges to Philosophy or not.'''
global randomArticles
global visitedArticles
global currentPath
global vo
# URL for non-random wiki pages
wikiURL = 'https://en.wikipedia.org/wiki/'
urlTitle = common.getURLTitle(respTxt[idx:])
if vo:
print 'Followed URL:', wikiURL + urlTitle
if urlTitle.lower() == 'philosophy':
# This path has converged to philosophy and is also a new path. Add the path
# lenghts to visitedArticles and randomArticles
if vo:
print '>>>CONVERGENT<<<'
print 'Path Length:', len(currentPath) + 1
randomArticles[randomTitle] = len(currentPath) + 1
addFullVisitedPath()
currentPath = []
return False
elif urlTitle in visitedArticles and urlTitle != '' and visitedArticles[urlTitle] != -1:
# We've visited this page before and it converges
if vo:
print '>>>CONVERGENT<<<'
print 'Path Length:', len(currentPath) + visitedArticles[urlTitle] + 1
randomArticles[randomTitle] = len(currentPath) + visitedArticles[urlTitle] + 1
addVisitedPath(visitedArticles[urlTitle])
currentPath = []
return False
elif len(currentPath) < mpl and urlTitle not in currentPath and urlTitle != '' and urlTitle not in visitedArticles:
# Call searchForHyperlink function again with new URL
# Starting index idx will be at least 204 due to beginning of HTML file
currentPath.append(urlTitle)
response = reqs.get(wikiURL + urlTitle)
diverges = searchForHyperlink(response.text, 204, mpl, randomTitle)
return diverges
else:
# Path diverges
if vo:
print '>>>DIVERGENT<<<'
addDivergentPath()
currentPath = []
return True
def addFullVisitedPath():
    '''addFullVisitedPath stores a path length in visitedArticles for
    every page in the current path, none of which has been visited
    before. The first page gets len(currentPath), each following page
    one less, and the last page gets 1. The current path list is
    emptied in place.'''
    global currentPath
    global visitedArticles
    remaining = len(currentPath)
    for title in currentPath:
        visitedArticles[title] = remaining
        remaining -= 1
    # Clear the same list object rather than rebinding the name
    del currentPath[:]
    return
def addVisitedPath(addLength):
    '''addVisitedPath stores a path length in visitedArticles for every
    page in the current path, when that path ended by linking to a page
    that has been visited before. addLength is the stored path length
    of that previously visited page. The current path list is emptied
    in place.'''
    global currentPath
    global visitedArticles
    # The last page in the path links straight to the known page, so it
    # is addLength + 1 links from the target; each earlier page is one
    # link further. The first page is therefore len(currentPath) +
    # addLength links away. (The extra +1 belongs only to the random
    # start article, which is not part of currentPath.) This matches
    # addFullVisitedPath, where the last page before the target gets 1.
    distance = len(currentPath) + addLength
    for title in currentPath:
        visitedArticles[title] = distance
        distance -= 1
    # Clear the same list object rather than rebinding the name
    del currentPath[:]
    return
def addDivergentPath():
    '''addDivergentPath marks every URL title in the current path as
    divergent by storing it in visitedArticles with a value of -1.
    The current path itself is left unchanged.'''
    global currentPath
    global visitedArticles
    visitedArticles.update(dict.fromkeys(currentPath, -1))
    return
if __name__ == '__main__':
# Input syntax:
# $ python main.py [maxPathLength] [iterations] [-vo]
maxPathLength, iterations, vo = common.parseCommandLineArgs(sys.argv)
# Input stored path lengths from storedPathLengths.txt
# URL for random wiki page
randomWikiURL = 'https://en.wikipedia.org/wiki/Special:Random'
for i in range(iterations):
# Request random URL HTML
response = reqs.get(randomWikiURL)
# Get title of random article
randomTitle, currentIdx = common.getArticleTitle(response.text)
randomArticles[randomTitle] = 0
if vo:
print 'Random Article:', randomTitle
# Search response text for next URL
diverges = searchForHyperlink(response.text, currentIdx, maxPathLength, randomTitle)
if diverges:
randomArticles[randomTitle] = -1
dc, cc, histo = common.calculateStats(randomArticles)
runtime = time.time() - startTime
common.printStats(dc, cc, histo, iterations, maxPathLength, runtime)