/
searchEngine.java
173 lines (141 loc) · 6.55 KB
/
searchEngine.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Kevin Li
// Student #: 260565522
// Dec 6, 2015
import java.util.*;
import java.io.*;
// This class implements a google-like search engine
public class searchEngine {
public HashMap<String, LinkedList<String> > wordIndex; // this will contain a set of pairs (String, LinkedList of Strings)
public directedGraph internet; // this is our internet graph
// Constructor initializes everything to empty data structures
// It also sets the location of the internet files
searchEngine() {
// Below is the directory that contains all the internet files
htmlParsing.internetFilesLocation = "internetFiles";
wordIndex = new HashMap<String, LinkedList<String> > ();
internet = new directedGraph();
} // end of constructor2015
// Returns a String description of a searchEngine
public String toString () {
return "wordIndex:\n" + wordIndex + "\ninternet:\n" + internet;
}
// This does a graph traversal of the internet, starting at the given url.
// For each new vertex seen, it updates the wordIndex, the internet graph,
// and the set of visited vertices.
void traverseInternet(String url) throws Exception{
// Initialize the things we need.
LinkedList<String> content = htmlParsing.getContent(url);
LinkedList<String> links = htmlParsing.getLinks(url);
Iterator<String> content_it = content.iterator();
Iterator<String> links_it = links.iterator();
// 1. Add the vertex to the internet graph.
internet.addVertex(url);
internet.setVisited(url, true);
// 2. Iterate over all words to add them to wordIndex if they're not in wordIndex,
// and LinkedList.addLast the current url to them
while (content_it.hasNext()) {
String s = content_it.next();
if (!wordIndex.containsKey(s)) {
LinkedList<String> new_links = new LinkedList<String>();
new_links.addLast(url);
wordIndex.put(s, new_links);
}
else {
LinkedList<String> old_links = wordIndex.get(s);
old_links.addLast(url);
wordIndex.put(s, old_links);
}
}
// 3. Iterate over all links to add edges and call traverseInternet on them if they're
// not in internet.
while (links_it.hasNext()) {
String l = links_it.next();
boolean vis = internet.getVisited(l);
if (!vis) {
traverseInternet(l);
}
internet.addEdge(url, l);
}
} // end of traverseInternet
// Helper page rank computation function.
double prFunc(String v) {
// Get LinkedList of internet.getEdgesInto(v) and (double)getPageRank on those vertices
// divided by out degree(cast int as double). Sum all of them. Multiply that by 0.5. Add 0.5
LinkedList<String> e = internet.getEdgesInto(v);
Iterator<String> e_it = e.iterator();
double thisPR = 0;
while (e_it.hasNext()) {
String s = e_it.next();
double s_pr = internet.getPageRank(s);
double s_outDegree = internet.getOutDegree(s);
thisPR += (s_pr/s_outDegree);
}
thisPR = (thisPR*0.5) + 0.5;
return thisPR;
}
void computePageRanks() {
// 1. Get LinkedList of vertices, then set all PR of vertices to 1.
// 2. Start 100 iterations for calculating PR of all vertices.
// 3. Loop over each vertex and use the PR formula to calculate its PR.
LinkedList<String> vertices = internet.getVertices();
Iterator<String> v_it = vertices.iterator();
while (v_it.hasNext()) {
String s = v_it.next();
internet.setPageRank(s, 1);
}
for (int i = 0; i < 100; i++) {
LinkedList<String> vxs = internet.getVertices();
Iterator<String> vs = vertices.iterator();
while (vs.hasNext()) {
String a = vs.next();
double pr = prFunc(a);
internet.setPageRank(a, pr);
}
}
} // end of computePageRanks
/* Returns the URL of the page with the high page-rank containing the query word
Returns the String "" if no web site contains the query.
This method can only be called after the computePageRanks method has been executed.
Start by obtaining the list of URLs containing the query word. Then return the URL
with the highest pageRank.
This method should take about 25 lines of code.
*/
String getBestURL(String query) {
try {
LinkedList<String> results = new LinkedList<String>();
results = wordIndex.get(query);
Iterator<String> r = results.iterator();
double best_score = 0;
String best = "";
while (r.hasNext()) {
String current_url = r.next();
double score = internet.getPageRank(current_url);
if (score > best_score) {
best_score = score;
best = current_url;
}
}
best = best + ", p.r. = " + best_score;
return best;
} catch (Exception e) {
return ("No best site for this query.");
}
} // end of getBestURL
public static void main(String args[]) throws Exception{
searchEngine mySearchEngine = new searchEngine();
// to debug your program, start with.
//mySearchEngine.traverseInternet("http://www.cs.mcgill.ca/~blanchem/250/a.html");
// When your program is working on the small example, move on to
mySearchEngine.traverseInternet("http://www.cs.mcgill.ca");
mySearchEngine.computePageRanks();
BufferedReader stndin = new BufferedReader(new InputStreamReader(System.in));
String query;
do {
System.out.print("Enter query: ");
query = stndin.readLine();
if ( query != null && query.length() > 0 ) {
System.out.println("Best site = " + mySearchEngine.getBestURL(query));
}
} while (query!=null && query.length()>0);
} // end of main
}