import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
 * Crawls Wikipedia pages and generates a graph, starting at a user-specified starting page and
 * including only pages that contain all of the user-specified keywords. The graph is recorded
 * in a text file as the number of edges followed by a list of edges. Each line represents an
 * edge, and each edge is a space-separated pair of vertex names.
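 *
 * For example, a crawl that recorded three edges might produce a file like this (hypothetical
 * page names):
 * <pre>
 * 3
 * /wiki/A /wiki/B
 * /wiki/A /wiki/C
 * /wiki/B /wiki/C
 * </pre>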
*
 * Pages are crawled and collected until the maximum number of pages has been collected, where
 * the maximum is user-specified. Once this limit has been reached, edges in the graph continue
 * to be generated, but only between previously recorded vertices. Any URLs to previously unseen
 * pages are ignored once the max page limit has been reached.
*
 * The above ensures a hard upper limit on the number of vertices and prevents the situation
 * where there are edges to new pages that were never completely crawled.
* @author Alex Shum
*/
public class WikiCrawler {
private static final String BASE_URL = "http://en.wikipedia.org";
private LinkedList<String> toVisit; //link queue
private Set<String> visitedURL; //visited URL that does not contain search terms
private Set<String> visitedUsefulURL; //visited URL that contains search terms
private List<String> edges;
private String seedURL; //start url
private String[] keywords; //search words
private int max; //max number of pages
private int numCrawled; //number of pages that contain search words
private int pagesRequested;
private String fileName;
/**
     * Creates a new WikiCrawler object to crawl Wikipedia pages and generate a graph file.
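     *
     * A typical invocation might look like this (the seed page, keywords, and file name here
     * are hypothetical):
     * <pre>
     * WikiCrawler wc = new WikiCrawler("/wiki/Computer_science",
     *         new String[] {"algorithm", "complexity"}, 100, "WikiCS.txt");
     * wc.crawl();
     * </pre>
     *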
     * @param seedURL Start URL; must be a relative Wikipedia URL: "/wiki/Title_of_Article".
* @param keywords Array of search terms to find.
* @param max Maximum number of unique pages.
* @param fileName Name of file to record graph edges.
*/
public WikiCrawler(String seedURL, String[] keywords, int max, String fileName) {
toVisit = new LinkedList<String>();
visitedURL = new HashSet<String>();
visitedUsefulURL = new HashSet<String>();
edges = new LinkedList<String>();
this.seedURL = seedURL;
this.keywords = keywords;
this.max = max;
numCrawled = 0;
pagesRequested = 0;
this.fileName = fileName;
}
/**
     * Starts the Wikipedia page crawling process from the seedURL.
     * @throws InterruptedException if the thread is interrupted while sleeping.
*/
    public void crawl() throws InterruptedException {
        //check if the seed URL contains the search terms
        SimpleWikiContentParser c = new SimpleWikiContentParser(BASE_URL + seedURL, keywords);
        pagesRequested++; //the seed page counts as a request either way
        if(c.pageContainsAllTerms()) {
            toVisit.add(seedURL);
            visitedUsefulURL.add(seedURL);
            numCrawled++;
        } else { //seed page lacks the search terms; remember it so links back to it are skipped
            visitedURL.add(seedURL);
        }
//begin crawl
boolean keepCrawling = true;
while(keepCrawling) {
keepCrawling = crawlNext();
}
//record results
try {
writeToFile(fileName);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
     * Crawls the next page in the queue. Collects new links until the max limit is reached;
     * afterwards, it continues to add edges to the graph, but only between previously seen
     * vertices.
     *
     * This method automatically waits 2 seconds after every 200 page requests.
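     *
     * For example, with max = 2: once two keyword-matching pages have been collected, a link
     * from one of them to a third, unseen page is ignored, while a link back to the other
     * collected page still becomes an edge.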
*
     * @warning Errors are handled by SimpleWikiLinkParser and SimpleWikiContentParser.
     * Any problem with a bad link or connectivity results in trying the next available URL.
*
     * @return true if there are links left to crawl in the queue; false otherwise.
     * @throws InterruptedException if the thread is interrupted while sleeping.
*/
public boolean crawlNext() throws InterruptedException {
if(toVisit.isEmpty()) return(false);
String fromPage = toVisit.poll();
System.out.println("fromPage: " + BASE_URL + fromPage);
SimpleWikiLinkParser s = new SimpleWikiLinkParser(BASE_URL + fromPage);
Set<String> newLinks = s.getLinks();
pagesRequested++;
SimpleWikiContentParser c;
for(String l : newLinks) {
System.out.println("collected: " + numCrawled + ", Left in Queue: " + toVisit.size() + " toPage: " + BASE_URL + l); //TODO
if(visitedUsefulURL.contains(l) && !fromPage.equalsIgnoreCase(l)) { //previously seen this page and it has keyword matches
edges.add(fromPage + " " + l);
} else if(visitedURL.contains(l) || fromPage.equalsIgnoreCase(l)) { //previously seen this page, no keyword matches
//do nothing
            } else if(numCrawled < max) { //new unseen page and still need to crawl more pages
                if(pagesRequested % 200 == 0) Thread.sleep(2000); //politeness delay before every 200th request
                c = new SimpleWikiContentParser(BASE_URL + l, keywords);
                pagesRequested++;
if(c.pageContainsAllTerms()) {
toVisit.add(l);
edges.add(fromPage + " " + l);
visitedUsefulURL.add(l);
numCrawled++;
} else {
visitedURL.add(l);
}
} else { //done getting new pages
//do nothing
}
}
return(true);
}
/**
     * Writes the number of edges followed by the list of edges to the text file.
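     *
     * A minimal sketch of reading the file back in, assuming the format above (the reader
     * below is hypothetical and not part of this class):
     * <pre>
     * Scanner in = new Scanner(new File(fileName));
     * int numEdges = Integer.parseInt(in.nextLine().trim());
     * for(int i = 0; i < numEdges; i++) {
     *     String[] edge = in.nextLine().split(" "); //edge[0] -> edge[1]
     * }
     * in.close();
     * </pre>
     *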
* @param fileName Name of text file.
* @throws IOException If file cannot be opened, created or written to.
*/
    private void writeToFile(String fileName) throws IOException {
        //try-with-resources ensures the writer is closed even if a write fails
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(fileName))) {
            bw.write(edges.size() + System.lineSeparator()); //number of edges first, per the documented format
            for(String s : edges) {
                bw.write(s + System.lineSeparator());
            }
        }
    }
}