|
1 | 1 | package SearchPackage;
|
2 |
| -import java.util.*; |
3 |
| -import java.io.*; |
| 2 | + |
| 3 | +import java.util.*; |
| 4 | +import java.io.*; |
| 5 | + |
4 | 6 | import org.jsoup.Jsoup;
|
| 7 | + |
5 | 8 | import java.util.regex.Pattern;
|
6 | 9 | import java.util.regex.Matcher;
|
| 10 | + |
7 | 11 | import org.jsoup.nodes.Document;
|
8 | 12 |
|
9 | 13 | public class Indexer {
|
10 |
| - public static void main(String[]args){ |
| 14 | + List<String> stopWords; |
| 15 | + |
| 16 | + public static void main(String[] args) { |
11 | 17 | // String s=Parsing("../input.txt");
|
12 | 18 | // Vector<String> ll;
|
13 | 19 | // ll= splitWords(s);
|
14 | 20 | // for(int i=0;i<ll.size();i++){
|
15 | 21 | // System.out.println(ll.get(i));
|
16 | 22 | // }
|
17 | 23 | }
|
18 |
| - public static String Parsing(String input){ |
| 24 | + |
| 25 | + public static String Parsing(String input) { |
19 | 26 | String lines = "";
|
20 |
| - StringBuilder Str=new StringBuilder(); |
| 27 | + StringBuilder Str = new StringBuilder(); |
21 | 28 | try {
|
22 | 29 | BufferedReader reader = new BufferedReader(new FileReader(input));
|
23 | 30 |
|
24 | 31 | while ((lines = reader.readLine()) != null) {
|
25 | 32 | Str.append(lines);
|
26 | 33 | }
|
27 | 34 | reader.close();
|
28 |
| - lines=Str.toString(); |
29 |
| - Document html= Jsoup.parse(lines); |
30 |
| - lines= html.text(); |
| 35 | + lines = Str.toString(); |
| 36 | + Document html = Jsoup.parse(lines); |
| 37 | + lines = html.text(); |
| 38 | + } catch (IOException e) { |
| 39 | + e.printStackTrace(); |
31 | 40 | }
|
32 |
| - catch (IOException e){ |
33 |
| - e.printStackTrace();} |
34 | 41 | return lines;
|
35 | 42 | }
|
36 |
| - public static Vector<String> splitWords(String Lines){ |
37 |
| - Vector<String> words =new<String> Vector(); |
| 43 | + |
| 44 | + public static Vector<String> splitWords(String Lines) { |
| 45 | + Vector<String> words = new <String>Vector(); |
38 | 46 | Pattern pattern = Pattern.compile("\\w+");
|
39 | 47 | Matcher match = pattern.matcher(Lines);
|
40 |
| - while (match.find()){ |
| 48 | + while (match.find()) { |
41 | 49 | words.add(match.group());
|
42 | 50 | }
|
43 | 51 | return words;
|
44 | 52 | }
|
| 53 | + |
| 54 | + |
| 55 | + private void readStopWords() throws IOException { |
| 56 | + BufferedReader reader = new BufferedReader(new FileReader("stopwords.txt")); |
| 57 | + stopWords = new Vector<String>(); |
| 58 | + String word; |
| 59 | + while ((word = reader.readLine()) != null) { |
| 60 | + stopWords.add(word); |
| 61 | + } |
| 62 | + } |
| 63 | + |
| 64 | + private void removeStopWords(List<String> words) { |
| 65 | + words.removeAll(stopWords); |
| 66 | + } |
45 | 67 | }
|
0 commit comments