Skip to content

Commit 0a5e432

Browse files
remove stop words
1 parent 773eac5 commit 0a5e432

File tree

2 files changed

+464
-13
lines changed

2 files changed

+464
-13
lines changed

src/SearchPackage/Indexer.java

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,67 @@
11
package SearchPackage;
2-
import java.util.*;
3-
import java.io.*;
2+
3+
import java.util.*;
4+
import java.io.*;
5+
46
import org.jsoup.Jsoup;
7+
58
import java.util.regex.Pattern;
69
import java.util.regex.Matcher;
10+
711
import org.jsoup.nodes.Document;
812

913
public class Indexer {
10-
public static void main(String[]args){
14+
List<String> stopWords;
15+
16+
public static void main(String[] args) {
1117
// String s=Parsing("../input.txt");
1218
// Vector<String> ll;
1319
// ll= splitWords(s);
1420
// for(int i=0;i<ll.size();i++){
1521
// System.out.println(ll.get(i));
1622
// }
1723
}
18-
public static String Parsing(String input){
24+
25+
public static String Parsing(String input) {
1926
String lines = "";
20-
StringBuilder Str=new StringBuilder();
27+
StringBuilder Str = new StringBuilder();
2128
try {
2229
BufferedReader reader = new BufferedReader(new FileReader(input));
2330

2431
while ((lines = reader.readLine()) != null) {
2532
Str.append(lines);
2633
}
2734
reader.close();
28-
lines=Str.toString();
29-
Document html= Jsoup.parse(lines);
30-
lines= html.text();
35+
lines = Str.toString();
36+
Document html = Jsoup.parse(lines);
37+
lines = html.text();
38+
} catch (IOException e) {
39+
e.printStackTrace();
3140
}
32-
catch (IOException e){
33-
e.printStackTrace();}
3441
return lines;
3542
}
36-
public static Vector<String> splitWords(String Lines){
37-
Vector<String> words =new<String> Vector();
43+
44+
public static Vector<String> splitWords(String Lines) {
45+
Vector<String> words = new <String>Vector();
3846
Pattern pattern = Pattern.compile("\\w+");
3947
Matcher match = pattern.matcher(Lines);
40-
while (match.find()){
48+
while (match.find()) {
4149
words.add(match.group());
4250
}
4351
return words;
4452
}
53+
54+
55+
private void readStopWords() throws IOException {
56+
BufferedReader reader = new BufferedReader(new FileReader("stopwords.txt"));
57+
stopWords = new Vector<String>();
58+
String word;
59+
while ((word = reader.readLine()) != null) {
60+
stopWords.add(word);
61+
}
62+
}
63+
64+
private void removeStopWords(List<String> words) {
65+
words.removeAll(stopWords);
66+
}
4567
}

0 commit comments

Comments
 (0)