-
Notifications
You must be signed in to change notification settings - Fork 2
/
AnsjSegmenterUtil.java
48 lines (42 loc) · 1.49 KB
/
AnsjSegmenterUtil.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
package com.pt.ml.process;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;
import com.pt.ml.data.StopWords;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Word-segmentation helpers built on Ansj's {@code ToAnalysis} analyzer.
 *
 * <p>Every method returns the {@code getName()} text of the terms produced by
 * the analyzer; no other term attribute is exposed by this class.
 */
public class AnsjSegmenterUtil {

    /**
     * Segments {@code text} and returns the name of every non-blank term.
     *
     * @param text the text to segment; {@code null} or blank input yields an empty list
     * @return a new, mutable list of term names in the order the analyzer produced them
     */
    public static List<String> getWords(String text) {
        return segment(text, false);
    }

    /**
     * Segments {@code text} like {@link #getWords(String)}, additionally dropping
     * every term for which {@code StopWords.isZhStopWords} returns {@code true}.
     *
     * @param text the text to segment; {@code null} or blank input yields an empty list
     * @return a new, mutable list of the remaining term names in analyzer order
     */
    public static List<String> getWordsRemovedStopWords(String text) {
        return segment(text, true);
    }

    /**
     * Shared implementation behind both public entry points.
     *
     * @param text          the text to segment, may be {@code null}
     * @param dropStopWords whether terms reported by {@code StopWords.isZhStopWords} are skipped
     * @return a new, mutable list of the accepted term names
     */
    private static List<String> segment(String text, boolean dropStopWords) {
        List<String> words = new ArrayList<>();
        // Nothing to segment: return early instead of handing null/blank text to the analyzer.
        if (StringUtils.isBlank(text)) {
            return words;
        }
        for (Term term : ToAnalysis.parse(text)) {
            String word = term.getName();
            if (StringUtils.isBlank(word)) {
                continue;
            }
            // The stop-word lookup only runs when filtering was requested.
            if (dropStopWords && StopWords.isZhStopWords(word)) {
                continue;
            }
            words.add(word);
        }
        return words;
    }

    /**
     * Demo entry point: prints the sample sentence segmented with and without
     * stop-word removal, terms separated by {@code "/"}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String sample = "大家早上好,京东发优惠券了啊!";
        getWords(sample).forEach(word -> System.out.print(word + "/"));
        System.out.println();
        getWordsRemovedStopWords(sample).forEach(word -> System.out.print(word + "/"));
    }
}