# pip install -U ckiptagger
# TensorFlow must be pinned to this specific version:
# pip install tensorflow==1.13.1
# pip install gdown
# coding=utf-8
# Download the Academia Sinica CKIP tagger data package from
# http://ckip.iis.sinica.edu.tw/data/ckiptagger/data.zip and unzip it.
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
import os
import json
#
# Suppress noisy TensorFlow warnings via an environment variable
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
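# As an alternative to the manual download noted above, ckiptagger can fetch
# and extract the data package itself (a hedged sketch, not part of the
# original script; requires gdown):
# data_utils.download_data_gdown("./")  # extracts the models into ./data/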
# Load the pre-trained CKIP models (all three read from the same data directory)
ws = WS('C:\\Users\\Big data\\PycharmProjects\\Final_test\\data')    # word segmentation
pos = POS('C:\\Users\\Big data\\PycharmProjects\\Final_test\\data')  # part-of-speech tagging
ner = NER('C:\\Users\\Big data\\PycharmProjects\\Final_test\\data')  # named entity recognition
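# The constructors default to CPU; per the ckiptagger README the models can run
# on GPU instead (a sketch, assuming a GPU build of TensorFlow is installed):
# ws = WS('./data', disable_cuda=False)
# pos = POS('./data', disable_cuda=False)
# ner = NER('./data', disable_cuda=False)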
# Dictionary setup: example word-to-weight mapping (kept for reference)
# word_to_weight = {
#     "土地公": 1,
#     "土地婆": 1,
#     "公有": 2,
#     "": 1,
#     "來亂的": "啦",
#     "緯來體育台": 1,
# }
# dictionary = construct_dictionary(word_to_weight)
# # print(dictionary)
# # Build the dictionary from the custom word list file instead
with open(r'./mydictionary.txt', 'r', encoding='utf8') as g:
    mydictionary = g.read()
# print(mydictionary)
word_to_weight = {}
for i in mydictionary.split('\n'):
    if i:  # skip blank lines so no empty key slips into the dictionary
        word_to_weight[i.split(': ')[0]] = 1
# print(word_to_weight)
dictionary = construct_dictionary(word_to_weight)
# print(dictionary)
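# Note: the parsing above assumes mydictionary.txt holds one entry per line in
# the form "word: weight" (an assumption; the file itself is not shown here).
# Every word is given weight 1 regardless of the number after the colon.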
# Open the cleaned JSON file and segment each record
with open("C:/Users/Big data/PycharmProjects/Final_test/clean_data.json", "r+", encoding="utf8") as f:
    raw_data = json.load(f)
for item in raw_data:
    content = str(item['judge_content'])
    # Remove spaces, normalize 台 to 臺, and split on the comma delimiter
    content = content.replace(' ', '').replace(' ', '').replace('台', '臺').split(',')
    # Run CKIP word segmentation, forcing words from the custom dictionary
    word_s = ws(content,
                sentence_segmentation=True,
                segment_delimiter_set={",", "。", ":", "?", "!", ";"},
                coerce_dictionary=dictionary)
    print(word_s)
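    # To keep the results instead of only printing them, they could be attached
    # to each record (a hedged sketch; 'judge_content_ws' is a hypothetical field name):
    # item['judge_content_ws'] = word_s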
    # word_sentence_list = ws(content,
    #                         sentence_segmentation=True,  # to consider delimiters
    #                         segment_delimiter_set={",", "。", ":", "?", "!", ";"},  # this is the default set of delimiters
    #                         recommend_dictionary=dictionary,  # words in this dictionary are encouraged
    #                         coerce_dictionary=dictionary  # words in this dictionary are forced
    #                         )
    #
    # # Part-of-speech tagging
    # word_p = pos(word_s)
    # print(word_p)
    #
    # # Named entity recognition
    # word_n = ner(word_s, word_p)
    # print(word_n)
    # # Combine all three outputs in one printout
    # def combine_wandp(w_list, p_list):
    #     assert len(w_list) == len(p_list)
    #     for w, p in zip(w_list, p_list):
    #         print('{}({})'.format(w, p), end='\u3000')
    #
    #
    # for i, sentence in enumerate(content):
    #     print("'{}'".format(sentence))
    #     combine_wandp(word_s[i], word_p[i])
    #     print()
    #     for n in sorted(word_n[i]):
    #         print(n)
    #     print('\n')
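# Clean-up sketch (not part of the original script): the ckiptagger README
# suggests deleting the model objects once processing is done to release
# their TensorFlow sessions.
# del ws
# del pos
# del ner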