/
model_LR_trainer.py
83 lines (64 loc) · 2.86 KB
/
model_LR_trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# 直接讀取理由清單檔案
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib # jbolib 儲存模型的模組
import csv
import json
clean_data_path = "C:/Users/Big data/PycharmProjects/Final_test/clean_data.json"
reason_path = "C:/Users/Big data/PycharmProjects/Final_test/reason.csv"
win_lose_path = "C:/Users/Big data/PycharmProjects/Final_test/win_lose.csv"
model_data_path = "C:/Users/Big data/PycharmProjects/Final_test/model_data.json"
def find_reasons(r_path):
reasons = []
with open(r_path, newline='') as c1:
reason_list = csv.reader(c1)
for label in reason_list:
tag_type = label[1]
tag = label[2]
reasons.append(tag_type)
reasons.append(tag)
reasons = sorted(set(reasons), key=reasons.index)
return reasons
# Preprocessing用來:
# 去除判決內容,清單們及高相關性項目
# 類別項目 "judge_court", "judge_place", "lawyer" 用one-hot encoding重新編碼
# 把處理過的DF連起來,遺漏值補0
def Preprocessing(df):
df1 = df.drop(['judge_content', 'judgment_list', 'law_list', 'keywords', 'tag_type'], axis=1)
df2 = pd.DataFrame(pd.get_dummies(df1[["court", "judge_court", "city", "lawyer"]]))
df = pd.concat([df1, df2], axis=1).fillna(0)
return df
# 用masks挑出相同離婚理由的判決,挑選欄位來建模
def masks(df, factor):
df = df[df[str(factor)] == 1] # 篩選資料
x = df[[str(factor), 'foreigner', 'proof', 'noshow', "lawyer_both", "lawyer_plaintiff", "lawyer_defendant",
"lawyer_none", "judge_court_地方法院", "judge_court_高等法院", "judge_court_最高法院", "judge_court_少年及家事法院"]] # 挑選欄位
y = df[['win_lose']]
print(x.columns)
return x, y
# 建模 Logistic Regression
def lr_model(x, y):
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=20200506) # random_state 種子值
lr = LogisticRegression()
lr.fit(x_train, y_train)
accuracy_train = round(lr.score(x_train, y_train), 4)
accuracy_test = round(lr.score(x_test, y_test), 4)
return lr, accuracy_train, accuracy_test
def main():
file = pd.read_json(clean_data_path)
df = Preprocessing(file)
for i in find_reasons(reason_path):
try:
x, y = masks(df, i)
lr, accuracy_train, accuracy_test = lr_model(x, y)
joblib.dump(lr, 'save/s_lr_%s.pkl' % i)
print(i)
print('accuracy_train: ', accuracy_train, )
print('accuracy_test: ', accuracy_test)
except Exception:
print(i, Exception)
if __name__ == '__main__':
main()