/
demonstration.py
99 lines (71 loc) · 1.99 KB
/
demonstration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#%%
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
#%%
# set path to data
path_to_data = "./"
# read document-term matrix
dt = pd.read_csv(path_to_data + "dt.csv")
# rename first column (contains the cik code)
dt.rename(columns={"Unnamed: 0": "cik"}, inplace=True)
# extract the cik codes and remove from dataframe
cik = dt[["cik"]]
dt.drop(columns=["cik"], inplace=True)
dt
#%%
# save columns as vocabulary
vocab = dt.columns
# transform the document-term matrix into a numpy array
dt = dt.values
dt.shape
#%%
# read the covariates data for the cik codes
df_cov = pd.read_csv(path_to_data + "cik_covariates.csv")
df_cov
#%%
# merge the naics2 sector with the ciks codes
naics2 = pd.merge(cik, df_cov[["cik", "naics2"]], on="cik", how="left")
labels = naics2["naics2"].values
print(labels.shape)
#%%
# define the model
model = RandomForestClassifier(n_estimators=20, n_jobs=-1)
model
### this is a fun tutorial
#%%
# fit the model (time it)
start = time.perf_counter()
model.fit(dt, labels)
end = time.perf_counter()
print(f"Time to fit: {end - start:.2f} seconds")
#%%
# get the feature importances
importances = model.feature_importances_
# sort the feature importances
indices = importances.argsort()[::-1]
# get the top 10 features
top10 = indices[:10]
# get the top 10 feature names
top10_features = [vocab[i] for i in top10]
top10_features
#%%
# generate in-sample predictions
start = time.perf_counter()
preds = model.predict(dt)
end = time.perf_counter()
print(f"Time to predict: {end - start:.2f} seconds")
#%%
# calculate the accuracy
acc = (preds == labels).mean()
print(f"Accuracy: {acc:.2f}")
#%%
# save the model with pickle
import pickle
model_name = "model.pkl"
with open(model_name, "wb") as f:
pickle.dump(model, f)
# save the feature importances as csv with the name of the feature and its importance
importances = pd.DataFrame({"feature": vocab, "importance": model.feature_importances_})
importances.to_csv("importances.csv", index=False)
# %%