multiproc_runner.py
from itertools import product
from multiprocessing import Pool, Manager
import os
import pickle
from models.non_meta import RandomForest, KNN, SVM, DecisionTree, LogisticRegression, GradientBoosting
from utils import run_non_meta_model
from model_params import common_params, knn_params, svm_params, randomforest_params, logisticregression_params, \
    decisiontree_params, gradientboosting_params, meta_train, meta_test


# A worker function that takes one config tuple and applies it to the corresponding model
def run_models(config):
    """A simple run_model function for multiprocessing.

    Args:
        config: A tuple in the form of (result_data, base_model_name, model_params_name, category).
    """
    # Unpack the configuration
    result_data, base_model_name, model_params_name, category = config

    # Look up the model class and its parameter dictionary by name in the module's global namespace
    base_model = globals()[base_model_name]
    model_params = globals()[model_params_name]

    # Run the model with the given configuration
    run_non_meta_model(
        base_model,
        common_params,
        model_params,
        category,
        result_dict=result_data
    )
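
# Example (hypothetical values, for illustration only): a single config tuple as
# run_models expects it, pairing the shared results dict with a model name, a
# parameter-dict name, and a category label:
#   config = (result_data, 'KNN', 'knn_params', 'category_3')
#   run_models(config)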


def main():
    """Main driver function to run the multi-process experiments."""
    # Set up the results directory
    results_folder = './results'
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
        print('No folder for results storage found')
        print('Made a folder to store results at')
        print(results_folder)
    else:
        print('Found existing folder. All results will be stored at')
        print(results_folder)

    # Listing the categories of experiments we are running
    categories = ['category_3', 'category_4_i', 'category_4_ii', 'category_5_i', 'category_5_ii']

    # Model class names paired with the names of their parameter dictionaries
    # (both are resolved via globals() inside run_models)
    models_and_params = [
        ('KNN', 'knn_params'),
        ('SVM', 'svm_params'),
        ('DecisionTree', 'decisiontree_params'),
        ('RandomForest', 'randomforest_params'),
        ('LogisticRegression', 'logisticregression_params'),
        ('GradientBoosting', 'gradientboosting_params')
    ]
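
    # Shared results container for the worker processes. This is an assumption:
    # the original does not show where result_data is created, but Manager is
    # imported above, so a managed dict is a plausible sketch that lets each
    # worker write its results back to the parent process.
    result_data = Manager().dict()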

    # List of arguments to run the desired models: one config per (model, category) pair
    configs = [
        (result_data, model_and_params[0], model_and_params[1], category)
        for model_and_params, category in product(models_and_params, categories)
    ]
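    # With 6 models and 5 categories, product() yields 6 * 5 = 30 configs in total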

    # Ideally this should be at most c-1 or c-2, where c is the number of logical
    # cores on the machine, so the parent process and the OS keep a core free
    num_processes = 3
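    # A possible alternative (not in the original): derive the worker count from
    # the machine instead of hard-coding it, e.g.
    #   num_processes = max(1, (os.cpu_count() or 2) - 1)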

    # Submit all jobs by mapping the worker function over the provided configs
    with Pool(num_processes) as pool:
        pool.map(run_models, configs)

    if not common_params['fine_tuning']:
        # Append the data to cv_stats or overwrite the current results
        overwrite = common_params['cv_stats_overwrite']
        cv_stats_dst = common_params['stats_path']
        if os.path.exists(cv_stats_dst) and overwrite:
            print('Overwriting the current cv_stats.pkl')
            os.remove(cv_stats_dst)
        with open(cv_stats_dst, "wb") as f:
            # Convert the managed proxy to a plain dict before pickling
            # (needed under the Manager().dict() assumption above)
            pickle.dump(dict(result_data), f)
    else:
        with open('./results/ft_logs.pkl', "wb") as f:
            pickle.dump(dict(result_data), f)


if __name__ == '__main__':
    main()
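
# Usage sketch (a note added here, not from the original): run the script directly,
# e.g. `python multiproc_runner.py`, then load the aggregated results from the
# pickle it wrote:
#   with open(common_params['stats_path'], 'rb') as f:
#       cv_stats = pickle.load(f)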