automate_logistic_regression_baseline_experiments.py
# Automate all the logistic regression baseline experiments for the different event types and their subtasks.
# For each event type we first run the data preprocessing and
# then train the logistic regression classifier for each subtask that has enough positive examples (at least MIN_POS_SAMPLES_THRESHOLD).
# We save each classifier's model, config and results in a separate directory.
# Finally, when all the runs have finished, we aggregate the results and save the final metrics in a TSV file.
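# Usage (assumed: the script is run from the repository root so that the relative
# ./data, ./results and model/ paths below resolve):
#   python automate_logistic_regression_baseline_experiments.py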
from model.utils import log_list, make_dir_if_not_exists, load_from_pickle, load_from_json, MIN_POS_SAMPLES_THRESHOLD
import os
import json
import time
import csv
import subprocess
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
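# Mapping from event type (task name) to (raw annotated JSONL file, preprocessed pickle
# written by model/data_preprocessing.py).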
task_type_to_datapath_dict = {
"tested_positive": ("./data/positive-add_text.jsonl", "./data/test_positive.pkl"),
"tested_negative": ("./data/negative-add_text.jsonl", "./data/test_negative.pkl"),
"can_not_test": ("./data/can_not_test-add_text.jsonl", "./data/can_not_test.pkl"),
"death": ("./data/death-add_text.jsonl", "./data/death.pkl"),
"cure": ("./data/cure_and_prevention-add_text.jsonl", "./data/cure_and_prevention.pkl"),
}
# Set these flags to True to force re-running the preprocessing / classifier steps
# even when their cached outputs already exist.
REDO_DATA_FLAG = False
REDO_FLAG = False
# We will save every task's and subtask's results and model configs in this dictionary
all_task_results_and_model_configs = dict()
# We will save the list of question_tags (i.e. subtasks) for each event (i.e. task) in this dict
all_task_question_tags = dict()
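# For every event type: preprocess its data (if needed), then train and evaluate one
# logistic regression baseline per qualifying subtask.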
for taskname, (data_in_file, processed_out_file) in task_type_to_datapath_dict.items():
    if not os.path.exists(processed_out_file) or REDO_DATA_FLAG:
        data_preprocessing_cmd = f"python model/data_preprocessing.py -d {data_in_file} -s {processed_out_file}"
        logging.info(data_preprocessing_cmd)
        os.system(data_preprocessing_cmd)
    else:
        logging.info(f"Preprocessed data for task {taskname} already exists at {processed_out_file}")

    # Read the preprocessed instances, tag statistics and question keys/tags
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(processed_out_file)
    # We will save the classifier results and model config for each subtask in this dictionary
    all_subtasks_results_and_model_configs = dict()
    # We will store the list of subtasks for which we actually train a classifier
    tested_tasks = list()
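    # NOTE (assumption): tag_statistics[0] maps each question_tag to its label counts, with
    # index 1 holding the number of positive examples; subtasks with fewer than
    # MIN_POS_SAMPLES_THRESHOLD positives are skipped below.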
    for i, (question_tag, question_key) in enumerate(question_keys_and_tags):
        current_question_tag_statistics = tag_statistics[0][question_tag]
        if len(current_question_tag_statistics) > 1 and current_question_tag_statistics[1] >= MIN_POS_SAMPLES_THRESHOLD:
            tested_tasks.append(question_tag)
            logging.info(f"Training LR model on {processed_out_file} for subtask {question_tag}")
            print(f"{i}: {question_tag}\t{current_question_tag_statistics}")
            output_dir = os.path.join("results", "lr_baseline", taskname + "_" + question_tag)
            make_dir_if_not_exists(output_dir)
            results_file = os.path.join(output_dir, "results.json")
            model_config_file = os.path.join(output_dir, "model_config.json")
            if not os.path.exists(results_file) or REDO_FLAG:
                # Run the LR baseline train and test only if the results file doesn't exist yet
                lr_cmd = f"python model/logistic_regression_baseline.py -d {processed_out_file} -t {taskname + '_' + question_tag} -st {question_tag} -o {output_dir}"
                logging.info(f"Running: {lr_cmd}")
                try:
                    retcode = subprocess.call(lr_cmd, shell=True)
                    if retcode != 0:
                        logging.warning(f"Command exited with non-zero return code {retcode}: {lr_cmd}")
                except KeyboardInterrupt:
                    exit()
            # Read the results and model config from the json files written by the baseline script
            results = load_from_json(results_file)
            model_config = load_from_json(model_config_file)
            all_subtasks_results_and_model_configs[question_tag] = results, model_config
    all_task_results_and_model_configs[taskname] = all_subtasks_results_and_model_configs
    all_task_question_tags[taskname] = tested_tasks
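# NOTE (assumption): results.json and model_config.json, as written by
# model/logistic_regression_baseline.py, are assumed to contain the keys read below
# ("Classification Report", "CM", the SQuAD_* metrics, "best_dev_threshold",
# "dev_t_F1_P_Rs", the overall N/F1/P/R/TP/FP/FN, and the model/features/data-size entries).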
# Read the results for each task and save them in a TSV file
results_tsv_save_file = os.path.join("results", "all_experiments_lr_baseline_results.tsv")
with open(results_tsv_save_file, "w", newline="") as tsv_out:
    writer = csv.writer(tsv_out, delimiter='\t')
    header = ["Event", "Sub-task", "Train Data (size, pos., neg.)", "Dev Data (size, pos., neg.)", "Test Data (size, pos., neg.)", "model name", "no. of features", "accuracy", "CM", "pos. F1", "SQuAD_total", "SQuAD_EM", "SQuAD_F1", "SQuAD_Pos. EM_F1_total", "SQuAD_Pos. EM", "SQuAD_Pos. F1", "dev_threshold", "dev_N", "dev_F1", "dev_P", "dev_R", "dev_TP", "dev_FP", "dev_FN", "N", "F1", "P", "R", "TP", "FP", "FN"]
    writer.writerow(header)
    for taskname, question_tags in all_task_question_tags.items():
        current_task_results_and_model_configs = all_task_results_and_model_configs[taskname]
        for question_tag in question_tags:
            results, model_config = current_task_results_and_model_configs[question_tag]
            # Extract classification results
            classification_report = results["Classification Report"]
            positive_f1_classification_report = classification_report['1']['f1-score']
            accuracy = classification_report['accuracy']
            CM = results["CM"]
            # SQuAD results
            total_EM = results["SQuAD_EM"]
            total_F1 = results["SQuAD_F1"]
            total_tweets = results["SQuAD_total"]
            pos_EM = results["SQuAD_Pos. EM"]
            pos_F1 = results["SQuAD_Pos. F1"]
            total_pos_tweets = results["SQuAD_Pos. EM_F1_total"]
            # Best threshold and dev F1
            best_dev_threshold = results["best_dev_threshold"]
            best_dev_F1 = results["best_dev_F1"]
            dev_t_F1_P_Rs = results["dev_t_F1_P_Rs"]
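            # NOTE (assumption): the dev thresholds in dev_t_F1_P_Rs are assumed to run from 0.1 to 1.0
            # in steps of 0.1, so the entry for threshold t sits at index int(t * 10) - 1.
            # (round(best_dev_threshold * 10) - 1 would be more robust to floating-point error, e.g. for t = 0.3.)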
            best_dev_threshold_index = int(best_dev_threshold * 10) - 1
            # Each entry in dev_t_F1_P_Rs has the format: t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN
            t, dev_F1, dev_P, dev_R, dev_N, dev_TP, dev_FP, dev_FN = dev_t_F1_P_Rs[best_dev_threshold_index]
            # Alan's metrics
            F1 = results["F1"]
            P = results["P"]
            R = results["R"]
            TP = results["TP"]
            FP = results["FP"]
            FN = results["FN"]
            N = results["N"]
            # Extract model config
            model_name = model_config["model"]
            n_features = model_config["features"]["size"]
            train_data = (model_config["train_data"]["size"], model_config["train_data"]["pos"], model_config["train_data"]["neg"])
            dev_data = (model_config["dev_data"]["size"], model_config["dev_data"]["pos"], model_config["dev_data"]["neg"])
            test_data = (model_config["test_data"]["size"], model_config["test_data"]["pos"], model_config["test_data"]["neg"])
            row = [taskname, question_tag, train_data, dev_data, test_data, model_name, n_features, accuracy, CM, positive_f1_classification_report, total_tweets, total_EM, total_F1, total_pos_tweets, pos_EM, pos_F1, best_dev_threshold, dev_N, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN, N, F1, P, R, TP, FP, FN]
            writer.writerow(row)