-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding code that processes data into the right format for SGF (#25)
- Loading branch information
1 parent
5454618
commit 6b9f8cc
Showing
1 changed file
with
65 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
## prepare_data.py | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import json | ||
import argparse | ||
import sys | ||
import os | ||
|
||
|
||
def handle_cmdline_args(argv=None):
    """Parse the command-line options of the SGF data-preparation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            original behaviour of this function.

    Returns:
        argparse.Namespace with the attributes ``input_filename`` and
        ``output_filename`` (both without file extension) and
        ``output_dir``.
    """
    parser = argparse.ArgumentParser(
        description='Read input files and output file for run of sgf.')

    parser.add_argument("--input-filename", type=str, default='hospital_ae_data_deidentify',
                        help="Input data filename (no extension)")
    parser.add_argument("--output-filename", type=str,
                        default='hospital_ae_data_deidentify_numerical_categories',
                        help="Output data filename (no extension)")
    # The default is evaluated on every call, so it reflects the working
    # directory at the time the arguments are parsed.
    parser.add_argument("--output-dir", type=str, default=os.getcwd(), help="Output directory")

    return parser.parse_args(argv)
|
||
def make_column_to_numerical_category(data, column):
    """Encode one DataFrame column as integer category codes.

    The input DataFrame is not modified; the caller decides whether to
    write the returned codes back.

    Args:
        data: pandas DataFrame that contains ``column``.
        column: Label of the column to encode.

    Returns:
        A ``(codes, mapping)`` tuple. ``codes`` is a Series of integer
        category codes aligned with ``data``'s index; pandas assigns ``-1``
        to missing values, and that code has no entry in the mapping.
        ``mapping`` is ``{column: [{code: category, ...}]}``.
    """
    # Convert once and reuse the result for both the codes and the lookup
    # table instead of running the categorical conversion twice.
    categorical = data[column].astype("category")
    codes = categorical.cat.codes

    # The single-element list around the code -> category dict is kept so the
    # returned structure is unchanged for existing callers.
    category_map = {column: [dict(enumerate(categorical.cat.categories))]}
    return codes, category_map
|
||
|
||
|
||
def main():
    """Convert the non-integer columns of the input CSV to category codes.

    Reads ``<input-filename>.csv``, replaces every column whose dtype is not
    an integer dtype with its integer category codes, then writes the
    converted table to ``<output-filename>.csv`` and the code-to-category
    maps to ``<output-filename>.json``, both inside the output directory.
    """
    args = handle_cmdline_args()

    # read original data
    data = pd.read_csv(args.input_filename + ".csv")

    dict_cats = []

    for column in data.columns:
        # Columns that already hold integers are left untouched; every other
        # dtype is re-encoded as category codes.
        if pd.api.types.is_integer_dtype(data[column]):
            continue

        # rewrite the column on the dataframe, and get category-number map
        data[column], data_dict = make_column_to_numerical_category(data, column)
        dict_cats.append(data_dict)

    # Create the output directory if needed so the writes below do not fail
    # on a fresh path; an empty string means "current directory".
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    # Both output files share the same base path and differ only by extension.
    output_base = os.path.join(args.output_dir, args.output_filename)

    # save processed data
    data_file = output_base + ".csv"
    data.to_csv(data_file, index=False)
    print('deidentified dataset written out to: ' + data_file)

    # save category-number map
    json_dict_file = output_base + ".json"
    with open(json_dict_file, "w") as jsondict:
        json.dump(dict_cats, jsondict, sort_keys=True)

    print('category map written out to: ' + json_dict_file)
|
||
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()