Skip to content

Commit

Permalink
adding code that processes data into the right format for SGF (#25)
Browse files Browse the repository at this point in the history
  • Loading branch information
crangelsmith committed Mar 5, 2020
1 parent 5454618 commit 6b9f8cc
Showing 1 changed file with 65 additions and 0 deletions.
65 changes: 65 additions & 0 deletions generators/odi-nhs-ae/transform_data_to_numcats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
## transform_data_to_numcats.py

import numpy as np
import pandas as pd
import json
import argparse
import sys
import os


def handle_cmdline_args(argv=None):
    """Parse the command-line options of the category-conversion script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process command line, which is
            what the script did before this parameter existed.

    Returns:
        argparse.Namespace with the attributes ``input_filename`` and
        ``output_filename`` (both without extension) and ``output_dir``.
    """
    parser = argparse.ArgumentParser(
        description='Read input files and output file for run of sgf.')

    parser.add_argument("--input-filename", type=str, default='hospital_ae_data_deidentify',
                        help="Input data filename (no extension)")
    parser.add_argument("--output-filename", type=str, default='hospital_ae_data_deidentify_numerical_categories',
                        help="Output data filename (no extension)")
    # The default is evaluated when the parser is built, i.e. the working
    # directory at call time.
    parser.add_argument("--output-dir", type=str, default=os.getcwd(), help="Output directory")

    return parser.parse_args(argv)

def make_column_to_numerical_category(data, column):
    """Encode one dataframe column as integer category codes.

    Args:
        data: pandas DataFrame holding the column.
        column: Label of the column to encode.

    Returns:
        A ``(codes, mapping)`` tuple. ``codes`` is the column's pandas
        category codes (pandas uses -1 for missing values, which therefore
        have no entry in the map). ``mapping`` is
        ``{column: [{code: category, ...}]}``; the single-element list
        wrapper is kept because it is the shape written to the JSON map.
    """
    # Convert once and reuse for both the codes and the categories.
    categorical = data[column].astype("category")

    # Categories of non-object dtypes come back as NumPy scalars; some of
    # them (e.g. numpy.bool_) cannot be serialised by json, so turn them into
    # the equivalent native Python values.
    labels = [
        value.item() if isinstance(value, np.generic) else value
        for value in categorical.cat.categories
    ]

    dict_cat = {column: [dict(enumerate(labels))]}
    return categorical.cat.codes, dict_cat



def main():
    """Convert every non-integer CSV column to numerical category codes.

    Reads ``<input-filename>.csv``, replaces each column whose dtype is not
    an integer dtype with its category codes, and writes two files into the
    output directory: ``<output-filename>.csv`` with the converted data and
    ``<output-filename>.json`` with the code -> category map of every
    converted column.
    """
    args = handle_cmdline_args()

    # read original data
    data = pd.read_csv(args.input_filename + ".csv")

    dict_cats = []

    for column in data.columns:

        # Integer columns are kept untouched; every other dtype is encoded.
        # NOTE(review): this also encodes float columns -- confirm intended.
        if pd.api.types.is_integer_dtype(data[column]):
            continue

        # rewrite the column on the dataframe, and get category-number map
        data[column], data_dict = make_column_to_numerical_category(data, column)
        dict_cats.append(data_dict)

    # Create the output directory if needed so the writes below do not fail
    # on a missing path (an empty string means "current directory").
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    # both output files share the same base path and differ only in extension
    output_base = os.path.join(args.output_dir, args.output_filename)

    # save processed data
    data_file = output_base + ".csv"
    data.to_csv(data_file, index=False)
    print('deidentified dataset written out to: ' + data_file)

    # save category-number map
    json_dict_file = output_base + ".json"
    with open(json_dict_file, "w") as jsondict:
        json.dump(dict_cats, jsondict, sort_keys=True)

    print('category map written out to: ' + json_dict_file)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 6b9f8cc

Please sign in to comment.