/
dict_create_edit_ratio.py
103 lines (75 loc) · 2.94 KB
/
dict_create_edit_ratio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build a token-level edit-distance-ratio dictionary between antimicrobial
medication names (ABLIST_NEW.csv) and free-text item names (is_ab.csv),
and persist it as JSON.

Created on Fri Oct 12 12:48:35 2018
@author: bhur
"""
#import dependencies
import json
import pandas as pd
import numpy as np
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
# \w+ tokenizer: splits on any non-word character, discarding punctuation
tokenizer = RegexpTokenizer(r'\w+')
#read in files
#must have ABLIST_NEW.csv in working folder with columns 'Medication' and 'Antimicrobial'
#where Antimicrobial is ingredient and Medication is medication name
ABLIST_NEW = pd.read_csv('ABLIST_NEW.csv', skip_blank_lines=True).fillna('')
ITEM_LIST = pd.read_csv('is_ab.csv')
# Normalize ingredient names: lowercase, replace non-word chars with spaces,
# then split into lists of word tokens.
ABLIST_NEW['Antimicrobial'] = ABLIST_NEW['Antimicrobial'].str.lower().replace({r'\W' : " "}, regex=True)
ABLIST_NEW['Antimicrobial'] = ABLIST_NEW['Antimicrobial'].apply(tokenizer.tokenize)
# Same normalization for the medication (brand) names.
ABLIST_NEW['Medication'] = ABLIST_NEW['Medication'].str.lower()
ABLIST_NEW['Medication'] = ABLIST_NEW['Medication'].apply(tokenizer.tokenize)
#
##switch column name to match item
#ITEM_LIST['Item Name'] = ITEM_LIST['item']
# NOTE(review): this assumes is_ab.csv has exactly one column — confirm.
ITEM_LIST.columns = ['item']
#create ratio of edit distance between two strings / length of string1
#create ratio of edit distance between two strings / length of string1
def editRatio(string1, string2):
    """Return the Levenshtein edit distance between string1 and string2,
    divided by len(string1).

    Parameters
    ----------
    string1, string2 : str
        Strings (or any indexable sequences) to compare.

    Returns
    -------
    float
        edit_distance(string1, string2) / len(string1).
        When string1 is empty the distance equals len(string2); we return
        float(len(string2)) instead of dividing by zero (the original code
        raised ZeroDivisionError here).
    """
    l1 = len(string1)
    l2 = len(string2)
    if l1 == 0:
        # Edit distance from "" to string2 is l2 insertions; avoid dividing
        # by zero. Treat each insertion as a whole-length change.
        return float(l2)
    # Standard dynamic-programming edit distance, keeping only two rows
    # (previous and current) instead of the full (l1+1) x (l2+1) table.
    prev = list(range(l2 + 1))          # distances from "" prefix of string1
    for i in range(1, l1 + 1):
        curr = [i] + [0] * l2           # deleting i chars reaches ""
        for j in range(1, l2 + 1):
            if string1[i - 1] == string2[j - 1]:
                curr[j] = prev[j - 1]   # characters match: no edit needed
            else:
                curr[j] = 1 + min(curr[j - 1],   # insert
                                  prev[j],       # remove
                                  prev[j - 1])   # replace
        prev = curr
    #return ratio of edit distance divided by length of string1
    return prev[l2] / l1
# Mapping: medication-name token -> {item token -> edit ratio}.
d = {}
#make item strings lowercased and tokenized (mirrors ABLIST_NEW preprocessing)
ITEM_LIST['item'] = ITEM_LIST['item'].str.lower()
ITEM_LIST['item'] = ITEM_LIST['item'].apply(tokenizer.tokenize)
# Deduplicate tokens up front instead of re-testing membership for every
# repeated token in the original quadruple-nested loop; the resulting
# dictionary is identical, just built without redundant passes.
item_tokens = {tok for line in ITEM_LIST['item'] for tok in line}
# NOTE(review): only Medication tokens are used as outer keys; the
# Antimicrobial column was unpacked but never read in the original loop.
med_tokens = {tok for med in ABLIST_NEW.Medication for tok in med}
#create edit distance dictionary between all medication and item tokens
if item_tokens:  # preserve original behavior: no item tokens -> empty dict
    for ab_token in med_tokens:
        d[ab_token] = {item_token: editRatio(ab_token, item_token)
                       for item_token in item_tokens}
#store dictionary as JSON
# Fix: open with mode 'w' (overwrite), not 'a' — appending a second JSON
# document to an existing file produces an unparseable file on re-runs.
# 'with' guarantees the handle is closed even on error.
with open('new_vic_dict.json', 'w') as fp:
    json.dump(d, fp)
#load the json file into dictionary
# NOTE(review): this reads 'vic_dict.json', not the 'new_vic_dict.json'
# written just above — confirm the mismatch is intentional.
with open('vic_dict.json') as json1_file:
    d = json.loads(json1_file.read())