-
Notifications
You must be signed in to change notification settings - Fork 2
/
create_pseudo_translation.py
134 lines (113 loc) · 5.8 KB
/
create_pseudo_translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import sys
import random
import pickle
import string
from tqdm import tqdm
import progressbar
import argparse
from indictrans import Transliterator
import numpy as np
from create_normalised_dict import normaliser
from tokenizers import BertWordPieceTokenizer
# Replacement strategies understood by create_pseudo_translation.
_REPLACE_STRATEGIES = ('first', 'prob', 'max', 'random', 'problin')

# Trailing marks split off a word before lookup; order matters because the
# first match wins ('...' must be tried before the single '.').
_TRAILING_PUNCTUATION = ['...', '।'] + list(string.punctuation)

# Degenerate "words" that are never looked up in the dictionaries.
_UNTRANSLATABLE = ('\n', ' ', '', ' \n')


def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(security): unpickling can execute arbitrary code; only load
    # dictionary files that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _split_trailing_punctuation(word):
    """Return ``(stem, mark)``; *mark* is the first matching trailing mark or ''."""
    for mark in _TRAILING_PUNCTUATION:
        if word.endswith(mark):
            return word[:-len(mark)], mark
    return word, ""


def _choose_translation(candidates, weights, replace):
    """Pick one entry of *candidates* according to the *replace* strategy."""
    weights = np.array(weights)
    if replace == 'prob':
        # Square-root damping flattens the distribution before sampling.
        damped = np.sqrt(weights)
        return np.random.choice(candidates, p=damped / damped.sum())
    if replace == 'problin':
        return np.random.choice(candidates, p=weights / weights.sum())
    if replace == 'first':
        return candidates[0]
    if replace == 'max':
        return candidates[np.argmax(weights)]
    if replace == 'random':
        return np.random.choice(candidates)
    raise ValueError("Unknown replace strategy: {!r}".format(replace))


def create_pseudo_translation(mono,dict_path,outfile,replace='first',transliterate=False,l1='hin',l2='pan', norm=False, normalised_dict_path=None, tok=False, tokenizer=None):
    """Write a word-by-word pseudo translation of a monolingual file.

    Every space-separated word of every line in *mono* is stripped of one
    trailing punctuation mark and looked up in the pickled dictionary. A hit
    is replaced by one of its candidate translations; a miss is either
    transliterated or copied unchanged. Punctuation and the line break are
    re-attached and the result is written to *outfile*.

    Args:
        mono: Path to the monolingual input text file (read as UTF-8).
        dict_path: Path to a pickled mapping ``word -> (candidates, weights)``
            where ``candidates`` is a sequence of translations and ``weights``
            the matching numeric scores.
        outfile: Path of the output file (written as UTF-8).
        replace: How to pick among candidates: 'first', 'max' (highest
            weight), 'random' (uniform), 'problin' (sampled proportionally to
            the weights) or 'prob' (sampled proportionally to their square
            roots). ``None`` is treated as 'first'.
        transliterate: If true, words without a dictionary entry are passed
            through ``Transliterator(source=l1, target=l2)``.
        l1: Source language code for transliteration (``None`` -> 'hin').
        l2: Target language code for transliteration (``None`` -> 'pan').
        norm: If true, a word missing from the dictionary is normalised with
            ``normaliser`` and retried against the dictionary and then the
            normalised dictionary.
        normalised_dict_path: Path to the pickled normalised dictionary;
            required when *norm* is true.
        tok: If true, each line is word-piece tokenised before lookup and
            detokenised before writing.
        tokenizer: Object whose ``encode(text, add_special_tokens=False)``
            result exposes ``.tokens``; required when *tok* is true.

    Returns:
        None. If an input file is missing a message is printed and the
        function returns without writing anything.

    Raises:
        ValueError: If *replace* is not a known strategy, or *tok* is true
            but no tokenizer was supplied.
    """
    # argparse hands over None for options that were not given on the command
    # line, which would otherwise silently override the defaults above.
    if replace is None:
        replace = 'first'
    if l1 is None:
        l1 = 'hin'
    if l2 is None:
        l2 = 'pan'
    if replace not in _REPLACE_STRATEGIES:
        raise ValueError("replace must be one of {}, got {!r}".format(_REPLACE_STRATEGIES, replace))
    if tok and tokenizer is None:
        raise ValueError("tok=True requires a tokenizer")

    if not os.path.exists(dict_path):
        print("Dictionary file is not present")
        return
    dictionary = _load_pickle(dict_path)
    normalised_dictionary = None
    if norm:
        if not normalised_dict_path or not os.path.exists(normalised_dict_path):
            print("Normalised Dictionary file is not present")
            return
        normalised_dictionary = _load_pickle(normalised_dict_path)
    if not os.path.exists(mono):
        print("Monolingual data doesn't exist")
        return

    trn = Transliterator(source=l1,target=l2,build_lookup=True) if transliterate else None

    # The whole file is read up front so the progress bar knows its maximum.
    with open(mono, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    bar = progressbar.ProgressBar(max_value=len(lines),suffix="Number of lines translated: {variables.nlines}",
                                  variables={'nlines':'-'})
    print('Read Complete')

    with open(outfile, 'w', encoding='utf-8') as w:
        for i, line in enumerate(lines):
            if tok:
                # Only strip a newline that is really there, so a final line
                # without one does not lose its last character.
                ends_with_newline = line.endswith("\n")
                if ends_with_newline:
                    line = line[:-1]
                # ' *#* ' marks the original word boundaries and ' ## ' the
                # word-piece continuations so both survive the split below.
                encoded = tokenizer.encode(line.replace(" ", " *#* "), add_special_tokens=False)
                line = " ".join(encoded.tokens).replace('##', " ## ")
                if ends_with_newline:
                    line += '\n'

            pieces = []
            for word in line.split(" "):
                nl = ""
                if word.endswith("\n"):
                    word = word[:-1]
                    nl = "\n"
                word, punct = _split_trailing_punctuation(word)

                # Lookup order: exact word, then normalised word in the main
                # dictionary, then normalised word in the normalised one.
                entry = None
                if word not in _UNTRANSLATABLE and word in dictionary:
                    entry = dictionary[word]
                elif norm:
                    normalised_word = normaliser(word)
                    if normalised_word not in _UNTRANSLATABLE:
                        if normalised_word in dictionary:
                            entry = dictionary[normalised_word]
                        elif normalised_word in normalised_dictionary:
                            entry = normalised_dictionary[normalised_word]

                if entry is not None:
                    target = _choose_translation(entry[0], entry[1], replace)
                else:
                    target = word
                    if transliterate:
                        # Keep the source word when transliteration yields nothing.
                        target = trn.transform(word) or word
                pieces.append(target + punct + nl)

            translation = " ".join(pieces)
            if tok:
                # Undo the tokenisation markers inserted above.
                translation = translation.replace(" ","").replace("##", "").replace("*#*"," ")
            w.write(translation)
            # Line i has just been written, so i + 1 lines are done.
            bar.update(i + 1, nlines=str(i + 1))
# Command-line entry point: parse the options, build the optional word-piece
# tokenizer and run the pseudo translation.
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    # The three paths are mandatory; without them the run cannot start.
    parser.add_argument('--mono',type=str,required=True,help='Path to monlingual data')
    parser.add_argument('--dict_path',type=str,required=True,help='Path to dictionary file')
    parser.add_argument('--outfile',type=str,required=True,help='Path to output file')
    # Explicit defaults mirror the function signature; otherwise argparse
    # would pass None and override them.
    parser.add_argument('--replace',type=str,default='first',help='Method to replace dictionary matches',choices=['first','prob','max','random','problin'])
    parser.add_argument('--transliterate',action='store_true',help='Transliterate to other script')
    parser.add_argument('--l1',type=str,default='hin',help='Code for language 1')
    parser.add_argument('--l2',type=str,default='pan',help='Code for language 2')
    parser.add_argument('--norm',action='store_true',help='Normalization of words')
    parser.add_argument('--norm_dict_path',type=str,help='Path to normalized dictionary file')
    parser.add_argument('--tok',action='store_true',help='Tokenization of words')
    parser.add_argument('--tok_vocab_path',type=str,help='Path to tokenization vocab file')
    args = parser.parse_args()

    # Reject option combinations that cannot work before doing any loading.
    if args.norm and not args.norm_dict_path:
        parser.error('--norm requires --norm_dict_path')
    if args.tok and not args.tok_vocab_path:
        parser.error('--tok requires --tok_vocab_path')

    if args.tok:
        tokenizer = BertWordPieceTokenizer(args.tok_vocab_path,lowercase=False,strip_accents=False)
    else:
        tokenizer = None
    create_pseudo_translation(args.mono,args.dict_path,args.outfile,args.replace,args.transliterate,args.l1,args.l2, args.norm, args.norm_dict_path, args.tok, tokenizer)