/
01_build_corpus.py
198 lines (156 loc) · 5.8 KB
/
01_build_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import os
import re
import epitran
from subprocess import check_output
import quntoken
import json
'''
This script iterates over txt files stored in [src] directory and
(1) loads the texts and their metadata,
(2) performs tokenization and phonetic transcription
(3) stores the results in json into [json] directory.
The structure of output is as follows:
{
'file': [string] filename of the original txt file
'metadata': {
'author': [string] author of the poem,
'title': [string] title of the poem,
'year': [string] year when poem published,
'imm': [list] indication of melodic model,
},
'body': [{
'text': [string] text_of_line,
'punct_init': [string] potential line-initial punctuation,
'stanza': [int] id_of_stanza,
'tokens': [{
'token': [string] word-token,
'ipa_espeak': [string] ipa produced by espeak,
'ipa_epitran': [string] ipa produced by epitran,
'punct': [string] potential punctuation following the token,
} ... ],
} ... ],
}
'''
# Matches a token that consists solely of punctuation characters.
# Written as a raw string so the backslashes reach the regex engine without
# triggering Python's "invalid escape sequence" warning.
# NOTE: because of the `*` quantifier the empty string matches as well.
_PUNCT_RE = re.compile(r'^[–.()„”’!\[\]:?;\-,]*$')


def tokenize(text):
    '''
    Tokenize the text by means of QUNTOKEN.

    :: text = text to tokenize (str)

    Returns a tuple (tokens, punct_init) where
      tokens     = list of {'token': word-token,
                            'punct': punctuation following the word or None}
      punct_init = punctuation found before the first word-token, or None
    '''
    tokens = []
    punct_init = None

    for raw in quntoken.tokenize(text):
        # Keep only the first tab-separated field of the tokenizer output
        # (drops the word-boundary information that follows the tab)
        t = raw.split('\t')[0]

        if not _PUNCT_RE.search(t):
            # Regular word-token: open a new entry
            tokens.append({'token': t, 'punct': None})
        elif not tokens:
            # Punctuation seen before any word-token => line-initial
            punct_init = t
        else:
            # Punctuation following a word-token is attached to that token.
            # NOTE(review): when several punctuation tokens follow each other
            # only the last one is kept (same for punct_init) -- confirm this
            # is intended before relying on 'punct' to be complete.
            tokens[-1]['punct'] = t

    return tokens, punct_init
# Transliterator instance shared by all calls to _ipa_epitran(); created on
# first use so that importing this module does not touch epitran at all.
_EPITRAN = None


def _ipa_epitran(text):
    '''
    Transcribe text to IPA by means of EPITRAN.

    Newlines in the transcription are turned into spaces and runs of spaces
    are collapsed into a single one.

    :: text = text to transcribe (str)
    '''
    global _EPITRAN
    # This function is called once per token; build the 'hun-Latn'
    # transliterator only once instead of on every call.
    if _EPITRAN is None:
        _EPITRAN = epitran.Epitran('hun-Latn')

    ipa = _EPITRAN.transliterate(text)
    ipa = ipa.replace('\n', ' ')
    return re.sub(r' +', ' ', ipa)
def _ipa_espeak(text):
    '''
    Transcribe text to IPA by means of ESPEAK.

    Runs the `espeak` executable (voice 'hu-hu', quiet mode, IPA output) and
    returns its output decoded as UTF-8 with all newlines and spaces removed.

    :: text = text to transcribe (str)

    Raises subprocess.CalledProcessError if espeak exits with a non-zero
    status and FileNotFoundError if the executable is not available.
    '''
    # Strip every leading hyphen: the text is passed as a command-line
    # argument, and an argument starting with '-' would be parsed by espeak
    # as an option rather than as text to speak.
    text = re.sub(r'^-+', '', text)

    raw = check_output(['espeak', '-q', '--ipa', '-v', 'hu-hu', text])
    ipa = raw.decode('utf-8').replace('\n', ' ')
    # Unlike the epitran variant, spaces are removed entirely here
    return re.sub(r' +', '', ipa)
def process_file(path):
    '''
    Load poem from file and return a tuple (poem, metadata).

    The first four lines of the file are read as "key: value" metadata.
    The value of key 'imm' is split on commas into a list; any other empty
    value is stored as None. Remaining non-empty lines are lines of the poem;
    empty lines separate stanzas.

    poem is a list with following structure:
    [{
        'text': [string] text_of_line,
        'punct_init': [string] potential line-initial punctuation,
        'stanza': [int] id_of_stanza,
        'tokens': [{
            'token': [string] word-token,
            'ipa_espeak': [string] ipa produced by espeak,
            'ipa_epitran': [string] ipa produced by epitran,
            'punct': [string] potential punctuation following the token,
        } ... ],
    } ... ]
    metadata is a dict built from the four header lines.

    :: path = path to a file (str)

    Raises ValueError if one of the header lines contains no colon.
    '''
    # Read content of the file; the context manager makes sure the handle is
    # closed, and the encoding is explicit so that the result does not depend
    # on the locale (assumes the source files are UTF-8)
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    # Output variable where data on poem will be stored
    poem = []
    # Dict to store metadata
    metadata = {}
    # Index of the stanza
    stanza_id = 1

    for i, line in enumerate(lines):
        # Print current line index (progress indicator)
        print(i + 1, end=' ', flush=True)

        # First 4 lines hold metadata: store it and continue
        if i <= 3:
            key, sep, value = line.partition(':')
            if not sep:
                raise ValueError(
                    '{}: line {}: expected "key: value" metadata, got {!r}'
                    .format(path, i + 1, line))
            key = key.strip()
            value = value.strip()
            if key == 'imm':
                value = [x.strip() for x in value.split(',')]
            elif value == '':
                value = None
            metadata[key] = value
            continue

        # Empty line = stanza break. The index is advanced only when the
        # current stanza already holds a line, so that blank lines before the
        # poem or several blank lines in a row do not produce gaps in ids.
        if line.strip() == '':
            if poem and poem[-1]['stanza'] == stanza_id:
                stanza_id += 1
            continue

        # Tokenize the line
        tokens, punct_init = tokenize(line)

        # Add both phonetic transcriptions to every token
        for t in tokens:
            t['ipa_epitran'] = _ipa_epitran(t['token'])
            t['ipa_espeak'] = _ipa_espeak(t['token'])

        # Append data to the output
        poem.append({
            'text': line,
            'tokens': tokens,
            'punct_init': punct_init,
            'stanza': stanza_id,
        })

    return poem, metadata
if __name__ == '__main__':
    # Create output directory if it doesn't exist yet
    os.makedirs('json', exist_ok=True)

    # Iterate over TXT files
    for f in sorted(os.listdir('src')):
        # Skip anything that is not a .txt file; for such entries the output
        # name could not be derived by swapping the extension
        stem, ext = os.path.splitext(f)
        if ext != '.txt':
            continue

        # Only the extension is swapped, '.txt' elsewhere in the name is kept
        out_path = os.path.join('json', stem + '.json')

        # Skip if file already processed
        if os.path.isfile(out_path):
            continue

        # Print current file name
        print('\n\t', f)

        # Read file content, tokenize and perform phonetic transcription
        poem, metadata = process_file(os.path.join('src', f))

        # Bundle poem and metadata together with the source file name
        data = {'metadata': metadata, 'file': f, 'body': poem}

        # Store data to json file
        with open(out_path, 'w') as outfile:
            json.dump(data, outfile, indent=2)