/
03_rhymes.py
122 lines (91 loc) · 4.02 KB
/
03_rhymes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import json
import re
import os
import requests
'''
This script performs rhyme detection.
Poems are parsed from the JSON files stored in the 'json' directory,
enriched with rhyme annotations and dumped back to the same JSON files.
'''
def remove_existing_rhymes(poem):
    '''
    Reset the rhyme annotations of every line of the poem.

    Each entry of poem['body'] gets a fresh empty list under the keys
    'rhyme', 'rhyme_identity', 'rhyme_v_match' and 'rhyme_grammatical',
    overwriting whatever a previous tagging stored there.
    The poem is modified in place and also returned.
    '''
    rhyme_keys = (
        'rhyme',
        'rhyme_identity',
        'rhyme_v_match',
        'rhyme_grammatical',
    )
    for entry in poem['body']:
        for key in rhyme_keys:
            # A new list per key and per line, so no list is shared
            entry[key] = []
    return poem
def line_final_vowel(line):
    '''
    Return the last vowel of the line's final word.

    The vowel is taken from the 'ipa_espeak' transcription of the last
    token and passed through similar_vowels() before being returned.
    Raises IndexError if that transcription holds none of the listed vowels.
    '''
    vowels = 'oeɛøuyiɒaɑɒø'
    last_token = line['tokens'][-1]
    # Keep only the vowel symbols of the transcription, in their order
    vowels_only = ''.join(
        symbol for symbol in last_token['ipa_espeak'] if symbol in vowels
    )
    return similar_vowels(vowels_only[-1])
def similar_vowels(vowel):
    '''
    Collapse similar vowels into one representative symbol.

    'ɛ' is rewritten to 'e'; 'ɑ' and 'ɒ' are rewritten to 'a'.
    Every other character is returned unchanged.
    '''
    merged = str.maketrans({'ɛ': 'e', 'ɑ': 'a', 'ɒ': 'a'})
    return vowel.translate(merged)
def line_final_morphemes(line):
    '''
    Extract the final morpheme tag of the line's last word.

    Reads the 'morph' string of the last token, in which tags are
    written in square brackets (e.g. '[A][B]').

    Returns a tuple (tag, n):
      tag - content of the bracketed tag that ends the string, without
            brackets; if the string does not end in a non-empty bracketed
            tag, the whole string with all brackets removed
      n   - number of '[' characters in the 'morph' string
    '''
    # Morphological analysis of the last word
    # NOTE(review): an earlier comment mentioned "multiple interpretations
    # => list", but the value is handled as a single string - confirm
    morph = line['tokens'][-1]['morph']
    # Count number of bracketed tags
    n = morph.count('[')
    # Greedy '.*' leaves only the last bracketed tag of the string
    morph = re.sub(r'^.*(\[[^\]]+\])$', r'\1', morph)
    # Raw string: '\[' and '\]' are invalid escapes in a plain literal
    return re.sub(r'[\[\]]', '', morph), n
if __name__ == '__main__':
    # Iterate over the entries of the 'json' directory in sorted name order
    for f in sorted(os.listdir('json')):
        # Print current file name
        print('\n\t', f)
        path = os.path.join('json', f)

        # Parse JSON data (explicit UTF-8, so that reading does not depend
        # on the locale's default encoding)
        with open(path, encoding='utf-8') as file:
            poem = json.load(file)

        # Remove rhymes from previous taggings /
        # create an empty 'rhyme' list in each line
        poem = remove_existing_rhymes(poem)
        body = poem['body']

        # Extract the line-final vowel and final morpheme once per line
        # instead of re-extracting them for every compared pair of lines
        finals = [
            (line_final_vowel(line), line_final_morphemes(line)[0])
            for line in body
        ]

        # Iterate over lines of poem (i-index)
        for i, line_i in enumerate(body):
            vowel_i, morph_i = finals[i]

            # Iterate over lines that follow the one in question (j-index)
            for j in range(i + 1, len(body)):
                line_j = body[j]

                # Stop the loop if j-index is outside the i-line's stanza
                if line_i['stanza'] != line_j['stanza']:
                    break

                vowel_j, morph_j = finals[j]
                vowel_match = vowel_i == vowel_j
                # A shared 'Nom' tag is not counted as a grammatical match
                grammatical = morph_i == morph_j and morph_i != 'Nom'

                # If neither line-final vowels match, nor morphemes match
                # => continue (it's not a rhyme)
                if not (vowel_match or grammatical):
                    continue

                # Store rhyme and its characteristics (symmetrically)
                line_i['rhyme'].append(j)
                line_j['rhyme'].append(i)
                if vowel_match:
                    line_i['rhyme_v_match'].append(j)
                    line_j['rhyme_v_match'].append(i)
                # Same final word (case-insensitive) => identity rhyme
                word_i = line_i['tokens'][-1]['token'].lower()
                word_j = line_j['tokens'][-1]['token'].lower()
                if word_i == word_j:
                    line_i['rhyme_identity'].append(j)
                    line_j['rhyme_identity'].append(i)
                if grammatical:
                    line_i['rhyme_grammatical'].append(j)
                    line_j['rhyme_grammatical'].append(i)

        # Store poem back to the same JSON file
        with open(path, 'w', encoding='utf-8') as outfile:
            json.dump(poem, outfile, indent=2)