-
Notifications
You must be signed in to change notification settings - Fork 0
/
indextools.py
77 lines (59 loc) · 2.02 KB
/
indextools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Adapt an index generated by XeLaTeX so that words containing diacritics are sorted correctly.
The way the *dx files are generated varies between different versions of TeX Live.
"""
import re, sys
try:
from langsci.delatex import dediacriticize
except ImportError:
from delatex import dediacriticize
try:
from langsci.asciify import asciify
except ImportError:
from asciify import asciify
# Each entry of a LaTeX index file has the shape "<sort key>@<display text>":
# the text before the "@" is used for sorting, the text after it is what gets
# displayed. The pattern captures the sort key of an \indexentry line, i.e.
# the shortest run of characters up to the first "@".
p = re.compile(
    r"\\indexentry \{(.*?)@"
)
def processline(s):
    r"""Return the index line *s* with its sort key conformed to the index requirements.

    An index line looks like ``\indexentry {<sort key>@<display text>...``.
    The sort key is first passed through ``dediacriticize`` (removes LaTeX
    diacritics like ``{\'{e}}``) and then through ``asciify`` (translates
    Unicode to ASCII). If that changes the key, the change is reported on
    stdout and every occurrence of ``<old key>@`` in the line is replaced by
    ``<new key>@``.

    Blank lines are returned unchanged. Lines that do not start with an
    ``\indexentry {...@`` prefix are reported on stdout and returned
    unchanged.

    Args:
      s (str): one line of the index file

    Returns:
      str: the line with the converted sort key, or *s* itself if nothing
      had to be changed

    Example (intended conversion of a sort key, per the helpers' purpose;
    the exact result depends on ``dediacriticize`` and ``asciify``):
      ``\v{C}{\'{e}}plo, Slavomír``  ->  ``Ceplo, Slavomir``
    """
    if s.strip() == '':
        return s

    # find the substring used for sorting
    m = p.match(s)
    if m is None:
        # Nothing to convert: return right away instead of pushing an empty
        # sort key through the converters and the replacement below.
        print("%s could not be parsed" % repr(s))
        return s

    sortstring = m.group(1)
    tmpstring = asciify(dediacriticize(sortstring))
    if sortstring == tmpstring:
        return s

    print("%s => %s" % (sortstring, tmpstring))
    return s.replace("%s@" % sortstring, "%s@" % tmpstring)
def processfile(filename):
    """Fix the sort keys of an index file and write the result to a new file.

    Every line of *filename* is passed through ``processline``. The result is
    written to a file named like the input with "mod" inserted before the
    extension (``main.adx`` -> ``mainmod.adx``). The input file itself is not
    modified.

    Args:
      filename (str): the path to the index file

    Returns:
      None
    """
    import os  # local import: only needed to build the output path

    print("Reading", filename)
    with open(filename, encoding='utf-8') as indexfile:
        lines = indexfile.readlines()
    print("Found %i lines" % len(lines))

    # Split off only the final extension. Replacing every "." in the path
    # would corrupt names such as "./main.adx" or "book.v2.adx", and a name
    # without any "." would map onto the input file and overwrite it.
    root, ext = os.path.splitext(filename)
    outname = root + 'mod' + ext

    # process all lines and write them to the output file
    processedlines = [processline(line) for line in lines]
    with open(outname, 'w', encoding='utf-8') as out:
        out.writelines(processedlines)