-
Notifications
You must be signed in to change notification settings - Fork 0
/
sanity.py
324 lines (278 loc) · 13 KB
/
sanity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
"""
Check files of various types for conformity to Language Science Press guidelines. Currently the following
file types are checked:
* tex files from the folder chapters/
* bib files
* png/jpg files in the folder figures/

Attributes (currently disabled: their definitions in this file are commented out):
    SPELLD: A US-English dictionary
    TKNZR: A US-English tokenizer
    LATEXTERMS: LaTeX terms which the spell checker should treat as conformant
"""
import fnmatch
import glob
import os
import re
import sys
import textwrap
import uuid
import zlib

#import enchant
#from enchant.tokenize import get_tokenizer
from PIL import Image
#SPELLD = enchant.Dict("en_US")
#TKNZR = get_tokenizer("en_US")
#LATEXTERMS = ("newpage","clearpage","textit","textbf","textsc","textwidth","tabref","figref","sectref","emph")
class SanityError:
    """A record of a potentially problematic passage.

    Attributes:
        filename (str): the base name (last '/'-separated component) of the
            file where the error is found
        linenr (int): the number of the line where the error is found
        line (str): the full line under consideration
        offendingstring (str): the string which was identified as problematic
        msg (str): information on why this string was found problematic
        pre (str): left context of the offending string
        post (str): right context of the offending string
        ID (uuid.UUID): ID to uniquely refer to a given error in HTML-DOM
        name (str): six-digit decimal code derived from msg; it serves as the
            error code and as the seed for colour coding
        color (str): a rgb color string
        bordercolor (str): a rgb color string, which is darker than color
    """

    def __init__(self, filename, linenr, line, offendingstring, msg):
        self.filename = filename.split('/')[-1]
        self.linenr = linenr
        self.line = line
        self.offendingstring = offendingstring
        self.msg = msg
        if offendingstring:
            # Split at the FIRST occurrence only, so that the right context
            # keeps the whole remainder of the line.
            self.pre, _, self.post = line.partition(offendingstring)
        else:
            # An empty separator cannot be split on; treat the whole line as
            # left context.
            self.pre, self.post = line, ''
        self.ID = uuid.uuid1()
        # The builtin hash() of a str is salted per interpreter run, which
        # would make the code differ between runs. CRC32 is stable, so the
        # same message always yields the same code.
        self.name = "%06d" % (zlib.crc32(msg.encode('utf-8')) % 1000000)
        # compute rgb colors from msg: three two-digit chunks of the code,
        # each shifted into a light range (and a darker one for the border)
        channels = [int(chunk) for chunk in textwrap.wrap(self.name, 2)[-3:]]
        self.color = "rgb({},{},{})".format(*(c + 140 for c in channels))
        self.bordercolor = "rgb({},{},{})".format(*(c + 100 for c in channels))

    def __str__(self):
        return "{linenr}: …{offendingstring}… \t{msg}".format(
            linenr=self.linenr,
            offendingstring=self.offendingstring,
            msg=self.msg,
        )
class SanityFile:
    """A file with information about potentially problematic passages.

    Subclasses supply the patterns to look for via the class attributes
    ``antipatterns`` and ``posnegpatterns``.

    Attributes:
        filename (str): the path to the file
        content (str): the content of the file
        lines ([]str): the lines of the file, with LaTeX comments removed
        errors ([]SanityError): the list of found errors
        spellerrors ([]str): the words not found in the spell dictionary
            (only set by spellcheck())
        antipatterns: 2-tuples of (regex, message); a match is an error
        posnegpatterns: 3-tuples of (regex, redeeming regex, message); a match
            of the first without a match of the second is an error
    """

    # Empty defaults so that check() also works on the base class.
    antipatterns = ()
    posnegpatterns = ()

    def __init__(self, filename):
        self.filename = filename
        self.errors = []
        try:
            # Context manager so that the handle is closed deterministically.
            with open(filename, encoding='utf-8') as handle:
                self.content = handle.read()
        except UnicodeDecodeError:
            print("file %s is not in proper Unicode encoding"%filename)
            self.content = ''
            self.errors = [SanityError(filename, 0, ' ', ' ', 'file not in Unicode, no analysis possible')]
        self.lines = self.split_(self.content)
        #self.spellerrors = []

    def split_(self, c):
        """Return the lines of string c after comments have been removed."""
        return self._removecomments(c).split('\n')

    def _removecomments(self, s):
        """strip comments from file so that errors are not marked in comments

        Everything from an unescaped % up to (not including) the end of the
        line is dropped, so the number of lines stays the same. If s cannot be
        processed by the regex engine, it is reported and returned unchanged.
        """
        try:
            # Negative lookbehind keeps escaped percent signs (\%). No
            # trailing newline is required, so a comment on the very last
            # line is stripped as well.
            return re.sub(r'(?<!\\)%.*', '', s)
        except TypeError:
            print("%s could not be regexed"%s)
            return s

    def check(self):
        """
        check the file for errors

        Every line is tested against all antipatterns and posnegpatterns and
        the hits are appended to self.errors. Line numbers are 1-based. Lines
        containing \\chk are skipped.
        """
        for linenr, line in enumerate(self.lines, start=1):
            if '\\chk' in line:  # the line is explicitly marked as being correct
                continue
            for antipattern, msg in self.antipatterns:
                match = re.search('(%s)' % antipattern, line)
                # Zero-width matches carry no offending string to report.
                if match is not None and match.group(1) != '':
                    self.errors.append(SanityError(self.filename, linenr, line, match.group(1), msg))
            for positivepattern, negativepattern, msg in self.posnegpatterns:
                posmatch = re.search('(%s)' % positivepattern, line)
                if posmatch is None:
                    continue
                # A potentially incorrect passage is found, but it could be
                # saved by the presence of additional material.
                if re.search(negativepattern, line) is None:
                    # the match required to make this line correct is not found
                    self.errors.append(SanityError(self.filename, linenr, line, posmatch.group(1), msg))

    def spellcheck(self):
        """store a sorted list of all words which are neither known LaTeX terms nor found in the enchant dictionary

        Relies on the module-level names TKNZR, SPELLD and LATEXTERMS, whose
        definitions are commented out in this file; calling this method
        without them raises NameError.
        """
        unknown = {
            token[0]
            for token in TKNZR(self.content)
            if not SPELLD.check(token[0]) and token[0] not in LATEXTERMS
        }
        self.spellerrors = sorted(unknown)
class TexFile(SanityFile):
    """
    A tex file to be checked

    Attributes:
        antipatterns: 2-tuples consisting of a regex to match and a message to deliver if the regex matches
        posnegpatterns: 3-tuples consisting of a pattern to match, a pattern NOT to match, and a message
        filechecks: currently unused #TODO
    """

    antipatterns = (
        (r" et al.","Please use the citation commands \\citet and \\citep"), #et al in main tex
        (r"setfont","You should not set fonts explicitely"), #no font definitions
        (r"\\(small|scriptsize|footnotesize)","Please consider whether changing font sizes manually is really a good idea here"), #no font definitions
        # The last alternative used to read "[Cc]hapter\(", which demanded a
        # literal "(" and therefore never flagged "Chapter \ref".
        (r"([Tt]able|[Ff]igure|[Ss]ection|[Pp]art|[Cc]hapter) *\\ref","It is often advisable to use the more specialized commands \\tabref, \\figref, \\sectref, and \\REF for examples"), #no \ref
        #("",""), #\ea\label
        #("",""), #\section\label
        # Raw string is required here: in a plain string the regex engine
        # receives "\f" (form feed) instead of a backslash followed by "f".
        (r" \\footnote","Footnotes should not be preceded by a space"),
        #("",""), #footnotes end with .
        (r"\[[0-9]+,[0-9]+\]","Please use a space after the comma in lists of numbers "), #no 12,34 without spacing
        (r"\([^)]+\\cite[pt][^)]+\)","In order to avoid double parentheses, it can be a good idea to use \\citealt instead of \\citet or \\citep"),
        (r"([0-9]+-[0-9]+)","Please use -- for ranges instead of -"),
        #(r"[0-9]+ *ff","Do not use ff. Give full page ranges"),
        (r"[^-]---[^-]","Use -- with spaces rather than ---"),
        (r"tabular.*\|","Vertical lines in tables should be avoided"),
        (r"\\hline","Use \\midrule rather than \\hline in tables"),
        (r"\\gl[lt] *[a-z].*[\.?!] *\\\\ *$","Complete sentences should be capitalized in examples"),
        (r"\\glt * ``","Use single quotes for translations."),
        (r"\\section.*[A-Z].*[A-Z].*","Only capitalize this if it is a proper noun"),
        (r"\\s[ubs]+ection.*[A-Z].*[A-Z].*","Only capitalize this if it is a proper noun"),
        (r"[ (][12][8901][0-9][0-9]","Please check whether this should be part of a bibliographic reference"),
        (r"(?<!\\)[A-Z]{3,}","It is often a good idea to use \\textsc{smallcaps} instead of ALLCAPS"),
        (r"(?<![0-9])[?!;\.,][A-Z]","Please use a space after punctuation (or use smallcaps in abbreviations)"), #negative lookbehind for numbers because of lists of citation keys
        (r"\\textsuperscript\{w\}","Please use Unicode ʷ for labialization instead of superscript w"),
        (r"\\textsuperscript\{j\}","Please use Unicode ʲ for palatalization instead of superscript j"),
        (r"\\textsuperscript\{h\}","Please use Unicode ʰ for aspiration instead of superscript h"),
    )

    posnegpatterns = (
        # Both patterns need an escaped backslash: "\[sub]*" matched a literal
        # "[" and "\label" is an invalid escape for the regex engine.
        (r"\\(?:sub)*section\{",r"\\label","All sections should have a \\label. This is not necessary for subexamples."),
        #(r"\\ea.*",r"\label","All examples should have a \\label"),
        (r"\\gll\W+[A-Z]",r"[\.?!][ }]*\\\\ *$","All vernacular sentences should end with punctuation"),
        (r"\\glt\W+[A-Z]",r"[\.?!][}\\]*['’”ʼ]","All translated sentences should end with punctuation"),
    )

    filechecks = (
        #("",""), #src matches #imt
        #("",""), #words
        #("",""), #hyphens
        #("",""), #tabulars have lsptoprule
        #("",""), #US/UK
    )
    #year not in order in multicite
class BibFile(SanityFile):
    """
    A bib file to be checked

    Attributes:
        antipatterns: 2-tuples consisting of a regex to match and a message to deliver if the regex matches
        posnegpatterns: empty; no positive/negative pattern pairs are defined for bib files
    """

    # All patterns are raw strings so that regex escapes such as "\." reach
    # the regex engine without relying on invalid string escape sequences.
    antipatterns = (
        #("[Aa]ddress *=.*[,/].*[^ ]","No more than one place of publication. No indications of countries or provinces"), #double cities in address
        #("[Aa]ddress *=.* and .*","No more than one place of publication."), #double cities in address
        (r"[Tt]itle * =.*: +(?<!{)[a-zA-Z]+","Subtitles should be capitalized. In order to protect the capital letter, enclose it in braces {} "),
        (r"[Aa]uthor *=.*(?<=(and|AND|..[,{])) *[A-Z]\..*","Full author names should be given. Only use abbreviated names if the author is known to prefer this. It is OK to use middle initials"),
        (r"[Ee]ditor *=.*(?<=(and|AND|..[,{])) *[A-Z]\..*","Full editor names should be given. Only use abbreviated names if the editor is known to prefer this. It is OK to use middle initials"),
        (r"[Aa]uthor *=.* et al","Do not use et al. in the bib file. Give a full list of authors"),
        (r"[Aa]uthor *=.*&.*","Use 'and' rather than & in the bib file"),
        (r"[Tt]itle *=(.* )?[IVXLCDM]*[IVX]+[IVXLCDM]*[\.,\) ]","In order to keep the Roman numbers in capitals, enclose them in braces {}"),
        (r"\.[A-Z]","Please use a space after a period or an abbreviated name"),
    )

    posnegpatterns = []
class ImgFile(SanityFile):
    """
    An image file to be checked

    Unlike text files, the image is not read on construction; it is only
    opened when check() is called.
    """

    def __init__(self, filename):
        self.filename = filename
        self.errors = []
        self.spellerrors = []
        self.latexterms = []

    def check(self):
        """
        Record an error if the image resolution appears to be too low.

        If the image carries dpi metadata, either axis below 72 dpi is
        reported. Otherwise the resolution is estimated from the pixel width
        for images narrower than 1500 pixels. Images that cannot be opened
        are reported on stdout and skipped.
        """
        try:
            img = Image.open(self.filename)
        except IOError:
            print("could not open", self.filename)
            return
        # Read what is needed and release the file handle right away.
        with img:
            dpi = img.info.get('dpi')
            width = img.size[0]
        basename = self.filename.split('/')[-1]
        if dpi is not None:
            x, y = dpi
            if x < 72 or y < 72:
                print("low res for", basename)
                self.errors.append(SanityError(self.filename, '', '', 'low resolution', "%sx%sdpi, required 300"%(x,y)))
        elif width < 1500:
            # NOTE(review): dividing by 5 presumably assumes a print width of
            # 5 inches -- confirm against the page layout.
            estimatedresolution = width/5
            print("resolution of %s when printed full with: %i. Required: 300" %(basename,estimatedresolution))
            # Same argument order as above: no line number, no line, and
            # 'low resolution' as the offending string.
            self.errors.append(SanityError(self.filename, '', '', 'low resolution', "estimated %sdpi, required 300"%estimatedresolution))
class SanityDir:
    """
    A directory with various files to be checked

    Attributes:
        dirname (str): the path of the directory
        ignorecodes ([]str): error codes (SanityError.name) not to be printed
        texfiles, bibfiles, pngfiles, jpgfiles ([]str): paths of the files found
        texterrors (dict): maps tex/bib file paths to their list of errors
        imgerrors (dict): maps image file paths to their list of errors
    """

    def __init__(self, dirname, ignorecodes):
        self.dirname = dirname
        self.ignorecodes = ignorecodes
        self.texfiles = self.findallfiles('tex')
        self.bibfiles = self.findallfiles('bib')
        self.pngfiles = self.findallfiles('png')
        self.jpgfiles = self.findallfiles('jpg')
        self.texterrors = {}
        self.imgerrors = {}

    def findallfiles(self, extension):
        """
        find all files with a given extension among local* in the directory,
        the tex files in chapters/ and the files in figures/

        args:
            extension (str): the extension to be looked for

        returns:
            a sorted list of paths for the retrieved files
        """
        candidates = (
            glob.glob('%s/local*' % self.dirname)
            + glob.glob('%s/chapters/*tex' % self.dirname)
            + glob.glob('%s/figures/*' % self.dirname)
        )
        # glob already returns paths that start with dirname; joining dirname
        # once more would duplicate it for relative directories.
        return sorted(fnmatch.filter(candidates, '*.%s' % extension))

    def printErrors(self):
        """
        Print all identified possible errors with metadata (filename, line number, reason)

        Errors whose code is listed in ignorecodes are not printed, but they
        are still included in the count shown in the header line.
        """
        for filename in sorted(self.texterrors):
            fileerrors = self.texterrors[filename]
            print('\n',70*'=','\n%s, %i possible errors found.' % (filename, len(fileerrors)),"Suppressing %i error codes: %s"%(len(self.ignorecodes),','.join(self.ignorecodes)),'\n',70*'=')
            for e in fileerrors:
                if e.name not in self.ignorecodes:
                    print(' ',e.name, e)
        for filename, fileerrors in self.imgerrors.items():
            for e in fileerrors:
                print(filename)
                print(' ',e)

    def check(self):
        """
        Check all files in the directory

        The results are stored in texterrors (tex and bib files) and
        imgerrors (png and jpg files), keyed by file path.
        """
        for tfilename in self.texfiles:
            try:
                t = TexFile(tfilename)
            except AttributeError:
                print(tfilename)
                continue
            t.check()
            #t.spellcheck()
            self.texterrors[tfilename] = t.errors
        for bfilename in self.bibfiles:
            b = BibFile(bfilename)
            b.check()
            self.texterrors[bfilename] = b.errors
        for ifilename in self.pngfiles+self.jpgfiles:
            imgf = ImgFile(ifilename)
            imgf.check()
            self.imgerrors[ifilename] = imgf.errors