/
img2djvu
executable file
·422 lines (396 loc) · 11.5 KB
/
img2djvu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/bin/bash
### Inspired by script pdf-trim-to-djvu.sh (http://gist.github.com/315791)
### PUBLIC DOMAIN
### Settings
# Built-in defaults. They are kept in their own variables so that usage() can
# still report them after command-line options have changed the working values.
dpidefault=300
trdefault=129
midefault=0
codefault=0
agdefault=0
prodefault="-contrast -blur 0x1"
ocrenginedefault=""
ocrlanguagedefault=""
verbminidefault=0
tmpdefault=0
ocrjobsdefault=1
# Working values: start out equal to the defaults, overridden by options later.
DPI="$dpidefault"
treshold="$trdefault"
usemini="$midefault"
usecodjvu="$codefault"
ag="$agdefault"
usepro="$prodefault"
ocrengine="$ocrenginedefault"
ocrlanguage="$ocrlanguagedefault"
verbmini="$verbminidefault"
tmp="$tmpdefault"
ocrjobs="$ocrjobsdefault"
# internal: how the script was invoked (used for the usage banner)
prog="$0"
# Print the program name and release date stamp on stdout.
printversion() {
  printf "img2djvu version 20151127\n"
}
# Print the help text on stdout.
# Globals read: prog (invocation path) and the *default variables, which are
# interpolated into the text so the shown defaults are the built-in ones.
# Globals written: me (base name of the script).
usage() {
  me=$(basename "$prog")
  cat <<USAGE_TEXT
Usage: "$me" [options] relative_folder_name
Options:
-a <0|1|2> aggressivity: 0 is not aggressive, 1 is aggressive, 2 is very aggressive [default: "$agdefault"]
-c <0|1> make a choice of temporary directory, 0 for /tmp, 1 for current [default: "$tmpdefault"]
-d <int> resolution in DPI [default: "$dpidefault"]
-e "str" if not empty, use OCR engine (supported by ocrodjvu) with this name [default: "$ocrenginedefault"]
-j <int> number of OCR jobs [default: "$ocrjobsdefault"]
-l <int> if not 0, will use forced segmentation (with <int> downsampling) [default: "$codefault"]
-m <int> if not 0, will use minidjvu (with <int> dictionary size) instead of cjb2 [default: "$midefault"]
-r "str" if not empty, use OCR engine with given language [default: "$ocrlanguagedefault"]
-p "str" process color layer with given ImageMagick options [default: "$prodefault"]
-t <int> if not 0, will use cpaldjvu for all images with number of colors less than <int> [default: "$trdefault"]
-v <0|1> if not 0, minidjvu run will be verbose [default: "$verbminidefault"]
-h|--help print this message
-V print version and exit
USAGE_TEXT
}
# Parse command-line options with (enhanced) getopt and store them in the
# working setting variables. After this block the positional parameters hold
# only the non-option arguments.
#
# If getopt rejects the command line we stop right away: continuing with the
# original, un-normalised arguments could leave a first argument that matches
# no case arm, which would make the loop below spin forever.
if ! opts=$(getopt -o "a:c:d:e:j:l:m:p:r:t:v:hV" -l "help" -- "$@") ; then
  # getopt has already reported the offending option on stderr.
  usage
  exit 1
fi
# getopt emits a shell-quoted argument list terminated by "--"; eval re-reads it.
eval set -- "$opts"
while [ $# -gt 0 ] ; do
  case "$1" in
    -a) ag="$2" ; shift 2 ;;
    -c) tmp="$2" ; shift 2 ;;
    -d) DPI="$2" ; shift 2 ;;
    -e) ocrengine="$2" ; shift 2 ;;
    -j) ocrjobs="$2" ; shift 2 ;;
    -l) usecodjvu="$2" ; shift 2 ;;
    -m) usemini="$2" ; shift 2 ;;
    -r) ocrlanguage="$2" ; shift 2 ;;
    -p) usepro="$2" ; shift 2 ;;
    -t) treshold="$2" ; shift 2 ;;
    -v) verbmini="$2" ; shift 2 ;;
    -h|--help) usage ; exit 0 ;;
    -V) printversion ; exit 0 ;;
    --) shift ; break ;;
    # Anything else ends option processing; never loop without consuming input.
    *) break ;;
  esac
done
### Checks
# Validate the single positional argument (a relative folder name) and make
# sure the required external tools are available. Every failure prints a
# message followed by the usage text and exits with status 1.
if [ $# -ne 1 ] ; then
  printf "\nFolder name is required.\n\n"
  usage
  exit 1
fi
# The output name is later built as "$(pwd)/<folder>.djvu", so the folder has
# to be given relative to the current directory.
case "$1" in
  /*)
    printf "\nAbsolute folder names are not allowed.\n\n"
    usage
    exit 1
    ;;
esac
if [ ! -d "$1" ] ; then
  # The name is passed as an argument so that '%' or '\' in it are not
  # interpreted as printf directives.
  printf "\n%s is not a folder.\n\n" "$1"
  usage
  exit 1
fi
if ! command -v cpaldjvu >/dev/null 2>&1 ; then
  printf "\nDjVu Libre should be installed.\n\n"
  usage
  exit 1
else
  # Take the first line of cpaldjvu's banner, keep what follows the last '-'
  # (the version, e.g. 3.5.27) and drop the dots, giving an integer such as
  # 3527 that codjvucoder compares against 3523. The class is [0-9.] so that
  # versions containing a zero (3.5.20) are parsed as well.
  dv=$(cpaldjvu 2>&1 | sed -e '2,$d' -e 's/^.*-\([0-9.]*\)$/\1/' -e 's/\.//g')
fi
if ! command -v identify >/dev/null 2>&1 ; then
  printf "\nImageMagick should be installed.\n\n"
  usage
  exit 1
fi
# OCR is enabled only when both the engine (-e) and the language (-r) are given;
# giving just one of them is an error.
if ! [[ -n "$ocrengine" && -n "$ocrlanguage" ]] ; then
  useocr=0
  if [[ -n "$ocrengine" || -n "$ocrlanguage" ]] ; then
    printf "\nFor OCR, both OCR options (-e and -r) should be specified.\n\n"
    usage
    exit 1
  fi
else
  if ! command -v ocrodjvu >/dev/null 2>&1 ; then
    printf "\nFor OCR, ocrodjvu should be installed.\n\n"
    usage
    exit 1
  fi
  useocr=1
fi
### Aggressivity options
# Map the -a level to the progress brackets (ags/age) and to the extra flags
# handed to each encoder. All flag variables start empty; each level then
# fills in only the ones it needs.
c44opt=""
cjb2opt=""
cpaldjvuopt=""
minidjvuopt=""
case "$ag" in
  0)
    ags="<" ; age=">"
    cpaldjvuopt="-bgwhite"
    ;;
  1)
    ags="<<" ; age=">>"
    c44opt="-slice 74+13+10"
    cjb2opt="-clean"
    minidjvuopt="--clean"
    ;;
  2)
    ags="<<<" ; age=">>>"
    c44opt="-slice 76+15"
    cjb2opt="-lossy"
    minidjvuopt="--lossy --aggression 200"
    ;;
  *)
    printf "\nAggressivity has only three possible values: 0, 1, or 2.\n\n"
    usage
    exit 1
    ;;
esac
### General coders functions
# Encode one colour page.
# Arguments: $1 - input PNM file, $2 - output DjVu file
# Globals:   usecodjvu - below 1 selects c44coder, otherwise codjvucoder
# Returns:   the status of the coder that was run
colorcoder() {
  local coder=codjvucoder
  [ "$usecodjvu" -lt 1 ] && coder=c44coder
  "$coder" "$1" "$2"
}
### Subcoders functions
### Black and white
# Encode one bitonal page with cjb2.
# Arguments: $1 - input file, $2 - output DjVu file
# Globals:   cjb2opt (extra flags), DPI
# Prints no progress symbol of its own because codjvucoder calls it as well.
cjb2coder() {
  # Left unquoted on purpose: cjb2opt holds zero or more space-separated flags.
  local -a extra=($cjb2opt)
  cjb2 "${extra[@]}" -dpi "$DPI" "$1" "$2"
}
# Encode a run of bitonal pages with minidjvu and print the progress marker
# "<ags>M:<bwcount>".
# Arguments: the input pages followed by the output file. Callers pass the
#            pages as one space-joined string, so "$@" is deliberately left
#            unquoted to split it back into separate file names.
# Globals:   DPI, usemini (dictionary size), minidjvuopt, verbmini, ags, bwcount
minidjvucoder() {
  local verbose_flag=""
  if [ "$verbmini" -gt 0 ] ; then
    verbose_flag="-v"
  fi
  # An empty verbose_flag vanishes after word splitting.
  minidjvu -d "$DPI" -p "$usemini" $minidjvuopt $verbose_flag $@
  printf "$ags""M:"
  printf "$bwcount"
}
### Color
# Encode one low-colour page with cpaldjvu and print the progress symbol "F".
# Arguments: $1 - input file, $2 - output DjVu file
# Globals:   cpaldjvuopt (extra flags), DPI
cpaldjvucoder() {
  # Left unquoted on purpose: cpaldjvuopt holds zero or more flags.
  local -a extra=($cpaldjvuopt)
  cpaldjvu "${extra[@]}" -dpi "$DPI" "$1" "$2"
  printf "F"
}
# Encode one colour page with c44 and print the progress symbol "C".
# Arguments: $1 - input file, $2 - output DjVu file
# Globals:   c44opt (extra flags), DPI
c44coder() {
  # Left unquoted on purpose: c44opt holds zero or more space-separated flags.
  local -a extra=($c44opt)
  c44 "${extra[@]}" -dpi "$DPI" "$1" "$2"
  printf "C"
}
# Layer separation and "forced segmentation" coder
#
# Splits a page into a bitonal foreground mask and a downsampled colour
# background, encodes each layer separately and assembles a compound DjVu.
# Arguments: $1 - input image, $2 - output DjVu file
# Globals:   tmpdir, usecodjvu (downsampling factor), usepro (ImageMagick
#            options for the background), c44opt, DPI, dv (DjVuLibre version
#            as an integer)
# Outputs:   progress symbols "L", ":", "1", "+1", "+1" on stdout
# Returns:   non-zero if the scratch directory cannot be created or entered
#
# All chunk building happens inside a subshell, so the caller's working
# directory is left untouched, and a failing cd can no longer make the
# encoders write into an unrelated directory.
codjvucoder() {
  local src="$1" dst="$2"
  local workdir="$tmpdir"/tmp
  local newsize status
  # folders
  printf "L"
  mkdir -p "$workdir" || return 1
  # image parameters: page size divided by the downsampling factor, rounded up
  newsize=$(identify -format "%[fx:ceil(w/$usecodjvu)]x%[fx:ceil(h/$usecodjvu)]" "$src")
  # separate layers from Scan Tailor
  convert "$src" -threshold 1 "$workdir/fore.pbm"
  # $usepro is left unquoted on purpose: it holds several ImageMagick options.
  convert "$src" -fill white -opaque black $usepro -resize "$newsize"\! "$workdir/back.ppm"
  (
    cd "$workdir" || exit 1
    printf ":"
    # make BG44 chunk
    c44 $c44opt -dpi "$DPI" back.ppm back.djvu
    djvuextract back.djvu BG44=bg44.cnk >/dev/null 2>&1
    printf "1"
    # make Sjbz chunk
    cjb2coder fore.pbm sjbz.djvu
    djvuextract sjbz.djvu Sjbz=sjbz.cnk >/dev/null 2>&1
    printf "+1"
    if [ "$dv" -lt 3523 ] ; then
      # make FG44 chunk from background (!)
      convert -size "$newsize"\! -depth 8 xc:black fore.pgm
      c44 -dpi "$DPI" -slice 120 fore.pgm fore.djvu
      djvuextract fore.djvu BG44=fg44.cnk >/dev/null 2>&1
      # make compound DjVu
      djvumake "$dst" INFO=,,"$DPI" Sjbz=sjbz.cnk FG44=fg44.cnk BG44=bg44.cnk
    else
      # use new ability of djvumake to create FGbz chunk on the fly
      djvumake "$dst" INFO=,,"$DPI" Sjbz=sjbz.cnk FGbz="#black" BG44=bg44.cnk
    fi
    printf "+1"
  )
  status=$?
  rm -rf "$workdir"
  return "$status"
}
### General coder and bundler
# convert all pages to pnm: convert
# determine color depth of each page
# if depth <= 1, compress with cjb2 or minidjvu
# if depth > 1 and treshold > 0, count number of unique colors
# if unique colors < 129, compress with cpaldjvu
# if unique colors >= 129, compress with c44 or codjvu
# display progress symbols (B or M for Bitonal, C for TrueColor, L for codjvu layers, F for Palette)
# create bundled DjVu file
# Page-by-page coder and bundler (cjb2 for bitonal pages).
#
# Converts every regular entry of "$fld" to PNM inside "$tmpdir", encodes it
# with the coder matching its colour depth, bundles the resulting pages into
# "$djvu" and optionally runs OCR on the result.
# Globals:  fld, tmpdir, djvu, treshold, ags, age, useocr, ocrengine,
#           ocrlanguage, ocrjobs
# Outputs:  progress symbols per page ("." for a page that could not be
#           converted), then "Done." or a failure notice
# Returns:  0 on success (tmpdir removed), 2 on failure (tmpdir kept)
nomini() {
  if (
    cd "$fld" || exit 1
    # Probe for tifftopnm once; the flag is always defined so the test below
    # never sees an empty operand.
    have_tifftopnm=0
    if command -v tifftopnm >/dev/null 2>&1 ; then
      have_tifftopnm=1
    fi
    # Anchored and matched against the file name only, so that "a.tif.jpg" or
    # a folder called "scans.tiffs" does not count as TIFF.
    tiff_re='\.(tif|tiff)$'
    # for every page (absolute paths: independent of the working directory)
    for src in "$fld"/* ; do
      # Skip sub-folders and the literal pattern left behind by an empty folder.
      if [[ -d "$src" && ! -L "$src" ]] || [[ ! -e "$src" && ! -L "$src" ]] ; then
        continue
      fi
      f=${src##*/}
      of="$tmpdir/${f%.*}.pnm"
      if [[ "$have_tifftopnm" -eq 1 && "$f" =~ $tiff_re ]] ; then
        # tifftopnm is faster and uses less memory than convert for TIFFs.
        tifftopnm "$src" > "$of" 2>/dev/null
      else
        # for all other images (JPEG etc), use convert
        convert "$src" "$of" 2>/dev/null
      fi
      # Any non-zero status means the page could not be converted.
      if [ $? -ne 0 ] ; then
        printf "."
        continue
      fi
      printf '%s' "$ags" &&
        ( # switch between formats
          if [ "$(identify -format "%z" "$of")" -gt 1 ] ; then
            if [ "$treshold" -gt 0 ] ; then
              colors=$(identify -format "%k" "$of") &&
                if [ "$colors" -lt "$treshold" ] ; then
                  cpaldjvucoder "$of" "${of%pnm}djvu"
                else
                  colorcoder "$of" "${of%pnm}djvu"
                fi
            else
              colorcoder "$of" "${of%pnm}djvu"
            fi
          else
            cjb2coder "$of" "${of%pnm}djvu"
            printf "B"
          fi
        ) &&
        rm "$of" &&
        printf '%s' "$age"
    done
    # Bundling is attempted regardless of how the last page ended.
    cd "$tmpdir" &&
      djvm -c merged.djvu *.djvu &&
      mv merged.djvu "$djvu" &&
      printf "\nDone.\n" &&
      if [ "$useocr" -gt 0 ] ; then
        printf "Starting OCR...\n"
        ocrodjvu --engine "$ocrengine" --language "$ocrlanguage" --jobs "$ocrjobs" --in-place --on-error=resume "$djvu"
      fi
  ) && rm -rf "$tmpdir" ; then
    return 0
  fi
  printf "Failure\nTemporary directory left: %s\n" "$tmpdir"
  return 2
}
### Minidjvu-based coder and bundler
# Similar to the previous coder, but bitonal pages are not encoded one by one with cjb2: minidjvu is called on a whole run of black-and-white pages, each time the run is interrupted by a color image or when the sequence ends on a black-and-white file
# Because it works on sequences, the progress output is less detailed; minidjvu is also slower than cjb2
# Minidjvu-based coder and bundler.
#
# Converts every regular entry of "$fld" to PNM inside "$tmpdir". Consecutive
# bitonal pages are collected and encoded together by minidjvucoder; colour
# pages are encoded one by one. The results are bundled into "$djvu" and OCR
# is run on it if requested.
# Globals:  fld, tmpdir, djvu, treshold, ags, age, useocr, ocrengine,
#           ocrlanguage, ocrjobs
# Outputs:  progress symbols ("." for a page that could not be converted),
#           then "Done." or a failure notice
# Returns:  0 on success (tmpdir removed), 2 on failure (tmpdir kept)
#
# The pending bitonal run is flushed after the loop instead of by counting
# pages, so it is no longer lost when the last file fails to convert.
mini() {
  if (
    cd "$fld" || exit 1
    bwcount=0
    bwpages=""
    # Probe for tifftopnm once; the flag is always defined.
    have_tifftopnm=0
    if command -v tifftopnm >/dev/null 2>&1 ; then
      have_tifftopnm=1
    fi
    # Anchored and matched against the file name only.
    tiff_re='\.(tif|tiff)$'
    # Absolute paths: a coder may change the working directory during the loop.
    for src in "$fld"/* ; do
      # Skip sub-folders and the literal pattern left behind by an empty folder.
      if [[ -d "$src" && ! -L "$src" ]] || [[ ! -e "$src" && ! -L "$src" ]] ; then
        continue
      fi
      f=${src##*/}
      of="$tmpdir/${f%.*}.pnm"
      if [[ "$have_tifftopnm" -eq 1 && "$f" =~ $tiff_re ]] ; then
        # tifftopnm is faster and uses less memory than convert for TIFFs.
        tifftopnm "$src" > "$of" 2>/dev/null
      else
        # for all other images (JPEG etc), use convert
        convert "$src" "$of" 2>/dev/null
      fi
      # Any non-zero status means the page could not be converted.
      if [ $? -ne 0 ] ; then
        printf "."
        continue
      fi
      if [ "$(identify -format "%z" "$of")" -gt 1 ] ; then
        # A colour page ends the current bitonal run: encode the run first.
        # The ".1.djvu" suffix sorts it before this page's ".2.djvu".
        if [ "$bwcount" -gt 0 ] ; then
          minidjvucoder "$bwpages" "${of%pnm}1.djvu"
          # bwpages is a space-joined list, hence unquoted.
          rm -f $bwpages
          bwpages=""
          bwcount=0
          printf '%s' "$age"
        fi
        printf '%s' "$ags"
        if [ "$treshold" -gt 0 ] ; then
          colors=$(identify -format "%k" "$of")
          if [ "$colors" -lt "$treshold" ] ; then
            cpaldjvucoder "$of" "${of%pnm}2.djvu"
          else
            colorcoder "$of" "${of%pnm}2.djvu"
          fi
        else
          colorcoder "$of" "${of%pnm}2.djvu"
        fi
        rm "$of" &&
          printf '%s' "$age"
      else
        bwpages="$bwpages $of"
        bwcount=$((bwcount+1))
      fi
    done
    # Encode the bitonal run the folder ended on, if any.
    if [ "$bwcount" -gt 0 ] ; then
      minidjvucoder "$bwpages" "${of%pnm}1.djvu"
      rm -f $bwpages
      bwpages=""
      bwcount=0
      printf '%s' "$age"
    fi
    cd "$tmpdir" &&
      djvm -c "$djvu" *.djvu &&
      printf "\nDone.\n" &&
      if [ "$useocr" -gt 0 ] ; then
        printf "Starting OCR...\n"
        ocrodjvu --engine "$ocrengine" --language "$ocrlanguage" --jobs "$ocrjobs" --in-place --on-error=resume "$djvu"
      fi
  ) && rm -rf "$tmpdir" ; then
    return 0
  fi
  printf "Failure\nTemporary directory left: %s\n" "$tmpdir"
  return 2
}
# Absolute paths to input folder and output DjVu files
fld="$(pwd)/$1"
# Strip one trailing slash with a parameter expansion; unlike an unquoted
# echo, this keeps repeated spaces in the folder name intact.
djvu="$(pwd)/${1%/}.djvu"
if [ "$tmp" -eq 0 ] ; then
  tmpdirprefix="/tmp"
else
  tmpdirprefix=$(pwd)
fi
### START!
# Number of non-directory entries in the input folder.
pgcount=$(ls -1p "$fld" | grep -c -v '/$')
printf "%s files:\n" "$pgcount"
if [ "$usemini" -lt 1 ] ; then
  coder=nomini
else
  coder=mini
  if ! command -v minidjvu >/dev/null 2>&1 ; then
    printf "\nFor -m option, minidjvu is required.\n\n"
    usage
    exit 1
  fi
  # The minidjvu path hands page names around as a space-separated list, so
  # neither file names nor the temporary directory may contain spaces.
  # grep is left verbose on purpose: it shows the offending names.
  if ls -1p "$fld" | grep " " ; then
    printf "\nSome file names contain spaces, minidjvu will not run!\n"
    exit 1
  fi
  if [ "$tmp" -eq 1 ] ; then
    if printf '%s\n' "$fld" | grep " " ; then
      printf "\nPath to temporary directory contain spaces, minidjvu will not run!\n"
      exit 1
    fi
  fi
fi
# Without a temporary directory the coders would write pages relative to "/",
# so stop here if it cannot be created.
if ! tmpdir=$(mktemp -d "$tmpdirprefix"/pagesXXXXXX) || [ -z "$tmpdir" ] ; then
  printf "\nCannot create a temporary directory in %s.\n" "$tmpdirprefix"
  exit 1
fi
# The coder's status becomes the exit status of the script.
"$coder"