Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for Issue #83: Leading Zeros #97

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 4 additions & 1 deletion index.html
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,10 @@ <h2 id="usage">Usage</h2>
The Docsplit gem includes both the <tt>docsplit</tt> command-line utility
as well as a Ruby API. The available commands and options are identical in both.<br />
<tt>--output</tt> or <tt>-o</tt> can be passed to any command in order to
store the generated files in a directory of your choosing.
store the generated files in a directory of your choosing.<br />
<tt>--leading_zeros</tt> can be passed to any command that extracts individual
pages in order to pad the page numbers in the generated file names with zeros,
so that the files sort in numerical order in environments that sort names alphabetically.
</p>

<p>
Expand Down
3 changes: 3 additions & 0 deletions lib/docsplit/command_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ def parse_options
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
@options[:rolling] = true
end
opts.on('--leading_zeros', 'include leading zeros when naming a page') do |l|
@options[:leading_zeros] = true
end
opts.on_tail('-v', '--version', 'display docsplit version') do
puts "Docsplit version #{Docsplit::VERSION}"
exit
Expand Down
11 changes: 10 additions & 1 deletion lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil)
basename = File.basename(pdf, File.extname(pdf))
directory = directory_for(size)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
page_format = page_number_format(pdf)
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
Expand All @@ -41,7 +42,8 @@ def convert(pdf, size, format, previous=nil)
raise ExtractionFailed, result if $? != 0
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
page_number = sprintf(page_format, page)
out_file = ESCAPE[File.join(directory, "#{basename}_#{page_number}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
Expand All @@ -63,6 +65,7 @@ def extract_options(options)
@sizes = [options[:size]].flatten.compact
@sizes = [nil] if @sizes.empty?
@rolling = !!options[:rolling]
@zeros = !!options[:leading_zeros]
end

# If there's only one size requested, generate the images directly into
Expand Down Expand Up @@ -98,6 +101,12 @@ def page_list(pages)
}.flatten.uniq.sort
end

# Generate the printf-style format used for page numbers in output file names.
#
# When the `leading_zeros` option is enabled (@zeros), page numbers are
# zero-padded to the number of digits in the document's page count (e.g.
# "%02d" for a 10-page PDF). Otherwise the plain "%d" format is returned.
#
# pdf - path of the PDF; only inspected when padding is requested.
#
# Returns a format String suitable for sprintf.
def page_number_format(pdf)
  # The page count is only needed to size the padding, so skip the
  # Docsplit.extract_length lookup entirely when padding is off.
  return "%d" unless @zeros

  digits = Docsplit.extract_length(pdf).to_s.length
  "%0#{digits}d"
end

end

end
11 changes: 10 additions & 1 deletion lib/docsplit/page_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ def extract(pdfs, opts)
extract_options opts
[pdfs].flatten.each do |pdf|
pdf_name = File.basename(pdf, File.extname(pdf))
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
page_format = page_number_format(pdf)
page_path = File.join(@output, "#{pdf_name}_#{page_format}.pdf")
FileUtils.mkdir_p @output unless File.exists?(@output)

cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
Expand All @@ -29,6 +30,14 @@ def extract(pdfs, opts)

# Pull the settings this extractor uses out of the options hash.
#
# options - Hash-like object; reads :output (defaults to the current
#           directory, '.') and :leading_zeros (coerced to true/false).
def extract_options(options)
  @output = options[:output] || '.'
  @zeros  = options[:leading_zeros] ? true : false
end

# Generate the printf-style page number format embedded in the output path.
#
# Zero-padding is applied only when the `leading_zeros` option is enabled
# (@zeros) and pdftailor is not the selected tool; in every other case the
# plain "%d" format is returned.
#
# pdf - path of the PDF; only inspected when padding will actually be used.
#
# Returns a format String such as "%d" or "%03d".
def page_number_format(pdf)
  # PDFTailor doesn't support printf-style format in the output, yet
  return "%d" if DEPENDENCIES[:pdftailor] || !@zeros

  # Only look up the page count once we know the padded format is needed.
  digits = Docsplit.extract_length(pdf).to_s.length
  "%0#{digits}d"
end

end
Expand Down
16 changes: 13 additions & 3 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,14 @@ def extract_from_pdf(pdf, pages)
def extract_from_ocr(pdf, pages)
tempdir = Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
page_format = page_number_format(pdf)
escaped_pdf = ESCAPE[pdf]
if pages
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
page_number = sprintf(page_format, page)
tiff = "#{tempdir}/#{@pdf_name}_#{page_number}.tif"
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
file = "#{base_path}_#{page_number}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
clean_text(file + '.txt') if @clean_ocr
Expand Down Expand Up @@ -109,7 +111,8 @@ def extract_full(pdf)
# Extract the contents of a single page of text, directly, adding it to
# the `@pages_to_ocr` list if the text length is inadequate.
def extract_page(pdf, page)
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
page_number = sprintf(page_number_format(pdf), page)
text_path = File.join(@output, "#{@pdf_name}_#{page_number}.txt")
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
unless @forbid_ocr
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
Expand All @@ -123,6 +126,13 @@ def extract_options(options)
@forbid_ocr = options[:ocr] == false
@clean_ocr = !(options[:clean] == false)
@language = options[:language] || 'eng'
@zeros = !!options[:leading_zeros]
end

# Generate the printf-style format used for page numbers in output file names.
#
# When the `leading_zeros` option is enabled (@zeros), page numbers are
# zero-padded to the number of digits in the document's page count (e.g.
# "%02d" for a 10-page PDF). Otherwise the plain "%d" format is returned.
#
# pdf - path of the PDF; only inspected when padding is requested.
#
# Returns a format String suitable for sprintf.
def page_number_format(pdf)
  # This may be called once per extracted page, so avoid the
  # Docsplit.extract_length lookup whenever its result would go unused.
  return "%d" unless @zeros

  digits = Docsplit.extract_length(pdf).to_s.length
  "%0#{digits}d"
end

end
Expand Down
Binary file added test/fixtures/leading_zeros.pdf
Binary file not shown.
9 changes: 9 additions & 0 deletions test/unit/test_extract_images.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,13 @@ def test_name_escaping_while_extracting_images
'PDF file with spaces \'single\' and "double quotes"_1.gif'])
end

# Extracting images with :leading_zeros should produce files whose page
# numbers are zero-padded to two digits (01 through 10) for this fixture.
def test_leading_zeros_while_extracting_images
  Docsplit.extract_images('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT)
  expected_images = (1..10).map { |page| format('leading_zeros_%02d.png', page) }
  assert_directory_contains(OUTPUT, expected_images)
end

end
13 changes: 13 additions & 0 deletions test/unit/test_extract_pages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,17 @@ def test_name_escaping_while_extracting_pages
assert Dir["#{OUTPUT}/*.pdf"].length == 2
end

# Extracting pages with :leading_zeros should produce PDFs whose page numbers
# are zero-padded to two digits (01 through 10) for this fixture.
#
# NOTE(review): the page extractor's format falls back to "%d" when pdftailor
# is the selected tool, so these names presumably only hold without it — confirm.
def test_leading_zeros_while_extracting_pages
  Docsplit.extract_pages('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT)

  # Remove doc_data.txt (presumably a by-product of the splitting tool) so it
  # does not interfere with the directory check. File.exist? is used because
  # File.exists? was removed in Ruby 3.2.
  doc_data_path = File.join(OUTPUT, 'doc_data.txt')
  File.delete(doc_data_path) if File.exist?(doc_data_path)

  assert_directory_contains(OUTPUT, ['leading_zeros_01.pdf', 'leading_zeros_02.pdf',
                                     'leading_zeros_03.pdf', 'leading_zeros_04.pdf',
                                     'leading_zeros_05.pdf', 'leading_zeros_06.pdf',
                                     'leading_zeros_07.pdf', 'leading_zeros_08.pdf',
                                     'leading_zeros_09.pdf', 'leading_zeros_10.pdf'])
end

end
9 changes: 9 additions & 0 deletions test/unit/test_extract_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,13 @@ def test_name_escaping_while_extracting_text
assert Dir["#{OUTPUT}/*.txt"].length == 2
end

# Extracting text for all pages with :leading_zeros should produce text files
# whose page numbers are zero-padded to two digits (01 through 10) for this fixture.
def test_leading_zeros_while_extracting_text
  Docsplit.extract_text('test/fixtures/leading_zeros.pdf', :pages => 'all', :leading_zeros => true, :output => OUTPUT)
  expected_texts = (1..10).map { |page| format('leading_zeros_%02d.txt', page) }
  assert_directory_contains(OUTPUT, expected_texts)
end

end