documentcloud · theredcoder · Jan 22, 2014 · Jan 22, 2014 · Jan 22, 2014
diff --git a/index.html b/index.html
@@ -172,7 +172,10 @@ <h2 id="usage">Usage</h2>
       The Docsplit gem includes both the <tt>docsplit</tt> command-line utility
       as well as a Ruby API. The available commands and options are identical in both.<br />
       <tt>--output</tt> or <tt>-o</tt> can be passed to any command in order to
-      store the generated files in a directory of your choosing.
+      store the generated files in a directory of your choosing.<br />
+      <tt>--leading_zeros</tt> can be passed to any command that extracts individual
+      pages in order to pad the page numbers in the file names with zeros, so that
+      the files sort in numerical order in environments that sort names alphabetically.
     </p>
 
     <p>

diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -101,6 +101,9 @@ def parse_options
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end
+        opts.on('--leading_zeros', 'include leading zeros when naming a page') do |l|
+          @options[:leading_zeros] = true
+        end
         opts.on_tail('-v', '--version', 'display docsplit version') do
           puts "Docsplit version #{Docsplit::VERSION}"
           exit

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil)
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
+      page_format = page_number_format(pdf)
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
@@ -41,7 +42,8 @@ def convert(pdf, size, format, previous=nil)
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          page_number = sprintf(page_format, page)
+          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page_number}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
@@ -63,6 +65,7 @@ def extract_options(options)
       @sizes   = [options[:size]].flatten.compact
       @sizes   = [nil] if @sizes.empty?
       @rolling = !!options[:rolling]
+      @zeros   = !!options[:leading_zeros]
     end
 
     # If there's only one size requested, generate the images directly into
@@ -98,6 +101,12 @@ def page_list(pages)
       }.flatten.uniq.sort
     end
 
+    # Printf-style format for page numbers in output file names: "%d", or zero-padded to the digit count of the PDF's length when @zeros is set.
+    def page_number_format(pdf)
+      return "%d" unless @zeros # skip the length lookup when no padding is requested
+      "%0#{Docsplit.extract_length(pdf).to_s.length}d"
+    end
+
   end
 
 end
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
@@ -9,7 +9,8 @@ def extract(pdfs, opts)
       extract_options opts
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
-        page_path = File.join(@output, "#{pdf_name}_%d.pdf")
+        page_format = page_number_format(pdf)
+        page_path = File.join(@output, "#{pdf_name}_#{page_format}.pdf")
         FileUtils.mkdir_p @output unless File.exists?(@output)
 
         cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
@@ -29,6 +30,14 @@ def extract(pdfs, opts)
 
     def extract_options(options)
       @output = options[:output] || '.'
+      @zeros  = !!options[:leading_zeros] # coerced to a strict true/false; read by page_number_format
+    end
+
+    # Printf-style format for the page number in the output path: "%d", or zero-padded to the digit count of the PDF's length when @zeros is set.
+    def page_number_format(pdf)
+      # PDFTailor doesn't support printf-style formats in its output pattern yet, so padding only applies without it.
+      return "%d" if DEPENDENCIES[:pdftailor] || !@zeros
+      "%0#{Docsplit.extract_length(pdf).to_s.length}d"
     end
 
   end

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -59,12 +59,14 @@ def extract_from_pdf(pdf, pages)
     def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
+      page_format = page_number_format(pdf)
       escaped_pdf = ESCAPE[pdf]
       if pages
         pages.each do |page|
-          tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          page_number = sprintf(page_format, page)
+          tiff = "#{tempdir}/#{@pdf_name}_#{page_number}.tif"
           escaped_tiff = ESCAPE[tiff]
-          file = "#{base_path}_#{page}"
+          file = "#{base_path}_#{page_number}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
@@ -109,7 +111,8 @@ def extract_full(pdf)
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
-      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
+      page_number = sprintf(page_number_format(pdf), page)
+      text_path = File.join(@output, "#{@pdf_name}_#{page_number}.txt")
       run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
@@ -123,6 +126,13 @@ def extract_options(options)
       @forbid_ocr = options[:ocr] == false
       @clean_ocr  = !(options[:clean] == false)
       @language   = options[:language] || 'eng'
+      @zeros      = !!options[:leading_zeros]
+    end
+
+    # Printf-style format for page numbers in output file names: "%d", or zero-padded to the digit count of the PDF's length when @zeros is set.
+    def page_number_format(pdf)
+      return "%d" unless @zeros # skip the length lookup when no padding is requested
+      "%0#{Docsplit.extract_length(pdf).to_s.length}d"
     end
 
   end

diff --git a/test/fixtures/leading_zeros.pdf b/test/fixtures/leading_zeros.pdf
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
@@ -48,4 +48,13 @@ def test_name_escaping_while_extracting_images
                                        'PDF file with spaces \'single\' and "double quotes"_1.gif'])
   end
 
+  def test_leading_zeros_while_extracting_images
+    Docsplit.extract_images('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT)
+    # Expect ten image files whose page numbers are zero-padded to two
+    # digits (01 through 10).
+    page_numbers = (1..10).map { |page| format('%02d', page) }
+    expected = page_numbers.map { |number| "leading_zeros_#{number}.png" }
+    assert_directory_contains(OUTPUT, expected)
+  end
+
 end
diff --git a/test/unit/test_extract_pages.rb b/test/unit/test_extract_pages.rb
@@ -24,4 +24,17 @@ def test_name_escaping_while_extracting_pages
     assert Dir["#{OUTPUT}/*.pdf"].length == 2
   end
 
+  def test_leading_zeros_while_extracting_pages
+    Docsplit.extract_pages('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT)
+
+    # Drop doc_data.txt if present (NOTE(review): presumably a by-product of the splitting tool) before checking the directory.
+    doc_data_path = File.join(OUTPUT, 'doc_data.txt')
+    File.delete(doc_data_path) if File.exist?(doc_data_path)
+    assert_directory_contains(OUTPUT, ['leading_zeros_01.pdf', 'leading_zeros_02.pdf',
+                                       'leading_zeros_03.pdf', 'leading_zeros_04.pdf',
+                                       'leading_zeros_05.pdf', 'leading_zeros_06.pdf',
+                                       'leading_zeros_07.pdf', 'leading_zeros_08.pdf',
+                                       'leading_zeros_09.pdf', 'leading_zeros_10.pdf'])
+  end
+
 end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
@@ -54,4 +54,13 @@ def test_name_escaping_while_extracting_text
     assert Dir["#{OUTPUT}/*.txt"].length == 2
   end
 
+  def test_leading_zeros_while_extracting_text
+    Docsplit.extract_text('test/fixtures/leading_zeros.pdf', :pages => 'all', :leading_zeros => true, :output => OUTPUT)
+    # Expect ten text files whose page numbers are zero-padded to two
+    # digits (01 through 10).
+    page_numbers = (1..10).map { |page| format('%02d', page) }
+    expected = page_numbers.map { |number| "leading_zeros_#{number}.txt" }
+    assert_directory_contains(OUTPUT, expected)
+  end
+
 end