diff --git a/lib/jars/JPedal-LICENSE.txt b/lib/jars/JPedal-LICENSE.txt deleted file mode 100644 index 3462706c..00000000 --- a/lib/jars/JPedal-LICENSE.txt +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. \ No newline at end of file diff --git a/lib/jars/jpedal_lgpl.jar b/lib/jars/jpedal_lgpl.jar deleted file mode 100644 index 89c0fab7..00000000 Binary files a/lib/jars/jpedal_lgpl.jar and /dev/null differ diff --git a/lib/jars/tabula-0.9.2-jar-with-dependencies.jar b/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar similarity index 74% rename from lib/jars/tabula-0.9.2-jar-with-dependencies.jar rename to lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar index 1bde0c8c..8eabe30b 100644 Binary files a/lib/jars/tabula-0.9.2-jar-with-dependencies.jar and b/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar differ diff --git a/lib/tabula_java_wrapper.rb b/lib/tabula_java_wrapper.rb index 51051f85..8680d8f6 100644 --- a/lib/tabula_java_wrapper.rb +++ b/lib/tabula_java_wrapper.rb @@ -67,12 +67,13 @@ module Extraction def Extraction.openPDF(pdf_filename, password='') raise Errno::ENOENT unless File.exists?(pdf_filename) - document = PDDocument.load(pdf_filename) + document = PDDocument.load(java.io.File.new(pdf_filename)) #document = PDDocument.loadNonSeq(java.io.File.new(pdf_filename), nil, password) document end class ObjectExtractor < Java::TechnologyTabula.ObjectExtractor + field_accessor :pdfDocument alias_method :close!, :close @@ -85,6 +86,10 @@ def initialize(pdf_filename, pages=[1], password='', options={}) super(document) end + + def page_count + self.pdfDocument.get_number_of_pages + end end class PagesInfoExtractor < ObjectExtractor diff --git a/lib/tabula_job_executor/jobs/generate_thumbnails.rb b/lib/tabula_job_executor/jobs/generate_thumbnails.rb index 48a5f7f4..ace19733 100644 --- a/lib/tabula_job_executor/jobs/generate_thumbnails.rb +++ b/lib/tabula_job_executor/jobs/generate_thumbnails.rb @@ -12,7 +12,7 @@ def perform output_dir = options[:output_dir] thumbnail_sizes = options[:thumbnail_sizes] - generator = JPedalThumbnailGenerator.new(filepath, output_dir, thumbnail_sizes) + generator = PDFBox2ThumbnailGenerator.new(filepath, output_dir, thumbnail_sizes) generator.add_observer(self, :at) generator.generate_thumbnails! diff --git a/lib/thumbnail_generator.rb b/lib/thumbnail_generator.rb index 2550c838..fde86f27 100644 --- a/lib/thumbnail_generator.rb +++ b/lib/thumbnail_generator.rb @@ -1,18 +1,18 @@ require 'java' require 'observer' -java.lang.System.setProperty('org.jpedal.jai', 'true') -require_relative './jars/jpedal_lgpl.jar' - java_import javax.imageio.ImageIO java_import java.awt.image.BufferedImage java_import java.awt.Image -java_import org.jpedal.PdfDecoder -java_import org.jpedal.fonts.FontMappings +java_import org.apache.pdfbox.rendering.PDFRenderer +java_import org.apache.pdfbox.pdmodel.PDDocument +java_import java.io.ByteArrayOutputStream + class AbstractThumbnailGenerator include Observable + SIZE = 800 def initialize(pdf_filename, output_directory, sizes=[2048, 560]) raise Errno::ENOENT unless File.directory?(output_directory) @@ -48,41 +48,37 @@ def generate_thumbnails! end end -class JPedalThumbnailGenerator < AbstractThumbnailGenerator +class PDFBox2ThumbnailGenerator < AbstractThumbnailGenerator def initialize(pdf_filename, output_directory, sizes=[2048, 560]) super(pdf_filename, output_directory, sizes) - @decoder = PdfDecoder.new(true) - FontMappings.setFontReplacements - @decoder.openPdfFile(pdf_filename) - @decoder.setExtractionMode(0, 1.0) - @decoder.useHiResScreenDisplay(true) + @pdf_document = PDDocument.load(java.io.File.new(pdf_filename)) end - def generate_thumbnails! - total_pages = @decoder.getPageCount - - total_pages.times do |i| - - begin - image = @decoder.getPageAsImage(i+1); - image_w, image_h = image.getWidth, image.getHeight - - @sizes.each do |s| - scale = s.to_f / image_w.to_f - bi = BufferedImage.new(s, image_h * scale, image.getType) - bi.getGraphics.drawImage(image.getScaledInstance(s, image_h * scale, Image::SCALE_SMOOTH), 0, 0, nil) - ImageIO.write(bi, - 'png', - java.io.File.new(File.join(@output_directory, - "document_#{s}_#{i+1}.png"))) - changed - notify_observers(i+1, total_pages, "generating page thumbnails...") - end - rescue java.lang.RuntimeException - # TODO What? - end + renderer = PDFRenderer.new(@pdf_document); + total_pages = @pdf_document.get_number_of_pages + + total_pages.times do |pi| + image = renderer.render_image_with_dpi(pi, 75); + imageWidth = image.width # was get_width + imageHeight = image.height # was get_height + scale = SIZE / imageWidth.to_f + + bi = BufferedImage.new(SIZE, (imageHeight * scale).round, image.type); + bi.get_graphics.draw_image(image.get_scaled_instance(SIZE, (imageHeight * scale).round, Image::SCALE_SMOOTH), 0, 0, nil); + + out = ByteArrayOutputStream.new + ImageIO.write(bi, "png", out); + + filename = "document_#{SIZE}_#{pi + 1}.png" + ImageIO.write(bi, + 'png', + java.io.File.new(File.join(@output_directory, + filename))) + notify_observers(pi+1, total_pages, "generating page thumbnails...") end - @decoder.closePdfFile + + @pdf_document.close(); + end end @@ -93,9 +89,10 @@ def update(page, total_pages) STDERR.puts "#{page}///#{total_pages}" end end + require_relative '../lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar' - #pdftg = JPedalThumbnailGenerator.new(ARGV[0], '/tmp', [560]) - pdftg = MUDrawThumbnailGenerator.new(ARGV[0], '/tmp', [560]) + # pdftg = MUDrawThumbnailGenerator.new(ARGV[0], '/tmp', [560]) + pdftg = PDFBox2ThumbnailGenerator.new(ARGV[0], '/tmp', [560]) pdftg.add_observer(STDERRProgressReporter.new) pdftg.generate_thumbnails! end diff --git a/webapp/tabula_web.rb b/webapp/tabula_web.rb index 32206e5a..66a42a64 100644 --- a/webapp/tabula_web.rb +++ b/webapp/tabula_web.rb @@ -9,7 +9,7 @@ require 'fileutils' require 'securerandom' -require_relative '../lib/jars/tabula-0.9.2-jar-with-dependencies.jar' +require_relative '../lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar' require_relative '../lib/tabula_java_wrapper.rb' java_import 'java.io.ByteArrayOutputStream'