diff --git a/Gemfile b/Gemfile index 40dbaf7e..14eb6ae1 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ source "https://rubygems.org" platform :jruby do - gem "cuba" + gem "roda" gem "rack" gem "tilt" gem "rufus-lru" diff --git a/Gemfile.lock b/Gemfile.lock index 63333175..84eab59c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,12 +1,12 @@ GEM remote: https://rubygems.org/ specs: - cuba (3.3.0) - rack jruby-jars (1.7.15) jruby-rack (1.1.16) rack (1.5.2) rake (10.3.2) + roda (1.3.0) + rack rubyzip (1.1.6) rufus-lru (1.0.5) tabula-extractor (0.7.6-java) @@ -23,10 +23,10 @@ PLATFORMS java DEPENDENCIES - cuba jruby-jars (= 1.7.15) rack rake + roda rufus-lru tabula-extractor (~> 0.7.6) tilt diff --git a/config.ru b/config.ru index 0d241877..12d743f5 100644 --- a/config.ru +++ b/config.ru @@ -1,7 +1,7 @@ # encoding: UTF-8 require_relative './webapp/tabula_settings.rb' require_relative './webapp/tabula_web.rb' -run Cuba +run Roda.app if "#{$PROGRAM_NAME}".include?("tabula.jar") # only do this if running as jar or app. (if "rackup", we don't @@ -10,7 +10,7 @@ if "#{$PROGRAM_NAME}".include?("tabula.jar") require 'java' # don't do "java_import java.net.URI" -- it conflicts with Ruby URI and - # makes Cuba/Rack really really upset. just call "java.*" classes + # makes Roda/Rack really really upset. just call "java.*" classes # directly. port = java.lang.Integer.getInteger('jetty.port', 8080) url = "http://127.0.0.1:#{port}" diff --git a/webapp/tabula_debug.rb b/webapp/tabula_debug.rb index ac6ca1cc..12ffdd96 100644 --- a/webapp/tabula_debug.rb +++ b/webapp/tabula_debug.rb @@ -1,108 +1,86 @@ require 'json' -class TabulaDebug < Cuba - define do +class TabulaDebug < Roda + clear_middleware! - on ":file_id/characters" do |file_id| - par = JSON.load(req.params['coords']).first + route do + on :file_id, :method=>:get do |file_id| + par = JSON.load(request['coords']).first page = par['page'] - - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - - text_elements = extractor.extract.next.get_text([par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f]) - - res['Content-Type'] = 'application/json' - res.write text_elements.map { |te| - { 'left' => te.left, - 'top' => te.top, - 'width' => te.width, - 'height' => te.height, - 'text' => te.text } - }.to_json - end - - on ":file_id/text_chunks" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - text_elements = extractor.extract.next.get_text([par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f]) - - text_chunks = Tabula::TextElement.merge_words(text_elements) + is "characters" do |file_id| + text_elements = extractor.extract.next.get_text([par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f]) + + text_elements.map { |te| + { 'left' => te.left, + 'top' => te.top, + 'width' => te.width, + 'height' => te.height, + 'text' => te.text } + } + end - puts text_chunks.inspect + is "text_chunks" do |file_id| + text_elements = extractor.extract.next.get_text([par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f]) - res['Content-Type'] = 'application/json' - res.write text_chunks.map { |te| - { 'left' => te.left, - 'top' => te.top, - 'width' => te.width, - 'height' => te.height, - 'text' => te.text } - }.to_json - end + text_chunks = Tabula::TextElement.merge_words(text_elements) + text_chunks.map { |te| + { 'left' => te.left, + 'top' => te.top, + 'width' => te.width, + 'height' => te.height, + 'text' => te.text } + } + end - on ":file_id/clipping_paths" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - extractor.debug_clipping_paths = true + is "clipping_paths" do |file_id| + extractor.debug_clipping_paths = true - extractor.extract.next + extractor.extract.next - res['Content-Type'] = 'application/json' - res.write extractor.clipping_paths.map { |cp| - { - 'left' => cp.left, - 'top' => cp.top, - 'width' => cp.width, - 'height' => cp.height + extractor.clipping_paths.map { |cp| + { + 'left' => cp.left, + 'top' => cp.top, + 'width' => cp.width, + 'height' => cp.height + } } - }.to_json - end - - on ":file_id/rulings" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] + end - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) + is "rulings" do |file_id| + # crop lines to area of interest + top, left, bottom, right = [par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f] - # crop lines to area of interest - par = JSON.load(req.params['coords']).first - top, left, bottom, right = [par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f] + area = Tabula::ZoneEntity.new(top, left, + right - left, bottom - top) - area = Tabula::ZoneEntity.new(top, left, - right - left, bottom - top) + page_obj = extractor.extract.next + page_area = page_obj.get_area(area) + rulings = page_area.ruling_lines - page_obj = extractor.extract.next - page_area = page_obj.get_area(area) - rulings = page_area.ruling_lines + intersections = {} + if request['show_intersections'] != 'false' + intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines, + page_area.vertical_ruling_lines) + end - intersections = {} - if req.params['show_intersections'] != 'false' - intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines, - page_area.vertical_ruling_lines) + {:rulings => rulings.uniq, :intersections => intersections.keys} end - res['Content-Type'] = 'application/json' - res.write({:rulings => rulings.uniq, :intersections => intersections.keys }.to_json) end - end end diff --git a/webapp/tabula_job_progress.rb b/webapp/tabula_job_progress.rb index 13af32f9..f1985f39 100644 --- a/webapp/tabula_job_progress.rb +++ b/webapp/tabula_job_progress.rb @@ -1,52 +1,51 @@ require_relative '../lib/tabula_job_executor/executor.rb' -class TabulaJobProgress < Cuba - define do - on ":upload_id/json" do |batch_id| - # upload_id is the "job id" uuid that resque-status provides - batch = Tabula::Background::JobExecutor.get_by_batch(batch_id) - res['Content-Type'] = 'application/json' - message = {} - if batch.empty? - res.status = 404 - message[:status] = "error" - message[:message] = "No such job" - message[:pct_complete] = 0 - elsif batch.any? { |uuid, job| job.failed? } - message[:status] = "error" - message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again." - message[:pct_complete] = 99 - res.write message.to_json - else - s = batch.find { |uuid, job| job.working? } - message[:status] = !s.nil? ? s.last.status['status'] : 'completed' - message[:message] = !s.nil? && !s.last.message.nil? ? s.last.message.first : '' - message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i - message[:file_id] = req.params['file_id'] - message[:upload_id] = batch_id - res.write message.to_json - end - end +class TabulaJobProgress < Roda + clear_middleware! - on ":upload_id" do |batch_id| + route do + on :upload_id, :method=>:get do |batch_id| # upload_id is the "job id" uuid that resque-status provides batch = Tabula::Background::JobExecutor.get_by_batch(batch_id) - if batch.empty? - res.status = 404 - res.write "" - res.write view("upload_error.html", - :message => "invalid upload_id (TODO: make this generic 404)") - elsif batch.any? { |uuid, job| job.failed? } - res.write view("upload_error.html", - :message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again.") - else - s = batch.find { |uuid, job| job.working? } - res.write view("upload_status.html", - :status => !s.nil? ? s.last.message : 'completed', - :pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i, - :upload_id => batch_id, - :file_id => req.params['file_id']) + is "json" do |batch_id| + message = {} + if batch.empty? + response.status = 404 + message[:status] = "error" + message[:message] = "No such job" + message[:pct_complete] = 0 + elsif batch.any? { |uuid, job| job.failed? } + message[:status] = "error" + message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again." + message[:pct_complete] = 99 + else + s = batch.find { |uuid, job| job.working? } + message[:status] = !s.nil? ? s.last.status['status'] : 'completed' + message[:message] = !s.nil? && !s.last.message.nil? ? s.last.message.first : '' + message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i + message[:file_id] = request['file_id'] + message[:upload_id] = batch_id + end + message + end + + is do + if batch.empty? + response.status = 404 + view("upload_error.html", :locals=>{ + :message => "invalid upload_id (TODO: make this generic 404)"}) + elsif batch.any? { |uuid, job| job.failed? } + view("upload_error.html", :locals=>{ + :message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again."}) + else + s = batch.find { |uuid, job| job.working? } + view("upload_status.html", :locals=>{ + :status => !s.nil? ? s.last.message : 'completed', + :pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i, + :upload_id => batch_id, + :file_id => request['file_id']}) + end end end end diff --git a/webapp/tabula_web.rb b/webapp/tabula_web.rb index e9c93954..ba4fab9a 100644 --- a/webapp/tabula_web.rb +++ b/webapp/tabula_web.rb @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -require 'cuba' -require 'cuba/render' +require 'roda' +require 'tilt/erb' require 'rufus-lru' require 'digest/sha1' @@ -44,12 +44,17 @@ def is_valid_pdf?(path) MAX_CACHE_ENTRIES = 10 -Cuba.plugin Cuba::Render -Cuba.settings[:render].store(:views, File.expand_path("views", File.dirname(__FILE__))) -Cuba.use Rack::MethodOverride -Cuba.use Rack::Static, root: STATIC_ROOT, urls: ["/css","/js", "/img", "/swf"] -Cuba.use Rack::ContentLength -Cuba.use Rack::Reloader +Roda.plugin :render, :views=>File.expand_path("views", File.dirname(__FILE__)), + :template_opts=>{:default_encoding=>'UTF-8'} +Roda.plugin :all_verbs +Roda.plugin :json +Roda.plugin :delegate +Roda.request_delegate :on, :is, :delete, :get, :put, :post, :root, :run +Roda.plugin :default_headers, 'Content-Type'=>"text/html; charset=utf-8" +Roda.use Rack::MethodOverride +Roda.use Rack::Static, root: STATIC_ROOT, urls: ["/css","/js", "/img", "/swf"] +Roda.use Rack::ContentLength +Roda.use Rack::Reloader if TabulaSettings::EXTRACTION_CACHE CACHE = Rufus::Lru::SynchronizedHash.new(MAX_CACHE_ENTRIES) @@ -70,24 +75,26 @@ def has_key?(k) CACHE = NoCache.new end -Cuba.define do +if TabulaSettings::ENABLE_DEBUG_METHODS + require_relative './tabula_debug.rb' +end +require_relative './tabula_job_progress.rb' + +Roda.route do if TabulaSettings::ENABLE_DEBUG_METHODS - require_relative './tabula_debug.rb' on 'debug' do run TabulaDebug end end - on 'queue' do - require_relative './tabula_job_progress.rb' run TabulaJobProgress end - on delete do + delete do - on 'pdf/:file_id/page/:page_number' do |file_id, page_number| + is 'pdf/:file_id/page/:page_number' do |file_id, page_number| index_fname = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'pages.json') @@ -97,7 +104,7 @@ def has_key?(k) end # delete an uploaded file - on 'pdf/:file_id' do |file_id| + is 'pdf/:file_id' do |file_id| workspace_file = File.join(TabulaSettings::DOCUMENTS_BASEPATH, 'workspace.json') raise if !File.exists?(workspace_file) @@ -118,14 +125,14 @@ def has_key?(k) end - on put do - on 'pdf/:file_id/page/:page_number' do |file_id, page_number| + put do + is 'pdf/:file_id/page/:page_number' do |file_id, page_number| # nothing yet end end - on get do - on root do + get do + root do workspace_file = File.join(TabulaSettings::DOCUMENTS_BASEPATH, 'workspace.json') workspace = if File.exists?(workspace_file) File.open(workspace_file) { |f| JSON.load(f) } @@ -133,8 +140,7 @@ def has_key?(k) [] end - res.write view("index.html", - workspace: workspace) + view("index.html", :locals=>{workspace: workspace}) end @@ -142,43 +148,42 @@ def has_key?(k) run Rack::File.new(TabulaSettings::DOCUMENTS_BASEPATH) end - on "pdf/:file_id" do |file_id| + is "pdf/:file_id" do |file_id| document_dir = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id) - unless File.directory?(document_dir) - res.status = 404 - else - res.write view("pdf_view.html", + if File.directory?(document_dir) + view("pdf_view.html", :locals=>{ pages: File.open(File.join(document_dir, 'pages.json')) { |f| JSON.parse(f.read) }, - file_id: file_id) + file_id: file_id}) end end end # /get - on post do - on 'upload' do + post do + is 'upload' do + + tempfile_path = request['file'][:tempfile].path # Make sure this is a PDF, before doing anything - unless is_valid_pdf?(req.params['file'][:tempfile].path) - res.status = 400 - res.write view("upload_error.html", - :message => "Sorry, the file you uploaded was not detected as a PDF. You must upload a PDF file. Please try again.") - next # halt this handler + unless is_valid_pdf?(tempfile_path) + response.status = 400 + next view("upload_error.html", :locals=>{ + :message => "Sorry, the file you uploaded was not detected as a PDF. You must upload a PDF file. Please try again."}) end - original_filename = req.params['file'][:filename] + original_filename = request['file'][:filename] file_id = Digest::SHA1.hexdigest(Time.now.to_s) file_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id) FileUtils.mkdir(file_path) begin - FileUtils.mv(req.params['file'][:tempfile].path, + FileUtils.mv(tempfile_path, File.join(file_path, 'document.pdf')) rescue Errno::EACCES # move fails on windows sometimes - FileUtils.cp_r(req.params['file'][:tempfile].path, + FileUtils.cp_r(tempfile_path, File.join(file_path, 'document.pdf')) - FileUtils.rm_rf(req.params['file'][:tempfile].path) + FileUtils.rm_rf(tempfile_path) end @@ -192,7 +197,7 @@ def has_key?(k) :id => file_id, :batch => job_batch) - if req.params['autodetect-tables'] + if request['autodetect-tables'] DetectTablesJob.create(:filename => file, :output_dir => file_path, :batch => job_batch) @@ -208,21 +213,21 @@ def has_key?(k) :thumbnail_sizes => [560], :batch => job_batch) - res.redirect "/queue/#{job_batch}?file_id=#{file_id}" + request.redirect "/queue/#{job_batch}?file_id=#{file_id}" end - on "pdf/:file_id/data" do |file_id| + is "pdf/:file_id/data" do |file_id| pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - coords = JSON.load(req.params['coords']) + coords = JSON.load(request['coords']) coords.sort_by! do |coord_set| [ coord_set['page'], [coord_set['y1'], coord_set['y2']].min.to_i / 10, [coord_set['x1'], coord_set['x2']].min ] end - if ["guess", "spreadsheet", "original"].include?(req.params['extraction_method']) - extraction_method_requested = req.params['extraction_method'] + if ["guess", "spreadsheet", "original"].include?(request['extraction_method']) + extraction_method_requested = request['extraction_method'] else extraction_method_requested = "guess" end @@ -242,39 +247,37 @@ def has_key?(k) end end - case req.params['format'] + case request['format'] when 'csv' - res['Content-Type'] = 'text/csv' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.csv\"" + response['Content-Type'] = 'text/csv' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.csv\"" tables = CACHE[coords_method_key].flatten(1) tables.each do |table| - res.write table.to_csv + response.write table.to_csv end when 'tsv' - res['Content-Type'] = 'text/tab-separated-values' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.tsv\"" + response['Content-Type'] = 'text/tab-separated-values' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.tsv\"" tables = CACHE[coords_method_key].flatten(1) tables.each do |table| - res.write table.to_tsv + response.write table.to_tsv end when 'script' # Write shell script of tabula-extractor commands. $1 takes # the name of a file from the command line and passes it # to tabula-extractor so the script can be reused on similar pdfs. - res['Content-Type'] = 'application/x-sh' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.sh\"" + response['Content-Type'] = 'application/x-sh' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.sh\"" coords.each do |c| - res.write "tabula -a #{c['y1']},#{c['x1']},#{c['y2']},#{c['x2']} -p #{c['page']} \"$1\" \n" + response.write "tabula -a #{c['y1']},#{c['x1']},#{c['y2']},#{c['x2']} -p #{c['page']} \"$1\" \n" end when 'bbox' # Write json representation of bounding boxes and pages for # use in OCR and other back ends. - res['Content-Type'] = 'application/json' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.json\"" - res.write coords.to_json + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.json\"" + coords else - res['Content-Type'] = 'application/json' - res.write CACHE[coords_method_key].flatten(1).to_json + CACHE[coords_method_key].flatten(1) end end end diff --git a/webapp/views/layout.erb b/webapp/views/layout.erb index dc29b55a..4430a066 100644 --- a/webapp/views/layout.erb +++ b/webapp/views/layout.erb @@ -12,7 +12,7 @@ <% if $TABULA_VERSION.start_with?('rev') %>
DEV mode
<% end %> - <% if req.path != "/" %> + <% if request.path != "/" %>