tabulapdf · dan144 · Nov 26, 2016 · Nov 26, 2016 · Nov 26, 2016 · Nov 26, 2016
diff --git a/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar b/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar
diff --git a/lib/tabula_job_executor/jobs/detect_tables.rb b/lib/tabula_job_executor/jobs/detect_tables.rb
@@ -5,10 +5,8 @@
 class DetectTablesJob < Tabula::Background::Job
   include Observable
   def perform
-    filepath = options[:filepath]
+	filepath = options[:filepath]
     output_dir = options[:output_dir]
-
-
     page_areas_by_page = []
 
     begin
@@ -39,6 +37,6 @@ def perform
     end
 
     at(100, 100, "complete")
-    return nil
+	return nil
   end
-end
+end
diff --git a/lib/tabula_job_executor/jobs/string_search.rb b/lib/tabula_job_executor/jobs/string_search.rb
@@ -0,0 +1,35 @@
+require 'java'
+
+# Detects page areas with the Java StringSearch detector and persists them as JSON.
+class StringSearchJob
+  # Runs the detector on every page of <output_dir>/document.pdf, writes the
+  # detected areas to string.json and appends the bounds to string_list.json.
+  # @param output_dir [String] directory that contains document.pdf
+  # @param boundariesArray [Array] bounds handed to the detector; items 0..3 are recorded
+  # @return [Array] one entry per page, each a list of [left, top, width, height]
+  def performString(output_dir, boundariesArray)
+    page_areas_by_page = []
+    begin
+      extractor = Tabula::Extraction::ObjectExtractor.new(File.join(output_dir, 'document.pdf'), :all)
+      detector = Java::TechnologyTabulaDetectors::StringSearch.new
+      extractor.extract.each do |page|
+        areas = detector.detect(page, boundariesArray)
+        page_areas_by_page << areas.map { |rect| [rect.getLeft, rect.getTop, rect.getWidth, rect.getHeight] }
+      end
+    rescue Java::JavaLang::Exception => e
+      # Detection is best effort: report the cause and keep the pages already collected.
+      warn("String bounds detect failed. You may need to select tables manually.")
+      warn("#{e.class}: #{e.message}")
+    end
+
+    File.open(File.join(output_dir, 'string.json'), 'w') do |f|
+      f.puts page_areas_by_page.to_json
+    end
+
+    # join tolerates non-String entries, where a String#+ chain would raise TypeError.
+    File.open(File.join(output_dir, 'string_list.json'), 'a') do |f|
+      f.puts((0..3).map { |i| boundariesArray[i] }.join(','))
+    end
+    page_areas_by_page
+  end
+end
diff --git a/webapp/index.html b/webapp/index.html
@@ -104,16 +104,61 @@
             <span class="glyphicon glyphicon-paperclip"></span>
             <span class="clipboard-text">Copy to Clipboard</span>
           </button>
+
+		  <div>
+			  <label>Batch Process:</label>
+			  <select id="batch-selection" onchange="autodetectDisable()" <%=disableIfNoData%>>
+				<option value="coords">Coordinates</option>
+				<option value="string">String Search</option>
+			  </select>
+				<input style="margin-right: 10px" class="which_batch" id="batch-input-path" type="text" placeholder="Batch Input folder">
+				<input style="margin-right: 10px" class="which_batch" id="batch-output-path" type="text" placeholder="Batch Output folder">
+				<input id="overlap" type="number" placeholder="Autodetect Overlap" min=0 disabled>
+				<label for="ocr-ok">OCR</label>
+				<input id="ocr-ok" type="checkbox" value="1" onclick="ocrCheck()">
+				<button type="button" id="run-batch" class="btn btn-default"> <span class="glyphicon glyphicon-download"></span>
+				 Run Batch
+				</button>
+		  </div>
         </form>
       </script>
+	<script type="text/javascript">
+		function ocrCheck() {
+			// Ask for confirmation whenever the OCR box gets ticked; untick it on refusal.
+			var ocrBox = document.getElementById('ocr-ok');
+			if (!ocrBox.checked) {
+				return;
+			}
+			ocrBox.checked = confirm("Are you sure? OCR is imperfect and can create issues with data validity");
+		}
+	</script>
+	<script type="text/javascript">
+		function autodetectDisable(){
+			// Toggle the overlap field according to the selected batch mode.
+			var batchSelect = document.getElementById('batch-selection');
+			var chosenMode = batchSelect.options[batchSelect.selectedIndex].value;
+			var overlapInput = document.getElementById('overlap');
+			// Disabled for "coords" batches, enabled for every other selection.
+			overlapInput.disabled = (chosenMode == "coords");
+		}
+
+	</script>
 
       <script type="text/template" id="select-control-panel-template" >
         <span class="filename"><%= original_filename %></span>
 
         <a href="javascript:void(0)"><button id="restore-detected-tables" type="button" class="btn btn-default <%= restore_detected_tables %>" <%= disable_detected_tables %>><span class="glyphicon glyphicon-flash"></span><span class="glyphicon glyphicon-refresh"></span> Autodetect Tables</button></a>
         <a href="javascript:void(0)"><button type="button" id="clear-all-selections" class="btn btn-default" <%= disable_clear_all_selections %>><span class="glyphicon glyphicon-remove-circle"></span> Clear All Selections</button></a>
         <a href="javascript:void(0)"><button type="button" id="all-data" class="btn btn-success" <%= disable_download_all %>><span class="glyphicon glyphicon-eye-open"></span> Preview & Export Extracted Data</button></a>
-      </script>
+        <div>
+          <span style="margin-right: 10px" class="string_title">String Search</span>
+            <input style="margin-right: 10px" class="stringI" id="top-left-string" type="text" placeholder="Upper Left Bound">
+            <input style="margin-right: 10px" class="stringI" id="top-right-string" type="text" placeholder="Upper Right Bound">
+            <input style="margin-right: 10px" class="stringI" id="bottom-left-string" type="text" placeholder="Lower Left Bound">
+            <input style="margin-right: 10px" class="stringI" id="bottom-right-string" type="text" placeholder="Lower Right Bound">
+		       <a href="javascript:void(0)"><button type="button" id="set-string" class="btn btn-success" <%= set_string %>><span class="glyphicon glyphicon-eye-open"></span> Search Strings</button></a>
+        </div>
+       </script>
 
       <script type="text/template" id="export-page-sidebar-template">
           <h4>Is the extracted data incorrect?</h4>
@@ -341,7 +386,7 @@ <h1>About Tabula</h1>
 
             <p>Tabula was created by journalists for journalists and anyone else working with data locked away in PDFs. Tabula will always be free and open source.</p>
             <p>If you’ve ever tried to do anything with data provided to you in PDFs, you know how painful it is — there's no easy way to copy-and-paste rows of data out of PDF files. Tabula allows you to extract that data into a CSV or Microsoft Excel spreadsheet using a simple, easy-to-use interface. Tabula works on Mac, Windows and Linux.</p>
-            <p>Caveat: Tabula only works on text-based PDFs, not scanned documents. If you can click-and-drag to select text in your table in a PDF viewer (even if the output is unorganized trash), then your PDF is text-based and Tabula should work.</p>
+            <p>Caveat: Tabula only works directly on text-based PDFs, not scanned documents. If Tabula receives an image-based PDF as input, it will first alert you and then, if you wish, attempt to convert the PDF to text using Tesseract, Google's open-source OCR software. Please note that OCR is not 100% accurate, and we strongly recommend that you verify the output data.</p>
             <p><strong>Security Concerns?</strong> Tabula is designed with security in mind. Your PDF and the extracted data never touch the net -- when you use Tabula, as long as your browser's URL bar says "localhost" or "127.0.0.1", all processing takes place on your local machine. Tabula does download a list of Tabula versions from our server to alert you if Tabula has been updated (and we use hits to that list to count how often Tabula is being used); it also downloads a few badges and assets from the web.</p>
 
             <h2>Who Uses Tabula?</h2>
@@ -360,17 +405,17 @@ <h2>Credits</h2>
           <div class="jumbotron help">
           <h3 name="howto">How to Use Tabula</h3>
           <ol>
-            <li>Upload a PDF file containing a data table.</li>
+            <li>Upload a PDF file or group of PDF files that contain data tables.</li>
             <li>Select the table by clicking the top left corner of a table and dragging the mouse to the bottom right corner, until all of the data is included in the shaded selection area.</li>
             <li>A window will then appear containing your data. Inspect the data to make sure it looks correct. If data is missing, you may have to slightly expand your selection.</li>
             <li>Click the Download button.</li>
             <li>Now you can work with your data as text file or a spreadsheet rather than a PDF! <br>
             (You can open the downloaded file in Microsoft Excel or the free <a href="http://www.libreoffice.org/discover/calc/">LibreOffice Calc</a>)<p></p></li>
           </ol>
-          <p>Note: Tabula only works on text-based PDFs, not scanned documents.</p>
+          <p>Note: Tabula works best with text-based documents but has the functionality to work with image-based documents as well.</p>
           <h3 name="trouble">Having trouble with Tabula?</h3>
           <ol>
-          <li><strong>Tabula said "Sorry, your PDF file is image-based" -- what does that mean?</strong> Your PDF does not have any embedded text. It might have been scanned from paper. Tabula is not able to extract any data from image-based PDFs. You can try OCRing the PDF with a tool like Adobe Acrobat Pro (paid), Tesseract, <a href="http://www.tobias-elze.de/pdfsandwich/">PDFSandwich</a> (Mac/Linux, free) or <a href="https://code.google.com/p/lime-ocr/">Lime OCR</a> (Windows, free) and then trying Tabula again.</li>
+          <li><strong>Tabula alerted me that my PDF is image-based and that my data could be compromised during the conversion process -- what does that mean?</strong> Your PDF does not have any embedded text. It might have been scanned from paper. Whatever the reason, Tabula is able to extract data from image-based PDFs by OCRing the PDF with <a href="https://github.com/tesseract-ocr">Tesseract</a>. However, you must verify that your data is still correct after the OCR process is completed. You may also follow the link to Tesseract and convert your PDF manually, which would allow you to adjust conversion options to be more or less conservative, functionality that Tabula does not currently support.</li>
           <li><strong>Some columns of my table are combined. What can I do?</strong> Tabula sometimes uses "streams" of whitespace to recreate your table's structure. If headers span multiple columns, they're probably causing a problem. Try excluding them from your selection (or selecting them separately). </li>
           <li><strong>Some columns of my table are combined. And the headers aren't the problem! What <em>else</em> can I do?</strong> Tabula has two extraction methods. It tries to guess which one is right for document, but it's wrong sometimes. Try selecting the other (of "stream" and "lattice"), on the left in extraction mode, to see if that fixes the problem.</li>
           <li><strong>Tabula's taking too long!</strong> Sorry! Tabula has to do a lot of weird math to reconstruct your table. Tabula's command-line counterpart, <a href="https://www.github.com/tabulapdf/tabula-extractor">tabula-extractor</a> is faster, but a little harder to use. You might give it a try.</li>

diff --git a/webapp/static/css/styles.css b/webapp/static/css/styles.css
@@ -175,7 +175,9 @@ input[type="radio"] {
 
 input[type="number"]::-webkit-inner-spin-button,
 input[type="number"]::-webkit-outer-spin-button {
-  height: auto;
+  -webkit-box-sizing: border-box;
+  box-sizing: border-box;
+  -moz-box-sizing: border-box;
 }
 
 input[type="search"] {

diff --git a/webapp/static/css/styles.scss b/webapp/static/css/styles.scss
@@ -41,7 +41,7 @@ body {
 .navbar-default a.navbar-brand {
 	padding-left: 30px;
 	position: relative;
-	font-weight: bold; 
+	font-weight: bold;
 	color: black;
 	&::before {
 		position: absolute;
@@ -55,7 +55,7 @@ body {
 		left: 0;
 		top: 10px;
 	}
-}	
+}
 
 .btn-file {
     position: relative;
@@ -74,7 +74,7 @@ body {
     outline: none;
     background: white;
     cursor: inherit;
-    display: block; 
+    display: block;
 }
 
 .form-inline .input-group > .form-control {
@@ -91,12 +91,12 @@ form {
 		text-align: center;
 		cursor: pointer;
 		position: relative;
-		border-right: 1px solid $table-border-color; 
+		border-right: 1px solid $table-border-color;
 		padding-right: 20px;
 		&::before, &::after {
 			text-align: right;
 			width: 11px;
-			color: $table-border-color; 
+			color: $table-border-color;
 		  position: absolute;
 		  display: block;
 		  font-family: 'Glyphicons Halflings';
@@ -150,7 +150,7 @@ form {
 			}
 		}
 	}
-}	
+}
 
 .glyphicon-remove {
 	color: $gray-light;
@@ -166,7 +166,7 @@ form {
 	height: calc(100% - 51px);
 	text-align: center;
 	overflow: scroll;
-	position: fixed;  
+	position: fixed;
 	left: 0;
 	top: 51px;
 	z-index: 2;
@@ -184,7 +184,7 @@ form {
 		font-size: .9em;
 	}
 	div.page  {
-		margin: 0 auto 1.25em;
+		margin: 0 auto 1.75em;
 		display: block;
 		width: 90%;
 		padding: 1em .5em .5em 1em;
@@ -203,15 +203,15 @@ form {
 				cursor: pointer;
 				@include box-shadow ( 0 0 0 .1em rgba(0,0,0,0.1) );
 			}
-		}	
+		}
 		&.active {
 			img {
 				@include box-shadow ( 0 0 0 .1em $brand-info );
 			}
 		}
 		.remove {
 			position: absolute;
-			left: -.25em; 
+			left: -.25em;
 			top: 1em;
 			height: 1em;
 			display: block;
@@ -250,7 +250,7 @@ form {
 #main-pane {
 	background: $gray-light;
 	padding: 4.25em 2em 2em 2em;
-	left: 180px; 
+	left: 180px;
 	height: 100%;
 	width: calc(100% - 200px);
 	position: relative;
@@ -277,7 +277,7 @@ form {
 				cursor: move;
 				.selection-panel {
 					display: block;
-				}	
+				}
 			}
 		}
 		img {
@@ -287,7 +287,7 @@ form {
 			&:hover {
 				cursor: crosshair;
 			}
-		}		
+		}
 	}
 }
 

diff --git a/webapp/static/js/library.js b/webapp/static/js/library.js
@@ -12,7 +12,7 @@ Tabula.FileUpload = Backbone.Model.extend({
     });
   },
 
-  checkStatus: function() {
+  checkStatus: function(original_filename) {
     if(typeof this.get('file_id') == 'undefined' && typeof !this.get('upload_id') == 'undefined'){
       this.pct_complete = 1;
       this.message = "waiting to be processed..."
@@ -32,18 +32,51 @@ Tabula.FileUpload = Backbone.Model.extend({
 
             if (data.status == "error" && data.error_type == "unknown") {
                 // window.location.reload(true);
-            } else if (data.status == "error" && data.error_type == "no-text") {
+            } else if (data.status == "warning" && data.error_type == "no-text") {
                 console.log('no text');
                 window.clearTimeout(this.timer);
 
-                // resets upload/input form
-                $('form#upload').find('button').removeAttr('disabled');
-                $('form#upload')[0].reset();
-
                 //TODO: something prettier.
-                alert("Sorry, your PDF file is image-based; it does not have any embedded text. It might have been scanned from paper... Tabula isn't able to extract any data from image-based PDFs. Click the Help button for more information.");
-            } else if(data.pct_complete < 100) {
-                this.timer = setTimeout(_.bind(this.checkStatus, this), 1000);
+
+				var message = "Sorry, your PDF file is image-based; it does not have any embedded text. Tabula can convert this using OCR, however data should be verified personally after extraction. Click OK to continue with OCR.";
+                var yesOCR = window.confirm(message);
+				if(yesOCR == true){
+					// ajax call to run OCR
+					ocr_data = {
+						'file_path': this.get('file_id'),
+						'file_name': original_filename
+					}
+					this.message = "Performing OCR"
+					$.ajax({
+						type: 'GET',
+						url: '/ocr',
+						data: ocr_data,
+						success: _.bind(function(data) {
+							data = JSON.parse(data);
+							console.log(data.message);
+							if(data.message == "Success"){
+								this.set('upload_id', data.batch_id);
+								this.timer = setTimeout(_.bind(this.checkStatus, this), 1000, original_filename);
+							}else{
+								// resets upload/input form
+								window.clearTimeout(this.timer);
+								window.alert("OCR Conversion failed");
+								$('form#upload').find('button').removeAttr('disabled');
+								$('form#upload')[0].reset();
+							}
+						}, this),
+						error: function(xhr, status, err) {
+							console.log('OCR convertion error: ', err);
+						}
+					});
+				}else{
+					// resets upload/input form
+					window.clearTimeout(this.timer);
+					$('form#upload').find('button').removeAttr('disabled');
+					$('form#upload')[0].reset();
+				}
+			} else if(data.pct_complete < 100) {
+                this.timer = setTimeout(_.bind(this.checkStatus, this), 1000, original_filename);
             } else {
               this.collection.remove(this);
               Tabula.library.files_collection.fetch();
@@ -212,7 +245,7 @@ Tabula.Library = Backbone.View.extend({
                   file_upload.set('id', status.file_id);
                   file_upload.set('upload_id', status.upload_id);
                   file_upload.set('error', !status.success);
-                  file_upload.checkStatus(); //
+                  file_upload.checkStatus(status.filename); //
                 }else{
                   console.log('TODO: failure')
                   file_upload.set('file_id', status.file_id);