Skip to content

Commit

Permalink
Handle image and line regions in output formats ALTO, hOCR and text
Browse files (browse the repository at this point in the history)
Tested-by: Merlijn Wajer <merlijn@archive.org>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Feb 10, 2022
1 parent 4b2553c commit 424b17f
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 6 deletions.
35 changes: 33 additions & 2 deletions src/api/altorenderer.cpp
Expand Up @@ -13,9 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "errcode.h" // for ASSERT_HOST
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf

#include <tesseract/baseapi.h>
#include <tesseract/renderer.h>
Expand Down Expand Up @@ -174,6 +176,36 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
continue;
}

int left, top, right, bottom;
auto block_type = res_it->BlockType();

switch (block_type) {
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE: {
// Handle all kinds of images.
// TODO: optionally add TYPE, for example TYPE="photo".
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
alto_str << "</Illustration>\n";
res_it->Next(RIL_BLOCK);
continue;
}
case PT_HORZ_LINE:
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
alto_str << "</GraphicalElement >\n";
res_it->Next(RIL_BLOCK);
continue;
case PT_NOISE:
tprintf("TODO: Please report image which triggers the noise case.\n");
ASSERT_HOST(false);
default:
break;
}

if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
Expand All @@ -200,7 +232,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

int left, top, right, bottom;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);

do {
Expand Down
16 changes: 16 additions & 0 deletions src/api/baseapi.cpp
Expand Up @@ -1371,6 +1371,22 @@ char *TessBaseAPI::GetUTF8Text() {
if (it->Empty(RIL_PARA)) {
continue;
}
auto block_type = it->BlockType();
switch (block_type) {
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE:
case PT_HORZ_LINE:
case PT_VERT_LINE:
// Ignore images and lines for text output.
continue;
case PT_NOISE:
tprintf("TODO: Please report image which triggers the noise case.\n");
ASSERT_HOST(false);
default:
break;
}

const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
text += para_text.get();
} while (it->Next(RIL_PARA));
Expand Down
41 changes: 37 additions & 4 deletions src/api/hocrrenderer.cpp
Expand Up @@ -189,6 +189,36 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {

std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
int left, top, right, bottom;
auto block_type = res_it->BlockType();
switch (block_type) {
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE: {
// Handle all kinds of images.
res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_'
<< bcnt++ << "' title=\"bbox " << left << " " << top << " "
<< right << " " << bottom << "\"></div>\n";
res_it->Next(RIL_BLOCK);
continue;
}
case PT_HORZ_LINE:
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_'
<< bcnt++ << "' title=\"bbox " << left << " " << top << " "
<< right << " " << bottom << "\"></div>\n";
res_it->Next(RIL_BLOCK);
continue;
case PT_NOISE:
tprintf("TODO: Please report image which triggers the noise case.\n");
ASSERT_HOST(false);
default:
break;
}

if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
Expand Down Expand Up @@ -218,7 +248,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str << "\n <span class='";
switch (res_it->BlockType()) {
switch (block_type) {
case PT_HEADING_TEXT:
hocr_str << "ocr_header";
break;
Expand All @@ -228,6 +258,11 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
case PT_CAPTION_TEXT:
hocr_str << "ocr_caption";
break;
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE:
ASSERT_HOST(false);
break;
default:
hocr_str << "ocr_line";
}
Expand All @@ -248,12 +283,10 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
hocr_str << "\n <span class='ocrx_word'"
<< " id='"
<< "word_" << page_id << "_" << wcnt << "'";
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
const char *font_name;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
font_name =
const char *font_name =
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
&serif, &smallcaps, &pointsize, &font_id);
hocr_str << " title='bbox " << left << " " << top << " " << right << " "
Expand Down

0 comments on commit 424b17f

Please sign in to comment.