Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
baseapi.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith
5  * Created: Fri Oct 06 15:35:01 PDT 2006
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "allheaders.h"
26 
27 #ifdef USING_GETTEXT
28 #include <libintl.h>
29 #include <locale.h>
30 #define _(x) gettext(x)
31 #else
32 #define _(x) (x)
33 #endif
34 
35 #include "baseapi.h"
36 
37 #include "resultiterator.h"
38 #include "mutableiterator.h"
39 #include "thresholder.h"
40 #include "tesseractclass.h"
41 #include "pageres.h"
42 #include "paragraphs.h"
43 #include "tessvars.h"
44 #include "control.h"
45 #include "pgedit.h"
46 #include "paramsd.h"
47 #include "output.h"
48 #include "globals.h"
49 #include "edgblob.h"
50 #include "equationdetect.h"
51 #include "tessbox.h"
52 #include "imgs.h"
53 #include "imgtiff.h"
54 #include "makerow.h"
55 #include "permute.h"
56 #include "otsuthr.h"
57 #include "osdetect.h"
58 #include "params.h"
59 #include "strngs.h"
60 
61 #ifdef _WIN32
62 #include <windows.h>
63 #include <stdlib.h>
64 #else
65 #include <glob.h>
66 #include <libgen.h>
67 #include <string.h>
68 #endif
69 
70 #if defined(_WIN32) && !defined(VERSION)
71 #include "version.h"
72 #endif
73 
74 namespace tesseract {
75 
// File-level constants used by the TessBaseAPI implementation below.
//
// Smallest width and height that TesseractRect() will accept; anything
// smaller is rejected as "nothing worth doing".
77 const int kMinRectSize = 10;
// Character substituted for spaces (recognition failures) when writing box
// text, and treated as a reject marker when producing UNLV text.
79 const char kTesseractReject = '~';
// Character written to UNLV output for a rejected character or word.
81 const char kUNLVReject = '~';
// Character written to UNLV output in front of a character whose reject_map
// entry is marked rejected.
83 const char kUNLVSuspect = '^';
// NOTE(review): not referenced in the visible part of this listing;
// presumably a default input file name - confirm against callers.
88 const char* kInputFile = "noname.tif";
// File that ProcessPage() dumps the current variables into before it switches
// to the retry configuration.
90 const char* kOldVarsFile = "failed_vars.txt";
// Size of the character buffer used to format a page number as text.
92 const int kMaxIntSize = 22;
// NOTE(review): the two resolution limits are not referenced in the visible
// part of this listing; presumably they bound acceptable image resolution -
// confirm units and use.
97 const int kMinCredibleResolution = 70;
99 const int kMaxCredibleResolution = 2400;
100 
102  : tesseract_(NULL),
103  osd_tesseract_(NULL),
104  equ_detect_(NULL),
105  // Thresholder is initialized to NULL here, but will be set before use by:
106  // A constructor of a derived API, SetThresholder(), or
107  // created implicitly when used in InternalSetImage.
108  thresholder_(NULL),
109  paragraph_models_(NULL),
110  block_list_(NULL),
111  page_res_(NULL),
112  input_file_(NULL),
113  output_file_(NULL),
114  datapath_(NULL),
115  language_(NULL),
116  last_oem_requested_(OEM_DEFAULT),
117  recognition_done_(false),
118  truth_cb_(NULL),
119  rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
120  image_width_(0), image_height_(0) {
121 }
122 
124  End();
125 }
126 
130 const char* TessBaseAPI::Version() {
131  return VERSION;
132 }
133 
138 void TessBaseAPI::SetInputName(const char* name) {
139  if (input_file_ == NULL)
140  input_file_ = new STRING(name);
141  else
142  *input_file_ = name;
143 }
144 
146 void TessBaseAPI::SetOutputName(const char* name) {
147  if (output_file_ == NULL)
148  output_file_ = new STRING(name);
149  else
150  *output_file_ = name;
151 }
152 
153 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
154  if (tesseract_ == NULL) tesseract_ = new Tesseract;
156  tesseract_->params());
157 }
158 
159 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
160  if (tesseract_ == NULL) tesseract_ = new Tesseract;
162  tesseract_->params());
163 }
164 
165 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
166  IntParam *p = ParamUtils::FindParam<IntParam>(
168  if (p == NULL) return false;
169  *value = (inT32)(*p);
170  return true;
171 }
172 
173 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
174  BoolParam *p = ParamUtils::FindParam<BoolParam>(
176  if (p == NULL) return false;
177  *value = (BOOL8)(*p);
178  return true;
179 }
180 
181 const char *TessBaseAPI::GetStringVariable(const char *name) const {
182  StringParam *p = ParamUtils::FindParam<StringParam>(
184  return (p != NULL) ? p->string() : NULL;
185 }
186 
187 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
188  DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
190  if (p == NULL) return false;
191  *value = (double)(*p);
192  return true;
193 }
194 
196 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
197  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
198 }
199 
201 void TessBaseAPI::PrintVariables(FILE *fp) const {
203 }
204 
213 int TessBaseAPI::Init(const char* datapath, const char* language,
214  OcrEngineMode oem, char **configs, int configs_size,
215  const GenericVector<STRING> *vars_vec,
216  const GenericVector<STRING> *vars_values,
217  bool set_only_non_debug_params) {
218  // Default language is "eng".
219  if (language == NULL) language = "eng";
220  // If the datapath, OcrEngineMode or the language have changed - start again.
221  // Note that the language_ field stores the last requested language that was
222  // initialized successfully, while tesseract_->lang stores the language
223  // actually used. They differ only if the requested language was NULL, in
224  // which case tesseract_->lang is set to the Tesseract default ("eng").
225  if (tesseract_ != NULL &&
226  (datapath_ == NULL || language_ == NULL ||
227  *datapath_ != datapath || last_oem_requested_ != oem ||
228  (*language_ != language && tesseract_->lang != language))) {
229  delete tesseract_;
230  tesseract_ = NULL;
231  }
232 
233  bool reset_classifier = true;
234  if (tesseract_ == NULL) {
235  reset_classifier = false;
236  tesseract_ = new Tesseract;
238  datapath, output_file_ != NULL ? output_file_->string() : NULL,
239  language, oem, configs, configs_size, vars_vec, vars_values,
240  set_only_non_debug_params) != 0) {
241  return -1;
242  }
243  }
244  // Update datapath and language requested for the last valid initialization.
245  if (datapath_ == NULL)
246  datapath_ = new STRING(datapath);
247  else
248  *datapath_ = datapath;
249  if (language_ == NULL)
250  language_ = new STRING(language);
251  else
252  *language_ = language;
254 
255  // For same language and datapath, just reset the adaptive classifier.
256  if (reset_classifier) tesseract_->ResetAdaptiveClassifier();
257 
258  return 0;
259 }
260 
270  return (language_ == NULL || language_->string() == NULL) ?
271  "" : language_->string();
272 }
273 
280  GenericVector<STRING>* langs) const {
281  langs->clear();
282  if (tesseract_ != NULL) {
283  langs->push_back(tesseract_->lang);
284  int num_subs = tesseract_->num_sub_langs();
285  for (int i = 0; i < num_subs; ++i)
286  langs->push_back(tesseract_->get_sub_lang(i)->lang);
287  }
288 }
289 
294  GenericVector<STRING>* langs) const {
295  langs->clear();
296  if (tesseract_ != NULL) {
297  STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
298 #ifdef _WIN32
299  char fname[_MAX_FNAME];
300  WIN32_FIND_DATA data;
301  BOOL result = TRUE;
302  HANDLE handle = FindFirstFile(pattern.string(), &data);
303  if (handle != INVALID_HANDLE_VALUE) {
304  for (; result; result = FindNextFile(handle, &data)) {
305  _splitpath(data.cFileName, NULL, NULL, fname, NULL);
306  langs->push_back(STRING(fname));
307  }
308  FindClose(handle);
309  }
310 #else
311  glob_t pglob;
312  char **paths;
313  char *path, *dot;
314  if (glob(pattern.string(), 0, NULL, &pglob) == 0) {
315  for (paths = pglob.gl_pathv; *paths != NULL; paths++) {
316  path = basename(*paths);
317  if ((dot = strchr(path, '.'))) {
318  *dot = '\0';
319  langs->push_back(STRING(path));
320  }
321  }
322  globfree(&pglob);
323  }
324 #endif
325  }
326 }
327 
334 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
335  if (tesseract_ == NULL)
336  tesseract_ = new Tesseract;
337  return tesseract_->init_tesseract_lm(datapath, NULL, language);
338 }
339 
345  if (tesseract_ == NULL) {
346  tesseract_ = new Tesseract;
348  }
349 }
350 
358 }
359 
363 }
364 
371  if (tesseract_ == NULL)
372  tesseract_ = new Tesseract;
373  tesseract_->tessedit_pageseg_mode.set_value(mode);
374 }
375 
378  if (tesseract_ == NULL)
379  return PSM_SINGLE_BLOCK;
380  return static_cast<PageSegMode>(
381  static_cast<int>(tesseract_->tessedit_pageseg_mode));
382 }
383 
397 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
398  int bytes_per_pixel,
399  int bytes_per_line,
400  int left, int top,
401  int width, int height) {
402  if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
403  return NULL; // Nothing worth doing.
404 
405  // Since this original api didn't give the exact size of the image,
406  // we have to invent a reasonable value.
407  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
408  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
409  bytes_per_pixel, bytes_per_line);
410  SetRectangle(left, top, width, height);
411 
412  return GetUTF8Text();
413 }
414 
420  if (tesseract_ == NULL)
421  return;
424 }
425 
435 void TessBaseAPI::SetImage(const unsigned char* imagedata,
436  int width, int height,
437  int bytes_per_pixel, int bytes_per_line) {
438  if (InternalSetImage())
439  thresholder_->SetImage(imagedata, width, height,
440  bytes_per_pixel, bytes_per_line);
441 }
442 
444  if (thresholder_)
446  else
447  tprintf("Please call SetImage before SetSourceResolution.\n");
448 }
449 
460 void TessBaseAPI::SetImage(const Pix* pix) {
461  if (InternalSetImage())
462  thresholder_->SetImage(pix);
463 }
464 
470 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
471  if (thresholder_ == NULL)
472  return;
473  thresholder_->SetRectangle(left, top, width, height);
474  ClearResults();
475 }
476 
482  if (tesseract_ == NULL)
483  return NULL;
484  if (tesseract_->pix_binary() == NULL)
486  return pixClone(tesseract_->pix_binary());
487 }
488 
494 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
495  return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
496 }
497 
504 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
505  return GetComponentImages(RIL_TEXTLINE, true, pixa, blockids);
506 }
507 
516 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
517  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
518 }
519 
525 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
526  return GetComponentImages(RIL_WORD, true, pixa, NULL);
527 }
528 
536  return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
537 }
538 
548  bool text_only,
549  Pixa** pixa, int** blockids) {
550  PageIterator* page_it = GetIterator();
551  if (page_it == NULL)
552  page_it = AnalyseLayout();
553  if (page_it == NULL)
554  return NULL; // Failed.
555 
556  // Count the components to get a size for the arrays.
557  int component_count = 0;
558  int left, top, right, bottom;
559  do {
560  if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
561  (!text_only || PTIsTextType(page_it->BlockType())))
562  ++component_count;
563  } while (page_it->Next(level));
564 
565  Boxa* boxa = boxaCreate(component_count);
566  if (pixa != NULL)
567  *pixa = pixaCreate(component_count);
568  if (blockids != NULL)
569  *blockids = new int[component_count];
570 
571  int blockid = 0;
572  int component_index = 0;
573  page_it->Begin();
574  do {
575  if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
576  (!text_only || PTIsTextType(page_it->BlockType()))) {
577  Box* lbox = boxCreate(left, top, right - left, bottom - top);
578  boxaAddBox(boxa, lbox, L_INSERT);
579  if (pixa != NULL) {
580  Pix* pix = page_it->GetBinaryImage(level);
581  pixaAddPix(*pixa, pix, L_INSERT);
582  pixaAddBox(*pixa, lbox, L_CLONE);
583  }
584  if (blockids != NULL) {
585  (*blockids)[component_index] = blockid;
586  if (page_it->IsAtFinalElement(RIL_BLOCK, level))
587  ++blockid;
588  }
589  ++component_index;
590  }
591  } while (page_it->Next(level));
592  delete page_it;
593  return boxa;
594 }
595 
597  if (thresholder_ == NULL) {
598  return 0;
599  }
600  return thresholder_->GetScaleFactor();
601 }
602 
604 void TessBaseAPI::DumpPGM(const char* filename) {
605  if (tesseract_ == NULL)
606  return;
607  FILE *fp = fopen(filename, "wb");
608  Pix* pix = tesseract_->pix_binary();
609  int width = pixGetWidth(pix);
610  int height = pixGetHeight(pix);
611  l_uint32* data = pixGetData(pix);
612  fprintf(fp, "P5 %d %d 255\n", width, height);
613  for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
614  for (int x = 0; x < width; ++x) {
615  uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255;
616  fwrite(&b, 1, 1, fp);
617  }
618  }
619  fclose(fp);
620 }
621 
628 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
629  Boxa* boxa_words, Pixa* pixa_words,
630  const FCOORD& reskew, Pix* page_pix,
631  PAGE_RES* page_res) {
632  int block_count = boxaGetCount(boxa_blocks);
633  ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
634  // Write each block to the current directory as junk_write_display.nnn.png.
635  for (int i = 0; i < block_count; ++i) {
636  Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
637  pixDisplayWrite(pix, 1);
638  }
639  int word_count = boxaGetCount(boxa_words);
640  ASSERT_HOST(word_count == pixaGetCount(pixa_words));
641  int pr_word = 0;
642  PAGE_RES_IT page_res_it(page_res);
643  for (page_res_it.restart_page(); page_res_it.word () != NULL;
644  page_res_it.forward(), ++pr_word) {
645  WERD_RES *word = page_res_it.word();
646  WERD_CHOICE* choice = word->best_choice;
647  // Write the first 100 words to files names wordims/<wordstring>.tif.
648  if (pr_word < 100) {
649  STRING filename("wordims/");
650  if (choice != NULL) {
651  filename += choice->unichar_string();
652  } else {
653  char numbuf[32];
654  filename += "unclassified";
655  snprintf(numbuf, 32, "%03d", pr_word);
656  filename += numbuf;
657  }
658  filename += ".tif";
659  Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
660  pixWrite(filename.string(), pix, IFF_TIFF_G4);
661  }
662  }
663  ASSERT_HOST(pr_word == word_count);
664  return 0;
665 }
666 
679  if (FindLines() == 0) {
680  if (block_list_->empty())
681  return NULL; // The page was empty.
683  DetectParagraphs(false);
684  return new PageIterator(
688  }
689  return NULL;
690 }
691 
697  if (tesseract_ == NULL)
698  return -1;
699  if (FindLines() != 0)
700  return -1;
701  if (page_res_ != NULL)
702  delete page_res_;
703  if (block_list_->empty()) {
705  return 0; // Empty page.
706  }
707 
709  recognition_done_ = true;
714  else
718  return 0;
719  }
720 
721  if (truth_cb_ != NULL) {
722  tesseract_->wordrec_run_blamer.set_value(true);
725  }
726 
727  int result = 0;
729  #ifndef GRAPHICS_DISABLED
731  #endif // GRAPHICS_DISABLED
732  // The page_res is invalid after an interactive session, so cleanup
733  // in a way that lets us continue to the next page without crashing.
734  delete page_res_;
735  page_res_ = NULL;
736  return -1;
739  } else if (tesseract_->tessedit_ambigs_training) {
740  FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
741  // OCR the page segmented into words by tesseract.
743  *input_file_, page_res_, monitor, training_output_file);
744  fclose(training_output_file);
745  } else {
746  // Now run the main recognition.
747  if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
748  DetectParagraphs(true);
749  } else {
750  result = -1;
751  }
752  }
753  return result;
754 }
755 
758  if (tesseract_ == NULL)
759  return -1;
760  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
761  tprintf("Please call SetImage before attempting recognition.");
762  return -1;
763  }
764  if (page_res_ != NULL)
765  ClearResults();
766  if (FindLines() != 0)
767  return -1;
768  // Additional conditions under which chopper test cannot be run
769  if (tesseract_->interactive_display_mode) return -1;
770 
771  recognition_done_ = true;
772 
774 
775  PAGE_RES_IT page_res_it(page_res_);
776 
777  while (page_res_it.word() != NULL) {
778  WERD_RES *word_res = page_res_it.word();
779  GenericVector<TBOX> boxes;
780  tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
781  page_res_it.row()->row, word_res);
782  page_res_it.forward();
783  }
784  return 0;
785 }
786 
804  const char* retry_config, int timeout_millisec,
805  STRING* text_out) {
806  int page = tesseract_->tessedit_page_number;
807  if (page < 0)
808  page = 0;
809  FILE* fp = fopen(filename, "rb");
810  if (fp == NULL) {
811  tprintf(_("Image file %s cannot be opened!\n"), filename);
812  return false;
813  }
814  // Find the number of pages if a tiff file, or zero otherwise.
815  int npages = CountTiffPages(fp);
816  fclose(fp);
817 
819  *text_out =
820  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
821  "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
822  " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
823  "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
824  "lang=\"en\">\n <head>\n <title></title>\n"
825  " <meta http-equiv=\"Content-Type\" content=\"text/html; "
826  "charset=utf-8\" />\n"
827  " <meta name='ocr-system' content='tesseract " VERSION "' />\n"
828  " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
829  " ocr_line ocrx_word'/>\n"
830  " </head>\n <body>\n";
831  } else {
832  *text_out = "";
833  }
834 
835  bool success = true;
836  Pix *pix;
837  if (npages > 0) {
838  for (; page < npages && (pix = pixReadTiff(filename, page)) != NULL;
839  ++page) {
840  if ((page >= 0) && (npages > 1))
841  tprintf(_("Page %d of %d\n"), page + 1, npages);
842  char page_str[kMaxIntSize];
843  snprintf(page_str, kMaxIntSize - 1, "%d", page);
844  SetVariable("applybox_page", page_str);
845  success &= ProcessPage(pix, page, filename, retry_config,
846  timeout_millisec, text_out);
847  pixDestroy(&pix);
848  if (tesseract_->tessedit_page_number >= 0 || npages == 1) {
849  break;
850  }
851  }
852  } else {
853  // The file is not a tiff file, so use the general pixRead function.
854  pix = pixRead(filename);
855  if (pix != NULL) {
856  success &= ProcessPage(pix, 0, filename, retry_config,
857  timeout_millisec, text_out);
858  pixDestroy(&pix);
859  } else {
860  // The file is not an image file, so try it as a list of filenames.
861  FILE* fimg = fopen(filename, "rb");
862  if (fimg == NULL) {
863  tprintf(_("File %s cannot be opened!\n"), filename);
864  return false;
865  }
866  tprintf(_("Reading %s as a list of filenames...\n"), filename);
867  char pagename[MAX_PATH];
868  // Skip to the requested page number.
869  for (int i = 0; i < page &&
870  fgets(pagename, sizeof(pagename), fimg) != NULL;
871  ++i);
872  while (fgets(pagename, sizeof(pagename), fimg) != NULL) {
873  chomp_string(pagename);
874  pix = pixRead(pagename);
875  if (pix == NULL) {
876  tprintf(_("Image file %s cannot be read!\n"), pagename);
877  fclose(fimg);
878  return false;
879  }
880  tprintf(_("Page %d : %s\n"), page, pagename);
881  success &= ProcessPage(pix, page, pagename, retry_config,
882  timeout_millisec, text_out);
883  pixDestroy(&pix);
884  ++page;
885  }
886  fclose(fimg);
887  }
888  }
890  *text_out += " </body>\n</html>\n";
891  return success;
892 }
893 
905 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
906  const char* retry_config, int timeout_millisec,
907  STRING* text_out) {
908  SetInputName(filename);
909  SetImage(pix);
910  bool failed = false;
911  if (timeout_millisec > 0) {
912  // Running with a timeout.
913  ETEXT_DESC monitor;
914  monitor.cancel = NULL;
915  monitor.cancel_this = NULL;
916  monitor.set_deadline_msecs(timeout_millisec);
917  // Now run the main recognition.
918  failed = Recognize(&monitor) < 0;
921  // Disabled character recognition.
922  PageIterator* it = AnalyseLayout();
923  if (it == NULL) {
924  failed = true;
925  } else {
926  delete it;
927  return true;
928  }
929  } else {
930  // Normal layout and character recognition with no timeout.
931  failed = Recognize(NULL) < 0;
932  }
934  Pix* page_pix = GetThresholdedImage();
935  pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
936  }
937  if (failed && retry_config != NULL && retry_config[0] != '\0') {
938  // Save current config variables before switching modes.
939  FILE* fp = fopen(kOldVarsFile, "wb");
940  PrintVariables(fp);
941  fclose(fp);
942  // Switch to alternate mode for retry.
943  ReadConfigFile(retry_config);
944  SetImage(pix);
945  Recognize(NULL);
946  // Restore saved config variables.
948  }
949  // Get text only if successful.
950  if (!failed) {
951  char* text;
954  text = GetBoxText(page_index);
955  } else if (tesseract_->tessedit_write_unlv) {
956  text = GetUNLVText();
957  } else if (tesseract_->tessedit_create_hocr) {
958  text = GetHOCRText(page_index);
959  } else {
960  text = GetUTF8Text();
961  }
962  *text_out += text;
963  delete [] text;
964  return true;
965  }
966  return false;
967 }
968 
974  if (tesseract_ == NULL || page_res_ == NULL)
975  return NULL;
976  return new LTRResultIterator(
980 }
981 
991  if (tesseract_ == NULL || page_res_ == NULL)
992  return NULL;
997 }
998 
1008  if (tesseract_ == NULL || page_res_ == NULL)
1009  return NULL;
1010  return new MutableIterator(page_res_, tesseract_,
1014 }
1015 
1018  if (tesseract_ == NULL ||
1019  (!recognition_done_ && Recognize(NULL) < 0))
1020  return NULL;
1021  STRING text("");
1022  ResultIterator *it = GetIterator();
1023  do {
1024  if (it->Empty(RIL_PARA)) continue;
1025  char *para_text = it->GetUTF8Text(RIL_PARA);
1026  text += para_text;
1027  delete []para_text;
1028  } while (it->Next(RIL_PARA));
1029  char* result = new char[text.length() + 1];
1030  strncpy(result, text.string(), text.length() + 1);
1031  delete it;
1032  return result;
1033 }
1034 
1035 static void AddBoxTohOCR(const PageIterator *it,
1036  PageIteratorLevel level,
1037  STRING* hocr_str) {
1038  int left, top, right, bottom;
1039  it->BoundingBox(level, &left, &top, &right, &bottom);
1040  hocr_str->add_str_int("' title=\"bbox ", left);
1041  hocr_str->add_str_int(" ", top);
1042  hocr_str->add_str_int(" ", right);
1043  hocr_str->add_str_int(" ", bottom);
1044  *hocr_str += "\">";
1045 }
1046 
1055 char* TessBaseAPI::GetHOCRText(int page_number) {
1056  if (tesseract_ == NULL ||
1057  (page_res_ == NULL && Recognize(NULL) < 0))
1058  return NULL;
1059 
1060  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1061  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1062 
1063  STRING hocr_str("");
1064 
1065  if (input_file_ == NULL)
1066  SetInputName(NULL);
1067 
1068  hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
1069  hocr_str += "' title='image \"";
1070  hocr_str += input_file_ ? *input_file_ : "unknown";
1071  hocr_str.add_str_int("\"; bbox ", rect_left_);
1072  hocr_str.add_str_int(" ", rect_top_);
1073  hocr_str.add_str_int(" ", rect_width_);
1074  hocr_str.add_str_int(" ", rect_height_);
1075  hocr_str.add_str_int("; ppageno ", page_number);
1076  hocr_str += "'>\n";
1077 
1078  ResultIterator *res_it = GetIterator();
1079  while (!res_it->Empty(RIL_BLOCK)) {
1080  if (res_it->Empty(RIL_WORD)) {
1081  res_it->Next(RIL_WORD);
1082  continue;
1083  }
1084 
1085  // Open any new block/paragraph/textline.
1086  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1087  hocr_str.add_str_int(" <div class='ocr_carea' id='block_", bcnt);
1088  hocr_str.add_str_int("_", bcnt);
1089  AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1090  }
1091  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1092  if (res_it->ParagraphIsLtr()) {
1093  hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_", pcnt);
1094  } else {
1095  hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_", pcnt);
1096  }
1097  AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1098  }
1099  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1100  hocr_str.add_str_int("\n <span class='ocr_line' id='line_", lcnt);
1101  AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1102  }
1103 
1104  // Now, process the word...
1105  hocr_str.add_str_int("<span class='ocrx_word' id='word_", wcnt);
1106  AddBoxTohOCR(res_it, RIL_WORD, &hocr_str);
1107  const char *font_name;
1108  bool bold, italic, underlined, monospace, serif, smallcaps;
1109  int pointsize, font_id;
1110  font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1111  &monospace, &serif, &smallcaps,
1112  &pointsize, &font_id);
1113  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1114  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1115  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1116  if (bold) hocr_str += "<strong>";
1117  if (italic) hocr_str += "<em>";
1118  do {
1119  const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
1120  if (grapheme && grapheme[0] != 0) {
1121  if (grapheme[1] == 0) {
1122  switch (grapheme[0]) {
1123  case '<': hocr_str += "&lt;"; break;
1124  case '>': hocr_str += "&gt;"; break;
1125  case '&': hocr_str += "&amp;"; break;
1126  case '"': hocr_str += "&quot;"; break;
1127  case '\'': hocr_str += "&#39;"; break;
1128  default: hocr_str += grapheme;
1129  }
1130  } else {
1131  hocr_str += grapheme;
1132  }
1133  }
1134  delete []grapheme;
1135  res_it->Next(RIL_SYMBOL);
1136  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1137  if (italic) hocr_str += "</em>";
1138  if (bold) hocr_str += "</strong>";
1139  hocr_str += "</span> ";
1140  wcnt++;
1141  // Close any ending block/paragraph/textline.
1142  if (last_word_in_line) {
1143  hocr_str += "\n </span>";
1144  lcnt++;
1145  }
1146  if (last_word_in_para) {
1147  hocr_str += "\n </p>\n";
1148  pcnt++;
1149  }
1150  if (last_word_in_block) {
1151  hocr_str += " </div>\n";
1152  bcnt++;
1153  }
1154  }
1155  hocr_str += " </div>\n";
1156 
1157  char *ret = new char[hocr_str.length() + 1];
1158  strcpy(ret, hocr_str.string());
1159  delete res_it;
1160  return ret;
1161 }
1162 
1164 const int kNumbersPerBlob = 5;
1169 const int kBytesPerNumber = 5;
1178 const int kBytesPer64BitNumber = 20;
1186  UNICHAR_LEN;
1187 
1193 char* TessBaseAPI::GetBoxText(int page_number) {
1194  if (tesseract_ == NULL ||
1195  (!recognition_done_ && Recognize(NULL) < 0))
1196  return NULL;
1197  int blob_count;
1198  int utf8_length = TextLength(&blob_count);
1199  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1201  char* result = new char[total_length];
1202  strcpy(result, "\0");
1203  int output_length = 0;
1205  do {
1206  int left, top, right, bottom;
1207  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1208  char* text = it->GetUTF8Text(RIL_SYMBOL);
1209  // Tesseract uses space for recognition failure. Fix to a reject
1210  // character, kTesseractReject so we don't create illegal box files.
1211  for (int i = 0; text[i] != '\0'; ++i) {
1212  if (text[i] == ' ')
1213  text[i] = kTesseractReject;
1214  }
1215  snprintf(result + output_length, total_length - output_length,
1216  "%s %d %d %d %d %d\n",
1217  text, left, image_height_ - bottom,
1218  right, image_height_ - top, page_number);
1219  output_length += strlen(result + output_length);
1220  delete [] text;
1221  // Just in case...
1222  if (output_length + kMaxBytesPerLine > total_length)
1223  break;
1224  }
1225  } while (it->Next(RIL_SYMBOL));
1226  delete it;
1227  return result;
1228 }
1229 
// Conversion table used when producing UNLV text: a code point found in
// kUniChs is replaced by the entry at the same index in kLatinChs. Both
// arrays end with a 0 terminator, which is what the lookup loop stops on.
//
// Source code points, in order: euro sign, left and right double quotation
// marks, left and right single quotation marks, bullet, em dash.
1235 const int kUniChs[] = {
1236  0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
1237 };
// Replacements, in the same order: cent sign, '"', '"', '\'', '\'',
// middle dot, '-'.
1239 const int kLatinChs[] = {
1240  0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
1241 };
1242 
1249  if (tesseract_ == NULL ||
1250  (!recognition_done_ && Recognize(NULL) < 0))
1251  return NULL;
1252  bool tilde_crunch_written = false;
1253  bool last_char_was_newline = true;
1254  bool last_char_was_tilde = false;
1255 
1256  int total_length = TextLength(NULL);
1257  PAGE_RES_IT page_res_it(page_res_);
1258  char* result = new char[total_length];
1259  char* ptr = result;
1260  for (page_res_it.restart_page(); page_res_it.word () != NULL;
1261  page_res_it.forward()) {
1262  WERD_RES *word = page_res_it.word();
1263  // Process the current word.
1264  if (word->unlv_crunch_mode != CR_NONE) {
1265  if (word->unlv_crunch_mode != CR_DELETE &&
1266  (!tilde_crunch_written ||
1267  (word->unlv_crunch_mode == CR_KEEP_SPACE &&
1268  word->word->space() > 0 &&
1269  !word->word->flag(W_FUZZY_NON) &&
1270  !word->word->flag(W_FUZZY_SP)))) {
1271  if (!word->word->flag(W_BOL) &&
1272  word->word->space() > 0 &&
1273  !word->word->flag(W_FUZZY_NON) &&
1274  !word->word->flag(W_FUZZY_SP)) {
1275  /* Write a space to separate from preceeding good text */
1276  *ptr++ = ' ';
1277  last_char_was_tilde = false;
1278  }
1279  if (!last_char_was_tilde) {
1280  // Write a reject char.
1281  last_char_was_tilde = true;
1282  *ptr++ = kUNLVReject;
1283  tilde_crunch_written = true;
1284  last_char_was_newline = false;
1285  }
1286  }
1287  } else {
1288  // NORMAL PROCESSING of non tilde crunched words.
1289  tilde_crunch_written = false;
1291  const char* wordstr = word->best_choice->unichar_string().string();
1292  const STRING& lengths = word->best_choice->unichar_lengths();
1293  int length = lengths.length();
1294  int i = 0;
1295  int offset = 0;
1296 
1297  if (last_char_was_tilde &&
1298  word->word->space() == 0 && wordstr[offset] == ' ') {
1299  // Prevent adjacent tilde across words - we know that adjacent tildes
1300  // within words have been removed.
1301  // Skip the first character.
1302  offset = lengths[i++];
1303  }
1304  if (i < length && wordstr[offset] != 0) {
1305  if (!last_char_was_newline)
1306  *ptr++ = ' ';
1307  else
1308  last_char_was_newline = false;
1309  for (; i < length; offset += lengths[i++]) {
1310  if (wordstr[offset] == ' ' ||
1311  wordstr[offset] == kTesseractReject) {
1312  *ptr++ = kUNLVReject;
1313  last_char_was_tilde = true;
1314  } else {
1315  if (word->reject_map[i].rejected())
1316  *ptr++ = kUNLVSuspect;
1317  UNICHAR ch(wordstr + offset, lengths[i]);
1318  int uni_ch = ch.first_uni();
1319  for (int j = 0; kUniChs[j] != 0; ++j) {
1320  if (kUniChs[j] == uni_ch) {
1321  uni_ch = kLatinChs[j];
1322  break;
1323  }
1324  }
1325  if (uni_ch <= 0xff) {
1326  *ptr++ = static_cast<char>(uni_ch);
1327  last_char_was_tilde = false;
1328  } else {
1329  *ptr++ = kUNLVReject;
1330  last_char_was_tilde = true;
1331  }
1332  }
1333  }
1334  }
1335  }
1336  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1337  /* Add a new line output */
1338  *ptr++ = '\n';
1339  tilde_crunch_written = false;
1340  last_char_was_newline = true;
1341  last_char_was_tilde = false;
1342  }
1343  }
1344  *ptr++ = '\n';
1345  *ptr = '\0';
1346  return result;
1347 }
1348 
1351  int* conf = AllWordConfidences();
1352  if (!conf) return 0;
1353  int sum = 0;
1354  int *pt = conf;
1355  while (*pt >= 0) sum += *pt++;
1356  if (pt != conf) sum /= pt - conf;
1357  delete [] conf;
1358  return sum;
1359 }
1360 
// Builds a new[]-allocated array holding one confidence value per word of
// the page results, followed by a -1 terminator. Returns NULL if tesseract_
// is NULL or if recognition has not been done and Recognize(NULL) fails.
// The array must be released with delete [].
// NOTE(review): the signature line of this function is absent from this
// listing.
1363  if (tesseract_ == NULL ||
1364  (!recognition_done_ && Recognize(NULL) < 0))
1365  return NULL;
1366  int n_word = 0;
1367  PAGE_RES_IT res_it(page_res_);
// First pass: count the words so the array can be sized.
1368  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
1369  n_word++;
1370 
// One extra slot is reserved for the -1 terminator.
1371  int* conf = new int[n_word+1];
1372  n_word = 0;
// Second pass: convert each word's certainty and store it.
1373  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
1374  WERD_RES *word = res_it.word();
1375  WERD_CHOICE* choice = word->best_choice;
// NOTE(review): choice is dereferenced without a NULL check, whereas
// TextLength() tests best_choice for NULL -- confirm it cannot be NULL here.
1376  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1377  // Map the certainty onto a confidence; the result is clamped to 0..100 below.
1378  if (w_conf < 0) w_conf = 0;
1379  if (w_conf > 100) w_conf = 100;
1380  conf[n_word++] = w_conf;
1381  }
1382  conf[n_word] = -1;
1383  return conf;
1384 }
1385 
// Recognizes the current image with page segmentation mode `mode` and
// classify_enable_learning set to "0", labels the first word of the result
// with wordstr and, when a word is available, passes it to LearnWord().
// Returns false if GetUTF8Text() returns NULL or no word result is found,
// true otherwise. The previous page segmentation mode is restored on exit.
// NOTE(review): classify_enable_learning is not restored in the visible code.
1396 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
1397  int debug = 0;
1398  GetIntVariable("applybox_debug", &debug);
1399  bool success = true;
1400  PageSegMode current_psm = GetPageSegMode();
1401  SetPageSegMode(mode);
1402  SetVariable("classify_enable_learning", "0");
1403  char* text = GetUTF8Text();
1404  if (debug) {
// NOTE(review): text is only tested for NULL after this print.
1405  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr);
1406  }
1407  if (text != NULL) {
1408  PAGE_RES_IT it(page_res_);
1409  WERD_RES* word_res = it.word();
1410  if (word_res != NULL) {
1411  word_res->word->set_text(wordstr);
1412  } else {
1413  success = false;
1414  }
1415  // Check to see if text matches wordstr.
// Spaces and newlines in text, and spaces in wordstr, are ignored.
1416  int w = 0;
1417  int t = 0;
1418  for (t = 0; text[t] != '\0'; ++t) {
1419  if (text[t] == '\n' || text[t] == ' ')
1420  continue;
1421  while (wordstr[w] != '\0' && wordstr[w] == ' ')
1422  ++w;
1423  if (text[t] != wordstr[w])
1424  break;
1425  ++w;
1426  }
// A mismatch means one of the strings was not consumed completely.
1427  if (text[t] != '\0' || wordstr[w] != '\0') {
1428  // No match.
1429  delete page_res_;
1430  GenericVector<TBOX> boxes;
// NOTE(review): lines 1431-1433 are absent from this listing; presumably
// they rebuild page_res_ before it is iterated below -- confirm.
1434  PAGE_RES_IT pr_it(page_res_);
1435  if (pr_it.word() == NULL)
1436  success = false;
1437  else
1438  word_res = pr_it.word();
1439  } else {
// NOTE(review): word_res may be NULL here (success was set to false above)
// if both strings compare as matching -- confirm this cannot happen.
1440  word_res->BestChoiceToCorrectText();
1441  }
1442  if (success) {
1443  tesseract_->EnableLearning = true;
1444  tesseract_->LearnWord(NULL, NULL, word_res);
1445  }
1446  delete [] text;
1447  } else {
1448  success = false;
1449  }
1450  SetPageSegMode(current_psm);
1451  return success;
1452 }
1453 
// Clears the thresholder, if there is one, and then discards the current
// results through ClearResults().
// NOTE(review): the signature line of this function is absent from this
// listing.
1461  if (thresholder_ != NULL)
1462  thresholder_->Clear();
1463  ClearResults();
1464 }
1465 
// Deletes the helper objects referenced by the member pointers below and
// resets those pointers to NULL.
// NOTE(review): the signature line and lines 1486/1488 are absent from this
// listing.
1473  if (thresholder_ != NULL) {
1474  delete thresholder_;
1475  thresholder_ = NULL;
1476  }
1477  if (page_res_ != NULL) {
1478  delete page_res_;
1479  page_res_ = NULL;
1480  }
1481  if (block_list_ != NULL) {
1482  delete block_list_;
1483  block_list_ = NULL;
1484  }
1485  if (paragraph_models_ != NULL) {
// NOTE(review): no reset of paragraph_models_ to NULL is visible here;
// presumably it is on one of the missing lines -- confirm.
1487  delete paragraph_models_;
1489  }
1490  if (tesseract_ != NULL) {
1491  delete tesseract_;
// If the OSD engine is the same object as the main engine, drop the alias
// so that it is not deleted a second time below.
1492  if (osd_tesseract_ == tesseract_)
1493  osd_tesseract_ = NULL;
1494  tesseract_ = NULL;
1495  }
1496  if (osd_tesseract_ != NULL) {
1497  delete osd_tesseract_;
1498  osd_tesseract_ = NULL;
1499  }
1500  if (equ_detect_ != NULL) {
1501  delete equ_detect_;
1502  equ_detect_ = NULL;
1503  }
1504  if (input_file_ != NULL) {
1505  delete input_file_;
1506  input_file_ = NULL;
1507  }
1508  if (output_file_ != NULL) {
1509  delete output_file_;
1510  output_file_ = NULL;
1511  }
1512  if (datapath_ != NULL) {
1513  delete datapath_;
1514  datapath_ = NULL;
1515  }
1516  if (language_ != NULL) {
1517  delete language_;
1518  language_ = NULL;
1519  }
1520 }
1521 
1526 int TessBaseAPI::IsValidWord(const char *word) {
1527  return tesseract_->getDict().valid_word(word);
1528 }
1529 
1530 
1531 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
1532  if (page_res_ == NULL)
1533  FindLines();
1534  if (block_list_->length() < 1) {
1535  return false;
1536  }
1537 
1538  // Get first block
1539  BLOCK_IT block_it(block_list_);
1540  block_it.move_to_first();
1541  ROW_LIST* rows = block_it.data()->row_list();
1542  if (rows->length() < 1) {
1543  return false;
1544  }
1545 
1546  // Get first line of block
1547  ROW_IT row_it(rows);
1548  row_it.move_to_first();
1549  ROW* row = row_it.data();
1550 
1551  // Calculate offset and slope (NOTE: Kind of ugly)
1552  *out_offset = static_cast<int>(row->base_line(0.0));
1553  *out_slope = row->base_line(1.0) - row->base_line(0.0);
1554 
1555  return true;
1556 }
1557 
// NOTE(review): the signature and the guarded statement of this function are
// absent from this listing; only the check that tesseract_ is non-NULL is
// visible.
1560  if (tesseract_ != NULL) {
1562  }
1563 }
1564 
// NOTE(review): the signature and the statements that apply the setting are
// absent from this listing. What is visible: nothing happens unless
// tesseract_ is non-NULL, and a loop visits every sub-language index.
1570  if (tesseract_ != NULL) {
1572  // Set it for the sublangs too.
1573  int num_subs = tesseract_->num_sub_langs();
1574  for (int i = 0; i < num_subs; ++i) {
1576  }
1577  }
1578 }
1579 
1583 }
1584 
// Common preparation before an image is supplied: prints a message and
// returns false if tesseract_ is NULL; otherwise clears previous results and
// returns true.
// NOTE(review): the signature line and line 1592 (the statement controlled by
// the thresholder_ check) are absent from this listing, so ClearResults() is
// presumably unconditional in the real source -- confirm.
1587  if (tesseract_ == NULL) {
1588  tprintf("Please call Init before attempting to send an image.");
1589  return false;
1590  }
1591  if (thresholder_ == NULL)
1593  ClearResults();
1594  return true;
1595 }
1596 
// Prepares the thresholded image and the resolution used by the engine.
// pix must be non-NULL (asserted); any image already held in *pix is
// destroyed. The estimated resolution from the thresholder is clipped to a
// range and passed to tesseract_->set_source_resolution().
// NOTE(review): lines 1606, 1615, 1617-1620 and 1626-1627 are absent from
// this listing, so the actual thresholding call, the resolution correction
// and the ClipToRange bounds are not visible here.
1603 void TessBaseAPI::Threshold(Pix** pix) {
1604  ASSERT_HOST(pix != NULL);
1605  if (!thresholder_->IsBinary()) {
1607  }
1608  if (*pix != NULL)
1609  pixDestroy(pix);
1610  // Zero resolution messes up the algorithms, so make sure it is credible.
1611  int y_res = thresholder_->GetScaledYResolution();
1612  if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
1613  // Use the minimum default resolution, as it is safer to under-estimate
1614  // than over-estimate resolution.
1616  }
1621  // Set the internal resolution that is used for layout parameters from the
1622  // estimated resolution, rather than the image resolution, which may be
1623  // fabricated, but we will use the image resolution, if there is one, to
1624  // report output point sizes.
1625  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
// Report when clipping changed the estimate.
1628  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
1629  tprintf("Estimated resolution %d out of range! Corrected to %d\n",
1630  thresholder_->GetScaledEstimatedResolution(), estimated_res);
1631  }
1632  tesseract_->set_source_resolution(estimated_res);
1633 }
1634 
// Runs page segmentation on the current image and prepares the blocks for
// OCR. Returns -1 if no image has been set, if the image is too large, or if
// SegmentPage() fails; returns 0 on success or when block_list_ already
// holds blocks.
// NOTE(review): the signature line and several statements (lines 1648, 1651,
// 1653, 1655, 1659-1665, 1675-1676, 1679-1680) are absent from this listing.
1637  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
1638  tprintf("Please call SetImage before attempting recognition.");
1639  return -1;
1640  }
// Results of a completed recognition are stale; start afresh.
1641  if (recognition_done_)
1642  ClearResults();
// Blocks are already present, so there is nothing left to do.
1643  if (!block_list_->empty()) {
1644  return 0;
1645  }
1646  if (tesseract_ == NULL) {
1647  tesseract_ = new Tesseract;
1649  }
1650  if (tesseract_->pix_binary() == NULL)
1652  if (tesseract_->ImageWidth() > MAX_INT16 ||
1654  tprintf("Image too large: (%d, %d)\n",
1656  return -1;
1657  }
1658 
1660 
1662  if (equ_detect_ == NULL && datapath_ != NULL) {
1664  }
1666  }
1667 
// Pick the engine used for orientation and script detection, if the page
// segmentation mode asks for it and none has been set up yet.
1668  Tesseract* osd_tess = osd_tesseract_;
1669  OSResults osr;
1670  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
// When the main language is already "osd" the main engine is reused.
1671  if (strcmp(language_->string(), "osd") == 0) {
1672  osd_tess = tesseract_;
1673  } else {
1674  osd_tesseract_ = new Tesseract;
1677  NULL, 0, NULL, NULL, false) == 0) {
1678  osd_tess = osd_tesseract_;
1681  } else {
// Loading failed: warn and continue with osd_tess left as NULL.
1682  tprintf("Warning: Auto orientation and script detection requested,"
1683  " but osd language failed to load\n");
1684  delete osd_tesseract_;
1685  osd_tesseract_ = NULL;
1686  }
1687  }
1688  }
1689 
1690  if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
1691  return -1;
1692  // If Devanagari is being recognized, we use different images for page seg
1693  // and for OCR.
1694  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
1695  return 0;
1696 }
1697 
// Discards everything produced for the current image: clears the engine,
// deletes the page results, marks recognition as not done and leaves
// block_list_ as an allocated list that has been emptied (or newly created).
// NOTE(review): the signature line and lines 1713/1715 are absent from this
// listing.
1700  if (tesseract_ != NULL) {
1701  tesseract_->Clear();
1702  }
1703  if (page_res_ != NULL) {
1704  delete page_res_;
1705  page_res_ = NULL;
1706  }
1707  recognition_done_ = false;
// Reuse the existing list object when there is one.
1708  if (block_list_ == NULL)
1709  block_list_ = new BLOCK_LIST;
1710  else
1711  block_list_->clear();
1712  if (paragraph_models_ != NULL) {
// NOTE(review): no reset of paragraph_models_ to NULL is visible here;
// presumably it is on one of the missing lines -- confirm.
1714  delete paragraph_models_;
1716  }
1717 }
1718 
1726 int TessBaseAPI::TextLength(int* blob_count) {
1727  if (tesseract_ == NULL || page_res_ == NULL)
1728  return 0;
1729 
1730  PAGE_RES_IT page_res_it(page_res_);
1731  int total_length = 2;
1732  int total_blobs = 0;
1733  // Iterate over the data structures to extract the recognition result.
1734  for (page_res_it.restart_page(); page_res_it.word () != NULL;
1735  page_res_it.forward()) {
1736  WERD_RES *word = page_res_it.word();
1737  WERD_CHOICE* choice = word->best_choice;
1738  if (choice != NULL) {
1739  total_blobs += choice->length() + 2;
1740  total_length += choice->unichar_string().length() + 2;
1741  for (int i = 0; i < word->reject_map.length(); ++i) {
1742  if (word->reject_map[i].rejected())
1743  ++total_length;
1744  }
1745  }
1746  }
1747  if (blob_count != NULL)
1748  *blob_count = total_blobs;
1749  return total_length;
1750 }
1751 
// Returns false if tesseract_ is NULL. Otherwise clears previous results and
// makes sure input_file_ is set (defaulting to kInputFile).
// NOTE(review): the signature line and lines 1761 and 1764 (including the
// final return statement) are absent from this listing.
1757  if (tesseract_ == NULL)
1758  return false;
1759  ClearResults();
1760  if (tesseract_->pix_binary() == NULL)
1762  if (input_file_ == NULL)
1763  input_file_ = new STRING(kInputFile);
1765 }
1766 
// Stores `margin` in the min_orientation_margin parameter of the engine.
// NOTE(review): the signature line is absent from this listing, and
// tesseract_ is used here without a NULL check.
1768  tesseract_->min_orientation_margin.set_value(margin);
1769 }
1770 
1785 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
1786  bool** vertical_writing) {
1787  delete[] *block_orientation;
1788  *block_orientation = NULL;
1789  delete[] *vertical_writing;
1790  *vertical_writing = NULL;
1791  BLOCK_IT block_it(block_list_);
1792 
1793  block_it.move_to_first();
1794  int num_blocks = 0;
1795  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
1796  if (!block_it.data()->poly_block()->IsText()) {
1797  continue;
1798  }
1799  ++num_blocks;
1800  }
1801  if (!num_blocks) {
1802  tprintf("WARNING: Found no blocks\n");
1803  return;
1804  }
1805  *block_orientation = new int[num_blocks];
1806  *vertical_writing = new bool[num_blocks];
1807  block_it.move_to_first();
1808  int i = 0;
1809  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
1810  block_it.forward()) {
1811  if (!block_it.data()->poly_block()->IsText()) {
1812  continue;
1813  }
1814  FCOORD re_rotation = block_it.data()->re_rotation();
1815  float re_theta = re_rotation.angle();
1816  FCOORD classify_rotation = block_it.data()->classify_rotation();
1817  float classify_theta = classify_rotation.angle();
1818  double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
1819  if (rot_theta < 0) rot_theta += 4;
1820  int num_rotations = static_cast<int>(rot_theta + 0.5);
1821  (*block_orientation)[i] = num_rotations;
1822  // The classify_rotation is non-zero only if the text has vertical
1823  // writing direction.
1824  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
1825  ++i;
1826  }
1827 }
1828 
1829 // ____________________________________________________________________________
1830 // Ocropus add-ons.
1831 
// Runs FindLines(), then returns the block list pointer and sets the member
// to NULL, so this object no longer refers to the returned list.
// NOTE(review): the signature line is absent from this listing, and the
// return value of FindLines() is not checked.
1834  FindLines();
1835  BLOCK_LIST* result = block_list_;
1836  block_list_ = NULL;
1837  return result;
1838 }
1839 
1845 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
1846  delete block_list;
1847 }
1848 
1849 
// Builds a new ROW from the given metrics and returns it; the caller is
// responsible for deleting it. The baseline is passed as the constant term
// of a single quadratic segment (the other two coefficients are 0).
// NOTE(review): the first line of the signature (return type, name and the
// `baseline` parameter) is absent from this listing.
1851  float xheight,
1852  float descender,
1853  float ascender) {
1854  inT32 xstarts[] = {-32000};
1855  double quad_coeffs[] = {0, 0, baseline};
1856  return new ROW(1,
1857  xstarts,
1858  quad_coeffs,
1859  xheight,
// ascender and descender are passed relative to the x-height line and the
// baseline respectively.
1860  ascender - (baseline + xheight),
1861  descender - baseline,
1862  0,
1863  0);
1864 }
1865 
// Extracts the outlines of the image `pix` into C_BLOBs, moves all outlines
// into the first blob and returns a polygonal copy of that blob.
// Returns NULL when the image yields no blobs.
// NOTE(review): the signature line is absent from this listing.
1868  int width = pixGetWidth(pix);
1869  int height = pixGetHeight(pix);
1870  BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
1871 
1872  // Create C_BLOBs from the page
1873  extract_edges(pix, &block);
1874 
1875  // Merge all C_BLOBs
1876  C_BLOB_LIST *list = block.blob_list();
1877  C_BLOB_IT c_blob_it(list);
1878  if (c_blob_it.empty())
1879  return NULL;
1880  // Move all the outlines to the first blob.
1881  C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
// Start at the second blob and stop once the iterator is back at the first.
1882  for (c_blob_it.forward();
1883  !c_blob_it.at_first();
1884  c_blob_it.forward()) {
1885  C_BLOB *c_blob = c_blob_it.data();
1886  ol_it.add_list_after(c_blob->out_list());
1887  }
1888  // Convert the first blob to the output TBLOB.
1889  return TBLOB::PolygonalCopy(c_blob_it.data());
1890 }
1891 
// Normalizes `tblob` by wrapping it temporarily in a TWERD, using the
// x-height of `row`. If `denorm` is non-NULL it is set up and used for the
// normalization; otherwise a local DENORM is used and discarded.
// NOTE(review): the first line of the signature (declaring `tblob` and `row`)
// is absent from this listing.
1898  bool numeric_mode, DENORM *denorm) {
1899  TWERD word;
1900  word.blobs = tblob;
1901  if (denorm != NULL) {
1902  word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, denorm);
1903  word.Normalize(*denorm);
1904  } else {
1905  DENORM normer;
1906  word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, &normer);
1907  word.Normalize(normer);
1908  }
// Detach the blob again before the temporary word goes out of scope.
// NOTE(review): presumably this stops the TWERD destructor from freeing the
// caller's blob -- confirm.
1909  word.blobs = NULL;
1910 }
1911 
1916 TBLOB *make_tesseract_blob(float baseline, float xheight,
1917  float descender, float ascender,
1918  bool numeric_mode, Pix* pix) {
1919  TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
1920 
1921  // Normalize TBLOB
1922  ROW *row =
1923  TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
1924  TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode, NULL);
1925  delete row;
1926  return tblob;
1927 }
1928 
// Adapts the classifier to the character given by unichar_repr/length, using
// a blob made from the current binary image and the supplied line metrics.
// NOTE(review): line 1942 (one argument of make_tesseract_blob) is absent
// from this listing.
1934 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
1935  int length,
1936  float baseline,
1937  float xheight,
1938  float descender,
1939  float ascender) {
1940  UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
1941  TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
1943  tesseract_->pix_binary());
1944  float threshold;
1945  UNICHAR_ID best_class = 0;
1946  float best_rating = -100;
1947 
1948 
1949  // Classify to get a raw choice.
1950  BLOB_CHOICE_LIST choices;
1951  DENORM denorm;
1952  tesseract_->AdaptiveClassifier(blob, denorm, &choices, NULL);
1953  BLOB_CHOICE_IT choice_it;
1954  choice_it.set_to_list(&choices);
// Track the choice with the highest rating.
// NOTE(review): best_class and best_rating are not read again in this
// function.
1955  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1956  choice_it.forward()) {
1957  if (choice_it.data()->rating() > best_rating) {
1958  best_rating = choice_it.data()->rating();
1959  best_class = choice_it.data()->unichar_id();
1960  }
1961  }
1962 
1963  threshold = tesseract_->matcher_good_threshold;
1964 
// NOTE(review): blob is dereferenced without a NULL check; confirm that
// make_tesseract_blob cannot return NULL here.
1965  if (blob->outlines)
1966  tesseract_->AdaptToChar(blob, denorm, id, kUnknownFontinfoId, threshold);
1967  delete blob;
1968 }
1969 
1970 
// Creates a new PAGE_RES for block_list, runs recog_all_words() on it with
// pass number 1 and returns it. The result is allocated with new.
// NOTE(review): line 1973 (the remaining PAGE_RES constructor arguments) is
// absent from this listing.
1971 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
1972  PAGE_RES *page_res = new PAGE_RES(block_list,
1974  tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
1975  return page_res;
1976 }
1977 
// Runs recog_all_words() with pass number 2 on pass1_result and returns it.
// If pass1_result is NULL, a new PAGE_RES is created for block_list first.
// NOTE(review): line 1982 (the remaining PAGE_RES constructor arguments) is
// absent from this listing.
1978 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
1979  PAGE_RES* pass1_result) {
1980  if (!pass1_result)
1981  pass1_result = new PAGE_RES(block_list,
1983  tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
1984  return pass1_result;
1985 }
1986 
// Runs paragraph detection block by block over the current results and
// appends the models found for each block to paragraph_models_.
// The debug level is read from the "paragraph_debug_level" variable.
// NOTE(review): lines 1991 and 1994 (creation of paragraph_models_ and the
// declaration of `models`) are absent from this listing.
1987 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
1988  int debug_level = 0;
1989  GetIntVariable("paragraph_debug_level", &debug_level);
1990  if (paragraph_models_ == NULL)
1992  MutableIterator *result_it = GetMutableIterator();
1993  do { // Detect paragraphs for this block
1995  ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
1996  result_it, &models);
1997  *paragraph_models_ += models;
1998  } while (result_it->Next(RIL_BLOCK));
// The iterator returned by GetMutableIterator() is released here.
1999  delete result_it;
2000 }
2001 
// One output character: its byte representation, length, cost and (see its
// use in extract_result) a bounding box.
// NOTE(review): the struct header and lines 2003, 2006 and 2016 (member
// declarations and the destructor header) are absent from this listing.
2004  int length; // of unicode_repr
2005  float cost;
2007 
// Copies `len` bytes of repr (or strlen(repr) bytes when len is -1) into a
// newly allocated buffer of length + 1 bytes.
2008  TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
2009  length = (len == -1 ? strlen(repr) : len);
2010  unicode_repr = new char[length + 1];
// NOTE(review): strncpy writes at most `length` bytes, and nothing visible
// stores a terminator in unicode_repr[length]; treat the buffer as
// length-delimited, not NUL-terminated -- confirm.
2011  strncpy(unicode_repr, repr, length);
2012  }
2013 
// NOTE(review): no member is initialised in this constructor's visible body.
2014  TESS_CHAR() { // Satisfies ELISTIZE.
2015  }
2017  delete [] unicode_repr;
2018  }
2019 };
2020 
2021 ELISTIZEH(TESS_CHAR)
2022 ELISTIZE(TESS_CHAR)
2023 
2024 static void add_space(TESS_CHAR_IT* it) {
2025  TESS_CHAR *t = new TESS_CHAR(0, " ");
2026  it->add_after_then_move(t);
2027 }
2028 
2029 
// Converts a rating into a non-negative cost by adding 100 and clamping
// negative results to 0.
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  // Ratings below -100 are not expected, but clamp them anyway.
  return shifted < 0 ? 0 : shifted;
}
2038 
2043 static void extract_result(TESS_CHAR_IT* out,
2044  PAGE_RES* page_res) {
2045  PAGE_RES_IT page_res_it(page_res);
2046  int word_count = 0;
2047  while (page_res_it.word() != NULL) {
2048  WERD_RES *word = page_res_it.word();
2049  const char *str = word->best_choice->unichar_string().string();
2050  const char *len = word->best_choice->unichar_lengths().string();
2051  TBOX real_rect = word->word->bounding_box();
2052 
2053  if (word_count)
2054  add_space(out);
2055  int n = strlen(len);
2056  for (int i = 0; i < n; i++) {
2057  TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
2058  str, *len);
2059  tc->box = real_rect.intersection(word->box_word->BlobBox(i));
2060  out->add_after_then_move(tc);
2061  str += *len;
2062  len++;
2063  }
2064  page_res_it.forward();
2065  word_count++;
2066  }
2067 }
2068 
// Flattens the recognition results in page_res into parallel arrays with
// one entry per output character (including the spaces added between
// words): byte length, cost and bounding box edges. *text receives the
// concatenated character bytes. All output arrays are allocated with new[]
// here. Returns the number of characters.
// NOTE(review): the first line of the signature (return type, name and the
// `text` parameter) is absent from this listing.
2074  int** lengths,
2075  float** costs,
2076  int** x0,
2077  int** y0,
2078  int** x1,
2079  int** y1,
2080  PAGE_RES* page_res) {
2081  TESS_CHAR_LIST tess_chars;
2082  TESS_CHAR_IT tess_chars_it(&tess_chars);
2083  extract_result(&tess_chars_it, page_res);
2084  tess_chars_it.move_to_first();
2085  int n = tess_chars.length();
2086  int text_len = 0;
2087  *lengths = new int[n];
2088  *costs = new float[n];
2089  *x0 = new int[n];
2090  *y0 = new int[n];
2091  *x1 = new int[n];
2092  *y1 = new int[n];
2093  int i = 0;
// First pass: fill the per-character arrays and total up the text bytes.
2094  for (tess_chars_it.mark_cycle_pt();
2095  !tess_chars_it.cycled_list();
2096  tess_chars_it.forward(), i++) {
2097  TESS_CHAR *tc = tess_chars_it.data();
2098  text_len += (*lengths)[i] = tc->length;
2099  (*costs)[i] = tc->cost;
2100  (*x0)[i] = tc->box.left();
2101  (*y0)[i] = tc->box.bottom();
2102  (*x1)[i] = tc->box.right();
2103  (*y1)[i] = tc->box.top();
2104  }
// The text buffer holds exactly text_len bytes; no terminator is appended,
// so the lengths array is needed to split it.
2105  char *p = *text = new char[text_len];
2106 
// Second pass: copy each character's bytes back to back.
2107  tess_chars_it.move_to_first();
2108  for (tess_chars_it.mark_cycle_pt();
2109  !tess_chars_it.cycled_list();
2110  tess_chars_it.forward()) {
2111  TESS_CHAR *tc = tess_chars_it.data();
2112  strncpy(p, tc->unicode_repr, tc->length);
2113  p += tc->length;
2114  }
2115  return n;
2116 }
2117 
// Computes the integer features of `blob` with GetCharNormFeatures() against
// the pre-trained templates and stores the feature count in *num_features.
// NOTE(review): the first lines of the signature and line 2124 (the body of
// the tesseract_ check) are absent from this listing.
2120  INT_FEATURE_ARRAY int_features,
2121  int* num_features,
2122  int* FeatureOutlineIndex) {
2123  if (tesseract_) {
2125  }
// Scratch array, passed for two arguments of the call and freed afterwards.
2126  uinT8* norm_array = new uinT8[MAX_NUM_CLASSES];
2127  inT32 len;
// NOTE(review): tesseract_ is dereferenced here regardless of the check
// above.
2128  *num_features = tesseract_->GetCharNormFeatures(
2129  blob, denorm, tesseract_->PreTrainedTemplates,
2130  int_features, norm_array, norm_array, &len, FeatureOutlineIndex);
2131  delete [] norm_array;
2132 }
2133 
2138 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
2139  int left, int top, int right, int bottom) {
2140  TBOX box(left, bottom, right, top);
2141  BLOCK_IT b_it(blocks);
2142  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
2143  BLOCK* block = b_it.data();
2144  if (!box.major_overlap(block->bounding_box()))
2145  continue;
2146  ROW_IT r_it(block->row_list());
2147  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
2148  ROW* row = r_it.data();
2149  if (!box.major_overlap(row->bounding_box()))
2150  continue;
2151  WERD_IT w_it(row->word_list());
2152  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
2153  WERD* word = w_it.data();
2154  if (box.major_overlap(word->bounding_box()))
2155  return row;
2156  }
2157  }
2158  }
2159  return NULL;
2160 }
2161 
// Classifies `blob` with the adaptive classifier and copies up to
// num_max_matches results into unichar_ids[] and ratings[], in the order of
// the choice list. *num_matches_returned receives the number copied.
// NOTE(review): the first line of the signature (declaring `blob` and
// `denorm`) is absent from this listing.
2164  int num_max_matches,
2165  int* unichar_ids,
2166  float* ratings,
2167  int* num_matches_returned) {
2168  BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
2169  tesseract_->AdaptiveClassifier(blob, denorm, choices, NULL);
2170  BLOB_CHOICE_IT choices_it(choices);
// index aliases the output count, so it is kept current while copying.
2171  int& index = *num_matches_returned;
2172  index = 0;
2173  for (choices_it.mark_cycle_pt();
2174  !choices_it.cycled_list() && index < num_max_matches;
2175  choices_it.forward()) {
2176  BLOB_CHOICE* choice = choices_it.data();
2177  unichar_ids[index] = choice->unichar_id();
2178  ratings[index] = choice->rating();
2179  ++index;
2180  }
2181  *num_matches_returned = index;
2182  delete choices;
2183 }
2184 
2186 const char* TessBaseAPI::GetUnichar(int unichar_id) {
2187  return tesseract_->unicharset.id_to_unichar(unichar_id);
2188 }
2189 
2191 const Dawg *TessBaseAPI::GetDawg(int i) const {
2192  if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
2193  return tesseract_->getDict().GetDawg(i);
2194 }
2195 
// Returns the number of dawgs held by the engine's dictionary, or 0 when
// tesseract_ is NULL.
// NOTE(review): the signature line is absent from this listing.
2198  return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
2199 }
2200 
// Returns the engine's cube recognition context, or NULL when tesseract_ is
// NULL.
// NOTE(review): the signature line is absent from this listing.
2203  return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
2204 }
2205 } // namespace tesseract.