Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
strngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: strngs.c (Formerly strings.c)
3  * Description: STRING class functions.
4  * Author: Ray Smith
5  * Created: Fri Feb 15 09:13:30 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h" // Precompiled headers
21 #include "helpers.h"
22 #include "tprintf.h"
23 #include "strngs.h"
24 #include "genericvector.h"
25 
26 #include <assert.h>
// Size of buffer needed to host the decimal representation of the maximum
// possible length of an int (in 64 bits), being -<20 digits>.
// Used as the size of the scratch buffer in STRING::add_str_int.
const int kMaxIntSize = 22;
30 
31 /**********************************************************************
32  * STRING_HEADER provides metadata about the allocated buffer,
33  * including total capacity and how much used (strlen with '\0').
34  *
35  * The implementation hides this header at the start of the data
36  * buffer and appends the string on the end to keep sizeof(STRING)
37  * unchanged from earlier versions so serialization is not affected.
38  *
39  * The collection of MACROS provide different implementations depending
40  * on whether the string keeps track of its strlen or not so that this
41  * feature can be added in later when consumers don't modify the string
42  **********************************************************************/
43 
// Smallest string to allocate by default.
// This is the capacity passed to AllocData whenever an empty STRING (one
// holding just the "\0") is created.
const int kMinCapacity = 16;
46 
47 char* STRING::AllocData(int used, int capacity) {
48  data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
49 
50  // header is the metadata for this memory block
51  STRING_HEADER* header = GetHeader();
52  header->capacity_ = capacity;
53  header->used_ = used;
54  return GetCStr();
55 }
56 
57 void STRING::DiscardData() {
58  free_string((char *)data_);
59 }
60 
61 // This is a private method; ensure FixHeader is called (or used_ is well defined)
62 // beforehand
63 char* STRING::ensure_cstr(inT32 min_capacity) {
64  STRING_HEADER* orig_header = GetHeader();
65  if (min_capacity <= orig_header->capacity_)
66  return ((char *)this->data_) + sizeof(STRING_HEADER);
67 
68  // if we are going to grow bigger, than double our existing
69  // size, but if that still is not big enough then keep the
70  // requested capacity
71  if (min_capacity < 2 * orig_header->capacity_)
72  min_capacity = 2 * orig_header->capacity_;
73 
74  int alloc = sizeof(STRING_HEADER) + min_capacity;
75  STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
76 
77  memcpy(&new_header[1], GetCStr(), orig_header->used_);
78  new_header->capacity_ = min_capacity;
79  new_header->used_ = orig_header->used_;
80 
81  // free old memory, then rebind to new memory
82  DiscardData();
83  data_ = new_header;
84 
85  assert(InvariantOk());
86  return ((char *)data_) + sizeof(STRING_HEADER);
87 }
88 
89 // This is const, but is modifying a mutable field
90 // this way it can be used on const or non-const instances.
91 void STRING::FixHeader() const {
92  const STRING_HEADER* header = GetHeader();
93  if (header->used_ < 0)
94  header->used_ = strlen(GetCStr()) + 1;
95 }
96 
97 
99  // Empty STRINGs contain just the "\0".
100  memcpy(AllocData(1, kMinCapacity), "", 1);
101 }
102 
103 STRING::STRING(const STRING& str) {
104  str.FixHeader();
105  const STRING_HEADER* str_header = str.GetHeader();
106  int str_used = str_header->used_;
107  char *this_cstr = AllocData(str_used, str_used);
108  memcpy(this_cstr, str.GetCStr(), str_used);
109  assert(InvariantOk());
110 }
111 
112 STRING::STRING(const char* cstr) {
113  if (cstr == NULL) {
114  // Empty STRINGs contain just the "\0".
115  memcpy(AllocData(1, kMinCapacity), "", 1);
116  } else {
117  int len = strlen(cstr) + 1;
118  char* this_cstr = AllocData(len, len);
119  memcpy(this_cstr, cstr, len);
120  }
121  assert(InvariantOk());
122 }
123 
125  DiscardData();
126 }
127 
128 // Writes to the given file. Returns false in case of error.
129 bool STRING::Serialize(FILE* fp) const {
130  inT32 len = length();
131  if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
132  if (fwrite(GetCStr(), 1, len, fp) != len) return false;
133  return true;
134 }
135 // Reads from the given file. Returns false in case of error.
136 // If swap is true, assumes a big/little-endian swap is needed.
137 bool STRING::DeSerialize(bool swap, FILE* fp) {
138  inT32 len;
139  if (fread(&len, sizeof(len), 1, fp) != 1) return false;
140  if (swap)
141  ReverseN(&len, sizeof(len));
142  truncate_at(len);
143  if (fread(GetCStr(), 1, len, fp) != len) return false;
144  return true;
145 }
146 
147 BOOL8 STRING::contains(const char c) const {
148  return (c != '\0') && (strchr (GetCStr(), c) != NULL);
149 }
150 
152  FixHeader();
153  return GetHeader()->used_ - 1;
154 }
155 
156 const char* STRING::string() const {
157  const STRING_HEADER* header = GetHeader();
158  if (header->used_ == 0)
159  return NULL;
160 
161  // mark header length unreliable because tesseract might
162  // cast away the const and mutate the string directly.
163  header->used_ = -1;
164  return GetCStr();
165 }
166 
167 /******
168  * The STRING_IS_PROTECTED interface adds additional support to migrate
169  * code that needs to modify the STRING in ways not otherwise supported
170  * without violating encapsulation.
171  *
172  * Also makes the [] operator return a const so it is immutable
173  */
174 #if STRING_IS_PROTECTED
175 const char& STRING::operator[](inT32 index) const {
176  return GetCStr()[index];
177 }
178 
179 void STRING::insert_range(inT32 index, const char* str, int len) {
180  // if index is outside current range, then also grow size of string
181  // to accmodate the requested range.
182  STRING_HEADER* this_header = GetHeader();
183  int used = this_header->used_;
184  if (index > used)
185  used = index;
186 
187  char* this_cstr = ensure_cstr(used + len + 1);
188  if (index < used) {
189  // move existing string from index to '\0' inclusive.
190  memmove(this_cstr + index + len,
191  this_cstr + index,
192  this_header->used_ - index);
193  } else if (len > 0) {
194  // We are going to overwrite previous null terminator, so write the new one.
195  this_cstr[this_header->used_ + len - 1] = '\0';
196 
197  // If the old header did not have the terminator,
198  // then we need to account for it now that we've added it.
199  // Otherwise it was already accounted for; we just moved it.
200  if (this_header->used_ == 0)
201  ++this_header->used_;
202  }
203 
204  // Write new string to index.
205  // The string is already terminated from the conditions above.
206  memcpy(this_cstr + index, str, len);
207  this_header->used_ += len;
208 
209  assert(InvariantOk());
210 }
211 
212 void STRING::erase_range(inT32 index, int len) {
213  char* this_cstr = GetCStr();
214  STRING_HEADER* this_header = GetHeader();
215 
216  memcpy(this_cstr+index, this_cstr+index+len,
217  this_header->used_ - index - len);
218  this_header->used_ -= len;
219  assert(InvariantOk());
220 }
221 
222 #else
224  char* this_cstr = ensure_cstr(index + 1);
225  this_cstr[index] = '\0';
226  GetHeader()->used_ = index + 1;
227  assert(InvariantOk());
228 }
229 
230 char& STRING::operator[](inT32 index) const {
231  // Code is casting away this const and mutating the string,
232  // so mark used_ as -1 to flag it unreliable.
233  GetHeader()->used_ = -1;
234  return ((char *)GetCStr())[index];
235 }
236 #endif
237 
238 void STRING::split(const char c, GenericVector<STRING> *splited) {
239  int start_index = 0;
240  for (int i = 0; i < length(); i++) {
241  if ((*this)[i] == c) {
242  if (i != start_index) {
243  (*this)[i] = '\0';
244  STRING tmp = GetCStr() + start_index;
245  splited->push_back(tmp);
246  (*this)[i] = c;
247  }
248  start_index = i + 1;
249  }
250  }
251 
252  if (length() != start_index) {
253  STRING tmp = GetCStr() + start_index;
254  splited->push_back(tmp);
255  }
256 }
257 
258 BOOL8 STRING::operator==(const STRING& str) const {
259  FixHeader();
260  str.FixHeader();
261  const STRING_HEADER* str_header = str.GetHeader();
262  const STRING_HEADER* this_header = GetHeader();
263  int this_used = this_header->used_;
264  int str_used = str_header->used_;
265 
266  return (this_used == str_used)
267  && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
268 }
269 
270 BOOL8 STRING::operator!=(const STRING& str) const {
271  FixHeader();
272  str.FixHeader();
273  const STRING_HEADER* str_header = str.GetHeader();
274  const STRING_HEADER* this_header = GetHeader();
275  int this_used = this_header->used_;
276  int str_used = str_header->used_;
277 
278  return (this_used != str_used)
279  || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
280 }
281 
282 BOOL8 STRING::operator!=(const char* cstr) const {
283  FixHeader();
284  const STRING_HEADER* this_header = GetHeader();
285 
286  if (cstr == NULL)
287  return this_header->used_ > 1; // either '\0' or NULL
288  else {
289  inT32 length = strlen(cstr) + 1;
290  return (this_header->used_ != length)
291  || (memcmp(GetCStr(), cstr, length) != 0);
292  }
293 }
294 
296  str.FixHeader();
297  const STRING_HEADER* str_header = str.GetHeader();
298  int str_used = str_header->used_;
299 
300  GetHeader()->used_ = 0; // clear since ensure doesnt need to copy data
301  char* this_cstr = ensure_cstr(str_used);
302  STRING_HEADER* this_header = GetHeader();
303 
304  memcpy(this_cstr, str.GetCStr(), str_used);
305  this_header->used_ = str_used;
306 
307  assert(InvariantOk());
308  return *this;
309 }
310 
312  FixHeader();
313  str.FixHeader();
314  const STRING_HEADER* str_header = str.GetHeader();
315  const char* str_cstr = str.GetCStr();
316  int str_used = str_header->used_;
317  int this_used = GetHeader()->used_;
318  char* this_cstr = ensure_cstr(this_used + str_used);
319 
320  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
321 
322  if (this_used > 1) {
323  memcpy(this_cstr + this_used - 1, str_cstr, str_used);
324  this_header->used_ += str_used - 1; // overwrite '\0'
325  } else {
326  memcpy(this_cstr, str_cstr, str_used);
327  this_header->used_ = str_used;
328  }
329 
330  assert(InvariantOk());
331  return *this;
332 }
333 
334 void STRING::add_str_int(const char* str, int number) {
335  if (str != NULL)
336  *this += str;
337  // Allow space for the maximum possible length of inT64.
338  char num_buffer[kMaxIntSize];
339  snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
340  num_buffer[kMaxIntSize - 1] = '\0';
341  *this += num_buffer;
342 }
343 
344 STRING & STRING::operator=(const char* cstr) {
345  STRING_HEADER* this_header = GetHeader();
346  if (cstr) {
347  int len = strlen(cstr) + 1;
348 
349  this_header->used_ = 0; // dont bother copying data if need to realloc
350  char* this_cstr = ensure_cstr(len);
351  this_header = GetHeader(); // for realloc
352  memcpy(this_cstr, cstr, len);
353  this_header->used_ = len;
354  } else {
355  // Reallocate to same state as default constructor.
356  DiscardData();
357  // Empty STRINGs contain just the "\0".
358  memcpy(AllocData(1, kMinCapacity), "", 1);
359  }
360 
361  assert(InvariantOk());
362  return *this;
363 }
364 
365 void STRING::assign(const char *cstr, int len) {
366  STRING_HEADER* this_header = GetHeader();
367  this_header->used_ = 0; // dont bother copying data if need to realloc
368  char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
369 
370  this_header = GetHeader(); // for realloc
371  memcpy(this_cstr, cstr, len);
372  this_cstr[len] = '\0';
373  this_header->used_ = len + 1;
374 
375  assert(InvariantOk());
376 }
377 
378 STRING STRING::operator+(const STRING& str) const {
379  STRING result(*this);
380  result += str;
381 
382  assert(InvariantOk());
383  return result;
384 }
385 
386 
387 STRING STRING::operator+(const char ch) const {
388  STRING result;
389  FixHeader();
390  const STRING_HEADER* this_header = GetHeader();
391  int this_used = this_header->used_;
392  char* result_cstr = result.ensure_cstr(this_used + 1);
393  STRING_HEADER* result_header = result.GetHeader();
394  int result_used = result_header->used_;
395 
396  // copies '\0' but we'll overwrite that
397  memcpy(result_cstr, GetCStr(), this_used);
398  result_cstr[result_used] = ch; // overwrite old '\0'
399  result_cstr[result_used + 1] = '\0'; // append on '\0'
400  ++result_header->used_;
401 
402  assert(InvariantOk());
403  return result;
404 }
405 
406 
407 STRING& STRING::operator+=(const char *str) {
408  if (!str || !*str) // empty string has no effect
409  return *this;
410 
411  FixHeader();
412  int len = strlen(str) + 1;
413  int this_used = GetHeader()->used_;
414  char* this_cstr = ensure_cstr(this_used + len);
415  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
416 
417  // if we had non-empty string then append overwriting old '\0'
418  // otherwise replace
419  if (this_used > 0) {
420  memcpy(this_cstr + this_used - 1, str, len);
421  this_header->used_ += len - 1;
422  } else {
423  memcpy(this_cstr, str, len);
424  this_header->used_ = len;
425  }
426 
427  assert(InvariantOk());
428  return *this;
429 }
430 
431 
432 STRING& STRING::operator+=(const char ch) {
433  if (ch == '\0')
434  return *this;
435 
436  FixHeader();
437  int this_used = GetHeader()->used_;
438  char* this_cstr = ensure_cstr(this_used + 1);
439  STRING_HEADER* this_header = GetHeader();
440 
441  if (this_used > 0)
442  --this_used; // undo old empty null if there was one
443 
444  this_cstr[this_used++] = ch; // append ch to end
445  this_cstr[this_used++] = '\0'; // append '\0' after ch
446  this_header->used_ = this_used;
447 
448  assert(InvariantOk());
449  return *this;
450 }