Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
statistc.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.c (Formerly stats.c)
3  * Description: Simple statistical package for integer values.
4  * Author: Ray Smith
5  * Created: Mon Feb 04 16:56:05 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "mfcpch.h" //precompiled headers
26 #include "statistc.h"
27 #include <string.h>
28 #include <math.h>
29 #include <stdlib.h>
30 #include "helpers.h"
31 #include "scrollview.h"
32 #include "tprintf.h"
33 
34 /**********************************************************************
35  * STATS::STATS
36  *
37  * Construct a new stats element by allocating and zeroing the memory.
38  **********************************************************************/
39 STATS::STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1) {
40  if (max_bucket_value_plus_1 <= min_bucket_value) {
41  min_bucket_value = 0;
42  max_bucket_value_plus_1 = 1;
43  }
44  rangemin_ = min_bucket_value; // setup
45  rangemax_ = max_bucket_value_plus_1;
46  buckets_ = new inT32[rangemax_ - rangemin_];
47  clear();
48 }
49 
51  rangemax_ = 0;
52  rangemin_ = 0;
53  buckets_ = NULL;
54 }
55 
56 /**********************************************************************
57  * STATS::set_range
58  *
59  * Alter the range on an existing stats element.
60  **********************************************************************/
61 bool STATS::set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1) {
62  if (max_bucket_value_plus_1 <= min_bucket_value) {
63  return false;
64  }
65  if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
66  delete [] buckets_;
67  buckets_ = new inT32[max_bucket_value_plus_1 - min_bucket_value];
68  }
69  rangemin_ = min_bucket_value; // setup
70  rangemax_ = max_bucket_value_plus_1;
71  clear(); // zero it
72  return true;
73 }
74 
75 /**********************************************************************
76  * STATS::clear
77  *
78  * Clear out the STATS class by zeroing all the buckets.
79  **********************************************************************/
80 void STATS::clear() { // clear out buckets
81  total_count_ = 0;
82  if (buckets_ != NULL)
83  memset(buckets_, 0, (rangemax_ - rangemin_) * sizeof(buckets_[0]));
84 }
85 
86 /**********************************************************************
87  * STATS::~STATS
88  *
89  * Destructor for a stats class.
90  **********************************************************************/
92  if (buckets_ != NULL) {
93  delete [] buckets_;
94  buckets_ = NULL;
95  }
96 }
97 
98 /**********************************************************************
99  * STATS::add
100  *
101  * Add a set of samples to (or delete from) a pile.
102  **********************************************************************/
103 void STATS::add(inT32 value, inT32 count) {
104  if (buckets_ == NULL) {
105  return;
106  }
107  value = ClipToRange(value, rangemin_, rangemax_ - 1);
108  buckets_[value - rangemin_] += count;
109  total_count_ += count; // keep count of total
110 }
111 
112 /**********************************************************************
113  * STATS::mode
114  *
115  * Find the mode of a stats class.
116  **********************************************************************/
117 inT32 STATS::mode() const { // get mode of samples
118  if (buckets_ == NULL) {
119  return rangemin_;
120  }
121  inT32 max = buckets_[0]; // max cell count
122  inT32 maxindex = 0; // index of max
123  for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {
124  if (buckets_[index] > max) {
125  max = buckets_[index]; // find biggest
126  maxindex = index;
127  }
128  }
129  return maxindex + rangemin_; // index of biggest
130 }
131 
132 /**********************************************************************
133  * STATS::mean
134  *
135  * Find the mean of a stats class.
136  **********************************************************************/
137 double STATS::mean() const { //get mean of samples
138  if (buckets_ == NULL || total_count_ <= 0) {
139  return static_cast<double>(rangemin_);
140  }
141  inT64 sum = 0;
142  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
143  sum += static_cast<inT64>(index) * buckets_[index];
144  }
145  return static_cast<double>(sum) / total_count_ + rangemin_;
146 }
147 
148 /**********************************************************************
149  * STATS::sd
150  *
151  * Find the standard deviation of a stats class.
152  **********************************************************************/
153 double STATS::sd() const { //standard deviation
154  if (buckets_ == NULL || total_count_ <= 0) {
155  return 0.0;
156  }
157  inT64 sum = 0;
158  double sqsum = 0.0;
159  for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
160  sum += static_cast<inT64>(index) * buckets_[index];
161  sqsum += static_cast<double>(index) * index * buckets_[index];
162  }
163  double variance = static_cast<double>(sum) / total_count_;
164  variance = sqsum / total_count_ - variance * variance;
165  if (variance > 0.0)
166  return sqrt(variance);
167  return 0.0;
168 }
169 
170 /**********************************************************************
171  * STATS::ile
172  *
173  * Returns the fractile value such that frac fraction (in [0,1]) of samples
174  * has a value less than the return value.
175  **********************************************************************/
176 double STATS::ile(double frac) const {
177  if (buckets_ == NULL || total_count_ == 0) {
178  return static_cast<double>(rangemin_);
179  }
180 #if 0
181  // TODO(rays) The existing code doesn't seem to be doing the right thing
182  // with target a double but this substitute crashes the code that uses it.
183  // Investigate and fix properly.
184  int target = IntCastRounded(frac * total_count_);
185  target = ClipToRange(target, 1, total_count_);
186 #else
187  double target = frac * total_count_;
188  target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
189 #endif
190  int sum = 0;
191  int index = 0;
192  for (index = 0; index < rangemax_ - rangemin_ && sum < target;
193  sum += buckets_[index++]);
194  if (index > 0) {
195  ASSERT_HOST(buckets_[index - 1] > 0);
196  return rangemin_ + index -
197  static_cast<double>(sum - target) / buckets_[index - 1];
198  } else {
199  return static_cast<double>(rangemin_);
200  }
201 }
202 
203 /**********************************************************************
204  * STATS::min_bucket
205  *
206  * Find REAL minimum bucket - ile(0.0) isnt necessarily correct
207  **********************************************************************/
208 inT32 STATS::min_bucket() const { // Find min
209  if (buckets_ == NULL || total_count_ == 0) {
210  return rangemin_;
211  }
212  inT32 min = 0;
213  for (min = 0; (min < rangemax_ - rangemin_) && (buckets_[min] == 0); min++);
214  return rangemin_ + min;
215 }
216 
217 
218 /**********************************************************************
219  * STATS::max_bucket
220  *
221  * Find REAL maximum bucket - ile(1.0) isnt necessarily correct
222  **********************************************************************/
223 
224 inT32 STATS::max_bucket() const { // Find max
225  if (buckets_ == NULL || total_count_ == 0) {
226  return rangemin_;
227  }
228  inT32 max;
229  for (max = rangemax_ - rangemin_ - 1; max > 0 && buckets_[max] == 0; max--);
230  return rangemin_ + max;
231 }
232 
233 /**********************************************************************
234  * STATS::median
235  *
236  * Finds a more useful estimate of median than ile(0.5).
237  *
238  * Overcomes a problem with ile() - if the samples are, for example,
239  * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
240  * between 6 and 13 = 9.5
241  **********************************************************************/
242 double STATS::median() const { //get median
243  if (buckets_ == NULL) {
244  return static_cast<double>(rangemin_);
245  }
246  double median = ile(0.5);
247  int median_pile = static_cast<int>(floor(median));
248  if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
249  inT32 min_pile;
250  inT32 max_pile;
251  /* Find preceeding non zero pile */
252  for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
253  /* Find following non zero pile */
254  for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);
255  median = (min_pile + max_pile) / 2.0;
256  }
257  return median;
258 }
259 
260 /**********************************************************************
261  * STATS::local_min
262  *
263  * Return TRUE if this point is a local min.
264  **********************************************************************/
265 bool STATS::local_min(inT32 x) const {
266  if (buckets_ == NULL) {
267  return false;
268  }
269  x = ClipToRange(x, rangemin_, rangemax_ - 1) - rangemin_;
270  if (buckets_[x] == 0)
271  return true;
272  inT32 index; // table index
273  for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index);
274  if (index >= 0 && buckets_[index] < buckets_[x])
275  return false;
276  for (index = x + 1; index < rangemax_ - rangemin_ &&
277  buckets_[index] == buckets_[x]; ++index);
278  if (index < rangemax_ - rangemin_ && buckets_[index] < buckets_[x])
279  return false;
280  else
281  return true;
282 }
283 
284 /**********************************************************************
285  * STATS::smooth
286  *
287  * Apply a triangular smoothing filter to the stats.
288  * This makes the modes a bit more useful.
289  * The factor gives the height of the triangle, i.e. the weight of the
290  * centre.
291  **********************************************************************/
292 void STATS::smooth(inT32 factor) {
293  if (buckets_ == NULL || factor < 2) {
294  return;
295  }
296  STATS result(rangemin_, rangemax_);
297  int entrycount = rangemax_ - rangemin_;
298  for (int entry = 0; entry < entrycount; entry++) {
299  //centre weight
300  int count = buckets_[entry] * factor;
301  for (int offset = 1; offset < factor; offset++) {
302  if (entry - offset >= 0)
303  count += buckets_[entry - offset] * (factor - offset);
304  if (entry + offset < entrycount)
305  count += buckets_[entry + offset] * (factor - offset);
306  }
307  result.add(entry + rangemin_, count);
308  }
309  total_count_ = result.total_count_;
310  memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
311 }
312 
313 /**********************************************************************
314  * STATS::cluster
315  *
316  * Cluster the samples into max_cluster clusters.
317  * Each call runs one iteration. The array of clusters must be
318  * max_clusters+1 in size as cluster 0 is used to indicate which samples
319  * have been used.
320  * The return value is the current number of clusters.
321  **********************************************************************/
322 
323 inT32 STATS::cluster(float lower, // thresholds
324  float upper,
325  float multiple, // distance threshold
326  inT32 max_clusters, // max no to make
327  STATS *clusters) { // array of clusters
328  BOOL8 new_cluster; // added one
329  float *centres; // cluster centres
330  inT32 entry; // bucket index
331  inT32 cluster; // cluster index
332  inT32 best_cluster; // one to assign to
333  inT32 new_centre = 0; // residual mode
334  inT32 new_mode; // pile count of new_centre
335  inT32 count; // pile to place
336  float dist; // from cluster
337  float min_dist; // from best_cluster
338  inT32 cluster_count; // no of clusters
339 
340  if (buckets_ == NULL || max_clusters < 1)
341  return 0;
342  centres = new float[max_clusters + 1];
343  for (cluster_count = 1; cluster_count <= max_clusters
344  && clusters[cluster_count].buckets_ != NULL
345  && clusters[cluster_count].total_count_ > 0;
346  cluster_count++) {
347  centres[cluster_count] =
348  static_cast<float>(clusters[cluster_count].ile(0.5));
349  new_centre = clusters[cluster_count].mode();
350  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
351  && entry >= rangemin_
352  && pile_count(entry) <= pile_count(entry + 1);
353  entry--) {
354  count = pile_count(entry) - clusters[0].pile_count(entry);
355  if (count > 0) {
356  clusters[cluster_count].add(entry, count);
357  clusters[0].add (entry, count);
358  }
359  }
360  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
361  && entry < rangemax_
362  && pile_count(entry) <= pile_count(entry - 1);
363  entry++) {
364  count = pile_count(entry) - clusters[0].pile_count(entry);
365  if (count > 0) {
366  clusters[cluster_count].add(entry, count);
367  clusters[0].add(entry, count);
368  }
369  }
370  }
371  cluster_count--;
372 
373  if (cluster_count == 0) {
374  clusters[0].set_range(rangemin_, rangemax_);
375  }
376  do {
377  new_cluster = FALSE;
378  new_mode = 0;
379  for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
380  count = buckets_[entry] - clusters[0].buckets_[entry];
381  //remaining pile
382  if (count > 0) { //any to handle
383  min_dist = static_cast<float>(MAX_INT32);
384  best_cluster = 0;
385  for (cluster = 1; cluster <= cluster_count; cluster++) {
386  dist = entry + rangemin_ - centres[cluster];
387  //find distance
388  if (dist < 0)
389  dist = -dist;
390  if (dist < min_dist) {
391  min_dist = dist; //find least
392  best_cluster = cluster;
393  }
394  }
395  if (min_dist > upper //far enough for new
396  && (best_cluster == 0
397  || entry + rangemin_ > centres[best_cluster] * multiple
398  || entry + rangemin_ < centres[best_cluster] / multiple)) {
399  if (count > new_mode) {
400  new_mode = count;
401  new_centre = entry + rangemin_;
402  }
403  }
404  }
405  }
406  // need new and room
407  if (new_mode > 0 && cluster_count < max_clusters) {
408  cluster_count++;
409  new_cluster = TRUE;
410  if (!clusters[cluster_count].set_range(rangemin_, rangemax_))
411  return 0;
412  centres[cluster_count] = static_cast<float>(new_centre);
413  clusters[cluster_count].add(new_centre, new_mode);
414  clusters[0].add(new_centre, new_mode);
415  for (entry = new_centre - 1; centres[cluster_count] - entry < lower
416  && entry >= rangemin_
417  && pile_count (entry) <= pile_count(entry + 1); entry--) {
418  count = pile_count(entry) - clusters[0].pile_count(entry);
419  if (count > 0) {
420  clusters[cluster_count].add(entry, count);
421  clusters[0].add(entry, count);
422  }
423  }
424  for (entry = new_centre + 1; entry - centres[cluster_count] < lower
425  && entry < rangemax_
426  && pile_count (entry) <= pile_count(entry - 1); entry++) {
427  count = pile_count(entry) - clusters[0].pile_count(entry);
428  if (count > 0) {
429  clusters[cluster_count].add(entry, count);
430  clusters[0].add (entry, count);
431  }
432  }
433  centres[cluster_count] =
434  static_cast<float>(clusters[cluster_count].ile(0.5));
435  }
436  } while (new_cluster && cluster_count < max_clusters);
437  delete [] centres;
438  return cluster_count;
439 }
440 
441 /**********************************************************************
442  * STATS::print
443  *
444  * Prints a summary and table of the histogram.
445  **********************************************************************/
446 void STATS::print() const {
447  if (buckets_ == NULL) {
448  return;
449  }
450  inT32 min = min_bucket() - rangemin_;
451  inT32 max = max_bucket() - rangemin_;
452 
453  int num_printed = 0;
454  for (int index = min; index <= max; index++) {
455  if (buckets_[index] != 0) {
456  tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
457  if (++num_printed % 8 == 0)
458  tprintf ("\n");
459  }
460  }
461  tprintf ("\n");
462  print_summary();
463 }
464 
465 
466 
467 /**********************************************************************
468  * STATS::print_summary
469  *
470  * Print a summary of the stats.
471  **********************************************************************/
472 void STATS::print_summary() const {
473  if (buckets_ == NULL) {
474  return;
475  }
476  inT32 min = min_bucket();
477  inT32 max = max_bucket();
478  tprintf("Total count=%d\n", total_count_);
479  tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
480  tprintf("Lower quartile=%.2f\n", ile(0.25));
481  tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
482  tprintf("Upper quartile=%.2f\n", ile(0.75));
483  tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
484  tprintf("Range=%d\n", max + 1 - min);
485  tprintf("Mean= %.2f\n", mean());
486  tprintf("SD= %.2f\n", sd());
487 }
488 
489 
490 /**********************************************************************
491  * STATS::plot
492  *
493  * Draw a histogram of the stats table.
494  **********************************************************************/
495 
496 #ifndef GRAPHICS_DISABLED
497 void STATS::plot(ScrollView* window, // to draw in
498  float xorigin, // bottom left
499  float yorigin,
500  float xscale, // one x unit
501  float yscale, // one y unit
502  ScrollView::Color colour) const { // colour to draw in
503  if (buckets_ == NULL) {
504  return;
505  }
506  window->Pen(colour);
507 
508  for (int index = 0; index < rangemax_ - rangemin_; index++) {
509  window->Rectangle( xorigin + xscale * index, yorigin,
510  xorigin + xscale * (index + 1),
511  yorigin + yscale * buckets_[index]);
512  }
513 }
514 #endif
515 
516 
517 /**********************************************************************
518  * STATS::plotline
519  *
520  * Draw a histogram of the stats table. (Line only)
521  **********************************************************************/
522 
523 #ifndef GRAPHICS_DISABLED
524 void STATS::plotline(ScrollView* window, // to draw in
525  float xorigin, // bottom left
526  float yorigin,
527  float xscale, // one x unit
528  float yscale, // one y unit
529  ScrollView::Color colour) const { // colour to draw in
530  if (buckets_ == NULL) {
531  return;
532  }
533  window->Pen(colour);
534  window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
535  for (int index = 0; index < rangemax_ - rangemin_; index++) {
536  window->DrawTo(xorigin + xscale * index,
537  yorigin + yscale * buckets_[index]);
538  }
539 }
540 #endif
541 
542 
543 /**********************************************************************
544  * choose_nth_item
545  *
546  * Returns the index of what would b the nth item in the array
547  * if the members were sorted, without actually sorting.
548  **********************************************************************/
549 
550 inT32 choose_nth_item(inT32 index, float *array, inT32 count) {
551  inT32 next_sample; // next one to do
552  inT32 next_lesser; // space for new
553  inT32 prev_greater; // last one saved
554  inT32 equal_count; // no of equal ones
555  float pivot; // proposed median
556  float sample; // current sample
557 
558  if (count <= 1)
559  return 0;
560  if (count == 2) {
561  if (array[0] < array[1]) {
562  return index >= 1 ? 1 : 0;
563  }
564  else {
565  return index >= 1 ? 0 : 1;
566  }
567  }
568  else {
569  if (index < 0)
570  index = 0; // ensure legal
571  else if (index >= count)
572  index = count - 1;
573  equal_count = (inT32) (rand() % count);
574  pivot = array[equal_count];
575  // fill gap
576  array[equal_count] = array[0];
577  next_lesser = 0;
578  prev_greater = count;
579  equal_count = 1;
580  for (next_sample = 1; next_sample < prev_greater;) {
581  sample = array[next_sample];
582  if (sample < pivot) {
583  // shuffle
584  array[next_lesser++] = sample;
585  next_sample++;
586  }
587  else if (sample > pivot) {
588  prev_greater--;
589  // juggle
590  array[next_sample] = array[prev_greater];
591  array[prev_greater] = sample;
592  }
593  else {
594  equal_count++;
595  next_sample++;
596  }
597  }
598  for (next_sample = next_lesser; next_sample < prev_greater;)
599  array[next_sample++] = pivot;
600  if (index < next_lesser)
601  return choose_nth_item (index, array, next_lesser);
602  else if (index < prev_greater)
603  return next_lesser; // in equal bracket
604  else
605  return choose_nth_item (index - prev_greater,
606  array + prev_greater,
607  count - prev_greater) + prev_greater;
608  }
609 }
610 
611 /**********************************************************************
612  * choose_nth_item
613  *
614  * Returns the index of what would be the nth item in the array
615  * if the members were sorted, without actually sorting.
616  **********************************************************************/
617 inT32 choose_nth_item(inT32 index, void *array, inT32 count, size_t size,
618  int (*compar)(const void*, const void*)) {
619  int result; // of compar
620  inT32 next_sample; // next one to do
621  inT32 next_lesser; // space for new
622  inT32 prev_greater; // last one saved
623  inT32 equal_count; // no of equal ones
624  inT32 pivot; // proposed median
625 
626  if (count <= 1)
627  return 0;
628  if (count == 2) {
629  if (compar (array, (char *) array + size) < 0) {
630  return index >= 1 ? 1 : 0;
631  }
632  else {
633  return index >= 1 ? 0 : 1;
634  }
635  }
636  if (index < 0)
637  index = 0; // ensure legal
638  else if (index >= count)
639  index = count - 1;
640  pivot = (inT32) (rand () % count);
641  swap_entries (array, size, pivot, 0);
642  next_lesser = 0;
643  prev_greater = count;
644  equal_count = 1;
645  for (next_sample = 1; next_sample < prev_greater;) {
646  result =
647  compar ((char *) array + size * next_sample,
648  (char *) array + size * next_lesser);
649  if (result < 0) {
650  swap_entries (array, size, next_lesser++, next_sample++);
651  // shuffle
652  }
653  else if (result > 0) {
654  prev_greater--;
655  swap_entries(array, size, prev_greater, next_sample);
656  }
657  else {
658  equal_count++;
659  next_sample++;
660  }
661  }
662  if (index < next_lesser)
663  return choose_nth_item (index, array, next_lesser, size, compar);
664  else if (index < prev_greater)
665  return next_lesser; // in equal bracket
666  else
667  return choose_nth_item (index - prev_greater,
668  (char *) array + size * prev_greater,
669  count - prev_greater, size,
670  compar) + prev_greater;
671 }
672 
673 /**********************************************************************
674  * swap_entries
675  *
676  * Swap 2 entries of arbitrary size in-place in a table.
677  **********************************************************************/
678 void swap_entries(void *array, // array of entries
679  size_t size, // size of entry
680  inT32 index1, // entries to swap
681  inT32 index2) {
682  char tmp;
683  char *ptr1; // to entries
684  char *ptr2;
685  size_t count; // of bytes
686 
687  ptr1 = reinterpret_cast<char*>(array) + index1 * size;
688  ptr2 = reinterpret_cast<char*>(array) + index2 * size;
689  for (count = 0; count < size; count++) {
690  tmp = *ptr1;
691  *ptr1++ = *ptr2;
692  *ptr2++ = tmp; // tedious!
693  }
694 }