[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]
vigra/sampling.hxx | ![]() |
00001 /************************************************************************/ 00002 /* */ 00003 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */ 00004 /* */ 00005 /* This file is part of the VIGRA computer vision library. */ 00006 /* The VIGRA Website is */ 00007 /* http://hci.iwr.uni-heidelberg.de/vigra/ */ 00008 /* Please direct questions, bug reports, and contributions to */ 00009 /* ullrich.koethe@iwr.uni-heidelberg.de or */ 00010 /* vigra@informatik.uni-hamburg.de */ 00011 /* */ 00012 /* Permission is hereby granted, free of charge, to any person */ 00013 /* obtaining a copy of this software and associated documentation */ 00014 /* files (the "Software"), to deal in the Software without */ 00015 /* restriction, including without limitation the rights to use, */ 00016 /* copy, modify, merge, publish, distribute, sublicense, and/or */ 00017 /* sell copies of the Software, and to permit persons to whom the */ 00018 /* Software is furnished to do so, subject to the following */ 00019 /* conditions: */ 00020 /* */ 00021 /* The above copyright notice and this permission notice shall be */ 00022 /* included in all copies or substantial portions of the */ 00023 /* Software. */ 00024 /* */ 00025 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ 00026 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ 00027 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ 00028 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ 00029 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ 00030 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ 00031 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ 00032 /* OTHER DEALINGS IN THE SOFTWARE. */ 00033 /* */ 00034 /************************************************************************/ 00035 00036 #ifndef VIGRA_SAMPLING_HXX 00037 #define VIGRA_SAMPLING_HXX 00038 00039 #include "array_vector.hxx" 00040 #include "random.hxx" 00041 #include <map> 00042 #include <memory> 00043 #include <cmath> 00044 00045 namespace vigra 00046 { 00047 00048 /** \addtogroup MachineLearning Machine Learning 00049 **/ 00050 //@{ 00051 00052 00053 /**\brief Options object for the Sampler class. 00054 00055 <b>usage:</b> 00056 00057 \code 00058 SamplerOptions opt = SamplerOptions() 00059 .withReplacement() 00060 .sampleProportion(0.5); 00061 \endcode 00062 00063 Note that the return value of all methods is <tt>*this</tt> which makes 00064 concatenating of options as above possible. 00065 */ 00066 class SamplerOptions 00067 { 00068 public: 00069 00070 double sample_proportion; 00071 unsigned int sample_size; 00072 bool sample_with_replacement; 00073 bool stratified_sampling; 00074 00075 SamplerOptions() 00076 : sample_proportion(1.0), 00077 sample_size(0), 00078 sample_with_replacement(true), 00079 stratified_sampling(false) 00080 {} 00081 00082 /**\brief Sample from training population with replacement. 00083 * 00084 * <br> Default: true 00085 */ 00086 SamplerOptions& withReplacement(bool in = true) 00087 { 00088 sample_with_replacement = in; 00089 return *this; 00090 } 00091 00092 /**\brief Sample from training population without replacement. 00093 * 00094 * <br> Default (if you don't call this function): false 00095 */ 00096 SamplerOptions& withoutReplacement(bool in = true) 00097 { 00098 sample_with_replacement = !in; 00099 return *this; 00100 } 00101 00102 /**\brief Draw the given number of samples. 00103 * If stratifiedSampling is true, the \a size is equally distributed 00104 * across all strata (e.g. <tt>size / strataCount</tt> samples are taken 00105 * from each stratum, subject to rounding). 00106 * 00107 * <br> Default: 0 (i.e. determine the count by means of sampleProportion()) 00108 */ 00109 SamplerOptions& sampleSize(unsigned int size) 00110 { 00111 sample_size = size; 00112 return *this; 00113 } 00114 00115 00116 /**\brief Determine the number of samples to draw as a proportion of the total 00117 * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 00118 * This option is overridden when an absolute count is specified by sampleSize(). 00119 * 00120 * If stratifiedSampling is true, the count is equally distributed 00121 * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 00122 * from each stratum). 00123 * 00124 * <br> Default: 1.0 00125 */ 00126 SamplerOptions& sampleProportion(double proportion) 00127 { 00128 vigra_precondition(proportion >= 0.0, 00129 "SamplerOptions::sampleProportion(): argument must not be negative."); 00130 sample_proportion = proportion; 00131 return *this; 00132 } 00133 00134 /**\brief Draw equally many samples from each "stratum". 00135 * A stratum is a group of like entities, e.g. pixels belonging 00136 * to the same object class. This is useful to create balanced samples 00137 * when the class probabilities are very unbalanced (e.g. 00138 * when there are many background and few foreground pixels). 00139 * Stratified sampling thus avoids that a trained classifier is biased 00140 * towards the majority class. 00141 * 00142 * <br> Default (if you don't call this function): false 00143 */ 00144 SamplerOptions& stratified(bool in = true) 00145 { 00146 stratified_sampling = in; 00147 return *this; 00148 } 00149 }; 00150 00151 /************************************************************/ 00152 /* */ 00153 /* Sampler */ 00154 /* */ 00155 /************************************************************/ 00156 00157 /** \brief Create random samples from a sequence of indices. 00158 00159 Selecting data items at random is a basic task of machine learning, 00160 for example in boostrapping, RandomForest training, and cross validation. 00161 This class implements various ways to select random samples via their indices. 00162 Indices are assumed to be consecutive in 00163 the range <tt>0 <= index < total_sample_count</tt>. 00164 00165 The class always contains a current sample which can be accessed by 00166 the index operator or by the function sampledIndices(). The indices 00167 that are not in the current sample (out-of-bag indices) can be accessed 00168 via the function oobIndices(). 00169 00170 The sampling method (with/without replacement, stratified or not) and the 00171 number of samples to draw are determined by the option object 00172 SamplerOptions. 00173 00174 <b>Usage:</b> 00175 00176 <b>\#include</b> <vigra/sampling.hxx><br> 00177 Namespace: vigra 00178 00179 Create a Sampler with default options, i.e. sample as many indices as there 00180 are data elements, with replacement. On average, the sample will contain 00181 <tt>0.63*totalCount</tt> distinct indices. 00182 00183 \code 00184 int totalCount = 10000; // total number of data elements 00185 int numberOfSamples = 20; // repeat experiment 20 times 00186 Sampler<> sampler(totalCount); 00187 for(int k=0; k<numberOfSamples; ++k) 00188 { 00189 // process current sample 00190 for(int i=0; i<sampler.sampleSize(); ++i) 00191 { 00192 int currentIndex = sampler[i]; 00193 processData(data[currentIndex]); 00194 } 00195 // create next sample 00196 sampler.sample(); 00197 } 00198 \endcode 00199 00200 Create a Sampler for stratified sampling, without replacement. 00201 00202 \code 00203 // prepare the strata (i.e. specify which stratum each element belongs to) 00204 int stratumSize1 = 2000, stratumSize2 = 8000, 00205 totalCount = stratumSize1 + stratumSize2; 00206 ArrayVerctor<int> strata(totalCount); 00207 for(int i=0; i<stratumSize1; ++i) 00208 strata[i] = 1; 00209 for(int i=stratumSize1; i<stratumSize2; ++i) 00210 strata[i] = 2; 00211 00212 int sampleSize = 200; // i.e. sample 100 elements from each of the two strata 00213 int numberOfSamples = 20; // repeat experiment 20 times 00214 Sampler<> stratifiedSampler(strata.begin(), strata.end(), 00215 SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize)); 00216 // create first sample 00217 sampler.sample(); 00218 00219 for(int k=0; k<numberOfSamples; ++k) 00220 { 00221 // process current sample 00222 for(int i=0; i<sampler.sampleSize(); ++i) 00223 { 00224 int currentIndex = sampler[i]; 00225 processData(data[currentIndex]); 00226 } 00227 // create next sample 00228 sampler.sample(); 00229 } 00230 \endcode 00231 */ 00232 template<class Random = MersenneTwister > 00233 class Sampler 00234 { 00235 public: 00236 /** Internal type of the indices. 00237 Currently, 64-bit indices are not supported because this 00238 requires extension of the random number generator classes. 00239 */ 00240 typedef Int32 IndexType; 00241 00242 typedef ArrayVector <IndexType> IndexArrayType; 00243 00244 /** Type of the array view object that is returned by 00245 sampledIndices() and oobIndices(). 00246 */ 00247 typedef ArrayVectorView <IndexType> IndexArrayViewType; 00248 00249 private: 00250 typedef std::map<IndexType, IndexArrayType> StrataIndicesType; 00251 typedef std::map<IndexType, int> StrataSizesType; 00252 typedef ArrayVector <bool> IsUsedArrayType; 00253 typedef ArrayVectorView <bool> IsUsedArrayViewType; 00254 00255 static const int oobInvalid = -1; 00256 00257 int total_count_, sample_size_; 00258 mutable int current_oob_count_; 00259 StrataIndicesType strata_indices_; 00260 StrataSizesType strata_sample_size_; 00261 IndexArrayType current_sample_; 00262 mutable IndexArrayType current_oob_sample_; 00263 IsUsedArrayType is_used_; 00264 Random const & random_; 00265 SamplerOptions options_; 00266 00267 void initStrataCount() 00268 { 00269 // compute how many samples to take from each stratum 00270 // (may be unequal if sample_size_ is not a multiple of strataCount()) 00271 int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount()); 00272 int strata_total_count = strata_sample_size * strataCount(); 00273 00274 for(StrataIndicesType::iterator i = strata_indices_.begin(); 00275 i != strata_indices_.end(); ++i) 00276 { 00277 if(strata_total_count > sample_size_) 00278 { 00279 strata_sample_size_[i->first] = strata_sample_size - 1; 00280 --strata_total_count; 00281 } 00282 else 00283 { 00284 strata_sample_size_[i->first] = strata_sample_size; 00285 } 00286 } 00287 } 00288 00289 public: 00290 00291 /** Create a sampler for \a totalCount data objects. 00292 00293 In each invocation of <tt>sample()</tt> below, it will sample 00294 indices according to the options passed. If no options are given, 00295 <tt>totalCount</tt> indices will be drawn with replacement. 00296 */ 00297 Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 00298 Random const & rnd = Random(RandomSeed)) 00299 : total_count_(totalCount), 00300 sample_size_(opt.sample_size == 0 00301 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00302 : opt.sample_size), 00303 current_oob_count_(oobInvalid), 00304 current_sample_(sample_size_), 00305 current_oob_sample_(total_count_), 00306 is_used_(total_count_), 00307 random_(rnd), 00308 options_(opt) 00309 { 00310 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00311 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00312 00313 vigra_precondition(!opt.stratified_sampling, 00314 "Sampler(): Stratified sampling requested, but no strata given."); 00315 00316 // initialize a single stratum containing all data 00317 strata_indices_[0].resize(total_count_); 00318 for(int i=0; i<total_count_; ++i) 00319 strata_indices_[0][i] = i; 00320 00321 initStrataCount(); 00322 //this is screwing up the random forest tests. 00323 //sample(); 00324 } 00325 00326 /** Create a sampler for stratified sampling. 00327 00328 <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 00329 which specifies for each sample the stratum it belongs to. The 00330 total number of data objects will be set to <tt>strataEnd - strataBegin</tt>. 00331 Equally many samples (subject to rounding) will be drawn from each stratum, 00332 unless the option object explicitly requests unstratified sampling, 00333 in which case the strata are ignored. 00334 */ 00335 template <class Iterator> 00336 Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 00337 Random const & rnd = Random(RandomSeed)) 00338 : total_count_(strataEnd - strataBegin), 00339 sample_size_(opt.sample_size == 0 00340 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00341 : opt.sample_size), 00342 current_oob_count_(oobInvalid), 00343 current_sample_(sample_size_), 00344 current_oob_sample_(total_count_), 00345 is_used_(total_count_), 00346 random_(rnd), 00347 options_(opt) 00348 { 00349 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00350 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00351 00352 // copy the strata indices 00353 if(opt.stratified_sampling) 00354 { 00355 for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin) 00356 { 00357 strata_indices_[*strataBegin].push_back(i); 00358 } 00359 } 00360 else 00361 { 00362 strata_indices_[0].resize(total_count_); 00363 for(int i=0; i<total_count_; ++i) 00364 strata_indices_[0][i] = i; 00365 } 00366 00367 vigra_precondition(sample_size_ >= (int)strata_indices_.size(), 00368 "Sampler(): Requested sample count must be at least as large as the number of strata."); 00369 00370 initStrataCount(); 00371 //this is screwing up the random forest tests. 00372 //sample(); 00373 } 00374 00375 /** Return the k-th index in the current sample. 00376 */ 00377 IndexType operator[](int k) const 00378 { 00379 return current_sample_[k]; 00380 } 00381 00382 /** Create a new sample. 00383 */ 00384 void sample(); 00385 00386 /** The total number of data elements. 00387 */ 00388 int totalCount() const 00389 { 00390 return total_count_; 00391 } 00392 00393 /** The number of data elements that have been sampled. 00394 */ 00395 int sampleSize() const 00396 { 00397 return sample_size_; 00398 } 00399 00400 /** Same as sampleSize(). 00401 */ 00402 int size() const 00403 { 00404 return sample_size_; 00405 } 00406 00407 /** The number of strata to be used. 00408 Will be 1 if no strata are given. Will be ignored when 00409 stratifiedSampling() is false. 00410 */ 00411 int strataCount() const 00412 { 00413 return strata_indices_.size(); 00414 } 00415 00416 /** Whether to use stratified sampling. 00417 (If this is false, strata will be ignored even if present.) 00418 */ 00419 bool stratifiedSampling() const 00420 { 00421 return options_.stratified_sampling; 00422 } 00423 00424 /** Whether sampling should be performed with replacement. 00425 */ 00426 bool withReplacement() const 00427 { 00428 return options_.sample_with_replacement; 00429 } 00430 00431 /** Return an array view containing the indices in the current sample. 00432 */ 00433 IndexArrayViewType sampledIndices() const 00434 { 00435 return current_sample_; 00436 } 00437 00438 /** Return an array view containing the out-of-bag indices. 00439 (i.e. the indices that are not in the current sample) 00440 */ 00441 IndexArrayViewType oobIndices() const 00442 { 00443 if(current_oob_count_ == oobInvalid) 00444 { 00445 current_oob_count_ = 0; 00446 for(int i = 0; i<total_count_; ++i) 00447 { 00448 if(!is_used_[i]) 00449 { 00450 current_oob_sample_[current_oob_count_] = i; 00451 ++current_oob_count_; 00452 } 00453 } 00454 } 00455 return current_oob_sample_.subarray(0, current_oob_count_); 00456 } 00457 IsUsedArrayType const & is_used() const 00458 { 00459 return is_used_; 00460 } 00461 }; 00462 00463 00464 template<class Random> 00465 void Sampler<Random>::sample() 00466 { 00467 current_oob_count_ = oobInvalid; 00468 is_used_.init(false); 00469 00470 if(options_.sample_with_replacement) 00471 { 00472 //Go thru all strata 00473 int j = 0; 00474 StrataIndicesType::iterator iter; 00475 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00476 { 00477 // do sampling with replacement in each strata and copy data. 00478 int stratum_size = iter->second.size(); 00479 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00480 { 00481 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)]; 00482 is_used_[current_sample_[j]] = true; 00483 } 00484 } 00485 } 00486 else 00487 { 00488 //Go thru all strata 00489 int j = 0; 00490 StrataIndicesType::iterator iter; 00491 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00492 { 00493 // do sampling without replacement in each strata and copy data. 00494 int stratum_size = iter->second.size(); 00495 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00496 { 00497 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]); 00498 current_sample_[j] = iter->second[i]; 00499 is_used_[current_sample_[j]] = true; 00500 } 00501 } 00502 } 00503 } 00504 00505 template<class Random =RandomTT800 > 00506 class PoissonSampler 00507 { 00508 public: 00509 Random randfloat; 00510 typedef Int32 IndexType; 00511 typedef vigra::ArrayVector <IndexType> IndexArrayType; 00512 IndexArrayType used_indices_; 00513 double lambda; 00514 int minIndex; 00515 int maxIndex; 00516 00517 PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex) 00518 : lambda(lambda), 00519 minIndex(minIndex), 00520 maxIndex(maxIndex) 00521 {} 00522 00523 void sample( ) 00524 { 00525 used_indices_.clear(); 00526 IndexType i; 00527 for(i=minIndex;i<maxIndex;++i) 00528 { 00529 //from http://en.wikipedia.org/wiki/Poisson_distribution 00530 int k=0; 00531 double p=1; 00532 double L=exp(-lambda); 00533 do 00534 { 00535 ++k; 00536 p*=randfloat.uniform53(); 00537 00538 }while(p>L); 00539 --k; 00540 //Insert i this many time 00541 while(k>0) 00542 { 00543 used_indices_.push_back(i); 00544 --k; 00545 } 00546 } 00547 } 00548 00549 IndexType const & operator[](int in) const 00550 { 00551 return used_indices_[in]; 00552 } 00553 00554 int numOfSamples() const 00555 { 00556 return used_indices_.size(); 00557 } 00558 }; 00559 00560 //@} 00561 00562 } // namespace vigra 00563 00564 #endif /*VIGRA_SAMPLING_HXX*/
© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de) |
html generated using doxygen and Python
|