BitMagic-C++
strsvsample03.cpp
Go to the documentation of this file.
1 /*
2 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3 
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7 
8  http://www.apache.org/licenses/LICENSE-2.0
9 
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 
16 For more information please visit: http://bitmagic.io
17 */
18 
19 /** \example strsvsample03.cpp
20  Example of how to use bm::str_sparse_vector<> - succinct container for
21  bit-transposed string collections
22 
23  \sa bm::str_sparse_vector
24 */
25 
26 /*! \file strsvsample03.cpp
27  \brief Example: str_sparse_vector<> back insert iterator example
28 
29  This example loads a sparse vector from an STL container, uses re-mapping
30  to compress it, then serializes the container and saves it to disk.
31  The example also illustrates how to check the memory footprint.
32 */
33 
34 #include <iostream>
35 #include <string>
36 #include <vector>
37 #include <random>
38 #include <algorithm>
39 #include <fstream>
40 
41 #include "bm.h"
42 #include "bmstrsparsevec.h"
43 #include "bmsparsevec_serial.h"
44 
45 
46 using namespace std;
47 
49 
50 // define the sparse vector type for 'char' type using bvector as
51 // a container of bits for bit-transposed planes
52 // 32 - is maximum string length for this container.
53 // Memory allocation is dynamic using sparse techniques, so this number
54 // just defines the max capacity.
55 //
57 
58 
/// Generate a collection of strings from integers and shuffle it.
///
/// The vector is emptied first, then filled with the decimal text of
/// integers starting at 10 and staying below 50000. The step between
/// consecutive integers is rand() % 3 (0, 1 or 2), so the same value may
/// appear more than once and some values are skipped.
/// Finally the whole collection is shuffled with a std::mt19937 engine
/// seeded from std::random_device.
///
/// @param str_vec  output vector; previous content is discarded
///
static
void generate_string_set(std::vector<std::string>& str_vec)
{
    const unsigned max_coll = 50000;

    str_vec.clear();
    // the mean step is 1, so the final size is close to max_coll;
    // reserving up-front avoids repeated reallocation while filling
    str_vec.reserve(max_coll);

    for (unsigned i = 10; i < max_coll; i += static_cast<unsigned>(rand() % 3))
    {
        // build the string directly inside the vector (no temporary copy)
        str_vec.emplace_back(std::to_string(i));
    } // for i

    // shuffle the data set
    //
    std::random_device rd;
    std::mt19937 g(rd());
    std::shuffle(str_vec.begin(), str_vec.end(), g);
}
80 
81 
82 int main(void)
83 {
84  try
85  {
86  str_sv_type str_sv;
87 
88  vector<string> str_vec;
89  generate_string_set(str_vec);
90  std::sort(str_vec.begin(), str_vec.end()); // sort the input vector
91 
92 
93  // load sparse vector from an STL container
94  //
95  {
96  size_t vect_size = 0; // approx std::vector<string> memory usage
97  str_sv_type str_sv_tmp; // temp vector
98  {
100  str_sv_tmp.get_back_inserter();
101  for (auto str : str_vec)
102  {
103  bi = str;
104 
105  // some approximate estimate of std::string element cost
106  //
107  size_t str_size = str.size() + sizeof(str);
108  vect_size += str_size;
109  }
110 
111  // it is important to use flush, because back inserter is
112  // buffering data. Of cause it flashes automatically on
113  // destruction but explicit flush is somewhat better
114  // because of possible exception is thrown here and not from
115  // destructor.
116  //
117 
118  bi.flush();
119 
120  cout << "STL vector<string> approx.memory consumption:"
121  << vect_size << endl;
122  }
123 
124  // calculate memory footprint
125  //
127  str_sv_tmp.calc_stat(&st);
128 
129  cout << "Used memory: " << st.memory_used << std::endl;
130 
131 
132  // final step is re-mapping, which increses chances for
133  // good memory compression.
134  // A side-effect here is that remapping makes container
135  // effectively read-only.
136  //
137  str_sv.remap_from(str_sv_tmp);
138 
140  str_sv.optimize(tb); // optimize the vector
141 
142  str_sv.calc_stat(&st);
143  cout << "Used memory after remap and optimization: "
144  << st.memory_used
145  << std::endl;
146  }
147 
148  // serialize and save
149  //
150  {
151  std::string fname = "test.sv";
153 
155  bm::sparse_vector_serialize(str_sv, sv_lay, tb);
156 
157  std::ofstream fout(fname.c_str(), std::ios::binary);
158  if (!fout.good())
159  {
160  return -1;
161  }
162  const char* buf = (char*)sv_lay.buf();
163  fout.write(buf, (unsigned)sv_lay.size());
164  if (!fout.good())
165  {
166  return -1;
167  }
168  fout.close();
169 
170  cout << "Saved size: " << sv_lay.size() << endl;
171  }
172 
173  }
174  catch(std::exception& ex)
175  {
176  std::cerr << ex.what() << std::endl;
177  return 1;
178  }
179 
180 
181  return 0;
182 }
183 
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
size_t size() const
return current serialized size
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *stat=0)
run memory optimization for all vector plains
string sparse vector based on bit-transposed matrix
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
void flush()
flush the accumulated buffer
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
size_t memory_used
memory usage for all blocks and service tables
Definition: bmfunc.h:61
void calc_stat(struct str_sparse_vector< CharType, BV, MAX_STR_SIZE >::statistics *st) const
Calculates memory statistics.
Back insert iterator implements buffered insert, faster than generic access assignment.
sparse vector for strings with compression using bit transposition method
void remap_from(const str_sparse_vector &str_sv)
Build remapping profile and load content from another sparse vector.
Serialization for sparse_vector<>
static void generate_string_set(vector< string > &str_vec)
int main(void)
layout class for serialization buffer structure
const unsigned char * buf() const
Return serialization buffer pointer.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster...
bm::bvector bvector_type
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type