BitMagic-C++
strsvsample09.cpp
Go to the documentation of this file.
1/*
2Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15
16For more information please visit: http://bitmagic.io
17*/
18
19/** \example strsvsample09.cpp
20 Example of how to use bm::str_sparse_vector<> - succinct container for sorting in compressive memory
21
22
23 \sa bm::str_sparse_vector
24 \sa bm::str_sparse_vector::swap
25 \sa bm::str_sparse_vector::compare
26*/
27
28/*! \file strsvsample09.cpp
29 \brief Example: str_sparse_vector<> sorting example
30*/
31
32#include <iostream>
33#include <string>
34#include <vector>
35#include <random>
36#include <algorithm>
37//#define BMAVX2OPT
38//#define BMSSE42OPT
39#include "bm.h"
40#include "bmstrsparsevec.h"
41#include "bmsparsevec_algo.h"
42
43#include "bmdbg.h"
44#include "bmtimer.h"
45
46#include "bmundef.h" /* clear the pre-proc defines from BM */
47
48using namespace std;
49
51
52// define the sparse vector type for 'char' type using bvector as
53// a container of bits for bit-transposed planes
55
56
57/// generate collection of strings from integers with common prefixes
58/// ... and shuffle it
59static
60void generate_string_set(vector<string>& str_vec,
61 const unsigned max_coll = 850000,
62 unsigned repeat = 220)
63{
64 str_vec.resize(0);
65 string str;
66 for (unsigned i = 10; i < max_coll; i += unsigned(rand() % 3))
67 {
68 switch (rand()%8)
69 {
70 case 0: str = "xnssv"; break;
71 default: str = "xrs"; break;
72 }
73 str.append(to_string(i));
74
75 for (unsigned k = 0; k < repeat; ++k, ++i) // add more of the same string
76 str_vec.emplace_back(str);
77
78 } // for i
79
80 std::random_device rd;
81 std::mt19937 g(rd());
82 std::shuffle(str_vec.begin() + str_vec.size()/2, str_vec.end(), g);
83
84}
85
86
87/// quick-sort
88///
89void quicksort(str_sv_type& strsv, int first, int last)
90{
91 using stype = str_sv_type::size_type;
92 int i, j, pivot;
93 if (first < last)
94 {
95 pivot = i= first;
96 j = last;
97 while (i <j)
98 {
99 while((i < last) && (strsv.compare(stype(i), stype(pivot)) <= 0))
100 i++;
101 while(strsv.compare(stype(j), stype(pivot)) > 0) // number[j]>number[pivot])
102 j--;
103 if (i < j)
104 strsv.swap(stype(i), stype(j));
105 } // while
106 strsv.swap(stype(pivot), stype(j));
107
108 quicksort(strsv, first, j-1);
109 quicksort(strsv, j+1, last);
110 }
111}
112
113
114/// Faster variant of quicksort, uses different variant of pivot compare, with decompressed argument
115///
116/// optimizations:
117/// 1. use of bm::str_sparse_vector<>::compare() function friendly for re-use of pivot element
118/// 2. tail call recursion eleimination
119///
120void quicksort2(str_sv_type& strsv, int first, int last)
121{
122 using stype = str_sv_type::size_type;
123 int i, j, pivot;
124
125 // fixed size for simplicity (in prod code needs dynamic buffer handling)
126 static str_sv_type::value_type pivot_buf[128];
127 while (first < last)
128 {
129 pivot = i = first;
130 j = last;
131
132 // save the pivor to re-use it in strsv.compare(..)
133 strsv.get(stype(pivot), pivot_buf, sizeof(pivot_buf));
134
135 while (i < j)
136 {
137 while((i < last) && (strsv.compare(stype(i), pivot_buf) <= 0))
138 ++i;
139 while(strsv.compare(stype(j), pivot_buf) > 0)
140 --j;
141 if (i < j)
142 strsv.swap(stype(i), stype(j));
143 } // while
144 strsv.swap(stype(pivot), stype(j));
145
146 quicksort2(strsv, first, j-1);
147 first = j+1; // tail recursion
148 } // while
149}
150
151
152/// insertion sort for performance comnparison
153///
154static
155void insertion_sort(str_sv_type& str_sv, const vector<string>& str_vec)
156{
157 // scanner object is re-used throught the processing
158 //
160
161 for (const string& s : str_vec)
162 {
163 const char* cs = s.c_str();
165 bool found = scanner.lower_bound_str(str_sv, cs, pos);
166 (void)found; // just to silence the unused variable warning
167
168 str_sv.insert(pos, cs);
169
170 } // for s
171}
172
173
174
176
177int main(void)
178{
179 try
180 {
181 str_sv_type str_sv, str_sv2, str_sv3;
182
183 vector<string> str_vec;
184 generate_string_set(str_vec);
185
186 // load compact vector
187 cout << "Loading " << str_vec.size() << " elements..." << endl;
188 {
189 auto bi = str_sv.get_back_inserter();
190 for (const string& term : str_vec)
191 bi = term;
192 bi.flush();
193 }
194 // remap succinct vector into optimal codes
195 // (after final load of content)
196 //
197 str_sv.remap();
198 {
200 str_sv.optimize(tb);
201 }
202
203 // print_svector_stat(cout, str_sv);
204
205 str_sv2 = str_sv;
206
207 // calculate and print memory usage statistics
208 {
209 str_sv_type::statistics st;
210 str_sv.calc_stat(&st);
211 size_t std_vect_mem = sizeof(str_vec);
212 for (const string& term : str_vec)
213 std_vect_mem += term.size() + sizeof(term);
214
215 cout << "std::vector<string> mem = " << std_vect_mem << endl;
216 cout << "Succinct vector vector mem = " << st.memory_used << endl;
217 }
218
219
220 cout << "Quick Sort... 1" << endl;
221 {
222 bm::chrono_taker tt1(cout, "1. quick sort (succint)", 1, &timing_map);
223 quicksort(str_sv, 0, (int)str_sv.size()-1);
224 }
225
226 cout << "Quick Sort... 2" << endl;
227 {
228 bm::chrono_taker tt1(cout, "2. quick sort 2 (succint)", 1, &timing_map);
229 quicksort2(str_sv2, 0, (int)str_sv2.size()-1);
230 }
231
232 cout << "Insertion Sort... " << endl;
233 {
234 bm::chrono_taker tt1(cout, "3. insertion sort (succint)", 1, &timing_map);
235 insertion_sort(str_sv3, str_vec);
236 }
237
238 cout << "std::sort..." << endl;
239 {
240 bm::chrono_taker tt1(cout, "4. std::sort()", 1, &timing_map);
241 std::sort(str_vec.begin(), str_vec.end());
242 }
243
244 // validation of different sort methods
245 {
246 bool eq = str_sv.equal(str_sv2);
247 if (!eq)
248 {
249 cerr << "post-sort vector mismatch! (qsort 1-2)" << endl;
250 assert(0); exit(1);
251 }
252
253 // validate the results to match STL sort
254 {
255 string s, s3;
256 vector<string>::const_iterator sit = str_vec.begin();
257 str_sv_type::const_iterator it = str_sv.begin();
258 str_sv_type::const_iterator it_end = str_sv.end();
259 str_sv_type::const_iterator it3 = str_sv3.begin();
260 for (; it != it_end; ++it, ++sit, ++it3)
261 {
262 s = *it;
263 if (*sit != s)
264 {
265 cerr << "Mismatch at:" << s << "!=" << *sit << endl;
266 assert(0);
267 return 1;
268 }
269 s3 = *it3;
270 if (s != s3)
271 {
272 cerr << "Mismatch at:" << s << "!=" << s3 << endl;
273 return 1;
274 }
275
276 } // for
277 }
278 cout << "Sort validation Ok." << endl;
279 }
280
282
283 }
284 catch(std::exception& ex)
285 {
286 std::cerr << ex.what() << std::endl;
287 return 1;
288 }
289
290
291 return 0;
292}
293
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition bm.h:47
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
Definition bm.h:115
Utility class to collect performance measurements and statistics.
Definition bmtimer.h:41
std::map< std::string, statistics > duration_map_type
test name to duration map
Definition bmtimer.h:66
static void print_duration_map(TOut &tout, const duration_map_type &dmap, format fmt=ct_time)
Definition bmtimer.h:150
algorithms for sparse_vector scan/search
bool lower_bound_str(const SV &sv, const value_type *str, size_type &pos)
lower bound search for an array position
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void insert(size_type idx, const value_type *str)
insert the specified element
const_iterator end() const BMNOEXCEPT
Provide const iterator access to the end.
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
void calc_stat(struct str_sparse_vector< CharType, BV, STR_SIZE >::statistics *st) const BMNOEXCEPT
Calculates memory statistics.
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const BMNOEXCEPT
check if another sparse vector has the same content and size
size_type size() const
return size of the vector
const_iterator begin() const BMNOEXCEPT
Provide const iterator access to container content.
int compare(size_type idx, const value_type *str) const BMNOEXCEPT
Compare vector element with argument lexicographically.
size_type get(size_type idx, value_type *str, size_type buf_size) const BMNOEXCEPT
get specified element
void remap()
Build remapping profile and re-load content to save memory.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
void swap(size_type idx1, size_type idx2)
swap two vector elements between each other
bm::chrono_taker ::duration_map_type timing_map
Definition sample11.cpp:46
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
void quicksort2(str_sv_type &strsv, int first, int last)
Faster variant of quicksort, uses different variant of pivot compare, with decompressed argument.
void quicksort(str_sv_type &strsv, int first, int last)
quick-sort
int main(void)
static void generate_string_set(vector< string > &str_vec, const unsigned max_coll=850000, unsigned repeat=220)
generate collection of strings from integers with common prefixes ... and shuffle it
static void insertion_sort(str_sv_type &str_sv, const vector< string > &str_vec)
insertion sort for performance comnparison
bm::bvector bvector_type