Succinct container scanner search using pipeline to run thousands of searches faster one by one scans.
Succinct container scanner search using pipeline to run thousands of searches faster one by one scans. scanner::pipeline uses variuous cache and algorithmic optimization techniques to run bulk searches faster.
#include <iostream>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <thread>
#include "bmdbg.h"
using namespace std;
static
unsigned max_coll = 8000000,
unsigned repeat_factor=10)
{
string str;
for (unsigned i = 10; i < max_coll; i+= (rand()&0xF))
{
switch (i & 0xF)
{
case 0: str = "AB"; break;
case 1: str = "GTx"; break;
case 2: str = "cnv"; break;
default: str = "AbY11"; break;
}
str.append(to_string(i));
for (unsigned k = 0; k < repeat_factor; ++k)
{
str_coll.emplace_back(str);
bi = str;
}
}
bi.flush();
}
static
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if (arg == "-nodiag")
{
continue;
}
}
}
int main(
int argc,
char *argv[])
{
try
{
std::vector<string> str_coll;
cout << "Generating the test data... " << flush;
cout << "OK" << endl;
{
cout << "Remapping the data to create compressed vector " << flush;
str_sv0.remap();
str_sv0.optimize(tb);
cout << "OK" << endl;
}
{
cout << "\nStatistics on generated SV:" << endl;
bm::print_svector_stat(cout, str_sv1);
cout << "\nStatistics on remapped/optimized SV:" << endl;
bm::print_svector_stat(cout, str_sv0);
cout << endl << endl;
}
unsigned test_runs = 10000;
std::vector<string> str_test_coll;
{
if (idx >= test_runs)
idx = test_runs/2;
str_test_coll.push_back(str_coll[idx]);
}
assert(str_test_coll.size() == test_runs);
std::vector<unique_ptr<bvector_type> > res_vec1;
cout << "Running benchmark tests.." << endl;
for (int pass = 0; pass < 2; pass++)
{
cout << "PASS = " << pass << ((pass==0) ? " -- remap/optimized" : " -- NOT remapped") << endl;
res_vec1.resize(0);
const str_sv_type* str_sv = (pass==0) ? &str_sv0 : &str_sv1;
{
{
const string& str = str_test_coll[i];
res_vec1.emplace_back(unique_ptr<bvector_type>(bv_res.release()));
}
}
pipe1.options().batch_size = test_runs;
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe1.add(str.c_str());
}
pipe1.complete();
}
pipe1_and.set_search_mask(&bv_mask);
pipe1_and.options().batch_size = test_runs;
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe1_and.add(str.c_str());
}
pipe1_and.complete();
}
pipe1.options().batch_size = test_runs;
{
bm::chrono_taker tt(cout,
"scanner::pipeline find_eq_str()-count()", test_runs);
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe2.add(str.c_str());
}
pipe2.complete();
}
{
auto& res_vect = pipe1.get_bv_res_vector();
auto& res_vect_and = pipe1_and.get_bv_res_vector();
auto& cnt_vect = pipe2.get_bv_count_vector();
assert(res_vect.size() == cnt_vect.size());
size_t res_sz = res_vect.size();
for (size_t i = 0; i < res_sz; ++i)
{
assert(bv1);
assert(bv);
bool match = bv1->
equal(*bv);
assert(match);
auto c = cnt_vect[i];
(void)cnt; (void)c;
assert(cnt == c);
bv_or_total |= *bv;
{
if (bv_and)
{
auto c1 = bv_and->
count();
assert(c1 == c_and); (void)c1; (void)c_and;
match = bv_and->
equal(bv_m);
assert(match);
}
else
{
assert(!c_and);
}
}
}
}
pipe1.options().batch_size = test_runs;
pipe3.set_or_target(&bv_or);
{
for (size_t i = 0; i < test_runs; ++i)
{
const string& str = str_test_coll[i];
pipe3.add(str.c_str());
}
pipe3.complete();
}
bool match = bv_or.
equal(bv_or_total);
if (!match)
{
cerr << "OR vector mismatch!" << endl;
exit(1);
}
cout << endl;
}
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}
Compressed bit-vector bvector<> container, set algebraic methods, traversal iterators.
#define BM_DECLARE_TEMP_BLOCK(x)
Algorithms for bm::sparse_vector.
string sparse vector based on bit-transposed matrix
Timing utilities for benchmarking (internal).
pre-processor un-defines to avoid global space pollution (internal)
Bitvector Bit-vector container with runtime compression of bits.
bool equal(const bvector< Alloc > &bvect) const BMNOEXCEPT
Equal comparison with an agr bit-vector.
size_type count() const BMNOEXCEPT
population count (count of ON bits)
bm::bvector< Alloc > & bit_and(const bm::bvector< Alloc > &bv1, const bm::bvector< Alloc > &bv2, typename bm::bvector< Alloc >::optmode opt_mode=opt_none)
3-operand AND : this := bv1 AND bv2
bvector_size_type size_type
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
Utility class to collect performance measurements and statistics.
Pipeline to run multiple searches against a particular SV faster using cache optimizations.
algorithms for sparse_vector scan/search
bool find_eq_str(const SV &sv, const value_type *str, bvector_type &bv_out)
find sparse vector elements (string)
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
size_type size() const
return size of the vector
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
@ use_null
support "non-assigned" or "NULL" logic
BV::size_type count_and(const BV &bv1, const BV &bv2) BMNOEXCEPT
Computes bitcount of AND operation of two bitsets.
bm::agg_run_options< agg_disable_result_bvectors, agg_compute_counts > agg_opt_only_counts
Pre-defined aggregator options for counts-only (results dropped) operation.
bm::str_sparse_vector< char, bvector_type, 32 > str_sv_type
static void GenerateTestData(std::vector< string > &str_coll, str_sv_type &str_sv, unsigned max_coll=8000000, unsigned repeat_factor=10)
Test data generation.
Aggregation options to control execution Default settings are to support only result bit-vector filte...
static int parse_args(int argc, char *argv[])