BitMagic-C++
xsample07a.cpp File Reference

Example: Use of bvector<> for k-mer fingerprint K should be short, no minimizers here (k < 24). More...

#include <assert.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <vector>
#include <list>
#include <map>
#include <algorithm>
#include <utility>
#include <memory>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmaggregator.h"
#include "bmsparsevec_compr.h"
#include "bmsparsevec_algo.h"
#include "bmrandom.h"
#include "bmdbg.h"
#include "bmtimer.h"
#include "bmundef.h"
#include "dna_finger.h"
#include "cmd_args.h"
Include dependency graph for xsample07a.cpp:

Go to the source code of this file.

Data Structures

class  CSequenceColl
 Collection of sequences and k-mer fingerprint vectors. More...
class  CSeqGroup
 Group (clustrer) of sequences. More...
class  CSeqClusters
struct  CKMerAcc
 Utility class to accumulate cahnges to cluster before commiting it (mutex syncronous operation). More...

Typedefs

typedef bm::bvector bvector_type
typedef std::vector< char > vector_char_type
typedef bm::dynamic_heap_matrix< unsigned, bm::bvector<>::allocator_type > distance_matrix_type
typedef std::vector< std::unique_ptr< bvector_type > > bvector_ptr_vector_type
typedef bvector_type::size_type bv_size_type
typedef std::vector< std::pair< bv_size_type, bv_size_type > > bv_ranges_vector

Functions

template<typename FV>
void wait_for_slot (FV &futures, unsigned *parallel_cnt, unsigned concurrency)
 wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking control
static int load_FASTA (const std::string &fname, CSequenceColl &seq_coll)
 Load multi-sequence FASTA.
static void save_kmer_buffers (const std::string &fname, const CSequenceColl &seq_coll)
 save k-mer vectors to a file
static void load_kmer_buffers (const std::string &fname, CSequenceColl &seq_coll)
 Load k-mer vectors.
bool get_DNA_code (char bp, bm::id64_t &dna_code)
bool get_kmer_code (const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer)
 Calculate k-mer as an unsigned long integer.
char int2DNA (unsigned code)
 Translate integer code to DNA letter.
void translate_kmer (std::string &dna, bm::id64_t kmer_code, unsigned k_size)
 Translate k-mer code into ATGC DNA string.
template<typename BV>
void generate_k_mer_bvector (BV &bv, const vector_char_type &seq_vect, unsigned k_size, std::vector< bm::id64_t > &k_buf, const bm::id64_t chunk_size=400000000)
 This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector) The natural limitation here is that integer has to be less tha 48-bits (limitations of bm::bvector<>) This method build a presense k-mer fingerprint vector which can be used for Jaccard distance comparison.
std::atomic_ullong k_mer_progress_count (0)
static void generate_k_mers (CSequenceColl &seq_coll, unsigned k_size, size_t from, size_t to)
static void generate_k_mers_parallel (CSequenceColl &seq_coll, unsigned k_size, unsigned concurrency)
void resolve_duplicates (CSeqGroup &seq_group1, CSeqGroup &seq_group2, const CSequenceColl &seq_coll)
 Resolve duplicate members between two groups.
static void compute_and_sim_row (unsigned *row, const bm::bvector<> *bv_i, size_t i, const std::vector< std::unique_ptr< bvector_type > > &k_mers_vect)
 Compute similarity distances for one row/vector (1:N) of distance matrix.
static void compute_and_sim (distance_matrix_type &dm, const CSequenceColl &seq_coll, const bm::bvector<> &bv_mem, bm::bvector<>::size_type bv_mem_cnt, unsigned concurrency)
 Compute similarity distances matrix (COUNT(AND(a, b)).
static void compute_seq_group_union (CSeqGroup &seq_group, const CSequenceColl &seq_coll)
 Compute union (Universe) of all k-mers in the cluster group Implemented as a OR of all k-mer fingerprints.
static void compute_group (CSeqGroup &seq_group, const CSequenceColl &seq_coll, const bm::bvector<> &bv_exceptions, float similarity_cut_off)
static void assign_to_best_cluster (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
 Compute AND similarity to all available clusters assign to the most similar using cluster representative.
static void assign_to_best_cluster_union (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_seq_ids, bm::bvector<>::size_type seq_id_from, bm::bvector<>::size_type seq_id_to)
 Compute AND similarity to all available clusters assign to the most similar using UNION of k-mers in the cluster This is a more relaxed assignmnet, used when representative does not work.
static void compute_random_clusters (CSeqClusters &cluster_groups, const CSequenceColl &seq_coll, const bm::bvector<> &bv_total, bm::random_subset< bvector_type > &rsub, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
 Pick random sequences as cluster seed elements, try attach initial sequences based on weighted similarity measure.
static void compute_jaccard_clusters (CSeqClusters &seq_clusters, const CSequenceColl &seq_coll, unsigned num_clust, float similarity_cut_off, unsigned concurrency)
int main (int argc, char *argv[])

Variables

std::string ifa_name
std::string ikd_name
std::string ikd_counts_name
std::string kh_name
std::string ikd_rep_name
std::string ikd_freq_name
bool is_diag = false
bool is_timing = false
bool is_bench = false
unsigned ik_size = 8
unsigned parallel_jobs = 4
unsigned f_percent = 5
bm::chrono_taker ::duration_map_type timing_map

Detailed Description

Example: Use of bvector<> for k-mer fingerprint K should be short, no minimizers here (k < 24).

Example loads FASTA file (large multi-molecule file is expected, builds a collection of k-mers for each molecule and runs clusterization algorithm on the input collection using set intersect (logical AND) as a similarity measure.

Example uses std::async for running parallel jobs.

Definition in file xsample07a.cpp.

Typedef Documentation

◆ bv_ranges_vector

typedef std::vector<std::pair<bv_size_type, bv_size_type> > bv_ranges_vector

Definition at line 99 of file xsample07a.cpp.

◆ bv_size_type

Definition at line 98 of file xsample07a.cpp.

◆ bvector_ptr_vector_type

typedef std::vector<std::unique_ptr<bvector_type> > bvector_ptr_vector_type
Examples
xsample07a.cpp.

Definition at line 97 of file xsample07a.cpp.

◆ bvector_type

◆ distance_matrix_type

typedef bm::dynamic_heap_matrix<unsigned, bm::bvector<>::allocator_type> distance_matrix_type
Examples
xsample07a.cpp.

Definition at line 96 of file xsample07a.cpp.

◆ vector_char_type

typedef std::vector<char> vector_char_type
Examples
xsample07a.cpp.

Definition at line 95 of file xsample07a.cpp.

Function Documentation

◆ assign_to_best_cluster()

void assign_to_best_cluster ( CSeqClusters & cluster_groups,
const CSequenceColl & seq_coll,
const bm::bvector<> & bv_seq_ids,
bm::bvector<>::size_type seq_id_from,
bm::bvector<>::size_type seq_id_to )
static

◆ assign_to_best_cluster_union()

void assign_to_best_cluster_union ( CSeqClusters & cluster_groups,
const CSequenceColl & seq_coll,
const bm::bvector<> & bv_seq_ids,
bm::bvector<>::size_type seq_id_from,
bm::bvector<>::size_type seq_id_to )
static

Compute AND similarity to all available clusters assign to the most similar using UNION of k-mers in the cluster This is a more relaxed assignmnet, used when representative does not work.

Examples
xsample07a.cpp.

Definition at line 1315 of file xsample07a.cpp.

References CSeqGroup::add_member_sync(), BM_DECLARE_TEMP_BLOCK, CSeqGroup::count_and_union_sync(), bm::deserialize(), CSequenceColl::get_buf(), CSeqClusters::get_group(), bm::bvector< Alloc >::enumerator::go_to(), CSeqClusters::groups_size(), bm::bvector< Alloc >::set_allocator_pool(), and bm::bvector< Alloc >::iterator_base::valid().

Referenced by compute_jaccard_clusters().

◆ compute_and_sim()

void compute_and_sim ( distance_matrix_type & dm,
const CSequenceColl & seq_coll,
const bm::bvector<> & bv_mem,
bm::bvector<>::size_type bv_mem_cnt,
unsigned concurrency )
static

Compute similarity distances matrix (COUNT(AND(a, b)).

Examples
xsample07a.cpp.

Definition at line 910 of file xsample07a.cpp.

References compute_and_sim_row(), CSequenceColl::deserialize_k_mers(), bm::random_subset< BV >::sample(), and wait_for_slot().

Referenced by CSeqClusters::elect_leaders().

◆ compute_and_sim_row()

void compute_and_sim_row ( unsigned * row,
const bm::bvector<> * bv_i,
size_t i,
const std::vector< std::unique_ptr< bvector_type > > & k_mers_vect )
static

Compute similarity distances for one row/vector (1:N) of distance matrix.

Examples
xsample07a.cpp.

Definition at line 890 of file xsample07a.cpp.

References bm::bvector< Alloc >::count(), and bm::count_and().

Referenced by compute_and_sim().

◆ compute_group()

◆ compute_jaccard_clusters()

◆ compute_random_clusters()

void compute_random_clusters ( CSeqClusters & cluster_groups,
const CSequenceColl & seq_coll,
const bm::bvector<> & bv_total,
bm::random_subset< bvector_type > & rsub,
unsigned num_clust,
float similarity_cut_off,
unsigned concurrency )
static

Pick random sequences as cluster seed elements, try attach initial sequences based on weighted similarity measure.

Examples
xsample07a.cpp.

Definition at line 1374 of file xsample07a.cpp.

References CSeqClusters::add_group(), compute_group(), bm::bvector< Alloc >::first(), bm::random_subset< BV >::sample(), bm::bvector< Alloc >::iterator_base::valid(), and wait_for_slot().

Referenced by compute_jaccard_clusters().

◆ compute_seq_group_union()

void compute_seq_group_union ( CSeqGroup & seq_group,
const CSequenceColl & seq_coll )
static

Compute union (Universe) of all k-mers in the cluster group Implemented as a OR of all k-mer fingerprints.

Examples
xsample07a.cpp.

Definition at line 973 of file xsample07a.cpp.

References BM_DECLARE_TEMP_BLOCK, bm::bvector< Alloc >::clear(), bm::deserialize(), CSequenceColl::get_buf(), CSeqGroup::get_kmer_union(), CSeqGroup::get_members(), bm::bvector< Alloc >::optimize(), and bm::bvector< Alloc >::iterator_base::valid().

Referenced by CSeqClusters::elect_leaders().

◆ generate_k_mer_bvector()

template<typename BV>
void generate_k_mer_bvector ( BV & bv,
const vector_char_type & seq_vect,
unsigned k_size,
std::vector< bm::id64_t > & k_buf,
const bm::id64_t chunk_size = 400000000 )

This function turns each k-mer into an integer number and encodes it in a bit-vector (presense vector) The natural limitation here is that integer has to be less tha 48-bits (limitations of bm::bvector<>) This method build a presense k-mer fingerprint vector which can be used for Jaccard distance comparison.

Parameters
bv- [out] - target bit-vector
seq_vect- [out] DNA sequence vector
k-size- dimention for k-mer generation
k_buf- sort buffer for generated k-mers
chunk_size- sort buffer size (number of k-mers per sort)

Definition at line 474 of file xsample07a.cpp.

References bm::BM_SORTED, get_DNA_code(), and get_kmer_code().

Referenced by generate_k_mers().

◆ generate_k_mers()

◆ generate_k_mers_parallel()

void generate_k_mers_parallel ( CSequenceColl & seq_coll,
unsigned k_size,
unsigned concurrency )
static

◆ get_DNA_code()

bool get_DNA_code ( char bp,
bm::id64_t & dna_code )
inline

Definition at line 374 of file xsample07a.cpp.

Referenced by generate_k_mer_bvector(), and get_kmer_code().

◆ get_kmer_code()

bool get_kmer_code ( const char * dna,
size_t pos,
unsigned k_size,
bm::id64_t & k_mer )
inline

Calculate k-mer as an unsigned long integer.

Returns
true - if k-mer is "true" (not 'NNNNNN')

Definition at line 402 of file xsample07a.cpp.

References get_DNA_code().

Referenced by generate_k_mer_bvector().

◆ int2DNA()

char int2DNA ( unsigned code)
inline

Translate integer code to DNA letter.

Definition at line 429 of file xsample07a.cpp.

Referenced by translate_kmer().

◆ k_mer_progress_count()

std::atomic_ullong k_mer_progress_count ( 0 )

◆ load_FASTA()

int load_FASTA ( const std::string & fname,
CSequenceColl & seq_coll )
static

Load multi-sequence FASTA.

Definition at line 244 of file xsample07a.cpp.

References CSequenceColl::add_sequence().

Referenced by main().

◆ load_kmer_buffers()

void load_kmer_buffers ( const std::string & fname,
CSequenceColl & seq_coll )
static

Load k-mer vectors.

Examples
xsample07a.cpp.

Definition at line 332 of file xsample07a.cpp.

References CSequenceColl::set_buffer().

Referenced by main().

◆ main()

◆ resolve_duplicates()

◆ save_kmer_buffers()

void save_kmer_buffers ( const std::string & fname,
const CSequenceColl & seq_coll )
static

save k-mer vectors to a file

Examples
xsample07a.cpp.

Definition at line 294 of file xsample07a.cpp.

References CSequenceColl::get_buf(), CSequenceColl::get_buf_size(), and CSequenceColl::size().

Referenced by main().

◆ translate_kmer()

void translate_kmer ( std::string & dna,
bm::id64_t kmer_code,
unsigned k_size )
inline

Translate k-mer code into ATGC DNA string.

Parameters
dna- target string
k_mer- k-mer code
k_size-

Definition at line 444 of file xsample07a.cpp.

References int2DNA().

◆ wait_for_slot()

template<typename FV>
void wait_for_slot ( FV & futures,
unsigned * parallel_cnt,
unsigned concurrency )

wait for any opening in a list of futures used to schedule parallel tasks with CPU overbooking control

Examples
xsample07a.cpp.

Definition at line 111 of file xsample07a.cpp.

Referenced by compute_and_sim(), and compute_random_clusters().

Variable Documentation

◆ f_percent

unsigned f_percent = 5

Definition at line 86 of file xsample07a.cpp.

◆ ifa_name

std::string ifa_name

Definition at line 75 of file xsample07a.cpp.

◆ ik_size

unsigned ik_size = 8

Definition at line 84 of file xsample07a.cpp.

◆ ikd_counts_name

std::string ikd_counts_name

Definition at line 77 of file xsample07a.cpp.

◆ ikd_freq_name

std::string ikd_freq_name

Definition at line 80 of file xsample07a.cpp.

◆ ikd_name

std::string ikd_name

Definition at line 76 of file xsample07a.cpp.

◆ ikd_rep_name

std::string ikd_rep_name

Definition at line 79 of file xsample07a.cpp.

◆ is_bench

bool is_bench = false

Definition at line 83 of file xsample07a.cpp.

◆ is_diag

bool is_diag = false

Definition at line 81 of file xsample07a.cpp.

◆ is_timing

bool is_timing = false

Definition at line 82 of file xsample07a.cpp.

◆ kh_name

std::string kh_name

Definition at line 78 of file xsample07a.cpp.

◆ parallel_jobs

unsigned parallel_jobs = 4

Definition at line 85 of file xsample07a.cpp.

◆ timing_map

bm::chrono_taker ::duration_map_type timing_map

Definition at line 104 of file xsample07a.cpp.