|
BitMagic-C++
|
Example: Use of bvector<> for k-mer fingerprint. K should be short; no minimizers here. More...
#include <assert.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <map>
#include <algorithm>
#include <utility>
#include <future>
#include <thread>
#include <mutex>
#include <atomic>
#include "bm64.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmaggregator.h"
#include "bmsparsevec_compr.h"
#include "bmsparsevec_algo.h"
#include "bmundef.h"
#include "bmdbg.h"
#include "bmtimer.h"
#include "dna_finger.h"
#include "cmd_args.h"
Go to the source code of this file.
Data Structures | |
| class | SortCounting_JobFunctor< BV > |
| Functor to process job batch (task). More... | |
| class | Counting_JobFunctor< DNA_Scan > |
| k-mer counting job functor class using bm::aggregator<> More... | |
Typedefs | |
| typedef std::vector< char > | vector_char_type |
| typedef DNA_FingerprintScanner< bm::bvector<> > | dna_scanner_type |
| typedef bm::sparse_vector< unsigned, bm::bvector<> > | sparse_vector_u32 |
| typedef bm::rsc_sparse_vector< unsigned, sparse_vector_u32 > | rsc_sparse_vector_u32 |
| typedef std::map< unsigned, unsigned > | histogram_map_u32 |
Functions | |
| std::atomic_ullong | k_mer_progress_count (0) |
| static int | load_FASTA (const std::string &fname, vector_char_type &seq_vect) |
| really simple FASTA parser (one entry per file) | |
| bool | get_DNA_code (char bp, bm::id64_t &dna_code) |
| bool | get_kmer_code (const char *dna, size_t pos, unsigned k_size, bm::id64_t &k_mer) |
| Calculate k-mer as an unsigned long integer. | |
| char | int2DNA (unsigned code) |
| Translate integer code to DNA letter. | |
| void | translate_kmer (std::string &dna, bm::id64_t kmer_code, unsigned k_size) |
| Translate k-mer code into ATGC DNA string. | |
| void | validate_k_mer (const char *dna, size_t pos, unsigned k_size, bm::id64_t k_mer) |
| QA function to validate if reverse k-mer decode gives the same string. | |
| template<typename VECT> | |
| void | sort_unique (VECT &vect) |
| Auxiliary function to do sort+unique on a vector of ints; removes duplicate elements. | |
| template<typename VECT, typename COUNT_VECT> | |
| void | sort_count (VECT &vect, COUNT_VECT &cvect) |
| Auxiliary function to do sort+unique on a vector of ints and save results in a counts vector. | |
| template<typename BV> | |
| void | generate_k_mer_bvector (BV &bv, const vector_char_type &seq_vect, unsigned k_size, bool check) |
| This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector). The natural limitation here is that the integer has to be less than 48 bits (a limitation of bm::bvector<>). This method builds a presence k-mer fingerprint vector which can be used for Jaccard distance comparison. | |
| void | count_kmers (const vector_char_type &seq_vect, unsigned k_size, rsc_sparse_vector_u32 &kmer_counts) |
| k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts | |
| template<typename BV> | |
| void | count_kmers_parallel (const BV &bv_kmers, const vector_char_type &seq_vect, rsc_sparse_vector_u32 &kmer_counts, unsigned k_size, unsigned concurrency) |
| MT k-mer counting. | |
| template<typename BV> | |
| void | count_kmers (const BV &bv_kmers, rsc_sparse_vector_u32 &kmer_counts) |
| k-mer counting method using the Bitap algorithm for occurrence search. This method is significantly slower than direct regeneration of k-mer codes with sort and count. | |
| template<typename BV> | |
| void | count_kmers_parallel (const BV &bv_kmers, rsc_sparse_vector_u32 &kmer_counts, unsigned concurrency) |
| Runs k-mer counting in parallel. | |
| static void | compute_kmer_histogram (histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts) |
| Compute a map of how often each k-mer frequency is observed in the k-mer counts vector. | |
| static void | report_hmap (const string &fname, const histogram_map_u32 &hmap) |
| Save TSV report of k-mer frequencies (reverse sorted, most frequent k-mers first). | |
| template<typename BV> | |
| void | compute_frequent_kmers (BV &frequent_bv, const histogram_map_u32 &hmap, const rsc_sparse_vector_u32 &kmer_counts, unsigned percent, unsigned k_size) |
| Create vector, representing subset of k-mers of high frequency. | |
| int | main (int argc, char *argv[]) |
Variables | |
| std::string | ifa_name |
| std::string | ikd_name |
| std::string | ikd_counts_name |
| std::string | kh_name |
| std::string | ikd_rep_name |
| std::string | ikd_freq_name |
| bool | is_diag = false |
| bool | is_timing = false |
| bool | is_bench = false |
| unsigned | ik_size = 8 |
| unsigned | parallel_jobs = 4 |
| unsigned | f_percent = 5 |
| bm::chrono_taker ::duration_map_type | timing_map |
| dna_scanner_type | dna_scanner |
Example: Use of bvector<> for k-mer fingerprint. K should be short; no minimizers here.
Definition in file xsample07.cpp.
| typedef DNA_FingerprintScanner<bm::bvector<> > dna_scanner_type |
Definition at line 100 of file xsample07.cpp.
| typedef std::map<unsigned, unsigned> histogram_map_u32 |
Definition at line 103 of file xsample07.cpp.
| typedef bm::rsc_sparse_vector<unsigned, sparse_vector_u32 > rsc_sparse_vector_u32 |
Definition at line 102 of file xsample07.cpp.
| typedef bm::sparse_vector<unsigned, bm::bvector<> > sparse_vector_u32 |
Definition at line 101 of file xsample07.cpp.
| typedef std::vector<char> vector_char_type |
Definition at line 99 of file xsample07.cpp.
| void compute_frequent_kmers | ( | BV & | frequent_bv, |
| const histogram_map_u32 & | hmap, | ||
| const rsc_sparse_vector_u32 & | kmer_counts, | ||
| unsigned | percent, | ||
| unsigned | k_size ) |
Create vector, representing subset of k-mers of high frequency.
| frequent_bv[out] | - bit-vector of frequent k-mers (subset of all k-mers) |
| hmap | - histogram map of all k-mers |
| kmer_counts | - kmer frequency(counts) vector |
| percent | - percent of frequent k-mers to build a subset (5%); percent here is of the total number of k-mers (not percent of all occurrences) |
| k_size | - K mer size |
Definition at line 905 of file xsample07.cpp.
References bm::bvector< Alloc >::count(), bm::sparse_vector_scanner< SV, S_FACTOR >::find_eq(), bm::rsc_sparse_vector< Val, SV >::get(), bm::rsc_sparse_vector< Val, SV >::get_null_bvector(), and bm::bvector< Alloc >::iterator_base::valid().
Referenced by main().
|
static |
Compute a map of how often each k-mer frequency is observed in the k-mer counts vector.
| hmap | - [out] histogram map |
| kmer_counts | - [in] kmer counts vector |
Definition at line 859 of file xsample07.cpp.
References bm::bvector< Alloc >::first(), bm::rsc_sparse_vector< Val, SV >::get(), and bm::rsc_sparse_vector< Val, SV >::get_null_bvector().
Referenced by main().
| void count_kmers | ( | const BV & | bv_kmers, |
| rsc_sparse_vector_u32 & | kmer_counts ) |
k-mer counting method using the Bitap algorithm for occurrence search. This method is significantly slower than direct regeneration of k-mer codes with sort and count.
Definition at line 653 of file xsample07.cpp.
References dna_scanner, ik_size, bm::rsc_sparse_vector< Val, SV >::set(), and translate_kmer().
|
inline |
k-mer counting algorithm using reference sequence, regenerates k-mer codes, sorts them and counts
Definition at line 408 of file xsample07.cpp.
References get_DNA_code(), get_kmer_code(), and sort_count().
Referenced by count_kmers_parallel(), and count_kmers_parallel().
| void count_kmers_parallel | ( | const BV & | bv_kmers, |
| const vector_char_type & | seq_vect, | ||
| rsc_sparse_vector_u32 & | kmer_counts, | ||
| unsigned | k_size, | ||
| unsigned | concurrency ) |
MT k-mer counting.
Definition at line 594 of file xsample07.cpp.
References count_kmers(), ik_size, and bm::rank_range_split().
Referenced by main().
| void count_kmers_parallel | ( | const BV & | bv_kmers, |
| rsc_sparse_vector_u32 & | kmer_counts, | ||
| unsigned | concurrency ) |
Runs k-mer counting in parallel.
Definition at line 781 of file xsample07.cpp.
References count_kmers(), dna_scanner, k_mer_progress_count(), and bm::rank_range_split().
| void generate_k_mer_bvector | ( | BV & | bv, |
| const vector_char_type & | seq_vect, | ||
| unsigned | k_size, | ||
| bool | check ) |
This function turns each k-mer into an integer number and encodes it in a bit-vector (presence vector). The natural limitation here is that the integer has to be less than 48 bits (a limitation of bm::bvector<>). This method builds a presence k-mer fingerprint vector which can be used for Jaccard distance comparison.
| bv | - [out] - target bit-vector |
| seq_vect | - [in] DNA sequence vector |
| k_size | - dimension for k-mer generation |
Definition at line 306 of file xsample07.cpp.
References bm::BM_SORTED, get_DNA_code(), get_kmer_code(), sort_unique(), timing_map, and validate_k_mer().
Referenced by main().
|
inline |
Definition at line 138 of file xsample07.cpp.
Referenced by count_kmers(), generate_k_mer_bvector(), get_kmer_code(), and SortCounting_JobFunctor< BV >::operator()().
|
inline |
Calculate k-mer as an unsigned long integer.
Definition at line 165 of file xsample07.cpp.
References get_DNA_code().
Referenced by count_kmers(), generate_k_mer_bvector(), and SortCounting_JobFunctor< BV >::operator()().
|
inline |
Translate integer code to DNA letter.
Definition at line 192 of file xsample07.cpp.
Referenced by translate_kmer(), and validate_k_mer().
| std::atomic_ullong k_mer_progress_count | ( | 0 | ) |
Referenced by count_kmers_parallel(), generate_k_mers(), generate_k_mers_parallel(), and Counting_JobFunctor< DNA_Scan >::operator()().
|
static |
really simple FASTA parser (one entry per file)
Definition at line 116 of file xsample07.cpp.
References timing_map.
Referenced by main().
| int main | ( | int | argc, |
| char * | argv[] ) |
Definition at line 966 of file xsample07.cpp.
References bm::bvector< Alloc >::bit_sub(), bm::BM_GAP, compute_frequent_kmers(), compute_kmer_histogram(), bm::bvector< Alloc >::count(), count_kmers_parallel(), bm::chrono_taker< TOut >::ct_time, dna_scanner, bm::rsc_sparse_vector< Val, SV >::equal(), f_percent, generate_k_mer_bvector(), bm::rsc_sparse_vector< Val, SV >::get(), ifa_name, ik_size, ikd_counts_name, ikd_freq_name, ikd_name, ikd_rep_name, is_diag, is_timing, kh_name, load_FASTA(), bm::bvector< Alloc >::optimize(), bm::rsc_sparse_vector< Val, SV >::optimize(), parallel_jobs, parse_args(), bm::chrono_taker< TOut >::print_duration_map(), report_hmap(), bm::sparse_vector_find_first_mismatch(), bm::rsc_sparse_vector< Val, SV >::sync(), and timing_map.
|
static |
Save TSV report of k-mer frequencies (reverse sorted, most frequent k-mers first).
Definition at line 881 of file xsample07.cpp.
Referenced by main().
| void sort_count | ( | VECT & | vect, |
| COUNT_VECT & | cvect ) |
Auxiliary function to do sort+unique on a vector of ints and save results in a counts vector.
Definition at line 268 of file xsample07.cpp.
Referenced by count_kmers(), and SortCounting_JobFunctor< BV >::operator()().
| void sort_unique | ( | VECT & | vect | ) |
Auxiliary function to do sort+unique on a vector of ints; removes duplicate elements.
Definition at line 256 of file xsample07.cpp.
Referenced by generate_k_mer_bvector().
|
inline |
Translate k-mer code into ATGC DNA string.
| dna | - target string |
| kmer_code | - k-mer code |
| k_size | - K mer size |
Definition at line 207 of file xsample07.cpp.
References int2DNA().
Referenced by count_kmers(), and Counting_JobFunctor< DNA_Scan >::operator()().
|
inline |
QA function to validate if reverse k-mer decode gives the same string.
Definition at line 224 of file xsample07.cpp.
References int2DNA().
Referenced by generate_k_mer_bvector().
| dna_scanner_type dna_scanner |
Definition at line 109 of file xsample07.cpp.
Referenced by count_kmers(), count_kmers_parallel(), and main().
| unsigned f_percent = 5 |
Definition at line 91 of file xsample07.cpp.
Referenced by main().
| std::string ifa_name |
Definition at line 80 of file xsample07.cpp.
| unsigned ik_size = 8 |
Definition at line 89 of file xsample07.cpp.
Referenced by count_kmers(), count_kmers_parallel(), main(), and Counting_JobFunctor< DNA_Scan >::operator()().
| std::string ikd_counts_name |
Definition at line 82 of file xsample07.cpp.
Referenced by main().
| std::string ikd_freq_name |
Definition at line 85 of file xsample07.cpp.
Referenced by main().
| std::string ikd_name |
Definition at line 81 of file xsample07.cpp.
Referenced by main().
| std::string ikd_rep_name |
Definition at line 84 of file xsample07.cpp.
Referenced by main().
| bool is_bench = false |
Definition at line 88 of file xsample07.cpp.
| bool is_diag = false |
Definition at line 86 of file xsample07.cpp.
| bool is_timing = false |
Definition at line 87 of file xsample07.cpp.
| std::string kh_name |
Definition at line 83 of file xsample07.cpp.
Referenced by main().
| unsigned parallel_jobs = 4 |
Definition at line 90 of file xsample07.cpp.
| bm::chrono_taker ::duration_map_type timing_map |
Definition at line 108 of file xsample07.cpp.