1#ifndef BTLLIB_BLOOM_FILTER_HPP 
    2#define BTLLIB_BLOOM_FILTER_HPP 
    4#include "btllib/nthash.hpp" 
   18static const uint8_t BIT_MASKS[CHAR_BIT] = {
 
   20  0x01, 0x02, 0x04, 0x08, 
 
   21  0x10, 0x20, 0x40, 0x80  
 
   24static const char* 
const BLOOM_FILTER_SIGNATURE = 
"[BTLBloomFilter_v6]";
 
   25static const char* 
const KMER_BLOOM_FILTER_SIGNATURE =
 
   26  "[BTLKmerBloomFilter_v6]";
 
   27static const char* 
const SEED_BLOOM_FILTER_SIGNATURE =
 
   28  "[BTLSeedBloomFilter_v6]";
 
   29static const char* 
const HASH_FN = NTHASH_FN_NAME;
 
   31static const unsigned MAX_HASH_VALUES = 1024;
 
   32static const unsigned PLACEHOLDER_NEWLINES = 50;
 
   35class BloomFilterInitializer
 
   39  BloomFilterInitializer(
const std::string& path, 
const std::string& signature)
 
   42    , table(parse_header(signature))
 
   45  static bool check_file_signature(std::ifstream& ifs,
 
   46                                   const std::string& expected_signature,
 
   47                                   std::string& file_signature);
 
   51  std::shared_ptr<cpptoml::table> table;
 
   53  BloomFilterInitializer(
const BloomFilterInitializer&) = 
delete;
 
   54  BloomFilterInitializer(BloomFilterInitializer&&) = 
default;
 
   56  BloomFilterInitializer& operator=(
const BloomFilterInitializer&) = 
delete;
 
   57  BloomFilterInitializer& operator=(BloomFilterInitializer&&) = 
default;
 
   62  std::shared_ptr<cpptoml::table> parse_header(
const std::string& signature);
 
   80  BloomFilter(
size_t bytes, 
unsigned hash_num, std::string hash_fn = 
"");
 
  108  void insert(
const std::vector<uint64_t>& hashes) { 
insert(hashes.data()); }
 
  127  bool contains(
const std::vector<uint64_t>& hashes)
 const 
  172  void save(
const std::string& path);
 
  174  static void save(
const std::string& path,
 
  175                   const cpptoml::table& table,
 
  186    return check_file_signature(path, BLOOM_FILTER_SIGNATURE);
 
  189  static bool check_file_signature(
const std::string& path,
 
  190                                   const std::string& signature);
 
  193  BloomFilter(
const std::shared_ptr<BloomFilterInitializer>& bfi);
 
  201  size_t array_bits = 0;
 
  202  unsigned hash_num = 0;
 
  204  std::unique_ptr<std::atomic<uint8_t>[]> array;
 
  245  void insert(
const char* seq, 
size_t seq_len);
 
  252  void insert(
const std::string& seq) { 
insert(seq.c_str(), seq.size()); }
 
  267  void insert(
const std::vector<uint64_t>& hashes)
 
  269    bloom_filter.
insert(hashes);
 
  280  unsigned contains(
const char* seq, 
size_t seq_len) 
const;
 
  291    return contains(seq.c_str(), seq.size());
 
  302    return bloom_filter.
contains(hashes);
 
  310  bool contains(
const std::vector<uint64_t>& hashes)
 const 
  312    return bloom_filter.
contains(hashes);
 
  373  unsigned get_k()
 const { 
return k; }
 
  384  void save(
const std::string& path);
 
  393    return btllib::BloomFilter::check_file_signature(
 
  394      path, KMER_BLOOM_FILTER_SIGNATURE);
 
  426                  const std::vector<std::string>& seeds,
 
  427                  unsigned hash_num_per_seed);
 
  448  void insert(
const char* seq, 
size_t seq_len);
 
  455  void insert(
const std::string& seq) { 
insert(seq.c_str(), seq.size()); }
 
  463  void insert(
const uint64_t* hashes) { kmer_bloom_filter.
insert(hashes); }
 
  470  void insert(
const std::vector<uint64_t>& hashes)
 
  472    kmer_bloom_filter.
insert(hashes);
 
  485  std::vector<std::vector<unsigned>> 
contains(
const char* seq,
 
  486                                              size_t seq_len) 
const;
 
  497  std::vector<std::vector<unsigned>> 
contains(
const std::string& seq)
 const 
  499    return contains(seq.c_str(), seq.size());
 
  511    return kmer_bloom_filter.
contains(hashes);
 
  520  bool contains(
const std::vector<uint64_t>& hashes)
 const 
  522    return kmer_bloom_filter.
contains(hashes);
 
  599  const std::vector<std::string>& 
get_seeds()
 const { 
return seeds; }
 
  626  void save(
const std::string& path);
 
  635    return btllib::BloomFilter::check_file_signature(
 
  636      path, SEED_BLOOM_FILTER_SIGNATURE);
 
  642  std::vector<std::string> seeds;
 
  643  std::vector<SpacedSeed> parsed_seeds;
 
Definition: bloom_filter.hpp:67
bool contains(const uint64_t *hashes) const
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:127
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:108
void insert(const uint64_t *hashes)
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:184
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:165
unsigned get_hash_num() const
Definition: bloom_filter.hpp:161
BloomFilter(size_t bytes, unsigned hash_num, std::string hash_fn="")
void save(const std::string &path)
size_t get_bytes() const
Definition: bloom_filter.hpp:155
double get_occupancy() const
bool contains_insert(const uint64_t *hashes)
BloomFilter()
Definition: bloom_filter.hpp:71
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:149
uint64_t get_pop_cnt() const
BloomFilter(const std::string &path)
Definition: bloom_filter.hpp:211
void insert(const char *seq, size_t seq_len)
unsigned contains_insert(const char *seq, size_t seq_len)
double get_fpr() const
Definition: bloom_filter.hpp:371
BloomFilter & get_bloom_filter()
Definition: bloom_filter.hpp:377
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:267
void insert(const std::string &seq)
Definition: bloom_filter.hpp:252
unsigned get_hash_num() const
Definition: bloom_filter.hpp:369
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:391
unsigned contains(const char *seq, size_t seq_len) const
unsigned contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:332
KmerBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:365
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:345
KmerBloomFilter()
Definition: bloom_filter.hpp:215
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:300
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:260
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:375
void save(const std::string &path)
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:357
size_t get_bytes() const
Definition: bloom_filter.hpp:363
double get_occupancy() const
Definition: bloom_filter.hpp:367
unsigned contains(const std::string &seq) const
Definition: bloom_filter.hpp:289
unsigned get_k() const
Definition: bloom_filter.hpp:373
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:310
KmerBloomFilter(size_t bytes, unsigned hash_num, unsigned k)
Definition: bloom_filter.hpp:410
unsigned get_total_hash_num() const
Definition: bloom_filter.hpp:589
double get_occupancy() const
Definition: bloom_filter.hpp:586
bool contains(const uint64_t *hashes) const
Definition: bloom_filter.hpp:509
std::vector< std::vector< unsigned > > contains_insert(const std::string &seq)
Definition: bloom_filter.hpp:549
void insert(const char *seq, size_t seq_len)
bool contains(const std::vector< uint64_t > &hashes) const
Definition: bloom_filter.hpp:520
std::vector< std::vector< unsigned > > contains_insert(const char *seq, size_t seq_len)
void save(const std::string &path)
SeedBloomFilter(size_t bytes, unsigned k, const std::vector< std::string > &seeds, unsigned hash_num_per_seed)
const std::vector< SpacedSeed > & get_parsed_seeds() const
Definition: bloom_filter.hpp:602
KmerBloomFilter & get_kmer_bloom_filter()
Definition: bloom_filter.hpp:619
void insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:470
bool contains_insert(const std::vector< uint64_t > &hashes)
Definition: bloom_filter.hpp:576
static bool is_bloom_file(const std::string &path)
Definition: bloom_filter.hpp:633
unsigned get_hash_num_per_seed() const
Definition: bloom_filter.hpp:607
SeedBloomFilter(const std::string &path)
uint64_t get_pop_cnt() const
Definition: bloom_filter.hpp:584
size_t get_bytes() const
Definition: bloom_filter.hpp:582
unsigned get_k() const
Definition: bloom_filter.hpp:597
void insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:463
const std::vector< std::string > & get_seeds() const
Definition: bloom_filter.hpp:599
const std::string & get_hash_fn() const
Definition: bloom_filter.hpp:614
std::vector< std::vector< unsigned > > contains(const char *seq, size_t seq_len) const
SeedBloomFilter()
Definition: bloom_filter.hpp:414
unsigned get_hash_num() const
Definition: bloom_filter.hpp:612
std::vector< std::vector< unsigned > > contains(const std::string &seq) const
Definition: bloom_filter.hpp:497
void insert(const std::string &seq)
Definition: bloom_filter.hpp:455
bool contains_insert(const uint64_t *hashes)
Definition: bloom_filter.hpp:563
Definition: bloom_filter.hpp:16