1#ifndef BTLLIB_COUNTING_BLOOM_FILTER_HPP 
    2#define BTLLIB_COUNTING_BLOOM_FILTER_HPP 
    4#include "btllib/bloom_filter.hpp" 
    5#include "btllib/counting_bloom_filter.hpp" 
    6#include "btllib/nthash.hpp" 
    7#include "btllib/status.hpp" 
   /**
    * Signature string identifying serialized CountingBloomFilter files.
    * In this file it is passed to BloomFilter::check_file_signature() and to
    * the BloomFilterInitializer used when loading from a path. save() also
    * derives the header table key from it via substr(1, size - 2), i.e. the
    * same text without its first and last character (the square brackets).
    */
   23static const char* 
const COUNTING_BLOOM_FILTER_SIGNATURE =
 
   24  "[BTLCountingBloomFilter_v5]";
 
   /**
    * Signature string identifying serialized KmerCountingBloomFilter files.
    * In this file it is passed to BloomFilter::check_file_signature() and to
    * the BloomFilterInitializer used when loading from a path. save() also
    * derives the header table key from it via substr(1, size - 2), i.e. the
    * same text without its first and last character (the square brackets).
    */
   26static const char* 
const KMER_COUNTING_BLOOM_FILTER_SIGNATURE =
 
   27  "[BTLKmerCountingBloomFilter_v5]";
 
   30class KmerCountingBloomFilter;
 
   54                      std::string hash_fn = 
"");
 
   /**
    * Insert an element given its precomputed hash values.
    *
    * @param hashes Pointer to the element's hash values. The implementations
    * visible in this file index hashes[0] .. hashes[hash_num - 1], so at least
    * hash_num values must be readable.
    *
    * NOTE(review): the out-of-class definition appears to forward to
    * contains_insert(hashes) and discard the result; confirm against the
    * full source.
    */
   75  void insert(
const uint64_t* hashes);
 
   /**
    * Insert an element given its hash values stored in a vector.
    * Forwards hashes.data() to the pointer overload; the vector's size is
    * not checked here.
    *
    * @param hashes Hash values of the element (presumably at least hash_num
    * entries, as the pointer overload requires; confirm with callers).
    */
   82  void insert(
const std::vector<uint64_t>& hashes) { 
insert(hashes.data()); }
 
   /**
    * Query the count of an element given its hash values. Does not modify
    * the filter (const member).
    *
    * @param hashes Pointer to the element's hash values; hash_num values
    * are read.
    *
    * @return A counter value of type T. NOTE(review): the visible part of
    * the definition tracks the smallest counter among the probed slots
    * array[hashes[i] % array_size]; presumably that minimum is what is
    * returned. Confirm against the full source.
    */
   92  T 
contains(
const uint64_t* hashes) 
const;
 
  101  T 
contains(
const std::vector<uint64_t>& hashes)
 const 
  /**
   * False positive rate estimate for a given count threshold.
   * The definition in this file returns get_occupancy(threshold) raised to
   * the power hash_num.
   *
   * @param threshold Count threshold forwarded to get_occupancy(); defaults
   * to 1.
   *
   * @return The estimate as a double.
   */
  224  double get_fpr(T threshold = 1) 
const;
 
  /**
   * Write the filter to a file.
   * The definition visible in this file builds a TOML header table holding
   * "bytes", "hash_num", "hash_fn" (only when non-empty) and "counter_bits".
   * It stores the table under a key derived from
   * COUNTING_BLOOM_FILTER_SIGNATURE, then hands the path, the root table and
   * the raw counter array to a helper whose name is not visible here.
   *
   * @param path Destination file path.
   */
  233  void save(
const std::string& path);
 
  242    return btllib::BloomFilter::check_file_signature(
 
  243      path, COUNTING_BLOOM_FILTER_SIGNATURE);
 
  /**
   * Increment helper used by the insert/contains combinations, which call it
   * with the count they just obtained from contains().
   *
   * @param hashes Pointer to the element's hash values; hash_num values
   * are read.
   * @param min_val Count observed for the element before the update.
   *
   * NOTE(review): the visible part of the definition computes min_val + 1
   * and runs compare_exchange_strong(min_val -> min_val + 1) on each slot
   * array[hashes[i] % array_size], re-reading the count via contains() when
   * needed. The surrounding loop conditions are not visible here; confirm
   * against the full source.
   */
  249  void insert(
const uint64_t* hashes, T min_val);
 
  /** Number of counters in `array`. The constructors visible in this file
   *  initialize it to the byte size divided by sizeof(array[0]). */
  254  size_t array_size = 0;
 
  /** Number of hash values used per element. The constructor reports an
   *  error via check_error for 0 and for values above MAX_HASH_VALUES. */
  255  unsigned hash_num = 0;
 
  /** Counter storage: atomic counters of type T, allocated with array_size
   *  elements by the constructors. */
  257  std::unique_ptr<std::atomic<T>[]> array;
 
  /**
   * Insert the k-mers of a sequence.
   * The definition in this file constructs an NtHash over (seq, seq_len)
   * with get_hash_num() hashes and k-mer size get_k(). For every successful
   * roll() it inserts nthash.hashes() into the underlying counting Bloom
   * filter.
   *
   * @param seq Pointer to the sequence characters.
   * @param seq_len Number of characters in seq.
   */
  301  void insert(
const char* seq, 
size_t seq_len);
 
  /**
   * Insert the k-mers of a sequence held in a std::string.
   * Forwards seq.c_str() and seq.size() to the (const char*, size_t)
   * overload.
   *
   * @param seq Sequence to insert.
   */
  308  void insert(
const std::string& seq) { 
insert(seq.c_str(), seq.size()); }
 
  /**
   * Insert an element given its precomputed hash values.
   * Forwards directly to counting_bloom_filter.insert(hashes).
   *
   * @param hashes Pointer to the element's hash values.
   */
  316  void insert(
const uint64_t* hashes) { counting_bloom_filter.insert(hashes); }
 
  323  void insert(
const std::vector<uint64_t>& hashes)
 
  325    counting_bloom_filter.insert(hashes);
 
  /**
   * Query the k-mers of a sequence. Does not modify the filter
   * (const member).
   * The visible part of the definition rolls an NtHash over (seq, seq_len)
   * and adds counting_bloom_filter.contains(nthash.hashes()) to a running
   * sum for every k-mer.
   *
   * @param seq Pointer to the sequence characters.
   * @param seq_len Number of characters in seq.
   *
   * @return NOTE(review): presumably the accumulated sum of per-k-mer
   * counts. The return statement is not visible here; confirm.
   */
  336  uint64_t 
contains(
const char* seq, 
size_t seq_len) 
const;
 
  347    return contains(seq.c_str(), seq.size());
 
  360    return counting_bloom_filter.contains(hashes);
 
  370  T 
contains(
const std::vector<uint64_t>& hashes)
 const 
  372    return counting_bloom_filter.contains(hashes);
 
  407    return counting_bloom_filter.contains_insert(hashes);
 
  419    return counting_bloom_filter.contains_insert(hashes);
 
  455    return counting_bloom_filter.insert_contains(hashes);
 
  467    return counting_bloom_filter.insert_contains(hashes);
 
  509    return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
 
  526    return counting_bloom_filter.insert_thresh_contains(hashes, threshold);
 
  568    return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
 
  583    return counting_bloom_filter.contains_insert_thresh(hashes, threshold);
 
  /** Size in bytes as reported by the underlying counting Bloom filter's
   *  get_bytes(). */
  587  size_t get_bytes()
 const { 
return counting_bloom_filter.get_bytes(); }
 
  591    return counting_bloom_filter.get_pop_cnt(threshold);
 
  596    return counting_bloom_filter.get_occupancy(threshold);
 
  /** Number of hash values per element, as reported by the underlying
   *  counting Bloom filter's get_hash_num(). */
  599  unsigned get_hash_num()
 const { 
return counting_bloom_filter.get_hash_num(); }
 
  608    return counting_bloom_filter.get_fpr(threshold);
 
  /** Returns the member k. This is the value handed to NtHash as the k-mer
   *  size by the sequence-based methods in this file. */
  611  unsigned get_k()
 const { 
return k; }
 
  615    return counting_bloom_filter.get_hash_fn();
 
  620    return counting_bloom_filter;
 
  /**
   * Write the k-mer counting Bloom filter to a file.
   * The definition visible in this file builds a TOML header table holding
   * "bytes", "hash_num", "hash_fn", "counter_bits" and "k". It stores the
   * table under a key derived from KMER_COUNTING_BLOOM_FILTER_SIGNATURE,
   * then passes the underlying filter's raw counter array and its byte
   * length to a helper whose name is not visible here.
   *
   * @param path Destination file path.
   */
  628  void save(
const std::string& path);
 
  638    return btllib::BloomFilter::check_file_signature(
 
  639      path, KMER_COUNTING_BLOOM_FILTER_SIGNATURE);
 
  /** Convenience aliases: CountingBloomFilter instantiated with 8-, 16- and
   *  32-bit unsigned counter types. */
  649using CountingBloomFilter8 = CountingBloomFilter<uint8_t>;
 
  650using CountingBloomFilter16 = CountingBloomFilter<uint16_t>;
 
  651using CountingBloomFilter32 = CountingBloomFilter<uint32_t>;
 
  /** Convenience aliases: KmerCountingBloomFilter instantiated with 8-, 16-
   *  and 32-bit unsigned counter types. */
  653using KmerCountingBloomFilter8 = KmerCountingBloomFilter<uint8_t>;
 
  654using KmerCountingBloomFilter16 = KmerCountingBloomFilter<uint16_t>;
 
  655using KmerCountingBloomFilter32 = KmerCountingBloomFilter<uint32_t>;
 
  662      size_t(std::ceil(double(bytes) / sizeof(uint64_t)) * sizeof(uint64_t)))
 
  663  , array_size(get_bytes() / sizeof(array[0]))
 
  665  , hash_fn(std::move(hash_fn))
 
  666  , array(new std::atomic<T>[array_size])
 
  668  check_error(bytes == 0, 
"CountingBloomFilter: memory budget must be >0!");
 
  670              "CountingBloomFilter: number of hash values must be >0!");
 
  672    hash_num > MAX_HASH_VALUES,
 
  673    "CountingBloomFilter: number of hash values cannot be over 1024!");
 
  674  check_warning(
sizeof(uint8_t) != 
sizeof(std::atomic<uint8_t>),
 
  675                "Atomic primitives take extra memory. CountingBloomFilter will " 
  677                  std::to_string(bytes) + 
" for bit array.");
 
  678  std::memset((
void*)array.get(), 0, array_size * 
sizeof(array[0]));
 
  689  bool update_done = 
false;
 
  690  T new_val, tmp_min_val;
 
  692    new_val = min_val + 1;
 
  693    for (
size_t i = 0; i < hash_num; ++i) {
 
  694      tmp_min_val = min_val;
 
  695      update_done = array[hashes[i] % array_size].compare_exchange_strong(
 
  696        tmp_min_val, new_val);
 
  699        (min_val = contains(hashes)) == std::numeric_limits<T>::max()) {
 
  709  contains_insert(hashes);
 
  716  T min = array[hashes[0] % array_size];
 
  717  for (
size_t i = 1; i < hash_num; ++i) {
 
  718    const size_t idx = hashes[i] % array_size;
 
  719    if (array[idx] < min) {
 
  730  const auto count = contains(hashes);
 
  731  if (count < std::numeric_limits<T>::max()) {
 
  732    insert(hashes, count);
 
  741  const auto count = contains(hashes);
 
  742  if (count < std::numeric_limits<T>::max()) {
 
  743    insert(hashes, count);
 
  746  return std::numeric_limits<T>::max();
 
  754  const auto count = contains(hashes);
 
  755  if (count < threshold) {
 
  756    insert(hashes, count);
 
  767  const auto count = contains(hashes);
 
  768  if (count < threshold) {
 
  769    insert(hashes, count);
 
  778  uint64_t pop_cnt = 0;
 
  782#pragma omp parallel for reduction(+ : pop_cnt) 
  783  for (
size_t i = 0; i < array_size; ++i) {
 
  784    if (array[i] >= threshold) {
 
  795  return double(get_pop_cnt(threshold)) / double(array_size);
 
  802  return std::pow(get_occupancy(threshold), 
double(hash_num));
 
  808      std::make_shared<BloomFilterInitializer>(path,
 
  809                                               COUNTING_BLOOM_FILTER_SIGNATURE))
 
  814  const std::shared_ptr<BloomFilterInitializer>& bfi)
 
  815  : bytes(*bfi->table->get_as<decltype(bytes)>(
"bytes"))
 
  816  , array_size(bytes / sizeof(array[0]))
 
  817  , hash_num(*(bfi->table->get_as<decltype(hash_num)>(
"hash_num")))
 
  818  , hash_fn(bfi->table->contains(
"hash_fn")
 
  819              ? *(bfi->table->get_as<decltype(hash_fn)>(
"hash_fn"))
 
  821  , array(new std::atomic<T>[array_size])
 
  823  check_warning(
sizeof(uint8_t) != 
sizeof(std::atomic<uint8_t>),
 
  824                "Atomic primitives take extra memory. CountingBloomFilter will " 
  826                  std::to_string(bytes) + 
" for bit array.");
 
  827  const auto loaded_counter_bits =
 
  828    *(bfi->table->get_as<
size_t>(
"counter_bits"));
 
  829  check_error(
sizeof(array[0]) * CHAR_BIT != loaded_counter_bits,
 
  830              "CountingBloomFilter" +
 
  831                std::to_string(
sizeof(array[0]) * CHAR_BIT) +
 
  832                " tried to load a file of CountingBloomFilter" +
 
  833                std::to_string(loaded_counter_bits));
 
  834  bfi->ifs.read((
char*)array.get(),
 
  835                std::streamsize(array_size * 
sizeof(array[0])));
 
  846  auto root = cpptoml::make_table();
 
  850  auto header = cpptoml::make_table();
 
  851  header->insert(
"bytes", get_bytes());
 
  852  header->insert(
"hash_num", get_hash_num());
 
  853  if (!hash_fn.empty()) {
 
  854    header->insert(
"hash_fn", hash_fn);
 
  856  header->insert(
"counter_bits", 
size_t(
sizeof(array[0]) * CHAR_BIT));
 
  857  std::string header_string = COUNTING_BLOOM_FILTER_SIGNATURE;
 
  859    header_string.substr(1, header_string.size() - 2); 
 
  860  root->insert(header_string, header);
 
  863    path, *root, (
char*)array.get(), array_size * 
sizeof(array[0]));
 
  871  , counting_bloom_filter(bytes, hash_num, HASH_FN)
 
  878  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  879  while (nthash.
roll()) {
 
  880    counting_bloom_filter.insert(nthash.hashes());
 
  889  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  890  while (nthash.
roll()) {
 
  891    sum += counting_bloom_filter.contains(nthash.hashes());
 
  901  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  902  while (nthash.
roll()) {
 
  903    sum += counting_bloom_filter.contains_insert(nthash.hashes());
 
  913  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  914  while (nthash.
roll()) {
 
  915    sum += counting_bloom_filter.insert_contains(nthash.hashes());
 
  927  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  928  while (nthash.
roll()) {
 
  930      counting_bloom_filter.insert_thresh_contains(nthash.hashes(), threshold);
 
  942  NtHash nthash(seq, seq_len, get_hash_num(), get_k());
 
  943  while (nthash.
roll()) {
 
  945      counting_bloom_filter.contains_insert_thresh(nthash.hashes(), threshold);
 
  952  const std::string& path)
 
  954      std::make_shared<BloomFilterInitializer>(
 
  956        KMER_COUNTING_BLOOM_FILTER_SIGNATURE))
 
  961  const std::shared_ptr<BloomFilterInitializer>& bfi)
 
  962  : k(*(bfi->table->get_as<decltype(k)>(
"k")))
 
  963  , counting_bloom_filter(bfi)
 
  965  check_error(counting_bloom_filter.hash_fn != HASH_FN,
 
  966              "KmerCountingBloomFilter: loaded hash function (" +
 
  967                counting_bloom_filter.hash_fn +
 
  968                ") is different from the one used by default (" + HASH_FN +
 
  980  auto root = cpptoml::make_table();
 
  984  auto header = cpptoml::make_table();
 
  985  header->insert(
"bytes", get_bytes());
 
  986  header->insert(
"hash_num", get_hash_num());
 
  987  header->insert(
"hash_fn", get_hash_fn());
 
  988  header->insert(
"counter_bits",
 
  989                 size_t(
sizeof(counting_bloom_filter.array[0]) * CHAR_BIT));
 
  990  header->insert(
"k", k);
 
  991  std::string header_string = KMER_COUNTING_BLOOM_FILTER_SIGNATURE;
 
  993    header_string.substr(1, header_string.size() - 2); 
 
  994  root->insert(header_string, header);
 
  998                    (
char*)counting_bloom_filter.array.get(),
 
  999                    counting_bloom_filter.array_size *
 
 1000                      sizeof(counting_bloom_filter.array[0]));
 
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:39
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:175
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:82
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter.hpp:793
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:226
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:728
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:240
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter.hpp:800
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter.hpp:776
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:146
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:840
T contains_insert_thresh(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:764
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:101
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:210
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:217
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:714
CountingBloomFilter()
Definition: counting_bloom_filter.hpp:43
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:739
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:707
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:203
T insert_thresh_contains(const uint64_t *hashes, T threshold)
Definition: counting_bloom_filter.hpp:751
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:123
Definition: counting_bloom_filter.hpp:267
uint64_t contains(const char *seq, size_t seq_len) const
Definition: counting_bloom_filter.hpp:886
T insert_contains(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:465
T insert_thresh_contains(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:523
void save(const std::string &path)
Definition: counting_bloom_filter.hpp:974
CountingBloomFilter< T > & get_counting_bloom_filter()
Definition: counting_bloom_filter.hpp:618
T insert_contains(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:910
T insert_thresh_contains(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:491
T contains_insert_thresh(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:566
T insert_thresh_contains(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:922
T contains(const uint64_t *hashes) const
Definition: counting_bloom_filter.hpp:358
T contains_insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:405
T contains_insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:417
T contains_insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:392
size_t get_bytes() const
Definition: counting_bloom_filter.hpp:587
T contains_insert_thresh(const char *seq, size_t seq_len, T threshold)
Definition: counting_bloom_filter.hpp:937
T insert_thresh_contains(const uint64_t *hashes, const T threshold)
Definition: counting_bloom_filter.hpp:507
T contains_insert_thresh(const std::vector< uint64_t > &hashes, const T threshold)
Definition: counting_bloom_filter.hpp:580
T insert_contains(const std::string &seq)
Definition: counting_bloom_filter.hpp:439
const std::string & get_hash_fn() const
Definition: counting_bloom_filter.hpp:613
void insert(const std::vector< uint64_t > &hashes)
Definition: counting_bloom_filter.hpp:323
void insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:876
double get_occupancy(T threshold=1) const
Definition: counting_bloom_filter.hpp:594
T contains_insert(const char *seq, size_t seq_len)
Definition: counting_bloom_filter.hpp:898
unsigned get_hash_num() const
Definition: counting_bloom_filter.hpp:599
uint64_t get_pop_cnt(T threshold=1) const
Definition: counting_bloom_filter.hpp:589
T insert_contains(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:453
double get_fpr(T threshold=1) const
Definition: counting_bloom_filter.hpp:606
unsigned get_k() const
Definition: counting_bloom_filter.hpp:611
void insert(const std::string &seq)
Definition: counting_bloom_filter.hpp:308
uint64_t contains(const std::string &seq) const
Definition: counting_bloom_filter.hpp:345
void insert(const uint64_t *hashes)
Definition: counting_bloom_filter.hpp:316
T contains_insert_thresh(const std::string &seq, const T threshold)
Definition: counting_bloom_filter.hpp:550
static bool is_bloom_file(const std::string &path)
Definition: counting_bloom_filter.hpp:636
KmerCountingBloomFilter()
Definition: counting_bloom_filter.hpp:271
T contains(const std::vector< uint64_t > &hashes) const
Definition: counting_bloom_filter.hpp:370
Definition: nthash.hpp:54
Definition: bloom_filter.hpp:16
void check_error(bool condition, const std::string &msg)
void check_warning(bool condition, const std::string &msg)