1#ifndef BTLLIB_INDEXLR_HPP 
    2#define BTLLIB_INDEXLR_HPP 
    4#include "btllib/bloom_filter.hpp" 
    5#include "btllib/nthash.hpp" 
    6#include "btllib/order_queue.hpp" 
    7#include "btllib/seq_reader.hpp" 
    8#include "btllib/status.hpp" 
    9#include "btllib/util.hpp" 
   36    static const unsigned NO_ID = 1;
 
   38    static const unsigned BX = 2;
 
   40    static const unsigned SEQ = 4;
 
   55  bool output_id()
 const { 
return bool(~flags & 
Flag::NO_ID); }
 
   56  bool output_bx()
 const { 
return bool(flags & 
Flag::BX); }
 
   57  bool output_seq()
 const { 
return bool(flags & 
Flag::SEQ); }
 
   79    uint64_t min_hash = 0, out_hash = 0;
 
   95           std::vector<Minimizer> minimizers)
 
   98      , barcode(std::move(barcode))
 
  100      , minimizers(std::move(minimizers))
 
  107    std::vector<Minimizer> minimizers;
 
  109    operator bool()
 const 
  111      return !
id.empty() || !barcode.empty() || !minimizers.empty();
 
  139          unsigned threads = 5,
 
  140          bool verbose = 
false,
 
  146  void close() noexcept;
 
  148  static const 
size_t MAX_SIMULTANEOUS_INDEXLRS = 256;
 
  155    void operator++() { record = indexlr.read(); }
 
  156    bool operator!=(
const RecordIterator& i)
 
  158      return bool(record) || bool(i.record);
 
  160    Record operator*() { 
return std::move(record); }
 
  164      auto val = operator*();
 
  172    RecordIterator(
Indexlr& indexlr, 
bool end)
 
  185  RecordIterator 
begin() { 
return RecordIterator(*
this, 
false); }
 
  186  RecordIterator end() { 
return RecordIterator(*
this, 
true); }
 
  189  static std::string extract_barcode(
const std::string& 
id,
 
  190                                     const std::string& comment);
 
  191  static void filter_hashed_kmer(Indexlr::HashedKmer& hk,
 
  194                                 const BloomFilter& filter_in_bf,
 
  195                                 const BloomFilter& filter_out_bf);
 
  196  static void calc_minimizer(
 
  197    const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
 
  198    const Indexlr::Minimizer*& min_current,
 
  200    ssize_t& min_idx_left,
 
  201    ssize_t& min_idx_right,
 
  202    ssize_t& min_pos_prev,
 
  204    std::vector<Indexlr::Minimizer>& minimizers);
 
  205  std::vector<Minimizer> minimize(
const std::string& seq) 
const;
 
  207  const std::string seqfile;
 
  209  const unsigned flags;
 
  212  std::atomic<bool> closed{ 
false };
 
  214  static const BloomFilter& dummy_bf()
 
  216    static const BloomFilter var;
 
  220  const std::reference_wrapper<const BloomFilter> filter_in_bf;
 
  221  const std::reference_wrapper<const BloomFilter> filter_out_bf;
 
  222  bool filter_in_enabled;
 
  223  bool filter_out_enabled;
 
  226  OrderQueueMPSC<Record> output_queue;
 
  228  using OutputQueueType = 
decltype(output_queue);
 
  229  static std::unique_ptr<OutputQueueType::Block>* ready_blocks_array()
 
  231    thread_local static std::unique_ptr<
decltype(output_queue)::Block>
 
  232      var[MAX_SIMULTANEOUS_INDEXLRS];
 
  236  static long* ready_blocks_owners()
 
  238    thread_local static long var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
 
  242  static size_t* ready_blocks_current()
 
  244    thread_local static size_t var[MAX_SIMULTANEOUS_INDEXLRS] = { 0 };
 
  248  static std::atomic<long>& last_id()
 
  250    static std::atomic<long> var(0);
 
  257    void start() { t = std::thread(do_work, 
this); }
 
  258    void join() { t.join(); }
 
  259    void set_id(
const int id) { this->
id = id; }
 
  261    Worker& operator=(
const Worker& worker) = 
delete;
 
  262    Worker& operator=(Worker&& worker) = 
delete;
 
  267    Worker(
const Worker& worker)
 
  268      : Worker(worker.indexlr)
 
  270    Worker(Worker&& worker) noexcept
 
  271      : Worker(worker.indexlr)
 
  276    static void do_work(Worker* worker) { worker->work(); }
 
  283  std::vector<Worker> workers;
 
  285  std::mutex last_block_num_mutex;
 
  286  uint64_t last_block_num = 0;
 
  287  bool last_block_num_valid = 
false;
 
  293                        const unsigned flags,
 
  294                        const unsigned threads,
 
  298  : seqfile(std::move(seqfile))
 
  304  , filter_in_bf(filter_in() ? bf1 : 
Indexlr::dummy_bf())
 
  305  , filter_out_bf(filter_out() ? filter_in() ? bf2 : bf1 : 
Indexlr::dummy_bf())
 
  306  , filter_in_enabled(filter_in())
 
  307  , filter_out_enabled(filter_out())
 
  308  , reader(this->seqfile,
 
  311  , output_queue(reader.get_buffer_size(), reader.get_block_size())
 
  312  , workers(std::vector<Worker>(threads, Worker(*this)))
 
  313  , end_barrier(threads)
 
  316              "Indexlr: no mode selected, either short or long mode flag must " 
  319              "Indexlr: short and long mode are mutually exclusive.");
 
  321              "Indexlr: Number of processing threads cannot be 0.");
 
  323  for (
auto& worker : workers) {
 
  324    worker.set_id(id_counter++);
 
  329inline Indexlr::~Indexlr()
 
  335Indexlr::close() noexcept
 
  337  bool closed_expected = 
false;
 
  338  if (closed.compare_exchange_strong(closed_expected, 
true)) {
 
  341      output_queue.close();
 
  342      for (
auto& worker : workers) {
 
  345    } 
catch (
const std::system_error& e) {
 
  346      log_error(
"Indexlr thread join failure: " + std::string(e.what()));
 
  347      std::exit(EXIT_FAILURE); 
 
  380Indexlr::extract_barcode(
const std::string& 
id, 
const std::string& comment)
 
  382  const static std::string barcode_prefix = 
"BX:Z:";
 
  384    const auto space_pos = comment.find(
' ');
 
  385    if (space_pos != std::string::npos) {
 
  386      return comment.substr(barcode_prefix.size(),
 
  387                            space_pos - barcode_prefix.size());
 
  389    return comment.substr(barcode_prefix.size());
 
  391  const auto pound_pos = 
id.find(
'#');
 
  392  if (pound_pos != std::string::npos) {
 
  393    const auto slash_pos = 
id.find(
'/');
 
  394    if (slash_pos > pound_pos) {
 
  395      return id.substr(pound_pos + 1, slash_pos - (pound_pos + 1));
 
  402Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk,
 
  405                            const BloomFilter& filter_in_bf,
 
  406                            const BloomFilter& filter_out_bf)
 
  408  if (filter_in && filter_out) {
 
  409    std::vector<uint64_t> tmp;
 
  410    tmp = { hk.min_hash };
 
  411    if (!filter_in_bf.contains(tmp) || filter_out_bf.contains(tmp)) {
 
  412      hk.min_hash = std::numeric_limits<uint64_t>::max();
 
  414  } 
else if (filter_in) {
 
  415    if (!filter_in_bf.contains({ hk.min_hash })) {
 
  416      hk.min_hash = std::numeric_limits<uint64_t>::max();
 
  418  } 
else if (filter_out) {
 
  419    if (filter_out_bf.contains({ hk.min_hash })) {
 
  420      hk.min_hash = std::numeric_limits<uint64_t>::max();
 
  426Indexlr::calc_minimizer(
 
  427  const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
 
  428  const Indexlr::Minimizer*& min_current,
 
  430  ssize_t& min_idx_left,
 
  431  ssize_t& min_idx_right,
 
  432  ssize_t& min_pos_prev,
 
  434  std::vector<Indexlr::Minimizer>& minimizers)
 
  436  min_idx_left = ssize_t(idx + 1 - w);
 
  437  min_idx_right = ssize_t(idx + 1);
 
  438  const auto& min_left =
 
  439    hashed_kmers_buffer[min_idx_left % hashed_kmers_buffer.size()];
 
  440  const auto& min_right =
 
  441    hashed_kmers_buffer[(min_idx_right - 1) % hashed_kmers_buffer.size()];
 
  443  if (min_current == 
nullptr || min_current->pos < min_left.pos) {
 
  444    min_current = &min_left;
 
  446    for (ssize_t i = min_idx_left; i < min_idx_right; i++) {
 
  447      const auto& min_i = hashed_kmers_buffer[i % hashed_kmers_buffer.size()];
 
  448      if (min_i.min_hash <= min_current->min_hash) {
 
  449        min_current = &min_i;
 
  452  } 
else if (min_right.min_hash <= min_current->min_hash) {
 
  453    min_current = &min_right;
 
  455  if (ssize_t(min_current->pos) > min_pos_prev &&
 
  456      min_current->min_hash != std::numeric_limits<uint64_t>::max()) {
 
  457    min_pos_prev = ssize_t(min_current->pos);
 
  458    minimizers.push_back(*min_current);
 
  462inline std::vector<Indexlr::Minimizer>
 
  463Indexlr::minimize(
const std::string& seq)
 const 
  465  if ((k > seq.size()) || (w > seq.size() - k + 1)) {
 
  468  std::vector<Minimizer> minimizers;
 
  469  minimizers.reserve(2 * (seq.size() - k + 1) / w);
 
  470  std::vector<HashedKmer> hashed_kmers_buffer(w + 1);
 
  471  ssize_t min_idx_left, min_idx_right, min_pos_prev = -1;
 
  472  const Minimizer* min_current = 
nullptr;
 
  474  for (NtHash nh(seq, 2, k); nh.roll(); ++idx) {
 
  475    auto& hk = hashed_kmers_buffer[idx % hashed_kmers_buffer.size()];
 
  477    hk = HashedKmer(nh.hashes()[0],
 
  481                    output_seq() ? seq.substr(nh.get_pos(), k) : 
"");
 
  484      hk, filter_in(), filter_out(), filter_in_bf.get(), filter_out_bf.get());
 
  487      calc_minimizer(hashed_kmers_buffer,
 
  500inline Indexlr::Record
 
  503  if (ready_blocks_owners()[
id % MAX_SIMULTANEOUS_INDEXLRS] != 
id) {
 
  504    ready_blocks_array()[
id % MAX_SIMULTANEOUS_INDEXLRS] =
 
  505      std::unique_ptr<decltype(output_queue)::Block>(
 
  506        new decltype(output_queue)::Block(reader.get_block_size()));
 
  507    ready_blocks_owners()[
id % MAX_SIMULTANEOUS_INDEXLRS] = id;
 
  508    ready_blocks_current()[
id % MAX_SIMULTANEOUS_INDEXLRS] = 0;
 
  510  auto& block = *(ready_blocks_array()[
id % MAX_SIMULTANEOUS_INDEXLRS]);
 
  511  auto& current = ready_blocks_current()[
id % MAX_SIMULTANEOUS_INDEXLRS];
 
  512  if (current >= block.count) {
 
  514    output_queue.read(block);
 
  515    if (block.count == 0) {
 
  516      output_queue.close();
 
  517      block = 
decltype(output_queue)::Block(reader.get_block_size());
 
  522  return std::move(block.data[current++]);
 
  526Indexlr::Worker::work()
 
  528  decltype(indexlr.output_queue)::Block output_block(
 
  529    indexlr.reader.get_block_size());
 
  530  uint64_t last_block_num = 0;
 
  531  bool last_block_num_valid = 
false;
 
  533    auto input_block = indexlr.reader.read_block();
 
  534    if (input_block.count == 0) {
 
  538    output_block.num = input_block.num;
 
  539    for (
size_t idx = 0; idx < input_block.count; idx++) {
 
  541      auto& reader_record = input_block.data[idx];
 
  542      record.num = reader_record.num;
 
  543      if (indexlr.output_id()) {
 
  544        record.id = std::move(reader_record.id);
 
  546      if (indexlr.output_bx()) {
 
  548          indexlr.extract_barcode(record.id, reader_record.comment);
 
  550      record.readlen = reader_record.seq.size();
 
  552      check_info(indexlr.verbose && indexlr.k > record.readlen,
 
  553                 "Indexlr: skipped seq " + std::to_string(record.num) +
 
  555                   std::to_string(record.num * (indexlr.reader.get_format() ==
 
  556                                                    SeqReader::Format::FASTA
 
  560                   "; k (" + std::to_string(indexlr.k) + 
") > seq length (" +
 
  561                   std::to_string(record.readlen) + 
")");
 
  563      check_info(indexlr.verbose && indexlr.w > record.readlen - indexlr.k + 1,
 
  564                 "Indexlr: skipped seq " + std::to_string(record.num) +
 
  566                   std::to_string(record.num * (indexlr.reader.get_format() ==
 
  567                                                    SeqReader::Format::FASTA
 
  571                   "; w (" + std::to_string(indexlr.w) + 
") > # of hashes (" +
 
  572                   std::to_string(record.readlen - indexlr.k + 1) + 
")");
 
  574      if (indexlr.k <= record.readlen &&
 
  575          indexlr.w <= record.readlen - indexlr.k + 1) {
 
  576        record.minimizers = indexlr.minimize(reader_record.seq);
 
  578        record.minimizers = {};
 
  581      output_block.data[output_block.count++] = std::move(record);
 
  583    if (output_block.count > 0) {
 
  584      last_block_num = output_block.num;
 
  585      last_block_num_valid = 
true;
 
  586      indexlr.output_queue.write(output_block);
 
  587      output_block.count = 0;
 
  590  if (last_block_num_valid) {
 
  591    std::unique_lock<std::mutex> lock(indexlr.last_block_num_mutex);
 
  592    indexlr.last_block_num = std::max(indexlr.last_block_num, last_block_num);
 
  593    indexlr.last_block_num_valid = 
true;
 
  596  indexlr.end_barrier.wait();
 
  597  if (last_block_num_valid && indexlr.last_block_num_valid &&
 
  598      last_block_num == indexlr.last_block_num) {
 
  599    output_block.num = last_block_num + 1;
 
  600    indexlr.output_queue.write(output_block);
 
  601  } 
else if (!indexlr.last_block_num_valid && 
id == 0) {
 
  602    output_block.num = 0;
 
  603    indexlr.output_queue.write(output_block);
 
Definition: bloom_filter.hpp:67
Definition: indexlr.hpp:26
Indexlr(std::string seqfile, size_t k, size_t w, unsigned flags=0, unsigned threads=5, bool verbose=false, const btllib::BloomFilter &bf1=Indexlr::dummy_bf(), const btllib::BloomFilter &bf2=Indexlr::dummy_bf())
Definition: indexlr.hpp:290
RecordIterator begin()
Definition: indexlr.hpp:185
Definition: seq_reader.hpp:43
Definition: bloom_filter.hpp:16
std::string join(const std::vector< std::string > &s, const std::string &delim)
void check_error(bool condition, const std::string &msg)
void log_error(const std::string &msg)
bool startswith(std::string s, std::string prefix)
void check_info(bool condition, const std::string &msg)
Definition: indexlr.hpp:34
static const unsigned BX
Definition: indexlr.hpp:38
static const unsigned SEQ
Definition: indexlr.hpp:40
static const unsigned LONG_MODE
Definition: indexlr.hpp:52
static const unsigned FILTER_IN
Definition: indexlr.hpp:43
static const unsigned FILTER_OUT
Definition: indexlr.hpp:48
static const unsigned NO_ID
Definition: indexlr.hpp:36
static const unsigned SHORT_MODE
Definition: indexlr.hpp:50
Definition: indexlr.hpp:64
Definition: indexlr.hpp:88