1 #ifndef BTLLIB_DATA_STREAM_HPP 
    2 #define BTLLIB_DATA_STREAM_HPP 
   23 #include <sys/types.h> 
   29 static const int PIPE_READ_END = 0;
 
   30 static const int PIPE_WRITE_END = 1;
 
   31 static const int COMM_BUFFER_SIZE = 1024;
 
   32 static const mode_t PIPE_PERMISSIONS = 0666;
 
   34 using PipeId = 
unsigned long;
 
   38 process_spawner_initialized()
 
   40   static bool _process_spawner_initialized;
 
   41   return _process_spawner_initialized;
 
   44 process_spawner_parent2child_fd()
 
   46   static int _process_spawner_parent2child_fd[2];
 
   47   return _process_spawner_parent2child_fd;
 
   50 process_spawner_child2parent_fd()
 
   52   static int _process_spawner_child2parent_fd[2];
 
   53   return _process_spawner_child2parent_fd;
 
   56 process_spawner_comm_mutex()
 
   58   static std::mutex _process_spawner_comm_mutex;
 
   59   return _process_spawner_comm_mutex;
 
   61 inline std::vector<pid_t>&
 
   64   static std::vector<pid_t> _may_fail;
 
   70   static std::mutex _may_fail_mutex;
 
   71   return _may_fail_mutex;
 
   76   static PipeId _last_pipe_id = 0;
 
   77   return _last_pipe_id++;
 
   79 inline std::map<std::string, _Pipeline>&
 
   82   static std::map<std::string, _Pipeline> _pipeline_map;
 
   86 static inline std::string
 
   87 get_pipepath(
const PipeId 
id)
 
   89   return "btllib-" + std::to_string(getpid()) + 
"-" + std::to_string(
id);
 
  103   DataStream(
const std::string& path, Operation op);
 
  107   FILE* operator*()
 const { 
return file; }
 
  108   FILE* operator->()
 const { 
return file; }
 
  109   operator FILE*() 
const { 
return file; }
 
  112   std::string streampath;
 
  114   std::string pipepath;
 
  115   FILE* file = 
nullptr;
 
  132   DataSink(
const std::string& path, 
bool append = 
false)
 
  // DataStream constructor.
  //
  // Sends a request (the operation, then the stream path) to the process
  // spawner over the parent->child pipe, reads a reply from the child->parent
  // pipe into `buf`, then fopen()s `pipepath` ("r" for READ, "w" otherwise)
  // and unlinks that path.
  //
  // NOTE(review): this listing drops lines (the embedded numbering skips
  // 138-140, 142, 144, 149-151, 153, 156-157 and 159-160), so the statements
  // that receive the reply length and assign `pipepath` are not visible here.
  // NOTE(review): none of the visible write()/read() calls has its return
  // value checked — confirm against the full source.
  137 inline DataStream::DataStream(
const std::string& path, Operation op)
 
  // Take the spawner communication mutex before the request is written.
  141   std::unique_lock<std::mutex> lock(process_spawner_comm_mutex());
 
  // The operation code is sent first.
  143   write(process_spawner_parent2child_fd()[PIPE_WRITE_END], &op, 
sizeof(op));
 
  // +1 so that the terminating NUL of path.c_str() is transmitted as well.
  145   size_t pathlen = path.size() + 1;
 
  146   check_error(pathlen > COMM_BUFFER_SIZE,
 
  147               "Stream path length too large for the buffer.");
 
  148   write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
 
  152     process_spawner_parent2child_fd()[PIPE_WRITE_END], path.c_str(), pathlen);
 
  // Receive buffer for the spawner's reply.
  154   char buf[COMM_BUFFER_SIZE];
 
  155   read(process_spawner_child2parent_fd()[PIPE_READ_END],
 
  158   read(process_spawner_child2parent_fd()[PIPE_READ_END], buf, pathlen);
 
  // Every operation other than READ opens the pipe for writing.
  161   file = fopen(pipepath.c_str(), op == READ ? 
"r" : 
"w");
 
  // Remove the filesystem name right after opening it.
  162   unlink(pipepath.c_str());
 
  169     std::unique_lock<std::mutex> lock(process_spawner_comm_mutex());
 
  175           process_spawner_parent2child_fd()[PIPE_WRITE_END], &op, 
sizeof(op));
 
  177         size_t pathlen = pipepath.size() + 1;
 
  178         check_error(pathlen > COMM_BUFFER_SIZE,
 
  179                     "Stream path length too large for the buffer.");
 
  180         write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
 
  183         write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
 
  187         read(process_spawner_child2parent_fd()[PIPE_READ_END], &op, 1);
 
  191     } 
else if (op == WRITE || op == APPEND) {
 
  193       if (file != stdout) {
 
  197           process_spawner_parent2child_fd()[PIPE_WRITE_END], &op, 
sizeof(op));
 
  199         size_t pathlen = pipepath.size() + 1;
 
  200         check_error(pathlen > COMM_BUFFER_SIZE,
 
  201                     "Stream path length too large for the buffer.");
 
  202         write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
 
  205         write(process_spawner_parent2child_fd()[PIPE_WRITE_END],
 
  209         read(process_spawner_child2parent_fd()[PIPE_READ_END], &op, 1);
 
  233     : pipepath(std::move(pipepath))
 
  234     , direction(direction)
 
  235     , pid_first(pid_first)
 
  241   std::string pipepath;
 
  242   Direction direction = SOURCE;
 
  243   pid_t pid_first = -1;
 
  252     if (direction == SOURCE) {
 
  254         std::unique_lock<std::mutex> lock(may_fail_mutex());
 
  255         may_fail().push_back(pid_first);
 
  257       kill(pid_first, SIGTERM);
 
  259       waitpid(pid_last, &status, 0);
 
  260     } 
else if (direction == SINK) {
 
  262       waitpid(pid_last, &status, 0);
 
  270 process_spawner_init();
 
  272 static const bool process_spawner_initializer = process_spawner_init();
 
  // SIGCHLD handler.
  //
  // Reaps every child that has changed state without blocking. For each
  // reaped PID it searches may_fail() under may_fail_mutex() and erases the
  // entry when found. The visible reporting branches print the child's exit
  // status / signal to std::cerr and then call std::exit(EXIT_FAILURE).
  //
  // NOTE(review): the listing drops lines 289-292, 298-300 and 302-303, so
  // how the may_fail() lookup decides whether the failure path runs is not
  // visible here.
  // NOTE(review): locking a mutex, writing to std::cerr and calling
  // std::exit are not on POSIX's async-signal-safe list — confirm that this
  // is acceptable for the way the handler is installed.
  //
  // @param sig Signal number; asserted to be SIGCHLD.
  275 sigchld_handler(
const int sig)
 
  277   assert(sig == SIGCHLD);
 
  // pid -1 = any child; WNOHANG = return immediately when none is ready.
  282   while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
 
  285         std::unique_lock<std::mutex> lock(may_fail_mutex());
 
  286         auto it = std::find(may_fail().begin(), may_fail().end(), pid);
 
  287         if (it != may_fail().end()) {
 
  288           may_fail().erase(it);
 
  // Report how the child terminated.
  293       if (WIFEXITED(status)) { 
 
  294         std::cerr << 
"PID " << pid << 
" exited with status " 
  295                   << WEXITSTATUS(status) << std::endl; 
 
  296       } 
else if (WIFSIGNALED(status)) {                
 
  297         std::cerr << 
"PID " << pid << 
" killed by signal " 
  301         std::cerr << 
"PID " << pid << 
" exited with code " << status
 
  304       std::exit(EXIT_FAILURE);
 
  // A waitpid() failure other than ECHILD is reported via perror and is fatal.
  307   if (pid == -1 && errno != ECHILD) {
 
  308     std::perror(
"waitpid");
 
  309     std::exit(EXIT_FAILURE);
 
  313 static inline std::string
 
  314 get_pipeline_cmd(
const std::string& path, DataStream::Operation op);
 
  316 static inline _Pipeline
 
  317 run_pipeline_cmd(
const std::string& cmd, DataStream::Operation op);
 
  // One-time setup of the process spawner.
  //
  // When process_spawner_initialized() is still false it sets the flag,
  // creates the parent->child and child->parent pipes, and then (visible
  // lines only):
  //   * on the side that closes the parent2child write end and the
  //     child2parent read end: installs sigchld_handler for SIGCHLD and
  //     serves requests read from the parent2child pipe;
  //   * on the other side: closes the parent2child read end and the
  //     child2parent write end.
  // NOTE(review): the listing drops lines 333-335, 347-349, 351-353, 355-356,
  // 358-359 and others, so the fork() call, the request-loop header and the
  // switch header are not visible; the split into two sides is presumably a
  // fork — confirm against the full source.
  320 process_spawner_init()
 
  322   if (!process_spawner_initialized()) {
 
  323     process_spawner_initialized() = 
true;
 
  // Mark all four descriptors as invalid before pipe() fills them in.
  325     process_spawner_parent2child_fd()[PIPE_READ_END] = -1;
 
  326     process_spawner_parent2child_fd()[PIPE_WRITE_END] = -1;
 
  327     process_spawner_child2parent_fd()[PIPE_READ_END] = -1;
 
  328     process_spawner_child2parent_fd()[PIPE_WRITE_END] = -1;
 
  329     check_error(pipe(process_spawner_parent2child_fd()) == -1,
 
  330                 "Error opening a pipe.");
 
  331     check_error(pipe(process_spawner_child2parent_fd()) == -1,
 
  332                 "Error opening a pipe.");
 
  // This side only reads requests and writes replies, so it closes the
  // two pipe ends it does not use.
  336       close(process_spawner_parent2child_fd()[PIPE_WRITE_END]);
 
  337       close(process_spawner_child2parent_fd()[PIPE_READ_END]);
 
  // Install sigchld_handler for SIGCHLD with an empty signal mask;
  // SA_RESTART is requested in sa_flags.
  339       struct sigaction action; 
 
  340       action.sa_handler = sigchld_handler;
 
  341       sigemptyset(&action.sa_mask);
 
  342       action.sa_flags = SA_RESTART;
 
  343       sigaction(SIGCHLD, &action, 
nullptr);
 
  // Request state: operation code and the path received with it.
  345       DataStream::Operation op;
 
  346       char buf[COMM_BUFFER_SIZE];
 
  350         if (read(process_spawner_parent2child_fd()[PIPE_READ_END],
 
  354           std::exit(EXIT_SUCCESS);
 
  357         read(process_spawner_parent2child_fd()[PIPE_READ_END],
 
  360         read(process_spawner_parent2child_fd()[PIPE_READ_END], buf, pathlen);
 
  // READ / WRITE / APPEND share one handler: build the command for the
  // received path, start the pipeline, send its pipepath (with the
  // terminating NUL counted in pathlen) back, and remember the pipeline
  // under that pipepath.
  363           case DataStream::Operation::READ:
 
  364           case DataStream::Operation::WRITE:
 
  365           case DataStream::Operation::APPEND:
 
  366             pipeline = run_pipeline_cmd(get_pipeline_cmd(buf, op), op);
 
  368             pathlen = pipeline.pipepath.size() + 1;
 
  369             check_error(pathlen > COMM_BUFFER_SIZE,
 
  370                         "Stream path length too large for the buffer.");
 
  371             write(process_spawner_child2parent_fd()[PIPE_WRITE_END],
 
  374             write(process_spawner_child2parent_fd()[PIPE_WRITE_END],
 
  375                   pipeline.pipepath.c_str(),
 
  378             pipeline_map()[pipeline.pipepath] = pipeline;
 
  // CLOSE: buf holds the pipepath of the pipeline to close. The entry is
  // fetched, removed from the map, and a 1-byte acknowledgement is sent.
  // NOTE(review): operator[] default-inserts when the key is absent —
  // confirm that an unknown pipepath cannot arrive here.
  380           case DataStream::Operation::CLOSE:
 
  381             pipeline = pipeline_map()[std::string(buf)];
 
  383             pipeline_map().erase(std::string(buf));
 
  384             write(process_spawner_child2parent_fd()[PIPE_WRITE_END], &op, 1);
 
  // Presumably the default case (its label is not in this listing): an
  // unrecognized operation is logged and terminates the process.
  387             log_error(
"Invalid stream operation.");
 
  388             std::exit(EXIT_FAILURE);
 
  // The other side writes requests and reads replies, so it closes the two
  // pipe ends it does not use.
  392     close(process_spawner_parent2child_fd()[PIPE_READ_END]);
 
  393     close(process_spawner_child2parent_fd()[PIPE_WRITE_END]);
 
  // Builds the command string for opening `path` with operation `op`.
  //
  // @param path Stream path to inspect for known prefixes / suffixes.
  // @param op   Requested operation (READ, WRITE or APPEND are handled).
  // @return A std::string holding the command.
  //
  // The std::vector members below are the fields of the per-format record
  // used in the DATATYPES table (the enclosing struct header is not part of
  // this listing):
  398 static inline std::string
 
  399 get_pipeline_cmd(
const std::string& path, DataStream::Operation op)
 
  // Path prefixes that identify the format (tested with starts_with).
  403     std::vector<std::string> prefixes;
 
  // Path suffixes that identify the format (tested with ends_with).
  404     std::vector<std::string> suffixes;
 
  // Commands executed to find out whether a tool for the format is available.
  405     std::vector<std::string> cmds_check_existence;
 
  // Commands for READ / WRITE / APPEND, each selected by cmd_idx —
  // presumably the index of the existence check that succeeded; confirm.
  406     std::vector<std::string> read_cmds;
 
  407     std::vector<std::string> write_cmds;
 
  408     std::vector<std::string> append_cmds;
 
  412   static const Datatype DATATYPES[]{
 
  413     { { 
"http://", 
"https://", 
"ftp://" }, {}, { 
"which wget" }, { 
"wget -O-" }, { 
"" }, { 
"" } },
 
  414     { {}, { 
".url" }, { 
"which wget" }, { 
"wget -O- -i" }, { 
"" }, { 
"" } },
 
  415     { {}, { 
".ar" }, { 
"which ar" }, { 
"ar -p" }, { 
"" }, { 
"" } },
 
  416     { {}, { 
".tar" }, { 
"which tar" }, { 
"tar -xOf" }, { 
"" }, { 
"" } },
 
  417     { {}, { 
".tgz" }, { 
"which tar" }, { 
"tar -zxOf" }, { 
"" }, { 
"" } },
 
  418     { {}, { 
".gz", 
".z" }, { 
"which pigz", 
"which gzip" }, { 
"pigz -dc", 
"gzip -dc" }, { 
"pigz >", 
"gzip >" }, { 
"pigz >>", 
"gzip >>" } },
 
  419     { {}, { 
".bz2" }, { 
"which bzip2" }, { 
"bunzip2 -dc" }, { 
"bzip2 >" }, { 
"bzip2 >>" } },
 
  420     { {}, { 
".xz" }, { 
"which xz" }, { 
"unxz -dc" }, { 
"xz -T0 >" }, { 
"xz -T0 >>" } },
 
  421     { {}, { 
".7z" }, { 
"which 7z" }, { 
"7z -so e" }, { 
"7z -si a" }, { 
"7z -si a" } },
 
  422     { {}, { 
".zip" }, { 
"which zip" }, { 
"unzip -p" }, { 
"" }, { 
"" } },
 
  423     { {}, { 
".bam", 
".cram" }, { 
"which samtools" }, { 
"samtools view -h" }, { 
"samtools -Sb - >" }, { 
"samtools -Sb - >>" } },
 
  // Format detection and command assembly for get_pipeline_cmd.
  //
  // Visible flow: pick a default command; match path_trimmed against every
  // DATATYPES entry; for a matching entry run its existence-check commands in
  // child processes, select the read/write/append command by cmd_idx and push
  // it onto cmd_layers; strip the matched prefix/suffix from path_trimmed;
  // finally combine cmd_layers into result_cmd.
  // NOTE(review): many lines are missing from this listing (e.g. 428, 435,
  // 444-446, 451-455, 473-475, 504-517, 564-569, 572-591), so loop headers,
  // the fork() call and the result_cmd assembly are only partly visible.

  // Fallback command when no format is recognized: "cat", with " >>"
  // appended for APPEND (the WRITE branch body is not in this listing).
  426   std::string default_cmd = 
"cat";
 
  427   if (op == DataStream::Operation::WRITE) {
 
  429   } 
else if (op == DataStream::Operation::APPEND) {
 
  430     default_cmd += 
" >>";
 
  // Working copy of the path; recognized prefixes/suffixes are erased from it.
  433   std::string path_trimmed = path;
 
  434   std::vector<std::string> cmd_layers;
 
  436     bool found_datatype = 
false;
 
  437     for (
const auto& datatype : DATATYPES) {
 
  438       size_t trim_start = 0, trim_end = 0;
 
  439       bool this_datatype = 
false;
 
  440       for (
const auto& prefix : datatype.prefixes) {
 
  441         if (starts_with(path_trimmed, prefix)) {
 
  442           this_datatype = 
true;
 
  443           trim_start += prefix.size();
 
  447       for (
const auto& suffix : datatype.suffixes) {
 
  448         if (ends_with(path_trimmed, suffix)) {
 
  449           this_datatype = 
true;
 
  450           trim_end += suffix.size();
 
  456         found_datatype = 
true;
 
  457         bool found_cmd = 
false;
 
  459         for (
const auto& existence_cmd : datatype.cmds_check_existence) {
 
  // An existence check may chain several commands with "&&"; each one is
  // split on spaces and trimmed before being executed.
  461           auto sub_cmds = split(existence_cmd, 
"&&");
 
  462           std::for_each(sub_cmds.begin(), sub_cmds.end(), trim);
 
  463           for (
const auto& sub_cmd : sub_cmds) {
 
  464             auto args = split(sub_cmd, 
" ");
 
  465             std::for_each(args.begin(), args.end(), trim);
 
  // Argument array for execvp: slot 0 holds args[0], slots 1..n hold
  // args[0..n-1], and the last slot is the nullptr terminator. execvp below
  // receives argv[0] as the file and argv + 1 as the argument vector.
  // NOTE(review): no release of this new[] array is visible in this listing.
  467             char* 
const* argv = 
new char*[args.size() + 2];
 
  468             ((
char*&)(argv[0])) = (
char*)(args[0].c_str());
 
  469             for (
size_t i = 0; i < args.size(); i++) {
 
  470               ((
char*&)(argv[i + 1])) = (
char*)(args[i].c_str());
 
  472             ((
char*&)(argv[args.size() + 1])) = 
nullptr;
 
  // The probe's PID is added to may_fail() under may_fail_mutex().
  476               std::unique_lock<std::mutex> lock(may_fail_mutex());
 
  478               may_fail().push_back(pid);
 
  // The probe's stdout and stderr are redirected to /dev/null.
  481               int null_fd = open(
"/dev/null", O_WRONLY, 0);
 
  482               dup2(null_fd, STDOUT_FILENO);
 
  483               dup2(null_fd, STDERR_FILENO);
 
  // Only reached past execvp when the exec itself failed.
  486               execvp(argv[0], argv + 1);
 
  487               log_error(
"exec failed.");
 
  488               std::exit(EXIT_FAILURE);
 
  491               check_error(pid == -1, 
"Error on fork.");
 
  // Wait for the probe; being signalled or exiting non-zero enters the
  // branch below (its body is not in this listing).
  493               waitpid(pid, &status, 0);
 
  494               if (WIFSIGNALED(status) ||
 
  495                   (WIFEXITED(status) && WEXITSTATUS(status) != 0)) { 
 
  500                 std::unique_lock<std::mutex> lock(may_fail_mutex());
 
  501                 auto it = std::find(may_fail().begin(), may_fail().end(), pid);
 
  502                 if (it != may_fail().end()) {
 
  503                   may_fail().erase(it);
 
  // Select the command for the requested operation from the matching entry.
  518             case DataStream::Operation::READ:
 
  519               cmd = datatype.read_cmds[cmd_idx];
 
  521             case DataStream::Operation::WRITE:
 
  522               cmd = datatype.write_cmds[cmd_idx];
 
  524             case DataStream::Operation::APPEND:
 
  525               cmd = datatype.append_cmds[cmd_idx];
 
  528               log_error(
"Invalid operation");
 
  529               std::exit(EXIT_FAILURE);
 
  532             log_warning(
"Filetype recognized for '" + path +
 
  533                         "', but no tool available to work with it.");
 
  535             cmd_layers.push_back(cmd);
 
  538           log_warning(
"Filetype recognized for '" + path +
 
  539                       "', but no tool available to work with it.");
 
  // Drop the matched prefix and suffix from the working copy of the path.
  541         path_trimmed.erase(0, trim_start);
 
  542         path_trimmed.erase(path_trimmed.size() - trim_end);
 
  545     if (!found_datatype) {
 
  // Nothing recognized: fall back to the default command.
  549   if (cmd_layers.empty()) {
 
  550     cmd_layers.push_back(default_cmd);
 
  // For WRITE / APPEND the layers are used in the reverse of detection order.
  552   if (op == DataStream::Operation::WRITE ||
 
  553       op == DataStream::Operation::APPEND) {
 
  554     std::reverse(cmd_layers.begin(), cmd_layers.end());
 
  557   std::string result_cmd;
 
  558   for (
size_t i = 0; i < cmd_layers.size(); i++) {
 
  559     auto& cmd = cmd_layers[i];
 
  560     if (op == DataStream::Operation::WRITE ||
 
  561         op == DataStream::Operation::APPEND) {
 
  562       if (i == cmd_layers.size() - 1) {
 
  563         if (cmd.back() == 
'>') {
 
  570         if (cmd.back() == 
'>') {
 
  571           while (cmd.back() == 
'>' || cmd.back() == 
' ') {
 
  592   check_error(result_cmd.empty(),
 
  593               (op == DataStream::Operation::READ ? 
"Error loading from " 
  594                                                  : 
"Error saving to ") +
 
  599 static inline _Pipeline
 
  600 run_pipeline_cmd(
const std::string& cmd, DataStream::Operation op)
 
  602   std::string pipepath = get_pipepath(new_pipe_id());
 
  603   unlink(pipepath.c_str());
 
  604   mkfifo(pipepath.c_str(), PIPE_PERMISSIONS);
 
  606   auto individual_cmds = split(cmd, 
" | ");
 
  607   check_error(individual_cmds.empty(),
 
  608               "Error processing data stream commands.");
 
  609   std::reverse(individual_cmds.begin(), individual_cmds.end());
 
  611   std::vector<pid_t> pids;
 
  613   int input_fd[2], output_fd[2];
 
  614   input_fd[PIPE_READ_END] = -1;
 
  615   input_fd[PIPE_WRITE_END] = -1;
 
  616   output_fd[PIPE_READ_END] = -1;
 
  617   output_fd[PIPE_WRITE_END] = -1;
 
  620   for (
const auto& individual_cmd : individual_cmds) {
 
  621     auto args = split(individual_cmd, 
" ");
 
  622     std::for_each(args.begin(), args.end(), trim);
 
  624     std::string stdout_to_file;
 
  625     decltype(args)::iterator it;
 
  626     for (it = args.begin(); it != args.end(); ++it) {
 
  627       if (it->front() == 
'>') {
 
  628         stdout_to_file = it->substr(1);
 
  632     if (it != args.end()) {
 
  636     char* 
const* argv = 
new char*[args.size() + 2];
 
  637     ((
char*&)(argv[0])) = (
char*)(args[0].c_str());
 
  638     for (
size_t i = 0; i < args.size(); i++) {
 
  639       ((
char*&)(argv[i + 1])) = (
char*)(args[i].c_str());
 
  641     ((
char*&)(argv[args.size() + 1])) = 
nullptr;
 
  643     if (i < individual_cmds.size() - 1) {
 
  644       check_error(pipe(input_fd) == -1, 
"Error opening a pipe.");
 
  645       fcntl(input_fd[PIPE_READ_END], F_SETFD, FD_CLOEXEC);
 
  646       fcntl(input_fd[PIPE_WRITE_END], F_SETFD, FD_CLOEXEC);
 
  651       if (op == DataStream::Operation::READ) {
 
  653           int fd = open(pipepath.c_str(), O_WRONLY);
 
  654           dup2(fd, STDOUT_FILENO);
 
  657           dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO);
 
  658           close(output_fd[PIPE_READ_END]);
 
  659           close(output_fd[PIPE_WRITE_END]);
 
  662         if (i < individual_cmds.size() - 1) {
 
  663           dup2(input_fd[PIPE_READ_END], STDIN_FILENO);
 
  664           close(input_fd[PIPE_READ_END]);
 
  665           close(input_fd[PIPE_WRITE_END]);
 
  668         execvp(argv[0], argv + 1);
 
  669         log_error(
"exec failed.");
 
  670         std::exit(EXIT_FAILURE);
 
  672         if (i == individual_cmds.size() - 1) {
 
  673           int fd = open(pipepath.c_str(), O_RDONLY);
 
  674           dup2(fd, STDIN_FILENO);
 
  677           dup2(input_fd[PIPE_READ_END], STDIN_FILENO);
 
  678           close(input_fd[PIPE_READ_END]);
 
  679           close(input_fd[PIPE_WRITE_END]);
 
  682         if (!stdout_to_file.empty()) {
 
  684             open(stdout_to_file.c_str(),
 
  686                    (op == DataStream::Operation::APPEND ? O_APPEND : 0),
 
  687                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
 
  688           dup2(outfd, STDOUT_FILENO);
 
  691           dup2(output_fd[PIPE_WRITE_END], STDOUT_FILENO);
 
  692           close(output_fd[PIPE_READ_END]);
 
  693           close(output_fd[PIPE_WRITE_END]);
 
  696         execvp(argv[0], argv + 1);
 
  697         log_error(
"exec failed.");
 
  701     check_error(pid == -1, 
"Error on fork.");
 
  708       close(output_fd[PIPE_READ_END]);
 
  709       close(output_fd[PIPE_WRITE_END]);
 
  712     if (i < individual_cmds.size() - 1) {
 
  713       output_fd[PIPE_READ_END] = input_fd[PIPE_READ_END];
 
  714       output_fd[PIPE_WRITE_END] = input_fd[PIPE_WRITE_END];
 
  720   return _Pipeline(pipepath,
 
  721                    op == DataStream::Operation::READ
 
  722                      ? _Pipeline::Direction::SOURCE
 
  723                      : _Pipeline::Direction::SINK,