#!/usr/bin/env perl
use strict;
use File::Spec;
use File::Basename;
use Data::Dumper;
use FindBin;
use Cwd 'abs_path';

#..............................................................................
# Globals

my $VERSION = "0.5.2";
my $URL = "https://github.com/tseemann/legsta";
my $OUTSEP = "\t";
my $ISPCR = "isPcr";
my $ISPCR_OPT = "-minPerfect=5 -tileSize=6 -maxSize=1200 -stepSize=5";
my $SEP = "/";
my $UNK = "-";
my $NOVEL = "?";

#..............................................................................
# Command line options

my(@Options, $debug, $csv, $datadir, $quiet, $noheader);
$datadir = abs_path("$FindBin::RealBin/../db");
setOptions();

#..............................................................................
# Option parsing

$OUTSEP = ',' if $csv;  # default is tab
@ARGV or usage(1); # err("Please provide some FASTA files to type");
$ENV{PATH} = $ENV{PATH}.":$FindBin::RealBin";
require_exe('any2fasta');
require_exe($ISPCR);
system("$ISPCR 2>&1 | grep -q minPerfect")==0 or err("Can not run $ISPCR");
my $primer_fn = "$datadir/ispcr.tab";
-r $primer_fn or err("Could not read file: $primer_fn");

#..............................................................................
# Load databases

my %db;
my @gene;
my $sbt_fn = "$datadir/legionella.txt";
open my $SBT, '<', $sbt_fn or err("Could not read file: $sbt_fn");
while (<$SBT>) {
  chomp;
  my($type, @profile) = split m/\t/;
  if ($type eq 'ST') {
    @gene = @profile;
  }
  else {
    $db{$type} = join($SEP, @profile);
  }
}
msg("Loaded", scalar keys %db, "SBT types.");
print STDERR Dumper(\%db) if $debug > 2;

my %al;
for my $g (@gene) {
  my $gene_fn = "$datadir/$g.tfa";
  open my $gene_fh, '<', $gene_fn or err("Could not open FASTA file: $gene_fn");
  $al{$g} = load_fasta( $gene_fh, '^.*?_(\d+)' );
  msg("Loaded", scalar keys %{$al{$g}}, "$g alleles");
}
print STDERR Dumper(\%al) if $debug > 2;

#..............................................................................
# Run in silico PCR

# header
print join($OUTSEP, "FILE", "SBT", @gene), "\n" unless $noheader;

for my $fasta (@ARGV) {
  -r $fasta or err("Could not read file: $fasta");
  -s $fasta or err("Empty input file? $fasta");
  msg("Genotyping $fasta") if $debug;

  # Do in silico PCR against the primer set and collate results
  my $cmd = "any2fasta -q -u \Q$fasta\E | $ISPCR stdin \Q$primer_fn\E stdout -out=fa $ISPCR_OPT";
  msg("Running: $cmd");
  open my $ispcr, "-|", $cmd;
  my $amp = load_fasta( $ispcr, '^\S+\s+([A-Za-z]+)' );
  print STDERR Dumper($amp) if $debug > 1;
  
  # Determine the allele profile
  my @profile;
  for my $g (@gene) {
    my $num = $NOVEL;
    if (my $hit = $amp->{$g}) {
      msg("Scanning $g database") if $debug;
      # Look for exact substring matches in linear search (db is small)
      for my $n (keys %{$al{$g}}) {
        if ( index($hit, $al{$g}{$n}) >= 0 ) {  
          $num = $n;
          last;
        }
      }
    }
    else {
      $num = $UNK;
    }
    msg("Allele $g => $num") if $debug;
    push @profile, $num;
  }
  my $p = join($SEP, @profile);
  
  # Look for matching profiles in db
  my $type = $UNK;
  for my $t (keys %db) {
    if ($p eq $db{$t}) {
      $type = $t;
      last;
    }
  }
  
  # Print line result
  print join($OUTSEP, $fasta, $type, @profile),"\n";
}


#----------------------------------------------------------------------

sub load_fasta {
  my($fh, $regexp) = @_;
  $regexp ||= '(\S+)';  # capture ID
  my %res;
  my($id,$seq);
  while (my $line = <$fh>) {
    #print STDERR "DEBUG: $line" if $debug;
    chomp $line;
    if ($line =~ m/^>(.*)$/) {
      my $header = $1;
      $header =~ m/$regexp/ or err("Unmatched sequence header for $regexp: $header");
      my $new_id = $1;
      if ($id) {
        $res{$id} .= $seq; # save last seq we saw
      }      
      $id = $new_id;
      $seq = '';
    }
    else {
      $seq .= uc($line);
    }
  }
  $res{$id} .= $seq if $id && $seq;
  return \%res;
}

#----------------------------------------------------------------------

sub show_version {
  my(undef,undef,$exe) = File::Spec->splitpath($0);
  print "$exe $VERSION\n";
  exit(0);
}

#----------------------------------------------------------------------

sub update_db {
  err("Not implemented yet.");
}

#----------------------------------------------------------------------

sub msg {
  print STDERR "@_\n" unless $quiet;
}

#----------------------------------------------------------------------

sub err {
  msg("ERROR:", @_);
  exit(1);
}
        
#----------------------------------------------------------------------

sub require_exe {
  my(@arg) = @_;
  for my $exe (@arg) {
    my $where = '';
    for my $dir ( File::Spec->path ) {
      if (-x "$dir/$exe") {
        $where = "$dir/$exe";
        last;
      }
    }
    if ($where) {
      msg("Found '$exe' => $where");
    }
    else {
      err("Could not find '$exe'. Please install it and ensure it is in the PATH.");
    }
  }
  return;        
}
        
#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    {OPT=>"help",     VAR=>\&usage, DESC=>"This help"},
    {OPT=>"version",  VAR=>\&show_version, DESC=>"Print version and exit"},
    {OPT=>"quiet",    VAR=>\$quiet, DESC=>"Don't print anything to stderr"},
    {OPT=>"debug+",   VAR=>\$debug, DEFAULT=>0, DESC=>"Verbose debug output to stderr"},
    {OPT=>"dbdir=s",  VAR=>\$datadir, DEFAULT=>$datadir, DESC=>"SBT database folder"},
#    {OPT=>"update",   VAR=>\&update_db, DESC=>"Update SBT database"},
    {OPT=>"csv",      VAR=>\$csv, DEFAULT=>0, DESC=>"Output CSV instead of TSV"},
    {OPT=>"noheader", VAR=>\$noheader, DEFAULT=>0, DESC=>"Don't print header row"},
  );

  &GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage(1);

  # Now setup default values.
  foreach (@Options) {
    if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

sub usage {
  my($errcode) = @_;
  $errcode ||= 0;
  my $fh = $errcode ? \*STDOUT : \*STDERR;
  select $fh;
  my $exe = basename($0);
  print "NAME\n  $exe $VERSION\n";
  print "SYNPOSIS\n  Legionella in silico SBT typing of contig sequences\n";
  print "USAGE\n  $exe [options] <contigs.fa> ... \n";
  print "OPTIONS\n";
  foreach (@Options) {
    printf "  --%-10s %s%s.\n",$_->{OPT},$_->{DESC},
           defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
  }
  print "HOMEPAGE\n  $URL\n";
  exit($errcode);
}
 
#----------------------------------------------------------------------
