#!/usr/bin/env perl 

use strict;
use warnings;
use Getopt::Std;
use File::Temp qw/ tempdir /;
use File::Basename qw/ basename /; 
use FindBin '$Bin';
use lib "$Bin/lib";
use Memory::Usage;
use Resources qw/ get_time_RAM /;

# Takes two genome assemblies (A & B) in FASTA format and computes a Whole-Genome Alignment (WGA).
# By default uses Cgaln for the alignments, which is single-threaded, but can also use GSAlign (-g).
# The MAPCOORDS job is the most memory-hungry task, even more than RED or CGALN.
# Returns a folder with several files, these are the most important:
# 1) BED file matching 0-based coordinates from A to B 
# 2) LOG file (text) with WGA stats
# 3) PDF file with dotplot of WGA for quality control [requires gnuplot in the system, not provided]

# Bruno Contreras Moreira, Ruben Sancho EEAD-CSIC 2014-2025

# external binaries assumed to be available, edit if non-standard locations
my $GZIPEXE    = 'gzip'; 
my $BZIP2EXE   = 'bzip2';
my $GNUPLOTEXE = "gnuplot";

# binaries that should be installed with make 
my $GSALIGNEXE = "GSAlign";
my $CGALNEXE   = "$Bin/lib/Cgaln/Cgaln";
my $CINDEXEXE  = "$Bin/lib/Cgaln/maketable";
my $REDEXE=`which Red`;chomp($REDEXE);
my $RED2ENS    = "$Bin/utils/Red2Ensembl.py";

# utilities shipping with script
my $ALN2FASTA  = "$Bin/utils/aln2fasta.pl";
my $FASTA2DOT  = "$Bin/utils/fasta2dot.pl";
my $MAPCOORDS  = "$Bin/utils/mapcoords.pl";

# read version.txt
my $VERSION = 'XXXX';
open(VERSION,"$Bin/version.txt");
while(<VERSION>) {
  if(/^(\S+)/){ $VERSION = $1 }
}
close(VERSION);

my %opts;
my ($fileA, $fileB, $minlenMb, $ncores, $softmask, $use_gsalign) = ('', '', 1.0, 4, 1, 0);
my ($WGA_Cgaln_params, $WGA_index_params) = ('-X4000', '-K11 -BS10000');
my $WGA_GSAlign_params = '-no_vcf -one'; #-one, -sen
my $MAPCOORDS_params = '0.25 0.05';
my ($tmppath, $maxRAM) = ('/tmp', 0);
my ($root, $cmd, $name, $tempdirA, $tempdirB, $chrcodesA, $chrcodesB);
my ($lenfile, $tmpWGAfile, $WGAfile, $WGAdotfile, $WGAscriptfile);
my ($outfolder, $outBEDfile, $outLOGfile, $outPDFfile);

getopts('hcmgo:M:G:n:N:C:l:A:B:', \%opts);

if(($opts{'h'})||(scalar(keys(%opts))==0))
{
  print "\nusage: $0 [options]\n\n";
  print "-h this message\n";
  print "-A FASTA file of genome A              (example: -A speciesA.fna[.gz])\n";
  print "-B FASTA file of genome B              (example: -B speciesB.fna[.gz])\n";
  print "-o output folder                       (optional, default: speciesA.speciesB)\n";  
  print "-l min contig length [Mbp]             (optional, default: -l $minlenMb)\n";
  print "-m FASTA files already soft-masked     (optional, default: masked with Red\n";
  print "-n number of cores                     (optional, some tasks only, default: $ncores\n";
  print "-g use multithreaded GSAlign algorithm (optional, default: Cgaln)\n";              
  print "-C parameters for Cgaln aligner        (optional, default: -C '$WGA_Cgaln_params')\n";
  print "-N parameters for Cgaln indexer        (optional, default: -N '$WGA_index_params')\n";
  print "-G parameters for GSAlign aligner      (optional, default: -G '$WGA_GSAlign_params')\n";
  print "-M parameters for utils/mapcoords.pl   (optional, default: -M '$MAPCOORDS_params' ,\n";
  print "                                        1st: max ratio of mapped positions in other blocks,\n";
  print "                                        2nd: max ratio of coordinates with multiple positions in same block)\n";
  print "-t path to dir for temp files          (optional, default: -t $tmppath)\n";
  print "-c print credits and checks install    (recommended)\n";
  print "\nversion: $VERSION\n";
  exit(0);
}

if(defined($opts{'c'})) {

  print "\nPrimary citation: https://www.biorxiv.org/content/10.1101/2025.07.17.665301v1\n";
  print "\nThis software uses external algorithms, please cite them accordingly:\n";
  print " GSAlign https://doi.org/10.1186/s12864-020-6569-1\n";
  print " Cgaln https://doi.org/10.1186/1471-2105-11-224\n";
  print " Red https://doi.org/10.1186/s12859-015-0654-5\n";
  print " Red2Ensembl https://doi.org/10.1002/tpg2.20143\n\n";

  # check all required binaries and print diagnostic info
  foreach my $exe ($GSALIGNEXE, $CGALNEXE, $REDEXE, $RED2ENS, $MAPCOORDS) {
    $cmd = `$exe -h 2>&1` || '';	  
    if(!$cmd || ($cmd !~ /usage/i && $cmd !~ /input/i)) { 
      print "# ERROR: $exe not correctly installed; please run 'make install' ($cmd)\n";
    }
  }

  # check also gnuplot, not provided
  $cmd = `gnuplot -h 2>&1`; 
  if(!$cmd || $cmd !~ /usage/i) {
    print "# WARNING: gnuplot not found; please install, ie \$ sudo apt install gnuplot-qt\n";
  }

  exit(0);
}

if(!defined($opts{'A'})) {
  die "# ERROR: need FASTA file of genome A (-A)\n";
} else {
  $fileA = $opts{'A'}
}

if(!defined($opts{'B'})) {
  die "# ERROR: need FASTA file of genome B (-B)\n";
} else {
  $fileB = $opts{'B'}
}

if(!defined($opts{'o'})) {
  $outfolder = basename($fileA) .'.'. basename($fileB);
} else {
  $outfolder = $opts{'o'};
}

if( -e $outfolder ) {
  printf("\n# WARNING : folder '%s' exists, files might be overwritten\n\n",
    basename($outfolder));
} else {
  if ( !mkdir($outfolder) ) {
    die "# ERROR: cannot create $outfolder\n";
  }
}  

if(defined($opts{'l'}) && $opts{'l'} >= 0 && $opts{'l'} < 1000) {
  $minlenMb = $opts{'l'}
}

if(defined($opts{'n'}) && $opts{'n'} >= 0) {
  $ncores = $opts{'n'}
}

if(defined($opts{'t'})) {
  $tmppath = $opts{'t'};
  if(!-d $tmppath) {
    die "# ERROR: need valid temp path (-t)\n";
  } 
}

if(defined($opts{'g'})) { # GSAlign

  $use_gsalign = 1;

  if(defined($opts{'G'})) {
    $WGA_GSAlign_params = $opts{'G'}
  }

  $root = basename($fileA) .'.'. basename($fileB) . '_GSAlign_' . $WGA_GSAlign_params;
  $root =~ s/\s+/_/g;

  warn "## $0 -A $fileA -B $fileB -o $outfolder -l $minlenMb -g $use_gsalign ".
    "-G $WGA_GSAlign_params -n $ncores -t $tmppath\n\n";

} else { # Cgaln

  if(defined($opts{'m'})) {
    $softmask = 0
  }

  if(defined($opts{'N'})) {
    $WGA_index_params = $opts{'N'}
  }

  if(defined($opts{'C'})) {
    $WGA_Cgaln_params = $opts{'C'}
  }

  if(defined($opts{'M'}) && $opts{'M'} =~ m/^\S+\s+\S+$/) {  
    $MAPCOORDS_params = $opts{'M'}
  }

  $root = basename($fileA) .'.'. basename($fileB) .'_Cgaln_'. $WGA_index_params .'_'. $WGA_Cgaln_params .'_'. $MAPCOORDS_params;
  $root =~ s/\s+/_/g;
  
  warn "## $0 -A $fileA -B $fileB -o $outfolder -l $minlenMb -m $softmask -G $use_gsalign ".
    "-N $WGA_index_params -C $WGA_Cgaln_params -M $MAPCOORDS_params -n $ncores -t $tmppath\n\n";

  warn "# version: $VERSION\n\n";    
}

warn "## root: $root\n\n";

######################################################

my $mu = Memory::Usage->new();
$mu->record('start');

# 1) for each FASTA file remove short sequences
my ($filtA, $chrlenA, $chrorderA) = filter_FASTA_sequences( $fileA, $outfolder, $minlenMb, $GZIPEXE, $BZIP2EXE );
my ($filtB, $chrlenB, $chrorderB) = filter_FASTA_sequences( $fileB, $outfolder, $minlenMb, $GZIPEXE, $BZIP2EXE );

if(scalar(keys(%$chrlenA)) == 0 || scalar(keys(%$chrlenB)) == 0) {
    die "\n# EXIT: all sequences too short, exit\n";
}

# 1.1) make sure chr names are different
foreach $name (keys(%$chrlenA)) {
  if(defined($chrlenB->{ $name })) {
    die "# EXIT: same chromosome name ($name) found in $fileA and $fileB\n";
  }
}

# 1.2) propose regex/chrcodes for config.synteny.tsv, required by vcf2synteny
$chrcodesA = regex4chrnames(keys(%$chrlenA));
$chrcodesB = regex4chrnames(keys(%$chrlenB));
if($chrcodesA =~ m/\|/) {
  die "# EXIT: please rename chrs in $fileA so they share a common prefix ($chrcodesA)\n";
} if($chrcodesB =~ m/\|/) {
  die "# EXIT: please rename chrs in $fileB so they share a common prefix ($chrcodesB)\n";

} else {
  print "\n# chrcode A example: $chrcodesA\n";
  print "# chrcode B example: $chrcodesB\n";
}

# 1.3) write lengths of filtered seqs to file
$lenfile = "$outfolder/length.tsv";
open(LENFILE,">",$lenfile) || die "# cannot create $lenfile $!\n";

foreach $name (@$chrorderA) { 
  print LENFILE "A\t$name\t$chrlenA->{$name}\n";
}
foreach $name (@$chrorderB) { 
  print LENFILE "B\t$name\t$chrlenB->{$name}\n";
}

close(LENFILE); 


# 2) mask filtered FASTA files if required
my $maskA = "$outfolder/" . basename($fileA) . '.sm.fasta';     
my $maskB = "$outfolder/" . basename($fileB) . '.sm.fasta';

if($softmask == 1) {

  print "\n## soft-masking filtered sequences\n\n";

  if(!-s $maskA) {  
    $tempdirA = tempdir( CLEANUP => 1 );
    $cmd = "$RED2ENS --exe $REDEXE --cor $ncores --msk_file $maskA $filtA $tempdirA";
    system("$cmd");
    if($? != 0) {
      die "# EXIT: failed while soft-masking ($cmd)\n";
    }
  }

  if(!-s $maskB) {
    $tempdirB = tempdir( CLEANUP => 1 );
    $cmd = "$RED2ENS --exe $REDEXE --cor $ncores --msk_file $maskB $filtB $tempdirB";
    system("$cmd");
    if($? != 0) {
      die "# EXIT: failed while soft-masking ($cmd)\n";
    }
  }

  unlink($filtA,$filtB);

} else {
  $maskA = $filtA;
  $maskB = $filtB;  
}

print "# filtered file A: $maskA\n";
print "# filtered file B: $maskB\n\n";


# 3) compute WGA

$tmpWGAfile = "$outfolder/$root" . '.aln';
$WGAfile    = "$outfolder/$root" . '.fasta';
$WGAdotfile = "$outfolder/$root" . '.dot';

# 3.1) Cgaln algorithm (default), saves indexes also in outfolder
if($use_gsalign == 0) {

  print "## CGaln algorithm\n\n";	
  print "\n## indexing masked, filtered sequences\n\n";

  # 3.1.1) compute indexes of filtered sequences
  $cmd = "$CINDEXEXE $WGA_index_params -o $outfolder $maskA";
  system("$cmd");
  if($? != 0) {
    die "# EXIT: failed while indexing ($cmd)\n";
  }

  $cmd = "$CINDEXEXE $WGA_index_params -o $outfolder $maskB";
  system("$cmd");
  if($? != 0) {
    die "# EXIT: failed while indexing ($cmd)\n";
  }

  # 3.1.2) compute Whole Genome Alignments of indexed sequences
  print "\n## computing and plotting Whole Genome Alignment\n\n";

  $cmd = "$CGALNEXE $maskA $maskB -o $WGAfile $WGA_Cgaln_params -r -otype2 -t $outfolder";
  system("$cmd");
  if($? != 0) {
    die "# EXIT: failed while aligning ($cmd)\n";

  } 

  $cmd = "$FASTA2DOT $WGAfile $lenfile $WGAdotfile";
  system("$cmd");
  if($? != 0) {
    die "# EXIT: failed while converting FASTA to DOT ($cmd)\n";

  } else {
    system("gzip -f $WGAfile");
    $WGAfile .= '.gz';
  }

} else {

  # 3.2) GSAlign algorithm (optional, multithreaded) 	
  print "## GSAlign algorithm\n\n";       	

  # 3.2.1) compute Whole Genome Alignments 
  print "\n## computing and plotting Whole Genome Alignment\n\n";

  $cmd = "$GSALIGNEXE -r $maskA -q $maskB -o $outfolder/$root $WGA_GSAlign_params -fmt 2";
  system("$cmd");
  if($? != 0) {
    die "# EXIT: failed while aligning ($cmd)\n";

  } else {
    # convert alignment format
    $cmd = "$ALN2FASTA $tmpWGAfile $maskA $maskB $WGAfile";
    system("$cmd");
    if($? != 0) {
      die "# EXIT: failed conversion ($cmd)\n";

    } else {

      $cmd = "$FASTA2DOT $WGAfile $lenfile $WGAdotfile";
      system("$cmd");
      if($? != 0) {
        die "# EXIT: failed while converting FASTA to DOT ($cmd)\n";
      }

      system("gzip -f $WGAfile");
      $WGAfile .= '.gz';
    }
  }
}

# 3.3) export dotplot to PDF for visual inspection and parameter tweaking

$WGAscriptfile = "$outfolder/$root" . '.dot.script';
$outPDFfile    = "$outfolder/$root" . '.dot.pdf';

open(SCRIPT,">",$WGAscriptfile) || die "# cannot create $WGAscriptfile $!\n";

print SCRIPT<<"END"; 
set terminal pdf
set grid
set output "$outPDFfile"
plot "$WGAdotfile" with lines lw 2
unset output
exit
END

close(SCRIPT);

$cmd = "$GNUPLOTEXE -c $WGAscriptfile";
system("$cmd");
if($? != 0) {
  warn "# WARNING: failed while exporting dotplot to PDF ($cmd)\n\n";

  warn "# Inspection of dotplot is highly recommended to validate WGA and parameters.\n";
  warn "# Please install gnuplot ie \$ sudo apt install gnuplot-qt\n\n";
  $outPDFfile = "(failed, need to install gnuplot)\n"; 
}

# 3.4) convert WGA to TSV, soft-masked positions are skipped 

print "\n## converting Whole Genome Alignment to BED\n\n";

$outBEDfile = "$outfolder/$root" . '.bed';
$outLOGfile = "$outfolder/$root" . '.coords.log';

if($use_gsalign == 0) {
  $cmd = "$MAPCOORDS $tmppath $WGAfile $maskA $maskB $MAPCOORDS_params > $outBEDfile 2> $outLOGfile";
} else {
  $cmd = "$MAPCOORDS $tmppath $WGAfile $maskA $maskB $MAPCOORDS_params 1 > $outBEDfile 2> $outLOGfile";
}
system("$cmd");
if($? != 0) {
  die "# EXIT: failed while converting to TSV ($cmd)\n";
}

print "## output files:\n\n";
print "# BED: $outBEDfile\n";
print "# LOG: $outLOGfile\n";
print "# PDF: $outPDFfile\n\n";

open(LOG,"<",$outLOGfile) || die "# EXIT: cannot read $outLOGfile\n";
while(<LOG>) {
  if(/^# (valid blocks: \d+ unique positions: \d+)/){ 
    print "## WGA summary: $1\n";

  } elsif(/# time used \(s\): \d+ memory used \(Mb\): (\S+)/) {
    $maxRAM = $1; # map_coords consumes the most RAM
  }
}
close(LOG);

# 4) summary of resources 
$mu->record('end');

my ($time, $RAM) = get_time_RAM($mu->report());
printf("\n\n\n# time used (s): %d memory used (Mb): %2.1f\n",
  $time, $maxRAM);




# Takes 5 params:
# i)  path to FASTA file  (string)
# ii) path to output folder (string)
# iii)min sequence length (float, in Mbp)
# iv) path to GZIP executable
# v)  path to BZIP2 executable
# Returns:
# i) path to FASTA file with long sequences
# ii) reference to hash matching PASSING sequence names to their length
# iii) reference to array with PASSING sequence names in file order
sub filter_FASTA_sequences {
  
  my ( $infile, $dir, $min_length_Mb, $gzipexe, $bzip2exe ) = @_;

  my (%FASTA,%names,@ids,@order);
  my ($magic,$root,$name,$seq,$seqid,$length);
  my ($n_of_sequences, $n_filtered_sequences) = (0, 0);

  # check input file format and open it accordingly
  open(INFILE,$infile) || die "# filter_FASTA_sequences: cannot read $infile, exit\n";
  sysread(INFILE,$magic,2);
  close(INFILE);

  if($infile =~ /\.gz$/ || $magic eq "\x1f\x8b") {  # GZIP compressed input
    if(!open(FASTA,"$gzipexe -dc $infile |")) {
      die "# filter_FASTA_sequences: cannot read GZIP compressed $infile $!\n"
        ."# please check gzip is installed\n";

    } else {
      $root = $infile;
      $root =~ s/\.gz//;      
    }
  }
  elsif($infile =~ /\.bz2$/ || $magic eq "BZ") {   # BZIP2 compressed input
    if(!open(FASTA,"$bzip2exe -dc $infile |")) {
      die "# filter_FASTA_sequences: cannot read BZIP2 compressed $infile $!\n"
        ."# please check bzip2 is installed\n";

    } else {
      $root = $infile;
      $root =~ s/\.bz2//;                 
    }
  }
  else { 
    open(FASTA,"<$infile") || die "# filter_FASTA_sequences: cannot read $infile $!\n"; 
    $root = $infile;
  }

  while(<FASTA>) {
    if(/^\>(\S+)/) {
      $name = $1;
      $n_of_sequences++;
      $seqid = $n_of_sequences;
      $FASTA{$seqid}{'NAME'} = $name;
      push(@ids,$seqid);

    } else {
      s/[\s|\n]//g;
      $FASTA{$seqid}{'SEQ'} .= $_;
    }
  }

  close(FASTA);

  # create file with long sequences 
  my $filtfile = "$dir/" . basename($root) . ".filt$min_length_Mb.fasta";
  open(FILT,">",$filtfile) || die "# filter_FASTA_sequences: cannot create $filtfile $!\n";

  foreach $seqid (@ids) {

    $length = length($FASTA{$seqid}{'SEQ'});

    if($length >= $min_length_Mb * 1_000_000) { 
      print "# filter_FASTA_sequences: [passed $length bp] $FASTA{$seqid}{'NAME'}\n";
      print FILT ">$FASTA{$seqid}{'NAME'}\n$FASTA{$seqid}{'SEQ'}\n";

      $names{$FASTA{$seqid}{'NAME'}} = $length;
      push(@order, $FASTA{$seqid}{'NAME'});

    } else {
      print "# filter_FASTA_sequences: [skipped $length bp] $FASTA{$seqid}{'NAME'}\n";
    }
  }

  close(FILT);

  return ($filtfile, \%names, \@order);
}


# Takes 4 params:
# i) array of chr names that should be prefix\d+
# Returns:
# i) string with regex
sub regex4chrnames {

  my (@chrnames) = @_;
  my (%prefix, $name, $regex);

  foreach $name (@chrnames) {
    if($name =~ m/(\S+?)\d+/) { 
      $prefix{$1}++
    }
  }

  foreach $name (keys(%prefix)) {
    $regex = "$name(\\d+) | ";
  } 

  # remove last |
  $regex = substr($regex,0,-2);

  return $regex;
}
