#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Std;
use DB_File;
use FileHandle;
use File::Temp qw/ tempfile /;
use FindBin '$Bin';
use lib "$Bin/lib";
use Memory::Usage;
use Resources qw/ get_time_RAM /;

$|=1;

# Takes input VCF file with reads mapped to several concatenated references,
# and uses synteny-based equivalent coordinates to map polymorphism back to the 
# main/user-defined chromosome positions, in an effort to separate subgenomes, 
# which are then multiply aligned.
# FASTA output is produced by default to facilitate collapsing sequences afterwards.
# Optionally, VCF output is also produced.
# 
# Synteny-based equivalent positions in BED format produced with script WGA.
# These files are used to translate positions of secondary reference genomes
# to coordinates of the master reference, which is chosen by the user.
# Optionally a WGA BED file can be used to introduce a (diploid) outgroup for 
# which a genome assembly is available. The abbreviation of outgroup in 
# configsynt file is 'outg'; you can add several with outg-xxx, outg-yyy, etc

# Note: during benchmarks the master was B. distachyon (Bdis), while B. stacei (Bsta) 
# and B. sylvaticum (Bsyl) were defined as secondary.

# Bruno Contreras Moreira, Ruben Sancho EEAD-CSIC 2014-2025


my %IUPACdegen = (
  'AG'=>'R', 'GA'=>'R', 'CT'=>'Y', 'TC'=>'Y',
  'CG'=>'S', 'GC'=>'S', 'AT'=>'W', 'TA'=>'W',
  'GT'=>'K', 'TG'=>'K', 'AC'=>'M', 'CA'=>'M' 
);

my %revcomp = (
  'A'=>'T', 'T'=>'A','G'=>'C','C'=>'G', 'N'=>'N',
  'AG'=>'CT', 'GA'=>'TC', 'CT'=>'AG', 'TC'=>'GA',
  'CG'=>'CG', 'GC'=>'GC', 'AT'=>'AT', 'TA'=>'TA',  
  'GT'=>'AC', 'TG'=>'CA', 'AC'=>'GT', 'CA'=>'TG' 
);

my @validformats = qw( fasta );

# VCF filtering and output options, edit as required
my $SYNTENYZEROBASED       = 1;        # set to 1 if synteny coords are 0-based, as those produced by CGaln
my $MINDEPTHCOVERPERSAMPLE = 3;        # integer, min read depth at each position for each sample
my $MAXMISSINGSAMPLES      = 10;       # integer, max number of missing samples accepted per locus (VCF row)
my $ONLYPOLYMORPHIC        = 0;        # output option: 0 (constant and SNPs) or 1 (only SNPs). Zero is recommended at this stage

# first guess of key VCF columns, adjusted in real time below
my $COLUMNFIRSTSAMPLE      = 9; # integer, Column number (0/1-based) with the first sample in the VCF file
my $GENOTYPECOLUMNFORMAT   = 0; # column count 0-based in the VCF file
my $DEPTHCOLUMNFORMAT      = 1; # column count 0-based in the VCF file

# external binaries, edit if not installed elsewhere and not in path
my $GZIPEXE  = 'gzip'; 
my $BZIP2EXE = 'bzip2';
my $GREPEXE  = 'grep';
my $ZGREPEXE = 'zgrep';
my $SORTEXE  = 'sort -S 20G';
my $JOINEXE  = 'join';
my $UNIQEXE  = 'uniq';
my $CATEXE   = 'cat';
my $PERLEXE  = 'perl';

# utilities shipping with script
my $RM2LINES = "$Bin/utils/rm_double_lines.pl";

# read version.txt
my $VERSION = 'XXXX';
open(VERSION,"$Bin/version.txt");
while(<VERSION>) {
  if(/^(\S+)/){ $VERSION = $1 } 
}
close(VERSION);

########################################################################

my %opts;
my ($filename,$prevlogfile,$configfile,$outfilename) = ('','','','');
my ($reuse,$het,$master_ref) = (1,0);
my ($mindepth,$maxmissing,$only_polymorphic,$outfilename2) =
  ($MINDEPTHCOVERPERSAMPLE,$MAXMISSINGSAMPLES,$ONLYPOLYMORPHIC,'');
my $tmppath = '/tmp';
my $zerobased = $SYNTENYZEROBASED;

getopts('hNH1pt:v:c:o:d:m:V:r:l:', \%opts);

if(($opts{'h'})||(scalar(keys(%opts))==0)) {
  print "\nusage: $0 [options]\n\n";
  print "-h this message\n";
  print "-v input VCF file                          (example: -v data.vcf.gz)\n";
  print "-l report file from vcf2alignment, 1-based (example: -l vcf.rport.log.gz)\n"; 
  print "-c input TSV config file                   (example: -c config.synteny.tsv)\n";
  print "-o output FASTA file name                  (example: -o out.fasta)\n";
  print "-r master reference genome                 (example: -r Bdis)\n";
  print "-d min depth of called SNPs                (optional, example: -d 3, default -d $mindepth)\n";
  print "-m max missing samples                     (optional, example: -m 10, default -m $maxmissing\n";
  print "-V output VCF file name                    (optional, coordinates from -r genome, example: -f out.vcf)\n";
  print "-1 syntenic coords are 1-based             (optional, 0-based/BED by default, WGA in -c config.synteny.tsv)\n"; 
  print "-p take only polymorphic sites             (optional, by default all sites, constant and SNPs, are taken)\n";
  print "-H take also heterozygous sites            (optional, by default only homozygous, requires -l report from vcf2alignment -H)\n";
  print "-N new temp files, don't re-use            (optional, by default temp files are re-used if available at -t)\n";
  print "-t path to dir for temp file               (optional, default: -t $tmppath)\n";
  print "\nversion: $VERSION\n";
  print "\nPrimary citation: https://www.biorxiv.org/content/10.1101/2025.07.17.665301v1\n";
  exit(0);
} 

if(!defined($opts{'v'})) {
  die "# ERROR: need input VCF file (-v)\n";
} else {
  $filename = $opts{'v'}
}

if(!defined($opts{'c'})) {
  die "# ERROR: need input TSV config file (-c)\n";
} else {
  $configfile = $opts{'c'}
}

if(!defined($opts{'l'})) {
  die "# ERROR: need report log file (-l), generated by vcf2alignment\n";
} else {
  if($opts{'l'} !~ /\.gz$/) {
    die "# ERROR: need valid report log file (-l vcf.report.log.gz)\n";
  } else {
    $prevlogfile = $opts{'l'};
  }
}

if(!defined($opts{'o'})){
  die "# ERROR: need output filename (-o)\n";
} else {
  if(-s $opts{'o'}) {
    die "# ERROR: $opts{'o'} already exists, please choose valid -o filename\n";
  } else {
    $outfilename = $opts{'o'}
  }
}

if(!defined($opts{'r'})){
  die "# ERROR: need master reference genome (-r)\n";
} else {
  $master_ref = $opts{'r'}
}

if(defined($opts{'t'})) {
  $tmppath = $opts{'t'};
  if(!-d $tmppath) {
    die "# ERROR: need valid temp path (-t)\n";
  } else {
    $SORTEXE .= " -T $tmppath ";
  }  
}

if(defined($opts{'d'}) && $opts{'d'} > 0) {
  $mindepth = $opts{'d'}
}

if(defined($opts{'m'}) && $opts{'d'} >= 0) {
  $maxmissing = $opts{'m'}
}

if(defined($opts{'V'})) {
  if(-s $opts{'V'}) {
    die "# ERROR: $opts{'V'} already exists, please choose valid -V filename\n";
  } else {	  
    $outfilename2 = $opts{'V'}
  }
}

if(defined($opts{'p'})) {
  $only_polymorphic = 1
}

if(defined($opts{'H'})) {
  $het = 1
}

if(defined($opts{'N'})) {
  $reuse = 0
}

if(defined($opts{'1'})) {
  $zerobased = 0
}

printf(STDERR "# %s -r %s -v %s -c %s -l %s -o %s -d %d -m %d -V %s -p %d -H %d -t %s -N %d -1 %d\n\n",
  $0,$master_ref,$filename,$configfile,$prevlogfile,$outfilename,
  $mindepth,$maxmissing,$outfilename2,$only_polymorphic,$het,$tmppath,
  !$reuse,!$zerobased);

warn "# version: $VERSION\n\n";

######################################################

my ($n_of_samples,$n_of_loci,$n_var_loci) = (0,0,0);
my ($depthcover,$missing,$genotype,$allele,$dipsp);
my ($allele1,$allele2,$call1,$call2);
my ($corr_coord,$sample,$lastsample,$idx,$lastsampleidx,$file);
my (@samplenames,@MSA,%MSAref,%stats);
my ($snpname,$badSNP,$shortname,%contigstats);
my ($coord,$subgenome,$code,$corr_snpname,$chr,$pos);
my (%corr2snpname,%synfiles);

my $mu = Memory::Usage->new();
$mu->record('start');

# variables to create tied hashes which are BDB under the hood 
my (%synmap_fwd,%synmap_rev,%synmap_master,%refallele);

my ($fhf,$fwd_tmp_file) = tempfile(SUFFIX => '_fwd.coords.db', UNLINK => 1, DIR => $tmppath);
my ($fhr,$rev_tmp_file) = tempfile(SUFFIX => '_rev.coords.db', UNLINK => 1, DIR => $tmppath);
my ($fhm,$mas_tmp_file) = tempfile(SUFFIX => '_mas.coords.db', UNLINK => 1, DIR => $tmppath);
my ($fha,$refa_tmp_file) = tempfile(SUFFIX => '_refallele.db', UNLINK => 1, DIR => $tmppath);

tie(%synmap_fwd,'DB_File',$fwd_tmp_file)
  || die "# EXIT : cannot create $fwd_tmp_file: $! (BerkeleyDB::Error)\n";
tie(%synmap_rev,'DB_File',$rev_tmp_file)
  || die "# EXIT : cannot create $rev_tmp_file: $! (BerkeleyDB::Error)\n"; 
tie(%synmap_master,'DB_File',$mas_tmp_file)
  || die "# EXIT : cannot create $mas_tmp_file: $! (BerkeleyDB::Error)\n";
tie(%refallele,'DB_File',$refa_tmp_file)
  || die "# EXIT : cannot create $refa_tmp_file: $! (BerkeleyDB::Error)\n";

# temp files for multiple sequence alignments
my ($fhmsa, $MSA_tmp_file) = tempfile(SUFFIX => '_msa.txt'      , UNLINK => 1, DIR => $tmppath);
my ($fhmss,$MSA_sort_file) = tempfile(SUFFIX => '_msa.sort.txt' , UNLINK => 1, DIR => $tmppath);

my %vcf_real_names; # To shorten sample names in output alignment
my %mapcoordfiles;  # Files with pre-computed syntenic genomic coords
my %chrcodes;       # Prefixes/regular expressions that connect chromosomes to reference diploid genomes,
                    # must match the master and 2ary references or subgenomes
my %genomic_samples;# Deprecated: Set samples which should not count as missing data.
                    # For instance, we used it to leave outgroups out of these calculations,
                    # as WGS reads are significantly deeper than GBS/RNAseq samples

## parse config file
open(CONFIG,"<",$configfile) || die "# cannot read $configfile\n";
while(my $line = <CONFIG>) {

  # Example:config.synt.tsv file
  # Sample1.sort.bam	Sample1	real_name
  # Sample2.sort.bam	Sample2	real_name
  # GenomeB	GenomeA.GenomeB.coords.bed WGA
  # GenomeA	 SpA(\d+)	chrcode
  # GenomeB      SpB(\d+)	chrcode
  # optionally if outgroup is needed
  # outg-A	/path/to/..	WGA
  # outg-B	/path/to/..	WGA
  # outg-A	Os(\d+)	chrcode
  # outg-B	Tur(\d+)	chrcode

  next if($line =~ m/^#/);
  chomp($line);
  my @cdata = split(/\t/,$line);
  if($cdata[2] eq 'real_name') {
    $vcf_real_names{ $cdata[0] } = $cdata[1];

  } elsif($cdata[2] eq 'deep_sample') { 
    $genomic_samples{ $cdata[0] } = 1

  } elsif($cdata[2] eq 'WGA') {
    $mapcoordfiles{ $cdata[0] } = $cdata[1]; # uncompressed BED    

  } elsif($cdata[2] eq 'chrcode') {
    $chrcodes{ $cdata[0] } = $cdata[1];

  } else { 
    print "# unrecognized configuration: $line\n";
  }
} 
close(CONFIG);

## Read mapcoords files with syntenic coords and get subsets of SNP positions,
## takes three steps carried out with one-liners. Repeat for each 2ary diploid reference, keeping the same master genome.
## NOTE: resulting synteny BED files, after the three steps, contain 1-based genomics coordinates in all cases
my ($grepchr_ref, $grepchr,$onelinercmd, $tmpfile, $tmpfile2);

foreach $dipsp (keys(%mapcoordfiles)) {

  # outgroup is taken care of below	
  next if($dipsp =~ m/^outg/);

  if(!$chrcodes{ $master_ref }) {
    die "# ERROR: cannot find chr regex for $master_ref in $configfile\n";
  } else {
    # get regex prefix before capture (\d+)
    $grepchr_ref = (split(/\(/,$chrcodes{ $master_ref }))[0];
  }

  if(!$chrcodes{ $dipsp }) {
    die "# ERROR: cannot find chr regex for $dipsp in $configfile\n";
  } else {
    # get regex prefix before capture (\d+)
    $grepchr = (split(/\(/,$chrcodes{ $dipsp }))[0];
  }

  # name expected outfiles
  $tmpfile  = $tmppath . "/_list_$dipsp\_positions.coords";
  $tmpfile2 = $tmppath . "/_list_$dipsp\_positions.coords2";
  $synfiles{ $dipsp } = $tmppath . "/_$master_ref.$dipsp.coords.positions.tsv";

  if($reuse && -s $synfiles{ $dipsp }) { 
    warn "# re-using $synfiles{ $dipsp }\n";
    next;
  } else {
    warn "# computing $synfiles{ $dipsp } (3 steps)\n";
  }

  # step-1) Get VCF SNPs mapping diploid species chromosome (1-based)
    # grep "valid locus" ../01_Toy_Bd2plusChr01/SNPs.BRACHY.rm.Bd2plusChr01.SNPs_plus_Constant_DP5MS3.log | \
    # grep -v -P "Bd\d+" | grep Chr | perl -lne 'if(/(Chr\d+)_(\d+)/){ printf("%s.%d\n",$1,$2) }' | \
    # sort -t '.' -k1,1 -k2,2n | uniq > tmpfile 
  
  $onelinercmd = "$ZGREPEXE 'valid locus' $prevlogfile | $GREPEXE -v -P \"$grepchr_ref\\d+\" " .
    "| $GREPEXE -F $grepchr | $PERLEXE -lne 'if(/($grepchr\\d+)_(\\d+)/){ printf(\"%s.%d\n\",\$1,\$2) }' " .
    "| $SORTEXE -t '.' -k1,1 -k2,2n | $UNIQEXE > $tmpfile"; # Chr01.373933

  system("$onelinercmd");
  if(!-s $tmpfile) {
    die "# ERROR: failed while running one-liner1: $onelinercmd\n";  
  }

  # step-2) sort coords from mapcoords synteny BED file to temp file, will correct 0-based coords to 1-based if needed
    # zcat ../Bdistachyon.Bstacei2.1.coords.toy.tsv.gz | \
    # perl -lane '$F[1]+=1; $F[5]+=1; print join(" ",@F[0 .. 8])' | \
    # sort -k5,5 -k6,6n | \
    # perl -lane 'print join(" ",@F[0 .. 3])." $F[4].".join(" ",@F[5 .. 8])' > tmpfile2
  
    #BED: chrA  posA    endA    baseA   strandA chrB    posB    endB    baseB   block   SNP
    #should be shortened to:
    #        chrA  posA    baseA   strandA chrB    posB    baseB   block   SNP

  if($zerobased == 1) {
    $onelinercmd = "$CATEXE $mapcoordfiles{ $dipsp } | $PERLEXE " .
      '-lane \'$F[1]+=1; $F[6]+=1; print join(" ",@F[0 .. 8])\' |' .
      "$SORTEXE -k6,6 -k7,7n | $PERLEXE ".
      '-lane \'print join(" ",@F[0,1,3,4])." $F[5].".join(" ",@F[6,8,9,10])\' ' .
      " > $tmpfile2";

  } else {
    $onelinercmd = "$CATEXE $mapcoordfiles{ $dipsp } | " .
      "$SORTEXE -k6,6 -k7,7n | $PERLEXE ".
      '-lane \'print join(" ",@F[0,1,3,4])." $F[5].".join(" ",@F[6,8,9,10])\' ' .
      "> $tmpfile2";
  } #chrA posA baseA strandA chrB.posB baseB block SNP
  
  system("$onelinercmd");
  if(!-s $tmpfile2) { 
    die "# ERROR: failed while running one-liner2: $onelinercmd\n";
  }  

  # step-3) Get master_ref mappings of diploid species coordinates, will produce 1-based genomic coords
  # Deprecated: $onelinercmd = "$JOINEXE -o \"1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8\" -1 5 -2 1 $tmpfile2 $tmpfile |" .
  # Deprecated: 'perl -plne "s/[\s\.]/\t/g" ' ."> $synfiles{ $dipsp }"; print "$onelinercmd\n";
  # grep -w -F -f tmpfile tmpfile2 | perl -plne "s/[\s\.]/\t/g" > synfile_step3
  
  $onelinercmd = "$GREPEXE -w -F -f $tmpfile $tmpfile2 | $PERLEXE " .
    '-plne "s/[\s\.]/\t/g" ' .
    "> $synfiles{ $dipsp }"; #print "$onelinercmd\n";

  system("$onelinercmd");
  if(!-s $synfiles{ $dipsp }) {
    die "# ERROR: failed while running one-liner3: $onelinercmd\n";
  }

  unlink($tmpfile, $tmpfile2);
}

## read syntenic mapped positions (1-based) 

warn "\n# master reference: $master_ref\n";
warn "# secondary references: ".join(',',sort(keys(%synfiles)))."\n";
warn "# outgroups: ".join(',',sort( grep{/^outg/} keys(%mapcoordfiles)))."\n";
warn "# synteny files (SYNTENYZEROBASED=$zerobased): \n";

foreach my $file (keys(%synfiles)) {
  warn "# $file : $synfiles{$file}...\n";

  my $n_of_positions = 0;
  open(SYNFILE,$synfiles{$file}) || die "# cannot read $synfiles{$file}\n";
  while(<SYNFILE>) {

    #Bd2	51953360	A	-	Chr01	6863391	A

    my @rawdata = split(/\t/,$_);

    $corr_coord = $rawdata[1];
    $coord      = $rawdata[5];

    $snpname = "$rawdata[4]_$coord"; # -> position in secondary reference (i.e GenomeB)

    # record syntenic coords and allele of master subgenome (keys are master)
    $synmap_master{ "$rawdata[0]_$rawdata[1]" } = 1;
    $refallele{ "$rawdata[0]_$rawdata[1]" } = $rawdata[2];

    # record syntenic coords of secondary subgenome (keys are 2ary)
    if($rawdata[3] eq '+'){ 
      $synmap_fwd{ $snpname } = "$rawdata[0]_$corr_coord"; 
      #print "$snpname $synmap_fwd{$snpname}\n";
    }
    else{ 
      $synmap_rev{ $snpname } = "$rawdata[0]_$corr_coord"; 
      #print "$snpname $synmap_rev{$snpname}\n";
    }

    $n_of_positions++;
    #last if($n_of_positions == 1_000_000); # debug
  }
  close(SYNFILE);
  warn "# total positions=$n_of_positions\n";
} 
warn "\n"; 


## check input VCF and choose right way to parse input lines
my ($genomic_samples,$gbs_samples); 
my $master_contigs = ''; 

# open temp file to store SNPs, will be sorted after
open(MSA,">",$MSA_tmp_file) || 
   die "#ERROR: cannot create $MSA_tmp_file\n";

# Example: VCF file and information
# Index        0     1   2  3   4   5    6      7              8      9        n
# Headers      CHROM POS ID REF ALT QUAL FILTER INFO           FORMAT Sample_1 Sample_n
# Position_1   Chr1  25  .  G   C   284  .      DP=45;...      GT:DP  0/0:45   ...
# Position_N   ChrN  N   .  ...

open(VCF,"$RM2LINES $filename |") || die "# ERROR: cannot run $RM2LINES $filename\n";
while(<VCF>) {  
   chomp($_);
   my @rawdata = split(/\t/,$_);

   if($n_of_samples > 0) {

      # skip non-polymorphic (constant) sites, if required
      next if($rawdata[4] eq '.' && $only_polymorphic);

      # skip indels (should have been removed previously)
      next if($rawdata[3] =~ m/[A-Z]{2,}/ || $rawdata[4] =~ m/[A-Z]{2,}/); 
   
      # find out which data fields to parse, 
      # this is done every row to avoid errors in VCF files where format might change
      my @sampledata = split(/:/,$rawdata[$COLUMNFIRSTSAMPLE-1]);
      foreach my $sd (0 .. $#sampledata) {
        if($sampledata[$sd] eq 'GT'){ $GENOTYPECOLUMNFORMAT = $sd; last }
        $sd++;
      }
      foreach my $sd (0 .. $#sampledata) {
        if($sampledata[$sd] eq 'DP'){ $DEPTHCOLUMNFORMAT = $sd; last }
        $sd++;
      } 

      my (@sequence, %nts);
      $sample=$depthcover=$missing=$badSNP=0;
      ($genomic_samples,$gbs_samples) = (0,0);
      foreach $idx ( $COLUMNFIRSTSAMPLE .. $lastsampleidx ) {

        # 0/0:0,255,255:93:0:99 --> homozygous reference (REF) allele
        # 1/1:255,255,0:185:0:99 --> homozygous alternative (ALT) allele
        # 0/1:236,0,237:66:7:9 --> heterozygous

        my @sampledata = split(/:/,$rawdata[$idx]); #print "$rawdata[$idx]\n";
        $genotype = $sampledata[$GENOTYPECOLUMNFORMAT];
        $depthcover = $sampledata[$DEPTHCOLUMNFORMAT]; #die "$genotype $depthcover\n";
        $allele = 'N'; # by default is unknown

        if($genotype eq '0/0' || $genotype eq '0|0') {
          if($depthcover >= $mindepth){ 
            $allele = $rawdata[3]; 
            if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
            else{ $gbs_samples++ }
          } else { 
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
          }
        } elsif($genotype eq '1/1' || $genotype eq '1|1') {
          if($depthcover >= $mindepth){ 
            $allele = (split(/,/,$rawdata[4]))[0]; 
            if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
            else{ $gbs_samples++ }
          } else { 
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
          } 
        } elsif($genotype eq '2/2' || $genotype eq '2|2') {
          if($depthcover >= $mindepth){ 
            $allele = (split(/,/,$rawdata[4]))[1]; 
            if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
            else{ $gbs_samples++ }
          } else { 
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
          }
        } elsif($genotype eq '3/3' || $genotype eq '3|3') {
          if($depthcover >= $mindepth){       
            $allele = (split(/,/,$rawdata[4]))[2]; 
            if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
            else{ $gbs_samples++ }
          } else { 
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
          }
        } else { # heterozygous and/or missing 
              
          # parse heterozygous if requested, note allele will be a dinucleotide             
          if($het == 1 && $genotype =~ m/^(\d+)[\/|](\d+)/) {

            ($call1,$call2) = ($1, $2);
            if($depthcover >= $mindepth){
              if($call1 eq '0') { $allele1 = $rawdata[3] }
              else { 
                $allele1 = (split(/,/,$rawdata[4]))[$call1-1] || 'N' 
              }
              if($call2 eq '0') { $allele2 = $rawdata[3] }
              else { 
                $allele2 = (split(/,/,$rawdata[4]))[$call2-1] || 'N' 
              }
              $allele = $allele1.$allele2 || 'N';  
              if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
              else{ $gbs_samples++ }
            } else {
              if(!$genomic_samples{$samplenames[$sample]}){ $missing++ }
            }

          } else {    
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ }
          }
        }
	
        if($missing > $maxmissing) {
          $badSNP = 1;
          last;
        }   
         
        $sequence[$sample] = $allele;
        if($allele ne 'N') { 
	  foreach $call1 (split(//,$allele)) { $nts{$call1}++ }	
        }

        $sample++;
      } 

      # make sure genomic-only sites are skipped
      if($gbs_samples == 0) {
         $badSNP = 1;
      } 

      # make sure monomorphic sites are skipped (if it was required)
      if(scalar(keys(%nts)) < 2 && $only_polymorphic){ $badSNP = 1 }
     
      if(!$badSNP) {

        if($sample != $n_of_samples) {
          die "# VCF line contains less samples than expected ($sample < $n_of_samples):\n$_\n";
        }

        # check to which subgenome this SNP belongs
        $subgenome = 'NA';
        foreach $code (keys(%chrcodes)) {
          next if($code =~ m/^outg/);
          if($rawdata[0] =~ m/$chrcodes{$code}/) {
            $subgenome = $code;
            last;
          }
        }

        # SNP name in master subgenome
	$snpname = "$rawdata[0]_$rawdata[1]";
	$corr_snpname = $snpname; #default: same SNP name

        if($subgenome ne $master_ref) {

          # translate coords to master frame using syntenic positions
          if($synmap_fwd{$snpname}) { 
            $corr_snpname = $synmap_fwd{$snpname};
            # sequence[sample] set by default earlier
	    $refallele{ $snpname } = $rawdata[3];

          } elsif($synmap_rev{$snpname}) {
            $corr_snpname = $synmap_rev{$snpname};
            foreach $sample (0 .. $lastsample) {
              $sequence[$sample] = $revcomp{$sequence[$sample]}
            }
          }
          else{ next } 

        } else { # make sure only syntenic positions are taken

          next if(!defined($synmap_master{ $snpname }));
        }	

	($chr, $pos) = split(/_/,$corr_snpname); 
        foreach $sample (0 .. $lastsample) {

          # save sequence to MSA temp file
          print MSA "$chr\t$pos\t$sample\t$subgenome\t$sequence[$sample]\n";	

          # save stats of missing data
          if($sequence[$sample] eq 'N'){
            $contigstats{$subgenome}{$chr}{$sample}{'N'}++;
          }
          else{
            $contigstats{$subgenome}{$chr}{$sample}{'SNP'}++;
          }
        }

        if($corr_snpname ne $snpname) {
          $corr2snpname{$corr_snpname} .= "$snpname,";
	  #print "($snpname $corr_snpname -> $corr2snpname{$corr_snpname})\n";
        }
  
        # count total and variable loci	
        if(scalar(keys(%nts)) > 1){ $n_var_loci++ }
        $n_of_loci++;  

        # print to log	
	if($n_of_loci % 100_000 == 0) {
          warn "# number of loci read from VCF: $n_of_loci\n";
	}
      } 

   } elsif($rawdata[0] eq '#CHROM') {
      #CHROM   POS   ID   REF   ALT   QUAL   FILTER   INFO   FORMAT   sample1   sampleN
      push(@samplenames,@rawdata[9 .. $#rawdata]);
      $n_of_samples = scalar(@samplenames);
      $lastsample = $n_of_samples-1;
      $lastsampleidx = $#rawdata;
      printf(STDERR "# number of samples found=$n_of_samples\n");
      printf(STDERR "# parse heterozygous=$het\n");   

   } elsif(/^##contig=<ID=([^,]+),length=\d+>/ && $outfilename2 ne '') {
      if($1 =~ m/$chrcodes{ $master_ref }/) {
        $master_contigs .= $_;
      }
   }
}  
close(VCF);

close(MSA);

## sort SNPs ahead of printing the final multiple sequence alignment (MSA),
## keys in %MSA look like Bd2_44989455-sgBsta-s4

# sort in several steps:
# return SNP name ie Bd2_44989455 , [Note: it will be repeated]
# sort by chr then coord ,
# extract tuple [ Bd2_44989455, 2, 44989455 ] (split(/\-sg/))[0]

print "# sorting SNPs by position ...\n";
$onelinercmd = "$SORTEXE -k1,1 -k2,2n $MSA_tmp_file > $MSA_sort_file";
system("$onelinercmd");
if($? != 0) {
  die "# ERROR: failed sorting SNPs ($onelinercmd)\n"
} else {
  unlink($MSA_tmp_file); # can be large
}

## print sorted valid loci,
## these are syntenic positions that nonetheless can have missing secondary subgenome data
my (%seen);    
open(SORTMSA,"<",$MSA_sort_file) ||
  die "# ERROR: cannot read $MSA_sort_file\n";
while(<SORTMSA>) {
  ($chr,$pos,$sample,$subgenome,$allele) = split(/\t/,$_); 
  $snpname = $chr .'_' . $pos;
  next if(defined($seen{ $snpname }));

  printf(STDERR "# aligned position: %s : %s\n",
    $snpname,
    $corr2snpname{$snpname} || '');
  $seen{ $snpname } = 1;
}
close(SORTMSA);

printf(STDERR "# number of valid loci=$n_of_loci\n");

if(!$only_polymorphic) {
  printf(STDERR "# number of polymorphic loci=$n_var_loci\n");
} warn "\n";



## print sorted SNPs in FASTA and optionally VCF format

my (%tmp_fh, %tmp_fname, %block);
my ($fh, $prev_snpname, $block, $sample_allele);

if($outfilename2 ne '') {
  open(OUTVCF,">",$outfilename2) ||
    die "# ERROR: cannot create $outfilename2\n";

  print OUTVCF <<"END";  
##fileformat=VCFv4.2
$master_contigs
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##vcf2sintenyVersion=$VERSION
END

  print OUTVCF "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
}

# create separate files for outgroup(s) in config file
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);

  my $tmpFASTAfile = "_$dipsp.fasta";
  $tmp_fh{ $dipsp }{ $dipsp } = FileHandle->new(">$tmpFASTAfile");
  $tmp_fname{ $dipsp }{ $dipsp } = $tmpFASTAfile;
  $fh = $tmp_fh{ $dipsp }{ $dipsp };
  print $fh ">$dipsp\n";

  print OUTVCF "\t$dipsp" if($outfilename2 ne '');
}

# open separate files for each sample,subgenome tuple
foreach $sample (0 .. $lastsample) {
  foreach $subgenome (sort keys(%chrcodes)) {
    next if($subgenome =~ m/^outg/);

    my $tmpFASTAfile = "_$sample.$subgenome.fasta";

    # open file and save filehandle in hash
    $tmp_fh{ $sample }{ $subgenome } = FileHandle->new(">$tmpFASTAfile");
    $tmp_fname{ $sample }{ $subgenome } = $tmpFASTAfile;
    $fh = $tmp_fh{ $sample }{ $subgenome };

    # print FASTA header
    $shortname = $vcf_real_names{$samplenames[$sample]} || $samplenames[$sample];
    print $fh ">$shortname\_$subgenome\n";

    # print VCF samples
    print OUTVCF "\t$shortname\_$subgenome" if($outfilename2 ne '');
  }
}

print OUTVCF "\n" if($outfilename2 ne '');

# if required, parse WGA bedfiles of outgroups in config file
my (%outgcall);

# those BED files look like this:
## chrA  posA    endA    baseA   strandA chrB    posB    endB    baseB   block   SNP
#Bd1     64316346        64316347        G       -       Os03    10616195        10616196        G       19      .
#Bd1     64316338        64316339        A       -       Os03    10616203        10616204        G       19      SNP
#Bd2     51368823        51368824        T       +       Os01    33989953        33989954        T       32      .
#Bd2     51368826        51368827        A       +       Os01    33989956        33989957        T       32      SNP
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);

  open(BED,"<",$mapcoordfiles{ $dipsp }) || 
     die "# ERROR: cannot read $mapcoordfiles{ $dipsp }\n";
  while(<BED>) {
    next if(/^#/);
    my @data = split(/\t/,$_);
    # add +1 to match VCF coords
    if($data[4] eq '+') { 
      $outgcall{ $dipsp }{ sprintf("%s_%d",$data[0], $data[2]+1) } = $data[8];

    } elsif(defined( $revcomp{ $data[8] } )) {
      $outgcall{ $dipsp }{ sprintf("%s_%d",$data[0], $data[2]+1) } = $revcomp{ $data[8] };
    } else {
       $outgcall{ $dipsp }{ sprintf("%s_%d",$data[0], $data[2]+1) } = 'N';
    }
  }
  close(BED);
}

# parse sorted SNPs, one block at a time,
# two example blocks shown below
#Bd2     48166443        0       Bsta    N
#Bd2     48166443        1       Bsta    N
#Bd2     48166443        2       Bsta    C
#Bd2     48166443        3       Bsta    N
#Bd2     48166443        4       Bsta    C
#Bd2     48166443        5       Bsta    C
##
#Bd2     48166444        0       Bsta    N
#Bd2     48166444        1       Bsta    N
#Bd2     48166444        2       Bsta    T
#Bd2     48166444        3       Bsta    N
#...

$prev_snpname = '';

open(SORTMSA,"<",$MSA_sort_file) ||
  die "# ERROR: cannot read $MSA_sort_file\n";
while(<SORTMSA>) {
  chomp;
  ($chr,$pos,$sample,$subgenome,$allele) = split(/\t/,$_);
  $snpname = $chr .'_' . $pos; #next if($snpname ne 'Bd2_48166445');

  # previous block complete (samples only)
  if($snpname ne $prev_snpname && $prev_snpname ne '') { 

    # process outgroups
    my @calls;

    foreach $dipsp (keys(%mapcoordfiles)) {
      next if($dipsp !~ m/^outg/);

      $fh = $tmp_fh{ $dipsp }{ $dipsp };
      $sample_allele =  $outgcall{ $dipsp }{ $snpname } || 'N';
      print $fh $sample_allele;

      if($outfilename2 ne '') {
        push(@calls, $sample_allele) if($sample_allele eq 'N');
      }
    }  

    # process previous block, fill missing data with Ns
    foreach $sample (0 .. $lastsample) {
      foreach $subgenome (sort keys(%chrcodes)) {
        next if($subgenome =~ m/^outg/);
        $fh = $tmp_fh{ $sample }{ $subgenome };	      

        if(defined($block{$sample}{$subgenome}) && 
          length($block{$sample}{$subgenome}) > 1) {
          $sample_allele = $IUPACdegen{ $block{$sample}{$subgenome} } || 'N';
	  #print "$block{$sample}{$subgenome} -> $sample_allele\n";
        } else {
          $sample_allele = $block{$sample}{$subgenome} || 'N';
        }

        print $fh $sample_allele;     

        if($outfilename2 ne '') {
          push(@calls, $block{$sample}{$subgenome} || 'N');
        }

	$stats{$sample}{$subgenome}{'total'}++;
        if($sample_allele ne 'N') 
        {
          $stats{$sample}{$subgenome}{'noN'}++
        }	
      }
    }

    # compute ALT and allele numbers for VCF format
    if($outfilename2 ne '') { 
  
      my ($ALT,$ref_array_genotypes) = calls2alleles($refallele{$snpname}, \@calls);

      print OUTVCF "$chr\t$pos\t$corr2snpname{$snpname}\t$refallele{$snpname}\t$ALT\t.\t.\t.\tGT";

      foreach $allele2 (@$ref_array_genotypes) { 
        print OUTVCF "\t$allele2";
      } print OUTVCF "\n"; #"\t".join(',',@calls).";$refallele{$snpname}\n";
    }

    %block = ();
  }

  # add SNP data to the current block
  $block{$sample}{$subgenome} = $allele;

  # record name of previous SNP 
  $prev_snpname = $snpname;
}
close(SORTMSA);

# take care of last block
my @calls;

# outgroups
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);

  $fh = $tmp_fh{ $dipsp }{ $dipsp };
  $sample_allele =  $outgcall{ $dipsp }{ $snpname } || 'N';
  print $fh $sample_allele;

  if($outfilename2 ne '') {
    push(@calls, $sample_allele) if($sample_allele eq 'N');
  }
}

# samples    
foreach $sample (0 .. $lastsample) {
  foreach $subgenome (sort keys(%chrcodes)) {
    next if($subgenome =~ m/^outg/);
    $fh = $tmp_fh{ $sample }{ $subgenome };
    $sample_allele = $block{$sample}{$subgenome} || 'N';

    if(defined($block{$sample}{$subgenome}) && 
      length($block{$sample}{$subgenome}) > 1) {
      $sample_allele = $IUPACdegen{ $block{$sample}{$subgenome} } || 'N';
    } else {
      $sample_allele = $block{$sample}{$subgenome} || 'N';
    }

    print $fh $sample_allele;

    if($outfilename2 ne '') {
      push(@calls, $block{$sample}{$subgenome} || 'N');
    }

    $stats{$sample}{$subgenome}{'total'}++;
    if($sample_allele ne 'N')
    {
      $stats{$sample}{$subgenome}{'noN'}++
    }
  }
}

# compute ALT and allele numbers for VCF format
if($outfilename2 ne '') {

  my ($ALT,$ref_array_genotypes) = calls2alleles($refallele{$snpname}, \@calls);

  print OUTVCF "$chr\t$pos\t$corr2snpname{$snpname}\t$refallele{$snpname}\t$ALT\t.\t.\t.\tGT";

  foreach $allele2 (@$ref_array_genotypes) {
    print OUTVCF "\t$allele2";
  } print OUTVCF "\n";
}



## add final newline, print stats & close FASTA temp files 
foreach $sample (0 .. $lastsample) {
  foreach $subgenome (sort keys(%chrcodes)) {
    next if($subgenome =~ m/^outg/);
    $fh = $tmp_fh{ $sample }{ $subgenome };
    print $fh "\n";
    $fh->close();

    # stats
    $shortname = $vcf_real_names{$samplenames[$sample]} || $samplenames[$sample];
    printf(STDERR "# %s_%s variants: %d / %d\n",
      $shortname,
      $subgenome,
      $stats{$sample}{$subgenome}{'noN'} || 0,
      $stats{$sample}{$subgenome}{'total'});
  }
}

# outgroups
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);

  $fh = $tmp_fh{ $dipsp }{ $dipsp };
  print $fh "\n";
  $fh->close(); 
}

## concat temp FASTA files and produce single output FASTA file
print "# concatenating temp files ...\n";
$onelinercmd = "$CATEXE ";

# outgroups ahead as in VCF
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);
  $onelinercmd .= " $tmp_fname{ $dipsp }{ $dipsp } ";
}

foreach $sample (0 .. $lastsample) {
  foreach $subgenome (sort keys(%chrcodes)) {
    next if($subgenome =~ m/^outg/);	  
    $onelinercmd .= "$tmp_fname{ $sample }{ $subgenome } "
  }
}

$onelinercmd .= " > $outfilename";
system("$onelinercmd");
if($? != 0) {
  die "# ERROR: failed concatenating temp files ($onelinercmd)\n"
}

# untie hashes and delete temp files
untie(%synmap_fwd);
untie(%synmap_rev);
untie(%synmap_master);
untie(%refallele);

foreach $sample (0 .. $lastsample) {
  foreach $subgenome (sort keys(%chrcodes)) {
    next if($subgenome =~ m/^outg/);
    unlink($tmp_fname{ $sample }{ $subgenome })
  }
}

# outgroups
foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp !~ m/^outg/);
  unlink( $tmp_fname{ $dipsp }{ $dipsp } );
}

foreach $dipsp (keys(%mapcoordfiles)) {
  next if($dipsp =~ m/^outg/);
  unlink( $synfiles{ $dipsp } );
}

# summary of resources
$mu->record('end');

printf("\n\n\n# time used (s): %d memory used (Mb): %2.1f\n",
  get_time_RAM($mu->report()));


################################

# Takes: i) REF allele char, ii) reference to array of calls ['A','N','TC',..]
# Returns: i) ALT allele(s) string, ii) reference to array of genotypes ['1/1','./.','3/2',..]
sub calls2alleles {

  my ($REF, $ref_array_calls) = @_;

  my ($allele,$call1,$call2);
  my ($ALT,%nts,%nt2num,@genotypes) = ('');

  foreach $allele (@$ref_array_calls) {
    while($allele =~ m/([ACGT])/g) { $nts{ $1 } = 1 }
  }

  # REF allele in VCF contig should be 1st
  $nt2num{ $REF } = 0;

  # now add all other sorted alleles
  foreach $allele (sort keys(%nts)) {
    next if($allele eq $REF);
    $nt2num{ $allele } = scalar(keys(%nt2num));
    $ALT .= "$allele,";
  } chop($ALT);

  if($ALT eq '') {
    $ALT = $REF;
  }

  foreach $allele (@$ref_array_calls) {
    if($allele eq 'N') {
       push(@genotypes, "./.")

    } elsif(length($allele) == 1) {
       push(@genotypes, sprintf("%s/%s",$nt2num{ $allele },$nt2num{ $allele }))

    } else {
       ($call1,$call2) = split(//,$allele);
       push(@genotypes, sprintf("%s/%s",$nt2num{ $call1 },$nt2num{ $call2 }))
    }
  }

  return ($ALT, \@genotypes)
}
