#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Std;
use FindBin '$Bin';
use lib "$Bin/lib";
use Memory::Usage;
use Resources qw/ get_time_RAM /;

# Converts an input VCF file, with might be GZIP/BZIP2 compressed, into a  
# multiple sequence alignment (MSA) and produces a compressed list of 1-based 
# valid homozygous sites (-l) considering read depth (-d) and missing data (-m).
#
# Optionally it can output the MSA in several supported formats 
# (check @validformats below).

# Note: Chromosome names of genomes in VCF file must be different.

# Bruno Contreras Moreira, Ruben Sancho EEAD-CSIC 2014-2025

my %IUPACdegen = (  
  'AG'=>'R', 'GA'=>'R', 'CT'=>'Y', 'TC'=>'Y',
  'CG'=>'S', 'GC'=>'S', 'AT'=>'W', 'TA'=>'W',
  'GT'=>'K', 'TG'=>'K', 'AC'=>'M', 'CA'=>'M' );

my @validformats = qw( phylip nexus fasta );

# VCF thresholds for filtering and output options
my $MINDEPTHCOVERPERSAMPLE = 3;       # integer, min read depth at each position for each sample
my $MAXMISSINGSAMPLES      = 10;      # integer, max number of missing samples accepted per locus (VCF row)
my $ONLYPOLYMORPHIC        = 0;       # output option: 0 (constant and SNPs) or 1 (only SNPs). Zero is recommended at this stage
my $OUTFILEFORMAT          = 'fasta'; # other formats in @validformats (phylip, nexus or fasta)

# first guess of key VCF columns, adjusted in real time below
my $COLUMNFIRSTSAMPLE      = 9; # integer, Column number (0/1-based) with the first sample in the VCF file
my $GENOTYPECOLUMNFORMAT   = 0; # column count 0-based in the VCF file
my $DEPTHCOLUMNFORMAT      = 1; # column count 0-based in the VCF file

# edit if they are installed elsewhere and not in path
my $GZIPEXE  = 'gzip'; 

# utilities shipping with script
my $RM2LINES = "$Bin/utils/rm_double_lines.pl";

# read version.txt
my $VERSION = 'XXXX';
open(VERSION,"$Bin/version.txt");
while(<VERSION>) {
  if(/^(\S+)/){ $VERSION = $1 }
}
close(VERSION);

my %opts;
my ($filename,$configfile,$reportfilename,$outfilename,$het) = ('','','','',0);
my ($raw_reportfilename);
my ($mindepth,$maxmissing,$only_polymorphic,$outformat) = 
  ($MINDEPTHCOVERPERSAMPLE,$MAXMISSINGSAMPLES,$ONLYPOLYMORPHIC,$OUTFILEFORMAT);

getopts('hHpv:c:o:l:d:m:f:', \%opts);

if(($opts{'h'})||(scalar(keys(%opts))==0)) {
  print "\nusage: $0 [options]\n\n";
  print "-h this message\n";
  print "-v input VCF file                                  (example: -v data.vcf.gz)\n";
  print "-c input TSV config file                           (example: -c config.tsv)\n";
  print "-l output report file name, 1-based coordinates    (example: -l vcf.report.log.gz)\n";  
  print "-o output MSA file name                            (optional, example: -o out.fasta)\n";
  print "-d min read depth at each position for each sample (optional, example: -d 3, default -d $mindepth,\n"; 
  print "                                                              use -d 0 if VCF file lacks DP)\n";
  print "-m max missing samples                             (optional, example: -m 10, default -m $maxmissing\n";
  print "-f output format                                   (optional, example: -f nexus, default -f $outformat)\n";
  print "-p take only polymorphic sites                     (optional, by default all sites, constant and SNPs, are taken)\n";
  print "-H take also heterozygous sites                    (optional, by default only homozygous are taken)\n";
  print "\nversion: $VERSION\n";
  print "\nPrimary citation: https://www.biorxiv.org/content/10.1101/2025.07.17.665301v1\n";
  exit(0);
}

if(!defined($opts{'v'})) {
  die "# ERROR: need input VCF file (-v)\n";
} else {
  $filename = $opts{'v'}
}

if(!defined($opts{'c'})) {
  die "# ERROR: need input TSV config file (-c)\n";
} else {
  $configfile = $opts{'c'}
}

if(!defined($opts{'l'})) {
  die "# ERROR: need valid output filename (-l vcf.report.log.gz)\n";
} else {
  if($opts{'l'} !~ /\.gz$/) {
    die "# ERROR: need valid output filename (-l vcf.report.log.gz)\n";
  } else {	  
    $reportfilename = $opts{'l'};
    $raw_reportfilename = $reportfilename;
    $raw_reportfilename =~ s/\.gz$//;
  }  
}

if(defined($opts{'o'})) {
  $outfilename = $opts{'o'}
}

if(defined($opts{'d'}) && $opts{'d'} >= 0) {
  $mindepth = $opts{'d'}
}

if(defined($opts{'m'}) && $opts{'d'} >= 0) {
  $maxmissing = $opts{'m'}
}

if(defined($opts{'f'}) && grep(/^$opts{'f'}$/,@validformats)) {
  $outformat = $opts{'f'}
}

if(defined($opts{'p'})) {
  $only_polymorphic = 1
}

if(defined($opts{'H'})) {
  $het = 1
}

warn "# $0 -v $filename -c $configfile -l $reportfilename -o $outfilename ".
  "-d $mindepth -m $maxmissing -f $outformat -p $only_polymorphic -H $het\n\n";

warn "# version: $VERSION\n\n";

######################################################

my ($n_of_samples,$n_of_loci,$n_var_loci) = (0,0,0);
my ($depthcover,$missing,$genotype,$allele,$allele1,$allele2,$call1,$call2);
my ($corr_coord,$sample,$lastsample,$idx,$lastsampleidx,$file,$snpname,$badSNP,$shortname);
my (@samplenames,@MSA,%MSAref,%stats,%contigstats);
my %vcf_real_names; # To shorten sample names in output alignment
my %genomic_samples;# Deprecated: Set samples which should not count as missing data.
                    # For instance, we used it to leave outgroups out of these calculations,
                    # as WGS reads are significantly deeper than GBS/RNAseq samples 

my $mu = Memory::Usage->new();
$mu->record('start');

## parse config file
open(CONFIG,"<$configfile") || die "# cannot read $configfile\n";
while(my $line = <CONFIG>) {
  # Example:config.tsv file
  # sample1.sort.bam	Sample1	real_name
  # sample1.sort.bam	Sample2	real_name
  # sampleN.sort.bam	SampleN	real_name
  
  chomp($line);
  my @cdata = split(/\t/,$line);
  if($cdata[2] eq 'real_name')   {
    $vcf_real_names{ $cdata[0] } = $cdata[1];

  } elsif($cdata[2] eq 'deep_sample') {
    $genomic_samples{ $cdata[0] } = 1

  } else {
    print "# unrecognized configuration: $line\n";
  }
}
close(CONFIG);


## create report file
open(LOG,">",$raw_reportfilename) || die "# ERROR: cannot create $raw_reportfilename\n";

## parse input VCF file
my ($genomic_samples,$gbs_samples); 

# Example: VCF file and information
# Index        0     1   2  3   4   5    6      7              8      9        n
# Headers      CHROM POS ID REF ALT QUAL FILTER INFO           FORMAT Sample_1 Sample_n
# Position_1   Chr1  25  .  G   C   284  .      DP=45;MQ0F=0;... GT:DP  0/0:45   ...
# Position_N   ChrN  N   .  ...

open(VCF,"$RM2LINES $filename |") || die "# ERROR: cannot run $RM2LINES $filename\n";
while(<VCF>) {  
  chomp($_);
  my @rawdata = split(/\t/,$_);

  if($n_of_samples > 0) {

    # skip non-polymorphic (constant) sites, if required
    next if($rawdata[4] eq '.' && $only_polymorphic);

    # skip indels (should have been removed previously)
    next if($rawdata[3] =~ m/[A-Z]{2,}/ || $rawdata[4] =~ m/[A-Z]{2,}/); 
  
    # find out which data fields to parse ie GT, DP
    my @sampledata = split(/:/,$rawdata[$COLUMNFIRSTSAMPLE-1]);
    foreach my $sd (0 .. $#sampledata) {
      if($sampledata[$sd] eq 'GT'){ $GENOTYPECOLUMNFORMAT = $sd; last }
      $sd++;
    }
    if($mindepth > 0) {
      foreach my $sd (0 .. $#sampledata) {
        if($sampledata[$sd] eq 'DP'){ $DEPTHCOLUMNFORMAT = $sd; last }
        $sd++;
      }	
    } 

    my (@sequence, %nts); 
    $sample=$depthcover=$missing=$badSNP=0;
    ($genomic_samples,$gbs_samples) = (0,0);
    foreach $idx ( $COLUMNFIRSTSAMPLE .. $lastsampleidx ) {
      # 0/0:0,255,255:93:0:99 --> homozygous reference (REF) allele
      # 1/1:255,255,0:185:0:99 --> homozygous alternative (ALT) allele
      # 0/1:236,0,237:66:7:9 --> heterozygous

      my @sampledata = split(/:/,$rawdata[$idx]); 
      $genotype = $sampledata[$GENOTYPECOLUMNFORMAT];
      if($mindepth > 0) {   
        $depthcover = $sampledata[$DEPTHCOLUMNFORMAT]; 

      } else { # in case depth is not required
        $depthcover = 0;
      }	      
      $allele = 'N'; # default 
      
      if($genotype eq '0/0' || $genotype eq '0|0') {
        if($depthcover >= $mindepth){ 
          $allele = $rawdata[3]; 
          if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
          else{ $gbs_samples++ }
        } else { 
          if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
        }
      } elsif($genotype eq '1/1' || $genotype eq '1|1') {
        if($depthcover >= $mindepth){ 
          $allele = (split(/,/,$rawdata[4]))[0]; 
          if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
          else{ $gbs_samples++ }
        } else { 
          if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
        } 
      } elsif($genotype eq '2/2' || $genotype eq '2|2') {
        if($depthcover >= $mindepth){ 
          $allele = (split(/,/,$rawdata[4]))[1]; 
          if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
          else{ $gbs_samples++ }
        } else { 
          if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
        }
      } elsif($genotype eq '3/3' || $genotype eq '3|3') {
        if($depthcover >= $mindepth){     
          $allele = (split(/,/,$rawdata[4]))[2]; 
          if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
          else{ $gbs_samples++ }
        } else { 
          if(!$genomic_samples{$samplenames[$sample]}){ $missing++ } 
        }

      } else { # heterozygous and/or missing 
	      
        # parse heterozygous if requested	      
        if($het == 1 && $genotype =~ m/^(\d+)[\/|](\d+)/) {

          ($call1,$call2) = ($1, $2);
          if($depthcover >= $mindepth){
            if($call1 eq '0') { $allele1 = $rawdata[3] }
            else { 
              $allele1 = (split(/,/,$rawdata[4]))[$call1-1] || 'N' 
            }
            if($call2 eq '0') { $allele2 = $rawdata[3] }
            else { 
              $allele2 = (split(/,/,$rawdata[4]))[$call2-1] || 'N' 
            }
            $allele = $IUPACdegen{ $allele1.$allele2 } || 'N'; 
            if(defined($genomic_samples{$samplenames[$sample]})){ $genomic_samples++ }
            else{ $gbs_samples++ }
          } else {
            if(!$genomic_samples{$samplenames[$sample]}){ $missing++ }
          }

	} else {    
          if(!$genomic_samples{$samplenames[$sample]}){ $missing++ }
        }
      }
  
      if($missing > $maxmissing) {
        $badSNP = 1;
        last;
      }  
      
      $sequence[$sample] = $allele;
      if($allele ne 'N'){ $nts{$allele}++ }

      $sample++;
    } 

    # make sure genomic-only sites are skipped
    if($gbs_samples == 0) {
      $badSNP = 1;
    } 

    # make sure monomorphic sites are skipped (if required)
    if(scalar(keys(%nts)) < 2 && $only_polymorphic){ $badSNP = 1 }
    
    if(!$badSNP) {
      if($sample != $n_of_samples) {
        die "# VCF line contains less samples than expected ($sample < $n_of_samples):\n$_\n";
      }
    
      $snpname = "$rawdata[0]_$rawdata[1]";
     
      # valid locus = valid row in VCF file, these are 1-based genomic coordinates straight from VCF
      printf(LOG "# valid locus: $snpname $missing ".join(',',keys(%nts))."\n");
      
      foreach $sample (0 .. $lastsample) {
        $MSA[$sample] .= $sequence[$sample]; 

        # save stats of missing data
        if($sequence[$sample] eq 'N') {
          $stats{$sample}{'totalNs'}++;
          $contigstats{$sample}{$rawdata[0]}{'N'}++;

        } else {
          $stats{$sample}{'total'}++;
          $contigstats{$sample}{$rawdata[0]}{'SNP'}++;
        }  
      }

      if(scalar(keys(%nts)) > 1){ $n_var_loci++ }
      $n_of_loci++;
    }  
  }
  elsif($rawdata[0] eq '#CHROM') {
    #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  sample1  sampleN
    push(@samplenames,@rawdata[9 .. $#rawdata]);
    $n_of_samples = scalar(@samplenames);
    $lastsample = $n_of_samples-1;
    $lastsampleidx = $#rawdata;
    printf(LOG "# number of samples found=$n_of_samples\n");
    printf(LOG "# parse heterozygous=$het\n");
  }
}  
close(VCF);

printf(LOG "# number of valid loci=$n_of_loci\n");
if(!$only_polymorphic) {
  printf(LOG "# number of polymorphic loci=$n_var_loci\n");  
} warn "\n";

if($outfilename ne '') {
  open(OUTFILE,">$outfilename") || die "# cannot create output file $outfilename, exit\n";

  if($outformat eq 'nexus') {
    print OUTFILE "#NEXUS\nBEGIN DATA;\nDIMENSIONS  NTAX=$n_of_samples NCHAR=$n_of_loci;\n";
    print OUTFILE "FORMAT DATATYPE=DNA  MISSING=N GAP=-;\nMATRIX\n";
  }
  elsif($outformat eq 'phylip') {
    print OUTFILE "$n_of_samples    $n_of_loci\n";    
  }
}

foreach $sample (0 .. $lastsample) {
  if($vcf_real_names{$samplenames[$sample]}){
    $shortname = $vcf_real_names{$samplenames[$sample]} 
  } 
  else{ $shortname = $samplenames[$sample]; }

  if($outformat eq 'nexus') {
    print OUTFILE "$shortname $MSA[$sample]\n" if($outfilename ne '');

  } elsif($outformat eq 'phylip') {
    if(length($shortname)>10){ 
      $shortname = substr($shortname,0,9).'_'; # prefix
      #$shortname = '_'.substr($shortname,-9); # suffix
      printf(LOG "# phylip sample name shortened: $samplenames[$sample] -> $shortname\n");
    }
    print OUTFILE "$shortname    $MSA[$sample]\n"  if($outfilename ne '');

  } elsif($outformat eq 'fasta' && $outfilename ne '') {
    print OUTFILE ">$shortname\n";
    print OUTFILE "$MSA[$sample]\n";
  }
} 

if($outfilename ne '') {
  if($outformat eq 'nexus') {
    print OUTFILE ";\nEND;\n";
  }

  close(OUTFILE);
}

printf(LOG "\n\n# stats (#SNPs):\n");

foreach $sample (0 .. $lastsample) {
  printf(LOG "$samplenames[$sample] : $stats{$sample}{'total'}\n");
} 

printf(LOG "\n# stats per contig/chr (#SNPs):\n");

foreach $sample (0 .. $lastsample) {
  foreach my $contig (sort keys(%{$contigstats{$sample}})) {
    printf(LOG "%s\t%s\t%d\n",
      $samplenames[$sample],
      $contig,
      $contigstats{$sample}{$contig}{'SNP'} || 0);
  }
}

printf(LOG "\n# stats (#N):\n");

foreach $sample (0 .. $lastsample) {
  printf(LOG "$samplenames[$sample] : $stats{$sample}{'totalNs'}\n");
}

printf(LOG "\n# stats per contig/chr (#N):\n");

foreach $sample (0 .. $lastsample) {
  foreach my $contig (sort keys(%{$contigstats{$sample}})) {
    printf(LOG "%s\t%s\t%d\n",
      $samplenames[$sample],$contig,$contigstats{$sample}{$contig}{'N'} || 0);
  }
}

close(LOG);

# compress report log, these files can be > 200GB in wheat!
system("$GZIPEXE $raw_reportfilename");
if($? != 0) {
  die "# EXIT: failed while compressing $raw_reportfilename\n";
}


# summary of resources
$mu->record('end');

printf("\n\n\n# time used (s): %d memory used (Mb): %2.1f\n",
  get_time_RAM($mu->report()));
