#!/usr/bin/env perl
use 5.012;
use warnings;
use File::Basename;
use Getopt::Long;
use Data::Dumper;
use FindBin qw($RealBin);
use File::Basename;
use File::Spec;
use File::Spec::Functions;
use File::Copy;
use Term::ANSIColor qw(color);
use FASTX::Reader;
use FASTX::ScriptHelper;
use JSON::PP;
use Digest::MD5 qw(md5_hex);
use File::Temp;
use Pod::Usage;


=pod

=head1 NAME

B<dadaist2-dada2fasta> - a program to process the feature table generated by DADA2
(that uses the sequences as feature names) and saves it as feature table (using
progressive feature names, or the MD5 of the sequences).

=head1 AUTHOR

Andrea Telatin <andrea.telatin@quadram.ac.uk>

=head1 SYNOPSIS

  dadaist2-dada2fasta  -i dada2table.tsv -o table.tsv -r repseqs.fasta

=cut


# Program identity: printed by --version and used in error messages.
my $VERSION = '1.0.0';
my $PROGRAM = basename($0);

# Command line options without a default (the first three are required,
# which is enforced after parsing).
my $opt_input;
my $opt_out_table;
my $opt_out_fasta;
my $opt_version;
my $opt_help;

# Command line options with a default.
my $opt_pattern    = '_R1.fastq.gz';  # removed from the table header line
my $opt_seq_prefix = 'MD5';           # 'MD5' names features by sequence digest

my $options_parsed = GetOptions(
  'i|input=s'         => \$opt_input,
  'o|output-table=s'  => \$opt_out_table,
  'r|rep-seqs=s'      => \$opt_out_fasta,
  's|strip-pattern=s' => \$opt_pattern,
  'p|otu-prefix=s'    => \$opt_seq_prefix,
  'version'           => \$opt_version,
  'h|help'            => \$opt_help,
);
die " Parameters error. Check $PROGRAM --help\n" unless $options_parsed;

# Both of these terminate the program when triggered.
version() if $opt_version;
pod2usage({-exitval => 0, -verbose => 2}) if $opt_help;

=pod

=head1 PARAMETERS

=head2 Main Parameters

=over 4

=item I<-i>, I<--input> FILE

Output produced by DADA2 (feature table tsv).

=item I<-o>, I<--output-table> FILE

Output feature table

=item I<-r>, I<--rep-seqs> FILE

Fasta output with the representative sequences.

=item I<-s>, I<--strip-pattern> STR

Remove from the sample names this string, usually found as
filename suffix (default: _R1.fastq.gz)

=item I<-p>, I<--otu-prefix> STR

Prefix used to name the representative sequences, followed by a
progressive number (example: ASV). By default (MD5) each sequence
is named after the MD5 digest of the sequence itself.

=back

=cut

# --- Parameter validation -------------------------------------------------
# Input table, output table and output FASTA are all mandatory.
if (not defined $opt_input or not defined $opt_out_fasta or not defined $opt_out_table) {
  say STDERR "Missing required parameters: -i INPUT -r REPSEQ.fasta -o TABLE.tsv";
  # A usage error must be reported as a failure to the calling shell/pipeline.
  exit 1;
}
if (! -e $opt_input) {
  die "ERROR: Input file not found: $opt_input\n";
}

# Column names of the (last seen) header line, after pattern stripping.
my @header = ();

# --- Open input and outputs -----------------------------------------------
# Low-precedence 'or' is required here: with '||' the die would bind to the
# (always true) filename string and a failed open() would go unnoticed.
open(my $DADA_TSV,   '<', $opt_input)
  or die " ERROR:\n Unable to open <$opt_input>: $!\n";
open(my $dada2_FO,   '>', $opt_out_table)
  or die " ERROR:\n Unable to write to <$opt_out_table>: $!\n";
open(my $repseqs_FO, '>', $opt_out_fasta)
  or die " ERROR:\n Unable to write to <$opt_out_fasta>: $!\n";

my $feature_counter = 0;

# --- Create OTU Table and FASTA file from TSV -----------------------------
# Header lines (starting with '#') are copied to the output table after
# removing the strip pattern. Every other line is "SEQUENCE<TAB>counts...":
# the sequence is written to the FASTA file and replaced in the table by
# its new name.
while ( my $line = readline($DADA_TSV) ) {
  # Remove the line terminator (LF or CRLF) and ignore blank lines, which
  # would otherwise be turned into a feature with an empty sequence.
  $line =~ s/\r?\n\z//;
  next if $line =~ /^\s*$/;

  if ($line =~ /^#/) {
    # The pattern is a literal string (see --strip-pattern), so it is quoted
    # with \Q..\E. The length check matters: an empty pattern would make Perl
    # reuse the last successful regex (/^#/) and strip the leading '#'.
    $line =~ s/\Q$opt_pattern\E//g if length $opt_pattern;
    say {$dada2_FO} $line;
    @header = split /\t/, $line;
  } else {
    $feature_counter++;
    my ($sequence, @values) = split /\t/, $line;
    checksequence($sequence);   # exits the program if not a nucleotide string

    my $comment = '';
    my $name;
    if ($opt_seq_prefix eq 'MD5') {
      $name = md5_hex($sequence);
    } else {
      $name = $opt_seq_prefix . $feature_counter;
      #$comment = "\t" . $taxonomy[$feature_counter - 1] if ($chooser->{'database'} eq 'dada2'); ##TODO
    }

    say {$repseqs_FO} '>' , $name, $comment, "\n", $sequence;
    say {$dada2_FO} $name, "\t", join("\t", @values);
  }
}

# --- Finalise ---------------------------------------------------------------
# Buffered write errors (e.g. disk full) only surface when the handle is
# closed, so the outputs are closed explicitly and checked.
close $DADA_TSV;
close $dada2_FO
  or die " ERROR:\n Unable to write to <$opt_out_table>: $!\n";
close $repseqs_FO
  or die " ERROR:\n Unable to write to <$opt_out_fasta>: $!\n";


# Print "<program name> v<version>" on standard output, then terminate
# the program with a success status. Never returns to the caller.
sub version {
	printf "%s v%s\n", $PROGRAM, $VERSION;
	exit 0;
}

# Check that a string contains only nucleotide characters.
#
# Parameter: $seq - the string to check.
# Returns a false value when every character is A, C, G, T or N (in either
# case); an empty string is accepted too. Otherwise prints an error message
# to STDERR and terminates the program with exit status 1.
sub checksequence {
  my ($seq) = @_;

  # Guard clause: nothing outside the allowed alphabet, nothing to report.
  return 0 unless $seq=~/[^ACGTN]/i;

  print STDERR "ERROR: <$seq> is not a sequence.\n";
  exit 1;
}

=pod

=head1 SOURCE CODE AND DOCUMENTATION

The program is freely available at L<https://quadram-institute-bioscience.github.io/dadaist2>
released under the MIT licence. The website contains the full I<DOCUMENTATION> and we recommend 
checking for updates.

=cut
