#!/usr/bin/env perl
#ABSTRACT: Assign taxonomy using R Decipher


use 5.012;
use warnings;
use Pod::Usage;
my $VERSION = "1.1.3";
my %methods_avail = (
	'DECIPHER' => 1, 
	'DADA2' => 1
);
use Getopt::Long;
use Data::Dumper;
use FindBin qw($RealBin);
use File::Basename;
use File::Spec;
use File::Spec::Functions;
use File::Copy;
use Term::ANSIColor qw(color);
use FASTX::Reader;
use FASTX::ScriptHelper;
use JSON::PP;
use File::Temp;


my (
	$opt_inputfile,
	$opt_outdir,
	$opt_reference,
	$opt_version, $opt_help, $opt_verbose, $opt_force, $opt_underscore, $opt_fasta
);
my $opt_method = 'DECIPHER';
my $opt_confidence = 70;
my $opt_threads = 1;
my $DecipherTaxScript = File::Spec->catfile($RealBin, 'D2-decipher.R');
my $DadaTaxScript = File::Spec->catfile($RealBin, 'D2-dada-taxonomy.R');
GetOptions(
	'm|method=s'     => \$opt_method,
	'i|input=s'      => \$opt_inputfile,
	'o|outdir=s'     => \$opt_outdir,
	'r|reference=s'  => \$opt_reference,
	'c|confidence=f' => \$opt_confidence,
	'f|fasta-out=s'  => \$opt_fasta,
	't|threads=i'    => \$opt_threads,
	'u|underscore-join'   => \$opt_underscore,
	'version'        => \$opt_version,
	'force'          => \$opt_force,
	'help'           => \$opt_help,
);

say STDERR color('magenta bold'), " DADAIST2 Assign Taxonomy", color('reset');

$opt_version && version();
$opt_help    && pod2usage({-exitval => 0, -verbose => 2});

$opt_method = uc($opt_method);
if (not defined $methods_avail{$opt_method}) {
	say STDERR "Method not available $opt_method.";
	say STDERR "Available methods: ", join(', ', keys %methods_avail);
	exit 1;
}
my $log_filename = File::Temp->new(TEMPLATE => "dadaist2-assigntax-temp-XXXXXXX");
my $S = FASTX::ScriptHelper->new({
	verbose => $opt_verbose,
	logfile => $log_filename,
});


die "Missing parameters. Use --help for full manual.\n" if (!$opt_inputfile or !$opt_reference);
$opt_outdir = "./" unless ($opt_outdir);
die "ERROR: Reference not found: $opt_reference\n" unless (-e $opt_reference);
die "ERROR: Input file not found: $opt_inputfile\n" unless (-e $opt_inputfile);
die "ERROR: Missing script $DadaTaxScript/$DecipherTaxScript.\n" unless (-e $DadaTaxScript or -e $DecipherTaxScript);
if (! -d "$opt_outdir") {
	if ($opt_force) {
		say STDERR " - Attempting to create directory: $opt_outdir.\n";
  	mkdir "$opt_outdir" || die "ERROR: Output directory not found / unable to write ($opt_outdir)\n";
	} else {
		say STDERR " - Output directory found: $opt_outdir (--force).";
	}
} 

say STDERR "
 Input file:  $opt_inputfile
 Reference:   $opt_reference
 Output dir:  $opt_outdir
 Threads:     $opt_threads\n";


## Load from DADA2 tsv
if ($opt_inputfile =~/(tsv|csv)$/i) {
	my $fasta_out = File::Spec->catfile($opt_outdir, "rep-seqs.fasta");
	$S->verbose("Extracting FASTA file from $opt_inputfile");
	open (my $I, '<', $opt_inputfile) || die "ERROR: Unable to read input file <$opt_inputfile>.\n";
	open (my $O, '>', $fasta_out) || die "ERROR: Unable to write to $opt_outdir.\n";
	my $c = 0;
	while (my $line = readline($I)) {
		chomp($line);
		next if ($line =~/^#/);
		my @fields = split /\t/, $line;
		$c++;
		print {$O} ">ASV", $c, "\n", $fields[0], "\n";
	}
	$opt_inputfile = $fasta_out;
}

## Assign taxonomy
if ( $opt_reference =~/RData$/i) {
		say STDERR " * Using DECIPHER";
		# DECIPHER
		my $exec =  $S->run(
			qq(Rscript --vanilla "$DecipherTaxScript" "$opt_inputfile" "$opt_reference" "$opt_outdir" "$opt_threads"),
			{ candie => 1 }
		);

		if ($exec->{exit} != 0) {
			print STDERR $exec->{stderr}, "\n";
			die "Taxonomy assignment failed.\n";
		}

		my $raw_output = File::Spec->catfile($opt_outdir, "taxonomy.decipher");
		my $clean_out  = File::Spec->catfile($opt_outdir, "taxonomy.tsv");
		if ( -e $raw_output) {
			open (my $I, '<', $raw_output) || die "ERROR: Unable to read input file <$raw_output>.\n";
			open (my $O, '>', $clean_out) || die "ERROR: Unable to write to $opt_outdir.\n";
			#2 Bacteria Cyanobacteria Cyanobacteriia Chloroplast NA NA
			say {$O} "Kingdom Phylum Class Order Family Genus";
			my $c;
			while (my $line = readline($I)) {
				# Name \t Tax;ranks;.. \t Root [rootrank, 100.0%]; Bacteria [domain, 100.0%]; Firmicutes [phylum, 100.0%]; 		
				my ($seqname, $rank_string, $rank_confidence) = split /\t/, $line;
				next unless defined $rank_string;
				$c++;
				if (defined $rank_confidence) {

					my @ranks = split/;/, $rank_string;
					my $out = "$c ";
					for (my $i = 1; $i <= 6; $i++) {
						if ( $ranks[$i] ) {
							$out .= "$ranks[$i]";
						} else {
							$out .= "NA";
						}

						if ($i == 6) {
							$out=~s/\s+/ /g;
							say {$O} $out;
						} else {
							$out .= " ";
						}
					}
				} else {
					#ASV1	Root [rootrank, 90.8%]; Bacteria [domain, 90.8%]; Proteobacteria [phylum, 90.8%]; Gammaproteobacteria [class, 90.8%]; Burkholderiales [order, 90.8%]; Burkholderiaceae [family, 74.7%]; Ralstonia [genus, 73.9%]
					my @ranks = split/;/, $rank_string;
					my $out = "$c ";
					for (my $i = 1; $i <= 6; $i++) {
						if ( $ranks[$i] ) {
							my ($taxName, $rankName, $confidence) = $ranks[$i] =~/^\s*(.*)\s+\[(\w+), ([0-9.]+)\%\]/;
							if ( $confidence > $opt_confidence ) {
									$out .= quote_tax($taxName);
							} else {
								$out .= "NA";
							} 
						} else {
							$out .= "NA";
						}

						if ($i == 6) {
							$out=~s/\s+/ /g;
							say {$O} $out;
						} else {
							$out .= " ";
						}
					}
				}
			}
		} else {
			die "Taxonomy assignment failed, output file not found.\n";
		}

} elsif ( $opt_reference =~/gz$/ ) {
		say STDERR " * Using DADA2";
		my $R = FASTX::Reader->new({ filename => "$opt_inputfile"});
		
		my $tmp = File::Temp->new( TEMPLATE => 'dadaist2-assigntax-XXXXX',
                       SUFFIX => '.txt');
		say STDERR "Writing temporary file to $tmp.";
		while (my $record = $R->getRead() ) {
			say {$tmp} $record->{seq};
		}

		# DADA2
		my $exec =  $S->run(
			qq(Rscript --vanilla "$DadaTaxScript" "$tmp" "$opt_reference" "$opt_outdir" "$opt_threads"),
			{ candie => 1 }
		);

		if ($exec->{exit} != 0) {
			print STDERR $exec->{stderr}, "\n";
			die "Taxonomy assignment failed.\n";
		} else {
			unlink "$tmp";
		}

} else {
	die "ERROR: Reference not in RData/fasta.gz format: $opt_reference.\n";
}

# Save FASTA file

if (defined $opt_fasta) {
	
	open (my $O, '>', "$opt_fasta") || die "ERROR:\nUnable to read $opt_fasta\n";
	if (-e File::Spec->catfile("$opt_outdir", 'taxonomy.tsv') ) {
		my @taxonomy = loadTaxonomyArray(File::Spec->catfile("$opt_outdir", 'taxonomy.tsv'));
		my $R = FASTX::Reader->new({ filename => "$opt_inputfile"});
		my $c = 0;
		while (my $z = $R->getRead() ) {
			say {$O} '>', $z->{name}, "\t", $taxonomy[$c];
			say {$O} $z->{seq};
		}
	}
}

# Remove log
unlink "$log_filename";


sub loadTaxonomyArray {
  # load taxonomy from files
  my ($file) = @_;
  my @results = ();
  return () if (! -e "$file");
  open(my $I, '<', "$file") || return ();
  my $c = 0;

  while (my $line = readline($I)) {
    chomp($line);
    $c++;
    next if ($c == 1);
    if ($line =~/^\d+/) {
			my @local = ();
			
			# Remove first field (seq number)
      my @fields = split / /, $line;
      shift(@fields);
      
			my $newstr = join(" ", @fields);
			while ($newstr =~/(".*?"|\S+)/g) {
				push(@local, $1)
			}
			push(@results, join(';', @local));
    }
  }
	
  return @results;
}


sub quote_tax {
	my $name = shift @_;
	if ($name !~/\s/) {
		return $name;
	}
	if ($opt_underscore) {
		$name =~s/\s/_/g;
	} else {
		$name = '"' . $name . '"';
	}
	return $name;
}


sub version {
	say basename($0), " ", $VERSION;
	exit;
}
sub usage {
	my $PROG = basename($0);
	say "USAGE:";
	say "$PROG -i rep-seqs.fasta -o outdir/ -r reference.RData [-t threads]";
 
	exit;
}

__END__

=head1 NAME

B<dadaist2-assigntax> - Assign Taxonomy

=head1 AUTHOR

Andrea Telatin <andrea.telatin@quadram.ac.uk>

=head1 USAGE

  dadaist2-assigntax [options] -i FASTA -o DIR -r REFERENCE 


=over 4

=item I<-m>, I<--method>

Taxonomy assignment method, either DECIPHER or DADA2
(default: DECIPHER)

=item I<-i>, I<--input> FASTA

Input file in FASTA format (or in DADA2 table format)

=item I<-o>, I<--outdir> DIR

Output directory, or the current working directory if not specified.

=item I<-f>, I<--fasta> FILE

Save taxonomy assigned FASTA file.

=item I<-r>, I<--reference> FILE

RData file with the training set in DECIPHER format.

=item I<-t>, I<--threads> INT

Number of threads to use.

=item I<-u>, I<--underscore-join>

Join taxa names that have spaces with an underscore (default:
use double quotes)

=back

=head1 SOURCE CODE AND DOCUMENTATION

The program is freely available at L<https://quadram-institute-bioscience.github.io/dadaist2>
released under the MIT licence. The website contains further DOCUMENTATION.
