#!/usr/bin/env perl

use Getopt::Long;
use Pod::Usage;
use FindBin;
use lib $FindBin::Bin;
use strict;

use csem_perl_utils;

my $nThreads = 1;
my $is_sam = 0; # default
my $is_bam = 0;
my $upperBound = 200;
my $noExtendingReads = 0;
my $version = 0;
my $help = 0;

GetOptions("p|num-threads=i" => \$nThreads,
	   "sam" => \$is_sam,
	   "bam" => \$is_bam,
	   "upper-bound=i" => \$upperBound,
	   "no-extending-reads" => \$noExtendingReads,
	   "version" => \$version,
	   "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);

pod2usage(-verbose => 2) if ($help == 1);

my $dir = "$FindBin::Bin/";
my $command = "";

&showVersionInfo($dir) if ($version == 1);

pod2usage(-msg => "Number of threads should be at least 1!", -exitval => 2, -verbose => 2) if ($nThreads < 1);
pod2usage(-msg => "--sam and --bam cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($is_sam + $is_bam == 2); 
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
pod2usage(-msg => "Fragment length must be positive!", -exitval => 2, -verbose => 2) if ($ARGV[1] <= 0);

if ($is_sam + $is_bam == 0) { $is_sam = 1; }

$command = $dir."csem";
if ($is_sam) { $command .= " s"; }
else { $command .= " b"; }
$command .= " $ARGV[0] $ARGV[1] $upperBound $ARGV[2] $nThreads";
if (!$noExtendingReads) { $command .= " --extend-reads"; }

&runCommand($command);

# sort the BAM file generated
$command = $dir."sam/samtools sort $ARGV[2].bam $ARGV[2].sorted";
&runCommand($command);

# index the sorted BAM file
$command = $dir."sam/samtools index $ARGV[2].sorted.bam";
&runCommand($command);

__END__

=head1 NAME

run-csem

=head1 SYNOPSIS

=over

 run-csem [options] input_file fragment_length output_name

=back

=head1 ARGUMENTS

=over

=item B<input_file>

Input alignment file, can be in either SAM or BAM format.

=item B<fragment_length>

The average fragment length. This value must be positive. 

=item B<output_name>

The output files' name. All output files are prefixed by this name (e.g. output_name.bam).

=back

=head1 OPTIONS

=over

=item B<-p/--num-threads> <int>

Number of threads to use. (Default: 1)

=item B<--sam>

Input file is in SAM format. (Default: on)

=item B<--bam>

Input file is in BAM format. (Default: off)

=item B<--upper-bound> <int>

The maximal number of iterations for CSEM. (Default: 200)

=item B<--no-extending-reads>

Disable extending reads. (Default: off)

=item B<--version>

Show version information.

=item B<-h/--help>

Show help information.

=back

=head1 DESCRIPTION

This program allocates multi-reads fractionally using an iterative
algorithm. The main output is 'output_name.bam', which contains all
alignments and their posterior probabilities. Users can use
'csem-generate-input' to generate ChIP-Seq peak caller input files
from 'output_name.bam'.

By default, CSEM extends reads to the average fragment length and then
uses the midpoints for allocating multi-reads. If
'--no-extending-reads' is set, CSEM will not extend reads and use the
left most genomic coordinates (same as what we proposed in our
PLoS Computational Biology paper) instead.

=head1 OUTPUT

=over

=item B<output_name.bam, output_name.sorted.bam and output_name.sorted.bam.bai>

'output_name.bam' is a BAM-formatted file of read alignments. The MAPQ
field of each alignment is set to min(100, floor(-10 * log10(1.0 - w)
+ 0.5)), where w is the posterior probability of that alignment being
the true mapping of the read. In addition, RSEM pads a new tag
ZW:f:value, where value is a single precision floating number
representing the posterior probability.

'output_name.genome.sorted.bam' and 'output_name.genome.sorted.bam.bai' are the
sorted BAM file and indices generated by samtools (included in CSEM package).

=back

=head1 EXAMPLES

We assume that the input alignemnt file is in SAM format and named as 'GATA1.sam'. We want to use 8 threads. The average fragment length is 200 bp and output name is set as 'GATA1_csem':

 run-csem --sam \
          -p 8 \
          GATA1.sam \
          200 \
          GATA1_csem

The outputs are 'GATA1_csem.bam', 'GATA1_csem.sorted.bam' and 'GATA1_csem.sorted.bam.bai'.

=cut
