#!/usr/bin/env perl

use Getopt::Long;
use Pod::Usage;
use FindBin;
use lib $FindBin::Bin;
use strict;

use csem_perl_utils;

my $isUnique = 0;
my $isSampling = 0;
my $isBAM = 0;
my $isBED = 0;
my $isTagAlign = 0;
my $version = 0;
my $help = 0;

GetOptions("unique-only" => \$isUnique,
	   "sampling" => \$isSampling,
	   "bam" => \$isBAM,
	   "bed" => \$isBED,
	   "tag-align" => \$isTagAlign,
	   "version" => \$version,
	   "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);

pod2usage(-verbose => 2) if ($help == 1);

my $dir = "$FindBin::Bin/";
my $command = "";

&showVersionInfo($dir) if ($version == 1);

pod2usage(-msg => "--unique-only and --sampling cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($isUnique + $isSampling == 2);
pod2usage(-msg => "Only one of --bam, --bed and --tag-align can be set!", -exitval => 2, -verbose => 2) if ($isBAM + $isBED + $isTagAlign > 1);
pod2usage(-msg => "Invalid number of arguments!", -exitval =>2, -verbose => 2) if (scalar(@ARGV) != 2);

if ($isBAM + $isBED + $isTagAlign == 0) { $isBAM = 1; }



my ($choice, $outputFormat) = (0, 0);

if ($isUnique) { $choice = 1; }
elsif ($isSampling) { $choice = 2; }

if ($isBED) { $outputFormat = 1; }
elsif ($isTagAlign) { $outputFormat = 2; }

if ($choice == 0 && $outputFormat == 0) { 
    print "Warning: Current setting is equivalent to the input BAM file (except that unalignable reads are removed)!\n";
}

$command = $dir."csem-bam-processor $ARGV[0] $ARGV[1] $choice $outputFormat";
&runCommand($command);

if ($isBAM == 1) {
    # sort the BAM file generated
    $command = $dir."sam/samtools sort $ARGV[1].bam $ARGV[1].sorted";
    &runCommand($command);

    # index the sorted BAM file
    $command = $dir."sam/samtools index $ARGV[1].sorted.bam";
    &runCommand($command);
}

__END__

=head1 NAME

csem-generate-input

=head1 SYNOPSIS

=over

 csem-generate-input [options] input.bam output_name

=back

=over

=item B<input.bam>

Input BAM file, this should be the BAM file (unsorted) generated by 'run-csem'

=item B<output_name>

The output files' prefix name. 'csem-generate-input' will add suffixes accordingly (e.g. 'output_name.bam').

=back

=head1 OPTIONS

=over

=item B<--unique-only>

Only report uniquely mapped reads. (Default: off)

=item B<--sampling>

For each alignable read, if it has multiple alignments, sample one alignment (based on each alignment's posterior probability) from the alignment set to report. Otherwise, just report the alignment it has. (Default: off)

=item B<--bam>

Write reported alignments into a BAM file. (Default: on)

=item B<--bed>

Write reported alignments into a BED file. (Default: off)

=item B<--tag-align>

Write reported alignments into a tagAlign file. (Default: off)

=item B<--version>

Show version information.

=item B<-h/--help>

Show help information.

=back

=head1 DESCRIPTION

This program processes the BAM file (unsorted) generated by 'run-csem' and generates input files for ChIP-Seq peak callers. '--unique-only' and '--sampling' are mutually exclusive. '--bam', '--bed' and '--tag-align' are also mutually exclusive. If neither '--unique-only' nor '--sampling' is set, this program will report all alignments.

Please note that unaligned reads contained in the input BAM file are not counted as alignments and therefore will not be reported.

=head1 OUTPUT

=over

=item B<output_name.bam, output_name.sorted.bam and output_name.sorted.bam.bai>

Only generated when '--bam' is specified or neither '--bam', '--bed' nor
'--tag-align' is specified (default output format is BAM).

'output_name.bam' is a BAM-formatted file which should be used as
ChIP-Seq peak callers' input (e.g. MACS).

'output_name.genome.sorted.bam' and 'output_name.genome.sorted.bam.bai' are the
sorted BAM file and indices generated by samtools (included in CSEM package).

If '--unique-only' or '--sampling' is specified, the 'ZW' tags will be
removed from all alignments for the outputted BAM file. However, the
QUAL fields will not be changed.

=item B<output_name.bed>

Only generated when '--bed' is specified.

'output_name.bed' is a BED-formatted file of read alignments. It contains 6 fields each line:

chrom chromStart chromEnd name score strand

Fields are separated by the tab character. The definitions of each field are the same as the standard BED format definition. 'score' is calculated by multiplying 1000 to the posterior probability each alignment has. For uniquely aligned reads, the posterior probabilities are 1.0. 

MOSAiCS requires a BED file as its input.

=item B<output_name.tagAlign>

Only generated when '--tag-align' is specified.

'output_name.tagAlign' is a tagAlign-formatted file of read alignments. It contains 6 fields each line:

chrom chromStart chromEnd sequence score strand

Fields are separated by the tab character. The definitions of each field can be found at http://genome.ucsc.edu/FAQ/FAQformat.html#format15 . 'score' is calculated by multiplying 1000 to the posterior probability each alignment has. For uniquely aligned reads, the posterior probabilities are 1.0. 

=back

=head1 EXAMPLES

We assume that the input BAM file is 'GATA1_csem.bam'. 

1) Generate BED input for MOSAiCS:

 csem-generate-input --bed GATA1_csem.bam GATA1_csem

The output is 'GATA1_csem.bed'.

2) Generate BAM input for MACS, uniquely aligned reads only:

 csem-generate-input --unique-only --bam GATA1_csem.bam GATA1_csem_for_MACS.uniq

The outputs are 'GATA1_csem_for_MACS.uniq.bam', 'GATA1_csem_for_MACS.uniq.sorted.bam' and 'GATA1_csem_for_MACS.uniq.sorted.bam.bai'. 

3) Generate BAM input for MACS. Report all alignable reads. If one read has multiple alignments, sample one to report.

 csem-generate-input --sampling GATA1_csem.bam GATA1_csem_for_MACS

The outputs are 'GATA1_csem_for_MACS.bam', 'GATA1_csem_for_MACS.sorted.bam' and 'GATA1_csem_for_MACS.sorted.bam.bai'.

=cut

