/usr/bin/plot-bamstats

#!/usr/bin/env perl
#
#    Copyright (C) 2012-2014 Genome Research Ltd.
#
#    Author: Petr Danecek <pd3@sanger.ac.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

use strict;
use warnings;
use Carp;

my $opts = parse_params();
parse_bamcheck($opts);
if ( @{$$opts{bamcheck}} > 1 ) { merge_bamcheck($opts); exit; }

plot_qualities($opts);
plot_acgt_cycles($opts);
plot_gc($opts);
plot_gc_depth($opts);
plot_isize($opts);
plot_coverage($opts);
plot_mismatches_per_cycle($opts);
plot_indel_dist($opts);
plot_indel_cycles($opts);
create_html($opts);

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { confess @msg; }
    die
        "About: Parses output of samtools stats (former bamcheck) and calls gnuplot to create graphs.\n",
        "Usage: plot-bamstats [OPTIONS] file.bam.bc\n",
        "       plot-bamstats -p outdir/ file.bam.bc\n",
        "Options:\n",
        "   -k, --keep-files                    Do not remove temporary files.\n",
        "   -l, --log-y                         Set the Y axis scale of the Insert Size plot to log 10.\n",
        "   -m, --merge                         Merge multiple bamstats files and output to STDOUT.\n",
        "   -p, --prefix <path>                 The output files prefix, add a slash to create new directory.\n",
        "   -r, --ref-stats <file.fa.gc>        Optional reference stats file with expected GC content (created with -s).\n",
        "   -s, --do-ref-stats <file.fa>        Calculate reference sequence GC for later use with -r\n",
        "   -t, --targets <file.tab>            Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
        "   -h, -?, --help                      This help message.\n",
        "\n";
}


sub parse_params
{
    $0 =~ s{^.+/}{};
    my $opts =
    {
        args => join(' ',$0,@ARGV),

        # for file version sanity check
        sections =>
        [
            {
                id=>'SN',
                header=>
                    "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'FFQ',
                header=>
                    "# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n",
            },
            {
                id=>'LFQ',
                header=>
                    "# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n",
            },
            {
                id=>'MPC',
                header=>
                    "# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n" .
                    "# is the number of N's and the rest is the number of mismatches\n",
            },
            {
                id=>'GCF',
                header=>"# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'GCL',
                header=>"# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'GCC',
                header=>"# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [\%]\n",
            },
            {
                id=>'IS',
                header=>"# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n",
            },
            {
                id=>'RL',
                header=>"# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n",
            },
            {
                id=>'ID',
                header=>"# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n",
            },
            {
                id=>'IC',
                header=>"# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n",
            },
            {
                id=>'COV',
                header=>"# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'GCD',
                header=>"# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n",
            },
        ],

        # for merging
        merge_keys =>
        {
            'sum' =>
            [
                'raw total sequences:',
                'filtered sequences:',
                'sequences:',
                '1st fragments:',
                'last fragments:',
                'reads mapped:',
                'reads mapped and paired:',
                'reads unmapped:',
                'reads properly paired:',
                'reads paired:',
                'reads duplicated:',
                'reads MQ0:',
                'reads QC failed:',
                'non-primary alignments:',
                'total length:',
                'bases mapped:',
                'bases mapped (cigar):',
                'bases trimmed:',
                'bases duplicated:',
                'mismatches:',
                'inward oriented pairs:',
                'outward oriented pairs:',
                'pairs with other orientation:',
                'pairs on different chromosomes:',
            ],
            'min' =>
            [
                'is sorted:',
            ],
            'max' =>
            [
                'maximum length:',
            ],
        },
    };
    for my $key (keys %{$$opts{merge_keys}})
    {
        for my $val (@{$$opts{merge_keys}{$key}}) { $$opts{hmerge}{$val} = $key; }
    }
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
        if ( $arg eq '-l' || $arg eq '--log-y' ) { $$opts{y_axis_log10}=1; next; }
        if ( $arg eq '-m' || $arg eq '--merge' ) { $$opts{merge}=1; next; }
        if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; }
        if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; }
        if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; }
        if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( $arg eq '-'  || -e $arg ) { push @{$$opts{bamcheck}},$arg; next; }
        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
    }
    if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; }
    if ( !exists($$opts{bamcheck}) ) { error("No samtools stats file?\n") }

    if ( !exists($$opts{prefix}) )
    {
        if ( !$$opts{merge} ) { error("Expected -p parameter.\n") }
        $$opts{prefix} = './';
    }
    elsif ( $$opts{merge} ) { error("Only one of -p or -m should be given.\n"); }
    if ( $$opts{merge} )
    {
        if ( @{$$opts{bamcheck}} < 2 ) { error("Nothing to merge\n") }
    }
    else
    {
        if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
        if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; }
        elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; }
    }
    return $opts;
}


# Creates GC stats for either the whole reference or only on target regions for exome QC
sub do_ref_stats
{
    my ($opts) = @_;


    my %targets = ();
    if ( exists($$opts{targets}) )
    {
        my ($prev_chr,$prev_pos);
        open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
        while (my $line=<$fh>)
        {
            if ( $line=~/^#/ ) { next; }
            my ($chr,$from,$to) = split(/\s+/,$line);
            chomp($to);
            push @{$targets{$chr}}, $from,$to;
            if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
            if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
            $prev_pos = $from;
        }
        close($fh);
    }

    my $nlen = 0;
    my %lens = ();
    my %gc_counts = ();
    my ($skip_chr,$pos,$ireg,$regions);
    open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
    while (my $line=<$fh>)
    {
        if ( $line=~/^>/ )
        {
            if ( !scalar %targets ) { next; }

            if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
            if ( !exists($targets{$1}) ) { $skip_chr=$1; next; }
            undef $skip_chr;
            $pos = 0;
            $ireg = 0;
            $regions = $targets{$1};
        }
        if ( defined $skip_chr ) { next; }

        # Only $_len sized lines are considered and no chopping for target regions.
        chomp($line);
        my $len = length($line);
        $lens{$len}++;
        $nlen++;

        if ( scalar %targets )
        {
            while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; }
            $pos += $len;
            if ( $ireg==@$regions ) { next; }
            if ( $pos < $$regions[$ireg] ) { next; }
        }

        my $gc_count = 0;
        for (my $i=0; $i<$len; $i++)
        {
            my $base = substr($line,$i,1);
            if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; }
        }
        $gc_counts{$gc_count}++;
    }

    # Calculate median length
    my $n = 0;
    my $median_len = 0;
    for my $len (sort { $a<=>$b } keys %lens)
    {
        $n += $lens{$len};
        if ( $n >= $nlen ) { $median_len = $len; last; }
    }
    if ( !$median_len ) { error("FIXME: median_len=$median_len??\n"); }

    print "# Generated by $$opts{args}\n";
    print "# The columns are: GC content bin, normalized frequency\n";
    my $max;
    for my $count (values %gc_counts)
    {
        if ( !defined $max or $count>$max ) { $max=$count; }
    }
    for my $gc (sort {$a<=>$b} keys %gc_counts)
    {
        if ( $gc==0 ) { next; }
        printf "%f\t%f\n", $gc*100./$median_len, $gc_counts{$gc}/$max;
    }
}

sub plot
{
    my ($cmdfile) = @_;
    my $cmd = "gnuplot $cmdfile";
    system($cmd);
    if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
}

sub open_file
{
    my ($file) = @_;
    my $fh;
    if ( $file eq '-' ) { $fh = *STDIN; }
    elsif ( $file=~/\.gz$/i ) { open($fh, "gunzip -c $file |") or error("gunzip -c $file: $!"); }
    else { open($fh,'<',$file) or error("$file: $!"); }
    return $fh;
}

sub parse_bamcheck1
{
    my ($opts, $i) = @_;
    my $file = $$opts{bamcheck}[$i];
    print STDERR "Parsing samtools stats output: $file\n" unless !$$opts{verbose};
    my $fh = open_file($file);
    my $line = <$fh>;
    if ( !($line=~/^# This file was produced by (\S+)/) ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); }
    if ( $1 ne 'plot-bamstats' && $1 ne 'samtools' ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); }
    my %dat;
    while ($line=<$fh>)
    {
        if ( $line=~/^#/ ) { next; }
        my @items = split(/\t/,$line);
        chomp($items[-1]);
        if ( $items[0] eq 'SN' )
        {
            $dat{SN}{$items[1]} = $items[2];
            next;
        }
        push @{$dat{$items[0]}}, [splice(@items,1)];
    }
    close($fh) unless $file eq '-';

    # Check sanity
    if ( !exists($dat{'SN'}{'sequences:'}) or !$dat{'SN'}{'sequences:'} )
    {
        error("Sanity check failed: no sequences found by samtools stats??\n");
    }
    my $nseq_new = $dat{'SN'}{'sequences:'};
    my $nseq_ori = exists($$opts{dat}{'SN'}) ? $$opts{dat}{'SN'}{'sequences:'} : 0;

    my %add = map { $_ => 1 } (qw(FFQ LFQ MPC GCF GCL IS ID IC RL));
    for my $a (keys %dat)
    {
        if ( !exists($$opts{dat}{$a}) ) { $$opts{dat}{$a} = $dat{$a}; next; } # first bamcheck file
        if ( $a eq 'SN' )
        {
            for my $b (keys %{$dat{$a}})
            {
                if ( exists($$opts{hmerge}{$b}) )
                {
                    if ( $$opts{hmerge}{$b} eq 'sum' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b}; next; }
                    if ( $$opts{hmerge}{$b} eq 'min' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} < $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; }
                    if ( $$opts{hmerge}{$b} eq 'max' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} > $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; }
                }
            }
            next;
        }
        if ( $add{$a} ) { add_to_matrix($$opts{dat}{$a}, $dat{$a}, 0); next; }
        if ( $a eq 'COV' ) { merge_coverage($$opts{dat}{$a}, $dat{$a}); next; }
        if ( $a eq 'GCC' ) { merge_gcc($nseq_ori, $$opts{dat}{$a}, $nseq_new, $dat{$a}); next; }
        if ( $a eq 'GCD' ) { merge_gcd(); next; }
        error("not processed: $a\n");
    }
    if ( $i==0 ) { return; }

    $$opts{dat}{SN}{'error rate:'} = sprintf "%e", get_value($opts,'SN','mismatches:') / get_value($opts,'SN','bases mapped (cigar):');
    update_average_length($opts);
    update_average_quality($opts);
    update_average_isize($opts);
}

sub parse_bamcheck
{
    my ($opts) = @_;
    for (my $i=0; $i<@{$$opts{bamcheck}}; $i++) { parse_bamcheck1($opts,$i); }

    # Determine the default title, for now use the first BAM name:
    #       5446_6/5446_6.bam.bc.gp -> 5446_6
    #       test.aaa.png -> test.aaa
    if ( !($$opts{bamcheck}[0]=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$$opts{bamcheck}[0]]\n"); }
    $$opts{title} = $1;
}

sub add_to_matrix
{
    my ($dst,$src,$key) = @_;
    my $id = 0;
    my $is = 0;
    while ($is<@$src)
    {
        while ( $id<@$dst && $$src[$is][$key] > $$dst[$id][$key] ) { $id++; }
        if ( $id<@$dst && $$src[$is][$key] == $$dst[$id][$key] )
        {
            for (my $j=0; $j<@{$$src[$is]}; $j++)
            {
                if ( $j==$key ) { next; }
                $$dst[$id][$j] += $$src[$is][$j];
            }
        }
        else { splice(@$dst,$id,0,$$src[$is]); }
        $is++;
    }
}

sub rebin_values
{
    my ($vals,$bin_size,$col) = @_;
    error("pd3\@sanger todo: rebin_values\n");
}


sub merge_coverage
{
    my ($dst,$src) = @_;
    if ( !($$dst[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$dst[0][0]\n"); }
    my $dst_bin = $2 - $1 + 1;
    if ( !($$src[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$src[0][0]\n"); }
    my $src_bin = $2 - $1 + 1;
    my (@dst, @src);
    for my $row (@$dst) { splice(@$row,0,1); push @dst,$row; }
    for my $row (@$src) { splice(@$row,0,1); push @src,$row; }
    my $dst_out  = pop(@dst);
    my $src_out  = pop(@src);
    my $bin_size = $dst_bin;
    if ( $src_bin > $dst_bin ) { my $vals = rebin_values(\@dst,$src_bin,0); @dst = @$vals; $bin_size = $src_bin; }
    elsif ( $src_bin < $dst_bin ) { my $vals = rebin_values(\@src, $dst_bin, 0); @src = @$vals; }
    add_to_matrix(\@dst, \@src, 0);
    for my $row (@dst) { unshift(@$row, sprintf("[%d-%d]", $$row[0],$$row[0]+$bin_size-1)); }
    push @dst, [sprintf("[%d<]", $dst[-1][1]), $dst[-1][1], $$dst_out[1]+$$src_out[1] ];
    @$dst = @dst;
}

sub merge_gcc
{
    my ($ndst,$dst,$nsrc,$src) = @_;
    if ( @$dst != @$src ) { error("todo: improve me\n"); }
    for (my $i=0; $i<@$dst; $i++)
    {
        if ( $$dst[$i][0] ne $$src[$i][0] ) { error("todo: improve me\n"); }
        for (my $j=1; $j<@{$$dst[$i]}; $j++)
        {
            $$dst[$i][$j] = ($$dst[$i][$j]*$ndst + $$src[$i][$j]*$nsrc) / ($ndst + $nsrc);
        }
    }
}

sub merge_gcd
{
    # todo
}

sub get_value
{
    my ($opts, $id, $key) = @_;
    if ( !exists($$opts{dat}{$id}) ) { return undef; }
    if ( !exists($$opts{dat}{$id}{$key}) ) { return undef}
    return $$opts{dat}{$id}{$key};
}

sub get_values
{
    my ($opts, $id) = @_;
    if ( !exists($$opts{dat}{$id}) ) { return (); }
    # todo: add version sanity check here
    return (@{$$opts{dat}{$id}});
}

sub update_average_length
{
    my ($opts) = @_;
    my @vals = get_values($opts,'RL');
    my $sum = 0;
    my $avg = 0;
    for my $val (@vals) { $sum += $$val[1]; }
    for my $val (@vals) { $avg += $$val[0]*$$val[1] / $sum; }
    $$opts{dat}{SN}{'average length:'} = sprintf "%.1f", $avg;
}

sub update_average_quality
{
    my ($opts) = @_;
    my @vals;
    push @vals, get_values($opts, 'FFQ'), get_values($opts,'LFQ');

    my $qsum = 0;
    for my $row (@vals)
    {
        for (my $i=1; $i<@$row; $i++) { $qsum += $$row[$i] }
    }
    my $qavg = 0;
    for my $row (@vals)
    {
        for (my $i=1; $i<@$row; $i++) { $qavg += ($i-1)*$$row[$i] / $qsum; }
    }
    $$opts{dat}{SN}{'average quality:'} = sprintf "%.1f", $qavg;
}

sub update_average_isize
{
    my ($opts) = @_;
    my @vals = get_values($opts, 'IS');
    my $sum = 0;
    for my $row (@vals) { $sum += $$row[1]; }
    my $avg = 0;
    for my $row (@vals) { $avg += $$row[0]*$$row[1] / $sum; }
    my $dev = 0;
    for my $row (@vals) { $dev += ($avg - $$row[0])*($avg - $$row[0]) * $$row[1] / $sum; }
    $$opts{dat}{SN}{'insert size average:'} = sprintf "%.1f", $avg;
    $$opts{dat}{SN}{'insert size standard deviation:'} = sprintf "%.1f", sqrt($dev);
}

sub get_defaults
{
    my ($opts,$img_fname,%args) = @_;

    if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }

    # Determine the gnuplot script file name
    my $gp_file = $img_fname;
    $gp_file =~ s{\.[^.]+$}{.gp};
    if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; }

    my $title = $$opts{title};

    my $dir = $gp_file;
    $dir =~ s{/[^/]+$}{};
    if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; }

    my $wh = exists($args{wh}) ? $args{wh} : '600,400';

    open(my $fh,'>',$gp_file) or error("$gp_file: $!");
    return {
        title => $title,
        gp    => $gp_file,
        img   => $img_fname,
        fh    => $fh,
        terminal => qq[set terminal png size $wh truecolor],
        grid  => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
    };
}

sub percentile
{
    my ($p,@vals) = @_;
    my $N = 0;
    for my $val (@vals) { $N += $val; }
    my $n = $p*($N+1)/100.;
    my $k = int($n);
    my $d = $n-$k;
    if ( $k<=0 ) { return 0; }
    if ( $k>=$N ) { return scalar @vals-1; }
    my $cnt;
    for (my $i=0; $i<@vals; $i++)
    {
        $cnt += $vals[$i];
        if ( $cnt>=$k ) { return $i; }
    }
    error("FIXME: this should not happen [percentile]\n");
}

sub plot_qualities
{
    my ($opts) = @_;

    my @ffq = get_values($opts, 'FFQ');
    if ( !@ffq ) { return; }

    my $yrange = @{$ffq[0]} > 50 ? @{$ffq[0]} : 50;
    my $is_paired = get_value($opts,'SN','reads paired:');

    # Average quality per cycle, forward and reverse reads in one plot
    my $args = get_defaults($opts,"$$opts{prefix}quals.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Average Quality"
            set xlabel "Cycle"
            set yrange [0:$yrange]
            set title "$$args{title}"
            plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
        ];
    my (@fp75,@fp50,@fmean);
    my (@lp75,@lp50,@lmean);
    my ($fmax,$fmax_qual,$fmax_cycle);
    my ($lmax,$lmax_qual,$lmax_cycle);
    for my $cycle (@ffq)
    {
        my $sum=0; my $n=0;
        for (my $iqual=1; $iqual<@$cycle; $iqual++)
        {
            $sum += $$cycle[$iqual]*$iqual;
            $n += $$cycle[$iqual];
            if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; }
        }
        my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
        my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
        my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
        if ( !$n ) { next; }
        push @fp75, "$$cycle[0]\t$p25\t$p75\n";
        push @fp50, "$$cycle[0]\t$p50\n";
        push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
        printf $fh $fmean[-1];
    }
    print $fh "end\n";
    my @lfq = ();
    if ( $is_paired )
    {
        @lfq = get_values($opts, 'LFQ');
        for my $cycle (@lfq)
        {
            my $sum=0; my $n=0;
            for (my $iqual=1; $iqual<@$cycle; $iqual++)
            {
                $sum += $$cycle[$iqual]*$iqual;
                $n += $$cycle[$iqual];
                if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; }
            }
            my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
            my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
            my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
            if ( !$n ) { next; }
            push @lp75, "$$cycle[0]\t$p25\t$p75\n";
            push @lp50, "$$cycle[0]\t$p50\n";
            push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
            printf $fh $lmean[-1];
        }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});



    # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots
    $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>$is_paired ? '700,500' : '600,400');
    $fh = $$args{fh};
    my $pos_size = $is_paired ? " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.1,0.1; set size 0.4,0.8" : '';
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set multiplot
            $pos_size
            set yrange [0:$yrange]
            set ylabel "Quality"
            set xlabel "Cycle (fwd reads)"
            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
        ];
    print $fh join('',@fp75),"end\n";
    print $fh join('',@fp50),"end\n";
    print $fh join('',@fmean),"end\n";
    if ( $is_paired )
    {
        print $fh qq[
                set origin 0.55,0.1
                set size 0.4,0.8
                unset ytics
                set y2tics mirror
                set yrange [0:$yrange]
                unset ylabel
                set xlabel "Cycle (rev reads)"
                set label "$$args{title}" at screen 0.5,0.95 center
                plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
            ];
        print $fh join('',@lp75),"end\n";
        print $fh join('',@lp50),"end\n";
        print $fh join('',@lmean),"end\n";
    }
    close($fh);
    plot($$args{gp});



    # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
    $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>$is_paired ? '600,600' : '600,400');
    $fh = $$args{fh};
    my $nquals = @{$ffq[0]}-1;
    my $ncycles = @ffq;
    $pos_size = $is_paired ? " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.15,0.52; set size 0.8,0.4" : '';
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set multiplot
            $pos_size
            set title "$$args{title}"
            set ylabel "Frequency (fwd reads)"
            set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
            unset xlabel
            set xrange [0:$nquals]
            set format x ""
        ];
    my @plots;
    for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] }
    print $fh "plot ", join(",", @plots), "\n";
    for my $cycle (@ffq)
    {
        for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; }
        print $fh "end\n";
    }
    if ( $is_paired )
    {
        print $fh qq[
                set origin 0.15,0.1
                set size 0.8,0.4
                unset title
                unset format
                set xtics
                set xlabel "Quality"
                unset label
                set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
                set ylabel "Frequency (rev reads)"
            ];
        print $fh "plot ", join(",", @plots), "\n";
        for my $cycle (@lfq)
        {
            for (my $iqual=1; $iqual<$nquals; $iqual++)
            {
                print $fh "$iqual\t$$cycle[$iqual]\n";
            }
            print $fh "end\n";
        }
    }
    close($fh);
    plot($$args{gp});

    # Heatmap qualitites
    $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
    $fh = $$args{fh};
    my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
    my @ytics;
    for my $cycle (@ffq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
    my $ytics = join(',', @ytics);
    $pos_size = $is_paired ? " set origin 0,0.46\n set size 0.95,0.6" : '';
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            unset key
            unset colorbox
            set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1)
            set cbrange [0:$max]
            set yrange  [0:$ncycles]
            set xrange  [0:$nquals]
            set view map
            set multiplot
            set rmargin 0
            set lmargin 0
            set tmargin 0
            set bmargin 0
            $pos_size
            set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
            set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
            set ylabel "Cycle (fwd reads)" offset character -1,0
            unset ytics
            set ytics ($ytics)
            unset xtics
            set title "$$args{title}"
            splot '-' matrix with image
        ];
    for my $cycle (@ffq)
    {
        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
        print $fh "\n";
    }
    print $fh "end\nend\n";
    if ( $is_paired )
    {
        @ytics = ();
        for my $cycle (@lfq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
        $ytics = join(',', @ytics);
        print $fh qq[
                set origin 0,0.03
                set size 0.95,0.6
                set ylabel "Cycle (rev reads)" offset character -1,0
                set xlabel "Base Quality"
                unset title
                unset ytics
                set ytics ($ytics)
                set xrange  [0:$nquals]
                set xtics
                set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812
                set cblabel "Number of bases"
                splot '-' matrix with image
            ];
        for my $cycle (@lfq)
        {
            for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
            print $fh "\n";
        }
        print $fh "end\nend\n";
    }
    close($fh);
    plot($$args{gp});
}


sub plot_acgt_cycles
{
    my ($opts) = @_;

    my @gcc = get_values($opts, 'GCC');
    if ( !@gcc ) { return; }

    my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set style line 1 linecolor rgb "green"
            set style line 2 linecolor rgb "red"
            set style line 3 linecolor rgb "black"
            set style line 4 linecolor rgb "blue"
            set style increment user
            set ylabel "Base content [%]"
            set xlabel "Read Cycle"
            set yrange [0:100]
            set title "$$args{title}"
            plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
        ];
    for my $base (1..4)
    {
        for my $cycle (@gcc)
        {
            print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n";
        }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}


sub plot_gc
{
    my ($opts) = @_;

    my $is_paired = get_value($opts,'SN','reads paired:');
    my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
    my $fh = $$args{fh};
    my ($gcl_max,$gcf_max,$lmax,$fmax);
    my @gcf = get_values($opts, 'GCF');
    my @gcl = get_values($opts, 'GCL');
    for my $gc (@gcf) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
    for my $gc (@gcl) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
    my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax;
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set title "$$args{title}"
            set ylabel "Normalized Frequency"
            set xlabel "GC Content [%]"
            set yrange [0:1.1]
            set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
            plot ]
                 . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
                 . q['-' smooth csplines with lines lc 1 title 'First fragments' ]
                 . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '')
                 . q[
        ];
    if ( exists($$opts{ref_stats}) )
    {
        open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
        while (my $line=<$ref>) { print $fh $line }
        close($ref);
        print $fh "end\n";
    }
    for my $cycle (@gcf) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; }
    print $fh "end\n";
    if ( $is_paired )
    {
        for my $cycle (@gcl) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}


sub plot_gc_depth
{
    my ($opts) = @_;

    my @gcd = get_values($opts,'GCD');
    if ( @gcd <= 1 ) { return; }

    # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
    my @tics = ( {gc=>30},{gc=>40},{gc=>50} );
    for my $gc (@gcd)
    {
        for my $tic (@tics)
        {
            my $diff = abs($$gc[0]-$$tic{gc});
            if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; }
        }
    }

    my @x2tics;
    for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; }
    my $x2tics = join(',',@x2tics);

    my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Mapped depth"
            set xlabel "Percentile of mapped sequence ordered by GC content"
            set x2label "GC Content [%]"
            set title "$$args{title}"
            set x2tics ($x2tics)
            set xtics nomirror
            set xrange [0.1:99.9]

            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
                 '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
                 '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
        ];
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n";
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n";
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_isize
{
    my ($opts) = @_;

    my $is_paired = get_value($opts,'SN','reads paired:');
    my @isizes = get_values($opts, 'IS');
    if ( !$is_paired or !@isizes ) { return; }

    my ($isize_max,$isize_cnt);
    for my $isize (@isizes)
    {
        if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; }
    }

    my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set rmargin 5
            set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
            set ylabel  "Number of pairs"
            set xlabel  "Insert Size"
            set title "$$args{title}"];
    print $fh qq[
            set logscale y 10] if exists $$opts{y_axis_log10};
    print $fh qq[
            plot \\
                '-' with lines lc rgb 'black' title 'All pairs', \\
                '-' with lines title 'Inward', \\
                '-' with lines title 'Outward', \\
                '-' with lines title 'Other'
        ];
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_coverage
{
    my ($opts) = @_;

    my @cov = get_values($opts, 'COV');
    if ( !@cov ) { return; }

    my @vals;
    for my $cov (@cov) { push @vals,$$cov[2]; }
    my $i = percentile(99.8,@vals);
    my $p99 = $cov[$i][1];

    my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Number of mapped bases"
            set xlabel "Coverage"
            set log y
            set style fill solid border -1
            set title "$$args{title}"
            set xrange [:$p99]
            plot '-' with lines notitle
        ];
    for my $cov (@cov)
    {
        if ( $$cov[2]==0 ) { next; }
        print $fh "$$cov[1]\t$$cov[2]\n";
    }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_mismatches_per_cycle
{
    my ($opts) = @_;

    my @mpc = get_values($opts, 'MPC');
    if ( !@mpc ) { return; }

    my $nquals = @{$mpc[0]} - 2;
    my $ncycles = @mpc;
    my ($style,$with);
    if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
    else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }

    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set style line 1 linecolor rgb "#e40000"
            set style line 2 linecolor rgb "#ff9f00"
            set style line 3 linecolor rgb "#bbbb00"
            set style line 4 linecolor rgb "#4ebd68"
            set style line 5 linecolor rgb "#0061ff"
            set style increment user
            set key left top
            $style
            set ylabel "Number of mismatches"
            set xlabel "Read Cycle"
            set style fill solid border -1
            set title "$$args{title}"
            set xrange [-1:$ncycles]
            plot '-' $with ti 'Base Quality>30', \\
                 '-' $with ti '30>=Q>20', \\
                 '-' $with ti '20>=Q>10', \\
                 '-' $with ti '10>=Q', \\
                 '-' $with ti "N's"
        ];
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc) { print $fh "$$cycle[1]\n"; }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}

sub plot_indel_dist
{
    my ($opts) = @_;

    my @indels = get_values($opts, 'ID');
    if ( !@indels ) { return; }

    my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1  linecolor rgb "red"
        set style line 2 linetype 2  linecolor rgb "black"
        set style line 3 linetype 3  linecolor rgb "green"
        set style increment user
        set ylabel "Indel count [log]"
        set xlabel "Indel length"
        set y2label "Insertions/Deletions ratio"
        set log y
        set y2tics nomirror
        set ytics nomirror
        set title "$$args{title}"
        plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
    ];
    for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
    for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
    for my $len (@indels) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}

sub plot_indel_cycles
{
    my ($opts) = @_;

    my @indels = get_values($opts, 'IC');
    if ( !@indels ) { return; }

    my $is_paired = get_value($opts,'SN','reads paired:');

    my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1  linecolor rgb "red"
        set style line 2 linetype 2  linecolor rgb "black"
        set style line 3 linetype 3  linecolor rgb "green"
        set style line 4 linetype 4  linecolor rgb "blue"
        set style increment user
        set ylabel "Indel count"
        set xlabel "Read Cycle"
        set title "$$args{title}"
    ];
    if ( $is_paired )
    {
        print $fh qq[plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'\n];
        for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n";
    }
    else
    {
        print $fh qq[plot '-' w l ti 'Insertions', '' w l ti 'Deletions'\n];
        for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}

sub merge_bamcheck
{
    my ($opts) = @_;
    my $fh = *STDOUT;
    if ( !$$opts{merge} )
    {
        open($fh,'>',"$$opts{prefix}merge.bchk") or error("$$opts{prefix}merge.bchk: $!\n");
    }

    print $fh "# This file was produced by plot-bamstats, the command line was:\n#   $$opts{args}\n#\n";

    for my $sec (@{$$opts{sections}})
    {
        my $sid = $$sec{id};
        if ( !exists($$opts{dat}{$sid}) ) { next; }

        print $fh $$sec{header};
        if ( ref($$opts{dat}{$sid}) eq 'HASH' )
        {
            for my $key (keys %{$$opts{dat}{$sid}})
            {
                print "$sid\t$key\t$$opts{dat}{$sid}{$key}\n";
            }
            next;
        }

        if ( ref($$opts{dat}{$sid}) eq 'ARRAY' )
        {
            for my $rec (@{$$opts{dat}{$sid}})
            {
                print $fh "$sid\t", join("\t",@$rec), "\n";
            }
        }

        #
        #   if ( $sid eq 'ID' )
        #   {
        #       print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}";
        #       # output summary numbers here
        #       for my $id (keys %{$$opts{dat}})
        #       {
        #           if ( exists($$opts{exp}{$id}) ) { next; }
        #           for my $key (keys %{$$opts{dat}{$id}})
        #           {
        #               print $fh "SN\t$id\t$key\t$$opts{dat}{$id}{$key}\n";
        #           }
        #       }
        #   }
    }

}

sub bignum
{
    my ($num) = @_;
    if ( !defined $num ) { return 0; }
    my $out = '';
    my $slen = length($num);
    for (my $i=0; $i<$slen; $i++)
    {
        $out .= substr($num,$i,1);
        if ( $i+1<$slen && ($slen-$i-1)%3==0 ) { $out .= ','; }
    }
    return $out;
}

sub create_html
{
    my ($opts) = @_;
    my ($fname,$prefix);
    if ( $$opts{prefix}=~m{/$} )
    {
        $fname  = "$$opts{prefix}index.html";
        $prefix = '';
    }
    else
    {
        $prefix = $$opts{prefix};
        $prefix =~ s{^.*/}{};
        $fname = $$opts{prefix};
        $fname =~ s/-$/.html/;
    }
    open(my $fh,'>',$fname) or error("$fname: $!");
    print $fh q[
        <html>
        <head>
            <style>
                .thumbnail
                {
                    text-decoration:none;
                    color:black;
                    font-weight:bold;
                }
                .thumbnail span  /*CSS for enlarged image*/
                {
                    visibility: hidden;
                    position: absolute;
                    padding: 5px;
                    border: 1px solid #000;
                    background-color: #e5e5e5;
                }
                .thumbnail:hover span  /*CSS for enlarged image on hover*/
                {
                    visibility: visible;
                    left: 550px;
                    top: 10px;
                }
                .imgs td
                {
                    vertical-align:middle;
                    padding: 0.5em;
                    border: 1px solid black;
                }
                table.imgs
                {
                    border-collapse:collapse;
                    margin-left:20px;
                }
                .nums th
                {
                    text-align: left;
                }
                table.nums
                {
                    margin-top: 1em;
                    margin-left:20px;
                    border: 1px dotted #83A4C3;
                    background-color: #F5F5F5;
                    padding: 0.5em;
                }
                .pad { padding-left:1em; vertical-align:top; }
                .right { text-align:right; padding-left:1em; }
            </style>
        </head>
        <body>
            <table class="imgs">
    ];

    my %imgs =
    (
        'insert-size'    => 'Insert size',
        'gc-content'     => 'GC content',
        'acgt-cycles'    => 'Per-base sequence content',
        'mism-per-cycle' => 'Mismatches per cycle',
        'quals'          => 'Quality per cycle',
        'quals2'         => 'Quality per cycle',
        'quals3'         => 'Quality per cycle',
        'quals-hm'       => 'Quality per cycle',
        'indel-cycles'   => 'Indels per cycle',
        'indel-dist'     => 'Indel lengths',
        'gc-depth'       => 'Mapped depth vs GC',
# I think this may be broken: 'coverage'       => '',
    );
    my @imgs = (qw(insert-size gc-content acgt-cycles mism-per-cycle quals quals2 quals3 quals-hm indel-cycles indel-dist gc-depth));
    for (my $i=0; $i<@imgs; $i++)
    {
        if ( $i % 3 == 0 )
        {
            # new row
            if ( $i>0 ) { print $fh "</tr>\n"; }
            print $fh "<tr>";
        }
        if ( -e "$$opts{prefix}$imgs[$i].png" )
        {
            my $desc = $imgs{$imgs[$i]};
            print $fh qq[<td><a class="thumbnail" href="$prefix$imgs[$i].png"><img src="$prefix$imgs[$i].png" width="150px"><span>$desc<br><img src="$prefix$imgs[$i].png"></span></a>\n];
        }
        else { print $fh "<td>\n"; }
    }

    my $reads_total      = get_value($opts,'SN','raw total sequences:');
    my $reads_filtered   = get_value($opts,'SN','filtered sequences:');
    my $percent_filtered = sprintf "(%.1f%%)", $reads_filtered * 100. / $reads_total;
    my $reads_mapped     = get_value($opts,'SN','reads mapped:');
    my $percent_mapped   = sprintf "(%.1f%%)", $reads_mapped * 100. / ($reads_total - $reads_filtered);
    my $reads_dup        = get_value($opts,'SN','reads duplicated:');
    my $percent_dup      = sprintf "(%.1f%%)", $reads_dup * 100. / ($reads_total - $reads_filtered);
    my $reads_mq0        = get_value($opts,'SN','reads MQ0:');
    my $percent_mq0      = sprintf "(%.1f%%)", ($reads_mapped ? $reads_mq0 * 100. / $reads_mapped : 0);
    my $reads_nonprim    = get_value($opts,'SN','non-primary alignments:');
    my $read_avglen      = get_value($opts,'SN','average length:');
    my $bases_total      = get_value($opts,'SN','total length:');
    my $bases_mapped     = get_value($opts,'SN','bases mapped (cigar):');
    my $bpercent_mapped  = sprintf "(%.1f%%)", $bases_mapped * 100. / $bases_total;
    my $error_rate       = sprintf "%.2f%%", 100.*get_value($opts,'SN','error rate:');

    $reads_total    = bignum($reads_total);
    $reads_filtered = bignum($reads_filtered);
    $reads_mapped   = bignum($reads_mapped);
    $reads_dup      = bignum($reads_dup);
    $reads_mq0      = bignum($reads_mq0);
    $reads_nonprim  = bignum($reads_nonprim);
    $bases_total    = bignum($bases_total);
    $bases_mapped   = bignum($bases_mapped);

    print $fh qq[
        </table>

        <table class="nums">
            <tr><th>Reads</tr>
            <tr>
                <td class="pad"><table>
                <tr><td>total:       <td class="right"> $reads_total    <td class="right"></tr>
                <tr><td>filtered:    <td class="right"> $reads_filtered <td class="right"> $percent_filtered</tr>
                <tr><td>non-primary: <td class="right"> $reads_nonprim  <td class="right"> </tr>
                <tr><td>duplicated:  <td class="right"> $reads_dup      <td class="right"> $percent_dup</tr>
                <tr><td>mapped:      <td class="right"> $reads_mapped   <td class="right"> $percent_mapped</tr>
                <tr><td>zero MQ:     <td class="right"> $reads_mq0      <td class="right"> $percent_mq0</tr>
                <tr><td>avg read length: <td class="right"> $read_avglen    <td class="right"></tr>
                </table></tr>

            <tr><th>Bases</tr>
            <tr>
                <td class="pad"><table>
                <tr><td>total:       <td class="right"> $bases_total    <td class="right"></tr>
                <tr><td>mapped:      <td class="right"> $bases_mapped   <td class="right"> $bpercent_mapped</tr>
                <tr><td>error rate:      <td class="right"> $error_rate  <td class="right"></tr>
                </table></tr>
        </table>

        </body>
        </html>
    ];

    close($fh);
}
samtools 1.7-1 / usr / bin / plot-bamstats