#!/usr/bin/perl -w
use strict;

################# Parameters ########################
my $B = 0.1;               # Weighting factor for pseudocounts
my $threshold = -2**30;    # Save motifs with scores greater than $threshold
                           #    Set initially to a very low number. 
                           #    Changes as hits are found.
my $info_threshold = 1.5;  # Column must have at least this information content
                           #    to be used in comparisons
my $number_to_save = 300;  # number of top scores and coordinates to save
my %bg_frequency =         # nucleotide frequencies of overall sequences
   ("A" => 0.33264,
    "T" => 0.33264,
    "G" => 0.16736,
    "C" => 0.16736); 

###################### FILES ########################

#   motif_input: input file of aligned motifs, produced by Meme
#   sequence_file: input file containing sequences in FA format
#   motif_hits: output file of best motifs within sequence file
#               Each line is name of sequence, score, start_coord, sequence

my $motif_input = '71NpNtsm.txt';
my $sequence_file = 'NostocChromosome.nt';
my $motif_hits = 'motif.hits';

############### GLOBAL MOTIF VARIABLES ################

my $motif_length;         # length of motifs
my @informational;        # holds those positions in the motif that contribute
                          #    to the motif -- i.e. those which are not
                          #    don't-care positions.
my %n;                    # number of occurrences of nucleotides at each
                          #    position
my %log_q;                # $log_q{$nucleotide}[$position] is the log of the
                          #    frequency of the nucleotide at that position
my %log_p;                # $log_p{$nucleotide} is the background frequency
                          #    of the nucleotide
my @motif_sequences;      # holds sequences in training set
my $N;                    # holds number of sequences in training set
my $stars;                # holds prior information on informational positions

################### MAIN PROGRAM ####################

print "Information content threshold: $info_threshold\n";
print "Saving top $number_to_save hits\n";

# Set the global motif variables $motif_length, @informational,
# %log_q, and %log_p

   get_motif();
   calc_motif_frequency();
   calc_background_frequency();
   if ($stars) { record_information_content() }
   else { calc_information_content() }

   my $good_positions = @informational;
   print "$good_positions out of $motif_length positions are informational\n";

# Find and save hits

   open_sequence_file();
   while (my($seq_name, $sequence) = get_new_sequence()) {
      analyze_sequence(shorten($seq_name), $sequence);
   }

print_hits();

#################### SUBROUTINES ####################

sub shorten {
   my ($long) = @_;

   # Return just the first word of $long
   
   my ($short) = $long =~ /(\w+)/;
   return $short;
}

################# MOTIF SUBROUTINES #################

sub get_motif {

   # Return array of aligned sequences from output of Meme
   # Set the global variables $motif_length and @informational

   # Extract informational line and motif sequences
   #
   open MOTIF_INPUT, "<$motif_input" or die "Can't open $motif_input: $!\n";
   while (<MOTIF_INPUT>) {
      if (/\b([ATCG]+)\b/i) { push @motif_sequences, uc $1 }
      elsif (/^\s+([\s\*\.]*\*)$/) { $stars = $1 }
   }
   close MOTIF_INPUT;

   # Complain and die unless we found some motifs and they are all the
   # same length 
   #
   @motif_sequences > 0 or die "No motif sequences found in $motif_input\n";
   $motif_length = length($motif_sequences[0]);
   foreach my $motif (@motif_sequences) {
      length($motif) == $motif_length
         or die "Length of motif $motif in $motif_input is not $motif_length\n";
   }

}

sub calc_motif_frequency {

   # Accumulate nucleotide totals at each position
   # Calculate the adjusted frequency at each position.
   #
   # The actual frequency is
   #
   #            n{nucleotide}[position] / N   ,
   # where:
   #    n{nucleotide}[position] is the number of times the nucleotide appears
   #       at the position
   #    N is the number of motifs found in the training session,
   #
   # The adjusted formula is a little more complicated:
   #
   #                           n{nucleotide}[position] + B * bg_pct[nucleotide]
   # q{nucleotide}[position] = ------------------------------------------------
   #                                                N + B
   # where n and N are as before, and:
   #    q{nucleotide}[position] is the adjusted frequency of the nucleotide
   #       at the position
   #    B is the weighting constant for pseudocounts
   #    bg_pct[nucleotide] is the fraction that the nucleotide appears in total
   #       DNA
   #
   # We actually store $log_q{$nucleotide}[position], the log of the adjusted
   # frequency, rather than the frequency itself.

   # Calculate %n
   #
   foreach my $nucleotide ("A", "C", "G", "T") {
      foreach my $position (0..$motif_length) {
         $n{$nucleotide}[$position] = 0;
      }
   }
   foreach my $motif (@motif_sequences) {
      foreach my $position (0..$motif_length-1) {
         my $nucleotide = substr($motif, $position, 1);
         $n{$nucleotide}[$position] = $n{$nucleotide}[$position] + 1;
      }
   }

   # Calculate %log_q
   #
   $N = @motif_sequences;
   foreach my $nucleotide ("A", "C", "G", "T") {
      foreach my $position (0..$motif_length) {
         $log_q{$nucleotide}[$position] =
            log(  ($n{$nucleotide}[$position] + $B * $bg_frequency{$nucleotide})
                / ($N + $B)
               );
      }
   }
}

sub calc_background_frequency {

   # Calculate background percentage (for pseudocounts)

   foreach my $nucleotide ("A", "C", "G", "T") {
      $log_p{$nucleotide} = log($bg_frequency{$nucleotide});
   }
}

######## INFORMATION CONTENT ROUTINES #################
#
# Here are two routines to initialize @informational, which tells
# the rest of the program which positions (columns) are worth bothering
# about.
#
sub record_information_content {

   # Informational positions were specified in input file by a line of
   # asterisks ($stars), so initialize @informational from that.

   foreach my $position (0..$motif_length-1) {
      push @informational, $position if substr($stars, $position, 1) eq "*";
   }
}

sub calc_information_content {

   # Determine information content for each column and on that basis
   # initialize @informational

   foreach my $position (0..$motif_length-1) {
      my $info = 0;
      foreach my $nucleotide ("A", "C", "G", "T") {
         my $fraction = $n{$nucleotide}[$position]/$N;
         if ($fraction > 0) {
            my $log2fraction = - log($fraction) / log(2);
            $info = $info + $fraction * $log2fraction;
         }
      }
      push @informational, $position if $info < $info_threshold;
   }

}

################# SCORING SUBROUTINES #################

sub analyze_sequence {
   my ($name, $sequence) = @_;
   my $region_start;
   my $subregion;
   foreach $region_start (0 .. length($sequence) - $motif_length) {
      if ($region_start % 1000 == 0) {print $region_start/1000,"\t"}
      $subregion = substr($sequence, $region_start, $motif_length);
      my $score = score_of_region($subregion);         
      if ($score > $threshold) {
         store_hit_information($name, $score, $region_start, $subregion)
      }
   }
}

sub score_of_region {
   my ($region) = @_;

   # The score is Sum [log_q - log_p] at each position
   # This means that the closer the sequence is to the set that
   # produced q, the higher the score

   my $score = 0;
   foreach my $position (@informational) {
     my $nucleotide = substr($region, $position, 1);
     if ($nucleotide eq ".") {return $threshold - 1}
     $score += $log_q{$nucleotide}[$position] - $log_p{$nucleotide};
  }
  return $score
}

my @hit_info;  # Information about each high-scoring sequence

sub store_hit_information {

   # Store the hit information.  If we've accumulated too many hits,
   # discard all but the highest-scoring ones.

   my ($name, $score, $start, $subregion) = @_;
   push @hit_info, [$score, $name, $start, $subregion];
   if (@hit_info > $number_to_save * 7) {
      sort_and_discard_excess_values();
      my $best_score = $hit_info[0][0];
      my $last_saved_score = $hit_info[$number_to_save - 1][0];
      printf "\nRange: %1.1f to %1.1f\t\t", $last_saved_score, $best_score;
   }
}

sub sort_and_discard_excess_values {
   @hit_info = reverse sort { $$a[0] <=> $$b[0] } @hit_info;
   while (@hit_info > $number_to_save) { pop @hit_info }
}

sub print_hits {

   # Print the highest scoring hits.

   sort_and_discard_excess_values();

   open HITS, ">$motif_hits" or die "Can't open $motif_hits: $!\n";
   foreach my $info (@hit_info) {
      my ($score, $name, $start, $subregion) = @$info;
      printf HITS "%9s %6.1f %8d %s\n", $name, $score, $start, $subregion;
   }
   close HITS
}

############# SEQUENCE INPUT SUBROUTINES #############

my $header_line;  # The header line of the next sequence (or undefined if
                  #    there are no more sequences in $sequence_file

sub open_sequence_file {

   # Open the sequence file, and initialize $header_line to (what should be)
   #   the header line of the first sequence
   #
   open SEQUENCE_INPUT, "<$sequence_file"
      or die "Can't open $sequence_file: $!\n";
   $header_line = <SEQUENCE_INPUT>;

}

sub get_new_sequence {

   # Return the next sequence in the input file, with its name.
   # The input is in FastA format, and in general will contain many
   # sequences.

   defined $header_line or return ();
   my ($name) = $header_line =~ /^>\s*(.*)/
      or die "Expected a sequence name at line $. of the sequence file\n";
   my @parts;
   while (<SEQUENCE_INPUT>) {
      last if /^>/;              # Header line of next sequence, so stop reading
      s/\s//g;
      push @parts, uc $_;
   }
   $header_line = $_;            # Save header line (or undef) for next time
   return ($name, join("", @parts));

}