#!/usr/bin/perl
#############################################################################
##
## File: filter.pl
## Date Created: 2006-03-20
##
## Copyright (c) 2006 David D. Allen
##
## Permission is hereby granted, free of charge, to any person obtaining a
## copy of this software and associated documentation files (the "Software"),
## to deal in the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS IN THE SOFTWARE.
##
#############################################################################

# don't allow questionable usage of perl
use strict;
	
#------------------------------------------------------------------ constants
my $javaClassPath =
   "../weka/lib/weka.jar";

#---------------------------------------------------------------------- setup
# check command line arguments
if ( scalar( @ARGV ) != 3 )
{
   die "Invalid command line arguments.\n" .
       "\n" .
       "usage: filter.pl input.arff output.arff threshold\n" .
       "   input.arff  - Name of input arff file to filter.\n" .
       "   output.arff - Name of output file to write data to. If this\n" .
       "                 file already exists, it will be overwritten.\n" .
       "   threshold   - Minimum probability of detection to accept for\n" .
       "                 any class in the data set as determined by a\n" .
       "                 Naive Bayes classifier. Classes that don't meet\n" .
       "                 this threshold will be removed.\n" .
       "\n" .
       "The filtering process occurs iteratively. Low PD classes are\n" .
       "removed one by one until all classes have a PD above the\n" .
       "threshold. The exception to this is that any class with a PD of \n" .
       "0 will be remove during the first pass.\n" .
       "\n";
}  # check command line arguments

# get the command line arguments
my ( $inputArff, $outputArff, $pdThresh ) = @ARGV;

#------------------------------------------------------ filter the input file
# set the name of the current input file to the original input file
my $currentInput = $inputArff;

# loop until filtering is complete
while ( 1 )
{
   #-------------------------------------------------- find classes to remove
   # run NaiveBayes with weka, 10 fold cross validation, and parse results
   # for PDs and class names
   my $wekaCommand = "java -cp $javaClassPath " .
      "weka.classifiers.bayes.NaiveBayes -o -i -t $currentInput";
   my $sectionCount = 0;
   my ( $lowClass, $lowPd, %lowPdClasses );
   for ( `$wekaCommand` )
   {
      # count the number of times we have seen the target section we are
      # intersted in
      /^=== Detailed Accuracy By Class ===$/ and ++$sectionCount, next;

      # skip all output until the second time we have seen this section
      ( $sectionCount < 2 ) and next;

      # skip rest of data after the section is done
      /^===/ and last;

      # skip blank lines
      /^$/ and next;

      # skip header line
      /^TP Rate/ and next;

      # get the PD
      /^\s*([^\s]+)[\s]/ or next;
      my $classPd = $1;

      # get the class name
      /.{52}(.+)$/ or next;
      my $className = $1;

      # add class to lowPdClasses if Pd is 0
      if ( $classPd == 0 )
      {
         # store class name in the hash
         $lowPdClasses{ $className } = $classPd;
      }  # add class to lowPdClasses if Pd is 0

      # otherwise set it as the low class if the class has the lowest PD
      # that is less than the threshold
      elsif (( $classPd < $pdThresh ) and
             ( !defined( $lowPd ) or ( $classPd < $lowPd )))
      {
         # remember the current values as the class with the lowest PD
         $lowClass = $className;
         $lowPd = $classPd;
      }  # otherwise set it as the low class if the class has the lowest PD..
   }  # run NaiveBayes with weka...parse results...

   # if found, add class with lowest PD to list of low PD classes
   defined( $lowClass ) and $lowPdClasses{ $lowClass } = $lowPd;

   # quit if there is no more work to do
   (( scalar( keys( %lowPdClasses )) == 0 ) and
    ( $currentInput eq $outputArff )) and exit( 0 );

   #-------------------------------------------------------- parse input file
   # open the input arff file
   open( FH, "<$currentInput" ) ||
      die "Could not open input ARFF file \"$currentInput\"\n";

   # parse input file
   my $inHeader = 1;
   my ( @headerData, @instanceData, %classes );
   for ( <FH> )
   {
      # strip comments
      s/^([^\%]*)\%.*$/$1/;

      # strip leading white space
      s/^\s*(.*)$/$1/;

      # skip empty lines
      /^$/ and next;

      # done with header when @data is reached
      /^\@data/i and (( $inHeader = 0 ), push( @instanceData, $_ ), next );

      # store header lines
      ( $inHeader ) and push( @headerData, $_ ), next;

      # split out class of instance
      my $class;
      /,\s*\'?([^,\']*)\'?$/ or next;
      chomp( $class = $1 );

      # skip the data if this class is being filtered out
      exists( $lowPdClasses{ $class } ) and next;

      # otherwise store the data
      push( @instanceData, $_ );

      # remember every class that has been seen
      $classes{ $class } = $class;
   } # parse input file

   # close the input file
   close( FH );

   # get the name of the class attribute
   my $attribHeader = pop( @headerData );
   $attribHeader =~ /^\@attribute\s+[\']?([^\s\'\{]+)[\']?\s+/i;
   my $classAttribName = $1;

   # create and add new class attribute to header
   if ( defined( $attribHeader ) and ( scalar( keys( %classes )) > 0 ))
   {
      # create the new class attribute
      my $newClassAttrib = "\@attribute '$classAttribName' {";
      for ( sort( keys( %classes )))
      {
         $newClassAttrib .= "'" . $_ . "',";
      }
      chop( $newClassAttrib );
      $newClassAttrib .= "}\n";

      # add new class attribute to header
      push( @headerData, $newClassAttrib );
   }  # create and add new class attribute to header

   #--------------------------------------- check for failing stop conditions
   # exit if filtering has eliminated the usefulness of the input data file
   if (( scalar( keys( %classes )) < 2 ) or
       ( scalar( @headerData ) == 0 ) or
       ( scalar( @instanceData ) == 0 ))
   {
      # delete the dest file if it exists
      unlink( $outputArff );

      # exit
      exit( 0 );
   }  # exit if filtering has eliminated the usefulness of the input...

   #------------------------------------------------------- write output file
   # open the output arff file
   open( FH, ">$outputArff" ) ||
      die "Could not open output ARFF file \"$outputArff\"\n";

   # write header and data to output file
   for ( @headerData ) { print FH; }
   for ( @instanceData ) { print FH; }

   # close the output file
   close( FH );

   # quit if there is no more work to do
   ( scalar( keys( %lowPdClasses )) == 0 ) and exit( 0 );

   # reset file name for next pass
   $currentInput = $outputArff;
} # loop until filtering is complete