#!/usr/bin/perl
#############################################################################
##
## File: resample.pl
## Date Created: 2006-03-20
##
## Copyright (c) 2006 David D. Allen
##
## Permission is hereby granted, free of charge, to any person obtaining a
## copy of this software and associated documentation files (the "Software"),
## to deal in the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS IN THE SOFTWARE.
##
#############################################################################

# don't allow questionable usage of perl
use strict;

#---------------------------------------------------------------------- setup
# Exactly seven positional arguments are required; anything else prints the
# usage text and exits.  The text is kept in a non-interpolating here-document
# so it can be read and edited as plain text.
#
# The descriptions follow how the arguments are used further down in the
# script: minTrain/maxTrain bound the number of instances written per class
# to each training file, and minTest/maxTest bound the size of each of the
# two sections of a test file.
if ( scalar( @ARGV ) != 7 )
{
   die <<'END_USAGE';
Invalid command line arguments.

usage: resample.pl input.arff output minTrain maxTrain minTest
       maxTest repeats
   input.arff - Name of input arff file to resample.
   output     - Name of folder to write to.
   minTrain   - Minimum number of instances per class for
                training files.
   maxTrain   - Maximum number of instances per class for
                training files.
   minTest    - Minimum number of instances per set for the test
                files.
   maxTest    - Maximum number of instances per set for the test
                files.
   repeats    - Number of repeats of files.

The input arff file is read and training and tests sets are
created from it. Training arff files have instances grouped by
class, where the order of classes is randomly set, and the
number of instances per class is chosen between minTrain and
maxTrain. Test arff files contain two sets of instances with the
size of each set chosen between minTest and maxTest.

A pair of <training, test> is generated for each class in the
input arff files. For each pair one class is chosen to not
appear in the training file, but to show up as the class for the
second section of the test file, with the instances randomly
selected from that class. The instances in first section of the
test file are randomly selected from every other class.

The sets of files created based on classes have the class number
appended to the name. The entire process is repeated
"repeats" times, with the repeat number appended to the file
name.

END_USAGE
}  # check command line arguments

# get the command line arguments
my ( $inputArff, $outputFolder, $minTrain, $maxTrain, $minTest, $maxTest,
     $repeats ) = @ARGV;

# get the base file name: the last path component without its .arff
# extension.  A match in list context returns the capture, or an empty list
# when the pattern does not match, which leaves $arffFile undefined.
my ( $arffFile ) = $inputArff =~ m{([^/]+)\.arff$};

# fail if input file name was not correct
defined( $arffFile ) or
   die "Input file \"$inputArff\" must end with an .arff extension.\n";

# the numeric arguments must be non-negative whole numbers; without this
# check a non-numeric value would silently be treated as 0
my @numericArgs = (
   [ "minTrain", $minTrain ], [ "maxTrain", $maxTrain ],
   [ "minTest",  $minTest  ], [ "maxTest",  $maxTest  ],
   [ "repeats",  $repeats  ] );
foreach my $numericArg ( @numericArgs )
{
   my ( $argName, $argValue ) = @{ $numericArg };
   $argValue =~ /\A\d+\z/ or
      die "Argument " . $argName . " must be a non-negative integer, " .
          "not \"" . $argValue . "\".\n";
}  # validate numeric arguments

# each minimum must not exceed its maximum
$minTrain <= $maxTrain or
   die "minTrain ($minTrain) must not be greater than maxTrain " .
       "($maxTrain).\n";
$minTest <= $maxTest or
   die "minTest ($minTest) must not be greater than maxTest ($maxTest).\n";

# setup csv array; the first row is the column header and one row per
# generated test file is appended while the file sets are produced
my @csvContent = ( [ "set name", "file name", "iteration",
   "class number", "class", "holdout instance" ] );

#------------------------------------------------------ produce the file sets
# shuffle() is a core module function; it is imported here so this section
# is self-contained
use List::Util qw( shuffle );

# Reads an ARFF file and splits it into header lines and instance lines.
#
#   $path - name of the ARFF file to read
#
# Returns a two element list:
#   - reference to an array holding the header lines (everything before the
#     @data line), in file order
#   - reference to a hash mapping each class label to a reference to an
#     array of the instance lines that carry that label
#
# Everything from the first '%' on a line is treated as a comment and
# removed, leading and trailing white space (including a carriage return
# from CRLF files) is trimmed, and empty lines are skipped.  Every stored
# line ends with exactly one "\n", even if the last line of the file had
# none.  The class label is the last comma separated field of an instance
# line, with optional single quotes removed; data lines without a comma
# are ignored.
#
# Dies if the file cannot be opened.
sub readArff
{
   my ( $path ) = @_;

   open( my $inputHandle, "<", $path ) or
      die "Could not open input ARFF file \"$path\": $!\n";

   my $inHeader = 1;
   my ( @headerLines, %instancesByClass );
   while ( my $line = <$inputHandle> )
   {
      # strip comments
      $line =~ s/^([^\%]*)\%.*$/$1/;

      # strip surrounding white space, including the line terminator
      $line =~ s/^\s+//;
      $line =~ s/\s+\z//;

      # skip empty lines
      $line eq "" and next;

      # done with header when @data is reached
      if ( $line =~ /^\@data/i )
      {
         $inHeader = 0;
         next;
      }

      # store header lines
      if ( $inHeader )
      {
         push( @headerLines, $line . "\n" );
         next;
      }

      # split out class of instance: the last field, optionally quoted
      $line =~ /,\s*\'?([^,\']*)\'?\z/ or next;

      # store data in array based on the class
      push( @{ $instancesByClass{ $1 }}, $line . "\n" );
   }  # parse input file

   close( $inputHandle );

   return ( \@headerLines, \%instancesByClass );
}  # readArff

# Returns a random whole number between $min and $max, both inclusive.
# The "+ 1" is what makes $max reachable; rand() never returns its upper
# bound.
sub randomCount
{
   my ( $min, $max ) = @_;
   return $min + int( rand( $max - $min + 1 ));
}  # randomCount

# Creates an output ARFF file and writes the header and the @data line.
#
#   $fileName    - name of the file to create (an existing file is replaced)
#   $kind        - word used in the error message ("training" or "test")
#   $headerLines - reference to the array of header lines to write
#
# Returns the open file handle.  Dies if the file cannot be created.
sub openArffOutput
{
   my ( $fileName, $kind, $headerLines ) = @_;

   open( my $outputHandle, ">", $fileName ) or
      die "Could not open output $kind ARFF file \"$fileName\": $!\n";

   print {$outputHandle} @{ $headerLines }, "\@data\n";

   return $outputHandle;
}  # openArffOutput

# Writes $count instance lines to $outputHandle, each one picked at random
# from @{ $pool }.  The same line may be picked more than once.
sub writeRandomInstances
{
   my ( $outputHandle, $pool, $count ) = @_;

   my $poolSize = scalar( @{ $pool } );
   foreach ( 1 .. $count )
   {
      print {$outputHandle} $pool->[ int( rand( $poolSize )) ];
   }
}  # writeRandomInstances

#--------------------------------------------------------- read input file
# the input never changes between repeats, so it is parsed only once
my ( $headerLines, $instancesByClass ) = readArff( $inputArff );

# every class takes one turn as the holdout class, in sorted order
my @classes = sort( keys( %{ $instancesByClass } ));

# with fewer than two classes there is nothing left to draw the training
# file and the first test section from once the holdout class is removed
scalar( @classes ) >= 2 or
   die "Input file \"$inputArff\" must contain instances of at least " .
       "two classes.\n";

# create files for each repeat
foreach my $repeatNum ( 0 .. $repeats - 1 )
{
   # create text for repeat
   my $repeatNumText = sprintf( "%02d", $repeatNum );

   #------------------------------------------- create training/testing files
   # create files for each class in input file
   foreach my $fileNum ( 0 .. $#classes )
   {
      my $holdoutClass = $classes[ $fileNum ];

      # common start of both file names:
      # <folder>/<base name>-<repeat>-<file number>-
      my $fileNamePrefix = $outputFolder . "/" . $arffFile . "-" .
         $repeatNumText . "-" . sprintf( "%02d", $fileNum ) . "-";

      # all classes except the holdout class
      my @otherClasses = @classes;
      splice( @otherClasses, $fileNum, 1 );

      #------------------------------------------------- create training file
      my $trainFileName = $fileNamePrefix . "train.arff";
      my $trainHandle = openArffOutput( $trainFileName, "training",
         $headerLines );

      # write training arff data grouped by class, classes in random order
      foreach my $class ( shuffle( @otherClasses ))
      {
         writeRandomInstances( $trainHandle, $instancesByClass->{ $class },
            randomCount( $minTrain, $maxTrain ));
      }  # write training arff data for each class

      # buffered write errors only show up when the file is closed
      close( $trainHandle ) or
         die "Could not write training ARFF file \"$trainFileName\": $!\n";

      #----------------------------------------------------- create test file
      # the prefix already ends with "-", so none is added here
      my $testFileName = $fileNamePrefix . "test.arff";
      my $testHandle = openArffOutput( $testFileName, "test",
         $headerLines );

      # first section: instances from all classes but the holdout, with the
      # class chosen anew for every instance
      my $testInstances = randomCount( $minTest, $maxTest );
      foreach ( 1 .. $testInstances )
      {
         my $class = $otherClasses[ int( rand( scalar( @otherClasses ))) ];
         writeRandomInstances( $testHandle, $instancesByClass->{ $class },
            1 );
      }  # write the desired number of instances from all classes but holdout

      # add row to CSV output; the last column is the 1-based position of
      # the first holdout instance among the data lines of the test file
      push( @csvContent, [ $arffFile, $testFileName, $repeatNum, $fileNum,
         $holdoutClass, $testInstances + 1 ] );

      # second section: instances from the holdout class only
      my $holdoutInstances = randomCount( $minTest, $maxTest );
      writeRandomInstances( $testHandle,
         $instancesByClass->{ $holdoutClass }, $holdoutInstances );

      close( $testHandle ) or
         die "Could not write test ARFF file \"$testFileName\": $!\n";
   }  # create files for each class in input file
}  # create files for each repeat

#-------------------------------------------------------- write the CSV files
# The CSV file is written to the output folder and named after the input
# file's base name.  It holds one line per row of @csvContent, in order.

# create the file name
my $csvFileName = $outputFolder . "/" . $arffFile . ".csv";

# open the CSV file
open( my $csvHandle, ">", $csvFileName ) or
   die "Could not open output CSV file \"$csvFileName\": $!\n";

# loop over each row of csv data
foreach my $row ( @csvContent )
{
   # build the list of fields; a field that contains a comma, a double quote
   # or a line break is wrapped in double quotes with embedded quotes
   # doubled, so it cannot be mistaken for several columns
   my @fields;
   foreach my $value ( @{ $row } )
   {
      my $field = $value;
      if ( $field =~ /[",\r\n]/ )
      {
         $field =~ s/"/""/g;
         $field = "\"" . $field . "\"";
      }
      push( @fields, $field );
   }  # build the list of fields

   # write to CSV file
   print {$csvHandle} join( ",", @fields ), "\n";
}  # loop over each row of csv data

# close the output CSV file; buffered write errors only show up here
close( $csvHandle ) or
   die "Could not write CSV file \"$csvFileName\": $!\n";

