#!/usr/bin/gawk -f
#
# Split an ARFF data set into a random test set plus two training sets.
#
# Inputs are taken by position: ARGV[3] is the "normal" ARFF file and ARGV[4]
# is the manually stratified ARFF file.  ARGV[1] and ARGV[2] are presumably
# "var=value" assignments such as TestSize=10 Seed=7 -- TODO confirm with the
# calling scripts.
#
# Outputs (written to the current directory):
#   test.arff                      header of ARGV[3] + TestSize random instances of ARGV[3]
#   train.arff                     header of ARGV[3] + the remaining instances of ARGV[3]
#   trainManualStratification.arff header of ARGV[3] + every instance of ARGV[4]
#                                  whose text does not appear in test.arff

BEGIN {
    TrainNormal               = "train.arff"
    TrainManualStratification = "trainManualStratification.arff"
    TestNormal                = "test.arff"
    TestSize                  = 5    # number of instances placed in the test file
    Seed                      = 1    # seed for the random shuffle
}

# One-time setup on the very first record.  This rule must come before the
# blank-line rule below; otherwise a blank first line would skip the seeding
# and the truncation of the output files.  It is done here rather than in
# BEGIN so that assignments given on the command line are already in effect.
NR == 1 {
    srand(Seed ? Seed : 1)
    printf "" > TrainNormal
    printf "" > TrainManualStratification
    printf "" > TestNormal
}

# Ignore empty / whitespace-only lines.
/^[ \t]*$/ { next }

# Header of the normal file (from @relation through @data): copy it to all
# three outputs.  ARFF keywords are case-insensitive, hence tolower().
(FILENAME == ARGV[3] && tolower($0) ~ /@relation/), (tolower($0) ~ /@data/) {
    print > TrainNormal
    print > TrainManualStratification
    print > TestNormal
    next
}

# Header of the manually stratified file: drop it, the header of the normal
# file has already been written to every output.
(FILENAME == ARGV[4] && tolower($0) ~ /@relation/), (tolower($0) ~ /@data/) {
    next
}

# Collect the data lines under sequential indices.  Using rand() as the array
# key would lose instances: subscripts are strings, and a fractional number is
# converted with CONVFMT ("%.6g" by default), so different draws can collide.
FILENAME == ARGV[3] { InstanceNormal[++NormalCount] = $0 }
FILENAME == ARGV[4] { InstanceManualStratification[++ManualCount] = $0 }

END {
    # Randomise both sets explicitly instead of relying on the unspecified
    # traversal order of "for (key in array)".
    shuffle(InstanceNormal, NormalCount)
    shuffle(InstanceManualStratification, ManualCount)

    # The first TestSize shuffled instances form the test set; the rest go to
    # the normal training file.
    remaining = TestSize + 0
    for (i = 1; i <= NormalCount; i++) {
        line = InstanceNormal[i]
        if (remaining > 0) {
            key = line ""          # force exact text comparison
            InTest[key] = 1        # remember what ended up in the test file
            print line > TestNormal
            remaining--
        } else {
            print line > TrainNormal
        }
    }

    # A manually stratified instance may be used for training only if the
    # same instance was not selected for the test file.
    for (i = 1; i <= ManualCount; i++) {
        line = InstanceManualStratification[i]
        key = line ""
        if (!(key in InTest))
            print line > TrainManualStratification
    }
}

# Fisher-Yates shuffle of items[1..n] in place, driven by rand().
function shuffle(items, n,    i, j, tmp) {
    for (i = n; i > 1; i--) {
        j = 1 + int(rand() * i)    # uniform index in 1..i
        tmp = items[i]
        items[i] = items[j]
        items[j] = tmp
    }
}