/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JRip.java
 *    Copyright (C) 2001 Xin Xu, Eibe Frank
 */

package weka.classifiers.rules;

import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import java.io.Serializable;

import weka.core.FastVector;
import weka.core.Instances;
import weka.core.Instance;
import weka.core.Attribute;
import weka.core.AttributeStats;
import weka.core.Utils;
import weka.core.OptionHandler;
import weka.core.Option;
import weka.core.Copyable;
import weka.core.WeightedInstancesHandler;
import weka.core.AdditionalMeasureProducer;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.UnsupportedClassTypeException;

import weka.filters.supervised.attribute.ClassOrder;
import weka.filters.Filter;

import weka.classifiers.Evaluation;
import weka.classifiers.Classifier;

/**
 * This class implements a propositional rule learner, Repeated Incremental
 * Pruning to Produce Error Reduction (RIPPER), which is proposed by William
 * W. Cohen as an optimized version of IREP. <p>
 * 
 * The algorithm is briefly described as follows: <p>
 * Initialize RS = {}, and for each class from the less prevalent one to 
 * the more frequent one, DO: <p>
 *
 * 1. Building stage: repeat 1.1 and 1.2 until the description length (DL)
 *    of the ruleset and examples is 64 bits greater than the smallest DL
 *    met so far, or there are no positive examples, or the error rate >= 50%. 
 *    <p>
 * 1.1. Grow phase:<br>
 *      Grow one rule by greedily adding antecedents (or conditions) to
 *      the rule until the rule is perfect (i.e. 100% accurate).  The
 *      procedure tries every possible value of each attribute and selects
 *      the condition with highest information gain: p(log(p/t)-log(P/T)).
 *      <p>
 * 1.2. Prune phase:<br>
 *      Incrementally prune each rule and allow the pruning of any
 *      final sequences of the antecedents;<br>
 *      The pruning metric is (p-n)/(p+n) -- but it's actually 
 *      2p/(p+n) -1, so in this implementation we simply use p/(p+n)
 *      (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).<p>
 *
 * 2. Optimization stage: after generating the initial ruleset {Ri}, 
 *    generate and prune two variants of each rule Ri from randomized data
 *    using procedure 1.1 and 1.2.  But one variant is generated from an 
 *    empty rule while the other is generated by greedily adding antecedents
 *    to the original rule.  Moreover, the pruning metric used here is 
 *    (TP+TN)/(P+N).<br>
 *    Then the smallest possible DL for each variant and the original rule
 *    is computed.  The variant with the minimal DL is selected as the final 
 *    representative of Ri in the ruleset. <br>
 *    After all the rules in {Ri} have been examined and if there are still
 *    residual positives, more rules are generated based on the residual 
 *    positives using Building Stage again. <p>
 *
 * 3. Delete the rules from the ruleset that would increase the DL of the
 *    whole ruleset if it were in it, and add the resultant ruleset to RS. <p>
 * 
 * ENDDO<p>
 *
 * Note that there seem to be 2 bugs in the ripper program that would
 * affect the ruleset size and accuracy slightly.  This implementation avoids
 * these bugs and thus is a little bit different from Cohen's original 
 * implementation.  Even after fixing the bugs, since the order of classes with
 * the same frequency is not defined in ripper, there still seems to be 
 * some trivial difference between this implementation and the original ripper,
 * especially for audiology data in UCI repository, where there are lots of
 * classes of few instances.<p>
 *
 * If wrapped by other classes, typical usage of this class is:<br>
 *
 * <code>JRip rip = new JRip();
 * Instances data = ... // Data from somewhere
 * double[] orderedClasses = ... // Get the ordered class counts for the data 
 * double expFPRate = ... // Calculate the expected FP/(FP+FN) rate
 * double classIndex = ...  // The class index for which ruleset is built
 * // DL of default rule, no theory DL, only data DL
 * double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(),
 *				   0.0, orderedClasses[(int)classIndex]);
 *	    
 * rip.rulesetForOneClass(expFPRate, data, classIndex, defDL); 
 * RuleStats rulesetStats = rip.getRuleStats(0);
 *
 * // Can get heaps of information from RuleStats, e.g. combined DL, 
 * // simpleStats, etc.
 * double comDL = rulesetStats.combinedDL(expFPRate, classIndex);
 * int whichRule = ... // Want simple stats of which rule?
 * double[] simpleStats = rulesetStats.getSimpleStats(whichRule);
 * ...
 * </code>
 *
 * Details please see "Fast Effective Rule Induction", William W. Cohen, 
 * 'Machine Learning: Proceedings of the Twelfth International Conference'
 * (ML95). <p>
 *
 * PS.  We have compared this implementation with the original ripper 
 * implementation in aspects of accuracy, ruleset size and running time 
 * on both artificial data "ab+bcd+defg" and UCI datasets.  In all these
 * aspects it seems to be quite comparable to the original ripper 
 * implementation.  However, we didn't consider memory consumption
 * optimization in this implementation.<p>
 *
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 1.14 $
 */
public class JRip extends Classifier 
  implements OptionHandler, 
	     AdditionalMeasureProducer, 
	     WeightedInstancesHandler{    

  /** The limit of description length surplus in ruleset generation
      (a true constant, so declared final) */
  private static final double MAX_DL_SURPLUS = 64.0;
    
  /** The class attribute of the data */
  private Attribute m_Class; 
    
  /** The ruleset */
  private FastVector m_Ruleset;
  
  /** The predicted class distribution */
  private FastVector m_Distributions;
  
  /** Runs of optimizations */
  private int m_Optimizations = 2;
    
  /** Random object used in this class */
  private Random m_Random = null;
    
  /** # of all the possible conditions in a rule */
  private double m_Total = 0;

  /** The seed to perform randomization */
  private long m_Seed = 1;

  /** The number of folds to split data into Grow and Prune for IREP */
  private int m_Folds = 3;
    
  /** The minimal number of instance weights within a split */
  private double m_MinNo = 2.0;

  /** Whether in a debug mode */
  private boolean m_Debug = false;

  /** Whether check the error rate >= 0.5 in stopping criteria */
  private boolean m_CheckErr = true;

  /** Whether use pruning, i.e. the data is clean or not */
  private boolean m_UsePruning = true;

  /** The filter used to randomize the class order */
  private Filter m_Filter = null;

  /** The RuleStats for the ruleset of each class value */
  private FastVector m_RulesetStats;
    
  /**
   * Returns a string describing classifier
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {

    // Fixed typo "descrition" -> "description" in the displayed text.
    return "This class implements a propositional rule learner, Repeated Incremental "
      + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
      + "W. Cohen as an optimized version of IREP. \n\n"
      + "The algorithm is briefly described as follows: \n\n"
      + "Initialize RS = {}, and for each class from the less prevalent one to "
      + "the more frequent one, DO: \n\n"
      + "1. Building stage:\nRepeat 1.1 and 1.2 until the description length (DL) "
      + "of the ruleset and examples is 64 bits greater than the smallest DL "
      + "met so far, or there are no positive examples, or the error rate >= 50%. "
      + "\n\n"
      + "1.1. Grow phase:\n"
      + "Grow one rule by greedily adding antecedents (or conditions) to "
      + "the rule until the rule is perfect (i.e. 100% accurate).  The "
      + "procedure tries every possible value of each attribute and selects "
      + "the condition with highest information gain: p(log(p/t)-log(P/T))."
      + "\n\n"
      + "1.2. Prune phase:\n"
      + "Incrementally prune each rule and allow the pruning of any "
      + "final sequences of the antecedents;"
      + "The pruning metric is (p-n)/(p+n) -- but it's actually "
      + "2p/(p+n) -1, so in this implementation we simply use p/(p+n) "
      + "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
      + "2. Optimization stage:\n after generating the initial ruleset {Ri}, "
      + "generate and prune two variants of each rule Ri from randomized data "
      + "using procedure 1.1 and 1.2. But one variant is generated from an "
      + "empty rule while the other is generated by greedily adding antecedents "
      + "to the original rule. Moreover, the pruning metric used here is "
      + "(TP+TN)/(P+N)."
      + "Then the smallest possible DL for each variant and the original rule "
      + "is computed.  The variant with the minimal DL is selected as the final "
      + "representative of Ri in the ruleset."
      + "After all the rules in {Ri} have been examined and if there are still "
      + "residual positives, more rules are generated based on the residual "
      + "positives using Building Stage again. \n"
      + "3. Delete the rules from the ruleset that would increase the DL of the "
      + "whole ruleset if it were in it. and add resultant ruleset to RS. \n"
      + "ENDDO\n\n"
      + "Note that there seem to be 2 bugs in the original ripper program that would "
      + "affect the ruleset size and accuracy slightly.  This implementation avoids "
      + "these bugs and thus is a little bit different from Cohen's original "
      + "implementation. Even after fixing the bugs, since the order of classes with "
      + "the same frequency is not defined in ripper, there still seems to be "
      + "some trivial difference between this implementation and the original ripper, "
      + "especially for audiology data in UCI repository, where there are lots of "
      + "classes of few instances.\n\n"
      + "Details please see \"Fast Effective Rule Induction\", William W. Cohen, "
      + "'Machine Learning: Proceedings of the Twelfth International Conference'"
      + "(ML95). \n\n"
      + "PS.  We have compared this implementation with the original ripper "
      + "implementation in aspects of accuracy, ruleset size and running time "
      + "on both artificial data \"ab+bcd+defg\" and UCI datasets.  In all these "
      + "aspects it seems to be quite comparable to the original ripper "
      + "implementation.  However, we didn't consider memory consumption "
      + "optimization in this implementation.\n\n";    
  }
    
  /**
   * Returns an enumeration describing the available options
   * Valid options are: <p>
   *  
   * -F number <br>
   * The number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   * 
   * -N number <br>
   * The minimal weights of instances within a split.
   * (Default: 2) <p>
   *    
   * -O number <br>
   * Set the number of runs of optimizations. (Default: 2)<p>
   *
   * -D <br>
   * Whether turn on the debug mode <p>
   *
   * -S number <br>
   * The seed of randomization used in Ripper.(Default: 1)<p>
   *
   * -E <br>
   * Whether NOT check the error rate >= 0.5 in stopping criteria.
   * (default: check)<p> 
   *
   * -P <br>
   * Whether NOT use pruning. (default: use pruning)<p>
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    // Seven options are registered below, so size the vector to 7
    // (the former capacity of 3 merely forced internal growth).
    Vector newVector = new Vector(7);
    newVector.addElement(new Option("\tSet number of folds for REP\n" +
				    "\tOne fold is used as pruning set.\n" +
				    "\t(default 3)","F", 1, "-F <number of folds>"));
    newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
				    "\twithin a split.\n" +
				    "\t(default 2.0)","N", 1, "-N <min. weights>"));		 
    newVector.addElement(new Option("\tSet the number of runs of\n"+
				    "\toptimizations. (Default: 2)", "O",
				    1,"-O <number of runs>"));
	
    newVector.addElement(new Option("\tSet whether turn on the\n"+
				    "\tdebug mode (Default: false)", "D",
				    0,"-D"));
	
    newVector.addElement(new Option("\tThe seed of randomization\n"+
				    "\t(Default: 1)", "S",
				    1,"-S <seed>"));
	
    // The leading "\t" was missing on the next two descriptions, making
    // them inconsistent with the other options in the printed help.
    newVector.addElement(new Option("\tWhether NOT check the error rate>=0.5\n"
				    +"\tin stopping criteria "
				    +"\t(default: check)", "E", 
				    0, "-E")); 
	
    newVector.addElement(new Option("\tWhether NOT use pruning\n"
				    +"\t(default: use pruning)", "P", 
				    0, "-P")); 
    return newVector.elements();
  }
    
  /**
   * Parses a given list of options. Any numeric option left unspecified
   * is reset to its documented default.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String optionString;

    // -F: number of folds for reduced-error pruning (default 3)
    optionString = Utils.getOption('F', options);
    m_Folds = (optionString.length() == 0) ? 3 : Integer.parseInt(optionString);

    // -N: minimal instance weight within a split (default 2.0)
    optionString = Utils.getOption('N', options);
    m_MinNo = (optionString.length() == 0) ? 2.0 : Double.parseDouble(optionString);

    // -S: randomization seed (default 1)
    optionString = Utils.getOption('S', options);
    m_Seed = (optionString.length() == 0) ? 1 : Long.parseLong(optionString);

    // -O: number of optimization runs (default 2)
    optionString = Utils.getOption('O', options);
    m_Optimizations = (optionString.length() == 0) ? 2 : Integer.parseInt(optionString);

    // Flags: -D debug on; -E suppress error-rate check; -P suppress pruning
    m_Debug = Utils.getFlag('D', options);
    m_CheckErr = !Utils.getFlag('E', options);
    m_UsePruning = !Utils.getFlag('P', options);
  }
    
  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions;
   *         unused trailing slots are filled with empty strings
   */
  public String [] getOptions() {

    String[] options = new String[11];
    int pos = 0;

    options[pos++] = "-F";
    options[pos++] = String.valueOf(m_Folds);
    options[pos++] = "-N";
    options[pos++] = String.valueOf(m_MinNo);
    options[pos++] = "-O";
    options[pos++] = String.valueOf(m_Optimizations);
    options[pos++] = "-S";
    options[pos++] = String.valueOf(m_Seed);

    if (m_Debug) {
      options[pos++] = "-D";
    }
    if (!m_CheckErr) {
      options[pos++] = "-E";
    }
    if (!m_UsePruning) {
      options[pos++] = "-P";
    }

    // Pad the remainder so no slot is left null.
    while (pos < options.length) {
      options[pos++] = "";
    }

    return options;
  }

  /**
   * Returns an enumeration of the additional measure names.
   * Only "measureNumRules" is exposed.
   *
   * @return an enumeration of the measure names
   */
  public Enumeration enumerateMeasures() {
    Vector measureNames = new Vector(1);
    measureNames.addElement("measureNumRules");
    return measureNames.elements();
  }
    
  /**
   * Returns the value of the named measure
   * @param additionalMeasureName the name of the measure to query for its value
   * @return the value of the named measure
   * @exception IllegalArgumentException if the named measure is not supported
   */
  public double getMeasure(String additionalMeasureName) {
    // equalsIgnoreCase is the idiomatic (and clearer) form of
    // compareToIgnoreCase(...) == 0.
    if (additionalMeasureName.equalsIgnoreCase("measureNumRules")) {
      return m_Ruleset.size();
    }
    throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
  }  

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String foldsTipText() {
    return "Determines the amount of data used for pruning. One fold is used for "
      + "pruning, the rest for growing the rules.";
  }

  /** Sets the number of folds for REP (one fold is used as the pruning set) */
  public void setFolds(int fold) {
    m_Folds = fold;
  }

  /** Gets the number of folds for REP */
  public int getFolds() {
    return m_Folds;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String minNoTipText() {
    return "The minimum total weight of the instances in a rule.";
  }

  /** Sets the minimum total instance weight within a split */
  public void setMinNo(double m) {
    m_MinNo = m;
  }

  /** Gets the minimum total instance weight within a split */
  public double getMinNo() {
    return m_MinNo;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "The seed used for randomizing the data.";
  }

  /** Sets the randomization seed */
  public void setSeed(long s) {
    m_Seed = s;
  }

  /** Gets the randomization seed */
  public long getSeed() {
    return m_Seed;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String optimizationsTipText() {
    return "The number of optimization runs.";
  }

  /** Sets the number of optimization runs */
  public void setOptimizations(int run) {
    m_Optimizations = run;
  }

  /** Gets the number of optimization runs */
  public int getOptimizations() {
    return m_Optimizations;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String debugTipText() {
    return "Whether debug information is output to the console.";
  }

  /** Sets whether debug information is printed */
  public void setDebug(boolean d) {
    m_Debug = d;
  }

  /** Gets whether debug information is printed */
  public boolean getDebug() {
    return m_Debug;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String checkErrorRateTipText() {
    return "Whether check for error rate >= 1/2 is included" +
      " in stopping criterion.";
  }

  /** Sets whether the error rate >= 1/2 check is part of the stopping criterion */
  public void setCheckErrorRate(boolean d) {
    m_CheckErr = d;
  }

  /** Gets whether the error rate >= 1/2 check is part of the stopping criterion */
  public boolean getCheckErrorRate() {
    return m_CheckErr;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String usePruningTipText() {
    return "Whether pruning is performed.";
  }

  /** Sets whether pruning is performed */
  public void setUsePruning(boolean d) {
    m_UsePruning = d;
  }

  /** Gets whether pruning is performed */
  public boolean getUsePruning() {
    return m_UsePruning;
  }
    
  /** 
   * Get the ruleset generated by Ripper 
   *
   * @return the ruleset
   */
  public FastVector getRuleset() {
    return m_Ruleset;
  }

  /** 
   * Get the statistics of the ruleset in the given position
   *
   * @param pos the position of the stats, assuming correct
   * @return the RuleStats object stored at that position
   */
  public RuleStats getRuleStats(int pos) {
    return (RuleStats) m_RulesetStats.elementAt(pos);
  }
    
  /** 
   * The single antecedent in the rule, which is composed of an attribute and 
   * the corresponding value.  There are two inherited classes, namely NumericAntd
   * and NominalAntd in which the attributes are numeric and nominal respectively.
   */    
  private abstract class Antd 
    implements WeightedInstancesHandler, Copyable, Serializable {
	
    /** The attribute of the antecedent */
    protected Attribute att;
	
    /** The attribute value of the antecedent.  
       For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */
    protected double value; 
	
    /** The maximum infoGain achieved by this antecedent test 
     * in the growing data */
    protected double maxInfoGain;
	
    /** The accurate rate of this antecedent test on the growing data */
    protected double accuRate;
	
    /** The coverage of this antecedent in the growing data */
    protected double cover;
	
    /** The accurate data for this antecedent in the growing data */
    protected double accu;
	
    /** Constructor: statistics start as NaN (and infoGain as 0) and are
        only filled in once splitData() has been run on growing data */
    public Antd(Attribute a){
      att=a;
      value=Double.NaN; 
      maxInfoGain = 0;
      accuRate = Double.NaN;
      cover = Double.NaN;
      accu = Double.NaN;
    }
	
    /* The abstract members for inheritance: subclasses implement the
       split/cover/print behavior for their attribute type */
    public abstract Instances[] splitData(Instances data, double defAcRt, 
					  double cla);
    public abstract boolean covers(Instance inst);
    public abstract String toString();
	
    /** Implements Copyable */
    public abstract Object copy(); 
	   
    /* Get functions of this antecedent */
    public Attribute getAttr(){ return att; }
    public double getAttrValue(){ return value; }
    public double getMaxInfoGain(){ return maxInfoGain; }
    public double getAccuRate(){ return accuRate; } 
    public double getAccu(){ return accu; } 
    public double getCover(){ return cover; } 
  }
    
  /** 
   * The antecedent with numeric attribute
   */
  private class NumericAntd extends Antd{
	
    /** The split point for this numeric antecedent */
    private double splitPoint;
    
    /** Constructor */
    public NumericAntd(Attribute a){ 
      super(a);
      splitPoint = Double.NaN;
    }    
	
    /** Get split point of this numeric antecedent */
    public double getSplitPoint(){ return splitPoint; }
	
    /** Implements Copyable */
    public Object copy(){ 
      NumericAntd na = new NumericAntd(getAttr());
      na.value = this.value;
      na.splitPoint = this.splitPoint;
      return na;
    }
	
    /**
     * Implements the splitData function.  
     * This procedure is to split the data into two bags according 
     * to the information gain of the numeric attribute value
     * The maximum infoGain is also calculated.  
     * 
     * NOTE(review): data.sort(att) below reorders the caller's Instances
     * object in place, since "data" is just an alias for "insts".
     *
     * @param insts the data to be split
     * @param defAcRt the default accuracy rate for data
     * @param cl the class label to be predicted
     * @return the array of data after split, or null if every instance
     *         has a missing value for this attribute
     */
    public Instances[] splitData(Instances insts, double defAcRt, 
				 double cl){
	Instances data = insts;
      int total=data.numInstances();// Total number of instances without 
      // missing value for att
	    
      int split=1;                  // Current split position
      int prev=0;                   // Previous split position
      int finalSplit=split;         // Final split position
      maxInfoGain = 0;
      value = 0;	
	    
      double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0;
	    
      data.sort(att);
      // Find the last instance without missing value; initially the whole
      // (non-missing) data forms the second bag, so accumulate its
      // coverage/accuracy here and move instances to the first bag later.
      for(int x=0; x<data.numInstances(); x++){
	Instance inst = data.instance(x);
	if(inst.isMissing(att)){
	  total = x;
	  break;
	}
		
	sndCover += inst.weight();
	if(Utils.eq(inst.classValue(), cl))
	  sndAccu += inst.weight();		
      }	    

      if(total == 0) return null; // Data all missing for the attribute
      splitPoint = data.instance(total-1).value(att);	
	    
      // Slide the candidate split point over the sorted data; only split
      // between distinct attribute values.
      for(; split <= total; split++){
	if((split == total) ||
	   (data.instance(split).value(att) > // Can't split within
	    data.instance(prev).value(att))){ // same value	    
		    
	  // Move instances [prev, split) into the first bag's totals.
	  for(int y=prev; y<split; y++){
	    Instance inst = data.instance(y);
	    fstCover += inst.weight(); 
	    if(Utils.eq(data.instance(y).classValue(), cl)){
	      fstAccu += inst.weight();  // First bag positive# ++
	    }	     		   
	  }
		    
	  // Laplace-smoothed accuracy rates of the two bags.
	  double fstAccuRate = (fstAccu+1.0)/(fstCover+1.0),
	    sndAccuRate = (sndAccu+1.0)/(sndCover+1.0);
		    
	  /* Which bag has higher information gain? */
	  boolean isFirst; 
	  double fstInfoGain, sndInfoGain;
	  double accRate, infoGain, coverage, accurate;
		    
	  fstInfoGain = 
	    //Utils.eq(defAcRt, 1.0) ? 
	    //fstAccu/(double)numConds : 
	    fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt));
		    
	  sndInfoGain = 
	    //Utils.eq(defAcRt, 1.0) ? 
	    //sndAccu/(double)numConds : 
	    sndAccu*(Utils.log2(sndAccuRate)-Utils.log2(defAcRt));
		    
	  if(fstInfoGain > sndInfoGain){
	    isFirst = true;
	    infoGain = fstInfoGain;
	    accRate = fstAccuRate;
	    accurate = fstAccu;
	    coverage = fstCover;
	  }
	  else{
	    isFirst = false;
	    infoGain = sndInfoGain;
	    accRate = sndAccuRate;
	    accurate = sndAccu;
	    coverage = sndCover;
	  }
		    
	  /* Check whether so far the max infoGain */
	  if(infoGain > maxInfoGain){
	    splitPoint = data.instance(prev).value(att);
	    value = (isFirst) ? 0 : 1;
	    accuRate = accRate;
	    accu = accurate;
	    cover = coverage;
	    maxInfoGain = infoGain;
	    finalSplit = (isFirst) ? split : prev;
	  }
		    
	  // Remove [prev, split) from the second bag's running totals.
	  for(int y=prev; y<split; y++){
	    Instance inst = data.instance(y);
	    sndCover -= inst.weight(); 
	    if(Utils.eq(data.instance(y).classValue(), cl)){
	      sndAccu -= inst.weight();  // Second bag positive# --
	    }	     		   
	  }		    
	  prev=split;
	}
      }
	    
      /* Split the data at the best position found */
      Instances[] splitData = new Instances[2];
      splitData[0] = new Instances(data, 0, finalSplit);
      splitData[1] = new Instances(data, finalSplit, total-finalSplit);
	    
      return splitData;
    }
	
    /**
     * Whether the instance is covered by this antecedent.
     * Instances with a missing value for the attribute are never covered.
     * 
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is covered
     *         by this antecedent
     */
    public boolean covers(Instance inst){
      boolean isCover=true;
      if(!inst.isMissing(att)){
	if((int)value == 0){ // First bag
	  if(inst.value(att) > splitPoint)
	    isCover=false;
	}
	else if(inst.value(att) < splitPoint) // Second bag
	  isCover=false;
      }
      else
	isCover = false;
	    
      return isCover;
    }
	
    /**
     * Prints this antecedent, e.g. "length <= 3.5"
     *
     * @return a textual description of this antecedent
     */
    public String toString() {
      String symbol = ((int)value == 0) ? " <= " : " >= ";
      return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
    }   
  }
    
    
  /** 
   * The antecedent with nominal attribute
   */
  private class NominalAntd extends Antd{
	
    /** Accurate (positive) weight on the growing data, per attribute value */
    private double[] accurate;

    /** Covered weight on the growing data, per attribute value */
    private double[] coverage;
	
    /** Constructor */
    public NominalAntd(Attribute a){ 
      super(a);
      int numVals = att.numValues();
      accurate = new double[numVals];
      coverage = new double[numVals];
    }   

    /** Implements Copyable */
    public Object copy(){
      Antd replica = new NominalAntd(getAttr());
      replica.value = this.value;
      return replica;	    
    }
	
    /**
     * Implements the splitData function: partitions the data into one bag
     * per nominal attribute value (instances with missing values are
     * dropped), computes the infoGain of each bag, and records the best
     * value in this antecedent's stats.
     * 
     * @param data the data to be split
     * @param defAcRt the default accuracy rate for data
     * @param cl the class label to be predicted
     * @return the array of data after split
     */
    public Instances[] splitData(Instances data, double defAcRt, 
				 double cl){
      int numVals = att.numValues();
      Instances[] bags = new Instances[numVals];
	    
      // Start with empty bags and zeroed statistics.
      for(int v = 0; v < numVals; v++){
	bags[v] = new Instances(data, data.numInstances());
	accurate[v] = 0;
	coverage[v] = 0;
      }
	    
      // Distribute each (non-missing) instance into its value's bag.
      for(int i = 0; i < data.numInstances(); i++){
	Instance inst = data.instance(i);
	if(inst.isMissing(att))
	  continue;
	int v = (int)inst.value(att);
	bags[v].add(inst);
	coverage[v] += inst.weight();
	if((int)inst.classValue() == (int)cl)
	  accurate[v] += inst.weight();
      }
	    
      // Pick the attribute value with the highest information gain.
      for(int v = 0; v < numVals; v++){
	double p = accurate[v] + 1.0;  // Laplace-smoothed positives
	double t = coverage[v] + 1.0;  // Laplace-smoothed coverage
	double infoGain = 
	  //Utils.eq(defAcRt, 1.0) ? 
	  //accurate[v]/(double)numConds : 
	  accurate[v]*(Utils.log2(p/t)-Utils.log2(defAcRt));
		
	if(infoGain > maxInfoGain){
	  maxInfoGain = infoGain;
	  cover = coverage[v];
	  accu = accurate[v];
	  accuRate = p/t;
	  value = (double)v;
	}
      }
	    
      return bags;
    }
	
    /**
     * Whether the instance is covered by this antecedent: the instance
     * must have a non-missing value equal to this antecedent's value.
     * 
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is
     *         covered by this antecedent
     */
    public boolean covers(Instance inst){
      if(inst.isMissing(att))
	return false;
      return (int)inst.value(att) == (int)value;
    }
	
    /**
     * Prints this antecedent, e.g. "color = red"
     *
     * @return a textual description of this antecedent
     */
    public String toString() {
      return (att.name() + " = " +att.value((int)value));
    } 
  }

    
  /**
   * This class implements a single rule that predicts specified class.  
   *
   * A rule consists of antecedents "AND"ed together and the consequent 
   * (class value) for the classification.  
   * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to
   * select an antecedent and Reduced Error Pruning (REP) with the metric
   * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule. 
   */
    
  protected class RipperRule extends Rule{
	
    /** The internal representation of the class label to be predicted;
        -1 means "not set yet" (grow() refuses to run in that state) */
    private double m_Consequent = -1;	
		
    /** The vector of antecedents of this rule */
    protected FastVector m_Antds = null;	
	
    /** Sets the class label (internal index) that this rule predicts */
    public void setConsequent(double cl){  m_Consequent = cl; }
    /** Gets the class label (internal index) that this rule predicts */
    public double getConsequent(){ return m_Consequent; }
	
    /** Constructor: starts with an empty antecedent list */
    public RipperRule(){    
      m_Antds = new FastVector();	
    }
	
    /**
     * Get a shallow copy of this rule: the consequent is copied and the
     * antecedent vector is duplicated element-wise.
     *
     * @return the copy
     */
    public Object copy(){
      RipperRule replica = new RipperRule();
      replica.setConsequent(getConsequent());
      replica.m_Antds = (FastVector)this.m_Antds.copyElements();
      return replica;
    }
	
    /**
     * Whether the instance is covered by this rule, i.e. whether it
     * satisfies every antecedent of the rule.
     * 
     * @param datum the instance in question
     * @return the boolean value indicating whether the instance 
     *         is covered by this rule
     */
    public boolean covers(Instance datum){
      for(int x = 0; x < m_Antds.size(); x++){
	Antd test = (Antd)m_Antds.elementAt(x);
	if(!test.covers(datum))
	  return false;
      }
      return true;
    }        
	
    /**
     * Whether this rule has antecedents, i.e. whether it is a default rule
     * 
     * @return the boolean value indicating whether the rule has antecedents
     */
    public boolean hasAntds(){
      return (m_Antds != null) && (m_Antds.size() > 0);
    }      
	
    /** 
     * The number of antecedents of the rule
     *
     * @return the size of this rule
     */
    public double size(){
      return (double)m_Antds.size();
    }		


    /**
     * Private function to compute the default number of accurate instances
     * in the specified data for the consequent of the rule: the sum of the
     * weights of all instances whose class equals the consequent.
     * 
     * @param data the data in question
     * @return the default accuracy number
     */
    private double computeDefAccu(Instances data){ 
      double accuWeight = 0;
      int target = (int)m_Consequent;
      for(int i = 0; i < data.numInstances(); i++){
	Instance inst = data.instance(i);
	if((int)inst.classValue() == target)
	  accuWeight += inst.weight();
      }
      return accuWeight;
    }
	
	
    /**
     * Build one rule using the growing data
     *
     * @param data the growing data used to build the rule
     * @exception Exception if the consequent is not set yet
     */    
      public void grow(Instances data) throws Exception {
      if(m_Consequent == -1)
	throw new Exception(" Consequent not set yet.");
	    
      Instances growData = data;	         
      double sumOfWeights = growData.sumOfWeights();
      if(!Utils.gr(sumOfWeights, 0.0))
	return;
	    
      /* Compute the default accurate rate of the growing data */
      // Smoothed with +1 in numerator and denominator to avoid 0/0.
      double defAccu = computeDefAccu(growData);
      double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0); 
	    
      /* Keep the record of which attributes have already been used*/    
      boolean[] used=new boolean [growData.numAttributes()];
      for (int k=0; k<used.length; k++)
	used[k]=false;
      int numUnused=used.length;
	    
      // If there are already antecedents existing
      // (only nominal attributes are marked as used here; numeric
      // attributes may appear in more than one antecedent)
      for(int j=0; j < m_Antds.size(); j++){
	Antd antdj = (Antd)m_Antds.elementAt(j);
	if(!antdj.getAttr().isNumeric()){ 
	  used[antdj.getAttr().index()]=true;
	  numUnused--;
	} 
      }	    
	    
      double maxInfoGain;	    
      // Greedily add antecedents while covered data remains, unused
      // attributes remain, and accuracy on the grow data is below 1.0.
      while (Utils.gr(growData.numInstances(), 0.0) && 
	     (numUnused > 0) 
	     && Utils.sm(defAcRt, 1.0)
	     ){   
		
	// We require that infoGain be positive
	/*if(numAntds == originalSize)
	  maxInfoGain = 0.0; // At least one condition allowed
	  else
	  maxInfoGain = Utils.eq(defAcRt, 1.0) ? 
	  defAccu/(double)numAntds : 0.0; */
	maxInfoGain = 0.0; 
		
	/* Build a list of antecedents */
	Antd oneAntd=null;
	Instances coverData = null;
	Enumeration enumAttr=growData.enumerateAttributes();	      
		
	/* Build one condition based on all attributes not used yet*/
	while (enumAttr.hasMoreElements()){
	  Attribute att= (Attribute)(enumAttr.nextElement());
	  
	  if(m_Debug)
	    System.err.println("\nOne condition: size = " 
			       + growData.sumOfWeights());
		   
	  Antd antd =null;	
	  if(att.isNumeric())
	    antd = new NumericAntd(att);
	  else
	    antd = new NominalAntd(att);
		    
	  if(!used[att.index()]){
	    /* Compute the best information gain for each attribute,
	       it's stored in the antecedent formed by this attribute.
	       This procedure returns the data covered by the antecedent*/
	    Instances coveredData = computeInfoGain(growData, defAcRt,
						    antd);
	    if(coveredData != null){
	      double infoGain = antd.getMaxInfoGain();      
	      if(m_Debug)
		System.err.println("Test of \'"+antd.toString()+
				   "\': infoGain = "+
				   infoGain + " | Accuracy = " +
				   antd.getAccuRate()+
				   "="+antd.getAccu()
				   +"/"+antd.getCover()+
				   " def. accuracy: "+defAcRt);
			    
	      // Strict '>' so ties keep the earlier attribute's antecedent.
	      if(infoGain > maxInfoGain){         
		oneAntd=antd;
		coverData = coveredData;  
		maxInfoGain = infoGain;
	      }		    
	    }
	  }
	}
		
	if(oneAntd == null) break; // Cannot find antds		
	if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage
		
	//Numeric attributes can be used more than once
	if(!oneAntd.getAttr().isNumeric()){ 
	  used[oneAntd.getAttr().index()]=true;
	  numUnused--;
	}
		
	m_Antds.addElement(oneAntd);
	growData = coverData;// Grow data size is shrinking 	
	defAcRt = oneAntd.getAccuRate();
      }
    }
	
	
    /** 
     * Compute the best information gain for the specified antecedent
     *  
     * @param instances the data based on which the infoGain is computed
     * @param defAcRt the default accuracy rate of data
     * @param antd the specific antecedent
     * @return the data covered by the antecedent
     */
    private Instances computeInfoGain(Instances instances, double defAcRt, 
				      Antd antd){
      /* Let the antecedent split the data into bags; the antecedent
	 records its own best information gain during the split. */
      Instances[] splitData = antd.splitData(instances, defAcRt, 
					     m_Consequent); 

      // No split possible -> nothing covered.
      if(splitData == null)
	return null;

      /* Hand back the bag selected by the antecedent's value, which
	 is the data the next antecedent will be grown on. */
      return splitData[(int)antd.getAttrValue()];
    }
	
    /**
     * Prune all the possible final sequences of the rule using the 
     * pruning data.  The measure used to prune the rule is based on
     * flag given.
     *
     * @param pruneData the pruning data used to prune the rule
     * @param useWhole flag to indicate whether use the error rate of
     *                 the whole pruning data instead of the data covered
     */    
    public void prune(Instances pruneData, boolean useWhole){
	Instances data = pruneData;
	
      double total = data.sumOfWeights();
      if(!Utils.gr(total, 0.0))
	return;
	
      /* The default accurate # and rate on pruning data */
      double defAccu=computeDefAccu(data);
	    
      if(m_Debug)	
	System.err.println("Pruning with " + defAccu + 
			   " positive data out of " + total +
			   " instances");	
	    
      int size=m_Antds.size();
      if(size == 0) return; // Default rule before pruning
	    
      // Per-antecedent accumulators, indexed by antecedent position:
      // worthRt  = worth rate used for comparison
      // coverage = weight of pruning data covered up to that antecedent
      // worthValue = weight counted as "worth" (accurate, plus tn if useWhole)
      double[] worthRt = new double[size];
      double[] coverage = new double[size];
      double[] worthValue = new double[size];
      for(int w=0; w<size; w++){
	worthRt[w]=coverage[w]=worthValue[w]=0.0;
      }
	    
      /* Calculate accuracy parameters for all the antecedents in this rule */
      // tn accumulates across antecedents: each round shrinks 'data' to
      // the covered instances, so the newly uncovered true negatives are
      // added to the running total rather than recomputed from scratch.
      double tn = 0.0; // True negative if useWhole
      for(int x=0; x<size; x++){
	Antd antd=(Antd)m_Antds.elementAt(x);
	Attribute attr= antd.getAttr();
	Instances newData = data;
	data = new Instances(newData, 0); // Make data empty
		
	for(int y=0; y<newData.numInstances(); y++){
	  Instance ins=newData.instance(y);
		    
	  if(antd.covers(ins)){   // Covered by this antecedent
	    coverage[x] += ins.weight();
	    data.add(ins);                 // Add to data for further pruning
	    if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
	      worthValue[x] += ins.weight();
	  }
	  else if(useWhole){ // Not covered
	    if((int)ins.classValue() != (int)m_Consequent)
	      tn += ins.weight();
	  }			
	}
		
	if(useWhole){
	  worthValue[x] += tn;
	  worthRt[x] = worthValue[x] / total;
	}
	else // Note if coverage is 0, accuracy is 0.5
	  worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
      }
	    
      // Baseline: smoothed default accuracy of the whole pruning data.
      // An antecedent prefix is kept only if it strictly beats this.
      double maxValue = (defAccu+1.0)/(total+2.0);
      int maxIndex = -1;
      for(int i=0; i<worthValue.length; i++){
	if(m_Debug){
	  double denom = useWhole ? total : coverage[i];
	  System.err.println(i+"(useAccuray? "+!useWhole+"): "
			     + worthRt[i] + 
			     "="+worthValue[i]+
			     "/"+denom);
	}
	if(worthRt[i] > maxValue){ // Prefer to the 
	  maxValue = worthRt[i]; // shorter rule
	  maxIndex = i;
	}
      }
	    
      /* Prune the antecedents according to the accuracy parameters */
      // Drop every antecedent after the best-scoring prefix (all of
      // them if maxIndex stayed -1, leaving a default rule).
      for(int z=size-1;z>maxIndex;z--)
	m_Antds.removeElementAt(z);       
    }
	
    /**
     * Prints this rule
     *
     * @param classAttr the class attribute in the data
     * @return a textual description of this rule
     */
    public String toString(Attribute classAttr) {
      StringBuffer text = new StringBuffer();
      // Antecedents joined by " and "; a default rule has none.
      for(int j = 0; j < m_Antds.size(); j++){
	if(j > 0)
	  text.append(" and ");
	text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString() + ")");
      }
      // Consequent: "class = value".
      text.append(" => " + classAttr.name() +
		  "=" + classAttr.value((int)m_Consequent));
      return text.toString();
    }
  }
    
  /**
   * Builds Ripper in the order of class frequencies.  For each class
   * it's built in two stages: building and optimization 
   *
   * @param instances the training data
   * @exception Exception if classifier can't be built successfully
   */
  public void buildClassifier(Instances instances) throws Exception {
     
    if(instances.numInstances() == 0)
      throw new Exception(" No instances with a class value!");
    
    if (instances.checkForStringAttributes()) 
      throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!");
    
    if (!instances.classAttribute().isNominal()) 
      throw new UnsupportedClassTypeException(" Only nominal class, please.");
    
    m_Random = instances.getRandomNumberGenerator(m_Seed); 
    m_Total = RuleStats.numAllConditions(instances);
    if(m_Debug)
      System.err.println("Number of all possible conditions = "+m_Total);
    
    // Reorder the classes so rules are learned for the least frequent
    // class first, as RIPPER prescribes.
    Instances data = null;
    m_Filter = new ClassOrder();
    ((ClassOrder)m_Filter).setSeed(m_Random.nextInt());	
    ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
    m_Filter.setInputFormat(instances);
    data = Filter.useFilter(instances, m_Filter);
	
    if(data == null)
      throw new Exception(" Unable to randomize the class orders.");
	
    data.deleteWithMissingClass();
    if(data.numInstances() == 0)
      throw new Exception(" No instances with a class value!");
    
    if(data.numInstances() < m_Folds)
	throw new Exception(" Not enough data for REP.");
    
    m_Class = data.classAttribute();	
    m_Ruleset = new FastVector();
    m_RulesetStats = new FastVector();
    m_Distributions = new FastVector();

    // Sort by classes frequency
    double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
    if(m_Debug){
      System.err.println("Sorted classes:");
      for(int x=0; x < m_Class.numValues(); x++)
	System.err.println(x+": "+m_Class.value(x) + " has " +
			   orderedClasses[x] + " instances.");
    }
    // Iterate from less prevalent class to more frequent one
  oneClass:	
    for(int y=0; y < data.numClasses()-1; y++){ // For each class	      
	    
      double classIndex = (double)y;
      if(m_Debug){
	int ci = (int)classIndex;
	System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
			   + orderedClasses[y] + "instances\n"+
			   "=====================================\n");
      }
		
      if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
	continue oneClass;		
	    
      // The expected FP/err is the proportion of the class
      double all = 0;
      for(int i=y; i<orderedClasses.length; i++)
	all += orderedClasses[i];
      double expFPRate = orderedClasses[y] / all;	    
		
      double classYWeights = 0, totalWeights = 0;
      for(int j=0; j < data.numInstances(); j++){
	  Instance datum = data.instance(j);
	  totalWeights += datum.weight();
	  if((int)datum.classValue() == y){
	      classYWeights += datum.weight();
	  }	          
      }	
          
      // DL of default rule, no theory DL, only data DL
      double defDL;
      if(classYWeights > 0)
	  defDL = RuleStats.dataDL(expFPRate, 
				   0.0,
				   totalWeights,
				   0.0,
				   classYWeights);	    
      else
	  continue oneClass; // Subsumed by previous rules

      if(Double.isNaN(defDL) || Double.isInfinite(defDL))
	throw new Exception("Should never happen: "+
			    "defDL NaN or infinite!");
      if(m_Debug)
	System.err.println("The default DL = "+defDL);
	    
      data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
    }

    // Set the default rule
    RipperRule defRule = new RipperRule();
    defRule.setConsequent((double)(data.numClasses()-1));
    m_Ruleset.addElement(defRule);
	
    RuleStats defRuleStat = new RuleStats();
    defRuleStat.setData(data);
    defRuleStat.setNumAllConds(m_Total);
    defRuleStat.addAndUpdate(defRule);
    m_RulesetStats.addElement(defRuleStat);

    // Collect the per-rule class distributions, mapped back to the
    // original class order.
    for(int z=0; z < m_RulesetStats.size(); z++){
	RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
	for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
	    double[] classDist = oneClass.getDistributions(xyz);
	    // Bug fix: test for null BEFORE normalizing.  The original
	    // code called Utils.normalize(classDist) first, which would
	    // throw a NullPointerException whenever the distribution was
	    // null, making the subsequent null check unreachable.
	    if(classDist != null){
		Utils.normalize(classDist);
		m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
	    }
	}	
    }    
  }
    
  /**
   * Classify the test instance with the rule learner and provide
   * the class distributions 
   *
   * @param datum the instance to be classified
   * @return the distribution
   */
    public double[] distributionForInstance(Instance datum){
	try{
	    // Rules are ordered: the first rule that covers the
	    // instance supplies the class distribution.
	    int numRules = m_Ruleset.size();
	    for(int i = 0; i < numRules; i++){
		RipperRule rule = (RipperRule)m_Ruleset.elementAt(i);
		if(rule.covers(datum))
		    return (double[])m_Distributions.elementAt(i); 
	    }
	}catch(Exception e){
	    System.err.println(e.getMessage());
	    e.printStackTrace();
	}
	
	// The trailing default rule covers everything, so falling
	// through here indicates an internal inconsistency.
	System.err.println("Should never happen!");
	return new double[datum.classAttribute().numValues()];
    }

  /** Build a ruleset for the given class according to the given data
   *
   * @param expFPRate the expected FP/(FP+FN) used in DL calculation
   * @param data the given data
   * @param classIndex the given class index
   * @param defDL the default DL in the data
   * @exception Exception if the ruleset cannot be built properly
   */
  protected Instances rulesetForOneClass(double expFPRate, 
					 Instances data, 
					 double classIndex,
					 double defDL)
    throws Exception {
	
    Instances newData = data, growData, pruneData;  	
    boolean stop = false;
    FastVector ruleset = new FastVector();		
	
    // dl tracks the running description length; minDL the best seen.
    double dl = defDL, minDL = defDL;
    RuleStats rstats = null;
    double[] rst;
	
    // Check whether data have positive examples
    boolean defHasPositive = true; // No longer used
    boolean hasPositive = defHasPositive;
	
    /********************** Building stage ***********************/	
    if(m_Debug)
      System.err.println("\n*** Building stage ***");
    
    while((!stop) && hasPositive){ // Generate new rules until
      // stopping criteria met
      RipperRule oneRule;
      if(m_UsePruning){		
	/* Split data into Grow and Prune*/

	// We should have stratified the data, but ripper seems
	// to have a bug that makes it not to do so.  In order
	// to simulate it more precisely, we do the same thing.
	//newData.randomize(m_Random);	
	newData = RuleStats.stratify(newData, m_Folds, m_Random);		
	Instances[] part = RuleStats.partition(newData, m_Folds);
	growData=part[0];
	pruneData=part[1];
	//growData=newData.trainCV(m_Folds, m_Folds-1);
	//pruneData=newData.testCV(m_Folds, m_Folds-1);	
		
	oneRule = new RipperRule();
	oneRule.setConsequent(classIndex);  // Must set first
		
	if(m_Debug)
	  System.err.println("\nGrowing a rule ...");  
	oneRule.grow(growData);             // Build the rule
	if(m_Debug)
	  System.err.println("One rule found before pruning:"+
			     oneRule.toString(m_Class));
		
	if(m_Debug)
	  System.err.println("\nPruning the rule ...");  
	oneRule.prune(pruneData, false);    // Prune the rule
	if(m_Debug)
	  System.err.println("One rule found after pruning:"+
			     oneRule.toString(m_Class));
      }
      else{
	oneRule = new RipperRule();
	oneRule.setConsequent(classIndex);  // Must set first
	if(m_Debug)
	  System.err.println("\nNo pruning: growing a rule ...");
	oneRule.grow(newData);             // Build the rule
	if(m_Debug)
	  System.err.println("No pruning: one rule found:\n"+
			     oneRule.toString(m_Class));
      }
	    
      // Compute the DL of this ruleset
      if(rstats == null){ // First rule
	rstats = new RuleStats();
	rstats.setNumAllConds(m_Total);
	rstats.setData(newData);
      }
	    
      rstats.addAndUpdate(oneRule);		    
      int last = rstats.getRuleset().size()-1; // Index of last rule
      dl += rstats.relativeDL(last, expFPRate, m_CheckErr);
	    
      if(Double.isNaN(dl) || Double.isInfinite(dl))
	throw new Exception("Should never happen: dl in "+
			    "building stage NaN or infinite!");
      if(m_Debug)
	System.err.println("Before optimization("+last+
			   "): the dl = "+dl+" | best: "+minDL);
	    
      if(dl < minDL)
	minDL = dl;  // The best dl so far	
	    
      // rst layout (per RuleStats.getSimpleStats): [0]=covered weight,
      // [1]=uncovered weight, [2]=true positives, [4]=false positives,
      // [5]=uncovered positives -- TODO confirm against RuleStats.
      rst = rstats.getSimpleStats(last);	    
      if(m_Debug)
	System.err.println("The rule covers: "+rst[0]+
			   " | pos = " + rst[2] + 
			   " | neg = " + rst[4]+
			   "\nThe rule doesn't cover: "+rst[1]+
			   " | pos = " + rst[5]);
	    
      stop = checkStop(rst, minDL, dl);
	    
      if(!stop){	  		
	ruleset.addElement(oneRule);          // Accepted 
	newData = rstats.getFiltered(last)[1];// Data not covered
	hasPositive = Utils.gr(rst[5], 0.0);  // Positives remaining?
	if(m_Debug)
	  System.err.println("One rule added: has positive? "
			     +hasPositive);
      }
      else{
	if(m_Debug)
	  System.err.println("Quit rule");
	rstats.removeLast(); // Remove last to be re-used
      }
    }// while !stop	
	
    /******************** Optimization stage *******************/
    // Each optimization run revisits every rule, grows a fresh
    // "Replace" variant and a regrown "Revision" variant, and keeps
    // whichever of {old, revision, replace} minimizes the relative DL.
    RuleStats finalRulesetStat = null;
    if(m_UsePruning){	 
      for(int z=0; z < m_Optimizations; z++){
	if(m_Debug)
	  System.err.println("\n*** Optimization: run #"
			     +z+" ***");
		
	newData = data;		    
	finalRulesetStat = new RuleStats();
	finalRulesetStat.setData(newData);
	finalRulesetStat.setNumAllConds(m_Total);
	int position=0;
	stop = false;
	boolean isResidual = false;	    
	hasPositive = defHasPositive;		    
	dl = minDL = defDL;
		
      oneRule:    
	while(!stop && hasPositive){			
		    
	  isResidual = (position>=ruleset.size()); // Cover residual positive examples  
	  // Re-do shuffling and stratification    
	  //newData.randomize(m_Random);	
	  newData = RuleStats.stratify(newData, m_Folds, m_Random);
	  Instances[] part = RuleStats.partition(newData, m_Folds);
	  growData=part[0];
	  pruneData=part[1];
	  //growData=newData.trainCV(m_Folds, m_Folds-1);
	  //pruneData=newData.testCV(m_Folds, m_Folds-1);	   
	  RipperRule finalRule;
		    
	  if(m_Debug)
	    System.err.println("\nRule #"+position +
			       "| isResidual?" + isResidual+
			       "| data size: "+newData.sumOfWeights());
		    
	  if(isResidual){
	    // Past the end of the current ruleset: grow a brand-new
	    // rule for the still-uncovered positives.
	    RipperRule newRule = new RipperRule();   
	    newRule.setConsequent(classIndex);
	    if(m_Debug)
	      System.err.println("\nGrowing and pruning"+
				 " a new rule ..."); 
	    newRule.grow(growData);
	    newRule.prune(pruneData, false);
	    finalRule = newRule;
	    if(m_Debug)
	      System.err.println("\nNew rule found: "+
				 newRule.toString(m_Class));
	  }
	  else{
	    RipperRule oldRule = (RipperRule)ruleset.elementAt(position);
	    boolean covers = false;
	    // Test coverage of the next old rule
	    for(int i=0; i<newData.numInstances(); i++)
	      if(oldRule.covers(newData.instance(i))){
		covers = true;
		break;
	      }
			
	    if(!covers){// Null coverage, no variants can be generated
	      finalRulesetStat.addAndUpdate(oldRule);
	      position++;
	      continue oneRule;
	    }  
			
	    // 2 variants 
	    // Variant 1 ("Replace"): grow a new rule from scratch.
	    if(m_Debug)
	      System.err.println("\nGrowing and pruning"+
				 " Replace ..."); 
	    RipperRule replace = new RipperRule();   
	    replace.setConsequent(classIndex);
	    replace.grow(growData);
			
	    // Remove the pruning data covered by the following
	    // rules, then simply compute the error rate of the
	    // current rule to prune it.  According to Ripper,
	    // it's equivalent to computing the error of the 
	    // whole ruleset -- is it true?
	    pruneData = RuleStats.rmCoveredBySuccessives(pruneData,ruleset, position);      	
	    replace.prune(pruneData, true);
			
	    // Variant 2 ("Revision"): extend a copy of the old rule.
	    if(m_Debug)
	      System.err.println("\nGrowing and pruning"+
				 " Revision ..."); 
	    RipperRule revision = (RipperRule)oldRule.copy(); 
			
	    // For revision, first rm the data covered by the old rule
	    // (note: newGrowData keeps only instances the old rule
	    // covers, so growth refines within its coverage)
	    Instances newGrowData = new Instances(growData, 0);
	    for(int b=0; b<growData.numInstances(); b++){
	      Instance inst = growData.instance(b);
	      if(revision.covers(inst))
		newGrowData.add(inst);
	    }
	    revision.grow(newGrowData);	      
	    revision.prune(pruneData, true);
			
	    // Cache stats of the rules already finalized this run so
	    // countData need not recompute them.
	    double[][] prevRuleStats = new double[position][6];
	    for(int c=0; c < position; c++)
		prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);

	    // Now compare the relative DL of variants
	    FastVector tempRules = (FastVector)ruleset.copyElements();
	    tempRules.setElementAt(replace, position);
			
	    RuleStats repStat = new RuleStats(data, tempRules);
	    repStat.setNumAllConds(m_Total);
	    repStat.countData(position, newData, prevRuleStats);
	    //repStat.countData();
	    rst = repStat.getSimpleStats(position);	    
	    if(m_Debug)
	      System.err.println("Replace rule covers: "+rst[0]+
				 " | pos = " + rst[2] + 
				 " | neg = " + rst[4]+
				 "\nThe rule doesn't cover: "+rst[1]+
				 " | pos = " + rst[5]);
			
	    double repDL = repStat.relativeDL(position, expFPRate,
					      m_CheckErr);
	    if(m_Debug)
	      System.err.println("\nReplace: "+
				 replace.toString(m_Class)
				 +" |dl = "+repDL); 
			
	    if(Double.isNaN(repDL) || Double.isInfinite(repDL))
	      throw new Exception("Should never happen: repDL"+
				  "in optmz. stage NaN or "+
				  "infinite!");
			
	    tempRules.setElementAt(revision, position);
	    RuleStats revStat = new RuleStats(data, tempRules);
	    revStat.setNumAllConds(m_Total);
	    revStat.countData(position, newData, prevRuleStats);
	    //revStat.countData();
	    double revDL = revStat.relativeDL(position, expFPRate,
					      m_CheckErr);
			
	    if(m_Debug)
	      System.err.println("Revision: "
				 + revision.toString(m_Class)
				 +" |dl = "+revDL);
			
	    if(Double.isNaN(revDL) || Double.isInfinite(revDL))
	      throw new Exception("Should never happen: revDL"+
				  "in optmz. stage NaN or "+
				  "infinite!");
			
	    rstats = new RuleStats(data, ruleset);
	    rstats.setNumAllConds(m_Total);
	    rstats.countData(position, newData, prevRuleStats);
	    //rstats.countData();
	    double oldDL = rstats.relativeDL(position, expFPRate,
					     m_CheckErr);
			
	    if(Double.isNaN(oldDL) || Double.isInfinite(oldDL))
	      throw new Exception("Should never happen: oldDL"+
				  "in optmz. stage NaN or "+
				  "infinite!");
	    if(m_Debug)
	      System.err.println("Old rule: "+
				 oldRule.toString(m_Class)
				 +" |dl = "+oldDL); 
			
	    if(m_Debug)
	      System.err.println("\nrepDL: "+repDL+ 
				 "\nrevDL: "+revDL+
				 "\noldDL: "+oldDL);
			
	    // Ties favor old over revision over replace (<=).
	    if((oldDL <= revDL) && (oldDL <= repDL))
	      finalRule = oldRule; // Old the best
	    else if(revDL <= repDL)
	      finalRule = revision; // Revision the best
	    else
	      finalRule = replace; // Replace the best  
	  }		
		    
	  finalRulesetStat.addAndUpdate(finalRule);  	 
	  rst = finalRulesetStat.getSimpleStats(position);
		    
	  if(isResidual){
			
	    dl += finalRulesetStat.relativeDL(position, 
					      expFPRate,
					      m_CheckErr);
	    if(m_Debug)
	      System.err.println("After optimization: the dl"
				 +"="+dl+" | best: "+minDL);
			
	    if(dl < minDL)
	      minDL = dl;  // The best dl so far
			
	    stop = checkStop(rst, minDL, dl);
	    if(!stop)
	      ruleset.addElement(finalRule); // Accepted 
	    else{
	      finalRulesetStat.removeLast(); // Remove last to be re-used
	      position--;
	    }
	  }
	  else
	    ruleset.setElementAt(finalRule, position); // Accepted 

	  if(m_Debug){
	    System.err.println("The rule covers: "+rst[0]+
			       " | pos = " + rst[2] + 
			       " | neg = " + rst[4]+
			       "\nThe rule doesn't cover: "+rst[1]+
			       " | pos = " + rst[5]);		
	    System.err.println("\nRuleset so far: ");
	    for(int x=0; x<ruleset.size(); x++)
	      System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
	    System.err.println();
	  }
		    
	  //Data not covered	
	  if(finalRulesetStat.getRulesetSize() > 0)// If any rules	
	    newData = finalRulesetStat.getFiltered(position)[1]; 
	  hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining? 
	  position++;
	} // while !stop && hasPositive
		
	if(ruleset.size() > (position+1)){ // Hasn't gone through yet
	  for(int k=position+1; k<ruleset.size(); k++)
	    finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k));
	}
	if(m_Debug)
	  System.err.println("\nDeleting rules to decrease"+
			     " DL of the whole ruleset ..."); 
	finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
	if(m_Debug){
	  int del = ruleset.size() -
	    finalRulesetStat.getRulesetSize(); 
	  System.err.println(del+" rules are deleted"+
			     " after DL reduction procedure");
	}
	ruleset = finalRulesetStat.getRuleset();
	rstats = finalRulesetStat;	      	    
		
      } // For each run of optimization
    } // if pruning is used
	
    // Concatenate the ruleset for this class to the whole ruleset
    if(m_Debug){
      System.err.println("\nFinal ruleset: ");
      for(int x=0; x<ruleset.size(); x++)
	System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
      System.err.println();
    }
	
    m_Ruleset.appendElements(ruleset);
    m_RulesetStats.addElement(rstats);
	
    if(ruleset.size() > 0)// If any rules for this class
      return rstats.getFiltered(ruleset.size()-1)[1]; // Data not 
    else                                                // covered
      return data; 
  }   
    
  /**
   * Check whether the stopping criterion meets
   *
   * @param rst the statistic of the ruleset
   * @param minDL the min description length so far
   * @param dl the current description length of the ruleset
   * @return true if stop criterion meets, false otherwise
   */
  private boolean checkStop(double[] rst, double minDL, double dl){
    // Criterion 1: DL exceeds the best DL so far by more than the
    // allowed surplus.
    if(dl > minDL+MAX_DL_SURPLUS){
      if(m_Debug)
	System.err.println("DL too large: "+dl+" | "+minDL);
      return true;
    }

    // Criterion 2: the last rule covers no positive weight (rst[2]).
    if(!Utils.gr(rst[2], 0.0)){
      if(m_Debug)
	System.err.println("Too few positives.");
      return true;
    }

    // Criterion 3: error rate of the last rule is at least 0.5, but
    // this only stops the search when error checking is enabled.
    if((rst[4]/rst[0]) >= 0.5){
      if(m_CheckErr){
	if(m_Debug)
	  System.err.println("Error too large: "+
			     rst[4] + "/" + rst[0]);
	return true;
      }
      return false;
    }

    // No criterion met: keep building rules.
    if(m_Debug)
      System.err.println("Continue.");
    return false;
  }
 
  /**
   * Prints the all the rules of the rule learner.
   *
   * @return a textual description of the classifier
   */
  public String toString() {
    if (m_Ruleset == null) 
      return "JRIP: No model built yet.";

    StringBuffer sb = new StringBuffer("JRIP rules:\n"+
				       "===========\n\n"); 
    // One stats object per learned class; print each rule with its
    // (covered / false-positive) weights.
    for(int statIdx = 0; statIdx < m_RulesetStats.size(); statIdx++){
      RuleStats stats = (RuleStats)m_RulesetStats.elementAt(statIdx);
      FastVector classRules = stats.getRuleset();
      for(int ruleIdx = 0; ruleIdx < classRules.size(); ruleIdx++){
	double[] simStats = stats.getSimpleStats(ruleIdx);
	RipperRule rule = (RipperRule)classRules.elementAt(ruleIdx);
	sb.append(rule.toString(m_Class)
		  + " ("+simStats[0]+"/"+simStats[4]+")\n");
      }
    }
    if(m_Debug){
      System.err.println("Inside m_Ruleset");
      for(int i = 0; i < m_Ruleset.size(); i++)
	System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class));
    }
    sb.append("\nNumber of Rules : " 
	      + m_Ruleset.size() + "\n");
    return sb.toString();
  }
    
  /**
   * Main method.
   *
   * @param args the options for the classifier
   */
  public static void main(String[] args) {
    try {
      // Delegate option parsing, training and evaluation to the
      // standard WEKA evaluation harness.
      String evaluation = Evaluation.evaluateModel(new JRip(), args);
      System.out.println(evaluation);
    } catch (Exception e) {
      e.printStackTrace();
      System.err.println(e.getMessage());
    }
  } 
}
