/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * Decorate.java * Copyright (C) 2002 Prem Melville * */ package weka.classifiers.meta; import weka.classifiers.*; import java.util.*; import weka.core.*; import weka.experiment.*; /** * DECORATE is a meta-learner for building diverse ensembles of * classifiers by using specially constructed artificial training * examples. Comprehensive experiments have demonstrated that this * technique is consistently more accurate than the base classifier, * Bagging and Random Forests. Decorate also obtains higher accuracy than * Boosting on small training sets, and achieves comparable performance * on larger training sets. For more * details see:

* * Prem Melville and Raymond J. Mooney. Constructing diverse * classifier ensembles using artificial training examples. * Proceedings of the Eighteenth International Joint Conference on * Artificial Intelligence, 2003.

* * Prem Melville and Raymond J. Mooney. Creating diversity in ensembles using artificial data. * Submitted.

* * Valid options are:

* * -D
* Turn on debugging output.

* * -W classname
* Specify the full class name of a weak classifier as the basis for * Decorate (default weka.classifiers.trees.J48()).

* * -E num
* Specify the desired size of the committee (default 10).

* * -I iterations
* Set the maximum number of Decorate iterations (default 10).

* * -S seed
* Seed for random number generator. (default 0).

* * -R factor
* Factor that determines the number of artificial examples to generate,
* specified as a proportion of the training set size (default 1.0).

* * Options after -- are passed to the designated classifier.

* * @author Prem Melville (melville@cs.utexas.edu) * @version $Revision: 1.3.2.1 $ */ public class Decorate extends RandomizableIteratedSingleClassifierEnhancer { /** Vector of classifiers that make up the committee/ensemble. */ protected Vector m_Committee = null; /** The desired ensemble size. */ protected int m_DesiredSize = 10; /** Amount of artificial/random instances to use - specified as a fraction of the training data size. */ protected double m_ArtSize = 1.0 ; /** The random number generator. */ protected Random m_Random = new Random(0); /** Attribute statistics - used for generating artificial examples. */ protected Vector m_AttributeStats = null; /** * Constructor. */ public Decorate() { m_Classifier = new weka.classifiers.trees.J48(); } /** * String describing default classifier. */ protected String defaultClassifierString() { return "weka.classifiers.trees.J48"; } /** * Returns an enumeration describing the available options * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(8); newVector.addElement(new Option( "\tDesired size of ensemble.\n" + "\t(default 10)", "E", 1, "-E")); newVector.addElement(new Option( "\tFactor that determines number of artificial examples to generate.\n" +"\tSpecified proportional to training set size.\n" + "\t(default 1.0)", "R", 1, "-R")); Enumeration enu = super.listOptions(); while (enu.hasMoreElements()) { newVector.addElement(enu.nextElement()); } return newVector.elements(); } /** * Parses a given list of options. Valid options are:

* * -D
* Turn on debugging output.

* * -W classname
* Specify the full class name of a weak classifier as the basis for * Decorate (default weka.classifiers.trees.J48).

* * -E num
* Specify the desired size of the committee (default 10).

* * -I iterations
* Set the maximum number of Decorate iterations (default 10).

* * -S seed
* Seed for random number generator. (default 0).

* * -R factor
* Factor that determines the number of artificial examples to generate,
* specified as a proportion of the training set size (default 1.0).

* * Options after -- are passed to the designated classifier.

* * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String desiredSize = Utils.getOption('E', options); if (desiredSize.length() != 0) { setDesiredSize(Integer.parseInt(desiredSize)); } else { setDesiredSize(10); } String artSize = Utils.getOption('R', options); if (artSize.length() != 0) { setArtificialSize(Double.parseDouble(artSize)); } else { setArtificialSize(1.0); } super.setOptions(options); } /** * Gets the current settings of the Classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] superOptions = super.getOptions(); String [] options = new String [superOptions.length + 4]; int current = 0; options[current++] = "-E"; options[current++] = "" + getDesiredSize(); options[current++] = "-R"; options[current++] = "" + getArtificialSize(); System.arraycopy(superOptions, 0, options, current, superOptions.length); current += superOptions.length; while (current < options.length) { options[current++] = ""; } return options; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String desiredSizeTipText() { return "the desired number of member classifiers in the Decorate ensemble. Decorate may terminate " +"before this size is reached (depending on the value of numIterations). " +"Larger ensemble sizes usually lead to more accurate models, but increases " +"training time and model complexity."; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numIterationsTipText() { return "the maximum number of Decorate iterations to run. Each iteration generates a classifier, " +"but does not necessarily add it to the ensemble. 
Decorate stops when the desired ensemble " +"size is reached. This parameter should be greater than " +"equal to the desiredSize. If the desiredSize is not being reached it may help to " +"increase this value."; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String artificialSizeTipText() { return "determines the number of artificial examples to use during training. Specified as " +"a proportion of the training data. Higher values can increase ensemble diversity."; } /** * Returns a string describing classifier * @return a description suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "DECORATE is a meta-learner for building diverse ensembles of " +"classifiers by using specially constructed artificial training " +"examples. Comprehensive experiments have demonstrated that this " +"technique is consistently more accurate than the base classifier, Bagging and Random Forests." +"Decorate also obtains higher accuracy than Boosting on small training sets, and achieves " +"comparable performance on larger training sets. " +"For more details see: P. Melville & R. J. Mooney. Constructing diverse classifier ensembles " +"using artificial training examples (IJCAI 2003).\n" +"P. Melville & R. J. Mooney. Creating diversity in ensembles using artificial data (submitted)."; } /** * Factor that determines number of artificial examples to generate. * * @return factor that determines number of artificial examples to generate */ public double getArtificialSize() { return m_ArtSize; } /** * Sets factor that determines number of artificial examples to generate. * * @param newwArtSize factor that determines number of artificial examples to generate */ public void setArtificialSize(double newArtSize) { m_ArtSize = newArtSize; } /** * Gets the desired size of the committee. 
* * @return the desired size of the committee */ public int getDesiredSize() { return m_DesiredSize; } /** * Sets the desired size of the committee. * * @param newDesiredSize the desired size of the committee */ public void setDesiredSize(int newDesiredSize) { m_DesiredSize = newDesiredSize; } /** * Build Decorate classifier * * @param data the training data to be used for generating the classifier * @exception Exception if the classifier could not be built successfully */ public void buildClassifier(Instances data) throws Exception { if(m_Classifier == null) { throw new Exception("A base classifier has not been specified!"); } if(data.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); } if(data.classAttribute().isNumeric()) { throw new UnsupportedClassTypeException("Decorate can't handle a numeric class!"); } if(m_NumIterations < m_DesiredSize) throw new Exception("Max number of iterations must be >= desired ensemble size!"); //initialize random number generator if(m_Seed==-1) m_Random = new Random(); else m_Random = new Random(m_Seed); int i = 1;//current committee size int numTrials = 1;//number of Decorate iterations Instances divData = new Instances(data);//local copy of data - diversity data divData.deleteWithMissingClass(); Instances artData = null;//artificial data //compute number of artficial instances to add at each iteration int artSize = (int) (Math.abs(m_ArtSize)*divData.numInstances()); if(artSize==0) artSize=1;//atleast add one random example computeStats(data);//Compute training data stats for creating artificial examples //initialize new committee m_Committee = new Vector(); Classifier newClassifier = m_Classifier; newClassifier.buildClassifier(divData); m_Committee.add(newClassifier); double eComm = computeError(divData);//compute ensemble error if(m_Debug) System.out.println("Initialize:\tClassifier "+i+" added to ensemble. 
Ensemble error = "+eComm); //repeat till desired committee size is reached OR the max number of iterations is exceeded while(i cdf[index]){ index++; } return index; } /** * Removes a specified number of instances from the given set of instances. * * @param data given instances * @param numRemove number of instances to delete from the given instances */ protected void removeInstances(Instances data, int numRemove){ int num = data.numInstances(); for(int i=num - 1; i>num - 1 - numRemove;i--){ data.delete(i); } } /** * Add new instances to the given set of instances. * * @param data given instances * @param newData set of instances to add to given instances */ protected void addInstances(Instances data, Instances newData){ for(int i=0; i