
%% bare_jrnl.tex
%% V1.1
%% 2002/08/13
%% by Michael Shell
%% mshell@ece.gatech.edu
%%
%% NOTE: This text file uses MS Windows line feed conventions. When (human)
%% reading this file on other platforms, you may have to use a text
%% editor that can handle lines terminated by the MS Windows line feed
%% characters (0x0D 0x0A).
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.6 or later) with an IEEE journal paper.
%%
%% Support sites:
%% http://www.ieee.org
%% and/or
%% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/
%%
%% This code is offered as-is - no warranty - user assumes all risk.
%% Free to use, distribute and modify.

% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do  ***
% *** not appear when using other class files.                            ***
% Testflow can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/testflow


% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print. Authors are encouraged to use U.S. letter paper when
% submitting to IEEE. Use the testflow package mentioned above to verify
% correct handling of both paper sizes by the author's LaTeX system.
%
% Also note that the "draftcls", not "draft", option should be used if
% it is desired that the figures are to be displayed in draft mode.

% *************************************************************************
% You can now use this template both for submitting to peer review and,
% if your paper is accepted, for sending in final publication materials.
% To change from peer review format (single column, double-spaced) to
% final publication format (double column, single-spaced), just move the
% comment-line sign (%) from one \documentclass line to the other.
% The first is for peer review format(single column), the second is for final publication(double column).

\documentclass[12pt,journal,draftcls,letterpaper,onecolumn]{IEEEtran}
%\documentclass[9.5pt,journal,final,finalsubmission,twocolumn]{IEEEtran}
% *************************************************************************


% If the IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it:
% \documentclass[journal,draftcls,onecolumn]{../sty/IEEEtran}


% some very useful LaTeX packages include:

 \usepackage{cite}      % Written by Donald Arseneau
                        % V1.6 and later of IEEEtran pre-defines the format
                        % of the cite.sty package \cite{} output to follow
                        % that of IEEE. Loading the cite package will
                        % result in citation numbers being automatically
                        % sorted and properly "ranged". i.e.,
                        % [1], [9], [2], [7], [5], [6]
                        % (without using cite.sty)
                        % will become:
                        % [1], [2], [5]--[7], [9] (using cite.sty)
                        % cite.sty's \cite will automatically add leading
                        % space, if needed. Use cite.sty's noadjust option
                        % (cite.sty V3.8 and later) if you want to turn this
                        % off. cite.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/cite/

 \usepackage{graphicx}  % Written by David Carlisle and Sebastian Rahtz
                        % Required if you want graphics, photos, etc.
                        % graphicx.sty is already installed on most LaTeX
                        % systems. The latest version and documentation can
                        % be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/graphics/
                        % Another good source of documentation is "Using
                        % Imported Graphics in LaTeX2e" by Keith Reckdahl
                        % which can be found as esplatex.ps and epslatex.pdf
                        % at: http://www.ctan.org/tex-archive/info/

%\usepackage{psfrag}    % Written by Craig Barratt, Michael C. Grant,
                        % and David Carlisle
                        % This package allows you to substitute LaTeX
                        % commands for text in imported EPS graphic files.
                        % In this way, LaTeX symbols can be placed into
                        % graphics that have been generated by other
                        % applications. You must use latex->dvips->ps2pdf
                        % workflow (not direct pdf output from pdflatex) if
                        % you wish to use this capability because it works
                        % via some PostScript tricks. Alternatively, the
                        % graphics could be processed as separate files via
                        % psfrag and dvips, then converted to PDF for
                        % inclusion in the main file which uses pdflatex.
                        % Docs are in "The PSfrag System" by Michael C. Grant
                        % and David Carlisle. There is also some information
                        % about using psfrag in "Using Imported Graphics in
                        % LaTeX2e" by Keith Reckdahl which documents the
                        % graphicx package (see above). The psfrag package
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/psfrag/

%\usepackage{subfigure} % Written by Steven Douglas Cochran
                        % This package makes it easy to put subfigures
                        % in your figures. i.e. "figure 1a and 1b"
                        % Docs are in "Using Imported Graphics in LaTeX2e"
                        % by Keith Reckdahl which also documents the graphicx
                        % package (see above). subfigure.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/subfigure/

%\usepackage{url}       % Written by Donald Arseneau
                        % Provides better support for handling and breaking
                        % URLs. url.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/other/misc/
                        % Read the url.sty source comments for usage information.

%\usepackage{stfloats}  % Written by Sigitas Tolusis
                        % Gives LaTeX2e the ability to do double column
                        % floats at the bottom of the page as well as the top.
                        % (e.g., "\begin{figure*}[!b]" is not normally
                        % possible in LaTeX2e). This is an invasive package
                        % which rewrites many portions of the LaTeX2e output
                        % routines. It may not work with other packages that
                        % modify the LaTeX2e output routine and/or with other
                        % versions of LaTeX. The latest version and
                        % documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/sttools/
                        % Documentation is contained in the stfloats.sty
                        % comments as well as in the presfull.pdf file.
                        % Do not use the stfloats baselinefloat ability as
                        % IEEE does not allow \baselineskip to stretch.
                        % Authors submitting work to the IEEE should note
                        % that IEEE rarely uses double column equations and
                        % that authors should try to avoid such use.
                        % Do not be tempted to use the cuted.sty or
                        % midfloat.sty package (by the same author) as IEEE
                        % does not format its papers in such ways.

 \usepackage{amsmath}   % From the American Mathematical Society
                        % A popular package that provides many helpful commands
                        % for dealing with mathematics. Note that the AMSmath
                        % package sets \interdisplaylinepenalty to 10000 thus
                        % preventing page breaks from occurring within multiline
                        % equations. Use:
 \interdisplaylinepenalty=2500
                        % after loading amsmath to restore such page breaks
                        % as IEEEtran.cls normally does. amsmath.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/



% Other popular packages for formatting tables and equations include:

% Frank Mittelbach's and David Carlisle's array.sty which improves the
% LaTeX2e array and tabular environments to provide better appearances and
% additional user controls. Array.sty is already installed on most systems.
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/

% Mark Wooding's extremely powerful MDW tools, especially mdwmath.sty and
% mdwtab.sty which are used to format equations and tables, respectively.
% The MDWtools set is already installed on most LaTeX systems. The lastest
% version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/mdwtools/

% V1.6 of IEEEtran contains the IEEEeqnarray family of commands that can
% be used to generate multiline equations as well as matrices, tables, etc.


% Also of notable interest:

% Scott Pakin's eqparbox package for creating (automatically sized) equal
% width boxes. Available:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/eqparbox/




% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex).         ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.


% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}


\begin{document}
%
% paper title
\title{Using a Genetic Algorithm to Control Randomized Unit Testing}
%
%
% author names and IEEE memberships
% note positions of commas and nonbreaking spaces ( ~ ) LaTeX will not break
% a structure at a ~ so this keeps an author's name from being broken across
% two lines.
% use \thanks{} to gain access to the first footnote area
% a separate \thanks must be used for each paragraph as LaTeX2e's \thanks
% was not built to handle multiple paragraphs
\author{James~H.~Andrews,~\IEEEmembership{Member,~IEEE,}
        Felix~C.~H.~Li,~\IEEEmembership{Member,~IEEE,}
        and~Tim~Menzies,~\IEEEmembership{Member,~IEEE}% <-this % stops a space
\thanks{Manuscript received Septober 31, 1984; revised Nowonder 13, 1985.}% <-this % stops a space
\thanks{J.\ Andrews and F.\ Li are with the Department of Computer Science, University of Western Ontario, London, Ont., Canada, N6A 2B7.  E-mail: andrews@csd.uwo.ca.}%
\thanks{T.\ Menzies is with the Lane Department of Computer Science and  Electrical Engineering, West Virginia University, Morgantown, WV 26506-610.  E-mail: tim@menzies.us.}}
% note the % following the last \IEEEmembership and also the first \thanks -
% these prevent an unwanted space from occurring between the last author name
% and the end of the author line. i.e., if you had this:
%
% \author{....lastname \thanks{...} \thanks{...} }
%                     ^------------^------------^----Do not want these spaces!
%
% a space would be appended to the last name and could cause every name on that
% line to be shifted left slightly. This is one of those "LaTeX things". For
% instance, "A\textbf{} \textbf{}B" will typeset as "A B" not "AB". If you want
% "AB" then you have to do: "A\textbf{}\textbf{}B"
% \thanks is no different in this regard, so shield the last } of each \thanks
% that ends a line with a % and do not let a space in before the next \thanks.
% Spaces after \IEEEmembership other than the last one are OK (and needed) as
% you are supposed to have spaces between the names. For what it is worth,
% this is a minor point as most people would not even notice if the said evil
% space somehow managed to creep in.
%
% The paper headers
\markboth{IEEE Transactions on Software Engineering,~Vol.~1, No.~1,~Septober~1999}{Andrews \MakeLowercase{\textit{et al.}}: Using a Genetic Algorithm to Control Randomized Unit Testing}
% The only time the second header will appear is for the odd numbered pages
% after the title page when using the twoside option.


% If you want to put a publisher's ID mark on the page
% (can leave text blank if you just want to see how the
% text height on the first page will be reduced by IEEE)
 \pubid{0000--0000/00\$00.00~\copyright~2002 IEEE}

% use only for invited papers
%\specialpapernotice{(Invited Paper)}

% make the title area
\maketitle


\begin{abstract}
  Randomized testing has been shown to be an effective method for
  testing software units.  However, the thoroughness of randomized
  unit testing varies widely according to the settings of certain
  parameters, such as the relative frequencies with which methods
  are called.  In this paper, we describe a system which uses a
  genetic algorithm to find parameters for randomized unit testing
  that optimize test coverage.  We compare our coverage results to
  previous work, and report on case studies and experiments on
  system options.  In order to optimize the system, we used data
  mining techniques to analyze which genes were the most useful.
  We also report on the results of this analysis and optimization.
\end{abstract}

\begin{keywords}
Software testing, randomized testing, genetic algorithms,
data mining.
\end{keywords}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Introduction}

XXX NOT MORE THAN 35 PAGES IN THIS FORMAT

\IEEEPARstart{S}oftware testing involves running a piece of software (the
software under test, or SUT) on selected input data, and
checking the outputs for correctness.  The goals of software
testing are to force failures of the SUT, and to be thorough.
The more thoroughly we have tested an SUT without forcing
failures, the more sure we are of the reliability of the SUT.

Randomized testing is the practice of using randomization for
some aspects of test input data selection.
% A software unit is a
% method, group of methods, module or class; a unit test is
% usually built upon a sequence of method calls.
Several independent studies
\cite{miller-fuzz-cacm,andrews-etal-rute-rt,%
pacheco-etal-icse2007,groce-etal-icse2007}
have found that randomized
testing of software units is effective at forcing failures in
even well-tested units.  However, there remains a question of
whether randomized testing can be thorough enough.  Using
various code coverage measures to measure thoroughness,
researchers have come to varying conclusions about the ability
of randomized testing to be thorough
\cite{michael-etal-ga-tcg,visser-etal-issta06,andrews-etal-rute-rt}.

The thoroughness of randomized unit testing is highly dependent
on parameters that control when and how randomization is
applied.  These parameters include the number of method calls
to make, the relative frequency with which different methods are
called, and the ranges from which numeric arguments are chosen.
The manner in which previously-used arguments or
previously-returned values are used in new method calls,
which we refer to as the {\it value reuse policy}, is also
a crucial factor.  It is often difficult to work out the optimal
values of the parameters and the optimal value reuse policy by hand.

In this paper, we describe Nighthawk, a system for generating
unit test data.  The system can be viewed as consisting of two
levels.  The lower level is a randomized unit testing engine
which tests a set of methods according to parameter values
specified as genes in a chromosome, including parameters that
encode a value reuse policy.  The upper level is a genetic
algorithm (GA) which uses fitness evaluation, selection,
mutation and recombination of chromosomes to find good values
for the genes.  Goodness is evaluated on the basis of test
coverage and number of method calls performed.

Users can use Nighthawk to find good parameters, and then
perform randomized unit testing based on those parameters.  The
randomized testing can quickly generate many new test cases that
achieve high coverage, and can continue to do so for as long as
users wish to run it.

XXXX why this is great

\subsection{Randomized Unit Testing}

Unit testing is variously defined as the
testing of a single method, a group of methods, a module or a
class.  We will use it in this paper to mean the testing of a
group $M$ of methods, called the {\it target methods}.  A unit
test is a sequence of calls to the target methods, with each
call possibly preceded by code that sets up the arguments and
the receiver\footnote{
  We use the word ``receiver'' to refer to the object that
  a method is called on.  For instance, in the Java method
  call ``{\tt t.add(3)}'', the receiver is {\tt t}.
}, and
with each call possibly followed by code that stores and checks results.

{\it Randomized unit testing} is unit testing where there is
some randomization in the selection of the target
method call sequence and/or arguments to the
method calls.  Many researchers
\cite{doong-frankl-tosem94,antoy-hamlet-tse-jan2000,%
claessen-hughes-quickcheck,pacheco-etal-icse2007}
%sen-etal-cute,visser-etal-issta06}
have performed randomized unit testing,
sometimes combined with other tools such as model
checkers.

A key concept in randomized unit testing is that of {\it value
reuse}.  We use this term to refer to how the
testing engine reuses the receiver, arguments or return values
of past method calls when making new method calls.  In previous
research, value reuse has mostly taken the form of making a
sequence of method calls all on the same receiver object.

In our previous research, we developed a GUI-based randomized
unit testing engine called RUTE-J \cite{andrews-etal-rute-rt}.
To use RUTE-J, users write their own customized test wrapper
classes, hand-coding such parameters as relative frequencies of
method calls.  Users also hand-code a value reuse policy by
drawing receiver and argument values from value pools, and
placing return values back in value pools.  Finding good
parameters quickly, however, requires experience with the tool.

The system Nighthawk described in this paper significantly
builds on this work by automatically determining good parameters.
The lower, randomized-testing,
level of Nighthawk initializes and maintains one or more value
pools for all relevant types, and draws and replaces values in
the pools according to a policy specified in a chromosome.
The chromosome also specifies relative frequencies of methods,
method parameter ranges, and other testing parameters.
The upper, genetic-algorithm, level performs a search for the
parameter setting that causes the lower level to achieve a high
value of a coverage-related measure.  Nighthawk uses only the
Java reflection facility to gather information about the SUT,
making its general approach robust and adaptable to other languages.

\subsection{Contributions and Paper Organization}

The main contributions of this paper are as follows.
\begin{enumerate}
\item We describe the implementation of a novel two-level
  genetic-random testing system, Nighthawk.  In particular,
  we describe how we encode a value reuse policy in a manner
  amenable to meta-heuristic search.
\item We compare Nighthawk to other systems described in
  previous research, showing that it can achieve the same
  coverage levels.
\item We describe the results of a case study carried out on
  real-world units (the Java 1.5.0 Collection and Map classes)
  to determine the effects of different option settings on the
  basic algorithm.
\item We describe how we optimized Nighthawk by systematically
  analyzing which genes have the greatest effect on the fitness
  of a chromosome, and eliminating genes that we found to have
  little effect.
\end{enumerate}

We discuss related work in section 2.
In section 3, we describe the results of an exploratory study
that suggested that a genetic-random approach was feasible and
could find useful parameter settings.  In section 4, we describe
the design and use of Nighthawk.  Section 5 contains our
comparison to previous work, and section 6 our case study;
section 7 contains a discussion of the threats to validity
of the empirical work in the paper.

\section{Related Work}

\subsection{Randomized Unit Testing}

``Random'' or ``randomized'' testing has a long history, being
mentioned as far back as 1973 \cite{hetzel-book-1973};
Hamlet \cite{Hamlet94randomtesting} gives a good survey.
The key benefit of randomized testing is the ability to generate
many distinct test inputs in a short time, including test inputs
that may not be selected by test engineers but which may
nevertheless force failures.  There are, however, two main
problems with randomized testing:  the oracle problem and
and the question of thoroughness.

Since randomized testing depends on the generation of many
inputs, it is infeasible to get a human to check all test
outputs; an automated test oracle \cite{weyuker-oracles} is
needed.  There are two main approaches to the oracle problem.
The first
is to use general-purpose, ``high-pass'' oracles that pass many
executions but check properties that should be true of most
software.  For instance, Miller et al.\ \cite{miller-fuzz-cacm}
judge a randomly-generated GUI test case as failing only if the
software crashes or hangs; Csallner and
Smaragdakis \cite{jcrasher-spe} judge a randomly-generated unit
test case as failing if it throws an exception; and Pacheco
et al.\ \cite{pacheco-etal-icse2007} check general-purpose
contracts for units, such as one that states that a method
should not throw a ``null pointer'' exception unless one of its
arguments is null.  Despite the use of high-pass oracles,
all these authors found randomized testing to be effective in
forcing failures.

The second approach to the oracle problem for randomized testing
is to write oracles in order to check properties specific to the
software \cite{andrews-etal-rute-rt,ciupa-etal-icse2008}.
These oracles, like all formal unit specifications,
are non-trivial to write; tools such as Daikon for automatically
deriving likely invariants \cite{ernst-daikon} could help here.

Since randomized unit testing does not use any intelligence to
guide its search for test cases, there has always been
justifiable concern about how thorough it can be, given various
measures of thoroughness, such as coverage and fault-finding
ability.
%
Michael et al.\ \cite{michael-etal-ga-tcg} performed randomized testing
on the well-known Triangle program; this program accepts three
integers as arguments, interprets them as sides of a triangle,
and reports whether the triangle is equilateral, isosceles,
scalene, or not a triangle at all.  They concluded that
randomized testing could not achieve 50\% condition/decision
coverage of the code, even after 1000 runs.
%
Visser et al.\ \cite{visser-etal-issta06}
compared randomized unit testing with various model-checking
approaches and found that while randomized testing was good at
achieving block coverage, it failed to achieve optimal coverage
for stronger coverage measures, such as a measure derived from
Ball's predicate coverage \cite{ball-pred-coverage}.

Other researchers, however, have found that the thoroughness of
randomized unit testing depends on how exactly it is implemented.
%
Doong and Frankl \cite{doong-frankl-tosem94} tested several
units using randomized sequences of method calls, and found that
by varying some parameters of the randomized testing, they could
greatly increase or decrease the likelihood of finding injected
faults.  The parameters included number of operations performed,
ranges of integer arguments, and the relative frequencies of
some of the methods in the call sequence.
%
Antoy and Hamlet \cite{antoy-hamlet-tse-jan2000}, who checked
the Java {\tt Vector} class against a formal specification
using random input, similarly found that if they avoided calling
some of the methods (essentially setting their relative
frequencies to zero), they could cover more code in the class.
%
Andrews and Zhang \cite{andrews-zhang-tse2003}, performing
randomized unit testing on C data structures, found that varying
the ranges from which integer key and data parameters were
chosen increased the fault-finding ability of the random testing.

Finally, the aforementioned research by Pacheco
et al.\ \cite{pacheco-etal-icse2007} enhances the thoroughness
of randomized testing by doing a partial randomized
breadth-first search of the search space of possible test cases,
pruning branches that lead to redundant or illegal values which
would cause the system to waste time on unproductive test cases.

Of the cited approaches, the approach described in this paper
is most similar to Pacheco et al.'s.  The primary difference
is that we achieve thoroughness by generating long sequences of
method calls on different receivers, while they do so by
deducing shorter sequences of method calls on a smaller set of
receivers.
%
The focus of our research is also different.
Pacheco et al.\ focus on identifying contracts for units and
finding test cases that violate them.  In contrast, we focus on
maximizing code coverage; coverage is an objective measure of
thoroughness that applies regardless of whether failures have
been found, for instance in situations in which most bugs have
been eliminated from a unit.

\subsection{Analysis-Based Test Data Generation Approaches}

Approaches to test data generation via symbolic execution have
existed as far back as 1976
\cite{clarke-76-testdata,king-symbolic-1976}.
Such approaches typically attempt to generate a thorough set of
test cases by deducing which combinations of inputs will cause
the software to follow given paths.
Korel's TESTGEN system \cite{korel-testgen}, for example,
transforms each condition in the program to one of the form
$e < 0$ or $e \leq 0$, and then searches for values that minimize
(resp.\ maximize) $e$, thus causing the condition to become true (resp.\ false).

Other source code analysis-based approaches have used such
methods as iterative relaxation of a set of constraints on
input data \cite{gupta-etal-gen-test-data} and generation
of call sequences using goal-directed
reasoning \cite{leow-etal-icse2004}.  Some recent approaches use
model checkers such as Java Pathfinder \cite{visser-etal-issta04}.
These approaches are sometimes augmented with ``lossy''
randomized search for paths, as in
the DART and CUTE systems \cite{godefroid-etal-dart,sen-etal-cute},
the Lurch system \cite{owen-menzies-lurch},
and the Java Pathfinder-based
research of Visser et al.\ \cite{visser-etal-issta06}.

Some analysis-based approaches are limited in the range of
different conditions they consider; for instance, TESTGEN's
minimization strategy \cite{korel-testgen} cannot be applied
usefully to conditions involving pointers.  In addition, most
analysis-based approaches incur heavy memory and processing time
costs, and are as yet infeasible except for small software
units.  These limitations are the primary reason why researchers
have explored the use of heuristic and metaheuristic approaches
to test case generation.

\subsection{Genetic Algorithms for Testing}

Genetic algorithms (GAs) were first described by Holland
\cite{holland-ga-book}.  GAs typically represent a solution to a
problem as a ``chromosome'', with various aspects of the
solution to the problem represented as ``genes'' in the
chromosomes.  The possible chromosomes form a search space and
are associated with a fitness function, which typically
represents how good a solution the chromosome encodes.  Search
proceeds by evaluating the fitness of each of a population of
chromosomes, and then performing point mutations and
recombination on the most successful chromosomes.
GAs have been found to be superior to
purely random search in finding solutions to many complex
problems. Goldberg \cite{goldberg-ga-book} argues that their
power stems from being able to engage in ``discovery and
recombination of building blocks'' for solutions in a solution space.

Meta-heuristic search methods such as GAs have often been
applied to the problem of test suite generation.
In Rela's review of 122 applications of meta-heuristic search
in software engineering \cite{rela04}, 44\% of the
applications related to testing.  Approaches to GA test suite
generation can be black-box (requirements-based) or white-box
(code-based); here we focus on four representative white-box
approaches, since our approach is coverage-based and therefore
also white-box.

Pargas et al.\ \cite{pargas-etal-ga-tcg} represent a set of
test data as a chromosome, in which each gene encodes one
input value to the software.
Michael et al.\ \cite{michael-etal-ga-tcg} represent test data
similarly, and conduct experiments comparing various strategies
for augmenting the GA search.  Both of these approaches
evaluate the fitness of a chromosome by measuring
how close the input is to covering some desired statement or
condition direction.
Guo et al.\ \cite{guo-etal-genetic} generate unique input-output
(UIO) sequences for protocol testing using a genetic algorithm;
the sequence of genes represents a sequence of inputs to a
protocol agent, and the fitness function computes a measure
related to the coverage of the possible states and transitions
of the agent.  Finally, Tonella's approach to class testing
\cite{tonella-issta04}
represents the sequence of method calls in a unit test as a
chromosome; the approach features mutation operators customized
to the problem, such as one that inserts method invocations.

\subsection{Analytic Comparison of Approaches}

Once a large community starts
comparatively evaluating some technique, then
{\em evaluation methods} for different methods become just as
important as the {\em generation of new methods}.
To place this comment in an historical perspective, we note that
evaluation bias is an active research area in the field of data
mining \cite{bouck03,demsar06}.
Much of our future work should hence focus on a meta-level
analysis of the advantages and
disadvantages of different assessment criteria.
Currently, there are no clear conventions on how this
type of work should be assessed.  However, we can attempt some
analytic comparisons.

It is clear that there are situations in which a source code
analysis-based approach such as symbolic evaluation or model
checking will be superior to any randomized
approach.  For instance, for an {\tt if} statement decision
of the form {\tt (x==742 \verb/&&/ y==113)}, random search of
the space of all possible {\tt x,y} pairs is unlikely to
produce a test case that executes the decision in the true
direction, while a simple analysis of the source code will be
successful.  The question is how often these situations
arise in real-world programs.  The system Nighthawk of this
paper cannot guess at constants like 742, but is still able
to cover the true direction of decisions of the form
{\tt x==y} because the value reuse policies it discovers will
often choose {\tt x} and {\tt y} from the same value pool.

It is therefore likely that randomized testing and
analysis-based approaches have complementary strengths.
Groce et al.\ \cite{groce-etal-icse2007} conclude that
randomized testing is a good first step, before model checking,
in achieving high quality software, especially where the
existence of a reference implementation allows differential
randomized testing \cite{mckeeman-differential}.



%\begin{itemize}
%\item Static code analysis can direct the generation of the test
%  cases.  Our method, on the other hand, generates test cases at
%  random, so it is possible that we cover some code more than is
%  necessary, and that parts of our value pools are  useless.
%  Our view is that this is not a major issue since our
%  runtimes, on real-world systems, are quite impressive.
%  Nevertheless, there needs to be more discussion on how to
%  assess test suite generation; i.e.\ runtimes versus
%  superfluous tests versus any other criteria.
%\end{itemize}


\begin{figure}

\centering
%\includegraphics[width=2.5in]{myfigure.eps}
\includegraphics[width=3in]{xyPlot.eps}

\caption{Spiky search space resulting from poor fitness function.}
\label{spiky-search-space-fig}
\end{figure}

Genetic algorithms do not perform well when the search space is
mostly flat, with steep jumps in fitness score.
Consider the problem of generating two input values
$x$ and $y$ that will cover the true direction of the decision
``$x$\verb/==/$y$''.  If we cast the problem as a search for the
two values themselves, and the score as whether we have found
two equal values, the search space is shaped as in
Figure \ref{spiky-search-space-fig}: a flat plain
of zero score
with spikes along the diagonal.  Most approaches
to GA white-box test data generation attempt to address this
problem by proposing fitness functions that detect ``how close''
the target decision is to being true, often using analysis-based
techniques.  For instance, Tonella \cite{tonella-issta04} uses
a fitness function that specifically takes account of such
conditions by measuring how close {\tt x} and {\tt y} are.
Watkins and Hufnagel \cite{watkins-hufnagel-fitness} enumerate
and compare fitness functions proposed for GA-based test case
generation.

\begin{figure}

\centering
%\includegraphics[width=2.5in]{myfigure.eps}
\includegraphics[width=3in]{lohiPlot.eps}

\caption{Smooth search space resulting from recasting problem.}
\label{smooth-search-space-fig}
\end{figure}

In contrast, in our research, we essentially recast the problem as a
search for the best values of two variables $lo$ and $hi$ that
will be used as the lower and upper bound for random generation
of $x$ and $y$, and the score as the probability of generating
two equal values.  Seen in this way, the search space landscape
still contains a spiky ``cliff'', as seen in
Figure \ref{smooth-search-space-fig}, but the cliff is
approached on one side by a gentle slope.
We further consider not only numeric data, but data of any type.

In all the approaches to GA-based test input generation that
we are aware of, each run of the GA results in a single
test case, which is meant to reach a particular target.  A test
suite is built up by aiming the GA at different targets, resulting
in a minimal test suite that achieves coverage of all targets.
Herein lies a potential disadvantage of such approaches.
Rothermel et al. \cite{rothermel-etal-effects-minimization}
have shown that reducing a test suite to a minimal subset that
achieves the same coverage can significantly reduce its fault
detection capability.  The GA can of course be re-run to generate
more test cases, but there is a potential performance penalty
since each run of the GA generates only one new test case.
In contrast, in our approach, each run of the GA results in a
setting for parameters for randomized testing, which can be
applied cheaply many times to generate many distinct
high-coverage test cases.

%\begin{center}
%\begin{tabular}{c|ccccccc|ccccc}
%$x|y$ & 1 & 2 & 3 & 4 & 5 & ~~~ &
%$lo|hi$ & 1 & 2 & 3 & 4 & 5 \\
%\cline{1-6} \cline{8-13}
%1 & 1.0 & .00 & .00 & .00 & .00 & &
%1 & 1.0 & .50 & .33 & .25 & .20 \\
%2 & .00 & 1.0 & .00 & .00 & .00 & &
%2 & .00 & 1.0 & .50 & .33 & .25 \\
%3 & .00 & .00 & 1.0 & .00 & .00 & &
%3 & .00 & .00 & 1.0 & .50 & .33 \\
%4 & .00 & .00 & .00 & 1.0 & .00 & &
%4 & .00 & .00 & .00 & 1.0 & .50 \\
%5 & .00 & .00 & .00 & .00 & 1.0 & &
%5 & .00 & .00 & .00 & .00 & 1.0 \\
%\end{tabular}
%\end{center}
%
%\caption{Chance of selecting two identical integers $x$ and $y$.
%(left) Search space as space of $(x,y)$ pairs.
%(right) Search space as space of lower and upper bounds
%for random generation of $x$ and $y$.}
%\label{search-space-fig}
%\end{figure*}

All analysis-based
approaches share the disadvantage of requiring a robust parser
and source code analyzer that can be updated to reflect changes
in the source language.  These complex tools are not often
provided by language providers.
Our approach does not require source code or bytecode analysis,
instead depending only on the robust Java reflection mechanism
and commonly-available coverage tools.
For instance, our code was
initially written with Java 1.4 in mind, but worked seamlessly
on the Java 1.5 versions of the {\tt java.util} classes, despite
the fact that the source code of many of the units had been
heavily modified to introduce templates.  However,
model-checking approaches have other strengths,
such as the ability to analyze multithreaded code
\cite{havelund-pathfinder}, further supporting the conclusion
that the two approaches are complementary.

% Look up Baresel paper BSS02!!

%We also generalize the problem domain from the heuristic to the
%meta-heuristic by adding the GA level.

\section{Exploratory Study}
\label{exploratory-section}

To find out whether there was any merit in the idea of a
genetic-random system, we conducted an exploratory study.
In this section, we describe the prototype software we developed,
the design of the study and its results.

\subsection{Software Developed}

Using code from RUTE-J (see above) and the open-source genetic
algorithm package JDEAL \cite{jdeal}, we constructed a prototype
two-level genetic-random unit testing system that took Java
classes as its testing units.  For each unit under test (UUT) with
$n$ methods to call, the GA level constructed a chromosome with
$2+n$ integer genes; these genes represented
the number of method calls to make in each
test case, the number of test cases to generate, and the
relative weights (calling frequencies) of the $n$ methods.  All
other randomized testing parameters
were hard-coded in the test wrappers.

The evaluation of the fitness of each chromosome $c$ proceeded as
follows.  We got the random testing level to generate the number
of test cases of the length specified in $c$, using
the method weights specified in $c$.  We then
measured the number of coverage points covered using
Cobertura \cite{cobertura-website},
which measures line coverage.  If we had based the
fitness function {\it only} on coverage, however, then any chromosome
would have benefitted from having a larger number of
method calls and test cases, since every new method call has the
potential of covering more code.
We therefore built in a brake to prevent these values from
getting unfeasibly high.  We calculated the fitness function
as:
\begin{center}
(number of coverage points covered) $*$ (coverage factor) \\
 $-$ (number of method calls performed overall)
\end{center}
We set the coverage factor to 1000, meaning that we were willing
to make 1000 more method calls (but not more) if that meant
covering one more coverage point.

\subsection{Experiment Design}

We chose as our subject programs three units taken from the Java
1.4.2 edition of {\tt java.util}: {\tt BitSet}, {\tt HashMap}
and {\tt TreeMap}.  These units were clearly in wide use,
and {\tt TreeMap} had been used as the basis of earlier
experiments by other researchers \cite{visser-etal-issta04}.
For each UUT, we wrote a
test wrapper class containing methods that called selected
target methods of the UUT (16 methods for BitSet, 8 for HashMap
and 9 for TreeMap).
Each wrapper contained a simple oracle for checking correctness.
We instrumented each UUT using Cobertura.

We ran the two-level algorithm 30 times on each of the three
test wrappers, and recorded the amount of time taken and the
parameters in the final chromosome.
To test whether the weights in the chromosomes were
useful given the length and number of method calls, for each
final chromosome $c$ we created a variant chromosome $c'$
with the same length and number of method calls but with all
weights equal.  We then compared the coverage achieved by
$c$ and $c'$ on 30 paired trials.  Full results from the
experiment are available in \cite{cli-msc}.

\subsection{Results}

We performed two statistical tests to evaluate whether the
system was converging on a reasonable solution.  First, we
ordered the average weights discovered for each method in each
class, and performed a $t$ test
with Bonferroni correction between each pair of
adjacent columns.
We found that for the {\tt HashMap} and {\tt TreeMap} units, the
{\tt clear} method (which removes all data from the map) had a
statistically significantly lower weight than the other methods,
indicating that the algorithm was consistently converging on a
solution in which it had a lower weight.
This is because much of the code in these
units can be executed only when there is a large amount of data
in the container objects.  Since the {\tt clear} method clears
out all the data, executing it infrequently ensured that
the objects would get large enough.

We also found that for the {\tt TreeMap} unit, the {\tt remove}
and {\tt put} methods had a statistically significantly higher
weight than the other methods.  This is explainable by the large
amount of complex code in these methods and the private methods
that they call; it takes more calls to
cover this code than it does for the simpler code of
the other methods.  Another reason is that sequences of {\it put}
and {\it remove} were needed to create data structures via which
code in some of the other methods was accessible.

The second statistical test we performed tested
whether the weights found by the GA were efficient.
For this, we used the 30 trials comparing the discovered
chromosome $c$ and the equal-weight variant $c'$.  We found that
for all three units, the equal-weight chromosome covered less
code than the
original, to a statistically significant level (as measured by a
$t$ test with $\alpha=0.05$).  This can be interpreted as
meaning that the GA was correctly choosing a good {\it combination}
of parameters.

In the course of the experiment, we found a bug in the Java
1.4.2 version of {\tt BitSet}:  when a call to {\tt set()} is
performed on a range of bits of length 0, the unit could later
return an incorrect ``length'' for the {\tt BitSet}.
We found that a bug report for this bug
had already been submitted to Sun's bug database.  It has been
corrected in the Java 1.5.0 version of the library.

In summary, the experiment indicated that the two-level
algorithm was potentially useful, and was consistently
converging on similar solutions that were more optimal than
calling all methods equally often.

\section{Nighthawk:  System Description}

The results of our exploratory study encouraged us to expand
the scope of the GA to include method parameter ranges, value
reuse policy and other randomized testing parameters.  The
result was the Nighthawk system.

In this section, we first outline the lower, randomized-testing,
level of Nighthawk, and then describe the chromosome that
controls its operation.  We then describe the
genetic-algorithm level and the
end user interface.  Finally, we describe the use
of automatically-generated test wrappers for precondition
checking, result evaluation and coverage enhancement.

\subsection{Randomized Testing Level}

Here we present a simplified description of the
algorithm that the lower, randomized-testing, level of Nighthawk
uses to construct and run a test case.  The algorithm takes two
parameters:  a set $M$ of Java methods, and a GA
chromosome $c$ appropriate to $M$.  The chromosome controls
aspects of the algorithm's behaviour, such as the number of
method calls to be made, and will be described in
more detail in the next subsection.

We refer to $M$ as the set of ``target methods''.  We define the
set $I_M$ of {\it types of interest} corresponding to $M$ as the
union of the following sets of types\footnote{
  In this paper, the word ``type'' refers to any primitive type,
  interface, or abstract or concrete class.
}:
\begin{itemize}
\item All types of receivers, parameters and return values of methods in $M$.
%\item All types of parameters of methods in $M$.
%\item All types of return values of methods in $M$.
\item All primitive types that are the types of parameters
  to constructors of other types of interest.
\end{itemize}
Each type $t \in I_M$
is associated with an array of {\it value pools}, and each
value pool for $t$ contains an array of values of type $t$. 
Each value pool for a range primitive type (a primitive type
other than {\tt boolean} and {\tt void}) has bounds on
the values that can appear in it.  The number of value pools,
number of values in each value pool, and the range primitive
type bounds are specified by the chromosome $c$.

The algorithm first chooses initial values for primitive type
pools, and then moves on to non-primitive type pools.  We define
a constructor method to be an {\it initializer} if it has no
parameters, or if all its parameters are of primitive types.
We define a constructor to be a {\it reinitializer} if it has no
parameters, or if all its parameters are of types in $I_M$.
We define the set $C_M$ of {\it callable methods} to be the
methods in $M$ plus the reinitializers of the types of $I_M$.
The callable methods are the ones that Nighthawk calls directly.

A {\it call description} is an object representing one method
call that has been constructed and run.  It consists of the
method name, an indication of whether the method call succeeded,
failed or threw an exception, and one {\it object description}
for each of the receiver, the parameters and the
result (if any).
%The object description of an object of primitive type or a
%string consists of the object itself.  The object description of
%other objects consists of the class of interest, the value pool
%number and the value number associated with the object; this is
%the source from which the object came (for receivers and
%parameters) or the place where the value was put (for results).
A {\it test case} is a sequence of call descriptions, together
with an indication of whether the test case succeeded or failed.

\begin{figure}
Input:  a set $M$ of target methods; a chromosome $c$. \\
Output:  a test case. \\
Steps:
\begin{enumerate}
\item For each element of each value pool of each primitive
  type in $I_M$, choose an initial value that is within the bounds
  for that value pool.
\item For each element of each value pool of each other type $t$
  in $I_M$:
  \begin{enumerate}
  \item If $t$ has no initializers, then set the element to {\tt null}.
  \item Otherwise, choose an initializer method $i$ of $t$, call
    {\sf tryRunMethod}$(i, c)$, and place the result in the
    destination element.
  \end{enumerate}
\item Initialize test case $k$ to the empty test case.
\item Repeat $n$ times, where $n$ is the number of method calls to
  perform:
  \begin{enumerate}
  \item Choose a target method $m \in C_M$.
  \item Run algorithm {\sf tryRunMethod}$(m, c)$, and
        add the call description returned to $k$.
  \item If {\sf tryRunMethod} returns a method call failure
    indication, return $k$ with a failure indication.
  \end{enumerate}
\item Return $k$ with a success indication.
\end{enumerate}

\caption{Algorithm {\sf constructRunTestCase}.}
\label{constructRunTestCase-fig}
\end{figure}

\begin{figure}[!t]
Input:  a method $m$; a chromosome $c$. \\
Output:  a call description. \\
Steps:
\begin{enumerate}
\item If $m$ is non-static and not a constructor:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ which is a subtype of the receiver of $m$.  
  \item Choose a value pool $p$ for $t$.
  \item Choose one value $recv$ from $p$ to
    act as a receiver for the method call.
  \end{enumerate}
\item For each argument position to $m$:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ which is a subtype of the argument type.
  \item Choose a value pool $p$ for $t$.
  \item Choose one value $v$ from $p$ to act as the argument.
  \end{enumerate}
\item If the method is a constructor or is static, call it with
  the chosen arguments.  Otherwise, call it on $recv$ with the
  chosen arguments.
\item If the method call threw an {\tt AssertionError},
  return a call description with a failure indication.
\item Otherwise, if the method call threw some other exception,
  return a call description with an exception indication.
\item Otherwise, if the method return type is not {\tt void},
  and the return value $ret$ is non-null:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ which is a supertype of the
    type of the return value.
  \item Choose a value pool $p$ for $t$.
  \item If $t$ is not a primitive type, or if $t$ is a primitive
    type and $ret$ does not violate the bounds on
    $p$, then choose an element of $p$ and
    replace it by $ret$.
  \item Return a call description with a success indication.
  \end{enumerate}
\end{enumerate}

\caption{Algorithm {\sf tryRunMethod}.}
\label{tryRunMethod-fig}
\end{figure}

Nighthawk's randomized testing algorithm is referred to as
{\sf constructRunTestCase}, and is described in Figure
\ref{constructRunTestCase-fig}.  It takes a set $M$ of target
methods and a chromosome $c$ as inputs.  It begins by
initializing value pools, and then constructs and runs a
test case, and returns the test case.  It uses an auxiliary
method called {\sf tryRunMethod}, described in Figure
\ref{tryRunMethod-fig}, which takes a method as input,
calls the method and returns a call description.
In the algorithm descriptions, the word ``choose'' is always used
to mean specifically a random choice which may partly depend on
the chromosome $c$.

{\sf tryRunMethod} considers a method call to fail if and only
if it throws an {\tt AssertionError}.  It does not consider
other exceptions to be failures, since they might be correct
responses to bad input parameters.  A separate mechanism
is used for detecting precondition violations and checking
correctness of return values and exceptions; see
Section \ref{test-wrappers-section}.

For conciseness, the algorithm descriptions omit some details
which we now fill in.  These concern the treatment of nulls, the
treatment of {\tt String}, and the treatment of {\tt Object}.

The receiver of a method call cannot be null, and no parameter
can be null unless {\sf tryRunMethod} chooses it to be.  If
{\sf tryRunMethod} fails to find a non-null value when it is
looking for one, it reports failure of the {\it attempt} to call the
method.  {\sf constructRunTestCase} tolerates a certain number
of these attempt failures before terminating the
test case generation process.

{\tt java.lang.String} is treated as if it is a primitive type,
the values in the value pools being initialized with ``seed
strings''.  Some default seed strings are supplied by the
system, and the user can supply more.

\label{object-section}
Formal parameters of type {\tt java.lang.Object} stand
for some arbitrary object, but it is usually sufficient
to use a small number of specific
types as actual parameters;
Nighthawk uses only {\tt int} and {\tt String} by default.
A notable exception to this rule is the parameter to the {\tt equals()}
method, which can be treated specially by test wrapper objects
(see Section \ref{test-wrappers-section}).

\subsection{Chromosomes}

\begin{figure}

\begin{center}
\begin{tabular}{|l|l|c|l|}
Gene type & Occurrence & Type & Description \\
\hline
\parbox[t]{1.5in}{\tt numberOfCalls}
  & \parbox[t]{2in}{One for whole chromosome}
  & int
  & \parbox[t]{2in}{the number $n$ of method calls
                    to be made} \\
% that
%                 {\sf constructRunTestCase} is to run} \\
\hline
\parbox[t]{1.5in}{\tt methodWeight}
  & \parbox[t]{2in}{One for each method $m \in C_M$}
  & int
  & \parbox[t]{2in}{The relative weight of the method, i.e.\ the
                    likelihood that it will be chosen} \\
%                at step 4(a)
%                of {\sf constructRunTestCase}} \\
\hline
\parbox[t]{1.5in}{\tt numberOf- ValuePools}
  & \parbox[t]{2in}{One for each type $t \in I_M$}
  & int
  & \parbox[t]{2in}{The number of value pools for that type} \\
\hline
\parbox[t]{1.5in}{\tt numberOfValues}
  & \parbox[t]{2in}{One for each value pool of each type $t \in I_M$
                 except for {\tt boolean}}
  & int
  & \parbox[t]{2in}{The number of values in the pool} \\
\hline
\parbox[t]{1.5in}{\tt chanceOfTrue}
  & \parbox[t]{2in}{One for each value pool of type {\tt boolean}}
  & int
  & \parbox[t]{2in}{The percentage chance that the value {\it true} will be
                 chosen from the value pool} \\
\hline
\parbox[t]{1.5in}{\tt lowerBound, upperBound}
  & \parbox[t]{2in}{One for each value pool of each
                 range primitive type $t \in I_M$}
  & \parbox[t]{0.5in}{int or float}
  & \parbox[t]{2in}{The lower and upper bounds on values in the pool;
                 initial values are drawn uniformly from this range} \\
\hline
\parbox[t]{1.5in}{\tt chanceOfNull}
  & \parbox[t]{2in}{One for each argument position of non-primitive type
                 of each method $m \in C_M$}
  & int
  & \parbox[t]{2in}{The percentage chance that {\tt null} will be chosen
                 as the argument} \\
\hline
\parbox[t]{1.5in}{\tt candidateBitSet}
  & \parbox[t]{2in}{One for each parameter and quasi-parameter of
                    each method $m \in C_M$}
  & BitSet
  & \parbox[t]{2in}{Each bit represents one candidate type, and signifies
                 whether the argument will be drawn from the value
                 pools of that type} \\
\hline
\parbox[t]{1.5in}{\tt valuePool- ActivityBitSet}
  & \parbox[t]{2in}{One for each candidate type of each parameter
                    and quasi-parameter of each method $m \in C_M$}
  & BitSet
  & \parbox[t]{2in}{Each bit represents one value pool, and signifies
                 whether the argument will be drawn from that value
                 pool} \\
\hline
\end{tabular}
\end{center}

\caption{Nighthawk gene types.}
\label{gene-types-fig}
\end{figure}

Aspects of the test case execution algorithms are controlled
by the genetic algorithm chromosome given as an argument.  A
{\it chromosome} is composed of a finite number of {\it genes}.
Each gene is a pair consisting of a name
and an integer, floating-point, or {\tt BitSet} value.
Figure \ref{gene-types-fig} summarizes the different types of
genes that can occur in a chromosome.
We refer to the receiver (if any) and the return value
(if non-{\tt void}) of a method call
as {\it quasi-parameters} of the method call.  Parameters and
quasi-parameters have {\it candidate types}:
\begin{itemize}
\item A type is a candidate type for a receiver if it is a
  subtype of the type of the receiver.  These are the types
  from whose value pools the receiver can be drawn.
\item A type is a candidate type for a parameter if it is a
  subtype of the type of the parameter.  These are the types
  from whose value pools the parameter can be drawn.
\item A type is a candidate type for a return value if it is a
  supertype of the type of the return value.  These are the
  types into whose value pools the return value can be placed.
\end{itemize}
Note that the gene types
{\tt candidateBitSet} and {\tt valuePoolActivityBitSet}
essentially encode a value reuse policy by determining the
pattern in which receivers, arguments and return values are
drawn from and placed into value pools.

%Every Nighthawk chromosome contains a gene specifying the number
%$n$ of method calls that {\sf constructRunTestCase} is to
%run.  In addition, a chromosome appropriate to a set $M$
%of target methods contains the following genes:
%\begin{itemize}
%%\item The number $n$ of method calls to perform.
%\item For each method in $C_M$, the relative weight of the
%  method, i.e.\ the likelihood that it will be chosen at step
%  4(a) of {\sf constructRunTestCase}.
%\item For each type of interest in $I_M$, the number of value
%  pools for that type.
%\item For each value pool, the number of values in the pool.
%\item For each value pool of a range primitive type, the upper
%  and lower bounds on values in the pool.  Initial values are
%  drawn from this range with a uniform distribution.
%\item For each method in $C_M$ and every argument position, the
%  chance that {\tt null} will be chosen as an argument.
%\item For each method in $C_M$, the types of interest that will
%  be chosen from to find a receiver, and, for each of those
%  types, the value pools that will be chosen from.
%\item For each method in $C_M$ and every argument position, the
%  types of interest that will be chosen from to find an
%  argument, and, for each of those types, the value pools
%  that will be chosen from.
%\item For each method in $C_M$, the types of interest that will
%  be chosen from to find an element that will be replaced by
%  the return value,
%  and, for each of those types, the value pools that will be
%  chosen from.
%\end{itemize}
%The last three kinds of genes are expressed as bit vectors;
%each bit stands for one of the types of interest that is a
%subtype of the declared type (resp.\ one of the value pools).
%These bit vectors thus encode the value reuse policy
%expressed by the chromosome.

It is clear that different gene values in the chromosome may
cause dramatically different behaviour of the algorithm on the
methods.  We illustrate this point with two concrete examples.

Consider the ``triangle'' unit from \cite{michael-etal-ga-tcg}.
If the chromosome specifies
that all three parameter values are to be taken from a value
pool of 65536 values in the range -32768 to 32767, then the
chance that the algorithm will ever choose two
or three identical values for the parameters (needed for the
``isosceles'' and ``equilateral'' cases) is very low.  If,
on the other hand, the value pool contains only 30 integers
each chosen from the range 2 to 5, then the chance rises
dramatically due to reuse of previously-used values.  The amount
of additional coverage this would give would vary depending on
the UUT, but is probably nonzero.

Consider further a container class with {\tt put} and
{\tt remove} methods, each taking an integer key as its only
parameter.  If the parameters to the two methods are taken from
two different value pools of 30 values in the range 0 to 1000,
there is little chance that a key that has been put into the
container will be successfully removed.  If, however, the
parameters are taken from a single value pool of 30 values in
the range 0 to 1000, then the chance is very good that added
values are removed, again due to value reuse.  A {\tt remove}
method for a typical data structure executes different
code for a successful removal than it does for a failing one.

\subsection{Genetic Algorithm Level}

We take the space of possible chromosomes as a solution space to
search, and apply the GA approach to search it
for a good solution.  We chose GAs over other metaheuristic
approaches such as simulated annealing because of our belief that
recombining parts of successful chromosomes would result in
chromosomes that are better than their parents.  However, other
metaheuristic approaches may have other advantages and should be
explored in future work.
%  For the randomized
%unit testing problem, the ``building blocks'' are individual
%choices for the gene values; discovering and recombining good
%building blocks leads to better overall solutions.

The parameter to Nighthawk's GA is the set $M$ of target methods.
The GA performs the usual steps of chromosome
evaluation, fitness selection, mutation and recombination.
The GA first derives an
initial template chromosome appropriate to $M$, constructs
an initial population of size $p$ as clones of this chromosome, and
mutates the population.  It then performs a loop, for
the desired number $g$ of generations, of evaluating each
chromosome's fitness, retaining the fittest chromosomes,
discarding the rest, cloning the fit chromosomes, and mutating
the genes of the clones with probability $m$\% using point mutations and
crossover (exchange of genes between chromosomes).
The {\it fitness function} for a chromosome is calculated in
a manner identical to the exploratory study (Section
\ref{exploratory-section}).
%; that is, as
%\begin{center}
%(number of coverage points covered) $*$ (coverage factor) \\
% $-$ (number of method calls)
%\end{center}

%It is recognized that the design of genetic algorithms is a
%``black art'' \cite{ga-blackart}, and that very little is known
%about why GAs work when they do work and why they do not work
%when they do not.
Nighthawk uses default settings of
$p=20, g=50, m=20$.  These settings are different from those
taken as standard in GA literature \cite{dejong-spears-genetic},
and are motivated by a need to do as few chromosome evaluations
as possible (the primary cost driver of the system).  The
settings of other variables, such as the retention percentage,
are consistent with the literature.

To enhance availability of the software,
Nighthawk uses the popular open-source coverage tool Cobertura
\cite{cobertura-website} to measure coverage.  Cobertura can measure
only line coverage (each coverage point corresponds to a source
code line, and is covered if any code on the line is executed) \footnote{
  Cobertura (v.\ 1.8) also reports what it calls ``decision coverage'',
  but this is coverage of lines containing decisions.
}.
However, Nighthawk's algorithm is not specific to this measure;
indeed, our empirical studies (see below) show that Nighthawk
performs well when using other coverage measures.

\subsection{Top-Level Application}

\label{deep-section}
The Nighthawk application takes several switches and a set
of class names as command-line parameters.  The default
behaviour is to consider the command-line class names as a set
of ``target classes''.  If, however, the ``{\tt --deep}'' switch
is given to Nighthawk, the public declared methods of the
command-line classes are explored, and all non-primitive types
of parameters and return values of those methods are added to
the set of target classes.  The set $M$ of target {\it methods}
is computed as all public declared methods of the target
{\it classes}.  Intuitively, therefore, the {\tt --deep} switch
performs a ``deep target analysis'' by getting Nighthawk to call
methods in the layer of classes surrounding the command-line
classes.

Nighthawk runs the GA, monitoring the
chromosomes and retaining the most fit
chromosome ever encountered.  This most
fit chromosome is the final output of the program.

\label{run-chromosome-sec}
After finding the most fit chromosome, a test engineer can
perform the randomized testing that it specifies.
To do this, they run a separate program, RunChromosome, which
takes the chromosome description as input and runs test cases
based on it for a user-specified number of times.  Randomized
unit testing generates new test cases with new data every time
it is run, so if Nighthawk finds a parameter setting that
achieves high coverage, a test engineer can automatically
generate a large number of distinct, high-coverage test cases with
RunChromosome.


\subsection{Test Wrappers}
\label{test-wrappers-section}

We provide a utility program that, given a class name, generates
the Java source file of a ``test wrapper'' class for the class.
Running Nighthawk on an unmodified test wrapper is the same
as running it on the target class; however, test wrappers can be
customized for precondition checking, result checking or
coverage enhancement.

A test wrapper for class X is a class that contains one private
field of class X (the ``wrapped object''), and one public method
with an identical declaration for each public declared method of
class X.  Each wrapper method simply passes the call on to the
wrapped object.

To customize a test wrapper for precondition checking, the user
can insert a check in the wrapper method before the target
method call.  If the precondition is violated, the wrapper
method can simply return.
To customize a test wrapper for test result checking, the user
can insert any result-checking code after the target method
call; examples include normal Java assertions and JML
\cite{jml-overview} contracts.
%For instance, a user can choose to customize a wrapper method so
%that it throws an {\tt AssertionError} (signalling test case failure)
%whenever the target method throws an {\tt ArrayOutOfBoundsException}.
We provide switches to the test wrapper generation program that
cause the wrapper to
check commonly-desired properties, such as that a method throws
no {\tt NullPointerException} unless one of its arguments is
null.  The switch \verb/--pleb/ generates a wrapper that checks
all the Java Exception and Object contracts from
Pacheco et al.\ \cite{pacheco-etal-icse2007}.

To customize a test wrapper for coverage enhancement,
the user can insert extra methods that cause extra code to be
executed.  We provide two switches
for commonly-desired enhancements.  The switch \verb/--checkTypedEquals/
adds a method to the test wrapper for class X that takes one
argument of type X and passes it to the {\tt equals} method
of the wrapped object.  This is distinct from the normal wrapper
method that calls {\tt equals}, which has an argument of type
{\tt Object} and would therefore by default receive arguments
only of type {\tt int} or {\tt String} (see Section \ref{object-section}).
For classes X that implement their own {\tt equals} method, the
typed-equals method is likely to execute more code.

Tailored serialization is accomplished in Java via
specially-named private methods that are inaccessible to
Nighthawk.  The test wrapper generation program switch
\verb/--checkSerialization/ adds a
method to the test wrapper that writes the object to a byte
array and reads it again from the byte array.
This causes Nighthawk to be able to execute the code in the
private serialization methods.

\section{Comparison with Previous Results}

We compared Nighthawk with two well-documented systems in the
literature by running it on the same software and measuring
the results.

\subsection{Pure GA Approach}

To compare the results of our genetic-random approach with those
of the purely genetic approach of
Michael et al.\ \cite{michael-etal-ga-tcg}, we adapted their
published C code for the Triangle program to Java, transforming
each decision so that each condition and decision direction
corresponded to an executable line of code measurable by
Cobertura.  We then ran Nighthawk 10 separate times on the
resulting class.

%6782 gen 8
%6914 gen 10
%6527 gen 10
%5008 gen 7
%3346 gen 5
%1790 gen 2
%2486 gen 5
%7945 gen 13
%18363 gen 21
%2509 gen 4

We found that Nighthawk reached 100\% of feasible
condition/decision coverage on average after 8.5 generations, in
an average of 6.2 seconds of clock time \footnote{
All empirical studies in this paper were performed on a Sun
UltraSPARC-IIIi running SunOS 5.10 and Java 1.5.0\_11.
}.
%There were 22 decisions having a total of 28 condition
%and decision directions.
%All condition and decision directions were coverable
%except the false direction of the last ``if'' statement; Michael
%et al.\ found that their best method also achieved 95\% coverage.
%
Michael et al.\ had found that a purely random approach could
not achieve even 50\% condition/decision coverage.  The
discrepancy between the results may be due to
Nighthawk being able to find a setting of the randomized testing
parameters that is more optimal than the one Michael et al.\ were
using.  Inspection revealed that the chromosomes encoded value
reuse policies that guaranteed frequent selection of the same
values.

\subsection{Model-Checking and Feedback-Directed Randomization}

To compare our results with those of the model-checking approach
of Visser et al.\ \cite{visser-etal-issta06} and the
feedback-directed random testing of
Pacheco et al.\ \cite{pacheco-etal-icse2007},
we downloaded the four data structure units used in those
studies.  The units had been hand-instrumented to record
coverage of the deepest basic blocks in the code.

We first wrote restricted test wrapper classes that called only
the methods called by the previous researchers.  We ran
Nighthawk giving these test wrapper classes as command-line
classes, and observed the number of instrumented basic blocks
covered, and the number of lines covered as measured by
Cobertura.

\begin{figure}

\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
		& \multicolumn{3}{c|}{Instr Blk Cov}
				  & Time
					  & \multicolumn{2}{c|}{Line Cov} \\
UUT		& JPF & RP  & NH  & (sec) & Restr & Full \\
\hline
BinTree		& .78 & .78 & .78 &  .58  & .84   &   1  \\
\hline
BHeap		& .95 & .95 & .95 &  4.1  & .88   &  .92 \\
\hline
FibHeap		&  1  &  1  &  1  &  5.1  & .74   &  .92 \\
\hline
TreeMap		& .72 & .72 & .72 &  5.4  & .76   &  .90 \\
\hline
\end{tabular}
\end{center}

\caption{Comparison of results on the JPF subject units.}
\label{jpf-results-fig}
\end{figure}

Figure \ref{jpf-results-fig} shows the results of the comparison.
We show the block coverage ratio achieved
by the best Java-Pathfinder-based technique from Visser et
al.\ (JPF), by Pacheco et al.'s tool Randoop (RP), and
by Nighthawk using the restricted test wrappers. 
Nighthawk was able to achieve the same coverage as the previous
tools.  The Time column shows the clock time in seconds needed by
Nighthawk to achieve its greatest coverage.
For BHeap and FibHeap, Nighthawk runs faster than JPF, but for
the other two units it runs slower than both JPF and Randoop.
This difference in run times may be in part because
Nighthawk relies on general-purpose Cobertura instrumentation,
which slows down programs, rather than the efficient, but
application-specific, hand instrumentation that the other
methods used.  It may also be in part due to
the fact that our experiments were run on a different
machine architecture than that of Pacheco et al.

We then ran Nighthawk giving the target classes themselves as
command-line classes (bypassing the test wrappers), and observed
the number of lines covered.  The ``Line Cov'' columns show the
line coverage ratio achieved when using the restricted wrappers
and on the full target classes.  When using the full target
classes, Nighthawk was able to cover significantly more lines of
code, including all the blocks covered by the previous studies.

\begin{figure}

%\begin{center}
%\begin{tabular}{|c|c|c|c|}
%\hline
%		& \multicolumn{3}{c|}{Number of cond value combinations} \\
%UUT		& Total & Unreached & Rch/Uncov \\
%\hline
%BinTree		&  34   &     6     &    0 \\
%\hline
%BHeap		&  75   &     0     &    5 \\
%\hline
%FibHeap		&  57   &    10     &    3 \\
%\hline
%TreeMap		&  157  &    31     &    19 \\
%\hline
%\end{tabular}
%\end{center}

\begin{center}
\begin{tabular}{|c|c|c|c|}
\hline
		& \multicolumn{3}{c|}{Number of cond value combinations} \\
UUT		& Total & Reachable & Covered \\
\hline
BinTree		&  34   &    28     & 28 (.82, 1.0) \\
\hline
BHeap		&  75   &    75     & 70 (.93, .93) \\
\hline
FibHeap		&  57   &    47     & 44 (.77, .94) \\
\hline
TreeMap		&  157  &   126     & 107 (.68, .85) \\
\hline
\end{tabular}
\end{center}

\caption{Multiple condition coverage of the subject units.}
\label{jpf-mcc-results-fig}
\end{figure}

Visser et al.\ and Pacheco et al.\ also studied a form of
predicate coverage \cite{ball-pred-coverage} whose
implementation is linked to the underlying Java Pathfinder code,
and is difficult for Nighthawk to access.
While this predicate coverage is an interesting assessment
criterion, there is no consensus in the literature on the
connection of this criterion to other measures.
For comparison, we
therefore studied multiple condition coverage (MCC), a standard
coverage metric which is, like predicate coverage, intermediate
in strength between decision/condition and path coverage.
We instrumented the source code so that every combination of
values of conditions in every decision caused a separate line of
code to be executed.  We then ran Nighthawk on the test
wrappers, thus effectively causing it to optimize MCC rather
than just line coverage.

The results are in Figure \ref{jpf-mcc-results-fig}.  We list
the total number of valid condition value combinations in all
the code, and the number that were in decisions reachable by
calling only the methods called by the other research groups.
We also list the number of combinations covered by Nighthawk,
both as a raw total and as a fraction of the total combinations
and the reachable combinations.
Nighthawk achieved between 68\% and 93\% of
MCC, or between 85\% and 100\% when only reachable condition
combinations were considered.  These results are very good,
since ``code coverage of 70-80\% is a reasonable goal for system
test of most projects with most coverage metrics''
\cite{cornett-minimum-coverage}.

In summary, the comparison suggests that
Nighthawk was achieving good coverage with respect to the
results achieved by previous researchers, even when strong
coverage measures such as decision/condition and MCC were taken
into consideration.

\section{Case Study}

In order to study the effects of different test wrapper
generation and command-line switches to Nighthawk, we studied
the Java 1.5.0 Collection and Map classes; these are the 16
concrete classes with public constructors in {\tt java.util}
that inherit from the {\tt Collection} or {\tt Map}
interface.  The source files total 12137 LOC, and Cobertura
reports that 3512 of those LOC contain executable code.  These
units are ideal subjects because they are heavily used and
contain complex code, including templates and inner classes.

For each unit,
we generated test wrappers of two kinds:  plain test wrappers
(P), and enriched wrappers (E) generated with the
{\tt --checkTypedEquals} and {\tt --checkSerializable} switches
(see Section \ref{test-wrappers-section}).
We studied two different option sets for Nighthawk: with no
command-line switches (N), and with the {\tt --deep} switch
(see Section \ref{deep-section}) turned on (D).
For each $\langle$UUT, test wrapper, option set$\rangle$ triple, we
ran Nighthawk for 50 generations and saved the best chromosome it found.
For each triple, we then executed RunChromosome (see Section
\ref{run-chromosome-sec}) specifying that it generate 10 test
cases with the given chromosome, and we measured the coverage
achieved.

\begin{figure}
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|}
Source file	& SLOC	& PN	& EN	& PD	& ED \\
\hline
ArrayList	& 150	& 111	& 140	& 109	& 140 (.93) \\
\hline
EnumMap		& 239	& 7	& 9	& 10	& 7 (.03) \\
\hline
HashMap		& 360	& 238	& 265	& 305	& 347 (.96) \\
\hline
HashSet		& 46	& 24	& 40	& 26	& 44 (.96) \\
\hline
Hashtable	& 355	& 205	& 253	& 252	& 325 (.92) \\
\hline
IHashMap	& 392	& 156	& 196	& 283	& 333 (.85) \\
\hline
LHashMap	& 103	& 27	& 37	& 28	& 96 (.93) \\
\hline
LHashSet	& 9	& 6	& 6	& 7	& 9 (1.0) \\
\hline
LinkedList	& 227	& 156	& 173	& 196	& 225 (.99) \\
\hline
PQueue		& 203	& 98	& 123	& 140	& 155 (.76) \\
\hline
Properties	& 249	& 101	& 102	& 102	& 102 (.41) \\
\hline
Stack		& 17	& 17	& 17	& 17	& 17 (1.0) \\
\hline
TreeMap		& 562	& 392	& 415	& 510	& 526 (.94) \\
\hline
TreeSet		& 62	& 44	& 59	& 41	& 59 (.95) \\
\hline
Vector		& 200	& 183	& 184	& 187	& 195 (.98) \\
\hline
WHashMap	& 338	& 149	& 175	& 274	& 300 (.89) \\
\hline
\hline
Total		& 3512	& 1914	& 2194	& 2487	& 2880 \\
\hline
Ratio		& 	& .54	& .62	& .71	& .82  \\
\hline
\end{tabular}
\end{center}

\caption{
  Coverage achieved by configurations of Nighthawk
  on the {\tt java.util} Collection and Map classes.
}
\label{collection-map-cov}
\end{figure}

\begin{figure}
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|c|}
Source file	& PN	& EN	& PD	& ED	& RC \\
\hline
ArrayList	& 75	& 91	& 29	& 48	& 15 \\
\hline
EnumMap		& 3	& 9	& 6	& 5	& 8 \\
\hline
HashMap		& 63	& 37	& 136	& 176	& 30 \\
\hline
HashSet		& 25	& 29	& 27	& 39	& 22 \\
\hline
Hashtable	& 8	& 110	& 110	& 157	& 25 \\
\hline
IHashMap	& 31	& 41	& 59	& 134	& 34 \\
\hline
LHashMap	& 1	& 5	& 4	& 129	& 25 \\
\hline
LHashSet	& 1	& 4	& 6	& 24	& 16 \\
\hline
LinkedList	& 32	& 61	& 41	& 53	& 17 \\
\hline
PQueue		& 23	& 40	& 242	& 103	& 13 \\
\hline
Properties	& 104	& 19	& 49	& 47	& 18 \\
\hline
Stack		& 5	& 10	& 5	& 26	& 8 \\
\hline
TreeMap		& 80	& 131	& 231	& 106	& 26 \\
\hline
TreeSet		& 110	& 93	& 98	& 186	& 26 \\
\hline
Vector		& 106	& 83	& 156	& 176	& 20 \\
\hline
WHashMap	& 37	& 35	& 92	& 110	& 21 \\
\hline
\hline
Total		& 704	& 798	& 1291	& 1519	& 324 \\
\hline
\end{tabular}
\end{center}

\caption{
  Time in seconds taken by configurations of Nighthawk
  to achieve highest coverage
  on the {\tt java.util} Collection and Map classes.
}
\label{collection-times}
\end{figure}

Figure \ref{collection-map-cov} shows the results of this study.
The column labelled SLOC shows the total number of source lines
of code reported by Cobertura (including inner classes) in the
source file associated with the class.  Column PN shows the
SLOC covered by Nighthawk with the plain test wrappers and
no Nighthawk switches; columns EN, PD and ED show the other
combinations, and column ED also shows the coverage ratio with
respect to total SLOC.
% EN shows the effect of using the
%enriched wrappers, and column PD the effect of using deep target
%class analysis with the plain wrappers; column ED shows the effect of
%using both the enriched wrappers and deep target class analysis.
The second last line shows the totals for each column, and
the last line shows the coverage ratio attained.

With enriched test wrappers and deep target class analysis,
Nighthawk performs well, achieving over 90\% coverage on 11 out
of the 16 classes, and
82\% coverage overall.
Paired $t$ tests with Bonferroni
correction (corrected $\alpha = .00833$) on each pair of columns
in the table
indicate that there are statistically significant differences
between every pair except the (EN, PD) pair.

Nighthawk performed poorly on the {\tt EnumMap}
class because
the main constructor to {\tt EnumMap} expects
an enumerated type as one of the parameters.
Nighthawk had no facility for supplying such a type, and so
only a few lines of error code in constructors were executed.
When we customized the test wrapper class so that it used a
fixed enumerated type, Nighthawk with the ED configuration covered
204 lines of code (coverage ratio .85), raising the total
coverage ratio for all classes to .88.

%It also performed poorly on the {\tt Properties} class, due to
%the fact that it could not automatically generate a useful
%{\tt InputStream} argument for several of the methods.
%We expect that these deficiencies could be at least partially
%addressed by hand-customizing the test wrappers for the classes.

Table \ref{collection-times} shows the amount of time taken by
Nighthawk on the various configurations.  In columns PN-ED, we
report the number
of seconds of clock time taken for Nighthawk to first achieve
its best coverage.  $t$ tests
showed that the only pairs of
columns that were different to a statistically significant level
were (PN, ED) and (EN, ED).  This suggests that generating the
enriched wrappers allowed Nighthawk to
cover significantly more code without running significantly
longer;  the deep target class analysis
also caused Nighthawk to cover
significantly more code, but took significantly longer
(though still less than 100 seconds per unit on average).

In column RC of Table \ref{collection-times}, we report the
number of CPU seconds needed for the RunChromosome program to
create and run the 10 new test cases with the parameters chosen
by Nighthawk in the ED configuration.
This time includes JVM startup time.  The results show that
with the parameters chosen by Nighthawk, RunChromosome can
automatically generate many new test cases that achieve high
coverage, in an average of approximately 2 seconds per test
case.

\section{Threats to Validity}

Here we discuss the threats to validity of the empirical results
in this paper.

The representativeness of the units under test is the major
threat to external validity.  We studied Java collection
classes because these are complex, heavily-used units that have
high quality requirements.  However, other units might
have characteristics that cause Nighthawk to perform poorly.
Randomized unit testing schemes in general require many test
cases to be executed, so they perform poorly on methods that do
a significant amount of disk I/O or thread generation.
%  They also
%perform poorly on methods expecting string inputs that are
%sentences in a grammar; we expect Nighthawk to do the same
%unless it is given valid sentences as seed strings.

Nighthawk uses Cobertura, which measures line coverage, a weak
coverage measure.  The results that we obtained may not extend
to stronger coverage measures.  However, the Nighthawk algorithm
%treats the coverage measurement as a black-box integer, and
does
not perform special checks particular to line coverage.  The
comparison studies suggest that it still performs well when
decision/condition coverage and MCC are simulated.
The question of whether code coverage measures are a good
indication of the thoroughness of testing is still, however, an
area of active debate in the software testing community, making
this a threat to construct validity.

Time measurement is also a threat to construct validity. 
We measured time using Java's {\tt systemTimeInMillis}, which
gives total wall clock time rather than CPU time.  This may give
run time numbers that do not reflect the actual cost to the user
of the testing.

\section{Data Mining-Based Optimization}

It is recognized that the design of genetic algorithms is a
``black art'' \cite{ga-blackart}, and that very little is known
about why GAs work when they do work and why they do not work
when they do not.  In the initial design of Nighthawk, we got
the GA to control many aspects of the randomized testing
algorithm, as reflected by the ten gene types listed in Figure
\ref{gene-types-fig}.  This was the result of our expectation
that any of them might turn out to be crucial to the success
of the system, and our inability to predict which.

However, for a genetic algorithm, every gene that is not useful
results in time wasted:  time is spent mutating genes that do
not lead to better solutions, and time is also spent extracting
values from genes instead of simply using constant values.
This effect is especially pronounced for Nighthawk, since some
genes control code inside the innermost loops of the randomized
testing algorithm.  We were therefore motivated to find whether
we could speed up Nighthawk by eliminating gene types that did
not lead to better performance.

In this section, we describe how we used feature subset
selection (FSS) to analyze data from Nighthawk runs, in order to
systematically examine the usefulness of each gene type.
We also describe how we optimized the system by eliminating the
least useful genes, resulting in a system that achieved the same
results in less time.

\subsection{Feature Subset Selection}

A repeated result in the data mining community is that simpler
models with  equivalent or higher performance can be built via
{\em feature subset selection}, algorithms that intelligently
prune useless features \cite{hall03}. Features may be pruned for
several reasons: 
\begin{itemize}
\item They may be noisy, i.e.\ contain spurious signals unrelated
  to the target class; 
\item They may be uninformative, e.g.\ contain mostly one value,
  or no repeating values;
\item They may be correlated to other variables, in which case
  they can be pruned since their signal is also present in other
  variables.
\end{itemize}

The reduced feature set has many advantages:
\begin{itemize}
\item Miller has shown that models generally containing fewer
  variables have less variance in their outputs \cite{miller02}.
\item The smaller the model, the fewer are the demands on
  interfaces  (sensors and actuators) to the external environment.
  Hence, systems designed around  small models are easier to use
  (less to do) and cheaper to build.
\item In terms of this article, the most important aspect of
  learning from a reduced features set is that it produces
  smaller models. Such smaller models are easier to explain (or audit).
\end{itemize}

The literature lists many feature subset selectors.  
In the Wrapper method, a {\em target learner} is augmented with a
pre-processor that used a heuristic search to grow subsets of the
available features. At each step in the growth, the target learner
is called  to find the accuracy of the model learned from the
current subset. Subset growth is stopped when the addition of new
features did not improve the accuracy. 
Kohavi and John \cite{kohavi97} report experiments with Wrapper
where, 83\% (on average) of the measures in a domain could be
ignored with only a minimal loss of accuracy.

The advantage of the Wrapper approach is that, if some target
learner is already implemented, then the Wrapper is simple to
implement. The disadvantage of the wrapper method is that each
step in the heuristic search requires another call to the target
learner. Since there are many steps in such a search ($N$
features have $2^N$ subsets), Wrappers may be too slow.

Another feature subset selector is Relief.  
This is an instance based learning scheme \cite{Kir92,Kon94}
that works by randomly sampling one instance within the data. It
then locates the nearest neighbors for that instance from not
only the same class but the opposite class as well. The values
of the nearest neighbor features are  then compared to that of
the sampled instance and the feature scores are maintained and
updated based on this. This process is specified for some
user-specified $M$ number of instances. Relief can handle noisy
data and other data anomalies by averaging the values for $K$
nearest neighbors of the same and opposite class for each
instance \cite{Kon94}. For data sets with multiple classes, the
nearest neighbors for each class that is different from the
current sampled instance are selected and the contributions are 
determined by using the class probabilities of the class in the dataset.

The experiments of Hall and Holmes \cite{hall03} reject numerous
feature subset selection methods.  Wrapper is their preferred
option, but only for small data sets.  For larger data sets, the
stochastic nature of Relief makes it a natural choice.

Another reason to prefer Relief is that it can take advantage
of Nighthawk's stochastic search.  Currently, Relief is a
batch process that is executed after data generation is
completed.  However, it may not be necessary to wait till the end
of data generation to gain insights into which features are most
relevant.  Given the stochastic nature of the algorithm, we can
see feature work where Relief and a genetic algorithm work in
tandem.  Consider the random selection process at the core of
Relief: a genetic algorithm exploring mutations of the current
set of parents is an excellent stochastic source of data.  In
the future, we plan to apply Relief incrementally and in
parallel to our GAs.

\subsection{Data Collection and Analysis}

We instrumented Nighthawk so that every time a chromosome was
evaluated, it printed the current value of every gene and the
final fitness function score.  (For the two BitSet gene types,
we printed only the cardinality of the set.)
For each of the 16 collection and map classes from {\tt java.util},
we ran Nighthawk for 40 generations.  Each class therefore
yielded 800 observations of gene value and score.

%Relief assumes continuous data and Nighthawk's performance
Relief assumes discrete data and Nighthawk's performance
scores are continuous.  To enable feature subset selection, we
first discretized Nighthawk's output. A repeated pattern
across all our experiments is that the Nighthawk scores fall
into three regions. 
\begin{itemize}
\item The 65\% majority of the scores are within 30\% of the top
  score for any experiment. We call this the {\em high plateau}.
\item A 10\%  minority of scores are less than 10\% of the maximum
  score.  We call this region {\em the hole}.
\item After the high plateau there is a  {\em slope} of
  increasing gradient that falls into {\em the hole}. This {\em
  slope} accounds for 25\% of the results.
\end{itemize}
Accordingly, to select our features, we sorted the results and
divided them into three classes: bottom 10\%, next 25\%, remaining
65\%. Relief then found the subset of features that distinguished
these three regions.  

Using the above discretization policy, we ran Relief 10 times in a 10-way
cross-validation study.
The data set was divided
into $10$ buckets. Each bucket was (temporarily) removed and Relief was
run on the remaining data. This produced a list of ``merit'' figures for
each feature (this ``merit'' value is an internal heuristic measure
generated from Relief, and reflects the difference ratio of neighboring
instances that have different regions). We took the maximum merit ($Max$)
and generated a {\it Selected} list. A feature was {\it Selected} if its merit
$m$ was $m> \alpha Max$ (e.g. at $\alpha=0.5$ then we selected all
features that score at least half the maximum merit).

The following table shows the number of features that were {\it Selected} in
our 19 examples, using different values for $\alpha$. Note that as
$\alpha$ increases, we selected fewer and fewer features.
\begin{tabular}{rl}
$\alpha$ & $| {\it Selected} | $\\\hline
0.5 & 439\\
0.6& 217 \\
0.7 & 112 \\
0.8 & 62 \\
0.9 & 39 \\
\end{tabular}
That is, this method allowed us to discover the genes that were most
important in selecting
for the {\em high plateau} and not the {\em slope} or {\em hole}.

\begin{figure}

\begin{center}
\begin{tabular}{|l|l|l|l|l|}
Gene type & $\alpha=0.9$ & $\alpha=0.5$ & Best avg ranks & Best merit \\
\hline
{\tt candidateBitSet}         & 18 & 106 & 1, 1, 1          & 0.373 \\
\hline
{\tt chanceOfNull}            &  0 &   3 & 24.1, 24.3, 25.4 & 0.166 \\
\hline
{\tt chanceOfTrue}            &  2 &   7 & 1, 3.9, 5.7      & 0.150 \\
\hline
{\tt lowerBound}              &  1 &   9 & 4.1, 5.6, 7.2    & 0.129 \\
\hline
{\tt methodWeight}            &  0 &  11 & 7.5, 7.8, 10.6   & 0.144 \\
\hline
{\tt numberOfCalls}           &  0 &   1 & 7.2, 8.4, 16.7   & 0.195 \\
\hline
{\tt numberOfValuePools}      &  0 &   6 & 14.1, 17.9, 20   & 0.186 \\
\hline
{\tt numberOfValues}          &  0 &  28 & 5.2, 5.2, 7.4    & 0.160 \\
\hline
{\tt upperBound}              &  8 &  16 & 1, 1, 1          & 0.267 \\
\hline
{\tt valuePoolActivityBitSet} & 10 & 252 & 1, 1.4, 2        & 0.267 \\
\hline
\end{tabular}
\end{center}

\caption{
}
\label{genes-merit-fig}
\end{figure}

Since our goal was to identify gene types that were not useful and
that could therefore possibly be eliminated from chromosomes, we
tabulated the properties of each of the different types of genes.
Figure \ref{genes-merit-fig} shows the result.  The three most
useful types of genes for the software under test were clearly
{\tt candidateBitSet},
{\tt valuePoolActivityBitSet}, and
{\tt upperBound},
since genes of these types sometimes had merit 90\% or more of
the maximum merit, often were ranked first or second in merit,
and were the only gene types such that some genes of that type
had merit greater than $0.2$ on some runs.  This result confirms
our intuition that changing the value reuse policy encoded in
the genes of type 
{\tt candidateBitSet} and
{\tt valuePoolActivityBitSet}
is an important way of improving the performance of a Nighthawk
chromosome.
The good performance of {\tt upperBound} is attributable to the
fact that for all the hash table-related units, adjusting this
parameter caused a ``load factor'' parameter in some of the
constructors to be able to take on values that led to the table
being reallocated frequently.

Just as clearly, the two least useful gene types for the
software under test were
{\tt chanceOfNull} and
{\tt numberOfValuePools}, since neither were ranked better than
than 14th in merit for any run.
This does not indicate that null values and multiple value pools
for each type were not important, only that changing the default
values of these genes (3\% chance of choosing null, and two
value pools) did not usually result in improved performance.

\subsection{Optimization and Performance Comparison}

In order to see whether our analysis was useful, we optimized
the Nighthawk code.
We decided to retain the four gene types with
the highest maximum merit
({\tt candidateBitSet},
{\tt valuePoolActivityBitSet},
{\tt upperBound}, and {\tt numberOfCalls}),
and also two other gene types:
{\tt lowerBound}, because it was paired with 
{\tt upperBound}; and
{\tt numberOfValues}, because it had the highest maximum
merit of the remaining well-ranked gene types.

Accordingly, we adjusted Nighthawk's code to delete all
references to the other four gene types
({\tt chanceOfNull},
{\tt numberOfValuePools},
{\tt chanceOfTrue}, and
{\tt methodWeight}).  We changed the code that depended on the
values stored for such genes in the current chromosome, so that
it instead used the default initial values for those gene types.
 
On each of the 16 Collection and Map classes,
we performed one run of the original Nighthawk,
and one run of the new, optimized version.
For each version of Nighthawk, we then
performed the same analysis that is reflected in Figures
\ref{collection-map-cov} and
\ref{collection-times};
that is, we asked RunChromosome to run 10 test cases with the
winning chromosome and measured the coverage, and we calculated
the clock time taken by Nighthawk to achieve the highest
coverage it achieved.  We then compared the original and
optimized Nighthawk.

The chromosomes resulting from the optimized Nighthawk achieved
slightly higher coverage than those from the original, though a
$t$ test concluded that the difference was not statistically
significant ($p=0.058$).  However, the optimized Nighthawk was
faster to a statistically significant degree ($p=0.010$).  The
time ratio between the original and optimized systems
was 1.46, i.e.\ the
optimized system was 46\% faster.  We can therefore say that
the process of analyzing the usefulness of the genes resulted
in a system that ran substantially faster but achieved the same
high coverage of the SUT.

\subsection{Discussion}

The feature subset selection and optimization procedure worked
well for us when we collected data from a set of classes and
then optimized Nighthawk for those classes.  This does not mean
that the optimized Nighthawk would necessarily work
well for other classes.  For instance, for some classes there
might be a great deal of code that is accessible only by giving
null pointers as parameters; we could expect our newly optimized
version of Nighthawk to perform poorly on such classes, since
there would be a fixed 3\% chance of choosing null.

However, the results of the optimization exercise indicate that
FSS-based analysis is a valuable adjunct to the GA in this
application.  It is for this reason that we envision in the
future integrating FSS into a genetic-random testing algorithm,
so that the algorithm can improve its performance dynamically,
while GA mutation and recombination is going on.

\section{Conclusions and Future Work}

Randomized unit testing is a promising technology that has been
shown to be effective, but whose thoroughness depends on the
settings of test algorithm parameters.  In this paper, we have
described Nighthawk, a system in which an upper-level genetic
algorithm automatically derives good parameter values for a
lower-level randomized unit test algorithm.  We have shown that
Nighthawk is able to achieve high coverage of complex Java units.
The code is available by writing to the first author.

XXXX why this is great

Future work includes the integration into Nighthawk of useful
facilities from past systems, such as failure-preserving or
coverage-preserving test case minimization, and further
experiments on the effect of program options on coverage and
efficiency.

%\begin{itemize}
%\item While the predicate coverage proposed by Visser et al.\ is
%  an interesting assessment criteria, there is no consensus in the
%  literature on the connection of this criteria to other measures.
%\item Static code analysis can direct the generation of the test
%  cases.  Our method, on the other hand, generates test cases at
%  random, so it is possible that we cover some code more than is
%  necessary, and that parts of our value pools are
%  useless.  Our view is that this is not a major issue since our
%  runtimes, on real-world systems, are quite impressive. 
%  Nevertheless, there needs to be more discussion on how to
%  assess test suite generation; i.e.\ runtimes versus
%  superfluous tests versus any other criteria.
%\end{itemize}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Reminder: the "draftcls", not "draft", class option should be used if
% it is desired that the figures are to be displayed while in draft mode.

% An example of a floating figure using the graphicx package.
% Note that \label must occur AFTER (or within) \caption.
% For figures, \caption should occur after the \includegraphics.
%
%\begin{figure}
%\centering
%\includegraphics[width=2.5in]{myfigure.eps}
%\caption{Simulation Results}
%\label{fig_sim}
%\end{figure}


% An example of a double column floating figure using two subfigures.
% (The subfigure.sty package must be loaded for this to work.)
% The subfigure \label commands are set within each subfigure command, the
% \label for the overall fgure must come after \caption.
% \hfil must be used as a separator to get equal spacing
%
%\begin{figure*}
%\centerline{\subfigure[Case I]{\includegraphics[width=2.5in]{subfigcase1.eps}
%\label{fig_first_case}}
%\hfil
%\subfigure[Case II]{\includegraphics[width=2.5in]{subfigcase2.eps}
%\label{fig_second_case}}}
%\caption{Simulation results}
%\label{fig_sim}
%\end{figure*}



% An example of a floating table. Note that, for IEEE style tables, the
% \caption command should come BEFORE the table. Table text will default to
% \footnotesize as IEEE normally uses this smaller font for tables.
% The \label must come after \caption as always.
%
%\begin{table}
%% increase table row spacing, adjust to taste
%\renewcommand{\arraystretch}{1.3}
%\caption{An Example of a Table}
%\label{table_example}
%\centering
%% The array package and the MDW tools package offers better commands
%% for making tables than plain LaTeX2e's tabular which is used here.
%\begin{tabular}{|c||c|}
%\hline
%One & Two\\
%\hline
%Three & Four\\
%\hline
%\end{tabular}
%\end{table}


% if have a single appendix:
%\appendix[Proof of the Zonklar Equations]
% or
%\appendix  % for no appendix heading
% do not use \section anymore after \appendix, only \section*
% is possibly needed

% use appendices with more than one appendix
% then use \section to start each appendix
% you must declare a \section before using any
% \subsection or using \label (\appendices by itself
% starts a section numbered zero.)
%
% Use this command to get the appendices' numbers in "A", "B" instead of the
% default capitalized Roman numerals ("I", "II", etc.).
% However, the capital letter form may result in awkward subsection numbers
% (such as "A-A"). Capitalized Roman numerals are the default.
%\useRomanappendicesfalse
%
\appendices
% you can choose not to have a title for an appendix
% if you want by leaving the argument blank
%\section{}
%Appendix two text goes here.

% use section* for acknowledgement
\section*{Acknowledgment}
% optional entry into table of contents (if used)
%\addcontentsline{toc}{section}{Acknowledgment}
Thanks to Willem Visser for making the source code of the Java
Pathfinder subject units available, and to the JDEAL development
team for their tool package.  Thanks also for interesting
comments and discussions to Rob Hierons, Charles Ling, Bob
Mercer and Andy Podgurski. 
This research is supported by the first author's grant from the
Natural Sciences and Research Council of Canada (NSERC).


% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section
% NOTE: BibTeX documentation can be easily obtained at:
% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/

% can use a bibliography generated by BibTeX as a .bbl file
% standard IEEE bibliography style from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/
 \bibliographystyle{IEEEtran.bst}
% argument is your BibTeX string definitions and bibliography database(s)
 \bibliography{my}
%
% <OR> manually copy in the resultant .bbl file
% set second argument of \begin to the number of references
% (used to reserve space for the reference number labels box)

% biography section
%
% If you had an eps/pdf photo file (graphicx package needed)
% the extra braces prevent the LaTeX parser from getting confused
% when it sees the complicated \includegraphics command within an
% optional argument. You can create your own macro to make things
% simpler here.
%\begin{biography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{mshell.eps}}]{Michael Shell}
% or if you just want to reserve a space for a photo:

\begin{IEEEbiography}{James H.\ Andrews}
received the BSc and MSc degrees in computer science from the
University of British Columbia, and the PhD degree  in computer
science from the University of Edinburgh.  He worked from 1982
to 1984 at Bell-Northern Research, from 1991 to 1995 at Simon
Fraser University, and from 1996 to 1997 on the FormalWare
project at the University of British Columbia, Hughes Aircraft
Canada and MacDonald Dettwiler.  He is currently an Associate
Professor in the Department of Computer Science at the
University of Western Ontario, in London, Canada, where he has
been since 1997.  His research interests include software
testing, semantics of programming languages, and formal
specification.  He is a member of the IEEE.
\end{IEEEbiography}

\begin{IEEEbiography}{Felix C.\ H.\ Li}
received the BSc and MSc degrees in Computer Science from the
University of Western Ontario, in 2004 and 2006 respectively.
He is currently a software developer based in Toronto.
\end{IEEEbiography}

\begin{IEEEbiography}{Tim Menzies}
received the CS and PhD degrees from the University of New South
Wales and is the author of more than 160 publications.  He is an
associate professor at the Lane Department of Computer Science
at West Virginia University (USA), and has been working with
NASA on software quality issues since 1998.  His recent research
concerns modeling and learning with a particular focus on
lightweight modeling methods.  His doctoral research explored
the validation of possibly inconsistent knowledge-based systems
in the QMOD specification language.  He is a member of the IEEE.
\end{IEEEbiography}

% if you will not have a photo
%\begin{biographynophoto}{John Doe}
%Biography text here.
%\end{biographynophoto}

% insert where needed to balance the two columns on the last page
%\newpage

%\begin{biographynophoto}{Jane Doe}
%Biography text here.
%\end{biographynophoto}

% You can push biographies down or up by placing
% a \vfill before or after them. The appropriate
% use of \vfill depends on what kind of text is
% on the last page and whether or not the columns
% are being equalized.

%\vfill

% Can be used to pull up biographies so that the bottom of the last one
% is flush with the other column.
%\enlargethispage{-5in}

% that's all folks
\end{document}
