%
%- add a note on page 4 about the apache classes and where this paper
%differs from prior work
%
%- add in some text around p24 about using best of the average (which
%got confused by a few outliers and led to the sub-optimal PROMISE
%result) vs average of the average (which led us to a new better
%result). unless you get there first with the text marked ABOVE as
%"!!!"
%
%- go over the reviewer comments and see what i can do with their smaller details
%
%- add in the apache results in the "threats to validity" section- it
%will be a nice external validity check
%
%%% bare_jrnl.tex
%%% V1.1
%% 2002/08/13
%% by Michael Shell
%% mshell@ece.gatech.edu
%%
%% NOTE: This text file uses MS Windows line feed conventions. When (human)
%% reading this file on other platforms, you may have to use a text
%% editor that can handle lines terminated by the MS Windows line feed
%% characters (0x0D 0x0A).
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.6 or later) with an IEEE journal paper.
%%
%% Support sites:
%% http://www.ieee.org
%% and/or
%% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/
%%
%% This code is offered as-is - no warranty - user assumes all risk.
%% Free to use, distribute and modify.

% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do  ***
% *** not appear when using other class files.                            ***
% Testflow can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/testflow


% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print. Authors are encouraged to use U.S. letter paper when
% submitting to IEEE. Use the testflow package mentioned above to verify
% correct handling of both paper sizes by the author's LaTeX system.
%
% Also note that the "draftcls", not "draft", option should be used if
% it is desired that the figures are to be displayed in draft mode.

% *************************************************************************
% You can now use this template both for submitting to peer review and,
% if your paper is accepted, for sending in final publication materials.
% To change from peer review format (single column, double-spaced) to
% final publication format (double column, single-spaced), just move the
% comment-line sign (%) from one \documentclass line to the other.
% The first is for peer review format (single column), the second is for final publication (double column).

\documentclass[12pt,journal,draftcls,letterpaper,onecolumn]{IEEEtran}
%\documentclass[9.5pt,journal,final,finalsubmission,twocolumn]{IEEEtran}
% *************************************************************************


% If the IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it:
% \documentclass[journal,draftcls,onecolumn]{../sty/IEEEtran}


% some very useful LaTeX packages include:
\usepackage{wrapfig}

 \usepackage{cite}      % Written by Donald Arseneau
                        % V1.6 and later of IEEEtran pre-defines the format
                        % of the cite.sty package \cite{} output to follow
                        % that of IEEE. Loading the cite package will
                        % result in citation numbers being automatically
                        % sorted and properly "ranged". i.e.,
                        % [1], [9], [2], [7], [5], [6]
                        % (without using cite.sty)
                        % will become:
                        % [1], [2], [5]--[7], [9] (using cite.sty)
                        % cite.sty's \cite will automatically add leading
                        % space, if needed. Use cite.sty's noadjust option
                        % (cite.sty V3.8 and later) if you want to turn this
                        % off. cite.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/cite/

 \usepackage{graphicx}  % Written by David Carlisle and Sebastian Rahtz
                        % Required if you want graphics, photos, etc.
                        % graphicx.sty is already installed on most LaTeX
                        % systems. The latest version and documentation can
                        % be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/graphics/
                        % Another good source of documentation is "Using
                        % Imported Graphics in LaTeX2e" by Keith Reckdahl
                        % which can be found as epslatex.ps and epslatex.pdf
                        % at: http://www.ctan.org/tex-archive/info/

%\usepackage{psfrag}    % Written by Craig Barratt, Michael C. Grant,
                        % and David Carlisle
                        % This package allows you to substitute LaTeX
                        % commands for text in imported EPS graphic files.
                        % In this way, LaTeX symbols can be placed into
                        % graphics that have been generated by other
                        % applications. You must use latex->dvips->ps2pdf
                        % workflow (not direct pdf output from pdflatex) if
                        % you wish to use this capability because it works
                        % via some PostScript tricks. Alternatively, the
                        % graphics could be processed as separate files via
                        % psfrag and dvips, then converted to PDF for
                        % inclusion in the main file which uses pdflatex.
                        % Docs are in "The PSfrag System" by Michael C. Grant
                        % and David Carlisle. There is also some information
                        % about using psfrag in "Using Imported Graphics in
                        % LaTeX2e" by Keith Reckdahl which documents the
                        % graphicx package (see above). The psfrag package
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/psfrag/

%\usepackage{subfigure} % Written by Steven Douglas Cochran
                        % This package makes it easy to put subfigures
                        % in your figures. i.e. "figure 1a and 1b"
                        % Docs are in "Using Imported Graphics in LaTeX2e"
                        % by Keith Reckdahl which also documents the graphicx
                        % package (see above). subfigure.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/subfigure/

%\usepackage{url}       % Written by Donald Arseneau
                        % Provides better support for handling and breaking
                        % URLs. url.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/other/misc/
                        % Read the url.sty source comments for usage information.

%\usepackage{stfloats}  % Written by Sigitas Tolusis
                        % Gives LaTeX2e the ability to do double column
                        % floats at the bottom of the page as well as the top.
                        % (e.g., "\begin{figure*}[!b]" is not normally
                        % possible in LaTeX2e). This is an invasive package
                        % which rewrites many portions of the LaTeX2e output
                        % routines. It may not work with other packages that
                        % modify the LaTeX2e output routine and/or with other
                        % versions of LaTeX. The latest version and
                        % documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/sttools/
                        % Documentation is contained in the stfloats.sty
                        % comments as well as in the presfull.pdf file.
                        % Do not use the stfloats baselinefloat ability as
                        % IEEE does not allow \baselineskip to stretch.
                        % Authors submitting work to the IEEE should note
                        % that IEEE rarely uses double column equations and
                        % that authors should try to avoid such use.
                        % Do not be tempted to use the cuted.sty or
                        % midfloat.sty package (by the same author) as IEEE
                        % does not format its papers in such ways.

 \usepackage{amsmath}   % From the American Mathematical Society
                        % A popular package that provides many helpful commands
                        % for dealing with mathematics. Note that the AMSmath
                        % package sets \interdisplaylinepenalty to 10000 thus
                        % preventing page breaks from occurring within multiline
                        % equations. Use:
 \interdisplaylinepenalty=2500
                        % after loading amsmath to restore such page breaks
                        % as IEEEtran.cls normally does. amsmath.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/



% Other popular packages for formatting tables and equations include:

% Frank Mittelbach's and David Carlisle's array.sty which improves the
% LaTeX2e array and tabular environments to provide better appearances and
% additional user controls. Array.sty is already installed on most systems.
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/

% Mark Wooding's extremely powerful MDW tools, especially mdwmath.sty and
% mdwtab.sty which are used to format equations and tables, respectively.
% The MDWtools set is already installed on most LaTeX systems. The latest
% version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/mdwtools/

% V1.6 of IEEEtran contains the IEEEeqnarray family of commands that can
% be used to generate multiline equations as well as matrices, tables, etc.


% Also of notable interest:

% Scott Pakin's eqparbox package for creating (automatically sized) equal
% width boxes. Available:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/eqparbox/




% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex).         ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.


% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}
\newcommand{\bi}{\begin{itemize}}
\newcommand{\ei}{\end{itemize}}
\newcommand{\bd}{\begin{description}}
\newcommand{\ed}{\end{description}}
\newcommand{\bbi}{\begin{itemize}}
\newcommand{\eei}{\end{itemize}}
\newcommand{\be}{\begin{enumerate}}
\newcommand{\ee}{\end{enumerate}}
\newcommand{\fig}[1]{Figure~\ref{fig:#1}}
\newcommand{\eq}[1]{Equation~\ref{eq:#1}}
\newcommand{\tion}[1]{\S\ref{sec:#1}}
\begin{document}
%
% paper title
\title{Genetic Algorithms for Randomized Unit Testing}
%
%
% author names and IEEE memberships
% note positions of commas and nonbreaking spaces ( ~ ) LaTeX will not break
% a structure at a ~ so this keeps an author's name from being broken across
% two lines.
% use \thanks{} to gain access to the first footnote area
% a separate \thanks must be used for each paragraph as LaTeX2e's \thanks
% was not built to handle multiple paragraphs
\author{James~H.~Andrews,~\IEEEmembership{Member,~IEEE,}
        Tim~Menzies,~\IEEEmembership{Member,~IEEE,}
        and~Felix~C.~H.~Li% <-this % stops a space
\thanks{Manuscript received January 1, 2009; revised September 4, 2009.}% <-this % stops a space
\thanks{J.\ Andrews and F.\ Li are with the Department of Computer Science, University of Western Ontario, London, Ont., Canada, N6A 2B7.  E-mail: andrews@csd.uwo.ca.}%
\thanks{T.\ Menzies is with the Lane Department of Computer Science and  Electrical Engineering, West Virginia University, Morgantown, WV 26506-610.  E-mail: tim@menzies.us.}}
% note the % following the last \IEEEmembership and also the first \thanks -
% these prevent an unwanted space from occurring between the last author name
% and the end of the author line. i.e., if you had this:
%
% \author{....lastname \thanks{...} \thanks{...} }
%                     ^------------^------------^----Do not want these spaces!
%
% a space would be appended to the last name and could cause every name on that
% line to be shifted left slightly. This is one of those "LaTeX things". For
% instance, "A\textbf{} \textbf{}B" will typeset as "A B" not "AB". If you want
% "AB" then you have to do: "A\textbf{}\textbf{}B"
% \thanks is no different in this regard, so shield the last } of each \thanks
% that ends a line with a % and do not let a space in before the next \thanks.
% Spaces after \IEEEmembership other than the last one are OK (and needed) as
% you are supposed to have spaces between the names. For what it is worth,
% this is a minor point as most people would not even notice if the said evil
% space somehow managed to creep in.
%
% The paper headers
\markboth{IEEE Transactions on Software Engineering,~Vol.~1, No.~1,~January~2001}{Andrews \MakeLowercase{\textit{et al.}}: Using a Genetic Algorithm to Control Randomized Unit Testing}
% The only time the second header will appear is for the odd numbered pages
% after the title page when using the twoside option.


% If you want to put a publisher's ID mark on the page
% (can leave text blank if you just want to see how the
% text height on the first page will be reduced by IEEE)
 \pubid{0000--0000/00\$00.00~\copyright~2002 IEEE}

% use only for invited papers
%\specialpapernotice{(Invited Paper)}

% make the title area
\maketitle


\begin{abstract}
  Randomized testing is an effective method for
  testing software units.   Thoroughness of randomized
  unit testing varies widely according to the settings of certain
  parameters, such as the relative frequencies with which methods
  are called.  In this paper, we describe Nighthawk, a  system which uses a
  genetic algorithm (GA) to find parameters for randomized unit testing
  that optimize test coverage.  

Designing  GAs is somewhat of a black art. We therefore use
a feature subset selection (FSS)
tool to assess the size and content of the representations within the GA.
Using that tool,
we can prune back 90\% of our GA's mutators  while still achieving most
of the coverage found using  all the mutators. Our pruned GA achieves almost
the same results as the full system,  but in only 10\% of the time.
These results suggest that FSS for mutator pruning could significantly
optimize meta-heuristic search-based software engineering tools.
\end{abstract}

\begin{keywords}
Software testing, randomized testing, genetic algorithms,
feature subset selection, search-based optimization
\end{keywords}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Introduction}

% XXX NOT MORE THAN 35 PAGES IN THIS FORMAT

\IEEEPARstart{S}{oftware} testing involves running a piece of software (the
software under test, or SUT) on selected input data, and
checking the outputs for correctness.  The goals of software
testing are to force failures of the SUT, and to be thorough.
The more thoroughly we have tested an SUT without forcing
failures, the more sure we are of the reliability of the SUT.

Randomized testing uses  randomization for
some aspects of test input data selection.
% A software unit is a
% method, group of methods, module or class; a unit test is
% usually built upon a sequence of method calls.
Several studies
\cite{miller-fuzz-cacm,andrews-etal-rute-rt,%
pacheco-etal-icse2007,groce-etal-icse2007}
have found that randomized
testing of software units is effective at forcing failures in
even well-tested units.  However, there remains a question of
the thoroughness of
randomized testing.  Using
various code coverage measures to measure thoroughness,
researchers have come to varying conclusions about the ability
of randomized testing to be thorough
\cite{michael-etal-ga-tcg,visser-etal-issta06,andrews-etal-rute-rt}.

The thoroughness of randomized unit testing is dependent
on when and how randomization is
applied; e.g., the number of method calls
to make, the relative frequency with which different methods are
called, and the ranges from which numeric arguments are chosen.
The manner in which previously-used arguments or
previously-returned values are used in new method calls,
which we call the {\it value reuse policy}, is also
a crucial factor.  It is often difficult to work out the optimal
values of the parameters and the optimal value reuse policy by hand.

This paper describes the Nighthawk unit test data generator.
Nighthawk has two
levels.  The lower level is a randomized unit testing engine
which tests a set of methods according to parameter values
specified as genes in a chromosome, including parameters that
encode a value reuse policy.  The upper level is a genetic
algorithm (GA) which uses fitness evaluation, selection,
mutation and recombination of chromosomes to find good values
for the genes.  Goodness is evaluated on the basis of test
coverage and number of method calls performed.
Users can use Nighthawk to find good parameters, and then
perform randomized unit testing based on those parameters.  The
randomized testing can quickly generate many new test cases that
achieve high coverage, and can continue to do so for as long as
users wish to run it.

This paper also discusses optimization techniques for GA tools like
Nighthawk. Using feature subset selection techniques, we show that
we can prune many of Nighthawk's mutators (gene types) without
compromising coverage. The pruned Nighthawk
tool achieves nearly the same coverage as full Nighthawk (90\%)
and does so ten times faster. Therefore, we recommend that 
meta-heuristic
search-based SE tools should also routinely perform subset selection.
% XXXX why this is great

\subsection{Randomized Unit Testing}

Unit testing is variously defined as the
testing of a single method, a group of methods, a module or a
class.  We will use it in this paper to mean the testing of a
group $M$ of methods, called the {\it target methods}.  A unit
test is a sequence of calls to the target methods, with each
call possibly preceded by code that sets up the arguments and
the receiver\footnote{
  We use the word ``receiver'' to refer to the object that
  a method is called on.  For instance, in the Java method
  call ``{\tt t.add(3)}'', the receiver is {\tt t}.
}, and
with each call possibly followed by code that stores and checks results.

{\it Randomized unit testing} is unit testing where there is
some randomization in the selection of the target
method call sequence and/or arguments to the
method calls.  Many researchers
\cite{doong-frankl-tosem94,antoy-hamlet-tse-jan2000,%
claessen-hughes-quickcheck,pacheco-etal-icse2007,%
sen-etal-cute,visser-etal-issta06, andrews-etal-rute-rt}
have performed randomized unit testing,
sometimes combined with other tools such as model
checkers.
%
A key concept in randomized unit testing is that of {\it value
reuse}.  We use this term to refer to how the
testing engine reuses the receiver, arguments or return values
of past method calls when making new method calls.  In previous
research, value reuse has mostly taken the form of making a
sequence of method calls all on the same receiver object.

In our previous research, we developed a GUI-based randomized
unit testing engine called RUTE-J \cite{andrews-etal-rute-rt}.
To use RUTE-J, users write their own customized test wrapper
classes, hand-coding such parameters as relative frequencies of
method calls.  Users also hand-code a value reuse policy by
drawing receiver and argument values from value pools, and
placing return values back in value pools.  Finding good
parameters quickly, however, requires experience with the tool.

The Nighthawk system described here significantly
builds on this work by automatically determining good parameters.
The lower, randomized-testing,
level of Nighthawk initializes and maintains one or more value
pools for all relevant types, and draws and replaces values in
the pools according to a policy specified in a chromosome.
Chromosomes specify relative frequencies of methods,
method parameter ranges, and other testing parameters.
The upper, genetic-algorithm, level searches for the
parameter settings that increase the coverage seen in the lower level.
The information that Nighthawk uses about the SUT is only
information about type names, method names, parameter types,
method return types, and which classes are subclasses of others;
this makes its general approach robust and adaptable to other languages.

Designing a GA means making decisions about what features are
worthy of modeling and mutating.  For example, much of the effort
on this project was a laborious trial-and-error process of
trying different types of genes within a chromosome.  To simplify that
process, we
describe experiments here with automatic feature subset
selection (FSS), which led us to
propose that automatic
feature subset selection should be a routine part of the design
of any large GA system.
% First, we try a very large and elaborate set of
%gene types. Next, we filter that set using automatic feature
%subset selection. The filtered set ran over 40\% more quickly, with
%no loss of coverage.  We therefore propose that automatic
%feature subset selection should be a routine part of the design
%of any large GA system.

\subsection{Contributions and Paper Organization}

The main contributions of this paper are as follows.
\begin{enumerate}
\item We describe Nighthawk, a novel two-level
  genetic-random testing system that 
  encodes a value reuse policy in a manner
  amenable to meta-heuristic search.
\item We demonstrate the value of feature subset selection (FSS) for
  optimizing genetic algorithms.  Using FSS, we can prune 
  Nighthawk's gene types while achieving nearly the same coverage.
  Compared to full Nighthawk, this coverage is achieved
  ten times faster.
\item We offer evidence for the external validity of our FSS-based optimization:
the optimization learned from one set of classes (Java utils) also works when applied to
another set of classes
(classes from the Apache system).
\end{enumerate}

We discuss related work in Section
\ref{related-work-section}.
Section \ref{system-description-section}
describes  our system,
and
%An empirical comparison of our work to three other published systems
%is in Section
%\ref{comparison-section}.
%A case study of applying Nighthawk to a real-world set of units
%(the Java collection and map classes) is described in Section
%\ref{case-study-section}; this section also contains the results
%of experiments we ran to determine the best default settings for
%two Nighthawk parameters.
gene type pruning using feature subset selection is described
in Section
\ref{optimizing-section}.
Section
\ref{sec:chars} describes the performance of optimized
versions of Nighthawk on subject software.
Section
\ref{threats-section} describes threats to validity, and Section
\ref{conclusions-section} concludes.

This paper differs from prior publications as follows:
\bi
\item The original Nighthawk publication~\cite{andrews07} studied its
  effectiveness only on classes from {\tt java.util}.
\item A subsequent paper~\cite{andrews-menzies-promise09} offered an
  FSS-based optimization. That paper found ways to prune 60\% of the gene
  types when trying to cover the {\tt java.util} classes. 
\item This paper shows that that FSS-based optimizer was sub-optimal. We
  present here a better optimization method that prunes 90\% of the gene
  types (see \S{\ref{sec:rerank}}). An analysis of that optimized system
  in \S{\ref{sec:opt}} revealed further improvements.  We have
  successfully tested  the optimized and improved version of Nighthawk on
  Apache Commons Collections classes (see \S{\ref{sec:apache}}).
\ei
\section{Related Work}
\label{related-work-section}

\subsection{Randomized Unit Testing}

``Random'' or ``randomized'' testing has a long history, being
mentioned as far back as 1973 \cite{hetzel-book-1973};
Hamlet \cite{Hamlet94randomtesting} gives a good survey.
The key benefit of randomized testing is the ability to generate
many distinct test inputs in a short time, including test inputs
that may not be selected by test engineers but which may
nevertheless force failures.  There are, however, two main
problems with randomized testing:  the oracle problem and
the question of thoroughness.

Randomized testing depends on the generation of so many
inputs that it is infeasible to get a human to check all test
outputs.  
An automated test oracle \cite{weyuker-oracles} is
needed.  
There are two main approaches to the oracle problem.
The first
is to use general-purpose, ``high-pass'' oracles that pass many
executions but check properties that should be true of most
software.  For instance, Miller et al.\ \cite{miller-fuzz-cacm}
judge a randomly-generated GUI test case as failing only if the
software crashes or hangs; Csallner and
Smaragdakis \cite{jcrasher-spe} judge a randomly-generated unit
test case as failing if it throws an exception; and Pacheco
et al.\ \cite{pacheco-etal-icse2007} check general-purpose
contracts for units, such as one that states that a method
should not throw a ``null pointer'' exception unless one of its
arguments is null.  Despite the use of high-pass oracles,
all these authors found randomized testing to be effective in
forcing failures.
%
The second approach to the oracle problem for randomized testing
is to write oracles in order to check properties specific to the
software \cite{andrews-etal-rute-rt,ciupa-etal-icse2008}.
These oracles, like all formal unit specifications,
are non-trivial to write; tools such as Daikon for automatically
deriving likely invariants \cite{ernst-daikon} could help here.

Since randomized unit testing does not use any intelligence to
guide its search for test cases, there has always been
justifiable concern about how thorough it can be, given various
measures of thoroughness, such as coverage and fault-finding
ability.
%
Michael et al.\ \cite{michael-etal-ga-tcg} performed randomized testing
on the well-known Triangle program; this program accepts three
integers as arguments, interprets them as sides of a triangle,
and reports whether the triangle is equilateral, isosceles,
scalene, or not a triangle at all.  They concluded that
randomized testing could not achieve 50\% condition/decision
coverage of the code, even after 1000 runs.
%
Visser et al.\ \cite{visser-etal-issta06}
compared randomized unit testing with various model-checking
approaches and found that while randomized testing was good at
achieving block coverage, it failed to achieve optimal coverage
for a measure derived from
Ball's predicate coverage \cite{ball-pred-coverage}.

Other researchers, however, have found that the thoroughness of
randomized unit testing depends on the implementation.
%
Doong and Frankl \cite{doong-frankl-tosem94} tested several
units using randomized sequences of method calls. By
varying some parameters of the randomized testing, they could
greatly increase/decrease the likelihood of finding injected
faults.  The parameters included number of operations performed,
ranges of integer arguments, and the relative frequencies of
some of the methods in the call sequence.
%
Antoy and Hamlet \cite{antoy-hamlet-tse-jan2000}, who checked
the Java {\tt Vector} class against a formal specification
using random input, similarly found that if they avoided calling
some of the methods (essentially setting the relative frequencies of those
methods to zero), they could cover more code in the class.
%
Andrews and Zhang \cite{andrews-zhang-tse2003}, performing
randomized unit testing on C data structures, found that varying
the ranges from which integer key and data parameters were
chosen increased the fault-finding ability of the random testing.

Pacheco
et al.\ \cite{pacheco-etal-icse2007} show that randomized
testing can be
enhanced via
randomized
breadth-first search of the search space of possible test cases,
pruning branches that lead to redundant or illegal values which
would cause the system to waste time on unproductive test cases.

Of the cited approaches, the approach described in this paper
is most similar to Pacheco et al.'s.  The primary difference
is that we achieve thoroughness by generating long sequences of
method calls on different receivers, while they do so by
deducing shorter sequences of method calls on a smaller set of
receivers.
%
The focus of our research is also different.
Pacheco et al.\ focus on identifying contracts for units and
finding test cases that violate them.  In contrast, we focus on
maximizing code coverage; coverage is an objective measure of
thoroughness that applies regardless of whether failures have
been found, for instance in situations in which most bugs have
been eliminated from a unit.

\subsection{Analysis-Based Test Data Generation Approaches}

Approaches to test data generation via symbolic execution 
date back
to 1976
\cite{clarke-76-testdata,king-symbolic-1976}; 
typically these approaches generate a thorough set of
test cases by deducing which combinations of inputs will cause
the software to follow given paths.
TESTGEN \cite{korel-testgen}, for example,
transforms each condition in the program to one of the form
$e < 0$ or $e \leq 0$, and then searches for values that minimize
(resp.\ maximize) $e$, thus causing the condition to become true (resp.\ false).

Other source code analysis tools have applied
iterative relaxation of a set of constraints on
input data \cite{gupta-etal-gen-test-data} and generation
of call sequences using goal-directed
reasoning \cite{leow-etal-icse2004}.  Some recent approaches use
model checkers such as Java Pathfinder \cite{visser-etal-issta04}.
These approaches are sometimes augmented with ``lossy''
randomized search for paths, as in
the DART and CUTE systems \cite{godefroid-etal-dart,sen-etal-cute},
the Lurch system \cite{owen-menzies-lurch},
and the Java Pathfinder-based
research of Visser et al.\ \cite{visser-etal-issta06}.

Some analysis-based approaches limit the range of
different conditions they consider; for instance, TESTGEN's
minimization strategy \cite{korel-testgen} cannot be applied
to conditions involving pointers.  In addition, most
analysis-based approaches incur heavy memory and processing time
costs.
These limitations are the primary reason why researchers
have explored the use of heuristic and metaheuristic approaches
to test case generation.

\subsection{Genetic Algorithms for Testing}

Genetic algorithms (GAs) were first described by Holland
\cite{holland-ga-book}.  Candidate
solutions
are represented
as ``chromosomes'', with solution components
represented as ``genes'' in the
chromosomes.  The possible chromosomes form a search space and
are associated with a fitness function
representing the value of solutions encoded in the chromosome.  Search
proceeds by evaluating the fitness of each of a population of
chromosomes, and then performing point mutations and
recombination on the successful chromosomes.
GAs can defeat
purely random search in finding solutions to complex
problems. Goldberg \cite{goldberg-ga-book} argues that their
power stems from being able to engage in ``discovery and
recombination of building blocks'' for solutions in a solution space.

Meta-heuristic search methods such as GAs have often been
applied to the problem of test suite generation.
In Rela's review of 122 applications of meta-heuristic search
in SE \cite{rela04}, 44\% of the
applications related to testing.  Approaches to GA test suite
generation can be black-box (requirements-based) or white-box
(code-based); here we focus on four representative white-box
approaches, since our approach focuses on increasing coverage, and
is therefore
also white-box.

Pargas et al.\ \cite{pargas-etal-ga-tcg} represent a set of
test data as a chromosome, in which each gene encodes one
input value.
Michael et al.\ \cite{michael-etal-ga-tcg} represent test data
similarly, and conduct experiments comparing various strategies
for augmenting the GA search.  Both of these approaches
evaluate the fitness of a chromosome by measuring
how close the input is to covering some desired statement or
condition direction.
Guo et al.\ \cite{guo-etal-genetic} generate unique input-output
(UIO) sequences for protocol testing using a genetic algorithm;
the sequence of genes represents a sequence of inputs to a
protocol agent, and the fitness function computes a measure
related to the coverage of the possible states and transitions
of the agent.  Finally, Tonella's approach to class testing
\cite{tonella-issta04}
represents the sequence of method calls in a unit test as a
chromosome; the approach features customized mutation operators,
such as one that inserts method invocations.

\subsection{Nighthawk}

In work reported in \cite{andrews07}, we developed
Nighthawk, the two-level genetic-random test data generation
system explored further in this paper, and carried out
experiments aimed at comparing it with previous research and
finding the optimal setting of program switches.

Unlike the methods discussed above,
Nighthawk's genetic algorithm does not result in a single test
input.
Instead, it finds settings to parameters which control
aspects of randomized testing.  We designed
Nighthawk by identifying aspects of the basic randomized
testing algorithm that would benefit from being controlled by
parameters, and then encoding each parameter as a gene in
a chromosome.  Details of the design of Nighthawk are presented
in Section \ref{system-description-section}.

Our initial empirical studies showed that, when run on the
subject software used by
Michael et al.\ \cite{michael-etal-ga-tcg}, Nighthawk reached
100\% of feasible condition/decision coverage on average after
8.5 generations.  They also showed that, when run on the subject
software used by
Visser et al.\ \cite{visser-etal-issta06} and
Pacheco et al.\ \cite{pacheco-etal-icse2007},
Nighthawk achieved the same coverage in a comparable amount of
time.  Finally, our studies showed that Nighthawk could achieve
high coverage (82\%) automatically, when using the best setting
of system parameters, when run on the 16 Collection and Map
classes from the {\tt java.util} package in Java 1.5.0.  These
results encouraged us to explore Nighthawk further.  The full
empirical results are described in \cite{andrews07}.


\subsection{Analytic Comparison of Approaches}

Once a large community starts
comparatively evaluating some technique, then
{\em evaluation methods} for different methods become just as
important as the {\em generation of new methods}.
%To place this comment in an historical perspective, we note that
%evaluation bias is an active research area in the field of data
%mining \cite{bouck03,demsar06}.
%Much of our future work should hence focus on a meta-level
%analysis of the advantages and
%disadvantages of different assessment criteria.
Currently, there are no clear conventions on how this
type of work should be assessed.  However, we can attempt some
analytic comparisons.



It is clear that there are situations in which a source code
analysis-based approach such as symbolic evaluation or model
checking will be superior to any randomized
approach.  For instance, for an {\tt if} statement decision
of the form {\tt (x==742 \verb/&&/ y==113)}, random search of
the space of all possible {\tt x,y} pairs is unlikely to
produce a test case that executes the decision in the true
direction, while a simple analysis of the source code will be
successful.  The question is how often these situations
arise in real-world programs.  The Nighthawk system of this
paper cannot guess at constants like 742, but is still able
to cover the true direction of decisions of the form
{\tt x==y} because the value reuse policies it discovers will
often choose {\tt x} and {\tt y} from the same value pool.
%
It is therefore likely that randomized testing and
analysis-based approaches
have complementary strengths.
Groce et al.\ \cite{groce-etal-icse2007} conclude that
randomized testing is a good first step, before model checking,
in achieving high quality software.
%, especially where the
%existence of a reference implementation allows differential
%randomized testing \cite{mckeeman-differential}.



%\begin{itemize}
%\item Static code analysis can direct the generation of the test
%  cases.  Our method, on the other hand, generates test cases at
%  random, so it is possible that we cover some code more than is
%  necessary, and that parts of our value pools are  useless.
%  Our view is that this is not a major issue since our
%  runtimes, on real-world systems, are quite impressive.
%  Nevertheless, there needs to be more discussion on how to
%  assess test suite generation; i.e.\ runtimes versus
%  superfluous tests versus any other criteria.
%\end{itemize}

\begin{wrapfigure}{r}{2.5in}
\centering
%\includegraphics[width=2.5in]{myfigure.pdf}
\includegraphics[width=2.5in]{xyPlot.pdf}
\caption{Spiky search space resulting from poor fitness function.}
\label{spiky-search-space-fig}
\end{wrapfigure}
Genetic algorithms do not perform well when the search space is mostly flat, with steep jumps 
in fitness score.
Consider the problem of finding integer inputs
$x$ and $y$ that cover the true direction of the decision
``$x$\verb/==/$y$''.  If we cast the problem as a search for the
two values, and the score as whether we have found
two equal values, the search space is shaped as in
Figure \ref{spiky-search-space-fig}: a flat plain
of zero score
with spikes along the diagonal.  Most approaches
to GA white-box test data generation use
fitness functions that detect ``how close''
the target decision is to being true, often using analysis-based
techniques.  For instance, Michael et al.\ \cite{michael-etal-ga-tcg}
use fitness functions that specifically take account of such
conditions
by measuring how close {\tt x} and {\tt y} are.
Watkins and Hufnagel \cite{watkins-hufnagel-fitness} enumerate
and compare fitness functions proposed for GA-based test case
generation.

\begin{wrapfigure}{r}{2.5in}
\centering
%\includegraphics[width=2.5in]{myfigure.eps}
\includegraphics[width=2.5in]{lohiPlot.pdf}
\caption{Smooth search space resulting from recasting problem.}
\label{smooth-search-space-fig}
\end{wrapfigure}
In contrast, we recast the problem as a
search for the best values of $lo$ and $hi$ that
will be used as the lower and upper bound for random generation
of $x$ and $y$, and the score as whether we have generated two equal
values of $x$ and $y$.  Seen in this way, the search space landscape
still contains a spiky ``cliff'', as seen in
Figure \ref{smooth-search-space-fig}, but the cliff is
approached on one side by a gentle slope.

If the inputs in Figure \ref{spiky-search-space-fig} were
floating-point numbers (not integers),  the search space would
consist of a flat plain of zero score with a thin, sharp ridge along the
diagonal.  In this case the solution depicted in Figure
\ref{smooth-search-space-fig} would yield only a small improvement.
This is where value pools come in.  We draw each parameter value from a value pool
of finite size; each numeric value pool
has size $s$ and bounds $lo$ and $hi$, and is
initially seeded with values drawn randomly within $lo$
to $hi$.  For the example problem, we will be more likely to
choose equal $x$ and $y$ as $s$ becomes smaller, regardless of
the value of $lo$ and $hi$ and regardless of whether the values
are integers or floating-point numbers, because the smaller the
value pool, the more likely we are to pick the same value for
$x$ and $y$.

This approach generalizes to non-numeric data.  Each type of interest is
associated with one or more value pools, the number and size of which
are controlled by genes.  At any time, a value in a value pool may
be chosen as the receiver or parameter of a method call, which may in turn
change the value in the pool.  Also, at any time a value in a value pool
may be replaced by a return value from a method call.  Which value pools
are drawn on, and which value pools receive the
return values of which methods, are also controlled by genes.  A test case
may consist of hundreds of randomized method calls, culminating in the
creation of values in value pools which, when used as parameters to a
method call, cause that method to execute code not executed before.  
Changing gene values therefore makes this more/less likely to happen.

To the best of our knowledge, each run of previous GA-based tools
has resulted in a single
test case, which is meant to reach a particular target.  A test
suite is built up by aiming the GA at different targets, resulting
in a fixed-size test suite that achieves coverage of all targets.
%Herein lies a potential disadvantage of such approaches.
However,
Frankl and Weiss \cite{frankl-weiss-tse93}
and Andrews and Siami Namin \cite{andrews-siami-issta09}
have shown that both size and
coverage exert an influence over test suite effectiveness, and
Rothermel et al.\ \cite{rothermel-etal-effects-minimization}
have shown that reducing test suite size while preserving
coverage can significantly reduce its fault
detection capability.  Therefore, given a choice between three
systems achieving the same coverage: (a) one that generates one
fixed set of test cases, (b) one that generates many different test
cases slowly, and (c) one that generates many different test cases
quickly, (c) is the optimal choice.
%  Any deterministic algorithm
%for generating test cases is in class (a), whereas a
A GA which
generates one test case per run is in class (a) or (b) (it may generate
different test cases on each run as a result of random mutation).
In contrast,
Nighthawk is in class (c) because each run of the GA results only
in a setting of randomized testing parameters that achieves high
coverage; new high-coverage test suites can then be generated
quickly at low cost.
%  Our previous research
%\cite{andrews-zhang-tse2003,andrews-ase2004} indicates that under such
%conditions, randomized testing can be highly effective.

%\begin{center}
%\begin{tabular}{c|ccccccc|ccccc}
%$x|y$ & 1 & 2 & 3 & 4 & 5 & ~~~ &
%$lo|hi$ & 1 & 2 & 3 & 4 & 5 \\
%\cline{1-6} \cline{8-13}
%1 & 1.0 & .00 & .00 & .00 & .00 & &
%1 & 1.0 & .50 & .33 & .25 & .20 \\
%2 & .00 & 1.0 & .00 & .00 & .00 & &
%2 & .00 & 1.0 & .50 & .33 & .25 \\
%3 & .00 & .00 & 1.0 & .00 & .00 & &
%3 & .00 & .00 & 1.0 & .50 & .33 \\
%4 & .00 & .00 & .00 & 1.0 & .00 & &
%4 & .00 & .00 & .00 & 1.0 & .50 \\
%5 & .00 & .00 & .00 & .00 & 1.0 & &
%5 & .00 & .00 & .00 & .00 & 1.0 \\
%\end{tabular}
%\end{center}
%
%\caption{Chance of selecting two identical integers $x$ and $y$.
%(left) Search space as space of $(x,y)$ pairs.
%(right) Search space as space of lower and upper bounds
%for random generation of $x$ and $y$.}
%\label{search-space-fig}
%\end{figure*}

All analysis-based
approaches share the disadvantage of requiring a robust parser
and/or an analyzer for source code, byte code or machine code
that can be updated to reflect changes
in the source language.
%  Such tools are not often
%provided by language providers.
As an example from the domain
of formal specification, Java 1.5 was released in 2004, but as
of this writing, the widely-used specification language JML
does not fully support Java 1.5 features \cite{cok-adapting-jml}.
Our approach does not require source code or bytecode analysis,
instead depending only on class and method parameter information
(such as that supplied by the robust Java reflection mechanism)
and commonly-available coverage tools.
For instance, our code was
initially written with Java 1.4 in mind, but worked seamlessly
on the Java 1.5 versions of the {\tt java.util} classes, despite
the fact that the source code of many of the units had been
heavily modified to introduce templates.  However,
model-checking approaches have other strengths,
such as the ability to analyze multi-threaded code
\cite{havelund-pathfinder}, further supporting the conclusion
that the two approaches are complementary.

% Look up Baresel paper BSS02!!

%We also generalize the problem domain from the heuristic to the
%meta-heuristic by adding the GA level.
%
%\section{Exploratory Study}
%\label{exploratory-section}
%
%To test the merit of a 
%genetic-random system, we conducted an exploratory study.
%In this section, we describe the prototype software we developed,
%the design of the study and its results.
%
%\subsection{Software Developed}
%
%Using code from RUTE-J (see above) and the open-source genetic
%algorithm package JDEAL \cite{jdeal}, we constructed a prototype
%two-level genetic-random unit testing system that took Java
%classes as its testing units.  For each unit under test (UUT) with
%$n$ methods to call, the GA level constructed a chromosome with
%$2+n$ integer genes; these genes represented
%the number of method calls to make in each
%test case, the number of test cases to generate, and the
%relative weights (calling frequencies) of the $n$ methods.  All
%other randomized testing parameters
%were hard-coded in the test wrappers.
%
%The evaluation of the fitness of each chromosome $c$ proceeded as
%follows.  We got the random testing level to generate the number
%of test cases of the length specified in $c$, using
%the method weights specified in $c$.  We then
%measured the number of coverage points covered using
%Cobertura \cite{cobertura-website},
%which measures line coverage.  If we had based the
%fitness function {\it only} on coverage, however, then any chromosome
%would have benefitted from having a larger number of
%method calls and test cases, since every new method call has the
%potential of covering more code.
%We therefore built in a brake to prevent these values from
%getting unfeasibly high.  We calculated the fitness function
%as:
%\begin{center}
%(number of coverage points covered) $*$ (coverage factor) \\
% $-$ (number of method calls performed overall)
%\end{center}
%We set the coverage factor to 1000, meaning that we were willing
%to make 1000 more method calls (but not more) if that meant
%covering one more coverage point.
%
%\subsection{Experiment Design}
%
%For subject programs, we used three units from the Java
%1.4.2 edition of {\tt java.util}: {\tt BitSet}, {\tt HashMap}
%and {\tt TreeMap}.  These units are widely used,
%and {\tt TreeMap} had been used in the
%experiments of other researchers \cite{visser-etal-issta04}.
%For each UUT, we wrote a
%test wrapper class containing methods that called selected
%target methods of the UUT (16 methods for BitSet, 8 for HashMap
%and 9 for TreeMap).
%Each wrapper contained a simple oracle for checking correctness.
%We instrumented each UUT using Cobertura.
%
%We ran the two-level algorithm 30 times on each of the three
%test wrappers, and recorded the amount of time taken and the
%parameters in the final chromosome.
%To test whether the weights in the chromosomes were
%useful given the length and number of method calls, for each
%final chromosome $c$ we created a variant chromosome $c'$
%with the same length and number of method calls but with all
%weights equal.  We then compared the coverage achieved by
%$c$ and $c'$ on 30 paired trials.  Full results from the
%experiment are available in \cite{cli-msc}.
%
%\subsection{Results}
%
%We performed two statistical tests to evaluate whether the
%system was converging on a reasonable solution.  First, we
%ordered the average weights discovered for each method in each
%class, and performed a $t$ test
%with Bonferroni correction between each pair of
%adjacent columns.
%We found that for the {\tt HashMap} and {\tt TreeMap} units, the
%{\tt clear} method (which removes all data from the map) had a
%statistically significantly lower weight than the other methods,
%indicating that the algorithm was consistently converging on a
%solution in which it had a lower weight.
%This is because much of the code in these
%units can be executed only when there is a large amount of data
%in the container objects.  Since the {\tt clear} method clears
%out all the data, executing it infrequently ensured that
%the objects would get large enough.
%
%We also found that for the {\tt TreeMap} unit, the {\tt remove}
%and {\tt put} methods had a statistically significantly higher
%weight than the other methods.  This is explainable by the large
%amount of complex code in these methods and the private methods
%that they call; it takes more calls to
%cover this code than it does for the simpler code of
%the other methods.  Another reason is that sequences of {\it put}
%and {\it remove} were needed to create data structures through
%which code in some of the other methods was accessible.
%
%The second statistical test we performed tested
%whether the weights found by the GA were efficient.
%For this, we used the 30 trials comparing the discovered
%chromosome $c$ and the equal-weight variant $c'$.  We found that
%for all three units, the equal-weight chromosome covered less
%code than the
%original, to a statistically significant level (as measured by a
%$t$ test with $\alpha=0.05$).  This can be interpreted as
%meaning that the GA was correctly choosing a good {\it combination}
%of parameters.
%
%In the course of the experiment, we found a bug in the Java
%1.4.2 version of {\tt BitSet}:  when a call to {\tt set()} is
%performed on a range of bits of length 0, the unit could later
%return an incorrect ``length'' for the {\tt BitSet}.
%We found that a bug report for this bug
%had already been submitted to Sun's bug database.  It has been
%corrected in the Java 1.5.0 version of the library.
%
%In summary, the experiment indicated that the two-level
%algorithm was potentially useful, and was consistently
%converging on similar solutions that were more optimal than
%calling all methods equally often.

\section{Nighthawk:  System Description}
\label{system-description-section}

\begin{wrapfigure}{r}{3in}
\includegraphics[width=3in]{valuepools.pdf}
\caption{
{ Value pool initialization and use.  Stage 1:
  random values are seeded into the value pools for primitive types such
  as {\tt int}, according to bounds in the pools.  Stage 2:
  values are seeded into non-primitive type classes that have
  initializer constructors, by calling those constructors.  Stage 3:
  the rest of the test case is constructed and run, by repeatedly randomly
  choosing a method and receiver and parameter values.  Each method call
  may result in a return value, which is placed back into a value
  pool (not shown).
}}
\label{valuepools-fig}
\end{wrapfigure}
Our exploratory studies~\cite{andrews07} suggested
that 
GAs generating unit tests
should  search
method parameter ranges, value
reuse policy and other randomized testing parameters.  
This section describes Nighthawk's implementation of that search.
We outline the lower, randomized-testing,
level of Nighthawk, and then describe the chromosome that
controls its operation.  We then describe the
genetic-algorithm level and the
end user interface.  Finally, we describe 
automatically-generated test wrappers for precondition
checking, result evaluation and coverage enhancement.

\subsection{Randomized Testing Level}

Nighthawk's lower level constructs and runs one test case.  
The algorithm takes two
parameters:  a set $M$ of Java methods, and a GA
chromosome $c$ appropriate to $M$.  The chromosome controls
aspects of the algorithm's behavior, such as the number of
method calls to be made, and will be described in
more detail in the next subsection.
We say that 
$M$ is the set of ``target methods''.
$I_M$, the {\it types of interest} corresponding to $M$,
is 
the union of the following sets of types\footnote{
  In this paper, the word ``type'' refers to any primitive type,
  interface, or abstract or concrete class.
}:
\begin{itemize}
\item All types of receivers, parameters and return values of methods in $M$.
%\item All types of parameters of methods in $M$.
%\item All types of return values of methods in $M$.
\item All primitive types that are the types of parameters
  to constructors of other types of interest.
\end{itemize}
Each type $t \in I_M$
has an array of {\it value pools}.
Each
value pool for $t$ contains an array of values of type $t$. 
Each value pool for a range primitive type (a primitive type
other than {\tt boolean} and {\tt void}) has bounds on
the values that can appear in it.  The number of value pools,
number of values in each value pool, and the range primitive
type bounds are specified by chromosome $c$.

See Figure \ref{valuepools-fig}
for a high-level view of how the value pools are initialized
and used in the test case generation process.
The algorithm chooses initial values for primitive type
pools, before considering  non-primitive type pools.  
A constructor method is an {\it initializer} if it has no
parameters, or if all its parameters are of primitive types.
A constructor is a {\it reinitializer} if it has no
parameters, or if all its parameters are of types in $I_M$.
(All initializers are also reinitializers.)
We define the set $C_M$ of {\it callable methods} to be the
methods in $M$ plus the reinitializers of the types in $I_M$
(Nighthawk calls these {\it callables} directly).

A {\it call description} is an object representing one method
call that has been constructed and run.  It consists of the
method name, an indication of whether the method call succeeded,
failed or threw an exception, and one {\it object description}
for each of the receiver, the parameters and the
result (if any).
%The object description of an object of primitive type or a
%string consists of the object itself.  The object description of
%other objects consists of the class of interest, the value pool
%number and the value number associated with the object; this is
%the source from which the object came (for receivers and
%parameters) or the place where the value was put (for results).
A {\it test case} is a sequence of call descriptions, together
with an indication of whether the test case succeeded or failed.

\begin{figure}
\renewcommand{\baselinestretch}{0.85} 
{\scriptsize
Input:  a set $M$ of target methods; a chromosome $c$. \\
Output:  a test case. \\
Steps:
\begin{enumerate}
\item For each element of each value pool of each primitive
  type in $I_M$, choose an initial value that is within the bounds
  for that value pool.
\item For each element of each value pool of each other type $t$
  in $I_M$:
  \begin{enumerate}
  \item If $t$ has no initializers, then set the element to {\tt null}.
  \item Otherwise, choose an initializer method $i$ of $t$, and call
    {\tt tryRunMethod}$(i, c)$.  If the call returns a non-null value,
    place the result in the destination element.
  \end{enumerate}
\item Initialize test case $k$ to the empty test case.
\item Repeat $n$ times, where $n$ is the number of method calls to
  perform:
  \begin{enumerate}
  \item Choose a target method $m \in C_M$.
  \item Run {\tt tryRunMethod}$(m, c)$. Add
        the returned call description to $k$.
  \item If {\tt tryRunMethod} returns a method call failure
    indication, return $k$ with a failure indication.
  \end{enumerate}
\item Return $k$ with a success indication.
\end{enumerate}
}
\caption{Algorithm {\sf constructRunTestCase}.}
\label{constructRunTestCase-fig}
\end{figure}

\begin{figure}[!t]
\renewcommand{\baselinestretch}{0.85} 
{\scriptsize
Input:  a method $m$; a chromosome $c$. \\
Output:  a call description. \\
Steps:
\begin{enumerate}
\item If $m$ is non-static and not a constructor:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ which is a subtype of the receiver of $m$.  
  \item Choose a value pool $p$ for $t$.
  \item Choose one value $recv$ from $p$ to
    act as a receiver for the method call.
  \end{enumerate}
\item For each argument position to $m$:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ which is a subtype of the argument type.
  \item Choose a value pool $p$ for $t$.
  \item Choose one value $v$ from $p$ to act as the argument.
  \end{enumerate}
\item If the method is a constructor or is static, call it with
  the chosen arguments.  Otherwise, call it on $recv$ with the
  chosen arguments.
\item If the call throws {\tt AssertionError},
  return a failure indication call description.
\item Otherwise, if the call threw  another exception,
  return a call description with an exception indication.
\item Otherwise, if the method return is not {\tt void},
  and the return value $ret$ is non-null:
  \begin{enumerate}
  \item Choose a type $t \in I_M$ that is a supertype of the
    return value.
  \item Choose a value pool $p$ for $t$.
  \item If $t$ is not a primitive type, or if $t$ is a primitive
    type and $ret$ does not violate the $p$ bounds,
    then replace an element of $p$ with
    $ret$.
  \item Return a call description with a success indication.
  \end{enumerate}
\end{enumerate}
}
\caption{Algorithm {\sf tryRunMethod}.}
\label{tryRunMethod-fig}
\end{figure}

Nighthawk's randomized testing algorithm is referred to as
{\sf constructRunTestCase}, and is described in Figure
\ref{constructRunTestCase-fig}.  It takes a set $M$ of target
methods and a chromosome $c$ as inputs.  It begins by
initializing value pools, and then constructs and runs a
test case, and returns the test case.  It uses an auxiliary
method called {\sf tryRunMethod} (Figure
\ref{tryRunMethod-fig}), which takes a method as input,
calls the method and returns a call description.
In the algorithm descriptions, the word ``choose'' is always used
to mean specifically a random choice which may partly depend on
$c$.

{\sf tryRunMethod} considers a method call to fail if and only
if it throws an {\tt AssertionError}.  It does not consider
other exceptions to be failures, since they might be correct
responses to bad input parameters.
We facilitate checking correctness of return values and
exceptions by providing
a generator for ``test wrapper'' classes.  The generated test wrapper
classes can be instrumented with assertions;
see Section \ref{test-wrappers-section} for more details.

Return values may represent new object instances never yet created during
the running of the test case.  If these new instances are given as
arguments to method calls, they may cause the method to execute statements
never yet executed.  Thus, the return values are valuable and are returned
to the value pools when they are created.

Although we have targeted Nighthawk specifically at Java, note
that its general principles apply to any object-oriented or
procedural language.  For instance, for C, we would need only
information about the types of parameters and return values of
functions, and the types of fields in {\tt struct}s.  {\tt struct}
types and pointer types could be treated as classes with special
constructors, getters and setters; functions could be treated as
static methods of a single target class.

\subsection{Chromosomes}

\begin{figure*}
\renewcommand{\baselinestretch}{0.75} 
{\footnotesize
\begin{center}
\begin{tabular}{|l|l|c|l|}
Gene type & Occurrence & Type & Description \\
\hline
\parbox[t]{1.3in}{\tt numberOfCalls}
  & \parbox[t]{2in}{One for whole chromosome}
  & int
  & \parbox[t]{2in}{the number $n$ of method calls
                    to be made\vspace{1mm}} \\
% that
%                 {\sf constructRunTestCase} is to run} \\
\hline
\parbox[t]{1.3in}{\tt methodWeight}
  & \parbox[t]{2in}{One for each method $m \in C_M$}
  & int
  & \parbox[t]{2in}{The relative weight of the method, i.e.\ the
                    likelihood that it will be chosen\vspace{1mm}} \\
%                at step 4(a)
%                of {\sf constructRunTestCase}} \\
\hline
\parbox[t]{1.3in}{\tt numberOf- ValuePools}
  & \parbox[t]{2in}{One for each type $t \in I_M$}
  & int
  & \parbox[t]{2in}{The number of value pools for that type\vspace{1mm}} \\
\hline
\parbox[t]{1.3in}{\tt numberOfValues}
  & \parbox[t]{2in}{One for each value pool of each type $t \in I_M$
                 except for {\tt boolean}\vspace{1mm}}
  & int
  & \parbox[t]{2in}{The number of values in the pool} \\
\hline
\parbox[t]{1.3in}{\tt chanceOfTrue}
  & \parbox[t]{2in}{One for each value pool of type {\tt boolean}}
  & int
  & \parbox[t]{2in}{The percentage chance that the value {\it true} will be
                 chosen from the value pool\vspace{1mm}} \\
\hline
\parbox[t]{1.3in}{\tt lowerBound, upperBound}
  & \parbox[t]{2in}{One for each value pool of each
                 range primitive type $t \in I_M$}
  & \parbox[t]{0.3in}{int or float}
  & \parbox[t]{2in}{Lower and upper bounds on pool values;
                 initial values are drawn uniformly from this range} \\
\hline
\parbox[t]{1.3in}{\tt chanceOfNull}
  & \parbox[t]{2in}{One for each argument position of non-primitive type
                 of each method $m \in C_M$}
  & int
  & \parbox[t]{2in}{The percentage chance that {\tt null} will be chosen
                 as the argument\vspace{1mm}} \\
\hline
\parbox[t]{1.3in}{\tt candidateBitSet}
  & \parbox[t]{2in}{One for each parameter and quasi-parameter of
                    each method $m \in C_M$}
  & BitSet
  & \parbox[t]{2in}{Each bit represents one candidate type, signifying
                 whether the argument will be of that type\vspace{1mm}} \\
\hline
\parbox[t]{1.3in}{\tt valuePool- ActivityBitSet}
  & \parbox[t]{2in}{One for each candidate type of each parameter
                    and quasi-parameter of each method $m \in C_M$}
  & BitSet
  & \parbox[t]{2in}{Each bit represents one value pool, and signifies
                 whether the argument will be drawn from that value
                 pool\vspace{1mm}} \\
\hline
\end{tabular}
\end{center}
}
\caption{Nighthawk gene types.}
\label{gene-types-fig}
\end{figure*}

Aspects of the test case execution algorithms are controlled
by the genetic algorithm chromosome given as an argument.  A
{\it chromosome} is composed of a finite number of {\it genes}.
Each gene is a pair consisting of a name
and an integer, floating-point, or {\tt BitSet} value.
Figure \ref{gene-types-fig} summarizes the different types of
genes that can occur in a chromosome.
We refer to the receiver (if any) and the return value
(if non-{\tt void}) of a method call
as {\it quasi-parameters} of the method call.  Parameters and
quasi-parameters have {\it candidate types}:
\begin{itemize}
\item A type is a {\it receiver candidate type} if it is a
  subtype of the type of the receiver.  These are the types
  from whose value pools the receiver can be drawn.
\item A type is a {\it parameter candidate type} if it is a
  subtype of the type of the parameter.  These are the types
  from whose value pools the parameter can be drawn.
\item A type is a {\it return value candidate type} if it is a
  supertype of the type of the return value.  These are the
  types into whose value pools the return value can be placed.
\end{itemize}
Note that the gene types
{\tt candidateBitSet} and {\tt valuePoolActivityBitSet}
encode  value reuse policies by determining the
pattern in which receivers, arguments and return values are
drawn from and placed into value pools.

%Every Nighthawk chromosome contains a gene specifying the number
%$n$ of method calls that {\sf constructRunTestCase} is to
%run.  In addition, a chromosome appropriate to a set $M$
%of target methods contains the following genes:
%\begin{itemize}
%%\item The number $n$ of method calls to perform.
%\item For each method in $C_M$, the relative weight of the
%  method, i.e.\ the likelihood that it will be chosen at step
%  4(a) of {\sf constructRunTestCase}.
%\item For each type of interest in $I_M$, the number of value
%  pools for that type.
%\item For each value pool, the number of values in the pool.
%\item For each value pool of a range primitive type, the upper
%  and lower bounds on values in the pool.  Initial values are
%  drawn from this range with a uniform distribution.
%\item For each method in $C_M$ and every argument position, the
%  chance that {\tt null} will be chosen as an argument.
%\item For each method in $C_M$, the types of interest that will
%  be chosen from to find a receiver, and, for each of those
%  types, the value pools that will be chosen from.
%\item For each method in $C_M$ and every argument position, the
%  types of interest that will be chosen from to find an
%  argument, and, for each of those types, the value pools
%  that will be chosen from.
%\item For each method in $C_M$, the types of interest that will
%  be chosen from to find an element that will be replaced by
%  the return value,
%  and, for each of those types, the value pools that will be
%  chosen from.
%\end{itemize}
%The last three kinds of genes are expressed as bit vectors;
%each bit stands for one of the types of interest that is a
%subtype of the declared type (resp.\ one of the value pools).
%These bit vectors thus encode the value reuse policy
%expressed by the chromosome.

It is clear that different gene values in the chromosome may
cause dramatically different behavior of the algorithm on the
methods.  We illustrate this point with two concrete examples.

Consider the ``triangle'' unit from \cite{michael-etal-ga-tcg}.
If the value pool for
all three parameter values contains
65536 values in the range $-32768$ to $32767$, then the
chance that the algorithm will ever choose two
or three identical values for the parameters (needed for the
``isosceles'' and ``equilateral'' cases) is very low.  If,
on the other hand, the value pool contains only 30 integers
each chosen from the range 0 to 10, then the chance rises
dramatically due to reuse of previously-used values (the
additional coverage this would give would depend on
the SUT, but is probably $>0$).

Consider further a container class with {\tt put} and
{\tt remove} methods, each taking an integer key as its only
parameter.  If the parameters to the two methods are taken from
two different value pools of 30 values in the range 0 to 1000,
there is little chance that a key that has been put into the
container will be successfully removed.  If, however, the
parameters are taken from a single value pool of 30 values in
the range 0 to 1000, then the chance is very good that added
keys will be removed, again due to value reuse.  A {\tt remove}
method for a typical data structure executes different
code for a successful removal than it does for a failing one.

\subsection{Genetic Algorithm Level}

We take the space of possible chromosomes as a solution space to
search, and apply the GA approach to search it
for a good solution.  We chose GAs over other metaheuristic
approaches such as simulated annealing because of our belief that
recombining parts of successful chromosomes would result in
chromosomes that are better than their parents.  However, other
metaheuristic approaches may have other advantages and should be
explored in future work.
%  For the randomized
%unit testing problem, the ``building blocks'' are individual
%choices for the gene values; discovering and recombining good
%building blocks leads to better overall solutions.

The parameter to Nighthawk's GA is the set $M$ of target methods.
The GA performs the usual chromosome
evaluation steps (fitness selection, mutation \& recombination).
The GA derives an
initial template chromosome appropriate to $M$, constructs
an initial population of size $p$ as clones of this chromosome, and
mutates the population.  It then loops for
the desired number $g$ of generations, evaluating each
chromosome's fitness, retaining the fittest chromosomes,
discarding the rest, cloning the fit chromosomes, and mutating
the genes of the clones with probability $m$\% using point mutations and
crossover (exchange of genes between chromosomes).

The evaluation of the fitness of each chromosome $c$ proceeds as
follows.  
The random testing level of Nighthawk generates and runs 
a test case, using the parameters encoded in $c$.  
It
then collects the number of lines covered by the test case.  If
we based the fitness function {\it only} on coverage,
then any chromosome would benefit from having a larger number of
method calls and test cases, since every new method call has the
potential of covering more code.
%  We therefore built in a brake
%to prevent these values from getting unfeasibly high.
Nighthawk therefore
calculates the fitness of the chromosome as:
\begin{center}
(number of coverage points covered) $\times$ (coverage factor) \\
 $-$ (number of method calls performed overall)
\end{center}
We set the coverage factor to 1000, meaning that we are willing
to make up to 1000 more method calls (but no more) if that means
covering one more coverage point.

%It is recognized that the design of genetic algorithms is a
%``black art'' \cite{ga-blackart}, and that very little is known
%about why GAs work when they do work and why they do not work
%when they do not.
For the three variables mentioned above,
Nighthawk uses default settings of
$p=20, g=50, m=20$.  These settings are different from those
taken as standard in GA literature \cite{dejong-spears-genetic},
and are motivated by a need to do as few chromosome evaluations
as possible (the primary cost driver of the system).
The population size $p$ and the number of generations $g$ are
smaller than standard, resulting in fewer chromosome evaluations; to compensate
for the lack of diversity in the population that would otherwise
result, the mutation rate $m$ is larger.  The
settings of other variables, such as the retention percentage,
are consistent with the literature.

To enhance availability of the software,
Nighthawk uses the popular open-source coverage tool Cobertura
\cite{cobertura-website} to measure coverage.  Cobertura can measure
only line coverage (each coverage point corresponds to a source
code line, and is covered if any code on the line is executed).
% \footnote{
%  Cobertura (v.\ 1.8) also reports what it calls ``decision coverage'',
%  but this is coverage of lines containing decisions.
%}.
However, Nighthawk's algorithm is not specific to this measure.

\subsection{Top-Level Application}

%\label{deep-section}
The Nighthawk application takes several switches and a set
of class names as command-line parameters.
Our empirical studies \cite{andrews07} showed that it is best to consider
the set of ``target classes'' as the command-line classes together with
all non-primitive types of parameters and return values of the public
declared methods of the command-line classes.
The set $M$ of target {\it methods}
is computed as all public declared methods of the target
{\it classes}.

Nighthawk runs the GA, monitoring the
chromosomes and retaining the first chromosome that has the
highest fitness ever encountered.  This most
fit chromosome is the final output of the program.
%
\label{run-chromosome-sec}
After finding the most fit chromosome, test engineers can
apply the specified randomized test.
To do this, they run a separate program, RunChromosome, which
takes the chromosome description as input and runs test cases
for a user-specified number of times.  Randomized
unit testing generates new test cases with new data every time
it is run, so if Nighthawk finds a parameter setting that
achieves high coverage, a test engineer can automatically
generate a large number of distinct, high-coverage test cases with
RunChromosome.


\subsection{Test Wrappers}
\label{test-wrappers-section}

We provide a utility program that, given a class name, generates
the Java source file of a ``test wrapper'' class.
Running Nighthawk on an unmodified test wrapper is the same
as running it on the target class; however, test wrappers can be
customized for precondition checking, result checking or
coverage enhancement.
%
A test wrapper for class X is a class with one private
field of class X (the ``wrapped object''), and one public method
with an identical declaration for each public declared method of
class X.  Each wrapper method passes calls to the
wrapped object.

To improve test wrapper precondition checking, users can add
checks in a wrapper method before the target
method call.  When preconditions are violated, the wrapper
method just returns.
To customize a wrapper for test result checking, the user
can insert any result-checking code after the target method
call; examples include normal Java assertions and JML
\cite{jml-overview} contracts.
%For instance, a user can choose to customize a wrapper method so
%that it throws an {\tt AssertionError} (signaling test case failure)
%whenever the target method throws an {\tt ArrayOutOfBoundsException}.
Switches to the test wrapper generation program can make
the wrapper
check commonly-desired properties; e.g.\ that a method throws
no {\tt NullPointerException} unless one of its arguments is
{\tt null}.  The switch \verb/--pleb/ generates a wrapper that checks
the Java Exception and Object contracts from
Pacheco et al.\ \cite{pacheco-etal-icse2007}.
Test wrapper generation is discussed further in
\cite{andrews07}.

%To improve test wrapper coverage,
%users may add methods to execute extra code.
%The switch \verb/--checkTypedEquals/
%adds a method to the test wrapper for class X that takes one
%argument of type X and passes it to the {\tt equals} method
%of the wrapped object.  This differs from normal wrapper
%methods that call {\tt equals}, which have an argument of type
%{\tt Object} and would therefore by default receive arguments
%only of type {\tt int} or {\tt String} (see Section \ref{object-section}).
%For classes that override the {\tt equals} method, the
%typed-equals method executes more code.
%
%Tailored serialization is accomplished in Java via
%specially-named private methods that are inaccessible to
%Nighthawk.  The test wrapper generation program switch
%\verb/--checkSerialization/ adds a
%method to the test wrapper that writes the object to a byte
%array and reads it back again.
%This causes Nighthawk to be able to execute the code in the
%serialization methods.  Our empirical studies
%\cite{andrews07} indicated that using the
%\verb/--checkTypedEquals/ and
%\verb/--checkSerialization/ with the Java Collection and Map
%classes resulted in a statistically significant increase in
%coverage with no statistically significant increase in
%execution time.

% \section{Comparison with Previous Results}
% \label{comparison-section}
% 
% We compared Nighthawk with three well-documented systems in the
% literature by running it on the same software and measuring
% the results.
% 
% \subsection{Pure GA Approach}
% 
% To compare our genetic-random approach against
% the purely genetic approach of
% Michael et al.\ \cite{michael-etal-ga-tcg},
% we ported their
% C code for the Triangle program to Java, transforming
% each decision so that each condition and decision direction
% corresponded to an executable line of code measurable by
% Cobertura.  Nighthawk was then run 10 times on the
% resulting class.
% 
% %6782 gen 8
% %6914 gen 10
% %6527 gen 10
% %5008 gen 7
% %3346 gen 5
% %1790 gen 2
% %2486 gen 5
% %7945 gen 13
% %18363 gen 21
% %2509 gen 4
% 
% We found that Nighthawk reached 100\% of feasible
% condition/decision coverage on average after 8.5 generations
% (sample standard deviation: 5.5, 95\% confidence interval
% for mean:  (4.6, 12.4)), in
% an average of 6.2 seconds of clock time
% (sample standard deviation: 4.8, 95\% confidence interval
% for mean: (2.7, 9.6))\footnote{
% All empirical studies in this paper were performed on a Sun
% UltraSPARC-IIIi running SunOS 5.10 and Java 1.5.0\_11.
% }.
% %There were 22 decisions having a total of 28 condition
% %and decision directions.
% %All condition and decision directions were coverable
% %except the false direction of the last ``if'' statement; Michael
% %et al.\ found that their best method also achieved 95\% coverage.
% %
% Michael et al.\ had found that a purely random approach could
% not achieve even 50\% condition/decision coverage.  The
% discrepancy between the results may be due to
% Nighthawk being able to find a setting of the randomized testing
% parameters that is more optimal than the one Michael et al.\ were
% using.  Inspection revealed that the chromosomes encoded value
% reuse policies that guaranteed frequent selection of the same
% values.
% 
% \subsection{Model-Checking and Feedback-Directed Randomization}
% 
% To compare our method against the model-checking approach
% of Visser et al.\ \cite{visser-etal-issta06} and the
% feedback-directed random testing of
% Pacheco et al.\ \cite{pacheco-etal-icse2007},
% we downloaded the data structure units used in those
% studies  (these had been hand-instrumented to record
% coverage of the deepest basic blocks in the code).
% %
% We first wrote restricted test wrapper classes that called only
% the methods used by previous researchers.  We ran
% Nighthawk giving these test wrapper classes as command-line
% classes, and observed the number of instrumented basic blocks
% covered, and the number of lines covered as measured by
% Cobertura.
% 
% \begin{figure}
% {\footnotesize
% \begin{center}
% \begin{tabular}{|c|c|c|c|c|c|c|}
% \hline
		% & \multicolumn{3}{c|}{Instr Blk Cov}
				  % & Time
					  % & \multicolumn{2}{c|}{Line Cov} \\
% UUT		& JPF & RP  & NH  & (sec) & Restr & Full \\
% \hline
% BinTree		& .78 & .78 & .78 &  .58  & .84   &   1  \\
% \hline
% BHeap		& .95 & .95 & .95 &  4.1  & .88   &  .92 \\
% \hline
% FibHeap		&  1  &  1  &  1  &  5.1  & .74   &  .92 \\
% \hline
% TreeMap		& .72 & .72 & .72 &  5.4  & .76   &  .90 \\
% \hline
% \end{tabular}
% \end{center}
% }
% \caption{Comparison of results on the JPF subject units.}
% \label{jpf-results-fig}
% \end{figure}
% 
% Figure \ref{jpf-results-fig} shows the results of the comparison.
% We show the block coverage ratio achieved
% by the best Java-Pathfinder-based technique from Visser et
% al.\ (JPF), by Pacheco et al.'s tool Randoop (RP), and
% by Nighthawk using the restricted test wrappers. 
% Nighthawk was able to achieve the same coverage as the previous
% tools.  The Time column shows the clock time in seconds needed by
% Nighthawk to achieve its greatest coverage.
% For BHeap and FibHeap, Nighthawk runs more quickly than JPF, but for
% the other two units it runs more slowly than both JPF and Randoop.
% This difference in run times may be in part because
% Nighthawk relies on general-purpose Cobertura instrumentation,
% which slows down programs, rather than the efficient, but
% application-specific, hand instrumentation that the other
% methods used (it may also be in part due to
% our use of a 
% different
% machine architecture than Pacheco et al.).
% 
% The JPF subject units were run by the other researchers by
% calling only selected methods.
% In order to study how Nighthawk performed when not restricted
% to the selected methods,
% we then ran it giving the target classes themselves as
% command-line classes (bypassing the test wrappers), and observed
% the number of lines covered.  The ``Line Cov'' columns show the
% line coverage ratio achieved when using the restricted wrappers
% and on the full target classes.   Using the full target
% classes, Nighthawk could cover more lines of
% code, including all the blocks covered by the previous studies.
% 
% \begin{figure}
% 
% %\begin{center}
% %\begin{tabular}{|c|c|c|c|}
% %\hline
% %		& \multicolumn{3}{c|}{Number of cond value combinations} \\
% %UUT		& Total & Unreached & Rch/Uncov \\
% %\hline
% %BinTree		&  34   &     6     &    0 \\
% %\hline
% %BHeap		&  75   &     0     &    5 \\
% %\hline
% %FibHeap		&  57   &    10     &    3 \\
% %\hline
% %TreeMap		&  157  &    31     &    19 \\
% %\hline
% %\end{tabular}
% %\end{center}
% {\footnotesize
% 
% \begin{center}
% \begin{tabular}{|c|c|c|c|}
% \hline
		% & \multicolumn{3}{c|}{Number of cond value combinations} \\
% UUT		& Total & Reachable & Covered \\
% \hline
% BinTree		&  34   &    28     & 28 (.82, 1.0) \\
% \hline
% BHeap		&  75   &    75     & 70 (.93, .93) \\
% \hline
% FibHeap		&  57   &    47     & 44 (.77, .94) \\
% \hline
% TreeMap		&  157  &   126     & 107 (.68, .85) \\
% \hline
% \end{tabular}
% \end{center}
% }
% \caption{Multiple condition coverage of the subject units.}
% \label{jpf-mcc-results-fig}
% \end{figure}
% 
% Visser et al.\ and Pacheco et al.\ also studied a form of
% predicate coverage \cite{ball-pred-coverage} whose
% implementation is linked to the underlying Java Pathfinder code,
% and is difficult for Nighthawk to access.
% While this predicate coverage is an interesting assessment
% criterion, there is no consensus in the literature on the
% connection of this criterion to other measures.
% For comparison, we
% therefore studied multiple condition coverage (MCC), a standard
% coverage metric which is, like predicate coverage, intermediate
% in strength between decision/condition and path coverage.
% We instrumented the source code so that every combination of
% values of conditions in every decision caused a separate line of
% code to be executed.  We then ran Nighthawk on the test
% wrappers, thus effectively causing it to optimize MCC rather
% than just line coverage.
% 
% In Figure \ref{jpf-mcc-results-fig},  we list
% the total number of valid condition value combinations in all
% the code, and the number that were in decisions reachable by
% calling only the methods called by the other research groups.
% We also show the combinations covered by Nighthawk,
% both as a raw total and as a fraction of the total combinations
% and the reachable combinations.
% Nighthawk achieved between 68\% and 93\% of
% MCC, or between 85\% and 100\% when only reachable condition
% combinations were considered.
% One coverage tool vendor website \cite{cornett-minimum-coverage} states that
% ``code coverage of 70-80\% is a reasonable goal for system
% test of most projects with most coverage metrics'', and suggests
% 90\% coverage during unit testing.  Berner et al.\ \cite{berner-etal-icse07},
% reporting on a study of the commercial use of unit testing on
% Java software, report unit test coverage of no more than 85\% over
% the source code of the whole system, on a variety of coverage
% measures.  We therefore
% consider the coverage levels achieved by Nighthawk to be very
% good.
% 
% In summary, our comparisons show
% Nighthawk achieving good coverage with respect to the
% results achieved by previous researchers, even when strong
% coverage measures such as decision/condition and MCC were taken
% into consideration.
% 
% \section{Case Study}
% \label{case-study-section}
% 
% In order to study the effects of different test wrapper
% generation and command-line switches to Nighthawk, we studied
% the Java 1.5.0 Collection and Map classes; these are the 16
% concrete classes with public constructors in {\tt java.util}
% that inherit from the {\tt Collection} or {\tt Map}
% interface.  The source files total 12137 LOC, and Cobertura
% reports that 3512 of those LOC contain executable code.  These
% units are ideal subjects because they are heavily used and
% contain complex code, including templates and inner classes.
% 
% For each unit,
% we generated test wrappers of two kinds:  plain test wrappers
% (P), and enriched wrappers (E) generated with the
% {\tt --checkTypedEquals} and {\tt --checkSerializable} switches
% (see Section \ref{test-wrappers-section}).
% We studied two different option sets for Nighthawk: with no
% command-line switches (N), and with the {\tt --deep} switch
% (see Section \ref{deep-section}) turned on (D).
% For each $\langle$UUT, test wrapper, option set$\rangle$ triple, we
% ran Nighthawk for 50 generations and saved the best chromosome it found.
% For each triple, we then executed RunChromosome (see Section
% \ref{run-chromosome-sec}) specifying that it generate 10 test
% cases with the given chromosome, and we measured the coverage
% achieved.
% 
% \begin{figure}
% {\footnotesize
% \begin{center}
% \begin{tabular}{|l|c|c|c|c|c|}
% Source file	& SLOC	& PN	& EN	& PD	& ED \\
% \hline
% ArrayList	& 150	& 111	& 140	& 109	& 140 (.93) \\
% \hline
% EnumMap		& 239	& 7	& 9	& 10	& 7 (.03) \\
% \hline
% HashMap		& 360	& 238	& 265	& 305	& 347 (.96) \\
% \hline
% HashSet		& 46	& 24	& 40	& 26	& 44 (.96) \\
% \hline
% Hashtable	& 355	& 205	& 253	& 252	& 325 (.92) \\
% \hline
% IHashMap	& 392	& 156	& 196	& 283	& 333 (.85) \\
% \hline
% LHashMap	& 103	& 27	& 37	& 28	& 96 (.93) \\
% \hline
% LHashSet	& 9	& 6	& 6	& 7	& 9 (1.0) \\
% \hline
% LinkedList	& 227	& 156	& 173	& 196	& 225 (.99) \\
% \hline
% PQueue		& 203	& 98	& 123	& 140	& 155 (.76) \\
% \hline
% Properties	& 249	& 101	& 102	& 102	& 102 (.41) \\
% \hline
% Stack		& 17	& 17	& 17	& 17	& 17 (1.0) \\
% \hline
% TreeMap		& 562	& 392	& 415	& 510	& 526 (.94) \\
% \hline
% TreeSet		& 62	& 44	& 59	& 41	& 59 (.95) \\
% \hline
% Vector		& 200	& 183	& 184	& 187	& 195 (.98) \\
% \hline
% WHashMap	& 338	& 149	& 175	& 274	& 300 (.89) \\
% \hline
% \hline
% Total		& 3512	& 1914	& 2194	& 2487	& 2880 \\
% \hline
% Ratio		& 	& .54	& .62	& .71	& .82  \\
% \hline
% \end{tabular}
% \end{center}
% }
% \caption{
  % Coverage achieved by configurations of Nighthawk
  % on the {\tt java.util} Collection and Map classes.
% }
% \label{collection-map-cov}
% \end{figure}
% 
% \begin{figure}
% {\footnotesize
% \begin{center}
% \begin{tabular}{|l|c|c|c|c|c|c|}
% Source file	& PN	& EN	& PD	& ED	& RC \\
% \hline
% ArrayList	& 75	& 91	& 29	& 48	& 15 \\
% \hline
% EnumMap		& 3	& 9	& 6	& 5	& 8 \\
% \hline
% HashMap		& 63	& 37	& 136	& 176	& 30 \\
% \hline
% HashSet		& 25	& 29	& 27	& 39	& 22 \\
% \hline
% Hashtable	& 8	& 110	& 110	& 157	& 25 \\
% \hline
% IHashMap	& 31	& 41	& 59	& 134	& 34 \\
% \hline
% LHashMap	& 1	& 5	& 4	& 129	& 25 \\
% \hline
% LHashSet	& 1	& 4	& 6	& 24	& 16 \\
% \hline
% LinkedList	& 32	& 61	& 41	& 53	& 17 \\
% \hline
% PQueue		& 23	& 40	& 242	& 103	& 13 \\
% \hline
% Properties	& 104	& 19	& 49	& 47	& 18 \\
% \hline
% Stack		& 5	& 10	& 5	& 26	& 8 \\
% \hline
% TreeMap		& 80	& 131	& 231	& 106	& 26 \\
% \hline
% TreeSet		& 110	& 93	& 98	& 186	& 26 \\
% \hline
% Vector		& 106	& 83	& 156	& 176	& 20 \\
% \hline
% WHashMap	& 37	& 35	& 92	& 110	& 21 \\
% \hline
% \hline
% Total		& 704	& 798	& 1291	& 1519	& 324 \\
% \hline
% \end{tabular}
% \end{center}
% }
% \caption{
  % Time in seconds taken by configurations of Nighthawk
  % to achieve highest coverage
  % on the {\tt java.util} Collection and Map classes.
% }
% \label{collection-times}
% \end{figure}
% 
% The column labeled SLOC in Figure \ref{collection-map-cov} shows the total number of source lines
% of code reported by Cobertura (including inner classes) in the
% source file associated with the class.  Column PN shows the
% SLOC covered by Nighthawk with the plain test wrappers and
% no Nighthawk switches; columns EN, PD and ED show the other
% combinations, and column ED also shows the coverage ratio with
% respect to total SLOC.
% % EN shows the effect of using the
% %enriched wrappers, and column PD the effect of using deep target
% %class analysis with the plain wrappers; column ED shows the effect of
% %using both the enriched wrappers and deep target class analysis.
% The second last line shows the totals for each column, and
% the last line shows the coverage ratio attained.
% 
% With enriched test wrappers and deep target class analysis,
% Nighthawk performs well, achieving over 90\% coverage on 11 out
% of the 16 classes, and
% 82\% coverage overall.
% The Shapiro-Wilk normality test ($\alpha=.05$) was performed on
% each of the columns PN, EN, PD and ED from Figure
% \ref{collection-map-cov},
% and was unable to reject the null hypothesis that the data was
% normally distributed ($p$ values .0665, .1444, .0635 and .1256
% respectively).
% Both the paired $t$ test and the paired Wilcoxon test
% with Bonferroni
% correction (corrected $\alpha = .00833$) on each pair of columns
% in the table show that there
% are statistically significant differences
% between every pair except (PN, EN) and (EN, PD), though Wilcoxon
% suggests that there is a statistically significant difference
% between (PN, EN) as well.
% 
% % PN, EN p= .009301, .006008 (t, Wilcoxon)
% % PN, PD p= .005655, .004458
% % PN, ED p= .001805, .001936
% % EN, PD p= .01884, .02571
% % EN, ED p= .001768, .001662
% % PD, ED p= .004065, .006008
% 
% Nighthawk performed poorly on the {\tt EnumMap}
% class. 
% The main constructor to {\tt EnumMap} expects
% an enumerated type as one of the parameters.
% Nighthawk can not find such a type, so
% only a few lines of error code in constructors were executed.
% When we customized the test wrapper class to use a
% fixed enumerated type, Nighthawk with the ED configuration covered
% 204 lines of code (coverage ratio .85), raising the total
% coverage ratio for all classes to .88.
% 
% %It also performed poorly on the {\tt Properties} class, due to
% %the fact that it could not automatically generate a useful
% %{\tt InputStream} argument for several of the methods.
% %We expect that these deficiencies could be at least partially
% %addressed by hand-customizing the test wrappers for the classes.
% 
% Table \ref{collection-times} shows the amount of time taken by
% Nighthawk on the various configurations.  In columns PN-ED, we
% report the number
% of seconds of clock time taken for Nighthawk to first achieve
% its best coverage.
% Shapiro-Wilk tests ($\alpha=.05$) showed that two of the
% columns of data (PN and PD) were not normally distributed, so
% we performed only paired Wilcoxon tests with Bonferroni
% correction (corrected $\alpha=.00833$) on each pair of columns.
% These tests
% showed that the only pairs of
% columns that were different to a statistically significant level
% were (PN, ED) and (EN, ED); in particular, using the
% enriched wrappers (PN vs.\ EN, PD vs.\ ED) did not take
% significantly longer.
% 
% These statistical results suggest that generating the
% enriched wrappers allowed Nighthawk to
% cover significantly more code without running significantly
% longer;  the deep target class analysis
% also caused Nighthawk to cover
% significantly more code, but took significantly longer
% (though still less than 100 seconds per unit on average).
% For normal use of Nighthawk, we would therefore recommend
% setting enriched wrappers and deep target class analysis as
% the defaults.  With these in place as defaults, Nighthawk
% needs only the wrapper class name as a parameter.
% 
% In column RC of Table \ref{collection-times}, we report the
% number of CPU seconds needed for the RunChromosome program to
% create and run the 10 new test cases with the parameters chosen
% by Nighthawk in the ED configuration.
% This time includes JVM startup time.  The results show that
% with the parameters chosen by Nighthawk, RunChromosome can
% automatically generate many new test cases that achieve high
% coverage, in an average of approximately 2 seconds per test
% case.
% 
%\input{fssold}

\section{Analysis and Optimization of the GA}
\label{optimizing-section}

Our earlier work convinced us that Nighthawk was capable of achieving high
coverage.  However, a pressing question remains: is Nighthawk spending
time usefully in its quest for good chromosomes?  In particular, are all
the aspects of the randomized testing level that are controlled by genes
the best aspects to be so controlled?  If the answer to this question is
``no'', then the GA level of
Nighthawk could be wasting time mutating genes that have little effect on
cost-effective code coverage; furthermore, the randomized testing level
could be wasting time retrieving the values of genes and changing its
behavior based on them, when it could be using hard-coded values and
behavior.  If this is the case, then further research work, on areas like
comparing GA search to other techniques such as
simulated annealing, could be a waste of time because the GA under
study is using useless genes.

In general, it could be the case that some types of genes used in a GA are
more useful and some are less useful.  If this is the case, then we need
to identify the least useful genes, determine if there is any benefit to
eliminating them, and if so, do so.

In order to check the utility of Nighthawk genes, we turn to
feature subset selection (FSS).  As shown below,
using FSS, we were able to identify and eliminate useless genes, find a
better initial value for one major gene type, and come to a better
understanding of the tradeoffs between coverage and performance of
Nighthawk.  In particular, we found that Nighthawk can run an order of
magnitude faster while maintaining nearly the same level of coverage.

In this section, we first discuss the motivation of this work in more
detail, and then describe the FSS method we selected.
We describe the four major types of analysis activities we undertook, and
then describe how we iteratively applied them.  We end the section with
conclusions about Nighthawk and about FSS-based analysis of GAs in
general.

\subsection{Motivation}

The search space of a GA is the product of the sets of possible values for
all genes in a chromosome.  In the simplest case, where all genes have $R$
possible values and there are $L$ genes, the size of this search space is
$R^L$.  The run time cost to find the best possible chromosome is
therefore proportional to this size times the evaluation cost of each
chromosome:
\begin{equation}\label{eq:cost}
cost = R^L * eval
\end{equation}
Nighthawk's chromosomes for the {\tt java.util} classes range in size from
128 genes to 1273 genes (recall that the number of genes is dependent on
such things as the number of target methods and the numbers of parameters
of those methods), and each gene can have a large number of values. 
Nighthawk's chromosomes store information related to the gene types of
Figure \ref{gene-types-fig}.  For example, for the public methods of
{\tt java.util.Vector}, Nighthawk uses 933 genes, 392 of which are
{\tt valuePoolActivityBitSet} genes, and 254 of which are
{\tt candidateBitSet} genes.  If we could discard some of those gene
types, then \eq{cost} suggests that this would lead to a large improvement
in Nighthawk's runtimes.

We can get information about which genes are valuable by recording, for
each chromosome, the values of the genes and the resulting fitness score
of the chromosome.  This leads to a large volume of data, however: since
the population is of size 20, there are 20 vectors of data for each
generation for each unit being tested.  We can interpret our problem as a
data mining problem.  Essentially, what we need to know from this data is
what information within it is not needed to accurately predict the fitness
score of a chromosome.

Feature subset selection (FSS) is a data mining technique that removes
needless information.  A repeated result is that simpler models with 
equivalent or higher performance can be built via FSS~\cite{hall03}.
Features may be pruned for several reasons: 
\bi
\item {\em Noisy:} spurious signals unrelated
  to the target; 
\item {\em Uninformative:} contain mostly one value,
  or no repeating values;
\item {\em Correlated to other variables:} so,
if 
  pruned, their signal will remain in other
  variables.
\ei

Apart from reduced runtimes, using fewer features has other advantages. 
Miller has shown that models generally containing fewer variables have
less variance in their outputs \cite{miller02}.  Also, the smaller the
model, the fewer are the demands on interfaces to the external
environment.  Hence, systems designed around  small models are easier to
use (less to do) and cheaper to build.

\subsection{Selecting an FSS Method}

\newcommand{\FOR}{{\sffamily \underline{for}}~}
\newcommand{\TO}{{\sffamily \underline{to}}~}
\newcommand{\DO}{{\sffamily \underline{do}}~}
\newcommand{\DONE}{{\sffamily \underline{done}}~}
\newcommand{\setuptabs}{
\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{1in}\=\hspace*{.2in}\=\hspace*{.1in}
\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}
\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}\=\hspace*{.2in}\kill
}
\begin{wrapfigure}{r}{3in}
\renewcommand{\baselinestretch}{1} 
{\footnotesize
 \begin{tabbing}\setuptabs
\FOR  $f \leftarrow 1$ \TO $|features|$ \DO \\
\>$M_f =  0$ \>\> {\em // set all merits to 0}\\
\DONE\\
\FOR i $\leftarrow$ 1 \TO $N$ \DO\\
\>   randomly select instance $R$ from group $G$\\
\>   find nearest hit $H$ \>\>{\em // closest thing in the same group}\\
\>   find nearest miss $M$ \>\>{\em // closest thing in a  different group}\\
\>   \FOR $f \leftarrow 1$ \TO $|features|$ \DO\\
\>\>       $M_f \leftarrow M_f - \frac{\Delta(f,R,H)}{N} + \frac{\Delta(f,R,M)}{N}$\\
\>	\DONE\\
\DONE
\end{tabbing}}
\caption{Binary RELIEF (two group system) for $N$ instances
for merit of different features. }\label{fig:relief2}
\end{wrapfigure}
The RELIEF feature subset selector
\cite{Kir92,Kon94}
assumes
that the data is divided into $groups$\footnote{Technically, RELIEF assumes
that instances have been classified using some ``class'' attribute. However,
to avoid confusion with the concept of ``class'' discussed above, we will
describe RELIEF in terms of ``groups''.} and tries to find the features that
serve to distinguish instances in one group from instances in other groups.  


RELIEF is a stochastic instance-based scheme that works by randomly
selecting $N$ reference instances $R_1 \ldots R_N$; by default, \mbox{$N=250$}.  For
data sets with two groups, RELIEF can be implemented using the simple
algorithm of \fig{relief2}.  For each instance, the algorithm finds two
other instances: the ``hit'' is the nearest instance to $R$ in the same
group while the ``miss'' is the nearest instance to $R$ in another group.
RELIEF's core intuition is that features that change value between groups
are more meritorious than features that change value within the same
group.  Accordingly, the merit of a feature (denoted $M_f$) is
{\em increased} for all features with a different value in the ``miss''
and {\em decreased} for all features with different values in the ``hit''.
The $\Delta$ function of \fig{relief2} detects differences between
feature values.  If a feature is discrete then the distance is one (if the
symbols are different) or zero (if the symbols are the same). If a feature
is numeric, %the the feature is normalizes 0..1 for min..max then
% subtracted.
then the distance is the difference in value (normalized to
0...1).  If a feature has a missing value, then a Bayesian statistic is
used to generate an estimate for the expected value (see~\cite{Kir92} for
details).
For a complex data set with $k>2$ groups, RELIEF samples
the $k$ nearest misses and hits from the same or different groups.

Hall and Holmes \cite{hall03} review and reject numerous
FSS methods.  Their favorite method (called WRAPPER)
is suitable only for
small data sets.  For larger data sets, they
recommend RELIEF.

%Another reason to prefer RELIEF is that it can take advantage
%of Nighthawk's stochastic search.  Currently, RELIEF is a
%batch process that is executed after data generation is
%completed.  However, it may not be necessary to wait till the end
%of data generation to gain insights into which features are most
%relevant.  Given the stochastic nature of the algorithm, we can
%see feature work where RELIEF and a genetic algorithm work in
%tandem.  Consider the random selection process at the core of
%RELIEF: a genetic algorithm exploring mutations of the current
%set of parents is an excellent stochastic source of data.  In
%the future, we plan to apply RELIEF incrementally and in
%parallel to our GAs.
%

\subsection{Analysis Activities}

In our FSS analysis of Nighthawk, we iteratively applied three
distinct analysis activities, which we refer to as
{\em merit analysis}, {\em gene type ranking}, and {\em progressive
gene type knockout}.  

\subsubsection{Merit Analysis}

Using data from a run of Nighthawk on a subject
unit, merit analysis  finds a ``merit'' score between 0.0 and 1.0 for each
of the genes corresponding to the subject unit (a higher merit
indicates that the gene was more useful in producing a fit chromosome).

To prepare for merit analysis, we modified Nighthawk 
so that each chromosome evaluation printed the current value of every gene
and the final fitness function score.  (For the two BitSet gene types, we
printed only the cardinality of the set.)  The input to the merit
analysis, for a set of subject classes, is the output of one run of the
modified Nighthawk for 40 generations on each of the subject classes;
by 40 generations, the fitness score had usually stabilized, and we did
not want to bias our dataset by including many instances with high score.
Each subject class therefore yielded 800 instances, each consisting of the
gene values and the chromosome score.

RELIEF assumes discrete data, but Nighthawk's fitness scores are
continuous.  We therefore discretized Nighthawk's output:
\bi
\item The 65\% majority of the scores are within 30\% of the top
  score for any experiment. We call this the {\em high plateau}.
\item A 10\%  minority of scores are less than 10\% of the maximum
  score (called {\em the hole}).
\item  The remaining data
{\em slopes} from the plateau into the hole.
\ei
We therefore assigned each instance to one of three groups (plateau, slope
and hole), and gave the data to RELIEF to seek features (in this context,
genes) that distinguished between the groups.  This produced one merit
figure for each feature (gene).

Each run therefore also yielded a ranked list $R$ of all genes, where gene
1 had the highest merit for this run, gene 2 had the second highest merit,
and so on.  We define:
\begin{itemize}
\item $merit(g,u)$ is the RELIEF merit score of gene $g$ derived from unit $u$.
\item $rank(g,u)$ is the rank in $R$ of gene $g$ derived from unit $u$.
\end{itemize}
Note that a higher merit or a lower rank indicates a more important gene.
\subsubsection{Gene Type Ranking}

Nighthawk uses the ten gene types listed in Figure \ref{gene-types-fig}. 
However, recall that each gene type $t$ may correspond to zero or more
genes, depending on the unit under test.  In order to eliminate gene
{\it types}, we need to rank them based on the merit scores for
the {\em individual} genes.  We refer to this activity as {\it gene type
ranking}.

We used four gene type rankings in our analysis.  Each was based on
assigning a numerical score to the gene type derived from the merit scores
of the genes of that type.
\begin{itemize}
\item $bestMerit(t)$ is the maximum, over all genes $g$ of
  type $t$ and all subject units $u$, of $merit(g,u)$.
  Ranking by $bestMerit$ favors
  genes that showed high merit for some unit.
\item $bestRank(t)$ is the minimum, over all genes $g$ of
  type $t$ and all subject units $u$, of $rank(g,u)$.
  Ranking by $bestRank$ favors
  genes that were important in Nighthawk's handling of some
  unit, regardless of their absolute merit score.
\item $avgMerit(t)$ is the average, over all genes $g$ of
  type $t$ and all subject units $u$, of $merit(g,u)$.
  Ranking by $avgMerit$ favors
  genes that consistently showed high merit across all units.
\item $avgRank(t)$ is the average, over all genes $g$ of
  type $t$ and all subject units $u$, of $rank(g,u)$.
  Ranking by $avgRank$ favors
  genes that were consistently important across all units.
\end{itemize}

\begin{figure}[tp]
\renewcommand{\baselinestretch}{0.85} 
{
\footnotesize
\begin{center}
\begin{tabular}{r|rl}
Rank &Gene type $t$  & $avgMerit$ \\
\hline
     1&	 {\tt numberOfCalls}&  85\\
     2&	{\tt valuePoolActivityBitSet} & 83\\
     3&	{\tt upperBound} & 64\\
     4&	{\tt chanceOfTrue}&  50\\
     5&	{\tt methodWeight}&  50\\
     6&	{\tt numberOfValuePools}&  49\\
     7&	{\tt lowerBound }& 44\\
     8&	{\tt chanceOfNull}&  40\\
     9&	{\tt numberOfValues }& 40\\
    10&	{\tt candidateBitSet}&  34\\
    \end{tabular}
\end{center}
}
\caption{
  Nighthawk gene types sorted by $avgMerit$, the average RELIEF
  merit over all genes of that type and all subject units.
}
\label{fig:genes-merit-fig}
\end{figure}

For example, \fig{genes-merit-fig} shows the ten gene types from Figure
\ref{gene-types-fig}, ranked in terms of their $avgMerit$ as defined
above, resulting from running version 1.0 of Nighthawk on
the first set of subject units.  This ranking places
{\tt numberOfCalls} at the top, meaning that it considers genes of that
type to be the most valuable; it also places {\tt candidateBitSet} at the
bottom, meaning that it considers genes of that type to be the most
expendable.

%Ranked by bestMerit:
%candidateBitSet bestMerit 0.373 1
%upperBound bestMerit 0.267 2
%valuePoolActivityBitSet bestMerit 0.267 3
%numberOfCalls bestMerit 0.195 4
%numberOfValuePools bestMerit 0.186 5
%chanceOfNull bestMerit 0.166 6
%numberOfValues bestMerit 0.16 7
%chanceOfTrue bestMerit 0.15 8
%methodWeight bestMerit 0.144 9
%lowerBound bestMerit 0.129 10
%
%Ranked by bestRank:
%candidateBitSet bestRank 1 1
%upperBound bestRank 1 2
%valuePoolActivityBitSet bestRank 1 3
%chanceOfTrue bestRank 1 4
%lowerBound bestRank 4.1 5
%numberOfValues bestRank 5.2 6
%numberOfCalls bestRank 7.2 7
%methodWeight bestRank 7.5 8
%numberOfValuePools bestRank 14.1 9
%chanceOfNull bestRank 24.1 10
%
%
%Ranked by avgMerit:
%numberOfCalls avgMerit 0.084533 1
%valuePoolActivityBitSet avgMerit 0.083091 2
%upperBound avgMerit 0.063655 3
%chanceOfTrue avgMerit 0.050625 4
%methodWeight avgMerit 0.050055 5
%numberOfValuePools avgMerit 0.049355 6
%lowerBound avgMerit 0.044286 7
%chanceOfNull avgMerit 0.040337 8
%numberOfValues avgMerit 0.040047 9
%candidateBitSet avgMerit 0.034309 10
%
%Ranked by avgRank:
%numberOfCalls avgRank 39.793333 1
%valuePoolActivityBitSet avgRank 85.604311 2
%upperBound avgRank 91.748810 3
%chanceOfTrue avgRank 100.906250 4
%lowerBound avgRank 109.358929 5
%numberOfValuePools avgRank 113.536364 6
%numberOfValues avgRank 133.048438 7
%methodWeight avgRank 141.473047 8
%chanceOfNull avgRank 167.094857 9
%candidateBitSet avgRank 193.528213 10

%Sorted by gene type:

%candidateBitSet avgMerit 0.034309 10
%candidateBitSet avgRank 193.528213 10
%candidateBitSet bestMerit 0.373 1
%candidateBitSet bestRank 1 1

%chanceOfNull avgMerit 0.040337 8
%chanceOfNull avgRank 167.094857 9 
%chanceOfNull bestMerit 0.166 6
%chanceOfNull bestRank 24.1 10

%chanceOfTrue avgMerit 0.050625 4
%chanceOfTrue avgRank 100.906250 4
%chanceOfTrue bestMerit 0.15 8
%chanceOfTrue bestRank 1 4

%lowerBound avgMerit 0.044286 7
%lowerBound avgRank 109.358929 5
%lowerBound bestMerit 0.129 10
%lowerBound bestRank 4.1 5

%methodWeight avgMerit 0.050055 5
%methodWeight avgRank 141.473047 8
%methodWeight bestMerit 0.144 9
%methodWeight bestRank 7.5 8

%numberOfCalls avgMerit 0.084533 1
%numberOfCalls avgRank 39.793333 1
%numberOfCalls bestMerit 0.195 4
%numberOfCalls bestRank 7.2 7

%numberOfValuePools avgMerit 0.049355 6
%numberOfValuePools avgRank 113.536364 6
%numberOfValuePools bestMerit 0.186 5
%numberOfValuePools bestRank 14.1 9

%numberOfValues avgMerit 0.040047 9
%numberOfValues avgRank 133.048438 7
%numberOfValues bestMerit 0.16 7
%numberOfValues bestRank 5.2 6

%upperBound avgMerit 0.063655 3
%upperBound avgRank 91.748810 3
%upperBound bestMerit 0.267 2
%upperBound bestRank 1 2

%valuePoolActivityBitSet avgMerit 0.083091 2
%valuePoolActivityBitSet avgRank 85.604311 2
%valuePoolActivityBitSet bestMerit 0.267 3
%valuePoolActivityBitSet bestRank 1 3

\begin{figure}
\renewcommand{\baselinestretch}{0.85} 
{\footnotesize
\begin{center}
\begin{tabular}{|l|c|c|c|c|}
\hline
          & \multicolumn{4}{c|}{Rank of gene type when ranked by measure} \\
Gene type & $bestMerit$ & $bestRank$ & $avgMerit$ & $avgRank$ \\
\hline \verb/numberOfCalls/ & 4 & 7 & 1 & 1 \\
\hline \verb/valuePoolActivityBitSet/ & 3 & 3 & 2 & 2 \\
\hline \verb/upperBound/ & 2 & 2 & 3 & 3 \\
\hline \verb/chanceOfTrue/ & 8 & 4 & 4 & 4 \\
\hline \verb/methodWeight/ & 9 & 8 & 5 & 8 \\
\hline \verb/numberOfValuePools/ & 5 & 9 & 6 & 6 \\
\hline \verb/lowerBound/ & 10 & 5 & 7 & 5 \\
\hline \verb/chanceOfNull/ & 6 & 10 & 8 & 9 \\
\hline \verb/numberOfValues/ & 7 & 6 & 9 & 7 \\
\hline \verb/candidateBitSet/ & 1 & 1 & 10 & 10 \\
\hline \end{tabular}
\end{center}
}

\caption{
  Ranks of all gene types, when ranked by four measures
  computed from data on the first set of subject units.
}
\label{fig:all-gene-type-rankings-fig}
\end{figure}

However, note also \fig{all-gene-type-rankings-fig}, which compares the
ranks of the gene types from the first set of subject units.  Some gene
types, such as \verb/valuePoolActivityBitSet/, have fairly consistent ranks,
whereas others, such as \verb/candidateBitSet/ and \verb/numberOfCalls/,
have quite different ranks when different ranking measures are used.

\subsubsection{Progressive Gene Type Knockout}

To validate the rankings found by gene type rankings,
we conducted a {\em progressive gene type knockout} experiment.
We instrumented the Nighthawk source code so that we could easily ``knock
out'' all genes of a given type, by replacing the code controlled by that
gene type by code that assumed a constant value for each gene of that type.
For example, if we chose to knock out the \verb/numberOfCalls/ gene type,
then no information about the number of method calls to be made was
contained in the chromosome, and the randomized testing level made the
same number of calls in each case.  The constant value chosen was the
initial value that the gene was set to before chromosome evolution,
which we decided on based on our previous experience with randomized unit
testing.

We then ran Nighthawk on the subject units again, first with all ten gene
types, then with the lowest (least useful) gene type in the gene type
ranking knocked out, then with the lowest two gene types knocked out, and
so on until we were left with one gene type not knocked out.  We collected
two result variables from each run on each subject unit: the amount of
time it took, and the coverage achieved by the winning chromosome.
We then inspected the results using data visualization methods in order
to evaluate the tradeoffs in cost (time) and benefits (coverage).

\subsection{Analysis Procedure}

\subsubsection{Stage 1: Initial Analysis}

In the first stage of our analysis, we ran Nighthawk on the Java 1.5.0
Collection and Map classes.  These are the 16 concrete classes with public
constructors in {\tt java.util} that inherit from the {\tt Collection} or
{\tt Map} interface, which we used for our earlier experiments
\cite{andrews07}.  The source files total 12137 LOC, and Cobertura reports
that 3512 of those LOC contain executable code.

We performed a merit analysis and gene type ranking based on $bestMerit$
and $bestRank$.
We then proceeded with
progressive gene type knockout based on the $bestMerit$ and $bestRank$
rankings of gene types.
The results of this initial progressive gene type knockout experiment were
reported in \cite{andrews-menzies-promise09}.  We showed that with only
the best four gene types according to the $bestMerit$ ranking, or the best
seven gene types according to the $bestRank$ ranking, we were able to
achieve 90\% of the coverage achieved by all ten gene types, in about 10\%
of the time.

\begin{wrapfigure}{r}{3in}
\begin{center}\includegraphics[width=3in]{Hashtable_rankedByAvgMerit.pdf}
\end{center}\caption{
  Gene type elimination using
  $avgMerit$.
}
\label{fig:hashtable}
\end{wrapfigure}
\subsubsection{Stage 2: Re-Ranking}\label{sec:rerank}

We observed that a large (around 10\%) drop in coverage
occurred at the point at which we knocked out the gene
corresponding to \verb/numberOfCalls/.  This finding cast doubt on the
validity of the $bestMerit$ and $bestRank$ rankings.
After re-ranking
according to $avgMerit$, we saw that \verb/numberOfCalls/ was the
highest ranked, and that the $avgRank$ ranking largely agreed with this
assessment.
%
%\begin{figure}[tp]
%\begin{center}\includegraphics[width=2.5in]{Hashtable_rankedByAvgRank.pdf}
%\end{center}\caption{
%  Nighthawk on Hashtable unit, eliminating gene types according
%  to $bestRank$ ranking.
%}
%\label{fig:hashtable-bestRank}
%\end{figure}
%
%\begin{figure}\begin{center}\includegraphics[width=2.5in]{EnumMap_rankedByMerit.pdf}
%\end{center}\caption{EnumMap}\label{fig:enummap}
%\end{figure}
%
We conducted a progressive gene type knockout study using
$avgMerit$, and tabulated the results.

\begin{figure}[tp]
\renewcommand{\baselinestretch}{1} 
\scriptsize
\begin{center}
\begin{tabular}{r@{~}r@{~}r@{~}r@{~}r@{~}r@{~}r@{~}r@{~}r@{~}r|l}
\multicolumn{10}{c}{Number of used gene types}\\\cline{1-10}
1    &  2   &   3 &   4   &  5   & 6    & 7    & 8    &  9  & 10    & class being tested\\\hline
1.00&1.00&1.01&0.99&1.00&1.00&1.00&1.00&1.00&1.00&ArrayList\\
1.13&1.00&1.03&0.96&0.65&0.68&0.67&0.88&0.71&1.00&EnumMap\\
0.98&0.99&0.98&0.96&1.01&1.00&1.00&1.00&0.98&1.00&HashMap\\
0.99&1.00&0.99&1.00&1.00&1.00&0.99&1.00&1.00&1.00&HashSet\\
0.99&1.00&1.00&1.00&0.99&1.00&0.98&1.01&1.01&1.00&Hashtable\\
0.97&0.97&0.98&0.97&0.98&0.97&1.00&0.98&0.98&1.00&IdentityHashMap\\
0.98&1.00&1.00&1.00&0.99&1.00&1.00&0.99&0.99&1.00&LinkedHashMap\\
0.99&1.01&1.01&1.00&1.00&1.00&1.00&1.01&1.01&1.00&LinkedHashSet\\
1.01&1.01&1.01&1.02&1.00&1.01&1.01&1.02&1.00&1.00&LinkedList\\
1.01&0.99&0.97&1.01&1.03&0.99&0.95&0.98&1.00&1.00&PriorityQueue\\
1.00&1.00&1.00&1.00&1.00&1.00&1.00&1.00&0.99&1.00&Properties\\
1.00&1.00&1.00&1.00&1.00&1.00&1.00&1.00&1.00&1.00&Stack\\
0.90&0.97&0.95&0.93&0.99&0.97&0.97&1.02&1.01&1.00&TreeMap\\
0.90&0.95&0.98&0.93&0.97&0.98&0.98&1.00&0.98&1.00&TreeSet\\
0.95&0.97&0.99&0.96&0.99&0.92&0.99&0.97&1.01&1.00&Vector\\
0.96&0.97&0.97&0.98&0.98&0.97&0.98&0.99&0.97&1.00&WeakHashMap\\\hline
0.98&0.99&0.99&0.98&0.97&0.97&0.97&0.99&0.98&1.00&mean
\end{tabular}
\end{center}
\caption{Coverage found using the top $i$ ranked gene types for
$1 \le i \le 10$, using the $avgMerit$ ranking.
Coverages are expressed as a ratio of the coverages found using all gene types.
}\label{fig:coverage}
\end{figure}

In the following, each run was compared to
the runtime and coverage seen using all ten gene types and
running for $g=50$ generations.  \fig{hashtable} shows how the
coverage changed for Hashtable (one of the {\tt java.util} classes);
the results for this subject unit
are typical.  The axes of that figure are defined such that
the point (50,1) represents the coverage of all 10
gene types after 50 generations.  The thick black curve on that figure shows the
performance of Nighthawk using all ten gene
types. Other curves show results from using $1 \le i \le 9$
gene types.  A measure of interest in \fig{hashtable} is the area
under the curve, which is maximal when Nighthawk converges
to maximum coverage in a few generations.  Due to
the random nature of the GA and the randomized test data
generation, some of the curves are sometimes higher than the
$i=10$ line.

If we calculate the ratio of the area under
a curve (AUC) to the area under the thick
black
curve, then we can summarize all the curves of
\fig{hashtable} as the Hashtable row of \fig{coverage}. In that
figure, each column shows how many gene types were used in  a
particular run (and when $used=10$, we are ignoring the FSS
results and using all gene types). The number 1.00 informs
us that we achieved 100\% of the coverage reached by using all
ten gene types.

\fig{coverage} 
summarizes \fig{hashtable} as well as results
from all the other {\tt java.util} classes.
The last row of 
\fig{coverage} shows that the mean AUC is very similar using
the top-ranked $i$ gene types.  
From this observation,
we concluded that
Nighthawk's GA gained most of its efficacy just from mutations of
the top-ranked gene type {\tt numberOfCalls}.
However, a statistical comparison of the coverage measures with
only the top gene type and those with all ten gene types does
show a statistically significant difference (a paired Wilcoxon test
yields $p=0.013$).
%We also conclude that
%in terms of coverage, there is little benefit in forcing Nighthawk's GA
%to mutate more than this first ranked gene type.

To check if  eliminating gene types from
Nighthawk's GA is cost-effective, we must consider both  the
coverage achievable and  the time taken to achieve that
coverage.  
We therefore made two runs of Nighthawk
using (a) all the gene types and  using (b) just the top gene
type ranked by $avgMerit$ (i.e. {\tt numberOfCalls}).
We then divided the runtime and coverage results
from (b) by the (a) values seen after 50
generations, and plotted the results.

\begin{wrapfigure}{r}{3in}
\includegraphics[width=3in]{mar21timereport.pdf}
\caption{Time vs coverage result, compared between one and ten gene types, for the {\tt java.util} classes.}\label{fig:timereport100}
\end{wrapfigure}
\fig{timereport100} shows the results, with time percentage on
the X axis and coverage percentage on the Y axis.
Note the point indicated by the arrow in \fig{timereport100}.
This point shows that it is usually
(in $\frac{13}{16}$ cases) possible to achieve 90\% of the
coverage in under 10\% of the time required to run all gene
types for 50 generations.
The data for the two outlier subject units in \fig{timereport100},
{\tt EnumMap} and {\tt TreeMap}, can be
attributed to the low coverage achieved by the original Nighthawk
on {\tt EnumMap} and the stochastic nature of both the GA and
random testing level.

The results of this progressive gene type knockout
procedure corroborated the rankings that we had derived using
the $avgMerit$ and $avgRank$ measures, lending support to the
validity of these measures as opposed to the $bestMerit$ and
$bestRank$ measures.  We therefore continued to use only
$avgMerit$ and $avgRank$ in our subsequent research.

%The two anomalous subject units in \fig{timereport100} are
%{\tt EnumMap} and {\tt TreeMap}:
%\bi
%\item  {\tt EnumMap} looks like a
%success story, since we were able to achieve 140\% of the
%original coverage.  In fact, this is simply noise, since the results presented
%earlier in this paper showed that we 
%could achieve only 3\% coverage of {\tt EnumMap} using
%Nighthawk and generic test wrappers \cite{andrews07}. The improvements
%reported in \fig{timereport100}
%shows an improvement to less than 5\%.  
%\item
%%\fig{timereport100} showed that the run
%with TreeMap using one gene type took 1.48 longer than using TreeMap with all ten gene types.
%%To understand this result,
%note that \eq{cost} has two
%components: number of options and the evaluation cost of each
%option. In the case of {\tt TreeMap}, even though we reduced the
%chromosome search space size, we made choices that greatly
%increased the runtime cost.
%\ei

\subsubsection{Stage 3:  Optimizing {\tt numberOfCalls}}\label{sec:opt}

The implication of the result regarding the
\linebreak[4]
{\tt numberOfCalls} gene type
was that changing the number of method calls made in the test case was
most important to reach higher coverage.  
What was the cause of this importance?  If it was due simply
to the fact that we had chosen a sub-optimal initial value for the number
of calls, then changing the initial value could result in a quicker
convergence to an optimal value.  Moreover, a sub-optimal initial
value for {\tt numberOfCalls} could have the effect of skewing our
FSS analysis, since {\tt numberOfCalls} could assume a skewed importance
due to the need to change it.

Nighthawk assigns an initial value of $k \cdot |C_M|$ to {\tt numberOfCalls},
where $k$ is a constant and $C_M$ is the set of callable methods.  The
motivation for this initial value is that if there are more callable
methods associated with the unit under test, then we expect to have to
make more method calls in order to test them.  In version 1.0 of
Nighthawk, we set $k$ to 5, based on our earlier experience with
randomized testing.

To investigate whether this initial value of {\tt numberOfCalls}
was optimal, we generated a scatter plot (\fig{optimalInitialNumCalls})
plotting the number of callable methods against the final value
of {\tt numberOfCalls} for the winning chromosome found by Nighthawk
for each of our subject units.  From this scatter plot, it became
obvious that the initial value was indeed sub-optimal:  a value of
$5$ for $k$ results in the lower line in the figure, which is below
the final value for all but one of the subject units.  A value of
$50$ for $k$ results in the upper line in the figure, which is much
closer to the final value for all units.

\begin{wrapfigure}{r}{3in}
\includegraphics[width=3in]{optimalInitialNumCalls.pdf}
\caption{Finding the optimal initial number of calls.}\label{fig:optimalInitialNumCalls}
\end{wrapfigure}
Based on this analysis, we changed the value of $k$ to 50; that is, we set
the initial value of {\tt numberOfCalls} to 50 times the number of
callable methods.  We refer to Nighthawk with this change as Nighthawk
version 1.1, since it exhibits quite different behavior from Nighthawk
1.0.

\subsubsection{Stage 4:  Analyzing the Optimized Version}\label{sec:apache}

Because of our concern that the sub-optimal initial {\tt numberOfCalls}
value had skewed our previous analysis, we re-ran the merit analysis
using Nighthawk 1.1 and the {\tt java.util} classes.

To corroborate our conclusions, we ran a merit analysis using
Nighthawk 1.1 and a new set of classes.  The Apache Commons
Collections classes are a set of classes implementing efficient data
structures, developed by the open-source Apache development community.  We
focused on the 36 concrete top-level classes in this effort.  Two of these
classes ({\tt ExtendedProperties} and {\tt MapUtils}) tended to generate
large numbers of files when Nighthawk was run on them; we could have dealt
with this by modifying the appropriate test wrappers, but for greater
reproducibility we decided to omit them.  We therefore ran a merit analysis
using Nighthawk 1.1 and the other 34 concrete top-level classes.

% with java.util classes
%candidateBitSet Average merit:   0.022027 10
%methodWeight Average merit:   0.053939 9
%chanceOfNull Average merit:   0.064538 8
%numberOfValuePools Average merit:   0.066075 7
%numberOfValues Average merit:   0.069082 6
%lowerBound Average merit:   0.076550 5
%valuePoolActivityBitSet Average merit:   0.082291 4
%numberOfCalls Average merit:   0.083200 3
%upperBound Average merit:   0.083469 2
%chanceOfTrue Average merit:   0.087675 1
%
% with java.util classes
%chanceOfTrue Average rank:   156.835833 1
%upperBound Average rank:   163.315625 2
%numberOfCalls Average rank:   170.080000 3
%lowerBound Average rank:   195.308750 4
%valuePoolActivityBitSet Average rank:   242.024020 5
%numberOfValues Average rank:   284.362414 6
%numberOfValuePools Average rank:   307.979503 7
%chanceOfNull Average rank:   323.334530 8
%methodWeight Average rank:   398.758447 9
%candidateBitSet Average rank:   573.076524 10

% with Apache Commons Collections classes
%candidateBitSet Average merit:   0.015989 10
%chanceOfNull Average merit:   0.019018 9
%lowerBound Average merit:   0.019477 8
%upperBound Average merit:   0.021316 7
%numberOfValues Average merit:   0.021861 6
%methodWeight Average merit:   0.022195 5
%numberOfValuePools Average merit:   0.024560 4
%chanceOfTrue Average merit:   0.025795 3
%valuePoolActivityBitSet Average merit:   0.029886 2
%numberOfCalls Average merit:   0.034302 1
%
% with Apache Commons Collections classes
%chanceOfTrue Average rank:   204.929889 1
%numberOfCalls Average rank:   233.176471 2
%valuePoolActivityBitSet Average rank:   270.208029 3
%numberOfValuePools Average rank:   274.814085 4
%upperBound Average rank:   275.024306 5
%lowerBound Average rank:   281.588652 6
%numberOfValues Average rank:   301.920437 7
%methodWeight Average rank:   360.867586 8
%chanceOfNull Average rank:   408.058226 9
%candidateBitSet Average rank:   465.039339 10

\begin{figure}
\renewcommand{\baselinestretch}{1} 
{\footnotesize
\begin{center}
\begin{tabular}{|l|c|c|c|c|}
\hline
          & \multicolumn{4}{c|}{Rank of gene type when ranked by measure} \\
          & \multicolumn{2}{c|}{{\tt java.util}}
          & \multicolumn{2}{c|}{Apache} \\
Gene type & $avgMerit$ & $avgRank$ & $avgMerit$ & $avgRank$ \\
\hline \verb/numberOfCalls/ & 3 & 3 & 1 & 2 \\
\hline \verb/valuePoolActivityBitSet/ & 4 & 5 & 2 & 3 \\
\hline \verb/chanceOfTrue/ & 1 & 1 & 3 & 1 \\
\hline \verb/numberOfValuePools/ & 7 & 7 & 4 & 4 \\
\hline \verb/methodWeight/ & 9 & 9 & 5 & 8 \\
\hline \verb/numberOfValues/ & 6 & 6 & 6 & 7 \\
\hline \verb/upperBound/ & 2 & 2 & 7 & 5 \\
\hline \verb/lowerBound/ & 5 & 4 & 8 & 6 \\
\hline \verb/chanceOfNull/ & 8 & 8 & 9 & 9 \\
\hline \verb/candidateBitSet/ & 10 & 10 & 10 & 10 \\
\hline
\end{tabular}
\end{center}
}

\caption{
  Ranks of all gene types according to average merit and average gene
  rank, after running Nighthawk 1.1 on the {\tt java.util} classes
  and the Apache Commons Collections classes.
}
\label{fig:all-gene-type-rankings-v1-1fig}
\end{figure}

We then performed a gene type ranking using only the $avgMerit$
and $avgRank$ measures.  The results of this ranking, for both
the {\tt java.util} classes and the Apache Commons Collections
classes, are in
\fig{all-gene-type-rankings-v1-1fig}.  Note that while
{\tt numberOfCalls} is still ranked highly, now the gene type
{\tt chanceOfTrue} emerges as the most influential gene type,
with {\tt upperBound} and {\tt valuePoolActivityBitSet} also
being influential for some classes.
However, as noted even for Nighthawk 1.0 using the $avgMerit$ and
$avgRank$ rankings, the gene types {\tt candidateBitSet} and
{\tt chanceOfNull} were consistently not useful.

A progressive gene type knockout based on the $avgMerit$ gene type ranking
supported the validity of that ranking:  knocking out the lowest-ranked
gene types had almost no effect on eventual coverage, while knocking out
the higher-ranked gene types had a small but more noticeable effect. 

\begin{figure}
\begin{center}
\includegraphics[width=3in]{aug20timereport1.pdf}\includegraphics[width=3in]{aug20timereport2.pdf}
\end{center}
\caption{Time vs coverage result, compared between one and ten gene types, for the Apache classes.
}\label{fig:apache}
\end{figure}


Furthermore, as shown  in \fig{apache},
it is still the case that we can achieve 90\% of
the coverage achieved by all 10 gene types in 10\% of the time if we use
only the top-ranked gene type and stop the genetic algorithm early.
The \fig{apache} results were generated in the same manner as above (in \fig{timereport100}):
\bi
\item We made two runs
using (a) all the gene types and  using (b) just the top gene
type ranked by $avgMerit$ (i.e. {\tt numberOfCalls}).
\item
We then divided the runtime and coverage results
from (b) by the (a) values seen after 50
generations, and plotted the results in \fig{apache}.
\item
The points indicated by the two arrows in \fig{apache}
show that it is usually
(in $\frac{28}{34}$ cases) possible to achieve 90\% of the
coverage in under 10\% of the time required to run all gene
types for 50 generations.
\item There exist some outlier results
({\em  BinaryHeap, BufferUtils, FunctorException})
where the runtimes were much longer for Nighthawk using only one gene type than for full Nighthawk.
Note that, even in all those outlier cases, the Nighthawk optimized by FSS still achieved 90\% of the coverage in 10\% of the time
required by full Nighthawk.
\ei


\subsection{Discussion}

There are two main areas of implications for this work:  implications
for Nighthawk itself, and implications for other systems.

\subsubsection{Implications for Nighthawk}

On the surface, the results reported above suggest
that most gene types can be eliminated.  However, there are
at least two mitigating factors.
First, the additional
coverage achieved by using all ten gene types
may be of code that is difficult to cover, and thus
this additional coverage might be more valuable to the user than the
raw coverage numbers suggest.  Second, the observations about gene types
might not carry over to other, different subject units.

\label{two-expendable-genes}
Nevertheless, the results show that it is very likely that Nighthawk can
be modified to give users a better range of cost-benefit tradeoffs,
for instance by eliminating gene types or using early stopping criteria
that take advantage of the early plateaus in coverage seen in
\fig{timereport100}.
In particular, the results suggest that the gene types
{\tt candidateBitSet} and {\tt chanceOfNull} were not useful.
Translating this back into the terms of Nighthawk's randomized
test input generation algorithm, this means that when it is
looking for a value to pass to a parameter of class C, it should always
draw parameter values from value pools in all the subclasses of C
(the behaviour resulting from the default value of {\tt candidateBitSet});
furthermore, it is
sufficient for it to choose {\tt null} as a parameter 3\% of the
time (the default value of {\tt chanceOfNull}).

%We are currently exploring early stopping criteria within Nighthawk to take
%advantage of the early plateaus in coverage seen in \fig{timereport100}
%The above results show that
%the original Nighthawk was doing
%much work that did not pay off by achieving much better results.
%Consider the nine gene types ranked the lowest in the $avgMerit$
%ranking.  Maintaining all the genes associated with these gene
%types meant that the original Nighthawk was spending time
%mutating these genes and extracting information from the current
%values of the genes ( it also meant that the original Nighthawk
%was using up memory storing the representations of these genes,
%for each chromosome in each generation, with a concomitant
%time expense).  When these less useful genes were eliminated,
%no large loss of coverage was observed, but a great increase
%in efficiency was observed.
%Hence, we are also exploring methods to reduce Nighthawk's memory requirements
%and other performance criteria.

\subsubsection{Implications for Other Systems}

At the {\it meta-heuristic level}, this work suggests that it may
be useful to integrate FSS directly into meta-heuristic
algorithms.  Such an integration would enable the automatic
reporting of the merits of individual features, and the
automatic or semi-automatic selection of features.  If the
results of this paper extend to other domains, this would lead
to meta-heuristic algorithms that improve themselves
automatically each time they are run.
%  We also speculate that
%this is a promising direction to look at for a further order
%of magnitude improvement in performance.

Also, on the {\it theory-formation level}, this work opens
up the possibility of rapid turnover of the theoretical
foundations underlying present tools, as aspects of heuristic
and meta-heuristic approaches are shown to be consistently
valuable or expendable.
The expendability of Nighthawk's {\tt candidateBitSet} and
{\tt chanceOfNull} gene types is an example of this latter
phenomenon.
%  In the framework of a self-monitoring
%meta-heuristic algorithm, new strategies, heuristics, and
%gene and mutator types can be introduced as theories evolve, and
%can be evaluated efficiently by the algorithms themselves.

%As an example of this latter point, the
%{\tt chanceOfNull} gene type in Nighthawk determines how likely
%(in percent) Nighthawk's random test data generation is to
%choose a {\tt null} as a parameter in a given parameter
%position.  That gene type was ranked as expendable by both
%rankings explored here.  This suggests that the default value for this
%gene (3\%) is sufficient in most cases.  Given sufficient
%corroboration from other case studies and systems, this in turn
%suggests that the value of generating test data with nulls is a
%relatively settled problem, and that we can turn toward other
%aspects of test data generation for future improvements.

\section{Properties of the Optimized System}
\label{sec:chars}

\begin{figure}
\renewcommand{\baselinestretch}{1} 
{\footnotesize
\begin{center}
\begin{tabular}{|l|c|c|c|c|c|c|c|}
		& 
		& \multicolumn{3}{c|}{Nighthawk v1.0}
		& \multicolumn{3}{c|}{Nighthawk v1.1}
	 \\
Source file	& SLOC
		&  Nt	& RCt & Coverage
		&  Nt	& RCt & Coverage
	 \\
\hline
ArrayList	& 150
		& 48	& 15
		& 140 (.93)
		& 93	& 12
		& 140 (.93)
	 \\
\hline
EnumMap		& 239
		& 5	& 8
		& 7 (.03)
		& 20	& 10
		& 12 (.05)
	 \\
\hline
HashMap		& 360
		& 176	& 30
		& 347 (.96)
		& 136	& 25
		& 347 (.96)
	 \\
\hline
HashSet		& 46
		& 39	& 22
		& 44 (.96)
		& 125	& 21
		& 44 (.96)
	 \\
\hline
Hashtable	& 355
		& 157	& 25
		& 325 (.92)
		& 252	& 26
		& 329 (.93)
	 \\
\hline
IHashMap	& 392
		& 134	& 34
		& 333 (.85)
		& 182	& 17
		& 335 (.85)
	 \\
\hline
LHashMap	& 103
		& 129	& 25
		& 96 (.93)
		& 153	& 24
		& 96 (.93)
	 \\
\hline
LHashSet	& 9
		& 24	& 16
		& 9 (1.0)
		& 69	& 15
		& 9 (1.0)
	 \\
\hline
LinkedList	& 227
		& 53	& 17
		& 225 (.99)
		& 172	& 18
		& 225 (.99)
	 \\
\hline
PQueue		& 203
		& 103	& 13
		& 155 (.76)
		& 120	& 14
		& 147 (.72)
	 \\
\hline
Properties	& 249
		& 47	& 18
		& 102 (.41)
		& 79	& 35
		& 102 (.41)
	 \\
\hline
Stack		& 17
		& 26	& 8
		& 17 (1.0)
		& 45	& 7
		& 17 (1.0)
	 \\
\hline
TreeMap		& 562
		& 106	& 26
		& 526 (.94)
		& 227	& 28
		& 525 (.93)
	 \\
\hline
TreeSet		& 62
		& 186	& 26
		& 59 (.95)
		& 124	& 27
		& 59 (.95)
	 \\
\hline
Vector		& 200
		& 176	& 20
		& 195 (.98)
		& 36	& 19
		& 196 (.98)
	 \\
\hline
WHashMap	& 338
		& 110	& 21
		& 300 (.89)
		& 201	& 24
		& 300 (.89)
	 \\
\hline
\hline
Total		& 3512
		& 1519	& 324
		& 2880 (.82)
		& 2034	& 322
		& 2883 (.82)
	 \\
\hline
Per unit	& 220
		& 95	& 20
		& 
		& 127	& 20
		&
	 \\
\hline
\end{tabular}
\end{center}
}
\caption{
  Results of running configurations of Nighthawk
  on the 16 {\tt java.util} Collection and Map classes.
  SLOC: number of source lines of code contained in the {\tt .java}
  file of the unit, including inner classes.
  Nt: time (sec) taken by Nighthawk to find the winning chromosome.
  RCt: time (sec) taken by RunChromosome to generate and run 10 test cases
  based on the winning chromosome.
  Coverage: source lines of code covered by the 10 test cases run
  by RunChromosome, as measured by Cobertura
  (the number in parentheses is the ratio of lines covered).
}
\label{collection-map-cov}
\end{figure}

\begin{figure}
\renewcommand{\baselinestretch}{1} 
{\footnotesize
\begin{center}
\begin{tabular}{|l|c|c|c|c|}
Source file		& SLOC
			& Nt	& RCt
	& Coverage \\
\hline
ArrayStack		& 37
			& 6		& 6
	& 37 (1.0) \\
\hline
BagUtils		& 14
			& 52		& 7
	& 14 (1.0) \\
\hline
BeanMap			& 212
			& 19		& 76
	& 138 (0.65) \\
\hline
BinaryHeap		& 149
			& 123		& 18
	& 148 (0.99) \\
\hline
BoundedFifoBuffer	& 89
			& 6		& 11
	& 89 (1.0) \\
\hline
BufferOverflowException	& 9
			& 3 		& 4
	& 9 (1.0) \\
\hline
BufferUnderflowException	& 9
			& 3		& 5
	& 9 (1.0) \\
\hline
BufferUtils		& 12
			& 2		& 6
	& 12 (1.0) \\
\hline
ClosureUtils		& 34
			& 3		& 10
	& 22 (0.65) \\
\hline
CollectionUtils		& 329
			& 301		& 45
	& 211 (0.64) \\
\hline
ComparatorUtils		& 33
			& 13		& 10
	& 33 (1.0) \\
\hline
CursorableLinkedList	& 528
			& 170		& 41
	& 496 (0.94) \\
\hline
DefaultMapEntry		& 25
			& 2		& 5
	& 24 (0.96) \\
\hline
DoubleOrderedMap	& 521
			& 164		& 26
	& 480 (0.92) \\
\hline
EnumerationUtils	& 3
			& 2		& 7
	& 3 (1.0) \\
\hline
FactoryUtils		& 8
			& 2		& 7
	& 8 (1.0) \\
\hline
FastArrayList		& 519
			& 334		& 49
	& 481 (0.93) \\
\hline
FastHashMap		& 258
			& 167		& 24
	& 223 (0.86) \\
\hline
FastTreeMap		& 288
			& 239		& 37
	& 262 (0.91) \\
\hline
FunctorException	& 36
			& 4		& 12
	& 29 (0.81) \\
\hline
HashBag			& 5
			& 8		& 9
	& 5 (1.0) \\
\hline
IteratorUtils		& 114
			& 455		& 54
	& 97 (0.85) \\
\hline
LRUMap			& 44
			& 76		& 28
	& 44 (1.0) \\
\hline
ListUtils		& 64
			& 53		& 10
	& 28 (0.44) \\
\hline
MultiHashMap		& 138
			& 36		& 20
	& 128 (0.93) \\
\hline
PredicateUtils		& 31
			& 64		& 9
	& 31 (1.0) \\
\hline
ReferenceMap		& 297
			& 161		& 23
	& 247 (0.83) \\
\hline
SequencedHashMap	& 236
			& 105		& 41
	& 232 (0.98) \\
\hline
SetUtils		& 30
			& 42		& 8
	& 19 (0.63) \\
\hline
StaticBucketMap		& 214
			& 324		& 36
	& 199 (0.93) \\
\hline
SynchronizedPriorityQueue	& 11
			& 2	& 4
	& 9 (0.82) \\
\hline
TransformerUtils	& 39
			& 3		& 11
	& 27 (0.69) \\
\hline
TreeBag			& 10
			& 10		& 10
	& 10 (1.0) \\
\hline
UnboundedFifoBuffer	& 81
			& 32		& 48
	& 81 (1.0) \\
\hline
\hline
Total			& 4427
			& 2986		& 717
	& 3885 (.88) \\
\hline
Per unit		& 130
			& 88		& 21
	& \\
\hline
\end{tabular}
\end{center}
}

% Even for the 8 units of 200 SLOC or greater,
% this represents 2631 / 2976 SLOC covered, or 88.44%

\caption{
  Results of running Nighthawk 1.1 using 8 gene types
  on the 34 Apache Commons Collections classes studied.
  Column headings are as in Figure \ref{collection-map-cov}.
}
\label{apache-cov}
\end{figure}

We now present statistics on runs of versions of Nighthawk on
two subject software packages, in order to help evaluate how
cost-effective Nighthawk is, and to help compare version 1.0
of Nighthawk with version 1.1.

We collect statistics on Nighthawk using the following
procedure.  For each subject unit, we run Nighthawk for 50
generations.  In order to give engineers an accurate sense of
how long Nighthawk typically takes to find its highest coverage,
we record the time Nighthawk reported after first achieving the
highest coverage it achieved\footnote{
  All empirical studies in this paper were performed on a Sun
  UltraSPARC-IIIi running SunOS 5.10 and Java 1.5.0\_11.
}.  We take the winning chromosome
from Nighthawk, run 10 test cases based on that chromosome
using RunChromosome, and record how long it took.
Finally, we run Cobertura's report
generation script; we calculate how many lines of code are
measurable by Cobertura in the source file for the unit, and
how many lines are reported by Cobertura as having been covered.
These counts include inner classes.

In order to compare Nighthawk v1.0 with v1.1 (the version using
the more accurate initial value for {\tt numberOfCalls}), we
ran the above procedure using v1.0 on the {\tt java.util}
Collection and Map classes, and again using v1.1.  The v1.0
results were reported originally in \cite{andrews07};
Figure \ref{collection-map-cov} collects these statistics and
also those for v1.1.

A glance at the Nt columns in Figure \ref{collection-map-cov}
suggests that version 1.1 of Nighthawk takes longer than version
1.0, but statistical tests suggest there is no statistically
significant difference in run times (a paired Wilcoxon test
yields $p=0.07$; a Shapiro-Wilk normality test on the difference
between the columns yields $p=0.14$, not rejecting the
hypothesis of normality; and a paired $t$ test yields $p=0.08$).
There is no statistically significant difference in either the
runtime needed by RunChromosome (a paired Wilcoxon test yields
$p=0.98$) or the coverage achieved (a paired Wilcoxon test
yields $p=0.60$).

We conclude that for these subject units, the corrected value of
{\tt numberOfCalls} did not result in either significant extra
runtime or significant extra coverage.  Nighthawk was able to
find a winning chromosome after an average of at most 127
seconds per unit, or 58 seconds per 100 source lines of code.
RunChromosome was able to generate and run ten test cases in an
average of 20 seconds per unit, or 9.2 seconds per 100 source
lines of code.  These tests covered an average of 82\% of the
source lines of code, a ratio which is not higher mainly because
of the poor performance of Nighthawk on the large {\tt EnumMap}
unit.  (In \cite{andrews07} we discuss why Nighthawk did
poorly on {\tt EnumMap} and what we did to correct the problem.)

We also ran the evaluation procedure on the Apache Commons
Collections classes that we used in our optimization research.
As discussed in Section
\ref{two-expendable-genes},
there was strong evidence that the
two low-ranked gene types {\tt candidateBitSet} and
{\tt chanceOfNull} were not cost-effective.
For this study, we therefore ran version 1.1 of Nighthawk using
the 8 other, highest-ranked gene types.
Figure \ref{apache-cov} shows the resulting data.
Nighthawk was able to find a winning chromosome after an average
of 88 seconds per unit, or 67 seconds per 100 source lines of code.
RunChromosome was able to generate and run ten test cases in an
average of 21 seconds per unit, or 16 seconds per 100 source
lines of code.  These tests covered an average of 88\% of the
source lines of code.  Inspection of the units on which
Nighthawk did less well indicates that some of the missing
coverage was due to code that tested whether arguments were
instances of specific subclasses of {\tt java.util.Collection},
such as {\tt java.util.Set} or {\tt java.util.Map}.

One coverage tool vendor website \cite{cornett-minimum-coverage} states that
``code coverage of 70-80\% is a reasonable goal for system
test of most projects with most coverage metrics'', and suggests
90\% coverage during unit testing.  Berner et al.\ \cite{berner-etal-icse07},
reporting on a study of the commercial use of unit testing on
Java software, report unit test coverage of no more than 85\% over
the source code of the whole system, on a variety of coverage
measures.  We therefore
consider the coverage levels achieved by Nighthawk to be very
good.

In summary, for the widely-used subject units we studied,
Nighthawk is able to find randomized testing parameters that
achieve high coverage in a reasonable amount of time; the
parameters that it finds allow programmers to quickly generate
many distinct test cases that achieve high coverage of the units
under test.


\section{Threats to Validity}
\label{threats-section}

The representativeness of the units under test is the major
threat to external validity.  We studied {\tt java.util}
and Apache Commons data structure classes
because these are complex, heavily-used units that have
high quality requirements.  However, other units might
have characteristics that cause Nighthawk to perform poorly.
Randomized unit testing schemes in general require many test
cases to be executed, so they perform poorly on methods that do
a large amount of disk I/O or thread generation.
%  They also
%perform poorly on methods expecting string inputs that are
%sentences in a grammar; we expect Nighthawk to do the same
%unless it is given valid sentences as seed strings.

Nighthawk uses Cobertura, which measures line coverage, a weak
coverage measure.  The results that we obtained may not extend
to stronger coverage measures.  However, the Nighthawk algorithm
%treats the coverage measurement as a black-box integer, and
does
not perform special checks particular to line coverage.
%  The
% comparison studies suggest that it still performs well when
% decision/condition coverage and MCC are simulated.
The question of whether code coverage measures are a good
indication of the thoroughness of testing is still, however, an
area of active debate in the software testing community, making
this a threat to construct validity.

Also, time measurement is a construct validity threat. 
We measured time using Java's \linebreak[4]
{\tt System.currentTimeMillis}, which
reports total wall clock time, not CPU time.  This may show
run times that do not reflect the testing cost to a real user.

\section{Conclusions and Future Work}
\label{conclusions-section}

Randomized unit testing is a promising technology that has been
shown to be effective, but whose thoroughness depends on the
settings of test algorithm parameters.  In this paper, we have
described Nighthawk, a system in which an upper-level genetic
algorithm automatically derives good parameter values for a
lower-level randomized unit test algorithm.  We have shown that
Nighthawk is able to achieve
% the same coverage as earlier
% studies, and
high coverage of complex, real-world Java units,
while retaining the most desirable feature of randomized
testing: the ability to generate many new high-coverage test
cases quickly.

We have also shown that we were able to optimize and
simplify 
meta-heuristic search tools.
Metaheuristic tools (such as genetic algorithms and simulated
annealers) typically mutate some aspect of a candidate solution
and evaluate the results.  If the effect of mutating each aspect
is recorded, then each aspect can be considered a feature and
is amenable to the FSS processing described here.  
In this way,
FSS can be used to automatically find and remove
superfluous parts of the search control.

Future work includes the integration into Nighthawk of useful
facilities from past systems, such as failure-preserving or
coverage-preserving test case minimization, and further
experiments on the effect of program options on coverage and
efficiency.  We also wish to integrate a feature subset
selection learner into the GA level of the Nighthawk algorithm
for dynamic optimization of the GA.
Further, we can see a promising line of research where the cost/benefits
of a particular meta-heuristic are tuned to the particulars of a specific problem.
Here, we have shown that if we surrender one tenth of the coverage, we can 
run Nighthawk ten times faster. 
While this is an acceptable trade-off in many domains,
it may be unsuitable for 
safety-critical applications. 
More work is required to understand how to best match meta-heuristics (with or without FSS)
to particular problem domains.

%\begin{itemize}
%\item While the predicate coverage proposed by Visser et al.\ is
%  an interesting assessment criteria, there is no consensus in the
%  literature on the connection of this criteria to other measures.
%\item Static code analysis can direct the generation of the test
%  cases.  Our method, on the other hand, generates test cases at
%  random, so it is possible that we cover some code more than is
%  necessary, and that parts of our value pools are
%  useless.  Our view is that this is not a major issue since our
%  runtimes, on real-world systems, are quite impressive. 
%  Nevertheless, there needs to be more discussion on how to
%  assess test suite generation; i.e.\ runtimes versus
%  superfluous tests versus any other criteria.
%\end{itemize}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Reminder: the "draftcls", not "draft", class option should be used if
% it is desired that the figures are to be displayed while in draft mode.

% An example of a floating figure using the graphicx package.
% Note that \label must occur AFTER (or within) \caption.
% For figures, \caption should occur after the \includegraphics.
%
%\begin{figure}
%\centering
%\includegraphics[width=2.5in]{myfigure.eps}
%\caption{Simulation Results}
%\label{fig_sim}
%\end{figure}


% An example of a double column floating figure using two subfigures.
% (The subfigure.sty package must be loaded for this to work.)
% The subfigure \label commands are set within each subfigure command, the
% \label for the overall fgure must come after \caption.
% \hfil must be used as a separator to get equal spacing
%
%\begin{figure*}
%\centerline{\subfigure[Case I]{\includegraphics[width=2.5in]{subfigcase1.eps}
%\label{fig_first_case}}
%\hfil
%\subfigure[Case II]{\includegraphics[width=2.5in]{subfigcase2.eps}
%\label{fig_second_case}}}
%\caption{Simulation results}
%\label{fig_sim}
%\end{figure*}



% An example of a floating table. Note that, for IEEE style tables, the
% \caption command should come BEFORE the table. Table text will default to
% \footnotesize as IEEE normally uses this smaller font for tables.
% The \label must come after \caption as always.
%
%\begin{table}
%% increase table row spacing, adjust to taste
%\renewcommand{\arraystretch}{1.3}
%\caption{An Example of a Table}
%\label{table_example}
%\centering
%% The array package and the MDW tools package offers better commands
%% for making tables than plain LaTeX2e's tabular which is used here.
%\begin{tabular}{|c||c|}
%\hline
%One & Two\\
%\hline
%Three & Four\\
%\hline
%\end{tabular}
%\end{table}


% if have a single appendix:
%\appendix[Proof of the Zonklar Equations]
% or
%\appendix  % for no appendix heading
% do not use \section anymore after \appendix, only \section*
% is possibly needed

% use appendices with more than one appendix
% then use \section to start each appendix
% you must declare a \section before using any
% \subsection or using \label (\appendices by itself
% starts a section numbered zero.)
%
% Use this command to get the appendices' numbers in "A", "B" instead of the
% default capitalized Roman numerals ("I", "II", etc.).
% However, the capital letter form may result in awkward subsection numbers
% (such as "A-A"). Capitalized Roman numerals are the default.
%\useRomanappendicesfalse
%
\appendices
% you can choose not to have a title for an appendix
% if you want by leaving the argument blank
%\section{}
%Appendix two text goes here.

% use section* for acknowledgment
%%\section*{Acknowledgment}
%%% optional entry into table of contents (if used)
%%%\addcontentsline{toc}{section}{Acknowledgment}
%%Thanks to Willem Visser for making the source code of the Java
%%Pathfinder subject units available, and to the JDEAL development
%%team for their tool package.  Thanks also for interesting
%%comments and discussions to Rob Hierons, Charles Ling, Bob
%%Mercer and Andy Podgurski. 
%%This research is supported by the first author's grant from the
%%Natural Sciences and Research Council of Canada (NSERC).
%%

% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section
% NOTE: BibTeX documentation can be easily obtained at:
% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/

% can use a bibliography generated by BibTeX as a .bbl file
% standard IEEE bibliography style from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/
 \bibliographystyle{IEEEtran}
% argument is your BibTeX string definitions and bibliography database(s)
 \bibliography{my}

\clearpage
\section*{Reply to reviewers}

Thank you to all reviewers for their careful assessment of this paper. Based on those reviews
we have prepared a significantly modified version which, we believe, addresses all current concerns
(see below for our comments on particular reviewer comments).

FYI: The attached draft (without this review section) becomes a 15-page document in the standard IEEE TSE two-column format.
This is one page longer than the standard TSE article.
We would be happy to pay the overlength charges for the extra page.

\subsection*{Reply to reviewer \#1}

{\em My initial reservations regarding this paper were concerned with
the similarity to the authors previous ASE publication (I had a
number of other more minor concerns but the authors have adequately
addressed these). The distinction between this paper and the ASE
paper has been enhanced by including more information on the Feature
Subset Selection study. The authors initial work in this area has
also been reported in their Promise'09 paper and so there is an
element of similarity between these two publications, particularly
in terms of the background. The results are new since this paper
employs a different (more effective) reduction method to the Promise
paper.}
\begin{quote}
As commented on in our introduction, the difference between this draft and
prior publications has been increased.  The paper differs from prior
publications as follows:
\bi
\item We have deleted the experimental comparison to previous work,
  instead summarizing it in the ``Related Work'' section and referring
  the reader to the ASE paper for details.
\item The original Nighthawk publication~\cite{andrews07} studied its effectiveness only on the {\tt java.util} Collection and Map classes.
\item A subsequent paper~\cite{andrews-menzies-promise09} offered an FSS-based optimization. That paper found ways to prune 60\% of the gene types when trying to cover
the {\tt java.util} classes. 
\item This paper shows that that FSS-based optimizer was sub-optimal. We present here
 a better optimization method that prunes 90\% of the gene types (see \S{\ref{sec:rerank}}). An
analysis of that optimized system in \S{\ref{sec:opt}} revealed further improvements.
We have successfully tested  the optimized and improved version of Nighthawk
on 34 classes from the Apache Commons Collections library (see \S{\ref{sec:apache}}).
\ei
In the resulting paper, the material not in the ASE paper (other than
about a page in the ``related work'' section) starts on page 17 out of 34.
The material overlapping the PROMISE paper consists of about 5 pages in
the rest of the paper.  We therefore count 12 out of the 34
non-bibliography pages as new material.  It would be nice to increase this
further, but it seemed that for the new work to be coherent to the reader,
we needed this much explanation of our previous work.
\end{quote}
{\em On a
minor point, Figure 10 is not easily readable (except to indicate
that the results for the varying numbers of gene types are quite
similar). }
\begin{quote}
\fig{hashtable} has now been enlarged.
\end{quote}
{\em
The results in figure 14 are particularly interesting
since it would appear that the approach seems to be class-dependent,
as the trends appear to differ more for the classes than for the
number of gene types. This is something that could be explored and
tested.}
\begin{quote}
We agree and have proceeded as you suggest. However, this paper is already over-sized
so we do not think we can do justice
to this issue in this paper.  With your permission, we would prefer to make that study a separate publication.
\end{quote}
{\em 
It is also mentioned that the additional coverage is statistically
significant but there is no mention of how this was tested.}
\begin{quote}
Our mistake. In \tion{rerank} we have added that this test was a Wilcoxon test.
\end{quote}
{\em The bigger concern with this is the range of classes (subject units)
used (which is a general concern with this paper) - important though
they are, they may not be diverse enough to demonstrate the general
applicability of the approach. This relates to the earlier observation
about the results in figure 14 appearing to be class-dependent. To
be more convincing this study would benefit from being run on a
wider range of diverse classes.
}
\begin{quote}
This paper extends the prior results (based on 16 {\tt java.util} classes) to 34 new classes from the Apache Commons Collections package.
As shown in this paper, the optimization result repeated in these new classes: simplifying Nighthawk to mutate only one gene type achieved 90\% (or more) of the
coverage seen in full Nighthawk, using only 10\% (or less) of the time.
\end{quote}


\subsection*{Reply to reviewer \#2}
{\em
While it is true that the fss approach described here, is a new
formulation, the broad method is similar.  Thus, given that
the main case study is essentially that previously published at
ASE, and the FSS refinement similar to that published at PROMISE,
the proportion of unpublished material is arguably lower than in
the initial version of the paper.
}
\begin{quote}
As we mentioned above in replies to reviewer 1, this paper is significantly different to the ASE and PROMISE papers and the previous draft:
\bi
\item We have deleted the empirical comparison to previous work.
\item
The ASE paper did not discuss FSS optimization. 
\item
The FSS optimizer mentioned in the PROMISE paper is demonstrably
sub-optimal. The PROMISE optimizer pruned 60\% of the gene types while
 the better optimization method shown here prunes 90\% of the gene types (see \S{\ref{sec:rerank}}). 
\item Since the last draft, we have added much new material:
\bi
\item
We have discussed better ways to select gene types (see the discussion around 
\fig{all-gene-type-rankings-fig} and 
\fig{all-gene-type-rankings-v1-1fig}).
\item
We have added
an
analysis of the newly optimized system in \S{\ref{sec:opt}} that found that we were hobbling our system (starting with a {\em number of calls} setting that was far
too low).
\item
We have successfully tested  the optimized and improved version of Nighthawk
on 34 classes from the Apache system (see \S{\ref{sec:apache}}).
\ei
\ei
\end{quote}
{\em
Comment regarding 'spikes' in Fig 1:  
The authors haven't completely addressed this point.  The 'spikes' are simply
an artefact of the how the graph is drawn.
}
\begin{quote}
We understand the reviewer's point, and spent some time trying to get
{\tt gnuplot} to produce a ridge instead of spikes, but we were unable to.
The spikes are an unexpected, but correct, consequence of using integers
here.  We have tried to make this clear in the text by contrasting the
situation with integers with the situation with real numbers.

One possible solution to this problem would be to drop this figure and try to make the same point with some extended text.
The reviewer's advice on this matter would be appreciated.
\end{quote}
{\em
 Abstract - 2nd paragraph:  While relevant, the description here
is perhaps a little too detailed for an abstract.  The terminology
of 'gene type' isn't clear at this point, and the terminology of
'mutator' does not appear to explained in the paper at all.  I'd
suggest referring simply to the use of feature subset selection to
reduce the size and content of the representation and the resultant
improvement in performance with only a small decrease in quality.
}
\begin{quote}
Good suggestion. We have done as you say.
\end{quote}
{\em Section III C - 3rd para: ``Nighthawk gets the random testing level
to generate and run ...''.  For readability, I suggest rewording as
``The random testing level of Nighthawk generates and runs ...''
}
\begin{quote}
Good suggestion. We have done as you say.
\end{quote}
{\em Section V - 4th para: Unfortunately the Shapiro-Wilk test has
been applied inappropriately.  In order to use the paired t-test,
it is necessary to demonstrate that the distribution for each
combination of option and source file (i.e. each cell in Fig 9) is
a normal distribution, i.e. across the 10 test cases taken for each.
Apply the SW to the entire column is not meaningful: the values in
the column will essentially depend instead of how 'difficult' each
source file is, and this is a consequence of the choice of the
source file, not of the distribution of each.  Therefore, I suggest
removing discussion of the SW test, and of the t-test and simply
retain the Wilcoxon test results: this does not change the conclusion.
}
\begin{quote}
Thank you for the correction.  We have deleted the section in
question, but in our new section describing the properties of
the optimized system, we have been more careful in applying
Shapiro-Wilk (applying it to the difference between two columns).
\end{quote}
{\em
 Section V - 4th para: Arguably, there was no need to apply the
Bonferroni correction.  If the hypothesis was that at least one
pair of columns is different, then it is necessary (since one such
difference could easily occur by chance over the multiple combination).
However, if the hypothesis is about specific pairs e.g. that (PN,EN)
are no different, then no correction is necessary.  Since the
Bonferroni correction makes the test more conservative, I suggest
leaving it as is.
}
\begin{quote}
The relevant section has been deleted, but thank you for the correction.
\end{quote}
{\em Section VI - D : how were the default constant values chosen for
gene types that were not optimized by the GA?
}
\begin{quote}
We have now added a sentence explaining that the default value is the
initial value set for the gene.
\end{quote}
{\em Section VI - D, Fig 13 - although these results are summarized
in Fig 14, the figure 13 itself is too small to be readable: it is
impossible to distinguish the lines corresponding to different
number of gene types eliminated.  Same is true of Fig 15, but to a
lesser extent.
}
\begin{quote}
We have tried various fixes to this problem---but the performances are so similar that, so far, we have failed to come up with a representation that better
(and more succinctly) represents the data.

Also, we are not clear what purpose is served by increasing the size of these figures---the point is not the details of a particular class. Rather, the point is
that the general trends are the same (fig13), or that there is a point at (10,90) in fig15 which most of the plots are above.

Note, in this draft:
\bi
\item
 What was fig13 is now fig10 .
\item 
What was fig15 is now fig12+fig15.
\ei
\end{quote}
{\em
Section VI - D: it appears that only one 'run' was performed for
each source code/number of eliminated gene types: as mentioned
above, averaging results across multiple runs of the GA would have
reduced the variance (and therefore may have avoided lines 'above'
the thick line in Fig 13).
}
\begin{quote}
  This is true, but we felt the reader might be interested in
  seeing a comparison between all the subject units to get a
  sense of how they varied.
\end{quote}
{\em Section VI - D: why is area under the curve chosen as the metric?
Would not have coverage of the best chromosome (as used in the
earlier case study) been a better response metric?
}
\begin{quote}
We use the ratio of optimized Nighthawk's final coverage to the coverage of full Nighthawk.
Why? Well,
the first half of the paper assesses Nighthawk in absolute terms (compared to other methods for generating coverage). The second half of the paper
asks ``is full Nighthawk doing too much work?'', so the performance criterion here is $\frac{\mbox{\scriptsize reduced Nighthawk}}{\mbox{\scriptsize full Nighthawk}}$. Note that this
fraction can be directly and succinctly presented to the user using the performance ratios of old-to-new Nighthawk.
\end{quote}
{\em 
 Section VI - D (1): the alpha-value does not ``result'' in a
particular p-value, the p-value is independent of the alpha value.
The alpha value is the critical value for determining whether the
p-value demonstrates a significant result.  (Also, is alpha=0.5 the
correct value - should it be 0.05?)
}
\begin{quote}
The relevant section has been deleted, but
you are quite correct; we should have written $\alpha=0.05$, but in
any case it was an awkwardly-constructed comment that seemed to
suggest that the $\alpha$ value affected the resulting $p$ value.
\end{quote}
{\em
Section VIII - final para: ``depreciated in safety critical
applications'' - is ``depreciated'' the correct word here - perhaps
just ``but possible not for safety critical applications''; also typo
in ``applications''
}
\begin{quote}
We have fixed the typos and changed 
``depreciated'' to ``unsuitable for''.
\end{quote}
\subsection*{Response to Reviewer 3}
Reviewer 3 accepted the last draft, as is.

% <OR> manually copy in the resultant .bbl file
% set second argument of \begin to the number of references
% (used to reserve space for the reference number labels box)

% biography section
%
% If you had an eps/pdf photo file (graphicx package needed)
% the extra braces prevent the LaTeX parser from getting confused
% when it sees the complicated \includegraphics command within an
% optional argument. You can create your own macro to make things
% simpler here.
%\begin{biography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{mshell.eps}}]{Michael Shell}
% or if you just want to reserve a space for a photo:
%
%\begin{IEEEbiography}{James H.\ Andrews}
%received the BSc and MSc degrees in computer science from the
%University of British Columbia, and the PhD degree  in computer
%science from the University of Edinburgh.  He worked from 1982
%to 1984 at Bell-Northern Research, from 1991 to 1995 at Simon
%Fraser University, and from 1996 to 1997 on the FormalWare
%project at the University of British Columbia, Hughes Aircraft
%Canada and MacDonald Dettwiler.  He is currently an Associate
%Professor in the Department of Computer Science at the
%University of Western Ontario, in London, Canada, where he has
%been since 1997.  His research interests include software
%testing, semantics of programming languages, and formal
%specification.  He is a member of the IEEE.
%\end{IEEEbiography}
%
%\begin{IEEEbiography}{Felix C.\ H.\ Li}
%received the BSc and MSc degrees in Computer Science from the
%University of Western Ontario, in 2005 and 2007 respectively.
%He is currently a software developer based in Toronto.
%\end{IEEEbiography}
%
%\begin{IEEEbiography}{Tim Menzies}
%received the CS and PhD degrees from the University of New South
%Wales and is the author of more than 160 publications.  He is an
%associate professor at the Lane Department of Computer Science
%at West Virginia University (USA), and has been working with
%NASA on software quality issues since 1998.  His recent research
%concerns modeling and learning with a particular focus on
%lightweight modeling methods.  His doctoral research explored
%the validation of possibly inconsistent knowledge-based systems
%in the QMOD specification language.  He is a member of the IEEE.
%\end{IEEEbiography}
%
% if you will not have a photo
%\begin{biographynophoto}{John Doe}
%Biography text here.
%\end{biographynophoto}

% insert where needed to balance the two columns on the last page
%\newpage

%\begin{biographynophoto}{Jane Doe}
%Biography text here.
%\end{biographynophoto}

% You can push biographies down or up by placing
% a \vfill before or after them. The appropriate
% use of \vfill depends on what kind of text is
% on the last page and whether or not the columns
% are being equalized.

%\vfill

% Can be used to pull up biographies so that the bottom of the last one
% is flush with the other column.
%\enlargethispage{-5in}

% that's all folks
\end{document}
