%XXX: remove the sd stuff in fig 1
%% bare_jrnl.tex
%% V1.2 Measure
%% 2002/11/18
%% by Michael Sell
%% mshell@ece.gatech.edu
%%
%% NOTE: This text file uses MS Windows line feed conventions. When (human)
%% reading this file on other platforms, you may have to use a text
%% editor that can handle lines terminated by the MS Windows line feed
%% characters (0x0D 0x0A).
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.6b or later) with an IEEE journal paper.
%%
%% Support sites:
%% http://www.ieee.org
%% and/or
%% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/
%%
%% This code is offered as-is - no warranty - user assumes all risk.
%% Free to use, distribute and modify.

% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do  ***
% *** not appear when using other class files.                            ***
% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/testflow


% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print. Authors are encouraged to use U.S. letter paper when
% submitting to IEEE. Use the testflow package mentioned above to verify
% correct handling of both paper sizes by the author's LaTeX system.
%
% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
% should be used if it is desired that the figures are to be displayed in
% draft mode.
%
% This example can be formatted using the peer review
% (instead of journal) mode.
\documentclass[draftcls,onecolumn,journal]{IEEEtran/IEEEtran}
%\documentclass[journal]{IEEEtran/IEEEtran}
% If the IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it:
% \documentclass[journal]{../sty/IEEEtran}

% Compact itemize environment: zeroes out the vertical glue above, below
% and between items so bulleted lists pack tightly (used via \bi ... \ei).
% NOTE(review): \leftmargin is set *after* \begin{itemize}, where it has no
% effect on the current list -- confirm whether the .2in indent was intended.
\newenvironment{smallitem}{
   \setlength{\topsep}{0pt}
   \setlength{\partopsep}{0pt}
   \setlength{\parskip}{0pt}
   \begin{itemize}
   \setlength{\leftmargin}{.2in}
   \setlength{\parsep}{0pt}
   \setlength{\parskip}{0pt}
   \setlength{\itemsep}{0pt}}{\end{itemize}}

% Compact enumerate environment: the numbered-list twin of smallitem,
% removing the vertical glue around and between items (used via \be ... \ee).
% NOTE(review): as with smallitem, \leftmargin after \begin{enumerate}
% does not affect the current list -- confirm intent.
\newenvironment{smallnums}{
   \setlength{\topsep}{0pt}
   \setlength{\partopsep}{0pt}
   \setlength{\parskip}{0pt}
   \begin{enumerate}
   \setlength{\leftmargin}{.2in}
   \setlength{\parsep}{0pt}
   \setlength{\parskip}{0pt} \setlength{\itemsep}{0pt}}{\end{enumerate}}

% Wrappers around the `hypo' environment (defined elsewhere; presumably by
% the class or a package loaded outside this view -- TODO confirm), tagging
% each hypothesis with its outcome marker:
%   hhh  -- open/undecided, marked "?"
%   yup  -- supported, marked with pifont \ding{51} (a check mark)
%   nupe -- rejected,  marked with pifont \ding{55} (a cross)
\newenvironment{hhh}{
  \begin{hypo}[?]}{\end{hypo}}
\newenvironment{yup}{
  \begin{hypo}[\ding{51}]}{\end{hypo}}
\newenvironment{nupe}{
  \begin{hypo}[\ding{55}]}{\end{hypo}}

% \boxplot{#1}{#2}{#3}{#4}{#5}: draws a box-plot-style glyph in a
% 100 x 10 LaTeX picture (coordinates in \unitlength units, 0..100).
% Vertical ticks at x = 0, 50 and 100 mark the scale ends and midpoint;
% #1 = start x of the first horizontal segment, #2 = its length;
% #3 = x of the filled dot (the central value, e.g. median);
% #4 = start x of the second horizontal segment, #5 = its length.
\newcommand{\boxplot}[5]{\begin{picture}(100,10)
\put(0,0){\line(0,1){8}}
\put(100,0){\line(0,1){8}}
\put(#1,4){\line(1,0){#2}}
\put(#3,4){\circle*{6}}
\put(#4,4){\line(1,0){#5}}
\put(50,0){\line(0,1){8}}
\end{picture}}

\newcommand{\aive}{{\"aive}}
% Shorthand begin/end pairs for the compact list environments above.
\newcommand{\bi}{\begin{smallitem}}
\newcommand{\ei}{\end{smallitem}}
\newcommand{\be}{\begin{smallnums}}
\newcommand{\ee}{\end{smallnums}}
\newcommand{\bd}{\begin{description}}
\newcommand{\ed}{\end{description}}
% Cross-reference helpers: \fig{x} -> "Figure~N" for \label{fig:x}, and
% likewise \eq{x} / \hyp{x} for eq: and hyp: labels.
\newcommand{\fig}[1]{Figure~\ref{fig:#1}}
\newcommand{\eq}[1]{Equation~\ref{eq:#1}}
\newcommand{\hyp}[1]{Hypothesis~\ref{hyp:#1}}


% some very useful LaTeX packages include:
\usepackage{alltt}
\usepackage{cite}      % Written by Donald Arseneau
                        % V1.6 and later of IEEEtran pre-defines the format
                        % of the cite.sty package \cite{} output to follow
                        % that of IEEE. Loading the cite package will
                        % result in citation numbers being automatically
                        % sorted and properly "ranged". i.e.,
                        % [1], [9], [2], [7], [5], [6]
                        % (without using cite.sty)
                        % will become:
                        % [1], [2], [5]--[7], [9] (using cite.sty)
                        % cite.sty's \cite will automatically add leading
                        % space, if needed. Use cite.sty's noadjust option
                        % (cite.sty V3.8 and later) if you want to turn this
                        % off. cite.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/cite/

%\usepackage{graphicx}  % Written by David Carlisle and Sebastian Rahtz
                        % Required if you want graphics, photos, etc.
                        % graphicx.sty is already installed on most LaTeX
                        % systems. The latest version and documentation can
                        % be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/graphics/
                        % Another good source of documentation is "Using
                        % Imported Graphics in LaTeX2e" by Keith Reckdahl
                        % which can be found as esplatex.ps and epslatex.pdf
                        % at: http://www.ctan.org/tex-archive/info/
% NOTE: for dual use with latex and pdflatex, instead load graphicx like:
%\ifx\pdfoutput\undefined
%\usepackage{graphicx}
%\else
\usepackage[pdftex]{graphicx}
%\fi
% However, be warned that pdflatex will require graphics to be in PDF
% (not EPS) format and will preclude the use of PostScript based LaTeX
% packages such as psfrag.sty and pstricks.sty. IEEE conferences typically
% allow PDF graphics (and hence pdfLaTeX). However, IEEE journals do not
% (yet) allow image formats other than EPS or TIFF. Therefore, authors of
% journal papers should use traditional LaTeX with EPS graphics.
%
% The path(s) to the graphics files can also be declared: e.g.,
% \graphicspath{{../eps/}{../ps/}}
% if the graphics files are not located in the same directory as the
% .tex file. This can be done in each branch of the conditional above
% (after graphicx is loaded) to handle the EPS and PDF cases separately.
% In this way, full path information will not have to be specified in
% each \includegraphics command.
%
% Note that, when switching from latex to pdflatex and vice-versa, the new
% compiler will have to be run twice to clear some warnings.


%\usepackage{psfrag}    % Written by Craig Barratt, Michael C. Grant,
                        % and David Carlisle
                        % This package allows you to substitute LaTeX
                        % commands for text in imported EPS graphic files.
                        % In this way, LaTeX symbols can be placed into
                        % graphics that have been generated by other
                        % applications. You must use latex->dvips->ps2pdf
                        % workflow (not direct pdf output from pdflatex) if
                        % you wish to use this capability because it works
                        % via some PostScript tricks. Alternatively, the
                        % graphics could be processed as separate files via
                        % psfrag and dvips, then converted to PDF for
                        % inclusion in the main file which uses pdflatex.
                        % Docs are in "The PSfrag System" by Michael C. Grant
                        % and David Carlisle. There is also some information
                        % about using psfrag in "Using Imported Graphics in
                        % LaTeX2e" by Keith Reckdahl which documents the
                        % graphicx package (see above). The psfrag package
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/psfrag/

\usepackage{subfigure} % Written by Steven Douglas Cochran
                        % This package makes it easy to put subfigures
                        % in your figures. i.e., "figure 1a and 1b"
                        % Docs are in "Using Imported Graphics in LaTeX2e"
                        % by Keith Reckdahl which also documents the graphicx
                        % package (see above). subfigure.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/subfigure/

\usepackage{url}       % Written by Donald Arseneau
                        % Provides better support for handling and breaking
                        % URLs. url.sty is already installed on most LaTeX
                        % systems. The latest version can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/other/misc/
                        % Read the url.sty source comments for usage information.

%\usepackage{stfloats}  % Written by Sigitas Tolusis
                        % Gives LaTeX2e the ability to do double column
                        % floats at the bottom of the page as well as the top.
                        % (e.g., "\begin{figure*}[!b]" is not normally
                        % possible in LaTeX2e). This is an invasive package
                        % which rewrites many portions of the LaTeX2e output
                        % routines. It may not work with other packages that
                        % modify the LaTeX2e output routine and/or with other
                        % versions of LaTeX. The latest version and
                        % documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/contrib/supported/sttools/
                        % Documentation is contained in the stfloats.sty
                        % comments as well as in the presfull.pdf file.
                        % Do not use the stfloats baselinefloat ability as
                        % IEEE does not allow \baselineskip to stretch.
                        % Authors submitting work to the IEEE should note
                        % that IEEE rarely uses double column equations and
                        % that authors should try to avoid such use.
                        % Do not be tempted to use the cuted.sty or
                        % midfloat.sty package (by the same author) as IEEE
                        % does not format its papers in such ways.

\usepackage{amssymb}
\usepackage{amsmath}   % From the American Mathematical Society
                        % A popular package that provides many helpful commands
                        % for dealing with mathematics. Note that the AMSmath
                        % package sets \interdisplaylinepenalty to 10000 thus
                        % preventing page breaks from occurring within multiline
                        % equations. Use:
\interdisplaylinepenalty=2500
                        % after loading amsmath to restore such page breaks
                        % as IEEEtran.cls normally does. amsmath.sty is already
                        % installed on most LaTeX systems. The latest version
                        % and documentation can be obtained at:
                        % http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/



% Other popular packages for formatting tables and equations include:

%\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty which improves the
% LaTeX2e array and tabular environments to provide better appearances and
% additional user controls. array.sty is already installed on most systems.
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/

% Mark Wooding's extremely powerful MDW tools, especially mdwmath.sty and
% mdwtab.sty which are used to format equations and tables, respectively.
% The MDWtools set is already installed on most LaTeX systems. The lastest
% version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/mdwtools/


% V1.6 of IEEEtran contains the IEEEeqnarray family of commands that can
% be used to generate multiline equations as well as matrices, tables, etc.


% Also of notable interest:

% Scott Pakin's eqparbox package for creating (automatically sized) equal
% width boxes. Available:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/eqparbox/



% Notes on hyperref:
% IEEEtran.cls attempts to be compliant with the hyperref package, written
% by Heiko Oberdiek and Sebastian Rahtz, which provides hyperlinks within
% a document as well as an index for PDF files (produced via pdflatex).
% However, it is a tad difficult to properly interface LaTeX classes and
% packages with this (necessarily) complex and invasive package. It is
% recommended that hyperref not be used for work that is to be submitted
% to the IEEE. Users who wish to use hyperref *must* ensure that their
% hyperref version is 6.72u or later *and* IEEEtran.cls is version 1.6b
% or later. The latest version of hyperref can be obtained at:
%
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/hyperref/
%
% Also, be aware that cite.sty (as of version 3.9, 11/2001) and hyperref.sty
% (as of version 6.72t, 2002/07/25) do not work optimally together.
% To mediate the differences between these two packages, IEEEtran.cls, as
% of v1.6b, predefines a command that fools hyperref into thinking that
% the natbib package is being used - causing it not to modify the existing
% citation commands, and allowing cite.sty to operate as normal. However,
% as a result, citation numbers will not be hyperlinked. Another side effect
% of this approach is that the natbib.sty package will not properly load
% under IEEEtran.cls. However, current versions of natbib are not capable
% of compressing and sorting citation numbers in IEEE's style - so this
% should not be an issue. If, for some strange reason, the user wants to
% load natbib.sty under IEEEtran.cls, the following code must be placed
% before natbib.sty can be loaded:
%
% \makeatletter
% \let\NAT@parse\undefined
% \makeatother
%
% Hyperref should be loaded differently depending on whether pdflatex
% or traditional latex is being used:
%
%\ifx\pdfoutput\undefined
%\usepackage[hypertex]{hyperref}
%\else
%\usepackage[pdftex,hypertexnames=false]{hyperref}
%\fi
%
% Pdflatex produces superior hyperref results and is the recommended
% compiler for such use.



% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex).         ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.


% correct bad hyphenation here
%\hyphenation{op-tical net-works semi-conduc-tor}

\usepackage{pifont}
\begin{document}
%
% paper title
\title{Data Mining Static Code Attributes\\ to Learn Defect Predictors}
%
%
% author names and IEEE memberships
% note positions of commas and nonbreaking spaces ( ~ ) LaTeX will not break
% a structure at a ~ so this keeps an author's name from being broken across
% two lines.
% use \thanks{} to gain access to the first footnote area
% a separate \thanks must be used for each paragraph as LaTeX2e's \thanks
% was not built to handle multiple paragraphs
\author{Tim~Menzies,~\IEEEmembership{Member,~IEEE}% <-this % stops a space
\thanks{Dr. Menzies is with the Lane Department of Computer
Science, West Virginia University and can be reached at \protect\url{tim@menzies.us}},
Jeremy Greenwald\thanks{Jeremy Greenwald is with
Computer Science, Portland State University and can be reached
at \protect\url{jegreen@cecs.pdx.edu}.},
Art Frank\thanks{Art Frank is an undergraduate student
at
Computer Science, Portland State University,
and can be reached at \protect\url{arf@cs.pdx.edu}.}
\thanks{The research described in this paper
was carried out at West Virginia and Portland State University under
contracts and sub-contracts with NASA's Software Assurance Research Program
and the Galaxy Global Corporation, Fairmont West Virginia.
Reference herein to any specific commercial product, process, or service
by trademark, manufacturer, or otherwise, does not constitute or imply
its endorsement by the United States Government.
}\thanks{Manuscript received January 1, 2006; revised August 5, 2006.
Download an earlier draft from
\protect\url{http://menzies.us/pdf/06learnPredict.pdf}.
}}% <-this % stops a space
% note the % following the last \IEEEmembership and also the first \thanks -
% these prevent an unwanted space from occurring between the last author name
% and the end of the author line. i.e., if you had this:
%
% \author{....lastname \thanks{...} \thanks{...} }
%                     ^------------^------------^----Do not want these spaces!
%
% a space would be appended to the last name and could cause every name on that
% line to be shifted left slightly. This is one of those "LaTeX things". For
% instance, "A\textbf{} \textbf{}B" will typeset as "A B" not "AB". If you want
% "AB" then you have to do: "A\textbf{}\textbf{}B"
% \thanks is no different in this regard, so shield the last } of each \thanks
% that ends a line with a % and do not let a space in before the next \thanks.
% Spaces after \IEEEmembership other than the last one are OK (and needed) as
% you are supposed to have spaces between the names. For what it is worth,
% this is a minor point as most people would not even notice if the said evil
% space somehow managed to creep in.
%
% The paper headers
\markboth{IEEE Transactions on Software Engineering,~Vol.~W, No.~X,~YYY~200Z}{Menzies, Greenwald, Frank:
Learning Defect Detectors}
% The only time the second header will appear is for the odd numbered pages
% after the title page when using the twoside option.
%
% *** Note that you probably will NOT want to include the author's name in ***
% *** the headers of peer review papers.                                   ***

% If you want to put a publisher's ID mark on the page
% (can leave text blank if you just want to see how the
% text height on the first page will be reduced by IEEE)
%\pubid{0000--0000/00\$00.00~\copyright~2002 IEEE}

% use only for invited papers
%\specialpapernotice{(Invited Paper)}

% make the title area
\maketitle


\begin{abstract}
The value of using static code attributes to learn defect predictors has
been widely debated.  Prior work has explored
issues like the merits of
``McCabes vs Halstead vs lines of code counts'' for
generating defect predictors.  
We show here that
such debates are irrelevant since {\em how} the attributes are used
to build predictors is much more important than {\em which} particular
attributes are used.
Also, contrary to prior pessimism, we show that such defect
predictors are demonstrably useful
and, on the data studied here, yield predictors
with a mean probability of detection of 71\% and mean false alarms
rates of 25\%.  These predictors 
would be useful for  prioritizing
a resource-bound exploration of code that has yet to be inspected.  
\end{abstract}

% Note that keywords are not normally used for peer review papers.

% For peer review papers, you can put extra information on the cover
% page as needed:
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
%
% For peer review papers, inserts a page break and creates the second title.
% Will be ignored for other modes.
%\IEEEpeerreviewmaketitle



\section{Introduction}
% The very first letter is a 2 line initial drop letter followed
% by the rest of the first word in caps.
%
% form to use if the first word consists of a single letter:
% \PARstart{A}{demo} file is ....
%
% form to use if you need the single drop letter followed by
% normal text (unknown if ever used by IEEE):
% \PARstart{A}{}demo file is ....
%
% Some journals put the first two words in caps:
% \PARstart{T}{his demo} file is ....
%
% Here we have the typical use of a "T" for an initial drop letter
% and "HIS" in caps to complete the first word.


Given recent research in artificial intelligence, it is now practical
to 
use {\em data miners} to automatically learn predictors for software quality.
When budget does not allow for  complete testing of an entire system,
software managers can use such predictors to 
focus the testing on parts of the system that seem defect-prone.
These potential defect-prone trouble spots can then
be examined in more detail by, say, model checking, intensive testing, etc.

The value of static code attributes as defect predictors has
been widely debated. 
Some researchers endorse 
them 
\mbox{(\cite{halstead77,mccabe76,chapman02,me04g,polyspace,nagappan05,hall00,nikora03,conf/icse/NagappanB05a,khoshgoftaar01,conf/ictai/TangK04,journals/ese/KhoshgoftaarS03,me03a,me02e,me03k,me03q,porter90,tiang95,khoshgoftaar99,srinivasan95})}
 while others  vehemently oppose
them~\cite{fenton00b,shepperd94}.


Prior studies may have reached different conclusions because they
were based on different data.  This potential conflation can now be removed since
it is now possible to define a {\em baseline experiment} using
public-domain data sets\footnote{
\protect\url{http://mdp.ivv.nasa.gov} and  
\protect\url{http://promise.site.uottawa.ca/SERepository}}  
which different researchers can use to
compare their techniques.

This paper  defines and 
motivates such a baseline. 
The baseline {\em definition} draws from standard practices in the
data mining community~\cite{witten05,hall03}.
To {\em motivate} others to use our definition of a baseline experiment,
we must demonstrate that it can yield interesting results.  
The baseline
experiment of this article shows that the rule-based or decision-tree
learning methods used in prior work~\cite{me04g,me03a,me03e,me03k,me03q} 
are clearly out-performed by  a {\em N{\aive} Bayes} data miner with a
{\em log-filtering} pre-processor on the numeric data (the terms
in italics are defined later in this paper). 

Further, the experiment can  explain {\em why} our preferred Bayesian method
performs best. That explanation is quite technical and comes from
information theory. In this introduction, we need only say that
the space of ``best'' predictors is ``brittle'';
i.e.
minor changes in the data (such as a slightly different sample used
to learn a predictor) can make different attributes appear most
useful for defect prediction.

This brittleness result offers a new insight on prior work.
Prior results about  defect predictors
were so contradictory since they were drawn from a large space of
competing conclusions with similar, but distinct properties.  
Different studies could conclude that, say, lines of code are
better/worse predictor for defects than the McCabes complexity
attribute, just because of small variations to the data.  
Bayesian
methods  smooth over the brittleness problem by
polling numerous Gaussian approximations to the numerics distributions.
Hence,  Bayesian methods do not get confused by minor details about
candidate predictors.

Our conclusion is that, contrary to prior pessimism~\cite{fenton00b,shepperd94}, data
mining  static code attributes to learn defect predictors is useful.
Given our new results on N{\aive} Bayes and log-filtering, these
predictors are  much better than previously demonstrated.  
Also, prior contradictory results  
on the merits of defect predictors can be explained in terms of the brittleness
of the space of 
``best'' predictors.
Further, our baseline experiment clearly shows that it is a mis-directed discussion to
debate (e.g.) ``lines of code vs McCabe'' for predicting defects. As we shall see,
 {\em the choice of learning method} is far more
 important than {\em which subset of the available data} is used for
 learning.  

\section{Background}

For this study, we learn defect predictors from  static code
attributes defined by McCabe~\cite{mccabe76} and
Halstead~\cite{halstead77}. McCabe (and Halstead) are ``module''-based
metrics where a module is the smallest unit of functionality\footnote{
In other languages, modules may be called ``function'' or
``method''.}.  We study defect predictors learned from
static code attributes since they are {\em useful}, {\em easy to
use}, and {\em widely-used}.

{\em Useful}: 
This paper  finds defect predictors
with a probability of detection of 71\%.
This is markedly
higher than other currently-used
industrial
methods such as manual code reviews:
\bi
\item
A panel at {\em IEEE Metrics
2002}~\cite{shu02} concluded that manual software  reviews can find ${\approx}60\%$ 
of defects\footnote{That panel supported neither Fagan
claim~\cite{fagan86} that inspections can find 95\% of defects before
testing or Shull's claim that specialized directed inspection methods
can catch 35\% more defects than other
methods~\cite{shull00a}.}  
\item
Raffo found that the defect detection capability of
industrial review methods can vary from \mbox{$pd=TR(35, 50,
65)\%$}\footnote{ $TR(a,b,c)$ is a triangular distribution with
min/mode/max of $a,b,c$.}.
 for full Fagan inspections~\cite{fagan76} to
\mbox{$pd=TR(13, 21, 30)\%$} for less-structured inspections.
\ei

{\em Easy to use}: 
static code attributes like lines of code
and  the McCabe/Halstead
attributes can be automatically and cheaply collected, even for very large systems~\cite{nagappan05}.
By contrast, 
other methods such as manual code reviews are labor-intensive.
Depending on the review methods, 8 to 20 LOC/minute can be
inspected and this effort repeats for all members of the review team,
which can be as large as four or six~\cite{me02f}. 


{\em Widely used}:
Many researchers use static attributes to guide software 
quality predictions  
(see \cite{halstead77,mccabe76,chapman02,me04g,polyspace,nagappan05,hall00,nikora03,conf/icse/NagappanB05a,khoshgoftaar01,conf/ictai/TangK04,journals/ese/KhoshgoftaarS03,me03a,me02e,me03k,me03q,porter90,tiang95,khoshgoftaar99,srinivasan95}).
Verification and validation (V\&V) textbooks
(\cite{rakitin01}) advise using static code complexity attributes
to decide which modules are worthy of manual inspections.  
For several  years, the first author worked on-site at the NASA software Independent Verification
and Validation facility
and he
knows of several large government software contractors that won't
review software modules {\em unless} tools like McCabe predict that
they are fault prone.  

Nevertheless, static code attributes are hardly a complete characterization of
the internals of a function. Fenton offers an insightful example where
{\em the same} functionality is achieved using {\em different}
programming language constructs resulting in {\em different} static
measurements for that module~\cite{fenton97}.  
Fenton uses this example to argue the uselessness
of static code attributes. 

An {\em alternative
interpretation} of Fenton's example 
is that static attributes can never 
be 
a certain indicator of the
presence of a  fault.  Nevertheless, they are useful
as probabilistic statements that the frequency of faults tends
to increase in code modules that trigger the predictor. 

Shepperd~\&~Ince and Fenton~\&~Pfleeger might
reject the
{\em alternative interpretation}.
They present 
empirical evidence  that  the McCabe
static attributes offer
nothing more than
uninformative attributes such as lines of code.
Fenton~\&~Pfleeger  
note that the main McCabe's attribute
(cyclomatic complexity, or $v(g)$) 
is highly correlated with lines of code~\cite{fenton97}.
Also, Shepperd~\&~Ince remarks that ``for a large class of software it
(cyclomatic complexity) is no more than a proxy for, and in many cases
outperformed by, lines of code''~\cite{shepperd94}.

If 
Shepperd \& Ince and Fenton \& Pfleeger
are right, then:
\bi
\item 
The supposedly better static code attributes such as Halstead and McCabes should
perform no better than just simple thresholds on lines of code. 
\item 
The performance of a predictor learned by a data miner should be very poor;
\ei

Neither of these are true, at least for the data sets used in this study.
Our experimental
method seeks the ``best'' subsets of the available
attributes that  are most useful for predicting defects. We will show  that 
the best size for the ``best'' set is larger than  one; i.e. 
predictors based on single lines of code
counts do not perform as well as other methods.

\begin{figure}[!t]
\begin{center}
{\footnotesize
\begin{tabular}{r|rr|}\cline{2-3}
\multicolumn{1}{c|}{}&\multicolumn{2}{c|}{probability of}\\\hline
data&	detection&	false alarm\\\hline
pima diabetes&	60&	19\\
sonar	&	71	&29\\
horse-colic	&71	&7\\
heart-statlog &	73	&21\\
rangeseg	&76	&30\\
credit rating	&88&	16\\
sick	&88&	1\\
hepatitis	&94	&56\\
vote	&95	&3\\
ionosphere	&96	&18\\\hline
mean &81	& 20\\\cline{2-3}
\end{tabular}
}
\end{center}
\caption{Some representative $pd$s and $pf$s for prediction problems
 from the UC Irvine machine learning
database~\protect\cite{Blake+Merz:1998}. These values were generated
using the
standard
settings of a state-of-the-art decision tree learner (J48). For each data set, ten
experiments were conducted where a decision tree was learned on 90\% of
the data, then tested on the remaining 10\%. 
The numbers shown here are the average results across ten such experiments.}\label{fig:uci}
\end{figure}
\begin{figure}[!t]
\begin{center}
{\footnotesize
\begin{tabular}{r|rr|}\cline{2-3}
\multicolumn{1}{c|}{}&\multicolumn{2}{c|}{probability of}\\\hline
data&	detection&	false alarm\\\hline
pc1	&24	&25\\
jm1 &	25	&18\\
cm1&	35	&10  \\
kc2&	45&	15\\
kc1&	50	&15\\\hline
mean &36	& 17\\\cline{2-3}
\end{tabular}
}
\end{center}
\caption{Prior results of learning defect predictors. 
From \protect\cite{me04g}.}\label{fig:before}
\end{figure}


Also, the predictors learned from those
``best'' sets perform surprisingly well.
Formally, learning a defect predictor is a {\em binary prediction problem} where each
module in a database has been labeled ``defect-free'' or ``defective''.
The learning problem is to build some predictor which guesses the labels
for as-yet-unseen modules. Using the methods described below, this paper offers 
new
defect predictors with
a probability of detection (pd) and probability of false alarm (pf) of
\[mean(pd,pf)=(71\%,25\%)\]
\fig{uci} lets us compare our new results against
standard binary prediction results from the UC Irvine machine learning
repository of standard test sets for data miners~\protect\cite{Blake+Merz:1998}.
Our new results of
($pd$,$pf$)=(71\%,25\%) are close to
the standard results of
($pd$,$pf$)=(81\%,20\%),
which is noteworthy in four ways:
\be
\item It is unexpected. If static code attributes capture so little
about source code (as argued by 
Shepperd, Ince, Fenton and Pfleeger), then we would expect lower probabilities
of detection and much higher false alarm rates. 
\item
These new ($pd$,$pf$) figures are much larger than any of our prior results
of $mean(pd,pf)=(36\%,17\%)$~\cite{me04g} (see \fig{before}).
Despite much experimentation~\cite{me02e,me03a},
the only way we could  achieve a $pd>70\%$ was to accept a 50\% false
alarm rate.
\item
These new results of $mean(pd)=71\%$ are better than currently
used industrial methods such as the $pd{\approx}60\%$ reported
at the 2002 IEEE Metrics panel or the $median(pd)=21..50$ reported
by Raffo.
\item
There is still considerable room for improvement such as lower $pf$s and higher $pd$s. 
We are actively researching better code metrics which, potentially, will yield ``better''
predictors. 
\ee

This last point motivates much of this paper. Before we can demonstrate
``better'', we need to define ``better than what?''. That is, improvement can only be measured against a well-defined
baseline result. That baseline needs
to be repeatable and based on public-domain data sets.
Further, the basis for comparatively assessing different data mining methods should be
well-justified and well-specified so that others
can repeat, improve, or refute prior results.
Hence, much of the rest of this paper is devoted to a meticulous description of our experimental method.

The baseline experiment was selected in response to certain shortcomings in other work.
For example, 
Nagappan and Ball~\cite[p6]{nagappan05}
report accuracies of 82.91\% for their defect predictor.
Accuracy measures the number of times the predicted class of a module (defect-free or defective)
is the same as the actual class. 
These accuracy values were found in a {\em self-test}; i.e. the  learned predictor 
was applied to the data used to train it. 
In our study, we use neither accuracy nor self-tests:
\bi
\item
When the target class (defect-free or defective) is in the minority,
accuracy is a poor measure of a learner's performance.
For example, a learner could score 90\% 
accuracy on a data set with 10\% defective modules, even if it predicts that all defective modules
were defect-free. 
\item
Self-tests are deprecated by the data mining community
since  such self-tests can grossly over-estimate performance~\cite{witten05}. 
If the goal is
to understand how well a defect predictor will work on future projects,
it is best to assess the predictor via {\em
hold out} modules not used in
the generation of that predictor.
\ei
Hence, for this study, we use attributes other than accuracy 
including $pd$, $pf$ and several others defined below.
Also, our learned predictors will be assessed using {\em hold out} modules.
\begin{figure*}
{\footnotesize
\begin{center}
\begin{tabular}{r|l|l|rrr}
system &language &sub-system data set & total LOC &  \# modules & \% defective \\\hline
spacecraft instrument&C&cm1-05& 17K & 506 & 9 \\\hline
storage management for  ground data&Java&kc3& 8K& 459 & 9 \\\cline{3-6}
    &&kc4& 25K& 126 & 49\\\hline
Db   &C&mw1& 8K & 404 & 7 \\\hline
Flight software for earth orbiting satellite&C&pc1-05&26K& 1,108 & 6 \\\cline{3-6}
&&pc2& 25K & 5,590  & 0.4 \\\cline{3-6}
&&pc3& 36K  & 1,564  & 10 \\\cline{3-6}
&&pc4&30K & 1,458&12  \\\hline
\end{tabular}
\end{center}
}
\caption{Data sets used in this study. The datasets cm1-05 and pc1-05 update
data sets $cm1$ and $pc1$ processed previously by the authors~\protect\cite{me03k}.}\label{fig:datasets}
\end{figure*}


\section{Threats to Validity}

Like any empirical data
mining paper, our conclusions are biased according to what
data was used to generate them.
Issues of {\em sampling bias} threaten any data mining
experiment; i.e. what matters {\em there} may not be true {\em here}.
For example, the sample used here comes from NASA, which works in a
unique market niche.

Nevertheless, we argue that results from NASA are relevant to the
general software engineering industry.  NASA makes extensive use of
contractors who are contractually obliged (ISO-9001) to demonstrate
their understanding and usage of current industrial best
practices. These contractors service many other industries;
for example, Rockwell-Collins builds systems for many government and
commercial organizations.  For these reasons, other noted researchers
such as Basili, Zelkowitz, et al.~\cite{basili02} have argued that
conclusions from NASA data are relevant to the general software
engineering industry.

All inductive generalization suffers from a sampling bias.
The best we can do is define our methods and publicize our data such
that other researchers can try to repeat our results and, perhaps,
point out a previously unknown bias in our analysis.
Hopefully, other researchers will
emulate our methods in order to repeat, refute or improve our results. We
would encourage such researchers to offer not just their conclusions,
but the data used to generate those conclusions.  The MDP (a
repository for NASA data sets) and
the PROMISE
code repository
are places to
store and discuss software engineering data sets from other
organizations.

Another source of bias in this study is the set of learners explored by this
study.  Data mining is a large and active field and any single study
can only use a small subset of the known data mining algorithms.
For example, neural networks~\cite{rumel86} and genetic
algorithms~\cite{goldberg00} were not used for this study as
they can be very slow.  The experiment described in this paper took
weeks to debug and a full day to run once debugged.  We were therefore
not motivated to explore other, slower, learners but would encourage
other researchers with access to super-computers or a large CPU-farm to do so.


%\section{Hypotheses}
%
%This study 
%comments on
%eleven hypotheses relating to the merits of defect predictors learned from
%static measures.
%
%\newtheorem{hypo}{Hypothesis}
%
%%Numerous researchers have claimed the
%%measures extracted from source code can be used to predict which new
%%modules will be defective.
%%Initially, intra-module\footnote{Here, ``module'' is a synonym for
%%the smallest compilable unit in a language; e.g. a ``C''
%%function or a PASCAL procedures or a JAVA
%%method.} measures were
%%proposed to predict detects.  Halstead and McCabe argued that the
%%complexity of reading a single module~\cite{halstead77} or the
%%pathways through a module~\cite{mccabe76} were predictors for module
%%defects. Tacitly,  Halstead and McCabe were arguing:
%%\begin{hypo}\label{hyp:one}
%%Static code measures predict for defects.
%%\end{hypo}
%%
%%\begin{hypo}\label{hyp:two}
%%There are better intra-module measures than lines of code for predicting for defects.
%%\end{hypo}
%%
%%
%%%XXX 10 to 11
%%
%%
%%This study will explore numerous other hypotheses.
%%For example, since Halstead published after McCabe, we can
%%assume that Halstead was arguing:
%%\begin{hypo}\label{hyp:three}
%%The Halstead measures are better than the McCabe measures
%% building defect predictors.
%%\end{hypo}
%%
%%Using his base measures, Halstead defined a set of {\em derived attributes}
%%that, supposedly, describe the complexity of the reading source code. Halstead assumes that:
%%\begin{hypo}\label{hyp:thirteen}
%%There is  added value in
%%the derived Halstead attributes.
%%\end{hypo}
%%
%%%XXX 12 to 11
%%% XXX bias: sample and balance
%%
%
%%XXX stress "learning magi" in the intro
%
%A common use of McCabe's is to use his preferred
% rule ($v(g)>10 \vee iv(g)>4$) as a defect predictor. This assumes:
%\begin{hypo}\label{hyp:four}
%Disjunctions of single attribute thresholds suffice for defect prediction.
%\end{hypo}
%
%Other researchers have challenged this assumption, preferring instead to build
%predictors using decision tree learners~\cite{porter90,tiang95,srinivasan95}.
%Each branch of a decision tree is a conjunction
%of tests on attribute thresholds.
%All the branches form one large disjunction (i.e. this branch OR that branch OR that branch OR\ldots).
%Proponents of decision trees assume:
%\begin{hypo}\label{hyp:six}
%Defect predictions are best made by disjunctions of conjunctions of attribute thresholds.
%\end{hypo}
%
%Elsewhere, we have used N{\aive} Bayes classifiers as a method for learning defect predictors~\cite{me04g}.
%Such  classifiers make no use of disjunctions or conjunctions. Rather, they compare
%the product of probabilities
%that a new example falls into one class or another. Proponents of N{\aive} Bayes classifiers assume:
%\begin{hypo}\label{hyp:seven}
%Defect predictions are best made by products of probabilities.
%\end{hypo}
%
%
%%Decision tree learning and N{\aive} Bayes classifiers are two different kinds of learners.
%%There any many more. Khoshgoftaar, for example, has explored a
%%very wide range of learning methods such as to Poisson
%%regression~\cite{khoshgoftaar01}, to genetic programming to k-means
%%clustering~\cite{conf/ictai/TangK04}, as well as many
%%others~\cite{journals/ese/KhoshgoftaarS03}.
%%In our own previous work, for a while, we advocated the use
%%of various learners including a minimal set contrast learners~\cite{me02e}
%%and a
%%minimalist discretization  policy called ROCKY~\cite{me03a}. Like Khoshgoftaar, our experimentation
%%with new learners was driven by the  assumption
%%that:
%%\begin{hypo}\label{hyp:eight}
%%Learning defect predictors is a hard problem requiring intricate solutions.
%%\end{hypo}
%%
%
%
%\begin{figure}
%\begin{center}
%\includegraphics[width=1.7in]{loc.pdf}
%\includegraphics[width=1.7in]{lnloc.pdf}
%\end{center}
%\caption{A McCabe's metric from $cm1$:  raw values on left, log-filtered on right.}\label{fig:rawlog}
%\end{figure}
%Elsewhere, we have conducted limited experiments suggesting that a
%{\em logarithmic filter} on all numeric values might improve predictor
%performance~\cite{me02e}. \fig{rawlog}, on the left-hand-side, shows
%some measures collected from the MDP {\em cm1} data set. These values
%form a exponential distribution with many small values and a few much
%larger values. The right-hand-side of that figure shows the same values,
%after being filtered through a logarithmic function. Note how the
%values are now more evenly spread across the y-range, making it easier
%to reason about them. \fig{rawlog} suggests that:
%\begin{hypo}\label{hyp:ten}
%Log-filtering of all numerics can improve predictor performance.
%\end{hypo}
%
%All the
%MDP datasets contain miscellaneous attributes
%describing data that was easily available at the time of
%collection. Several of these miscellaneous variables have no
%proponents in the literature, yet are included under the assumption
%that:
%\begin{hypo}\label{hyp:eleven}
%The more information, the better the learned predictors.
%\end{hypo}
%
%The MDP data sets only refer to simple intra-module measures.
%Much newer research has focused on {\em what else} can be gleamed
%from source code.
%For example,
%the PolySpace Verifier~\cite{polyspace} checks for a variety of common
%program errors such as out of bounds array index and uninitialized
%variables.
%Other tools don't just look at the current version of the source code,
%but ask what is the {\em rate of change} in that source code.
%Microsoft has a static code analysis tool that predicts defects via
%{\em churn}; i.e. the percent change in the code seen by a version
%control system~\cite{nagappan05}. Similarly, Hall, Nikora, and
%Munson~\cite{hall00,nikora03} use uses churn to characterize
%problematic software evolution paths.
%Beyond mere static code measures are other automatic tools which, after decades
%of research, are becoming practical for industrial applications such as
%runtime monitoring~\cite{have00b},
%and model checking~\cite{clarke99}, and automatic theorem proving~\cite{Rushby:Tutorial}.
%Other promising avenues include Fenton and Neil's work on Bayesian modeling
%where data collection and analysis is guided by an underlying causal model of factors
%that influence software production~\cite{fenton99}.
%These tools assume:
%\begin{hypo}\label{hyp:11}
%There are better ways than intra-module measures to find module defects.
%\end{hypo}
%
%Sadly, using
%these supposedly better methods can consume resources
%exponentially. For example:
%\bi
%\item {\em Black box probing:}
%A {\em linear} increase
%in the confidence $C$ that we have found all defects
%can take {\em exponentially} more effort.
%For example, for one-in-a-thousand detects,
%moving $C$ from
%90\% to 94\% to 98\% takes 2301, 2812, and 3910 black box
%probes (respectively)\footnote{A randomly selected
%input to a program will find a fault with probability $x$.
%After $N$ random black-box tests, the chances of the inputs
%not revealing any fault
%is $(1-x)^N$. Hence, the chances $C$ of seeing the fault is $1-(1-x)^N$
%which can be rearranged to
% $N(C,x)=\frac{log(1 -
%C)}{log(1-x)}$. For example, $N(0.90,10^{-3})=2301$~\cite{voas95}.}.
%\item {\em Automatic formal methods:}
%The
% state space explosion problem imposes
%strict limits on how much a system can be explored
%via automatic formal methods~\cite{me00q}.
%\ei
%Lowry et.al.~\cite{lowrey98} and Menzies and Cukic~\cite{me00y}
%offer numerous other examples
%where assessment effectiveness is exponential on effort.
%For example,  the more complicated
%the tool, the better the user needs to be.  This is particularly true
%for tools that require temporal logic constraints such as runtime
%monitoring~\cite{have00b} and model checking~\cite{clarke99}.
%However, it is also true for the more complex static analysis tools
%such as Polyspace~\cite{visser05a}.
%
%Exponential costs quickly exhaust finite resources unless
%they are somehow focused on small portions of the system.
%The thesis of this article is that defect detectors learned from
%static code measures can quickly find potentially problematic
%regions of the system.  These potential trouble spots can then
%be examined in more detail by (e.g.) model checking, intensive testing, etc.
%
%XXXthe more data, the better
%XXXX for every hypothsies, offer a contrary
%XXXwhen acc defined, show how high acc means zip

\section{Data}

An experiment needs three things:
\bi
\item
Data to be processed;
\item
A processing method;
\item
A reporting method;
\ei

This section discusses the data used in this study. Processing via data miners 
and our reporting methods are discussed later.

All our data comes from the MDP.
At the time of this writing, ten
data sets are available in that repository. Two of those data sets have a
different format to the rest and were not used in this study.  
% JG NOTE either the above or the below
This left eight, shown in \fig{datasets}. 
Each record of each data
set describes the attributes of one module,
plus the number of defects known for that module.
This data comes from
eight
sub-systems taken from four systems.  These systems were developed
 in
different geographical locations across North America.
Within a system, the sub-systems
shared a common code base but did not pass personnel or code
between sub-systems. \fig{sizes} shows the module sizes of our data; 
for example, there are 126 modules
in the $kc4$ data set, most of them are under 100 lines of code, but a few of them
are over 1000 lines of code long.

\begin{figure}
\begin{center}
\includegraphics[width=2.5in]{lines.pdf}
\end{center}
\caption{Log-log plot of module sizes in the
\protect\fig{datasets} data.}\label{fig:sizes}
\end{figure}


Each data set was pre-processed by removing the module identifier attribute
(which  is different for each row). Also, the
$error\_count$ column was converted into a boolean attribute called {\em defective?}  as follows:
$defective? = (error\_count \ge 1)$.
Finally, the $error\_density$ column was removed (since it can be
derived from line counts and $error\_count$).
The pre-processed data
sets had 38 attributes, plus one target attribute
({\em defective?})
shown in \fig{attr} and include Halstead, McCabe, lines of code, and other miscellaneous
attributes.

\begin{figure}
\begin{center}
{\footnotesize
\begin{tabular}{l@{~}|l@{~}|l@{~}|c@{~}p{3.9cm}|}\cline{2-5}
%\begin{tabular}{|l|l|lcl|}\hline
%\multicolumn{3}{|c}{grouping}&\multicolumn{2}{|c|}{measure}\\\hline
		&\multicolumn{2}{l}{m = Mccabe }       &$v(g)$  &cyclomatic\_complexity  \\
        &        \multicolumn{2}{l}{~}                                 & $iv(G)$&design\_complexity\\
        &         \multicolumn{2}{l}{~}                                & $ev(G)$&essential\_complexity\\\cline{2-5}
        &locs&{loc}                         &   &loc\_total (one line = one count)\\\cline{3-5}
	&	&loc(other)                      &   &loc\_blank           \\
        &       &                                &   &loc\_code\_and\_comment\\
        &       &                                &   &loc\_comments         \\
        &        &                                &   &loc\_executable\\
        &       &                                &    &number\_of\_lines (opening to closing brackets)\\\cline{2-5}
	&{Halstead} & h& $N_1$ &num\_operators\\
        &       &                                 &$N_2$ &num\_operands \\
        &       &                                 &$\mu_1$ &num\_unique\_operators\\
        &       &                                 &$\mu_2$ &num\_unique\_operands\\\cline{3-5}
        &       &  { H}&$N$  &length: $N= N_1 + N_2$\\
        &       &                                 & $V$  &volume: $V=N * \log_{2}\mu$\\
        &       &                                 & $L$  &level: $L=V^*/V$ where\newline $V^*=(2+\mu{_2}^{*})\log_{2}(2+\mu{_2}^{*})$\\
        &       &                                 & $D$  &difficulty: $D=1/L$\\
        &       &                                 &$I$   &content: $I=\hat{L}*V$ where\newline $\hat{L}=\frac{2}{\mu_{1}} * \frac{\mu_{2}}{N_{2}}$ \\
        &       &                                 & $E$  &effort: $E=V/\hat{L}$\\
        &       &                                 & $B$  &error\_est\\
        &       &                                 & $T$  &prog\_time: $T=E/18$ seconds\\\cline{2-5}
        &       \multicolumn{2}{l}{misc = Miscellaneous}                                  &&branch\_count\\
        &       \multicolumn{2}{l}{~}                                  &&call\_pairs\\
        &       \multicolumn{2}{l}{~}                                  &&condition\_count\\
        &       \multicolumn{2}{l}{~}                                  &&decision\_count\\
        &       \multicolumn{2}{l}{~}                                  &&decision\_density\\
        &       \multicolumn{2}{l}{~}                                  &&design\_density\\
        &       \multicolumn{2}{l}{~}                                  &&edge\_count\\
        &       \multicolumn{2}{l}{~}                                  &&global\_data\_complexity\\
        &       \multicolumn{2}{l}{~}                                  &&global\_data\_density\\
        &       \multicolumn{2}{l}{~}                                  &&maintenance\_severity\\
        &       \multicolumn{2}{l}{~}                                  &&modified\_condition\_count\\
        &       \multicolumn{2}{l}{~}                                  &&multiple\_condition\_count\\
        &       \multicolumn{2}{l}{~}                                  &&node\_count\\
        &       \multicolumn{2}{l}{~}                                  &&normalized\_cyclomatic\_complexity\\
        &       \multicolumn{2}{l}{~}                                  &&parameter\_count\\
        &       \multicolumn{2}{l}{~}                                  &&pathological\_complexity\\
        &       \multicolumn{2}{l}{~}                                  &&percent\_comments\\\cline{2-5}
\end{tabular}}
\end{center}
\caption{Attributes used in this study. }\label{fig:attr}
\end{figure}
% XXX need introduction words saying contribution is definition of baseline experiment

The Halstead attributes were derived by Maurice Halstead in 1977. He
argued that modules
that are hard to read are more likely to be fault
prone~\cite{halstead77}. Halstead estimates reading complexity by
counting the number of operators and operands in a module: see the
$h$ attributes of \fig{attr}.  These four raw $h$ Halstead
attributes were
then used to compute $H$: the eight derived Halstead attributes
using the equations shown in
\fig{attr}.  In between the raw and derived Halstead attributes are
certain intermediaries (which do not appear in the MDP data sets);
 \bi
\item
$\mu= \mu_1 + \mu_2$;
\item
minimum operator count: $\mu_1^*=2$;
\item
$\mu_2^*$  is the minimum operand count and equals the number of module parameters.
\ei

%XXXX conclusion. Halstead and McCabe very odd 1970s. sureyin the 21st cenyru we can do better

An alternative to the Halstead attributes are the complexity attributes proposed by Thomas McCabe in 1976.
Unlike Halstead,
McCabe
 argued that the complexity of
pathways {\em between} module symbols are more insightful than just a count of the symbols~\cite{mccabe76}. 
The first three lines of \fig{attr}
show McCabe's three
main attributes for this pathway complexity. These are defined as follows.
A
module is said to have a {\em flow graph}; i.e. a directed graph where each node corresponds to a
program statement, and each arc indicates the flow of control from
one statement to another.
The {\em cyclomatic complexity} of a module is   $v(G) = e - n +
2$ where $G$ is a program's flow graph, $e$ is the number of arcs in
the flow graph, and $n$ is the number of nodes in the
flow graph~\cite{fenton96}.
The {\em essential complexity} ($ev(G)$) of a module is the extent to which a
flow graph can be ``reduced'' by decomposing all the subflowgraphs
of $G$ that are {\em D-structured primes}
(also sometimes referred to as ``proper one-entry
one-exit subflowgraphs''~\cite{fenton96}). $ev(G) = v(G) - m$ where $m$ is the number of subflowgraphs of
$G$ that are D-structured primes~\cite{fenton96}.
Finally, the
{\em design complexity}   ($iv(G)$) of a module is the cyclomatic complexity of a
module's reduced flow graph.

At the end of \fig{attr} are a set of {\em misc} attributes that are less well-defined than lines of code attributes
or the Halstead and McCabe attributes. 
The meaning of these attributes is 
   poorly documented in the MDP database. Indeed,
   they seem to be values generated from some unknown tool set that
   was available at the time of uploading the data into the MDP.
Since there are difficulties in reproducing these attributes at other sites, an argument could be made
for removing them from this study.
A counter-argument is that if static code attributes are as weak as suggested by Shepherd, Ince, Fenton and Pfleeger
then we should use all possible attributes in order to make maximum use of the available information.
This study took a middle ground: all these attributes were passed to the learners and they determined
which ones had most information.

\begin{figure}
\begin{center}
\includegraphics[width=1.7in]{loc.pdf}
\includegraphics[width=1.7in]{lnloc.pdf}
\end{center}
\caption{A McCabe's metric from $cm1$:  raw values on left, log-filtered on right.}\label{fig:rawlog}
\end{figure}
An interesting repeated pattern in our data sets are {\em exponential distributions}
in the numeric attributes.
For example,
\fig{rawlog}, on the left-hand-side, shows
the sorted McCabe $v(g)$ attributes from {\em cm1}. These values
form an exponential distribution with many small values and a few much
larger values. 
Elsewhere, we have conducted limited experiments suggesting that a
{\em logarithmic filter} on all numeric values might improve predictor
performance~\cite{me02e}.  
Such a filter replaces all numerics $n$ with their logarithm,
$\ln(n)$.
The effects of such a filter are shown on the right-hand-side of \fig{rawlog}:
the
log-filtered values are now more evenly spread across the y-range, making it easier
to reason about them. 
To test the value of log-filtering, all the data was passed through one of two filters:
\be
\item {\em none}; i.e. no change;
\item {\em logNums}; i.e. logarithmic filtering.
To avoid numerical errors with $\ln(0)$,
all numbers under 0.000001 are replaced with $\ln(0.000001)$.
\ee



\section{Learners}

The above data was passed to three learners from the
WEKA data mining toolkit~\cite{witten05}: OneR, J48, and N{\aive}
Bayes\footnote{\protect\url{http://www.cs.waikato.ac.nz/~ml/weka}.}.
This section describes those learners.


Holte's OneR algorithm~\cite{holte93}
builds prediction rules using one or more
values from a single attribute. 
For example, OneR executing on the {\em kc4} data set can return:

{\footnotesize\begin{verbatim}
EDGE_COUNT:
	< 2.99	-> defect-free
	>= 2.99	-> defective
\end{verbatim}}
\noindent
which may be read as follows: ``a module is defect-free if its edge count is less than 2.99''.

OneR was chosen to test the value of predictors based on {\em simple thresholds on single attributes}.
For an example of such a simple threshold,
recall that McCabe recommends inspecting modules that satisfy:
$v(g)>10$ or $iv(g)>4$.  Several other example thresholds exist in defect prediction literature:
\bi
\item
Chapman and Solomon
advocate predicting defects using
$v(g)>20$ or $ev(g)>8$~\cite{chapman02}; 
\item
In early work we advocated
$ev(g)>7$ or $\mbox{lines of code} > 118$~\cite{me02e} (based on this study, we now reject that old advice).
\ei
OneR can only return simple thresholds on single attributes. If predictors built by OneR were as good as any other,
then that would support the use of simple thresholds such as those advocated by McCabe, Chapman, {\em et al.}.

One way to view OneR's defect predictions rules is a decision tree
of maximum depth one whose leaves are either the label {\em defective}
or {\em defect-free}. The J48 learner builds decision trees of any depth. For example, J48 
executing on the {\em kc4} data set can return:

{\footnotesize\begin{verbatim}
CALL_PAIRS <= 0: defect-free 
CALL_PAIRS > 0
|  NUMBER_OF_LINES <= 3.12: defect-free 
|  NUMBER_OF_LINES > 3.12
|  |  NORMALIZED_CYLOMATIC_COMPLEXITY <= 0.02
|  |  |   NODE_COUNT <= 3.47: defective 
|  |  |   NODE_COUNT > 3.47: defect-free 
|  |  NORMALIZED_CYLOMATIC_COMPLEXITY>0.02: defective 
\end{verbatim}}
\noindent
which may be read as follows: ``a module is defective if it has non-zero call-pairs and has more than
3.12 lines and does not have a low normalized cyclomatic complexity (0.02), or it has a low normalized
cyclomatic complexity and a low node-count (up to 3.47)''. 

Note that J48 predictors can be more
complex and explore more special cases than OneR. J48's predictors would out-perform OneR if
defect prediction required such elaboration.

J48 is a JAVA implementation of Quinlan's C4.5 (version 8)
algorithm~\cite{quinlan92}. The algorithm recursively {\em splits}
a data set according to tests on attribute values in order to
separate the possible predictions (although attribute tests are chosen one at a
time in a greedy manner, they are dependent on results of previous
tests). C4.5/J48 uses information theory to assess candidate splits:
the {\em best split} is the one that {\em most simplifies} the target
concept.  
Concept simplicity is measured using information theory.
Suppose a data set has 80\% defect-free modules and  20\% defective modules.
Then that data set has a class distribution $C_0$ with classes $c(1)=defect-free$ and $c(2)=defective$
with frequencies $n(1)=0.8$ and $n(2)=0.2$.
The
number of bits required to encode an arbitrary class distribution $C_0$ is $H(C_0)$ defined as follows:
\begin{equation}\label{eq:ent}{
\left.\begin{array}{rcl}
N&=&\sum_{c{\in}C}n(c)\\
p(c)& =& n(c)/N \\
H(C)&=& -{\sum}_{c{\in}C} p(c)\log_2 p(c)
\end{array}\right\}
}\end{equation}
A split divides $C_0$ (before the split) into $C_1$ and $C_2$
(after the split). The best split leads to the simplest concepts;
i.e. maximize $H(C_0) - (H(C_1) + H(C_2))$.

Another way to build defect predictors is to use a
N{\aive} Bayes data miner.
Such  classifiers are based on Bayes' Theorem. 
Informally, the
theorem says $next=old*new$ i.e. what we'll believe {\em next} comes
from how {\em new} evidence affects {\em old} beliefs.  More formally:
\[
P(H|E)=\frac{P(H)}{P(E)}\prod_iP(E_i|H)
\]
i.e. given fragments of evidence $E_i$ and a prior probability for
a class $P(H)$, the theorem lets us calculate a  posteriori  probability
$P(H|E)$. 
When building defect detectors,
the
posterior probability of each class (``defective'' or ``defect-free'')
is calculated, given the attributes
extracted from a 
module such as the
lines of code, the McCabe attributes, the Halstead attributes, etc.
The module is assigned to the class
with the highest probability.  This is straightforward processing
and involves simply estimating the probability of attribute measurements
within the historical modules.
Simple frequency counts
are used to estimate the probability of discrete attribute values. For
numeric attributes it is common practice to use the probability
density function for a normal distribution~\cite{witten05}:
\[f(x) =
\frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\] where
$\{\mu,\sigma\}$ are the attribute \{mean,standard
deviation\}.  To be precise, the probability of a
continuous attribute being a particular continuous value $x$ is zero,
but the probability that it lies within a small region, say $x \pm
\epsilon/2$, is $\epsilon \times f(x)$.  Since $\epsilon$ is a
constant that weighs across all possibilities, it cancels out and
need not be computed. 

The above learning technology can be used to generate defect predictors from
data {\em or} to assess the value of different portions of the
data. Various  {\em attribute subset  selection}
algorithms~\cite{hall03} (hereafter, {\em subsetting}) find what attributes can be deleted, without
damaging the performance of the learned predictor.
Subsetting can be
used independently of the learning technique of choice, as a
general method for data reduction.

The simplest and fastest
subsetting
method is to rank attributes from the most
informative to least informative. After discretizing numeric 
data\footnote{E.g. given an attribute's
minimum and maximum values, replace a particular value $n$ with $(n - min)/((max - min)/10)$.
For more on discretization, see~\cite{dou95}.}
then if $A$ is a set of attributes, the number of bits required to encode
a class after observing an attribute is:
\[H(C|A) = -{\sum}_{a{\in}A} p(a) {\sum}_{c{\in}C} p(c|a)\log_2(p(c|a))\]
The highest ranked
attribute $A_i$ is the one with the largest {\em information gain};
i.e.\ the one that most
reduces the encoding required for the data {\em after}
using that attribute; i.e.
\begin{equation}\label{eq:infogain}
InfoGain(A_i) = H(C) - H(C|A_i)
\end{equation}
where $H(C)$ comes from \eq{ent}.
In {\em iterative InfoGain subsetting}, predictors are  learned
using the  $i=1,2...,N$-th top-ranked attributes.
Subsetting  terminates when $i+1$ attributes perform no better than
$i$.
In {\em exhaustive InfoGain subsetting},
the attributes
are first ranked using iterative subsetting.
Next, predictors are built using
all subsets of the  top $j$ ranked
attributes.
For both iterative and exhaustive subsetting, the process is repeated 10 times
using 90\% of the  data (randomly selected).
Iterative subsetting takes time 
linear on the number of attributes $N$ while exhaustive subsetting takes time
$2^j$ (so it is only practical for small $j\le N$).

% JG NOTE:  what about also being linear in the number of instances?
% JG NOTE:  We don't mention that because we are only 
% JG NOTE:  talking about attribute selection?
% JG NOTE:  Also Iterative subsetting is linear in j too, although j <= N for all cases

\begin{figure}[!t]
{\scriptsize\begin{alltt}
M    = 10
N    = 10
All  = 38 {\em # all the attributes}
DATAS={\em(cm1 kc3 kc4 mw1 pc1 pc2 pc3 pc4)} # data set list
FILTERS={\em (none logNums)}                 # filter list
LEARNERS={\em (oneR j48 nb)}                 # learner list

{\bf for} data {\bf in} DATAS
  {\bf for} filter {\bf in} FILTERS
      data' = filter(data)
      rank data' attributes via InfoGain {\em\# \eq{infogain}}
      {\bf for} i = 1,2,3, All
         attribute' = the i-th highest ranked attributes
         data''     = select attributes' from data' 
        {\bf repeat} M times
            randomized order from data'' 
            generate N bins from data''
           {\bf for} i {\bf in} 1 {\bf to} N
              tests        = bin[i]
              trainingData = data'' - tests
             {\bf for} learner {\bf in} LEARNERS
                METHOD         = (filter attributes' learner)
                predictor      = learner(trainingData)
                RESULT[METHOD] = apply predictor to tests\end{alltt}}
\caption{This study. Data is filtered and the attributes are ranked
using InfoGain. The data is then shuffled into a random order and divided
into 10 bins. A learner is then applied to a {\em training set} built from nine of the bins.
The learned predictor is tested on the remaining bin.
}\label{fig:study}
\end{figure}

\section{Experimental Design}

This study used the
{\em (M=10)*(N=10)-way
cross-evaluation iterative
attribute subset selection} shown in \fig{study}. The study is nearly the same
as the procedure defined in
Hall and Holmes' subsetting experiments~\cite{hall03} (but we have added a data filtering step).  
The data set is divided
into $N$ buckets. For each bucket in a 10-way cross-evaluation, a predictor is
learned on nine of the buckets, then tested on the remaining bucket.

Hall and Holmes advise repeating an N-way study M times,
randomizing the order each time.  Many algorithms exhibit {\em order
effects} where certain orderings dramatically improve or degrade
performance~\cite{fisher92} (insertion sort runs slowest if the inputs are
already sorted in reverse order). Randomizing the order of the inputs
defends against order effects.

These M*N studies implement a {\em hold out} study which,
as argued above, is necessary to properly assess the value of a learned predictor.
Hold out studies assess a learned predictor using data {\em not}
used to generate it.
Such hold out studies
are the 
preferred evaluation method when the
goal  is to produce predictors intended to  predict future 
events~\cite{witten05}.

The 10*10-way study was wrapped inside scripts that explored different
subsets of the attributes in the order suggested by {\em InfoGain}
(\eq{infogain}).
In the inner-most loop of the study, some {\em method} was applied to
some data set. As shown in the third last line of \fig{study}, these
{\em methods} were some combination of {\em filter, attributes',
learner}.

\section{Assessing Performance}

The performance of the learners on the MDP data was assessed using {\em
receiver-operator} (ROC) curves.
Formally,  a defect {\em predictor}   hunts for a {\em signal} that a
software module is defect prone. Signal detection
theory~\cite{heeger98} offers 
ROC curves as an analysis method for assessing different
predictors.
%% ROC curves are widely used in various fields including
%% assessing different clinical computing systems~\cite{adlassnig89} and
%% assessing different machine learning methods~\cite{provost98case}.
%% The central intuition of ROC curves is that different detectors can
%% be assessed via how often they correctly or incorrectly respond to
%% the a signal.
\begin{figure}
\begin{center}
\includegraphics[width=2.7in]{locplatonic.pdf}
\end{center}
\caption{Regions of a typical ROC curve.}\label{fig:typical}
\end{figure}
A typical ROC curve is shown in \fig{typical}. The y-axis shows
probability of detection ($pd$) and the x-axis shows probability of false alarms ($pf$).
By definition,
the ROC
curve must pass through the points $pf=pd=0$ and $pf=pd=1$
(a predictor that
never triggers never makes false alarms; a predictor that always
triggers always generates false alarms).
Three interesting trajectories connect these points:
\be
\item
 A straight line from
(0,0) to (1,1) is of little interest since it offers {\em no
information}: i.e.  the probability of
a  predictor firing is the same as it being silent.
\item
Another trajectory is the {\em negative curve} that bends away
from the ideal point. Elsewhere~\cite{me02e}, we have found that
if predictors negate their tests, the negative curve will transpose into a {\em
preferred
curve}.
\item
The point ($pf=0$, $pd=1$) is the ideal position (a.k.a. ``sweet spot'')
on a ROC curve. This is
where we recognize all errors and never make mistakes. {\em Preferred curves} bend
up towards this ideal point.
\ee

In the ideal case, a predictor has a high probability of detecting a
genuine fault ($pd$) and a very low probability of false alarm
($pf$). This ideal case is very rare. The only way to achieve high probabilities
of detection is to trigger the predictor more often. This, in turn, incurs the cost
of more false alarms.


The values of $pf$ and $pd$ can be calculated using the
ROC sheet of \fig{roc2}.
Consider a predictor which,
when presented with some signal, either triggers or is silent. If
some oracle knows whether or not the signal is actually present,
then \fig{roc2} shows four interesting situations.
The predictor may be silent when the signal is
absent  (cell A) or present (cell B).
Alternatively,
if the predictor registers a signal, sometimes the signal
is actually absent (cell C)
and sometimes it is present (cell D).

\begin{figure}[!t]
\begin{center}
{\footnotesize \begin{tabular}{p{1cm}|p{2.5cm}|l|l|}
\multicolumn{2}{c}{}&\multicolumn{2}{c }{module found in defect
logs?}\\\cline{3-4} \multicolumn{2}{c|}{}&no&yes\\\cline{2-4}
 signal&no (i.e. $v(g) < 10$)&A = 395
 &B = 67\\\cline{2-4}
detected?&yes (i.e. $v(g) \ge 10$)&C = 19
&D = 39\\\cline{2-4}
\end{tabular}
\begin{eqnarray}\nonumber
pd=&Prob.detected=&37\%\\\nonumber
pf=&Prob.falseAlarm=&5\%\\\nonumber
notPf=&1-pf=&95\%\\\nonumber
bal=&Balance=&45\%\\\nonumber
Acc=&accuracy=&83\%\nonumber
\end{eqnarray}}
\end{center}\caption{A
ROC sheet assessing the predictor $v(g) \ge 10$. Each cell \{A,B,C,D\}
shows the number of modules that fall into each cell of this ROC
sheet. The $bal$ (or balance) variable is defined below.}\label{fig:roc2}
\end{figure}



% $Correlation$
%  is a statistic
% representing how closely two variables co-vary.  It can vary from -1
% (perfect negative correlation) through 0 (no correlation) to +1
% (perfect positive correlation).
% Let $a_i$ and $p_i$ denote some actual and predicted
% values respectively. Let $n$ and $\overline{x}$ denote
% the number of observations and the mean of the $n$ observations,
% respectively. Then:
% {\footnotesize
% \begin{eqnarray}\nonumber
%  S_{PA}&=&\frac{\sum_i (p_i - \overline{p})(a_i -
%  \overline{a})}{n-1}\\\nonumber
%  S_{p}&=&\frac{\sum_i (p_i - \overline{p})^2}{n-1}\\\nonumber
%  S_{a}&=&\frac{\sum_i (a_i - \overline{a})^2}{n-1}\\\nonumber
%  correlation=c&=&\frac{S_{PA}}{\sqrt{S_pS_a}}\nonumber
% \end{eqnarray}
% }
% For example, linear standard regression
% can generate the following equation for defects
% from the KC2 NASA data set shown in \fig{datasets}.
% The correlation of this equation is 0.65:

% {\footnotesize
% \begin{eqnarray}\label{eq:lsrloc}
% defects_1& =& 0.0164 +0.0114*LOC\\\nonumber
%  c_1&=&0.65
% \end{eqnarray}
% }

% Our two-dimensional definition of correlation expands
% simply to N-dimensions. Such
% multi-dimensional
% correlations can be calculated using
% many public domain tools including the M5' learner in
% the WEKA toolkit (downloaded from
% \url{http://www.cs.waikato.ac.nz/~ml/weka/})\footnote{Using the
% setting {\em modelType = simple linear regression}.}.

If the predictor registers
a signal, there are three cases of interest. In one case, the
predictor has correctly recognized the signal. The probability of this detection
is the ratio of detected
signals, true positives, to all signals:
\begin{equation}\label{eq:pd}
\mbox{probability
detection} = pd = recall= D/(B+D)
\end{equation}
(Note that $pd$ is also called  $recall$.)
 In another case, the
probability of a false alarm is the ratio of
detections when no signal was present to all non-signals:
\begin{equation}\label{eq:pf}
\mbox{probability false alarm} = pf= C/(A+C)
\end{equation}
For convenience, we say that $notPf$ is the complement of $pf$:
\begin{equation}\label{eq:notpf}
\mbox{notPf} = 1- C/(A+C)
\end{equation}
\fig{roc2} also lets us define the {\em accuracy}, or $acc$, of a
predictor as the percentage of true negatives and true positives:
\begin{equation}\label{eq:acc}
\mbox{accuracy} = acc = (A+D)/(A+B+C+D)
\end{equation}

If reported as percentages, these attributes
have the range:
\[
0 \le \;acc\%,\;pd\%,\;notPf\% \le 100\]
Ideally, we seek predictors that maximize {\em acc\%, pd\%, notPf\%}.

Note that maximizing any one of these does not imply high values for
the others. For example \fig{roc2} shows an example with a high
accuracy (83\%) but a low probability of detection (37\%). Accuracy is
a good measure of a learner's performance  when the possible outcomes occur with similar
frequencies. The data sets used in this study, however, have very uneven
class distributions (see~\fig{datasets}). Therefore this paper will assess
its learned predictors using $bal,pd,notPf$ and not $acc$.

In practice, engineers {\em balance}
between $pf$ and $pd$. To operationalize this notion of {\em
balance}, we define $bal$ to be the Euclidean distance from the sweet
spot $pf=0,pd=1$ to a pair of $\langle pf,pd\rangle$. For convenience, we
(a)~normalize $bal$ by the maximum possible distance across the ROC square
( $\sqrt{2}$); (b)~subtract this from 1; and (c)~express it as a percentage; i.e.
\begin{equation}\label{eq:bal}
balance= bal             = 1 - \frac{\sqrt{\left(0 - pf\right)^2 + \left(1 - pd\right)^2}}{\sqrt{2}}
\end{equation}
Hence, better and {\em higher} balances fall  {\em closer} to the desired  sweet spot  of $pf=0,pd=1$.

\section{Quartile Charts of Performance Deltas}


Recall from \fig{study} that a {\em method}
is some combination of $filter, attributes', learner$.
The experiment described above explored numerous combinations of
filters, attributes, and learners all within a
$(M=10)*(N=10)$-way cross-evaluation 
study. Hence,
this experiment 
generated nearly 800,000  {\em performance deltas} (defined below)
comparing  
$pd,notPf,bal$
values from different {\em methods} applied to the
same $test$ data.

The performance deltas were computed
using simple subtraction,
defined as follows. A {\em positive performance delta} for method X means
that method X has out-performed some other method in {\em one} comparison.
Using performance deltas, we say that the best method is the one
that generates the largest performance deltas over {\em all}
comparisons.

The performance
deltas for each method were sorted and displayed as {\em
quartile charts}.  To generate these charts, the performance
deltas for some $method$ 
were sorted to find the lowest and highest quartile as well as the
median value; e.g.

\[
{\footnotesize
\overbrace{\underbrace{-59}_{min},-19,-19}^{lowest\; quartile},-16,-14,-10,\underbrace{-10}_{median},5,14,39,\overbrace{42,62,\underbrace{69}_{max}}^{highest\; quartile}
}
\]

In a quartile chart,
the upper and lower quartiles are  marked with black lines;
the
median is marked with a black dot; and
vertical bars  are added to
mark (i)~the zero point and (ii)~the minimum possible value
and (iii)~the maximum
possible value (in our case, -100\% and 100\%).
The above numbers
would therefore be drawn as follows:


{\small
\[-100\%\; \boxplot{20}{10}{45}{80}{14}\; 100\%\]}
We prefer quartile charts of performance deltas to other summarization
methods for M*N studies. Firstly, they offer a very succinct summary
of a large number of experiments. For example, \fig{allpd} displays
200,000 performance deltas in $\frac{1}{4}$ of a page. Secondly, they
are a {\em non-parametric} display; i.e. they make no assumptions
about the underlying distribution.  Standard practice in data mining is
to compare the mean performance of different methods using
a t-test~\cite{witten05}. T-tests are a {\em parametric method} that
assumes that the underlying population distribution is a Gaussian.
Recent results suggest that there are many statistical issues left to
explore regarding how best to apply those t-tests for summarizing
M*N-way studies~\cite{brouck03}.  Such t-tests assume
Gaussian distributions and some of
our results are clearly non-Gaussian:
\bi
\item
The N{\aive} Bayes performance
delta $pd$ results (using $logNums$) of \fig{allpd} exhibit
an extreme skewness (a median point at 52.4 with a quarter of the
performance deltas pushed up towards the maximum figure of 100\%).
\item
All the OneR  performance
delta $pd$ results of \fig{allpd} are highly skewed. OneR's pd performance
delta
was {\em never} higher than 16.7 and over half the
performance deltas for that method had that value  (hence,
the missing upper arms in the OneR results of \fig{allpd}).
\ei

\begin{figure}
{\footnotesize{\bf Pd:}

\input{pd.changes}}

{\footnotesize {\bf NotPf=$100- pf$}

\input{notPf.changes}}

{\footnotesize {\bf Balance}

\input{bal.changes}}
\caption{Performance deltas for $pd,notPf,bal$ using all 38 attributes.}\label{fig:allpd}
\end{figure}

%% \begin{figure}
%% {\footnotesize
%% {\bf Pd:}

%% \input {var/report/u/timm/var/dm/w4/nbLognumpd.changes}}

%% ~\\


%% {\footnotesize
%% {\bf NotPd=$1-pf$:}

%% \input{var/report/u/timm/var/dm/w4/nbLognumnotPf.changes}}

%% ~\\

%% {\footnotesize
%% {\bf Balance:}

%% \input {var/report/u/timm/var/dm/w4/nbLognumbal.changes}}
%% \caption{Deltas for just N{\aive} Bayes with $logNum$.}\label{fig:justBayes}.
%% \end{figure}


For the sake of completeness, we applied t-tests when sorting quartile
charts: one quartile chart appears above its neighbor if it was
statistically different (at the 95\% confidence level) and has a
larger mean.  However, given the skews we are seeing in the data, we
base our conclusions on {\em stand-out} effects seen in the
non-parametric quartile diagrams.  A {\em stand-out} effect is a large
and positive median with a highest quartile bunched up towards the
maximum figure. The $pd$ results for N{\aive} Bayes (with $logNums$) are
an example of such a stand-out effect.  On the other hand, OneR's
$pd$ results are a {\em negative stand-out}: those performance
deltas tend to bunch down towards -100\%; i.e. in terms of $pd$,
OneR usually performs much worse than anything else.

\section{Results}

N{\aive} Bayes with a log-transform has both a positive stand-out
result for $pd$ and a negative stand-out result for $notPf$.
This result, of winning on $pd$, but losing on $pf$,
is to be expected. \fig{typical} showed that the cost
of high $pd$s are higher $pf$s. The other learning methods cannot
emulate the high $pd$s of N{\aive} Bayes (with log-transforms) since
they take fewer chances (hence, have lower false alarm rates).



The {\em balance} results of \fig{allpd}
combine the $pd$ and $pf$ results, using \eq{bal}.
On balance, with  38 attributes:
\bi
\item
OneR loses more often than it wins: observe
that OneR has a negative median $balance$.
\item
The best method, on balance, is clearly N{\aive} Bayes with
log-transforms since it has a minority of negative balance performance deltas
(only 25\%); and it beats other methods by 22.1\% (or more) half the
time.  \ei

A review of the J48 and OneR quartile charts in the \fig{allpd} shows
that J48 out-performs OneR in terms of $pd$ and $notPf$ and $bal$.
That is, for these data sets, predictors that use simple threshold comparisons 
(OneR)
perform worse than predictors built from more elaborate decision trees (J48).

Since, on balance {\em logNums.nb} performs best, the rest
of this article only presents the
subsetting results for that method.  Initial experiments with iterative
InfoGain subsetting showed that all but one of the data sets (pc1) could be
reduced from 38 to three attributes without degrading the on-balance
performance.  However, iterative subsetting selected seven attributes for
$PC1$.  Therefore, for that
data set only, exhaustive subsetting was performed
on $2^7$ subsets to find the three best attributes.  

These InfoGain results were then compared to various other subsetting
methods: CFS~\cite{Hall98};
Relief~\cite{Kir92,Kon94}; and CBS~\cite{Alm91}.
Measured in terms of $pd$ or $notPf$ or $balance$,
or  number of  selected attributes,
there was no apparent advantage in
using these other subsetting methods instead of InfoGain.


\begin{figure}[!t]{\footnotesize {\bf Balance:}

\begin{tabular}{|rr|r@{~}r@{~}l}\cline{1-2}
\#attributes & median  &  &  & \\\cline{1-2}
3          & 10.0 & -100\% & \boxplot{19}{30}{55}{67}{17}&100\%\\
38        & 8.0 & -100\% & \boxplot{16}{31}{54}{66}{12}&100\%\\
2          & 0.0 & -100\% & \boxplot{15}{24}{50}{54}{27}&100\%\\
1          & -30.7 & -100\% & \boxplot{15}{13}{34}{48}{24}&100\%\\
\cline{1-2}\end{tabular}}
\caption{On balance performance deltas of N{\aive} Bayes (with logNums) 
using just the best 1,2, or 3 attributes,
or all 38 attributes.}\label{fig:fss}
\end{figure}

\fig{fss} shows the InfoGain results
for N{\aive} Bayes with logNums.
On balance, large reductions in the number of attributes
are possible, without compromising the performance of the learned
predictor.
Using two or three attributes worked as well as using 38
attributes. However, using only one attribute resulted in
inferior performance. 

%% In retrospect, this large reduction in the attribute space (from 38
%% attributes to just
%% to two or three) should have been expected.
%% \fig{attr} shows that many of the static
%% code attributes are correlated; e.g. the derived Halstead measures are
%% computed from the raw Halstead measures.  It is hardly surprising that
%% such a highly correlated set of attributes can be reduced to a smaller
%% set without lose of overall performance.

% JG NOTE: I am not sure I agree with the previous paragraph.
% JG NOTE: Since all our learners had relatively simple output languages
% JG NOTE: ie they could not build any sort of mathematical model,
% JG NOTE: I don't think that they could have discovered that a divide by b was
% JG NOTE: important if just given the attributes a and b, but not
% JG NOTE: a divide by b.  Also, there are still more than 2 or 3
% JG NOTE: non-correlated variables.  In fact only the derived
% JG NOTE: Halstead attributes are directly related to other
% JG NOTE: attributes.  Therefore, even in retrospect, I don't think we 
% JG NOTE: should have expected, rather hoped, that it was possible to find
% JG NOTE: such a small set of attributes that worked.
% JG NOTE: I think rather than saying we accomplished the obvious, it
% JG NOTE: might be better to simply point out that we haven't really gotten
% JG NOTE: a chance to really think about the implications of our small theories.
% JG NOTE: In particular, since we haven't had a chance yet to look for a 
% JG NOTE: universal small attribute set (ie 5 attributes that yield good 
% JG NOTE: performing theories for all data sets), . . .  well actually I
% JG NOTE: am not sure what that search would do for us, but it sounds cool.

All the results up to this point have been {\em comparisons between}
different methods.  Having determined that N{\aive}  Bayes (with
logNums) is our preferred method, the next question is how well
does that method perform in {\em absolute} terms.  To test that,
in \fig{best}, a standard 10*10-way experiment with attribute
subset-selection was performed (hence, each line in \fig{best} shows
the results of 10*10=100 experiments using just
the two or three attributes shown in the {\em selected
attributes} column of \fig{best}).
On average, N{\aive} Bayes (with logNums)
built predictors with
mean $pd=71\%$, and mean $pf=25\%$.  

\begin{figure}
{\footnotesize
\begin{center}
%XX remove accuracy
\begin{tabular}{rr|rr|c|c}
\multicolumn{2}{c}{~}& \multicolumn{2}{c}{\%}&selected 
attributes&selection\\\cline{3-4}
data  &  N  & pd      & pf  &   (see~\fig{attrs})& method  \\\hline
pc1   & 100&48 & 17 &  3, 35, 37&exhaustive  subsetting\\
mw1   & 100&52 & 15 &  23, 31, 35&iterative subsetting\\
kc3   &  100&69 & 28 & 16, 24, 26&iterative subsetting\\
cm1   & 100&71 & 27  & 5, 35, 36&iterative subsetting\\
pc2   & 100&72 & 14 & 5, 39&iterative subsetting\\
kc4   & 100&79 & 32 &  3, 13, 31&iterative subsetting\\
pc3   & 100&80 & 35 & 1, 20, 37&iterative subsetting\\
pc4   & 100&98  & 29 & 1, 4, 39&iterative subsetting\\\hline
all   & 800& 71& 25 &\multicolumn{2}{c}{}
\end{tabular}
\end{center}}
\caption{Best defect predictors learned in this study. Mean
results from 
N{\aive} Bayes after 10 repeats of (i)~randomize the order of the data;
(ii)~divide that data into ten 90\%:10\% splits for
training:test. Prior to learning,
all numerics were replaced with logarithms.  InfoGain was then
used to select the best two or three attributes shown in the right-hand
column (and if ``three'' performed
as well as ``two'', then this table shows the results using ``two'').}
\label{fig:best}
\end{figure}
The \fig{best} results are better than they first appear:
\bi
\item
Recall from \fig{datasets} that the  
number of defective modules may be very small: the most
extreme example of this is $PC2$ with only 0.4\% defective modules.
It is somewhat of an achievement that, for $PC2$,  our methods
yielded $\{pd=72\%,pf=14\%\}$ for such a tiny target.
\item
The best we have achieved in the past with
cross-validation was a mean 
$pd$ under 50\%~\cite{me04c} (recall \fig{before}). In those experiments,  
the only way to achieve a $pd>70\%$ was to accept around a 50\% false
alarm rate~\cite{me02e,me03a}.
The results of \fig{fss} have
much  higher $pd$s and lower $pf$s.
\ei
\begin{figure} {\footnotesize
\begin{center}
\begin{tabular}{llll}
  &frequency&\\
ID&in \fig{best}&what& type\\\hline
1&2&  {loc\_blanks}  & locs\\
3&2 &  {call\_pairs} & misc\\
4&1 &  {loc\_code\_and\_command} & locs\\
5&2 &  {loc\_comments} & locs\\
13&1 &  {edge\_count}   & misc\\
16&1 &  {loc\_executable} & locs\\
20&1 &  {I} & H (derived Halstead)\\
23&1 &   {B}& H (derived Halstead)\\
24&1 &   {L}& H (derived Halstead)\\
26&1 &   {T}& H (derived Halstead)\\
31&2 & {node\_count} & misc\\
35&3 & {$\mu_2$}& h (raw Halstead)\\
36&1 & {$\mu_1$}& h (raw Halstead)\\
37&2 &  {number\_of\_lines} & locs\\
39&2 & {percent\_comments}& misc\\
\end{tabular}
\end{center}}
\caption{Attributes used in \fig{best}, sorted
into the groups of \fig{attr}. }\label{fig:attrs}
\end{figure}


\begin{figure}
\begin{center}
\includegraphics[width=3.5in]{infogain.pdf}
\end{center}
\caption{InfoGain for KC3 attributes. Calculated from \eq{infogain}.
Lines show means and t-bars show standard deviations after 10 trials
on 90\% of the training data (randomly selected).}\label{fig:sorted}
\end{figure}

One interesting aspect of
\fig{best} is that different data sets selected very different
``best'' attributes (see the {\em selected attribute} column).  This
aspect can be explained by \fig{sorted} which shows the InfoGain of
all the attributes in an MDP data set (KC3).  Note how the highest ranked
attributes (those on the left-hand-side) offer very similar
information.  That is, there are no clear winners so minor changes in
the training sample (the 90\% sub-sampling used in subsetting or a
cross-validation study) can result in the selection of very different
``best'' attributes.



The pattern of InfoGain values of \fig{sorted} (where there are many
alternative ``best'' attributes) repeats in all the MDP data sets.
This pattern explains a prior observation of Shepperd \& Ince who
found 18 publications in which an equal number of studies reported that the
McCabe cyclomatic complexity is the same as; better than; or worse than
lines of code in predicting defects~\cite{shepperd94}.  
\fig{sorted} motivates the following 
principles: 
\bi
\item
Do not seek ``best'' subsets of static code attributes.
\item
Rather, seek instead for learning methods that can combine multiple
partial defect indicators like  the statistical methods of
N{\aive} Bayes. 
\ei



%XXX


%  cm1& 100&     75&      59&      22 \\
%  kc3& 100&     75&      69&      23 \\
%  kc4& 100&     50&      59&      48 \\
%  mw1& 100&     82&      45&      15 \\
%  pc1& 100&     72&      58&      26 \\
%  pc2& 100&     80&      67&      19 \\
%  pc3& 100&     65&      77&      36 \\
%  pc4& 100&     73&      92&      28 \\\hline
%  all& 800&     73&      67&      25

\section{Conclusion}

These results strongly endorse building defect predictors using
N{\aive} Bayes (with logNums).
The combination of learner+filter generated
predictors with
average results of  $pd=71\%$ and $pf=25\%$ (see
\fig{best}). 
This is an interesting result since, as mentioned above,
if static code attributes capture so little
about source code (as argued by 
Shepperd, Ince, Fenton and Pfleeger), then we would expect much lower probabilities
of detection and much higher false alarm rates. 

Our results also comment on the relative merits of certain learners.
Based on these experiments, we would reject
the use of
simple thresholds
for defect prediction.
If simple thresholds such as \mbox{$v(g)>10 \vee
iv(g)>4$} were the best defect predictors, then two results would be
predicted. Firstly, the single attribute tests of OneR would perform
as well as the multiple tests of J48.  Secondly, the subsetting methods would
select attribute sets of size one.  Neither of these results were
seen in \fig{allpd} and \fig{fss}. 

This experiment was also negative regarding the merits of building
intricate decision trees to predict defects.
Recalling \fig{allpd},
N{\aive} Bayes (with logNums) out-performed
J48.
We offer
two explanations why N{\aive} Bayes with logNums
out-performs our prior work:
\bi
\item
Recalling \fig{rawlog}, it is possible that code defects are
actually associated in some log-normal way to static code attributes. Of
all the methods studied here, only N{\aive} Bayes (with $logNums$) was
able to directly exploit this association.  
\item
Recalling \fig{sorted}, many of the static code attributes have similar
information content. Perhaps defect detection is best implemented as
some kind of thresholding systems; i.e.  by summing the signal from
several partial indicators. Of all the learners used in this study,
only the statistical approach of N{\aive} Bayes can sum 
information from multiple attributes.
\ei

The best attributes to use for
defect prediction vary from data set to data set.
Hence, rather than advocating a particular subset of possible attributes
as being the {\em best attributes}, these experiments suggest that
defect predictors should be built using {\em all available} attributes,
followed by subsetting to  find the most appropriate particular subset for a particular domain.


%There is
%much current work on building better static code measures
%(e.g.~\cite{polyspace,nagappan05,hall00,nikora03,have00b,clarke99,Rushby:Tutorial,fenton99})
%that could yield better $pd/pf$ results.
%To date, the results with these alternate methods are promising, but
%hardly conclusive.
%In order to truly compare methods, comparative studies on the same data must be performed.
%Such comparative  studies discussing are
%only just appearing (e.g.~\cite{brat05}) and it is too soon to generalize their conclusions.
%Perhaps, in the future, there will exist data sets with attributes
%created from intra-module measurements as well as measures from a
%growing number of new tools At that time, the experimental procedures
%defined in this paper would be useful for assessing the relative
%merits of these different tools. 
%



In summary
we endorse the use of static code attributes for predicting defects
with the following caveat. Those  predictors
should be treated as probabilistic, not categorical, indicators. While our
best methods have a non-zero false alarm, they also have a usefully
high probability of detection (over $\frac{2}{3}$rds).  Just as long
as users treat these predictors as {\em indicators} and not definite
{\em oracles}, then the predictors learned here would be pragmatically
useful for  focusing limited verification and validation budgets on portions of the
code base that are predicted to be problematic.  


Since we are optimistic about using static code attributes, we need to
explain prior pessimism about such attributes~\cite{fenton00b,shepperd94}:
\bi
\item
Prior work would not have found good predictors if that work had
focused on attribute subsets, rather than the learning methods.
\fig{best} shows that the best attribute subsets for defects
 predictors can change dramatically from data set to data set. Hence,
conclusions regarding the {\em best attribute(s)} are very brittle;
i.e. may not still apply when we change data sets.
\item
Also, prior work would not have found good predictors if that work had not
explored a large space of learning methods.  
For example, \fig{allpd} shows that, of the six methods
explored here, only {\em one} (N{\aive} Bayes with logNums) had a median
performance that was both large and positive.  
\ei

More generally, our high-level conclusion is that it is no longer adequate to assess
defect learning methods using only one data set and only one learner. Further research
should assess the merits of their proposed techniques via extensive experimentation.

\section{Future Work}

Our hope is that numerous researchers repeat our
experiments and discover learning methods that are superior to the one proposed here.
Paradoxically, this paper will be a success if it is quickly superseded.

There are many ways to design learning methods that could out-perform the results of this paper.
Here, we list just three:
\bi
\item
Data mining is a dynamic field and new data miners are continually being
developed. For example, Webb et al.\ have proposed an improvement to N\aive Bayes that aggregates
1-dependence estimators~\cite{webb05}.  It would be interesting to check if these newer learners
improved the results of this paper.
\item
With regard to pre-processing
the numerics, we might be able to do even better than our current
results.  Dougherty et al.~\cite{dou95} report spectacular
improvements in the performance of N{\aive} Bayes via the
use of better numeric pre-processing than just simple
log-filtering. 
\item
The Halstead and McCabe attributes were defined in the 1970s and compiler technology
has evolved considerably since then. Halstead and McCabe are {\em intra-module} metrics
and with modern inter-procedural data flow analysis, it should be possible
to define a new set of 21\textsuperscript{st}~century {\em inter-module} metrics that yield
better defect predictors.
\ei


%
%
%
%Conventional static code measures (e.g. Halstead and McCabe) do not
%offer
%detailed comments
%on the connectivity of one module to another.
%Our proposal is to augment those standard intra-module methods with the new intra-module metrics
%which can be found via CodeSurfer.
%
%Our intra-module metrics will be based on a graph theoretic view of software.
%In that view,
%a software system is a directed graph
%$G=(V,E,v_0)$ with a set of nodes $V$ representing the modules and
%edges $E \subseteq V \times V$, and a distinguished initial node
%$v_0$ representing the main starting function. 
%Numerous metrics of interest can be extracted from that information:
%
%\begin{description}
%\item[{\bf Neighborhoods:}]
%The {\em neighborhood} of a node is then the $M$ nodes are can be reached
%or can reach this node by crossing $M$ edges. A {\em local neighborhood} are the neighborhood
%nodes within
%a small number of edges  (say $M=10$).
%\item[{\bf Hubs:}]
%The {\em out-degree}
%({\em in-degree}) of a node (module) is the number of edges leading
%from (to) this node (module). The average degree is just $|E|/|V|$.
%Another measure of interest is the in-degree of a module relative
%to the average degree and the maximal in-degree seen in the whole
%graph.
%  Modules with an in-degree of one and an out-degree of zero are {\em terminals}. Faults
%in these modules can't propagate anywhere else.
%Modules with large in-degrees are $hubs$ that most connect to
%the rest of the system. 
%Typically, the hubs of graphs representing
%real-world systems are characterized by a power-law distribution;
%i.e. most nodes aren't hubs and a few hubs connect to a very large
%number of nodes.
%\item[{\bf Components:}]
%A {\em component } can be view as a  a generalizing of the concept
%of neighborhoods and hubs.  A {\em strongly connected component}
%(SCC) of $G$ is a maximal set of nodes $C\subseteq V$ such that for
%each $u,v\in C$, the node $v$ is reachable from $u$ and vice versa.
%Modules can be classified according to the size of their SCC ($|V \cap 
%SCC|$).  
%The {\em quotient graph} of $G$ is a graph  ($W,H$) such that $W$ is
%the set of the SCCs of $G$ and ($C_1,C2 \in H$ if they connect;
%i.e. $C_1 \not= C_2$ and there exists $r\in C_1, s\in C_2$ such
%that $(r,s) \in E$. The SCC quotient height of the graph $G$ is the
%length of the longest path in the quotient graph of $G$.  A component
%is $trivial$ if it contains only one vertex 
%(in real world graphs, the number SCC can be very high, but this is mainly due
%to trivial components).
%Finally, a component
%is $terminal$ if it has no successors in the quotient graph.
%\item[{\bf Diameters:}]
%The {\em diameter} of a graph is the length of the largest shortest
%path between two vertices.  The {\em girth} of a graph is the
%length of the shortest cycle. Diameter and girth is expensive to
%computer precisely. Nevertheless, approximately values can be
%computed via heuristic sampling~\cite{pelanek04}; e.g.  returning
%the maximum height of the search graph found via depth-firsiterative deepening~\cite{korf85}.
%\ed
%
%\noindent
%The above intra-module metrics comment on reliability as follows:
%\bi
%\item
%Many real-world networks are {\em sparse}
%where $|E| \approx |V|$. If the local neighborhood is more inter-connected than the rest of the system of networks, then
%this is a region of much inter-connectivity where faults are relatively more dangerous.
%\item
%Faults in  hub nodes 
%can have a large impact on the rest of the system.
%\item
%Diameters and girths play an important role in assessing how quickly
%one module can bring down the rest of the system.
%\item
%Similarly, SCCs play an important role in assessing the effects of a module failure.
%Modules that fail in a large SCC can bring down many other
%modules. Further, in systems with a small SCC quotient count, one SCC failing
%can quickly compromise large portions of the remaining system.
%\ei
%
%\section{Discussion}
%
%

%% \section{Future Work}


%% In the short term,  the next step in this work is to find the limits of
%% the current conclusions. The results shown here are expressed in terms
%% of a standard data mining experiment where the available data is
%% divided into training and test sets. As part of this work, we have
%% spent much time with the business users of defect detectors. They have
%% offered various specialized scenarios that might stress our
%% conclusions.

%% For example, one such scenario is that a new project is
%% starting up and the manager of that project is seeking the best old
%% data to use to predict defects in the new work. This appears to be a
%% nearest-neighbor problem or an instance-selection problem. We are
%% exploring what is the minimum number of modules required before we can
%% find appropriate subsets of old data.   At this time, it is not clear
%% if different learners need different number of minimum modules.

%% Another scenario is that a business user has to defend their decision
%% that, say, 500 modules are probably defective and require
%% inspection. In that scenario, we would need to defend the decisions of
%% the data miner and, currently, we don't have good methods for
%% explaining how a N{\aive} Bayes classifier reaches its conclusions.


% if have a single appendix:
%\appendix[Proof of the Zonklar Equations]
% or
%\appendix  % for no appendix heading
% do not use \section anymore after \appendix& only \section*
% is possibly needed

% use appendices with more than one appendix
% then use \section to start each appendix
% you must declare a \section before using any
% \subsection or using \label (\appendices by itself
% starts a section numbered zero.)
%
% Use this command to get the appendices' numbers in "A", "B" instead of the
% default capitalized Roman numerals ("I", "II", etc.).
% However, the capital letter form may result in awkward subsection numbers
% (such as "A-A"). Capitalized Roman numerals are the default.
%\useRomanappendicesfalse
%

% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section bib
% NOTE: BibTeX documentation can be easily obtained at:
% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/

% can use a bibliography generated by BibTeX as a .bbl file
% standard IEEE bibliography style from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/supported/IEEEtran/bibtex
\bibliographystyle{IEEEtranBST/IEEEtran}
\bibliography{refs}
%
% <OR> manually copy in the resultant .bbl file
% set second argument of \begin to the number of references
% (used to reserve space for the reference number labels box)

% biography section
%
% If you have an EPS/PDF photo (graphicx package needed) extra braces are
% needed around the contents of the optional argument to biography to prevent
% the LaTeX parser from getting confused when it sees the complicated
% \includegraphics command within an optional argument. (You could create
% your own custom macro containing the \includegraphics command to make things
% simpler here.)
%\begin{biography}[{\includegraphics[width=1in,height=1.25in,clip,keepaspectratio]{mshell}}]{Michael Shell}
% where an .eps filename suffix will be assumed under latex, and a .pdf suffix
% will be assumed for pdflatex; or if you just want to reserve a space for
% a photo:

\begin{biography}[{\includegraphics[width=1in,keepaspectratio]
{timbeach}}]{Tim Menzies}
is an associate professor
at the Lane Department of Computer Science at West Virginia University
(USA), and has been working with
NASA on software quality issues since 1998. He has a CS degree and a PhD
from the University of New South Wales and is the author of over 160 publications.
His recent research concerns modeling
and learning with a particular focus on light weight modeling methods.
His doctoral research explored the validation of, possibly inconsistent,
knowledge-based systems in the QMOD specification language. 
\end{biography}



\begin{biography}[{\includegraphics[width=1in,clip,keepaspectratio]
{jeremy}}]{Jeremy Greenwald}
is a graduate student in the Computer Science Department at
Portland State University.  He received his BS in Physics and Astronomy from
the University of Pittsburgh in 2001.  He has over six years of research
experience in numeric methods and data mining.  His master's thesis focuses on a
comparative study of data mining techniques and their equivalences with numeric
optimization techniques.  He also has interned at a software development firm
in Beaverton, Oregon.  His expected graduation date from PSU is late 2006.
\end{biography}

\begin{biography}[{\includegraphics[width=1in,clip,keepaspectratio]
{art}}]{Art Frank}
Art Frank is an undergraduate student pursuing a BS in Computer Science at                                    
Portland State University.  He is currently working as an IT Manager and                                      
Database Administrator at a large non-profit organization in Portland,                                        
Oregon.        
\end{biography}

% You can push biographies down or up by placing
% a \vfill before or after them. The appropriate
% use of \vfill depends on what kind of text is
% on the last page and whether or not the columns
% are being equalized.

%\vfill

% Can be used to pull up biographies so that the bottom of the last one
% is flush with the other column.
%\enlargethispage{-5in}

% that's all folks
\end{document}


