\documentclass[journal,compsoc]{IEEEtran}
%\documentclass[onecolumn,12pt,journal,compsoc]{IEEEtran}\renewcommand{\baselinestretch}{1.41}

\usepackage[table]{xcolor}

% *** CITATION PACKAGES ***
%
\ifCLASSOPTIONcompsoc
  % IEEE Computer Society needs nocompress option
  % requires cite.sty v4.0 or later (November 2003)
  % \usepackage[nocompress]{cite}
\else
  % normal IEEE
  % \textsc{}\usepackage{cite}
\fi

\usepackage{cite,url}

\ifCLASSINFOpdf
  \usepackage[pdftex]{graphicx}
  % declare the path(s) where your graphic files are
  \graphicspath{{/plots/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  \DeclareGraphicsExtensions{.eps,.pdf,.jpeg,.png}
\else
  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
  % will default to the driver specified in the system graphics.cfg if no
  % driver is specified.
  % \usepackage[dvips]{graphicx}
  % declare the path(s) where your graphic files are
  % \graphicspath{{../eps/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  % \DeclareGraphicsExtensions{.eps}
\fi


\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{verbatim}
\usepackage{fancyvrb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{timtricks}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{times}
% NOTE(review): removed duplicate \usepackage{cite}; cite is already loaded earlier in the preamble.
\usepackage{float}
\usepackage{color}
% NOTE(review): removed duplicate \usepackage[pdftex]{graphicx}; graphicx is already loaded earlier in the preamble.
% Counters used by the \boxplot macro below:
%   over -- holds the length of the horizontal line segment (Q3 - Q1)
%   max  -- set from #4+#5 but never read afterwards; kept as-is (TODO confirm it is unused elsewhere)
\newcounter{over}
\newcounter{max}
% \fred{a}{b}: set counter `over' to b, then add a, i.e. over := a + b.
% Invoked below as \fred{#4}{-#2}, giving over = #4 - #2.
\newcommand{\fred}[2]{\setcounter{over}{#2}\addtocounter{over}{#1}}
% \boxplot{#1}{#2}{#3}{#4}{#5}: draws a one-line, 100-unit-wide box-plot glyph.
%   #2 -- start of the horizontal line (presumably lower quartile -- TODO confirm)
%   #3 -- position of the filled circle (presumably the median -- TODO confirm)
%   #4, #5 -- feed the `max' counter and the line length via \fred
%   #1 -- appears unused in this definition
% The \textcolor{white}{...} wrapper encloses only counter assignments (which
% typeset nothing); it effectively hides any incidental output from them.
\newcommand{\boxplot}[5]{%
\scalebox{1}{\textcolor{white}{\setcounter{max}{#4}\addtocounter{max}{#5}\fred{#4}{-#2}}%
\begin{picture}(100,10)%
\put(0,0){\line(0,1){8}}%    left boundary tick at x=0
\put(100,0){\line(0,1){8}}%  right boundary tick at x=100
\put(#2,4){\line(1,0){\theover}}%  horizontal segment from #2, length over = #4-#2
\put(#3,4){\circle*{8}}%     filled marker at #3
\put(50,0){\line(0,1){8}}%   midpoint tick at x=50
\end{picture}}
}

% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}

\begin{document}

\title{Sampling Methods in Software Effort Estimation: An Investigation on Bias-Variance Trade-Off}

\author{Ekrem~Kocaguneli,~\IEEEmembership{Student Member,~IEEE}
        Tim~Menzies,~\IEEEmembership{Member,~IEEE}
        Martin~Shepperd,~\IEEEmembership{Member,~IEEE}
\IEEEcompsocitemizethanks{\IEEEcompsocthanksitem Ekrem Kocaguneli and Tim Menzies are with the Lane Department of Computer Science and Electrical Engineering, West Virginia University.
E-mail: ekocagun@mix.wvu.edu, tim@menzies.us
\IEEEcompsocthanksitem Martin Shepperd is with the School of Information Systems Computing \& Maths, Brunel University.
E-mail: martin.shepperd@brunel.ac.uk}% <-this % stops a space
\thanks{This research is funded in part by NSF,CISE, project \#0810879 }}



\markboth{Journal of IEEE Transactions on Software Engineering,~Vol.~X, No.~Y, SomeMonth~201z}%
{Shell \MakeLowercase{\textit{et al.}}: Bare Demo of IEEEtran.cls for Computer Society Journals}
\IEEEcompsoctitleabstractindextext{

\begin{abstract}

~\\
\textbf{\textit{Background}}: Experimental design is an important concept in software effort estimation.
Many papers use different experimental selections: leave-one-out-cross-validation (LOOCV) a.k.a. N-Way, 10-Way, 3-Way etc.
Also there are different justifications for the adopted strategies: The bias-variance trade-off, associated run times and so on.

\textit{\textbf{Aim:}} The theory states that the more test sets we have (i.e. as \textit{x} increases in \textit{x}-Way), the variance increases and the bias decreases.
However, there is no systematic investigation of this theoretical concept.
In this paper, we systematically investigate whether theoretical assumptions hold for software effort datasets.

\textit{\textbf{Method:}} We selected 20 different effort datasets and 90 different algorithms to compare different experimental settings.
For each dataset, we calculated the bias and variance of every algorithm under the experimental settings of LOOCV and 3-Way.

\textit{\textbf{Results:}} 
As a result of our investigation on 90 algorithms and 20 effort datasets, we saw that the theory does not hold for effort estimation.
We have observed that LOOCV and 3-Way have very similar bias and variance values.

\textit{\textbf{Conclusion:}} Seeing that LOOCV and 3-Way have almost exactly the same bias and variance values, we can conclude that for software effort estimation the bias-variance trade-off is not the main concern of experimentation.
Therefore, the main concern when opting for a particular experimental strategy should be run-times and reproduction of the experiments.

\end{abstract}

% Note that keywords are not normally used for peer review papers.
\begin{IEEEkeywords}
Software Cost Estimation, Experimentation, Bias, Variance
\end{IEEEkeywords}}

\maketitle

\section{Introduction}

Sampling method is an important topic for software effort estimation (from now on SEE) studies and an empirical study to compare the pros and cons of different sampling methods in SEE is urgent. 

The biggest research topic in SEE since 1980s is the introduction of new methods and comparing them to old ones~\cite{jorgensen07}.
In their comprehensive systematic review, Jorgensen and Shepperd report that $61\%$ of selected SEE papers deal with that topic~\cite{jorgensen07}.
This group of papers uses \textit{historical data}; i.e., not a single one of them employs a data collection methodology.

Only generating theories from historical data entails an internal validity threat, which we would like to call \textit{fixed-scenario-problem}.
Ideally a learned theory should be applied to new scenarios to observe if the predicted effect occurs in practice.
The lack of new scenarios in evaluation is defined to be the \textit{fixed-scenario-problem} and it threatens evaluation experiments like the ones reported in~\cite{jorgensen07}.
Therefore, studies without a new scenario for the learned model are limited within their experimental settings.

On the other hand it is impractical to expect every study to collect new data.
The mitigation to \textit{fixed-scenario-problem} is possible by simulating the application of a method to a new situation.  
Sampling method (from now on SM) forms the basis of such a simulation~\cite{Demsar2006,Alpaydin2004}.

There is a wide palette of available SMs used in the literature~\cite{demsar06, Alpaydin2004, lessmann09, seni10}: Leave-one-out (LOO), 10Way and 3Way are examples to the most commonly used ones.
Similar to choosing colors from a palette, the choice of different SMs paints a different picture.
For example, theoretically LOO results in high-variance and low-bias in the results, whereas 10Way or 3Way generate just the opposite (low-variance, high-bias)~\cite{seni10,Hastie2008}.
The change of bias and variance (from now on $B\&V$) from one method to the other is known as $B\&V$ trade-off.
Employing the wrong SM or disregard of the $B\&V$ trade-off due to particular SMs endanger the validity of a particular study. 


To our surprise, in SEE domain there is no study employing a rigorous experimentation to observe the effects of different SMs.
Kitchenham et al. have already identified and raised the issue of SM selection~\cite{Kitchenham2007, Kitchenham2009}; however, their mentioning is more of a pointer to future work rather than an investigation.
Hence, this paper is a natural follow-up of previous SEE research.
Furthermore, it is the first of its kind to rigorously investigate the $B\&V$ trade-off inherent in different SMs and it concerns more than half the SEE field.
In this paper, we present $B\&V$ trade-off results of 3 different SMs: LOO, 3Way and 10Way cross-validation.
Our experimentation includes $90$ methods applied on $20$ datasets.
 
The experimental results showed that $B\&V$ behavior of SMs are different than the predicted: For most of the algorithms, bias and variance values are statistically the same.
However, we have seen orders of magnitude differences in terms of run times, see \fig{runtimes} for exact values.
The values of \fig{runtimes} belong to experiments coded in MATLAB and run on a 64-bit dual-core machine.
Given these findings, we recommend considering experimental concerns to choose an SM.
If the main concern is the exact reproduction of the current work by another researcher, then LOO should be used.
Otherwise, if the lower run times are the main concern, then we recommend 3Way or 10Way.

\begin{figure}[!t]
\centering
{ 
\begin{tabular}{l|l}  
\textbf{SM} & \textbf{Run Time} \\\hline 
LOO & $8199.945 * 5$\\
3Way & $8199.945 * 3$\\
10Way & $8199.945$\\
\end{tabular}}
\caption{The run times in seconds for different SMs.}\label{fig:runtimes}
\end{figure}


\subsection{Contributions}

The contributions of this research are summarized below:

\bi
\item The first systematic investigation of \textit{B\&V} trade-off in SEE domain
\item An extensive experimentation of 20 datasets and 90 algorithms
\item Showing that \textit{B\&V} is not the main concern for SEE
\item Recommendations based on experimental concerns:
\bi
\item For lower run-times the order of preference is: 1) 3Way, 2) 10Way, 3) LOO.
\item For reproducibility prefer LOO
\ei
\ei

\section{Terminology}

A typical dataset consists of a matrix X and a vector Y.
The input variables (a.k.a. features) are stored in X, where each row corresponds to an observation and each column corresponds to a particular variable.
Similarly, the dependent variable is stored in a vector Y, where for each observation in X there exists a response value.

Now assume that a prediction model represented by $\hat{f}(x)$ has been learned from a training dataset $\tau$.
So as to measure the errors between the actual values in Y and the predictions given by $\hat{f}(x)$, we can make use of an error function represented by $L(Y,\hat{f}(x))$.
Some examples of error functions are squared loss (given in Equation \ref{equ_loss_squared}) or absolute loss (given in Equation \ref{equ_loss_abs}).


\begin{equation}
L(Y,\hat{f}(x)) = \left( Y-\hat{f}(x) \right)^2
\label{equ_loss_squared}
\end{equation}

\begin{equation}
L(Y,\hat{f}(x)) =  |Y-\hat{f}(x)|
\label{equ_loss_abs}
\end{equation}

Given the assumptions that the underlying model is $Y=f(X) + \epsilon$ where $E(\epsilon)=0$ and $Var(\epsilon)=\sigma_{\epsilon}^{2}$, then we can come up with a derivation of the squared-error loss for $\hat{f}(X)$~\cite{Hastie2003}. The error for a point $X=x_0$ is:\\

\begin{tabular}{l c l}
$Error(x_0)$ & = & $E\left[ \left( Y - \hat{f}(x_0) \right)^{2} \vert X=x_0 \right]$\\[4ex]
			 & = & $\sigma^{2}_{\epsilon} + \left( E[\hat{f}(x_0)] - f(x_0) \right)^2 $ \\[4ex]
			 &   & $ + E\left[ \left( \hat{f}(x_0) - E[\hat{f}(x_0)] \right)^2 \right]$ \\[4ex]
			 & = & $\sigma^{2}_{\epsilon} + Bias^{2}(\hat{f}(x_0)) + Var(\hat{f}(x_0))$\\[4ex]
			 & = & $\underbrace{\mbox{Irreducible Error}}_{1^{st}\ Term} + \underbrace{Bias^2}_{2^{nd}\ Term}$\\[4ex]
			 &   & $ + \underbrace{\mbox{Variance}}_{3^{rd}\ Term}$ \\
			 		& & \\[1ex]
\end{tabular}

In the above derivation, the explanations of the $1^{st}$, $2^{nd}$ and $3^{rd}$ terms are as follows:
\begin{itemize}
\item The $1^{st} Term$ is the so-called \textit{``irreducible error''}, i.e. the variance of the actual model around its true mean.
This variance is inevitable regardless of how well  we model $f(x_0)$, only exception to that is when the actual variance is zero (when $\sigma^{2}_{\epsilon} = 0$).
\item The $2^{nd} Term$ is the square of the bias, which is the measure of how different the model estimates are from the \textit{true} mean of the underlying model.
\item The $3^{rd} Term$ is the variance of the estimated model. It is the expectation of the squared deviation of the estimated model from its own mean.
\end{itemize}
Furthermore, the above derivation is for an individual instance.
The bias and variance values associated with an algorithm $\hat{f}(X)$ is the mean of all individual values.

Then the question becomes how the bias and variance (from now on $B\&V$) relate to different choices of the training size (\textit{K}), i.e. the relation to cross-validation method (CV).
Here we will consider two cases of CV: leave-one-out (LOO) and 3-Way.
Ideally when training size is equal to the dataset size (\textit{K=N}), we expect CV to be approximately unbiased and to have high variance, because N training sets are so similar to one another.
On the other hand, for small values of \textit{K}, say \textit{K=N/3} as in 3-Way, we expect lower variance and a higher bias~\cite{Hastie2003}.
Naively put, the relationship is:
\bi
\item LOO	: Higher variance, lower bias
\item 3-Way	: Lower variance, higher bias
\ei

In an ideal case, when we plot $B\&V$ values of each individual test instances on x and y axes respectively, we expect 2 clusters:
\bi
\item Upper Left: Low bias, high variance; i.e. LOO results.
\item Lower right: High bias, low variance; i.e. 3Way results.
\ei

Just for the sake of clarity, a very \textit{simple} but \textit{ideal} case would look like \fig{ideal-simulation}. 
%In that figure, 30 hypothetical algorithms subject to both LOO and 3-Way are represented.

\begin{figure}[th!]
\begin{center}
\includegraphics[width=0.4\textwidth]{lib/IdealSimulation.pdf}
\end{center}
\caption{A simple simulation for the ideal case of $B\&V$ relation to testing strategies.}
\label{fig:ideal-simulation}
\end{figure}

\section{Related Work}

\subsection{Effort Estimation}

\subsubsection{Algorithmic Methods}
There are many algorithmic effort estimators.
For example, if we
restrict ourselves to just instance-based algorithms, \fig{cbr} shows
that there are thousands of options just in that one sub-field.

As to non-instance methods, there are many proposed in the literature
including various kinds of regression (simple, partial least square,
stepwise, regression trees), and neural networks just to name a
few. For notes on these non-instance methods, see \tion{learners}.

Note that instance \& non-instance-based methods can be combined to create even more algorithms. For example, once an instance-based method finds its nearest neighbors, those
neighbors might be summarized with regression or
neural nets~\cite{Li2009}.


\subsubsection{Non-Algorithmic Methods}
An alternative approach to algorithmic approaches (e.g. the instance-based methods of \fig{cbr})
is to utilize the best knowledge of an experienced expert. 
Expert based estimation \cite{Jor2004e} is a human intensive approach that is most commonly adopted in practice. 
Estimates are usually produced by domain experts based on their very own personal experience. It is flexible and intuitive in a sense that it can be applied in a variety of circumstances where other estimating techniques do not work  (for example when there is a lack of historical data). 
Furthermore in many cases requirements are simply unavailable at the bidding stage of a project where a rough estimate is required in a very short period of time.

Jorgensen \cite{Jor2005b} provides guidelines for producing realistic software development effort estimates derived from industrial experience and empirical studies. One important finding concluded was that the {\em combined estimation} method in expert based estimation offers the most robust and accurate combination method, as combining estimates captures a broader range of information that is relevant to the target problem, for example combining estimates of analogy based with expert based method. Data and knowledge relevance to the project's context and characteristics are more likely to influence the prediction accuracy.

Although widely used in industry, there are still many ad-hoc methods for
expert based estimation. Shepperd et al. \cite{shepperd96} do not
consider expert based estimation an empirical method because the
means of deriving an estimate are not explicit and therefore not
repeatable, nor easily transferable to other staff. In addition,
knowledge relevancy is also a problem, as an expert may not be able
to justify estimates for a new application domain. Hence, the rest of this paper does not consider non-algorithmic methods.

\subsection{Bias-Variance Trade-Off}

\fig{dataset-paper} shows the studies that used the datasets presented here.\footnote{Make another table showing which methods these papers use.}

\begin{figure}[!t]
\centering
{\scriptsize  
\begin{tabular}{l|r|l|l}  
& \textbf{Dataset} & Used by us & Used by others \\\hline 
 & telecom & \cite{keung08a} & \cite{shepperd97}\\
 &   kemerer & \cite{keung08a} & \cite{shepperd97,Finnie1997}  \\
 &  cocomo81o &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\\
 &  desharnaisL1 & \cite{Kocaguneli2010} &\\ 
 &   cocomo81s &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\\ 
 & desharnaisL3 & \cite{Kocaguneli2010}  &\\
 &  albrecht & \cite{keung08a}, &\cite{Li2009,Li2009a,shepperd97,shepperd96,Finnie1997} \\ 
 &  cocomo81e & \cite{Menzies2006,Bakir2009,Kocaguneli2010} &\\ 
 &  nasa93\_center\_5 &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\\ 
 & desharnaisL2 & \cite{Kocaguneli2010}   &\\\hline 
 & desharnais & \cite{keung08b,keung08a,keung2008b,Kocaguneli2010} & \cite{shepperd97,Li2008,Kadoda2000,Kirsopp2002,Li2009,Li2009a}\\ 
 & maxwell & &\cite{Li2009a,Sentas2005} \\
 & sdr &  &\cite{Kultur2008,Turhan2007} \\ 
 &  nasa93\_center\_1 & \cite{Menzies2006,Lum2008,Kocaguneli2010}  &\\
 & miyazaki94 & &\cite{Miyazaki1994} \\
 & nasa93\_center\_2 &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\\ 
 & finnish &&\cite{Briand1999,shepperd97} \\
 & cocomo81 &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\cite{Boehm1981}, \\
 & nasa93 &\cite{Menzies2006,Lum2008,Kocaguneli2010} &\\ 
 & china & this study&\\
\end{tabular}}
\caption{A sample of effort estimation papers that use the
data sets explored in this paper.}\label{fig:dataset-paper}
\end{figure}

When the studies shown in \fig{dataset-paper} are investigated we see that they use different testing strategies.
The below table shows the distribution of these papers w.r.t. the testing strategy they use.

\begin{figure}[!t]
\centering
{\small  
\begin{tabular}{l|l}  
Method 	&	Used by	\\\hline\hline
\multirow{3}{*}{LOO} 	&	\cite{keung08b, Li2008, keung08a}	\\
	&	\cite{keung2008b, Kocaguneli2010, Li2009a}	\\
	&	\cite{Kocaguneli2011, shepperd97}	\\\hline
\multirow{1}{*}{3-Way} 	&	\cite{Kocaguneli2011}	\\\hline
\multirow{2}{*}{10-Way} 	&	\cite{Bakir2009, Lum2008, Kocaguneli2011}	\\
	&	\cite{Turhan2007}	\\\hline
\multirow{2}{*}{Others (ad-hoc, 6-Way etc.)} 	&	\cite{Briand1999, Kultur2008, Li2009}	\\
	&	\cite{Menzies2006, Sentas2005, shepperd96}	\\
\end{tabular}}
\caption{Distribution of the studies in \fig{dataset-paper} w.r.t. their SM. Majority of the studies use LOO. LOO is followed by ad-hoc methods, 10-Way then 3-Way.}\label{fig:dataset-sampling-mehhod}
\end{figure}

\section{Methodology}

\subsection{Datasets}

The description of 20 datasets used in this study are provided in \fig{datasets}.

\begin{figure*}
\centering
{\scriptsize  
\begin{tabular}{l|rrl|lrrrrr}  
~\\~\\
 \textbf{} & \textbf{} & \textbf{} & \textbf{}&\multicolumn{6}{c}{\textbf{Historical Effort Data}}\\\cline{5-10} 
 \textbf{Dataset} & \textbf{Features} & \textbf{Size} & \textbf{Description}&\textbf{Units}& \textbf{Min} & \textbf{Median} &\textbf{Mean} & \textbf{Max} & \textbf{Skewness}\\ \hline
  cocomo81 &17&63& NASA projects& months&6&98&683&11400& 4.4\\
  \hspace{4 mm}cocomo81e &17&28& Cocomo81 embedded projects&months&9&354&1153&11400& 3.4\\ 
  \hspace{4 mm}cocomo81o &17&24& Cocomo81 organic projects&months&6&46&60&240& 1.7\\ 
  \hspace{4 mm}cocomo81s &17&11& Cocomo81 semi-detached projects&months&5.9&156&849.65&6400&2.64 \\ 
  nasa93 &17&93& NASA projects&months&8&252&624&8211& 4.2\\ 
  \hspace{4 mm}nasa93\_center\_1 &17 &12 & Nasa93 projects from center 1&months&24 &66 &139.92 &360 &0.86 \\ 
  \hspace{4 mm}nasa93\_center\_2 &17&37& Nasa93 projects from center 2&months&8&82&223&1350& 2.4\\ 
  \hspace{4 mm}nasa93\_center\_5
   &17&40& Nasa93 projects from center 5&months&72&571&1011&8211& 3.4\\ 
  desharnais &12&81& Canadian software projects&hours&546&3647&5046&23940& 2.0\\ 
  \hspace{4 mm}desharnaisL1 &11 &46 & Projects in Desharnais that are developed with Language1 & hours &805 &4035.5 &5738.9 &23940 &2.09 \\ 
  \hspace{4 mm}desharnaisL2 &11 &25 & Projects in Desharnais that are developed with Language2 & hours &1155 &3472 &5116.7 &14973 &1.16 \\ 
  \hspace{4 mm}desharnaisL3 &11 &10 & Projects in Desharnais that are developed with Language3 & hours &546 &1123.5 &1684.5 &5880 &1.86 \\ 
  sdr &22&24& Turkish software projects& months&2&12&32&342& 3.9\\ 
  albrecht &7&24& Projects from IBM&months&1&12&22&105& 2.2\\ 
  finnish &8 &38 &Software projects developed in Finland  &hours &460 &5430 &7678.3 &26670 &0.95 \\
  kemerer &7 &15 &Large business applications  &months &23.2 &130.3 &219.24 &1107.3 &2.76 \\
  maxwell &27 &62 &Projects from  commercial banks in Finland  &hours&583 &5189.5&8223.2&63694&3.26 \\
  miyazaki94 &8 &48 &Japanese software projects developed in COBOL  &months &5.6 &38.1 &87.47 &1586 &6.06 \\
  telecom &3 &18 &Maintenance projects for telecom companies  &months &23.54 &222.53 &284.33 &1115.5 &1.78 \\
  china &18 &499 &Projects from Chinese software companies  &hours &26 &1829 &3921 &54620 &3.92 \\\cline{3-3}
          \multicolumn{2}{c}{~}  & Total: 1198& \multicolumn{7}{c}{~} 
\end{tabular}}
\caption{The 1198 projects used in this study come from 20 data sets.
Indentation in column one denotes a dataset that is a subset of another dataset.
 For notes
on these datasets, see the appendix.}\label{fig:datasets}
\end{figure*}


\subsection{Methods}

\subsubsection{Ten Pre-processors}

In this study, we investigate:
\bi
\item Three {\em simple preprocessors}: {\bf none, norm, and log};
\item One {\em feature synthesis} methods called {\bf PCA};
\item Two {\em feature selection} methods: {\bf SFS} (sequential forward selection) and {\bf SWreg};
\item Four {\em discretization} methods:  divided on equal frequency/width.
\ei
{\bf None} is the simplest preprocessor- all values are unchanged.

With the {\bf norm} preprocessor,
numeric values are  normalized
to a
0-1 interval using Equation \ref{equation:normalization}. Normalization means
that no variable has a greater influence than any other. 
\begin{equation}
\small
normalizedValue = \frac{(actualValue - min(allValues))}{(max(allValues) - min(allValues))}
\label{equation:normalization}
\end{equation}

With the {\bf log} preprocessor, all numerics are replaced with their logarithm. This {\bf log}ging
procedure minimizes the effects of the occasional very large numeric value.

Principal component analysis~\cite{Alpaydin2004}, or
{\bf PCA}, is a {\em feature synthesis} preprocessor that
converts a number of possibly correlated variables into a smaller number of uncorrelated variables called components. The first component accounts for as much of the variability in the data as possible, and each succeeding component accounts for as much of the remaining variability as possible.

Some of the preprocessors aim at finding a subset of all features according to certain criteria
such as
{\bf SFS} (sequential forward selection) and {\bf SWR} (stepwise regression).
{\bf SFS} adds features into an initially empty set until no improvement is possible with the addition of another feature. Whenever the selected feature set is enlarged, some oracle is called to assess the value
of that set of features. In this study, 
we used the MATLAB \textit{objective} function (which reports
the mean-squared-error of a simple linear regression on the training set).
One caution to be made here is that exhaustive search algorithms over all features can be very time consuming ($2^n$ combinations in an \textit{n}-feature dataset), therefore SFS works only in forward direction (no backtracking).

{\bf SWR} adds and removes features from a multilinear model.
Addition and removal is controlled by the p-value in an F-Statistic.  At
each step, the F-statistics for two models (models with/out
one feature) are
calculated.  Provided that the feature was not in the model, the
null hypothesis is: ``Feature would have a zero coefficient in the
model, when it is added''.  If the null hypothesis can be rejected,
then the feature is added to the model.  As for the other scenario
(i.e. feature is already in the model), the null hypothesis is:
``Feature has a zero coefficient''.  If we fail to reject the null
hypothesis, then the term is removed.  

{\em Discretizers} are pre-processors that map every numeric value in a column of data
into a small number of discrete values:
\bi
\item {\bf width3bin:} This procedure clumps the data features into 3 bins, depending on equal width of all bins; see
Equation~\ref{equation:binning}.

\begin{equation}\small
binWidth = ceiling\left(\frac{max(allValues) - min(allValues)}{n}\right)
\label{equation:binning}
\end{equation}
\item {\bf width5bin:} Same as {\bf width3bin} except we use 5 bins.
\item {\bf freq3bin:} Generates 3 bins of  equal population size;
\item {\bf freq5bin:} Same as {\bf freq3bin}, only this time we have {\em 5} bins.
\ei

\subsubsection {Nine Learners}\label{sec:learners}

Based on our reading of the effort estimation literature, we identified nine commonly used learners that divide
into
\bi
\item Two {\em instance-based} learners: {\bf ABE0-1NN, ABE0-5NN};
\item Two {\em iterative dichotomizers}: {\bf CART(yes),CART(no)};
\item A {\em neural net}: {\bf NNet};
\item Four {\em regression methods}: {\bf LReg, PCR, PLSR, SWReg}.
\ei
{\em Instance-based learning} can be used for analog-based estimation.
A large class of   ABE algorithms was described in \fig{cbr}. Since it is
not practical to experiment with the 6000 options defined in \fig{cbr},
we focus on two standard variants.
ABE0 is our name for
a very basic type of ABE that we derived from
various ABE studies~\cite{Mendes2003, Li2009, Kadoda2000}.
In {\bf ABE0-xNN}, features are firstly normalized to 0-1 interval,
then the distance between test and train instances is measured
according to Euclidean distance function, \textit{x} nearest neighbors
are chosen from training set and finally for finding estimated value
(a.k.a adaptation procedure) the median of \textit{x} nearest
neighbors is calculated.  We explored
two different \textit{x}:
\bi
\item {\bf ABE0-1NN:} Only the closest analogy is used. 
Since the median of a single value is itself, the 
estimated value in {\bf ABE0-1NN} is the actual effort value of the closest analogy.
\item {\bf ABE0-5NN:} The 5 closest analogies are used for adaptation.
\ei
\textit{Iterative Dichotomizers} 
seek
the best attribute value $splitter$ that most simplifies the data that
fall into the different splits. 
Each such splitter becomes a root of a tree.
Sub-trees are generated
by 
calling iterative dichotomization recursively
on each of the splits.
The CART iterative dichotomizer~\cite{Breimann1984} is defined for continuous target concepts 
and its  $splitters$ strive to reduce the GINI index of the data that
falls into
each split.
In this study, we use two variants:
\bi
\item {\bf CART (yes):} This version prunes the generated tree using cross-validation.
For each cross-val, an internal node is made into a leaf (thus pruning its sub-nodes).
The sub-tree that resulted in the lowest error rate is returned. 
\item {\bf CART (no):} Uses the full tree (no pruning).
\ei

In \textit{ Neural Nets}, or {\bf NNet},
an input layer of project details
is connected to zero or more ``hidden'' layers which then  connect
to an output node (the effort prediction). The connections are weighted.
If the signal arriving to a node sums to more than some
threshold, the node  ``fires'' and a weight is propagated
across the network.  Learning in a neural net
compares the output value to the expected value, then applies some
correction method to improve the edge weights (e.g. 
back propagation).
Our {\bf NNet} uses three layers.

This study also uses four
\textit{regression methods}.
{\bf LReg} is a simple linear regression algorithm. 
Given the dependent variables, this learner calculates the coefficient estimates of the independent variables.
{\bf SWreg} is the stepwise regression discussed above. Whereas above, {\bf SWreg} was used to
select features for other learners, here we use {\bf SWreg} as a learner (that is, the predicted
value is a regression result using the features selected by the last step of {\bf SWreg}).
Partial Least Squares Regression ({\bf PLSR}) as well as Principal Components Regression ({\bf PCR}) 
are algorithms that are used to model a dependent variable.
While modeling an independent variable, they both construct new independent variables as linear combinations of original independent variables.
However, the ways they construct the new independent variables are different.
 {\bf PCR} generates new independent variables to explain the observed variability in the actual ones.
While  generating new variables the dependent variable is not considered at all.
In that respect, {\bf PCR} is similar to selection of \textit{n-many} components via {\bf PCA} (the default value of components to select is 2, so we used it that way) and applying linear regression.
{\bf PLSR}, on the other hand,
 considers the independent variable and picks up the \textit{n-many} of the new components (again with a default value of 2) that yield lowest error rate.
Due to this particular property of {\bf PLSR}, it usually results in a better fitting.


\subsection{Experiments}

\section{Results}

When we calculated the $B\&V$ values for $90$ algorithms (the algorithms in Comba paper) on various datasets, we were unable to observe the behavior of \fig{ideal-simulation}, i.e. we did not observe two distinct clusters at predicted $B\&V$ zones.
On the contrary, we observed that both $B\&V$ values are close to one another for LOO and 3Way, i.e. the two clusters mostly overlap.
Also, the \textit{ideal} or \textit{predicted} lowness and highness for $B\&V$ values were not visible too.
The actual $B\&V$ values were both high, regardless of the testing strategy.
In \fig{nasa93}, \fig{cocomo81}, \fig{desharnais} the $B\&V$ plots of $90$ algorithms (i.e. $90$ circles for 3-Way and $90$ triangles for LOO) for Nasa93, Cocomo81 and Desharnais datasets are to be seen.
All the values reported in these figures are logged.
Also note that the axes in these figures are not scaled, because the differences are so small that scaling the axes makes it difficult to observe the behavior of $B\&V$.
See in these figures, how the \textit{ideal} behavior of $B\&V$ differs from the \textit{actual} case for software effort datasets.
We have conducted these experiments on many more datasets and the results are pretty much the same: 1) No ideal behavior for 3-Way and LOO; 2) 3-Way and LOO $B\&V$ values overlap.

%\begin{figure}[th!]
%\begin{center}
%\includegraphics[width=0.4\textwidth]{lib/nasa93.png}
%\end{center}
%\caption{$B\&V$ values for Nasa93.}
%\label{fig:nasa93}
%\end{figure}


\begin{figure}[th!]
\begin{center}
\includegraphics[width=0.4\textwidth]{lib/cocomo81.pdf}
\end{center}
\caption{$B\&V$ values for Cocomo81.}
\label{fig:cocomo81}
\end{figure}

%\begin{figure}[th!]
%\begin{center}
%\includegraphics[width=0.4\textwidth]{lib/desharnais.png}
%\end{center}
%\caption{$B\&V$ values for Desharnais.}
%\label{fig:desharnais}
%\end{figure}

\begin{figure*}[th!]
\begin{center}
\scriptsize
% NOTE(review): removed a bare \baselinestretch call here -- invoking that macro
% just typesets its stored value into the output; use \setstretch{...} from the
% setspace package if a spacing change was intended.
\begin{tabular}{l | l | c | c}
Dataset & SM & Bias     & Variance     \\\hline\hline
 \multirow{3}{*}{  cocomo81 } &  3Way  & \boxplot{ 15.0 }{ 15.0 }{ 15.0 }{ 15.0 }{ 15.1 } & \boxplot{ 9.7 }{ 13.0 }{ 14.4 }{ 14.8 }{ 15.0 } \\
  &  10Way  & \boxplot{ 15.0 }{ 15.0 }{ 15.0 }{ 15.0 }{ 15.1 } & \boxplot{ 8.6 }{ 12.7 }{ 14.3 }{ 14.6 }{ 15.4 } \\
  &  LOO  & \boxplot{ 15.0 }{ 15.0 }{ 15.0 }{ 15.0 }{ 15.1 } & \boxplot{ 6.7 }{ 12.5 }{ 14.1 }{ 14.5 }{ 15.5 } \\\hline
 \multirow{3}{*}{  cocomo81o } &  3Way  & \boxplot{ 7.9 }{ 7.9 }{ 7.9 }{ 7.9 }{ 13.8 } & \boxplot{ 3.7 }{ 6.8 }{ 7.2 }{ 7.5 }{ 16.6 } \\
  &  10Way  & \boxplot{ 7.9 }{ 7.9 }{ 7.9 }{ 7.9 }{ 9.9 } & \boxplot{ 2.5 }{ 6.8 }{ 7.1 }{ 7.4 }{ 9.9 } \\
  &  LOO  & \boxplot{ 7.9 }{ 7.9 }{ 7.9 }{ 7.9 }{ 9.9 } & \boxplot{ 1.6 }{ 6.9 }{ 7.2 }{ 7.4 }{ 10.0 } \\\hline
 \multirow{3}{*}{  cocomo81e } &  3Way  & \boxplot{ 15.5 }{ 15.5 }{ 15.6 }{ 15.6 }{ 16.8 } & \boxplot{ 10.3 }{ 13.8 }{ 15.4 }{ 15.7 }{ 18.3 } \\
  &  10Way  & \boxplot{ 15.5 }{ 15.5 }{ 15.5 }{ 15.6 }{ 15.9 } & \boxplot{ 10.1 }{ 12.9 }{ 14.4 }{ 15.5 }{ 17.0 } \\
  &  LOO  & \boxplot{ 15.5 }{ 15.5 }{ 15.5 }{ 15.6 }{ 16.0 } & \boxplot{ 8.9 }{ 12.9 }{ 14.7 }{ 15.7 }{ 17.1 } \\\hline
 \multirow{3}{*}{  cocomo81s } &  3Way  & \boxplot{ 15.0 }{ 15.0 }{ 15.0 }{ 15.1 }{ 15.5 } & \boxplot{ 8.4 }{ 12.0 }{ 12.1 }{ 13.3 }{ 15.5 } \\
  &  10Way  & \boxplot{ 15.0 }{ 15.0 }{ 15.0 }{ 15.1 }{ 17.8 } & \boxplot{ 8.3 }{ 11.5 }{ 12.5 }{ 13.8 }{ 16.5 } \\
  &  LOO  & \boxplot{ 15.0 }{ 15.0 }{ 15.1 }{ 15.1 }{ 17.9 } & \boxplot{ 8.3 }{ 10.6 }{ 12.1 }{ 13.4 }{ 16.5 } \\\hline
 \multirow{3}{*}{  nasa93 } &  3Way  & \boxplot{ 14.1 }{ 14.1 }{ 14.1 }{ 14.1 }{ 14.2 } & \boxplot{ 8.6 }{ 12.7 }{ 13.5 }{ 13.7 }{ 14.4 } \\
  &  10Way  & \boxplot{ 14.1 }{ 14.1 }{ 14.1 }{ 14.1 }{ 14.2 } & \boxplot{ 7.0 }{ 12.5 }{ 13.6 }{ 13.7 }{ 14.6 } \\
  &  LOO  & \boxplot{ 14.1 }{ 14.1 }{ 14.1 }{ 14.1 }{ 14.2 } & \boxplot{ 4.8 }{ 12.5 }{ 13.4 }{ 13.6 }{ 14.4 } \\\hline
 \multirow{3}{*}{  nasa93\_center\_1 } &  3Way  & \boxplot{ 9.7 }{ 9.7 }{ 9.7 }{ 9.8 }{ 12.1 } & \boxplot{ 3.6 }{ 6.6 }{ 8.9 }{ 9.9 }{ 12.8 } \\
  &  10Way  & \boxplot{ 9.7 }{ 9.7 }{ 9.7 }{ 9.8 }{ 11.9 } & \boxplot{ 3.5 }{ 8.9 }{ 9.6 }{ 9.7 }{ 12.7 } \\
  &  LOO  & \boxplot{ 9.7 }{ 9.7 }{ 9.7 }{ 9.8 }{ 12.0 } & \boxplot{ 3.5 }{ 8.9 }{ 9.6 }{ 9.7 }{ 12.6 } \\\hline
 \multirow{3}{*}{  nasa93\_center\_2 } &  3Way  & \boxplot{ 11.6 }{ 11.6 }{ 11.6 }{ 11.7 }{ 13.2 } & \boxplot{ 7.2 }{ 10.5 }{ 11.1 }{ 11.4 }{ 15.4 } \\
  &  10Way  & \boxplot{ 11.6 }{ 11.6 }{ 11.6 }{ 11.7 }{ 14.3 } & \boxplot{ 5.5 }{ 10.4 }{ 11.1 }{ 11.3 }{ 15.8 } \\
  &  LOO  & \boxplot{ 11.6 }{ 11.6 }{ 11.6 }{ 11.7 }{ 14.6 } & \boxplot{ 4.4 }{ 10.4 }{ 11.0 }{ 11.3 }{ 15.9 } \\\hline
 \multirow{3}{*}{  nasa93\_center\_5 } &  3Way  & \boxplot{ 14.5 }{ 14.5 }{ 14.6 }{ 14.6 }{ 15.6 } & \boxplot{ 10.1 }{ 13.1 }{ 14.0 }{ 14.3 }{ 19.2 } \\
  &  10Way  & \boxplot{ 14.5 }{ 14.5 }{ 14.6 }{ 14.6 }{ 15.4 } & \boxplot{ 8.7 }{ 12.7 }{ 13.9 }{ 14.1 }{ 17.1 } \\
  &  LOO  & \boxplot{ 14.5 }{ 14.5 }{ 14.6 }{ 14.6 }{ 15.3 } & \boxplot{ 7.2 }{ 12.9 }{ 13.8 }{ 14.0 }{ 17.0 } \\\hline
 \multirow{3}{*}{  desharnais } &  3Way  & \boxplot{ 16.8 }{ 16.8 }{ 16.8 }{ 16.8 }{ 17.0 } & \boxplot{ 11.5 }{ 15.9 }{ 16.2 }{ 16.4 }{ 16.8 } \\
 &  10Way  & \boxplot{ 16.8 }{ 16.8 }{ 16.8 }{ 16.8 }{ 17.0 } & \boxplot{ 9.9 }{ 16.0 }{ 16.2 }{ 16.5 }{ 16.8 } \\
 &  LOO  & \boxplot{ 16.8 }{ 16.8 }{ 16.8 }{ 16.8 }{ 17.0 } & \boxplot{ 7.7 }{ 16.0 }{ 16.2 }{ 16.4 }{ 16.9 } \\\hline
 \multirow{3}{*}{  desharnaisL1 } &  3Way  & \boxplot{ 16.9 }{ 16.9 }{ 16.9 }{ 16.9 }{ 17.2 } & \boxplot{ 12.3 }{ 16.1 }{ 16.4 }{ 16.5 }{ 16.9 } \\
  &  10Way  & \boxplot{ 16.9 }{ 16.9 }{ 16.9 }{ 16.9 }{ 17.2 } & \boxplot{ 10.8 }{ 16.0 }{ 16.4 }{ 16.6 }{ 16.9 } \\
  &  LOO  & \boxplot{ 16.9 }{ 16.9 }{ 16.9 }{ 16.9 }{ 17.3 } & \boxplot{ 9.1 }{ 16.0 }{ 16.3 }{ 16.6 }{ 16.7 } \\\hline
 \multirow{3}{*}{  desharnaisL2 } &  3Way  & \boxplot{ 16.5 }{ 16.5 }{ 16.5 }{ 16.5 }{ 16.8 } & \boxplot{ 12.6 }{ 16.2 }{ 16.3 }{ 16.5 }{ 16.9 } \\
  &  10Way  & \boxplot{ 16.5 }{ 16.5 }{ 16.5 }{ 16.5 }{ 16.7 } & \boxplot{ 11.2 }{ 16.1 }{ 16.3 }{ 16.4 }{ 16.8 } \\
  &  LOO  & \boxplot{ 16.5 }{ 16.5 }{ 16.5 }{ 16.5 }{ 16.8 } & \boxplot{ 10.0 }{ 16.1 }{ 16.3 }{ 16.3 }{ 16.7 } \\\hline
 \multirow{3}{*}{  desharnaisL3 } &  3Way  & \boxplot{ 14.7 }{ 14.7 }{ 14.7 }{ 14.7 }{ 16.8 } & \boxplot{ 11.1 }{ 11.6 }{ 14.1 }{ 14.7 }{ 17.7 } \\
  &  10Way  & \boxplot{ 14.7 }{ 14.7 }{ 14.7 }{ 14.7 }{ 18.5 } & \boxplot{ 10.3 }{ 10.3 }{ 13.9 }{ 14.7 }{ 19.1 } \\
  &  LOO  & \boxplot{ 14.7 }{ 14.7 }{ 14.7 }{ 14.7 }{ 18.5 } & \boxplot{ 10.3 }{ 10.3 }{ 13.9 }{ 14.7 }{ 19.1 } \\\hline
 \multirow{3}{*}{  sdr } &  3Way  & \boxplot{ 8.4 }{ 8.4 }{ 8.4 }{ 8.5 }{ 15.0 } & \boxplot{ 3.4 }{ 5.5 }{ 7.5 }{ 8.6 }{ 16.1 } \\
  &  10Way  & \boxplot{ 8.4 }{ 8.4 }{ 8.5 }{ 8.5 }{ 14.9 } & \boxplot{ 2.9 }{ 5.5 }{ 6.7 }{ 7.7 }{ 15.3 } \\
  &  LOO  & \boxplot{ 8.4 }{ 8.4 }{ 8.5 }{ 8.5 }{ 14.7 } & \boxplot{ 2.1 }{ 5.2 }{ 6.8 }{ 8.4 }{ 16.3 } \\\hline
 \multirow{3}{*}{  albrecht } &  3Way  & \boxplot{ 6.7 }{ 6.7 }{ 6.7 }{ 6.7 }{ 6.8 } & \boxplot{ 2.8 }{ 5.7 }{ 6.1 }{ 6.3 }{ 6.9 } \\
  &  10Way  & \boxplot{ 6.7 }{ 6.7 }{ 6.7 }{ 6.7 }{ 6.8 } & \boxplot{ 1.2 }{ 5.7 }{ 6.3 }{ 6.5 }{ 6.9 } \\
  &  LOO  & \boxplot{ 6.7 }{ 6.7 }{ 6.7 }{ 6.7 }{ 6.8 } & \boxplot{ 0.3 }{ 5.6 }{ 6.3 }{ 6.5 }{ 7.1 } \\\hline
 \multirow{3}{*}{  finnish } &  3Way  & \boxplot{ 17.7 }{ 17.7 }{ 17.7 }{ 17.7 }{ 17.7 } & \boxplot{ 12.8 }{ 17.2 }{ 17.4 }{ 17.6 }{ 17.9 } \\
  &  10Way  & \boxplot{ 17.7 }{ 17.7 }{ 17.7 }{ 17.7 }{ 17.8 } & \boxplot{ 11.5 }{ 17.2 }{ 17.4 }{ 17.6 }{ 17.9 } \\
  &  LOO  & \boxplot{ 17.7 }{ 17.7 }{ 17.7 }{ 17.7 }{ 17.8 } & \boxplot{ 10.3 }{ 17.2 }{ 17.4 }{ 17.6 }{ 18.0 } \\\hline
 \multirow{3}{*}{  kemerer } &  3Way  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.5 } & \boxplot{ 7.3 }{ 9.0 }{ 10.1 }{ 10.8 }{ 11.9 } \\
  &  10Way  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.3 } & \boxplot{ 6.0 }{ 8.9 }{ 9.8 }{ 10.4 }{ 11.0 } \\
  &  LOO  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.3 } & \boxplot{ 5.8 }{ 8.7 }{ 9.9 }{ 10.3 }{ 11.1 } \\\hline
 \multirow{3}{*}{  maxwell } &  3Way  & \boxplot{ 18.5 }{ 18.5 }{ 18.5 }{ 18.5 }{ 18.8 } & \boxplot{ 13.6 }{ 17.4 }{ 18.2 }{ 18.3 }{ 19.6 } \\
  &  10Way  & \boxplot{ 18.5 }{ 18.5 }{ 18.5 }{ 18.5 }{ 18.6 } & \boxplot{ 12.1 }{ 17.4 }{ 18.2 }{ 18.3 }{ 19.0 } \\
  &  LOO  & \boxplot{ 18.5 }{ 18.5 }{ 18.5 }{ 18.5 }{ 18.6 } & \boxplot{ 10.1 }{ 17.4 }{ 18.2 }{ 18.4 }{ 19.0 } \\\hline
 \multirow{3}{*}{  miyazaki94 } &  3Way  & \boxplot{ 10.8 }{ 10.8 }{ 10.9 }{ 10.9 }{ 10.9 } & \boxplot{ 5.3 }{ 7.6 }{ 9.1 }{ 10.4 }{ 11.3 } \\
 &  10Way  & \boxplot{ 10.8 }{ 10.8 }{ 10.9 }{ 10.9 }{ 10.9 } & \boxplot{ 4.6 }{ 7.8 }{ 8.8 }{ 10.7 }{ 11.7 } \\
 &  LOO  & \boxplot{ 10.8 }{ 10.8 }{ 10.9 }{ 10.9 }{ 10.9 } & \boxplot{ 3.1 }{ 7.4 }{ 8.7 }{ 10.2 }{ 11.5 } \\\hline
 \multirow{3}{*}{  telecom } &  3Way  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.3 } & \boxplot{ 7.6 }{ 10.0 }{ 10.5 }{ 10.8 }{ 11.0 } \\
 &  10Way  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.3 } & \boxplot{ 5.9 }{ 10.0 }{ 10.5 }{ 10.9 }{ 11.1 } \\
 &  LOO  & \boxplot{ 11.1 }{ 11.1 }{ 11.1 }{ 11.1 }{ 11.4 } & \boxplot{ 5.4 }{ 9.9 }{ 10.5 }{ 10.9 }{ 11.1 } \\
\end{tabular}
\end{center}
\caption{$B\&V$ values in quartiles.}
% NOTE(review): this label is misleading — the float is the quartile summary for
% ALL datasets, not a Desharnais figure, yet \fig{desharnais} (start of this
% section) resolves here. Rename (e.g. fig:bv-quartiles) together with that
% reference; kept as-is so the existing cross-reference still compiles.
\label{fig:desharnais}
\end{figure*}

\newcommand{\R}{\rowcolor[rgb]{0.8,0.8,0.8}}
\newcommand{\G}{\cellcolor[rgb]{0.8,0.8,0.8}}

\begin{figure}[th!]
\begin{center}
{\scriptsize  
\begin{tabular}{l | l | l l | l  l }
dataset &   \multicolumn{3}{c|}{bias}	&	\multicolumn{2}{c}{variance} \\\hline\hline
 &  \multicolumn{2}{r}{3Way} & 10Way  & 3Way & 10Way \\\cline{3-6} 
\multirow{2}{*}{cocomo81} 	&	 LOO 	&	{\G}	43.33	&	{\G}	82.22	&	{\G}	56.67	&	{\G}	80.00	\\
	&	 3Way 	&			&	{\G}	21.11	&			&	{\G}	40.00	\\\hline
\multirow{2}{*}{cocomo81o} 	&	 LOO 	&	{\G}	91.11	&	{\G}	100.00	&	{\G}	75.56	&	{\G}	93.33	\\
	&	 3Way 	&			&	{\G}	90.00	&			&	{\G}	63.33	\\\hline
\multirow{2}{*}{cocomo81e} 	&	 LOO 	&	{\G}	67.78	&	{\G}	88.89	&	{\G}	54.44	&	{\G}	77.78	\\
	&	 3Way 	&			&	{\G}	35.56	&			&	{\G}	18.89	\\\hline
\multirow{2}{*}{cocomo81s} 	&	 LOO 	&	{\G}	62.22	&	{\G}	86.67	&	{\G}	55.56	&	{\G}	74.44	\\
	&	 3Way 	&			&	{\G}	32.22	&			&	{\G}	34.44	\\\hline
\multirow{2}{*}{nasa93} 	&	 LOO 	&	{\G}	81.11	&	{\G}	90.00	&	{\G}	62.22	&	{\G}	75.56	\\
	&	 3Way 	&			&	{\G}	58.89	&			&	{\G}	60.00	\\\hline
\multirow{2}{*}{nasa93\_center\_1} 	&	 LOO 	&	{\G}	94.44	&	{\G}	94.44	&	{\G}	46.67	&	{\G}	84.44	\\
	&	 3Way 	&			&	{\G}	81.11	&			&	{\G}	46.67	\\\hline
\multirow{2}{*}{nasa93\_center\_2} 	&	 LOO 	&	{\G}	84.44	&	{\G}	95.56	&	{\G}	76.67	&	{\G}	91.11	\\
	&	 3Way 	&			&	{\G}	57.78	&			&	{\G}	42.22	\\\hline
\multirow{2}{*}{nasa93\_center\_5} 	&	 LOO 	&	{\G}	86.67	&	{\G}	96.67	&	{\G}	70.00	&	{\G}	87.78	\\
	&	 3Way 	&			&	{\G}	71.11	&			&	{\G}	41.11	\\\hline
\multirow{2}{*}{desharnais} 	&	 LOO 	&	{\G}	100.00	&	{\G}	100.00	&	{\G}	91.11	&	{\G}	93.33	\\
	&	 3Way 	&			&	{\G}	100.00	&			&	{\G}	81.11	\\\hline
\multirow{2}{*}{desharnaisL1} 	&	 LOO 	&	{\G}	100.00	&	{\G}	100.00	&	{\G}	91.11	&	{\G}	92.22	\\
	&	 3Way 	&			&	{\G}	97.78	&			&	{\G}	85.56	\\\hline
\multirow{2}{*}{desharnaisL2} 	&	 LOO 	&	{\G}	98.89	&	{\G}	100.00	&	{\G}	91.11	&	{\G}	93.33	\\
	&	 3Way 	&			&	{\G}	94.44	&			&	{\G}	68.89	\\\hline
\multirow{2}{*}{desharnaisL3} 	&	 LOO 	&	{\G}	94.44	&	{\G}	100.00	&	{\G}	60.00	&	{\G}	100.00	\\
	&	 3Way 	&			&	{\G}	85.56	&			&	{\G}	43.33	\\\hline
\multirow{2}{*}{sdr} 	&	 LOO 	&	{\G}	52.22	&	{\G}	64.44	&	{\G}	28.89	&	{\G}	62.22	\\
	&	 3Way 	&			&	{\G}	20.00	&			&	{\G}	16.67	\\\hline
\multirow{2}{*}{albrecht} 	&	 LOO 	&	{\G}	98.89	&	{\G}	100.00	&	{\G}	78.89	&	{\G}	93.33	\\
	&	 3Way 	&			&	{\G}	77.78	&			&	{\G}	50.00	\\\hline
\multirow{2}{*}{finnish} 	&	 LOO 	&	{\G}	100.00	&	{\G}	100.00	&	{\G}	91.11	&	{\G}	92.22	\\
	&	 3Way 	&			&	{\G}	100.00	&			&	{\G}	84.44	\\\hline
\multirow{2}{*}{kemerer} 	&	 LOO 	&	{\G}	92.22	&	{\G}	100.00	&	{\G}	77.78	&	{\G}	85.56	\\
	&	 3Way 	&			&	{\G}	82.22	&			&	{\G}	57.78	\\\hline
\multirow{2}{*}{maxwell} 	&	 LOO 	&	{\G}	94.44	&	{\G}	100.00	&	{\G}	81.11	&	{\G}	88.89	\\
	&	 3Way 	&			&	{\G}	82.22	&			&	{\G}	64.44	\\\hline
\multirow{2}{*}{miyazaki94} 	&	 LOO 	&	{\G}	76.67	&	{\G}	93.33	&	{\G}	52.22	&	{\G}	77.78	\\
	&	 3Way 	&			&	{\G}	50.00	&			&	{\G}	35.56	\\\hline
\multirow{2}{*}{telecom} 	&	 LOO 	&	{\G}	100.00	&	{\G}	100.00	&	{\G}	91.11	&	{\G}	95.56	\\
	&	 3Way 	&			&	{\G}	100.00	&			&	{\G}	70.00	\\\hline
\end{tabular}
}
\end{center}
\caption{Percentage of ties. For every highlighted cell, the percentage of ties w.r.t. the dataset size is given.}
% NOTE(review): dropped the sentence claiming LOO, 3Way and 10Way are denoted by
% the letters $a$, $b$ and $c$ — the table uses the method names directly, so
% that sentence was stale and confusing.
\label{fig:bias-var-table}
\end{figure}

The plots of the sorted $B\&V$ values of \fig{bias-var-table} are given in \fig{sortedBiasPlot} and \fig{sortedVariancePlot}.

\begin{figure}[th!]
\begin{center}
\includegraphics[width=0.4\textwidth]{lib/sortedBiasTiePerc.pdf}
\end{center}
\caption{Sorted bias values of LOO, 3Way and 10Way. Actual values are given in \fig{bias-var-table}.}
\label{fig:sortedBiasPlot}
\end{figure}

% NOTE(review): this float was a byte-for-byte duplicate of the bias figure above
% (same image, caption, and label — a duplicate \label), while the
% \fig{sortedVariancePlot} referenced in the text was never defined. Corrected to
% the variance plot — confirm lib/sortedVarianceTiePerc.pdf is the intended file.
\begin{figure}[th!]
\begin{center}
\includegraphics[width=0.4\textwidth]{lib/sortedVarianceTiePerc.pdf}
\end{center}
\caption{Sorted variance values of LOO, 3Way and 10Way. Actual values are given in \fig{bias-var-table}.}
\label{fig:sortedVariancePlot}
\end{figure}


\subsection{Conclusions}

We observe no practical difference between the sampling methods with respect to bias and variance: the $B\&V$ values of LOO, 3Way and 10Way overlap, and none of them exhibits the ideal bias-variance behavior on the software effort datasets.

\bibliographystyle{abbrv}
\bibliography{icse,timm,references,library}
\end{document}
