\documentclass{svjour3}                     % onecolumn (standard format)
%\documentclass[smallextended]{svjour3}     % onecolumn (second format)
%\documentclass[twocolumn]{svjour3}         % twocolumn
%
\smartqed  % flush right qed marks, e.g. at end of proof
%
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{fancyvrb}
\usepackage{algorithmic}
\usepackage{cite}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{subfigure}
\usepackage{float}
\restylefloat{figure}
\usepackage{times}
\usepackage{algorithm}

% smallitem: an itemize list with its vertical spacing lengths set to 0pt.
% The lengths assigned before \begin{itemize} are in force while the list is
% being opened; those assigned after it apply inside the already-open list.
% Each line of the begin-code ends in `%' so that the line breaks of this
% definition do not become space tokens when the environment is started in
% the middle of a paragraph.
\newenvironment{smallitem}
 {\setlength{\topsep}{0pt}%
  \setlength{\partopsep}{0pt}%
  \setlength{\parskip}{0pt}%
  \begin{itemize}%
  % NOTE(review): \leftmargin is assigned after the list has been opened,
  % so it probably does not move this list's margin -- confirm before
  % relying on the .2in value.
  \setlength{\leftmargin}{.2in}%
  \setlength{\parsep}{0pt}%
  \setlength{\parskip}{0pt}%
  \setlength{\itemsep}{0pt}}
 {\end{itemize}}

% smallenum: an enumerate list with its vertical spacing lengths set to 0pt.
% The lengths assigned before \begin{enumerate} are in force while the list
% is being opened; those assigned after it apply inside the already-open list.
% Each line of the begin-code ends in `%' so that the line breaks of this
% definition do not become space tokens when the environment is started in
% the middle of a paragraph.
\newenvironment{smallenum}
 {\setlength{\topsep}{0pt}%
  \setlength{\partopsep}{0pt}%
  \setlength{\parskip}{0pt}%
  \begin{enumerate}%
  % NOTE(review): \leftmargin is assigned after the list has been opened,
  % so it probably does not move this list's margin -- confirm before
  % relying on the .2in value.
  \setlength{\leftmargin}{.2in}%
  \setlength{\parsep}{0pt}%
  \setlength{\parskip}{0pt}%
  \setlength{\itemsep}{0pt}}
 {\end{enumerate}}


\usepackage[table]{xcolor}
\usepackage{url,graphicx}
% \G shades a table cell light grey (rgb 0.8,0.8,0.8) via \cellcolor.
\newcommand*{\G}{\cellcolor[rgb]{0.8,0.8,0.8}}
% Cross-reference shorthands.  The argument is the label WITHOUT its prefix:
% \fig{foo} refers to fig:foo, \eq{foo} to eq:foo, \hyp{foo} to hyp:foo and
% \tion{foo} to sec:foo.  The starred \newcommand* makes a forgotten closing
% brace fail at the next paragraph break instead of swallowing the document.
\newcommand*{\fig}[1]{Figure~\ref{fig:#1}}
\newcommand*{\eq}[1]{Equation~\ref{eq:#1}}
\newcommand*{\hyp}[1]{Hypothesis~\ref{hyp:#1}}
% \tion prints the section sign directly followed by the section number.
\newcommand*{\tion}[1]{\S\ref{sec:#1}}

% Two-letter shorthands for opening and closing list environments:
%   \bi ... \ei   compact itemize     (smallitem)
%   \be ... \ee   compact enumerate   (smallenum)
%   \bd ... \ed   standard description list
\newcommand*{\bi}{\begin{smallitem}}
\newcommand*{\ei}{\end{smallitem}}
\newcommand*{\be}{\begin{smallenum}}
\newcommand*{\ee}{\end{smallenum}}
\newcommand*{\bd}{\begin{description}}
\newcommand*{\ed}{\end{description}}


\begin{document}

\title{21\textsuperscript{st} Century Software Effort Estimation Application Process}
%\subtitle{Investigation of stability}

%\titlerunning{Short form of title}        % if too long for running head

\author{Jacky W. Keung       \and
Ekrem Kocaguneli        \and
\\
        Tim Menzies 
}

%\authorrunning{Short form of author list} % if too long for running head

\institute{Jacky W. Keung \at
              Department of Computing\\
              The Hong Kong Polytechnic University\\
              Kowloon, Hong Kong\\
              \email{Jacky.Keung@comp.polyu.edu.hk}
              \and
E. Kocaguneli and T. Menzies\at
              Lane Department of Computer Science and Electrical Engineering \\
              West Virginia University\\
              Morgantown, WV 26505, USA\\
              \email{ekocagun@mix.wvu.edu, tim@menzies.us}
}

\date{Received: 24 December 2010 / Accepted: February 2011
\\ Springer Science+Business Media, LLC 2011
}
% The correct dates will be entered by the editor


\maketitle

\begin{abstract}
This paper shows that we can propose a strong/weak relationship between datasets\dots
The datasets used by prior publications are too few in number
to distinguish {\em strong/weak} datasets.
\end{abstract}


%In \fig{error_datasets}, we report percentage of {\em losses} for each error measure separately, instead of the {\em losses} reported above.
%The maximum number of wins for any dataset over ninety algorithms per error measure is $89\times90=8,010$. 
%\fig{error_datasets} sorts all 20 data sets by their total wins
%in all 
%seven performance criteria separately (expressed as a ratio of $8,010$). For example,
%with 
%the TELECOM dataset, all 90 methods rarely won.
Similarly, the maximum number of losses for any dataset over ninety algorithms is $89\times7\times90=56{,}070$.
\fig{error_datasets} sorts all 20 datasets by their total losses
in all
seven performance criteria (expressed as a ratio of 56,070). For example,
with
the TELECOM dataset, all 90 methods rarely lost.

%\begin{figure}[!b]
%\begin{center} \includegraphics[width=3in]{lib/cols-win-all-together.pdf} \end{center}
%\caption{Total wins seen in 20 datasets, expressed as a percentage
%of the maximum number of possible wins seen for one datasets (so 100\%=8,010).}\label{fig:error_datasets}
%\end{figure}

\begin{figure}[!b]
\begin{center} \includegraphics[width=3in]{lib/cols.pdf} \end{center}
\caption{Total losses seen in 20 datasets, expressed as a percentage
of the maximum number of possible losses seen for one dataset (so 100\%=56,070).}\label{fig:error_datasets}
\end{figure} 


\fig{rank_changes_dataset} is, in a sense, a continuation of \fig{error_datasets}: it deals with the stability of datasets.
To test the stability, we examine the mean of the maximum rank change among datasets when they are sorted w.r.t.\ $win$, $tie$, and $win-loss$ over 7 error measures.
\fig{rank_changes_dataset} shows that the maximum value of the mean rank change is $18$, i.e.\ a dataset ranked $2^{nd}$ in one scenario can rank $20^{th}$ in another scenario.
Therefore, with this number of datasets, it is not sound to propose {\em strong} or {\em weak} datasets that always attain the lowest/highest performance values.
If a dataset can change its position by $+x$ or $-x$, then there is a need for a window size of at least $2x$, and possibly some more datasets, to actually observe how datasets would rank.

%\fig{rank_changes_dataset} distinguishes 3 regions when we impose a less than 10 mean rank change line.
%In these 3 regions, only $r2$ has datasets with less than 10 mean rank change.
%$r2$ contains 9 different datasets from cocomo81o to desharnais.
%When we impose these 3 regions onto \fig{error_datasets}, we see that stable datasets fall between the loss percentages of $5\%$ to $15\%$ (see dashed lines).

\begin{figure}[!b]
\begin{center} \includegraphics[width=0.7\textwidth]{lib/rank-changes-dataset.pdf} \end{center}
\caption{Datasets and the mean of their maximum rank changes over all performance measures w.r.t.\ $win$, $loss$ and $win-loss$ values. Some datasets have lower rank changes. However, the maximum mean rank change is around $18$ and we need more than $2 \times 18 = 36$ datasets to claim an order, and hence a strong/weak relationship, between datasets.}
\label{fig:rank_changes_dataset}
\end{figure}


Our datasets could be sorted according to how well
they can distinguish between effort estimators; for that purpose, however, more publicly available datasets are needed.
%specifically, eleven
%datasets (in common use  in the effort estimation literature)   are
%{\em weak}; i.e. poorly distinguish the behavior of different
%estimators. 


\bibliographystyle{abbrv}
\bibliography{icsp}

\end{document}