% This is LLNCS.DEM the demonstration file of
% the LaTeX macro package from Springer-Verlag
% for Lecture Notes in Computer Science,
% version 2.3 for LaTeX2e
%
\documentclass[9pt]{llncs}
\usepackage{endfloat}
\usepackage{graphicx}
\usepackage{alltt}
\usepackage{wrapfig,pifont,url,cite,times}
%
\usepackage{makeidx}  % allows for index generation
\newenvironment{smallitem}
 {\setlength{\topsep}{0pt}
  \setlength{\partopsep}{0pt}
  \setlength{\parskip}{0pt}
  \begin{itemize}
   \setlength{\leftmargin}{.2in}
  \setlength{\parsep}{0pt}
  \setlength{\parskip}{0pt}
  \setlength{\itemsep}{0pt}}
 {\end{itemize}}
\newenvironment{smallenum}
 {\setlength{\topsep}{0pt}
  \setlength{\partopsep}{0pt}
  \setlength{\parskip}{0pt}
  \begin{enumerate}
  \setlength{\leftmargin}{.2in}
  \setlength{\parsep}{0pt}
  \setlength{\parskip}{0pt}
  \setlength{\itemsep}{0pt}}
 {\end{enumerate}}

 \newcommand{\bd}{\begin{description}}
\newcommand{\ed}{\end{description}}
 \newcommand{\bi}{\begin{smallitem}}
\newcommand{\ei}{\end{smallitem}}
 \newcommand{\be}{\begin{smallenum}}
\newcommand{\ee}{\end{smallenum}}
\newcommand{\fig}[1]{Figure~\ref{fig:#1}}
\newcommand{\eq}[1]{Equation~\ref{eq:#1}}
\newcommand{\tion}[1]{\S\ref{sec:#1}}

%

\renewcommand{\baselinestretch}{1.3} 

\begin{document}
%
\frontmatter          % for the preliminaries
%
%
\title{Accurate Estimates Without Local Data?}

%
%
\author{Tim Menzies \inst{1} 
\and Steve Williams \inst{1} 
\and Oussama Elrawas  \inst{1}  
\and Daniel Baker\inst{1} 
\and Barry Boehm \inst{2}
\and Jairus Hihn\inst{3} 
\and Karen Lum\inst{3}
\and Ray Madachy \inst{4} 
%\thanks{
%Parts of this research was carried out at West Virginia University, the University of Southern California,
%and 
%the Naval Postgraduate School.
%Other parts of this research was carried out at
%the Jet Propulsion Laboratory, California Institute of Technology, under a contract with the National Aeronautics and Space Administration.
%Reference herein to any specific commercial product, process, or service by trade name, trademark, manufacturer, or otherwise, does not constitute or imply its endorsement by the United States Government or the Jet Propulsion Laboratory, California Institute of Technology.
%} 
}
%
\authorrunning{Menzies et al.}   % abbreviated author list (for running head)
%
%%%% list of authors for the TOC (use if author list has to be modified)
\tocauthor{Tim Menzies, Oussama Elrawas, Barry Boehm, Ray Madachy}

%
\institute{LCSEE,
West Virginia University, 
Morgantown, WV, USA,
\email{tim@menzies.us}, \email{oelrawas@mix.wvu.edu}, 
\email{swilli12@mix.wvu.edu}, \email{danielryanbaker@gmail.com}
\and
CS,
 University of Southern California,
Los Angeles, California, USA,
\email{boehm@sunset.usc.edu},
\email{madachy@usc.edu}
\and
JPL,  California, USA,
\email{jairus.hihn@jpl.nasa.gov}, \email{karen.t.lum@jpl.nasa.gov}
\and
SE, Naval Postgraduate School, San Diego CA, USA,
\email{rjmadach@nps.edu}
}
 
\pagenumbering{arabic}
\pagestyle{plain}  % switches on printing of running heads
\thispagestyle{empty}
\maketitle              % typeset the title of the contribution

\begin{abstract}
Models of software projects input project details and output predictions
via their internal tunings. The output predictions, therefore, are
affected by variance in the project details $P$ and variance in
the internal tunings $T$.  
Local data is often used to constrain 
the internal tunings (reducing $T$).

While constraining internal tunings with
local data is always the preferred option, there exist some models
for which constraining tuning is optional.
We show empirically that, for the USC COCOMO family of models, 
the effects of $P$ dominate the effects of $T$;
 i.e., the output variance of these models can be controlled 
{\em without using local data to
constrain the tuning variance} (in ten case
studies, we show that the estimates generated by only constraining
$P$ are very similar to those produced by constraining $T$ with
historical data).

We conclude that, if possible, models should be designed such that the effects
of the project options dominate the effects of the tuning options.
Such models can be used for the purposes of decision making without
elaborate, tedious, and time-consuming data collection from the local domain.

{\em KEYWORDS:} AI, decision making, software engineering, model-based project management,
search.

{\em LENGTH:} 5500 words + 8 figures (at 250 words each) = 7500 words
 \end{abstract} % \section{Introduction}


 
% Consequently, for some years now, we have been exploring methods to
%% reduce the uncertainty in model
%% estimation~\cite{me04h,me05a,me05c,me05d,me06d,me06e,me06f,baker07}.  Like
%% other researchers, we have explored data pruning methods: given a
%% table of $V$ variables from $P$ projects, we find a new table $P'*V'$
%% where $P' \subseteq P$ and $V' \subseteq V$.  Pruning is useful since
%% project data collected in one context may not be relevant to another.
%% Similar projects have less variation in their data and so can make
%% better use of local data for the purposes of model calibration.  For
%% example, Chulani et.al.~\cite{chulani99} \& Shepperd and
%% Schofield~\cite{shepperd97} report that intelligently pruning project
%% data improves estimation accuracy.  Similarly, there are many
%% theoretical~\cite{miller02} and experimental
%% results~\cite{me05c,me05d,me06d} showing that intelligently pruning
%% model variables can reduce estimation variance and error.

%% While certainly useful, data pruning still assumes that there is data to prune.
%% This is not always the case.
%If enough high-quality data is available, then  
%it is good practice to tune general process models with local information.
%However, it can be  very difficult
%to find relevant data within a single organization to 
%precisely tune all the internal parameters inside a process model.
%For example:
%\bi
%\item After 26 years of trying, we have only collected less
%than 200 sample projects for the COCOMO database.
%\item
%Even after two years of effort we were only able to
%add 7 records to a NASA-wide software cost metrics repository~\cite{me07f}.
%\ei
%There are many reasons for this ``data drought'' including data not
%being collected or the business sensitivity associated with the
%data as well as differences in how the metrics are defined, collected
%and archived. For years, we have struggled with the data drought
%problem and have recommended elaborate feature subset selection
%methods to prune uninformative data~\cite{chen05,me06d}. Here, we take
%a radically different approach and explore the value of models that have
%not been tuned.

\section{Introduction}
Predictions are generated using data and/or models.
If data is available, it is possible to learn new models.
If data is unavailable, it is possible to reuse models learned from
other sites. Typically, models taken from other sites must be tuned
using local data.
This paper is about a novel technique for reusing models {\em without}
tuning them.

In model-based project management, models are used to
find better ways
to organize a project. Managers conduct what-if queries across the space of project options $P$
to find  a subset of those options  
that most improves predictions about the project (e.g. 
reduce defects and development time). 
This problem can be formalized as follows: find the smallest $P' \subseteq P$
that most improves model estimates.
Conceptually,
the estimates generated by a model are a function of the project options $P$ and the internal
model tunings $T$; i.e.
\begin{equation}{\mbox {\em
estimate = project  * tuning            } 
}\end{equation}
For example, consider the following simplified COCOMO model, 
\begin{equation}\label{eq:simp}
effort = a \cdot LOC^{b +  pmat}\cdot acap  
\end{equation}
Here, $a,b$ control the linear and exponential
effects (respectively) on model estimates,
while
$pmat$ (process maturity) and $acap$ (analyst capability) are project choices
adjusted by managers. 
That is, the tuning options $T$ are the range of $a,b$ and the project 
options $P$ are the range of $pmat$ and $acap$.
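To make the distinction concrete, here is a small Python sketch of \eq{simp}; the numeric ranges for $a$ and $b$, and the sample values for the project options, are assumptions for illustration only, not calibrated COCOMO values:
{\scriptsize\begin{verbatim}
import random

def simple_cocomo(loc, pmat, acap, a, b):
    # the simplified COCOMO model: effort = a * LOC^(b + pmat) * acap
    return a * loc ** (b + pmat) * acap

# Project options P (set by managers); example values only.
loc, pmat, acap = 100, 0.05, 0.85

# Tuning options T (normally fit from local data); sampled here from
# an assumed plausible range, to illustrate the tuning variance.
a = random.uniform(2.5, 3.2)
b = random.uniform(0.9, 1.1)

print(simple_cocomo(loc, pmat, acap, a, b))
\end{verbatim}}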

Traditional approaches use historical data to reduce the  space of possible
calibrations (e.g. using regression). In our approach, we leave the tunable  variables
unconstrained and instead use an AI search engine to reduce the space  of possibilities in
the project options. 
Our SEESAW tool
performs large scale what-if queries looking for 
the smallest $P' \subseteq P$
that most improves model estimates.
SEESAW
constrains  project options $P$, 
{\em but not
the tuning options $T$}.
In two studies (ASE 2007~\cite{me07f}
and ICSE 2009~\cite{me09a}),
Menzies, Boehm, Hihn, et al. found that, at least
for the USC models we use, estimate variance was controlled more by project variance than by tuning variance.
That is, in terms of what factors dominate the output:
\begin{equation}\label{eq:goodnews}
\mbox{\em
estimate = {\LARGE project}  * {\scriptsize tuning}              
}\end{equation}
Note the {\em reverse case} of the above equation: 
\begin{equation}\label{eq:badnews}{\mbox {\em
estimate = {\scriptsize project}  * {\LARGE tuning}            
}}
\end{equation}
In the case of \eq{badnews}, the tuning variance is the major controller of the estimate and 
decision making must be delayed until after tuning.
\eq{badnews} is undesirable when it is
difficult to access the
data required for constraining model tunings. This ``data drought''
is quite common:
\bi
\item
Metrics-guru Norman Fenton spent years advocating careful data collection~\cite{fenton96}.
Recently, he has 
despaired of that approach. At a keynote address in 2007\footnote{\url{http://promisedata.org/?cat=130}} he shocked
his audience by saying:
\begin{quote}
{\em ``....much of the current software metrics research is inherently irrelevant to the industrial mix ...
any software metrics program that depends on some extensive metrics collection is doomed to failure.''} 
\end{quote}
\item
Our experience is similar to that of Fenton. After 26 years of trying, we have collected fewer
than 200 sample projects for the COCOMO database.
Also,
even after two years of effort we were only able to
add 7 records to a NASA-wide software cost metrics repository~\cite{me07f}.
\ei
There are many reasons for this data drought including data not
being collected or the business sensitivity associated with the
data, as well as differences in how the metrics are defined, collected
and archived. 
Also, within the organizations we have contact with,
we note a decreasing willingness to share data compared to, say, 15 years ago.
 

%Process models can be viewed
%as a set of constraints between inputs and  output. In this view, tuning a model using local data
%{\em constrains}
%the model to reduce the  {\em variance} in the model output. 
%It is useful to distinguish two classes of variables,
%both of which are illustrated in the following
%simplified COCOMO model:
%Both classes of variables introduce variance into the estimates:
%\bi
%\item {\em Tuning variance:} 
%variability in the tuning variables due to training data that is incomplete or noisy;
% \item {\em Project variance:}
%variability in the project choices due to, say, uncertainty about the process maturity of the sub-contractors.
%\ei
%This paper shows that the tuning variance is surprisingly high, even after applying historical data. 
%Since tuning variance remains an open issue, it  is prudent
%to  explore the effects
%of project choices.
Whatever the reason, we often need to adjust our modeling methodologies to accommodate the data drought.
For example, we could assume \eq{goodnews} and use SEESAW. But before
we can trust such tools, we must check that AI tools like SEESAW are no less
accurate at estimation than (say) traditional linear regression methods.
Accordingly, this paper compares:
\bi
\item Estimates generated by conventional methods that reduce the tuning variance $T$ using local data;
\item Estimates generated by SEESAW  that reduce the project variance $P$ without constraining
the tuning variance.
\ei
For the USC models used in this study,  the range of estimate errors seen after constraining the
project options $P$ (but not the tuning options $T$)
is similar to the range seen after constraining just the
tuning options $T$.  
That is, using our AI methods, we can perform model-based project management without requiring local data.

%It is unknown if our results apply to software process models more
%complex than the COCOMO-style of models used in this study. However,
%our results to date suggest that our procedure should be applied
%to other process models:
%\bi
%\item find the key project choices  that most effect model output;
%\item constrain them;
%\item check for stable conclusions in the constrained space.
%\ei

The rest of this paper is structured as follows.
After discussing the  implications of this work,
we review the models used in this study
and the AI search engine that finds
useful constraints to project choices. This is followed by a description of an
experiment that compares model estimates
after constraining (a) just the project options using  SEESAW
or (b) just the
tuning options using linear regression.

\section{Implications}
\subsection{Implications of SEESAW for Early Lifecycle Decision Making}
Standard practice in model building normally involves three steps:
\bi
\item {\em Step 1:} Collecting domain knowledge (previous results, expert knowledge); 
\item {\em Step 2:} Building an initial model based on Step 1, including as-yet-unknown parameters. Note
that these unknowns represent a range of tuning options. 
\item {\em Step 3}: Constraining tuning options using local data (e.g. via linear regression).
\ei
In domains suffering from a  data drought,
we cannot consider
methods requiring large volumes of data. In the approach advocated
by this paper, we reuse existing models (from USC) and then study them with an AI search engine
that only reduces project options (and not tuning options). 
This approach  removes the need for Step 1 and Step 2,
and  does not require Step 3. Yet, as shown below, it still makes accurate effort
predictions.
Since our method does not require detailed domain knowledge it can
be applied very early in the process life cycle.


From a business perspective, our result means that certain models 
can be used for decision making in one of two ways:
\be
\item 
{\em Either}
constrain the
tuning variance using historical data;
\item
{\em Or} constrain the project
variance using an AI search engine like SEESAW.
\ee
Note that this second method avoids a lengthy and expensive
data collection phase prior to decision making.
This result is of tremendous practical benefit when
it is difficult
to find relevant data within a single organization to 
constrain the tuning options of a model.


\subsection{Implications of the ``Data Drought'' on Model Selection}

Our introduction mentioned Norman Fenton's pessimism on the practicality of industrial data collection for
software engineering. Sometimes, Fenton's pessimism is unfounded.
There exists an increasing number of high process maturity organizations that store large
amounts of consistent  data
collected from projects with well defined processes.
The techniques reported in this paper (AI search using the SEESAW algorithm)
are not required for such data-rich domains.

However, in data-starved domains, a ``Goldilocks'' principle might be appropriate:
\bi
\item
Tiny models offer trite conclusions and are insensitive to important project features. 
\item
Very large models need extensive data collection to constrain the internal tunings. 
\item 
In between there may exist some models that are   ``just right'';
i.e., big enough to draw interesting conclusions,
but small enough such that the internal tuning variance does not dominate
the variance resulting from the input project options.
\ei

We make no claim that {\em all} process models are ``just
right'' and, hence, can be controlled by our methods.
Such process models can be quite complex and include:
discrete-event models~\cite{law00,kelton02};
system dynamics models~\cite{hamid91}; 
state-based models~\cite{akhavi93,harel90,martin00};
rule-based programs~\cite{mi90}; 
or  standard programming constructs
such as those used in Little-JIL~\cite{cass00,wise00}.
These rich modeling frameworks
allow the representation of 
detailed insights into an organization. However,  in data starved domains,
the effort required
to tune may be non-trivial. 
In terms of the Goldilocks principle, we suspect
that many process models may not be  near the ``right size'' and will require
extensive tuning before they can be used for decision making. 
Hence, in domains suffering from a data drought, we would advocate ``just right'' models
like those from USC.

\section{Tools Used in Our Experiments}
\subsection{Models used in this Study}\label{sec:mods}
The USC models, we argue, are ``just right'' because they have been
developed, refined, and constrained over a very long period of time.
The range of tuning options
we explore below are taken from 30 years of modeling experience and regression
studies of hundreds of projects~\cite{boehm00a}.
The variables in our models
have been selected 
and tested by a large community of academic and industrial 
researchers led by Boehm (this large group has met annually since 1985).
It is hardly surprising that, for the USC models, the project options
dominate the tuning options since these tuning options have been
refined and constrained by decades
of work. 

The USC models have other
useful features.
Unlike other models such as PRICE TRUE PLANNING~\cite{park88}, 
SLIM~\cite{putnam92}, or SEER-SEM~\cite{jensen83},
the COCOMO family of models
is fully described in the literature. 
Also, at least
for the effort model, there exist  baseline results~\cite{chulani99}. 
Further,
we work extensively with government agencies writing software.
Amongst those agencies, these models are frequently used to generate
and justify budgets.
Lastly,
the space of possible tunings within COCOMO \& COQUALMO
is well defined (see below). Hence, it is possible
to explore the space of possible tunings.

\begin{figure}
\begin{center}
{\scriptsize
\begin{tabular}{p{3cm}|r@{:~}l|c|c|}
\multicolumn{3}{c}{~}&strategic?&\multicolumn{1}{|c}{tactical?}\\\cline{2-5}
scale   &prec & have we done this before?&\ding{51}&\\
factors &flex & development flexibility &&\ding{51}\\
(exponentially        &resl & any risk resolution activities?&&\ding{51}\\ 
decrease       &team &  team cohesion&&\ding{51}\\ 
effort)       &pmat & process maturity&\ding{51}&\\\hline 
upper  &acap & analyst capability&\ding{51}&\\
(linearly       &pcap & programmer capability&\ding{51}&\\ 
decrease      &pcon & programmer continuity&\ding{51}&\\
effort)       &aexp &  analyst experience&\ding{51}&\\       
&pexp &  programmer experience&\ding{51}&\\       
&ltex &  language and tool experience&\ding{51}&\\       
&tool &  tool use&&\ding{51}\\       
&site &  multiple site development&\ding{51}&\\       
&sced & length of schedule  &&\ding{51} \\\hline 
lower &rely &    required reliability && \\
(linearly      &data &   secondary memory  storage requirements&&\ding{51}\\
increase      &cplx &  program complexity&&\ding{51}\\
effort)      &ruse &  software reuse&&\ding{51}\\      
&docu &   documentation requirements&&\ding{51}\\      
&time &   runtime pressure&&\\      
&stor &   main memory requirements&&\ding{51}\\     
&pvol &    platform volatility  &&\\\hline
COQUALMO &auto& automated analysis&\ding{51}&\ding{51}\\
defect removal&execTest & execution-based testing tools&\ding{51}&\ding{51}\\
methods&peer& peer reviews&\ding{51}&\ding{51}\\\cline{2-5}
\end{tabular}}
\end{center}
\caption{The variables of COCOMO, COQUALMO, and the THREAT model.}\label{fig:emsf2}
\end{figure}

\fig{emsf2} shows the variables used by our models.
The last two columns of this figure
show the results of a Delphi panel session at the
Jet Propulsion Laboratory (JPL) where the COCOMO variables
were separated into:
\bi
\item the $tactical$ variables
that can be changed within the space of one project;
\item
the  $strategic$ variables
that require higher-level institutional change (and so may take
longer to change).
\ei
For example, the panel declared that $pmat$ (process maturity)
is hard to change within the space of a single JPL project. 
When searching for subsets of project options that improve model predictions,
we take care to separate our search into either {\em tactical} or {\em strategic} runs.
Note that these definitions of $strategic$ and $tactical$ options
are not hard-wired into our system. 
If a user disagrees with our definitions of
strategic/tactical, they can change a simple configuration file.
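For example, the separation might be captured in a small plain-text file such as the one sketched below; the format shown is a hypothetical illustration (not SEESAW's actual syntax), with the groupings taken from the last two columns of \fig{emsf2}:
{\scriptsize\begin{verbatim}
# strategic vs. tactical project options (hypothetical format)
strategic: prec pmat acap pcap pcon aexp pexp ltex site auto execTest peer
tactical:  flex resl team tool sced data cplx ruse docu stor auto execTest peer
\end{verbatim}}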





This study uses
three USC models:
\be
\item
The COQUALMO software {\em defect} predictor~\cite[p254-268]{boehm00b}.
COQUALMO models defect introduction and defect removal
in  requirements, design, and coding.  
\item
The COCOMO {\em effort} and development {\em time} predictor~\cite[p29-57]{boehm00b}.  
COCOMO assumes that
effort depends exponentially on some {\em scale factors} and
linearly to some {\em effort multipliers}.  COCOMO
estimates development work months (total) and calendar months (elapsed)
and includes all coding,
debugging, and management activities.  \item
The THREAT model~\cite[p284-291]{boehm00b} 
contains a
large set of two-dimensional tables like \fig{madeg} representing pairs of variable
settings that are problematic. For example, using the $rely$ vs $sced$
table, the THREAT model would raise an alert if our tool decides to
build a system with high $rely$ (required reliability) and low $sced$
(schedule available to the development).
\ee

When searching for good subsets of the project options,
our AI search engine tries to minimize the outputs of these above models,
combined as follows:
\begin{equation}\label{eq:score}
combined = \sqrt{Effort^2 + Time^2 + Threats^2 + Defects^2} 
\end{equation}
This is  
the Euclidean distance to minimum values for all these predictions.
Note one small technical detail: in order that one measure does not dominate
over the others,
we normalize all predictions  to the range 0..100.
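The following Python sketch shows this scoring step; the normalization bounds are assumed stand-ins for the observed minimum and maximum of each prediction:
{\scriptsize\begin{verbatim}
import math

def normalize(x, lo, hi):
    # map a raw prediction onto 0..100, given observed bounds
    return 100.0 * (x - lo) / (hi - lo)

def combined(effort, time, threats, defects, bounds):
    # Euclidean distance to the minimum of all normalized predictions
    raw = {"effort": effort, "time": time,
           "threats": threats, "defects": defects}
    return math.sqrt(sum(normalize(v, *bounds[k]) ** 2
                         for k, v in raw.items()))

# bounds would come from Monte Carlo sampling of the models;
# the numbers below are assumed, for illustration only
bounds = {"effort": (0, 5000), "time": (0, 60),
          "threats": (0, 100), "defects": (0, 10000)}
print(combined(1200, 22, 10, 800, bounds))
\end{verbatim}}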






\subsection{ $P$: The Project Options}\label{sec:p}

\fig{cases} summarizes four NASA case studies using the project options of \fig{emsf2}:
\bi
\item ``OSP'' is the GNC (guidance, navigation, and control)
component of NASA's 1990s {\em Orbital Space Plane}; 
\item ``OSP2'' is a later version of OSP;
\item ``Flight'' and ``ground'' show typical ranges for flight and ground software at NASA's Jet Propulsion Laboratory. 
\ei

\begin{figure}
\begin{center}
\scriptsize
\begin{minipage}{.45\linewidth}\begin{tabular}{c|lrr|lr}
       &\multicolumn{3}{c|}{float}      &\multicolumn{2}{c}{fixed}\\\hline
project&variable&low&high&variable&setting\\\hline
&prec&1&2&data&3\\
OSP&flex&2&5&pvol&2\\
&resl&1&3&rely&5\\
&team&2&3&pcap&3\\
&pmat&1&4&plex&3\\
&stor&3&5&site&3\\
&ruse&2&4&&\\
&docu&2&4&&\\
&acap&2&3&&\\
&pcon&2&3&&\\
&apex&2&3&&\\
&ltex&2&4&&\\
&tool&2&3&&\\
&sced&1&3&&\\
&cplx&5&6\\
&KSLOC&75&125\\
\hline
&prec&3&5&flex&3\\
OSP2 &pmat&4&5&resl&4\\
&docu&3&4&team&3\\
&ltex&2&5&time&3\\
&sced&2&4&stor&3\\
&KSLOC&75&125&data&4\\
&&&&pvol&3\\
&&&&ruse&4\\
&&&&rely&5\\
&&&&acap&4\\
&&&&pcap&3\\
&&&&pcon&3\\
&&&&apex&4\\
&&&&plex&4\\
&&&&tool&5\\
&&&&cplx&4\\
&&&&site&6
\end{tabular}
\end{minipage}~~~~~~~~\begin{minipage}{.45\linewidth}
\begin{tabular}{c|lrr|lr}
       &\multicolumn{3}{c|}{float}      &\multicolumn{2}{c}{fixed}\\\hline
project&variable&low&high&variable&setting\\\hline
&rely&3&5&tool&2\\
 &data&2&3&sced&3\\
flight&cplx&3&6&&\\
&time&3&4&&\\
&stor&3&4&&\\
&acap&3&5&&\\
&apex&2&5&&\\
&pcap&3&5&&\\
&plex&1&4&&\\
&ltex&1&4&&\\
&pmat&2&3&&\\
&KSLOC&7&418&&\\
\hline
&rely&1&4&tool&2\\
 &data&2&3&sced&3\\
ground&cplx&1&4&&\\
&time&3&4&&\\
&stor&3&4&&\\
&acap&3&5&&\\
&apex&2&5&&\\
&pcap&3&5&&\\
&plex&1&4&&\\
&ltex&1&4&&\\
&pmat&2&3&&\\
&KSLOC&11&392&&\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}\\
\multicolumn{5}{c}{~}
\end{tabular}
\end{minipage}
\end{center}
\caption{Four case studies define a space of project options $P$.}\label{fig:cases}
\end{figure}
Inside our model,
project options typically range from
1 to 5 where ``3'' is the nominal
value that offers no change to the default estimate. 
Some of the project options in \fig{cases}
are known precisely (see all the options with single {\em values}).
But  many of the features in \fig{cases} do not
have precise values
(see all the features that {\em range} from some {\em low} to {\em high}
value).  

Sometimes the ranges of options
are very narrow (e.g., the process maturity of JPL ground software is between 2
and 3), and  sometimes the ranges are very broad.
\fig{cases} does not mention all the features listed in \fig{emsf2}.
For example, our defect predictor has inputs for
use of {\em automated analysis}, {\em peer
reviews}, and {\em execution-based testing tools}. During SEESAW's search,
for all project options
not mentioned in \fig{cases}, values are picked at random from the full
range of \fig{emsf2}.
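A minimal sketch of how one candidate project is drawn from these ranges appears below; the variable names follow \fig{cases}, but the data structures and the default range are illustrative assumptions rather than SEESAW's actual code:
{\scriptsize\begin{verbatim}
import random

# a few of the ranges for the JPL "ground" case study; options not
# constrained by a case study fall back to an assumed full range
GROUND = {"rely": (1, 4), "data": (2, 3), "cplx": (1, 4),
          "pmat": (2, 3), "tool": (2, 2), "sced": (3, 3)}
FULL_RANGE = (1, 5)  # assumed default; some options go higher

def draw_project(case, all_options):
    # pick one value per option: from the case-study range if given,
    # otherwise from the full range of the model
    return {opt: random.randint(*case.get(opt, FULL_RANGE))
            for opt in all_options}

print(draw_project(GROUND, ["rely", "data", "cplx", "pmat",
                            "tool", "sced", "acap", "peer"]))
\end{verbatim}}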


\subsection{$T$ : the Tuning Options}\label{sec:t}
Many of our project options have a linear relationship
to the output. Such linear relations form the
line
$y=mx+b$ with slope ``$m$''
passing through point $x=3,y=1$; i.e., at the nominal value of ``3'',
there are no changes to the effort estimate.  
Such a line has a y-intercept
of $b=1 - 3m$. Substituting this value of $b$ into $y=mx+b$ yields:
\begin{equation}\label{eq:bbbb}\footnotesize\begin{array}{l}
estimate = m(x-3) + 1
\end{array}\end{equation}
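For example, under an assumed slope of $m=-0.1$ (a value within the effort-decreasing ranges listed in \fig{linear}, below), moving a project option to its highest setting $x=5$ yields
\[
estimate = -0.1 \cdot (5-3) + 1 = 0.8,
\]
i.e., roughly a 20\% effort reduction relative to the nominal setting.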
Over the 
history of the COCOMO project, 
it has been observed that all the linear parameters that increase/decrease
effort have the slopes of \fig{linear}a.
Similarly, the linear relations in the
COQUALMO defect model
fall within the very narrow slope ranges of \fig{linear}b.

Like COCOMO, COQUALMO also includes scale factors that affect
the estimates exponentially. These scale factors
hinge about
the origin and have the slopes of \fig{linear}c.

\begin{figure}
\begin{center}
\footnotesize
\begin{tabular}{c|c}
 increasing effort & decreasing effort\\\hline
$0.073 \le m \le 0.21$&$ -0.178 \le m \le -0.078$
\end{tabular}

~\\
\fig{linear}a: tuning options in COCOMO

~\\

\begin{tabular}{l|c|c}
phase & increasing defects & decreasing defects\\\hline
requirements&
$0 \le m \le 0.112$ & 
$-0.183 \le m \le -0.035$\\
design &
$0 \le m \le 0.14$&
$-0.208 \le m \le -0.048$\\
coding & 
$0 \le m\le  0.14$ &
$-0.19 \le m \le -0.053$\\
\end{tabular} 

~\\
\fig{linear}b: tuning options in COQUALMO effort multipliers

~\\

\begin{tabular}{l|c}
 phase & defect removal\\\hline
 requirements& $0.08 	\le m \le 0.14$ \\
design	& $0.1 	\le m \le 0.156$ \\
 coding & $0.11 	\le m \le 0.176$ \\
\end{tabular}

~\\ 
\fig{linear}c: tuning options in COQUALMO scale factors
\end{center}
\caption{Linear relations in our models}\label{fig:linear}
\end{figure}


Note also that COCOMO includes two other tuning parameters: the $\{a,b\}$ ``calibration
parameters'' that adjust the linear and exponential effects; see \eq{aaaa},
discussed below.

Sampling across \eq{bbbb} 
is a simple matter of picking random $m$ values from \fig{linear}.
Similarly, it is possible to sample
the space of $\{a,b\}$ values by  selecting at random from their known ranges
(see \fig{dan}, below).
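A sketch of that tuning sampler follows; the slope bounds are copied from \fig{linear}a, while the $(a,b)$ bounds are assumed stand-ins for the spread seen in \fig{dan}:
{\scriptsize\begin{verbatim}
import random

# slope ranges for the COCOMO effort multipliers
INCREASE = (0.073, 0.21)
DECREASE = (-0.178, -0.078)

def sample_slope(kind):
    # pick one slope m at random from its legal range
    lo, hi = INCREASE if kind == "increase" else DECREASE
    return random.uniform(lo, hi)

def sample_ab(a_range=(2.2, 9.2), b_range=(0.88, 1.09)):
    # pick one (a, b) calibration pair; these bounds are assumed
    # stand-ins for the observed spread of (a, b) values
    return random.uniform(*a_range), random.uniform(*b_range)

print(sample_slope("decrease"), sample_ab())
\end{verbatim}}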

To sample across the space of THREAT tunings, another mechanism is required.
Tables like
\fig{madeg} can be represented as
an exponentially decaying function that peaks in one corner of the risk table
at a value between two and four. 
Since this model is heuristic in nature, the exact height of the peak is
not certain.
When we perform tuning sampling over THREAT, we vary the height of
the peak by a random factor $0.5 \le x \le 1$ if the peak is four, and $0.5 \le x \le 1.5$ if the peak is two.
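Purely as an illustration of this sampling step, the sketch below encodes a risk table as an assumed exponential decay away from its peak corner and perturbs the peak height as just described (the decay function itself is an assumption, not the THREAT model's exact form):
{\scriptsize\begin{verbatim}
import random

def threat_table(peak, size=5):
    # assumed form: exponential decay away from the table's
    # top-right corner, after perturbing the peak height
    factor = (random.uniform(0.5, 1.0) if peak == 4
              else random.uniform(0.5, 1.5))
    height = peak * factor
    return [[height * 0.5 ** (row + (size - 1 - col))
             for col in range(size)]
            for row in range(size)]

for row in threat_table(peak=2):
    print([round(cell, 2) for cell in row])
\end{verbatim}}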
 
\begin{figure}
{\scriptsize
\begin{center}
\begin{tabular}{|p{0.8in}|c|c|c|c|c|}\cline{2-6}
\multicolumn{1}{c|}{} &rely=&rely=&rely=&rely=&rely=\\
\multicolumn{1}{c|}{}& very & low &nominal & high & very\\
\multicolumn{1}{c|}{}&  low&  & &  &  high\\\hline sced= very
low& 0 & 0 & 0  &1  &2\\\hline sced= low     & 0 &0  & 0 &
0&1\\\hline sced= nominal& 0 & 0 & 0  &0  &0\\\hline sced= high&
0& 0 & 0  & 0 &0\\\hline sced= very high& 0 & 0 & 0  &0 &0\\\hline
\end{tabular}
\end{center}
} \caption{An example risk table}\label{fig:madeg}
\end{figure}

\subsection{Changing Project Options $P$ with SEESAW}

SEESAW seeks  the smallest subset of the project options $P' \subseteq P$
that most improves model estimates  of \eq{score}.
To do this, it sets one variable at a time (selected at random) using a greedy stochastic
search (see the algorithm described in the appendix).
The resulting estimates (combined using  \eq{score}) are then logged. At the end, SEESAW
reviews how setting the first, second, third (etc.) variables
changed the resulting predictions. SEESAW recommends all the settings up to 
the point where the resulting estimates are minimized.
That is, SEESAW can be viewed as {\em either} a tool for selecting the best
$P' \subseteq P$ {\em or} 
as a tool for pruning away irrelevant project options (those not found in $P'$).


Note that:
\bi
\item
One run of SEESAW selects at random from the tuning options (see
\fig{linear}) and the project options
(as defined by \fig{cases}).  
\item SEESAW  constrains project
options $P$ and not the tuning options $T$.  \ei
\subsection{Changing Tuning Options $T$ with ``LC'' }

At the end of this paper is an experiment comparing the effects of
generating estimates after reducing project  options $P$ with SEESAW or
reducing tuning options $T$.
This section
describes ``LC'', the standard regression procedure used by the
COCOMO community since 1981.  LC changes the tuning parameters $T$
inside the COCOMO model using local data.  

LC assumes that some  matrix $D_{i,j}$ 
holds:
\bi
\item
The natural log of the
$LOC$ (lines of code) estimates;
\item
The natural log of  the actual efforts
for each project $j$;
\item
The  natural logarithm of the cost drivers (the scale factors
and effort multipliers) at locations $1\le i \le 15$ (for COCOMO-I)
or $1 \le i \le 22$ (for COCOMO-II).
\ei
With those assumptions, Boehm~\cite{boehm81} shows that
for COCOMO-I,
the following calculation yields estimates
for ``$a$'' and ``$b$'' that minimize the sum of the squares of residual
errors:
\begin{equation}\label{eq:ab}{\scriptsize
\left.
\begin{array}{rcl}
EAF_i& =& \sum_j^N D_{i,j}\\
a_0  & =& t \\
a_1  & =& \sum_i^t KLOC_i\\
a_2  & =& \sum_i^t (KLOC_i)^2\\
d_0  & =& \sum_i^t \left(actual_i - EAF_i\right)\\
d_1  & =& \sum_i^t \left((actual_i - EAF_i)*KLOC_i\right)\\
b  & =& (a_0d_1 - a_1d_0)/(a_0a_2 - a_1^2)\\
a_3    & =& (a_2d_0 - a_1d_1)/(a_0a_2 - a_1^2)\\
a    & =& e^{a_3}
\end{array}\right\}}
\end{equation}
Note that  LC
is the {\em opposite} of SEESAW in that the former makes
no
comment on the project options. Rather, LC just proposes constraints
to two tuning variables $(a,b)$.  

In the case of COCOMO-I~\cite{boehm81} these $a,b$ values are used in the following equation to generate
effort estimates. In this equation, $EM_i$ are the effort multipliers from \fig{emsf2}:
\begin{equation}\label{eq:aaaa}\scriptsize 
effort = a \cdot KSLOC ^ b \cdot \prod^{15}_i EM_i
\end{equation}
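The following sketch implements \eq{ab} and \eq{aaaa} directly in Python; the toy data at the end is invented purely to exercise the code and is not from any NASA data set:
{\scriptsize\begin{verbatim}
import math

def lc(projects):
    # Boehm's LC calibration: given (KLOC, actual effort, effort
    # multipliers) per project, return (a, b) minimizing the sum of
    # squared residual errors in log space
    t      = len(projects)
    kloc   = [math.log(k) for k, _, _ in projects]
    actual = [math.log(e) for _, e, _ in projects]
    eaf    = [sum(math.log(m) for m in ems) for _, _, ems in projects]
    a0, a1, a2 = t, sum(kloc), sum(x * x for x in kloc)
    d0 = sum(y - z for y, z in zip(actual, eaf))
    d1 = sum((y - z) * x for x, y, z in zip(kloc, actual, eaf))
    b  = (a0 * d1 - a1 * d0) / (a0 * a2 - a1 ** 2)
    a  = math.exp((a2 * d0 - a1 * d1) / (a0 * a2 - a1 ** 2))
    return a, b

def effort(a, b, ksloc, ems):
    # COCOMO-I: effort = a * KSLOC^b * product of effort multipliers
    prod = 1.0
    for m in ems:
        prod *= m
    return a * ksloc ** b * prod

# toy data: (KSLOC, actual effort, effort multipliers); invented values
data = [(10, 40, [1.0, 1.1]), (50, 300, [0.9, 1.2]), (120, 900, [1.0, 1.0])]
a, b = lc(data)
print(a, b, effort(a, b, 75, [1.0, 1.1]))
\end{verbatim}}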
Even after applying LC, the tuning variance can still be quite large.
The left-hand-side of
\fig{dan}  shows the COCOMO-I $(a,b)$ values learned by   Baker~\cite{baker07}
after, 300 times, extracting 10 projects at random from COCOMO data sets,
then applying \eq{ab} to the remaining data.  
The data sets used in this study contained 93 projects, so LC was applied
to  $\frac{93-10}{93}=89\%$ of the data.
A pre-experimental intuition was that this would be more than enough
data to yield stable $(a,b)$ values. As can be seen from the wide
variance of the $(a,b)$ values in \fig{dan},
this was not the case.
\begin{figure}[!h]
\begin{center}
\includegraphics[width=2.4in]{nasa93_300.pdf}\includegraphics[width=2.4in]{comparesReport_lc.pdf}
\end{center}
\caption{Results of
applying
LC numerous times to 90\%
of the NASA93 data sets (available from \protect\url{http://promisedata.org/data}).
Left-hand-side shows computed $(a,b)$ values. Right-hand-side shows the MRE1 values
 generated
over the NASA93 data set for 
ten case studies (one study per line).
}\label{fig:dan}
\end{figure}

The right-hand-side of
\fig{dan} shows the magnitude of the relative error (or $MRE$)
values 
seen in Baker's study (MRE is a standard measure in the effort estimation field,
defined as
$MRE\%=100*\frac{abs(actual - predicted)}{actual}$). 
In the sequel, we will refer to Baker's results as MRE1. 

Some of the MRE1 errors
are very large (up to nearly 500\%),
suggesting that LC was incomplete or that the variance in the $(a,b)$ calculations  
has a significant impact on the estimates.
Note that this right-hand-side figure  is not without precedent in the estimation literature:
it is a well-established result that  initial development effort estimates may be incorrect
by a factor of four~\cite{boehm81} or even more~\cite{kemerer87}.


Elsewhere, we have been partially successful in
reducing the estimation variance
of \fig{dan}
using feature subset selection (FSS)~\cite{chen05,me06d}
or more data collection.
Unfortunately,
further
data collection is possible, but only at great organizational expense. 
Also, FSS reduces, but does not eliminate, the $(a,b)$ variance.  
Since we failed to reduce estimation variance by constraining the tuning variables,
we took another approach: 
we developed the SEESAW system to explore the effects of constraining just the 
project variables.


%% %% Alexei Lapouchnian, Yijun Yu, John Mylopoulos: Requirements-Driven Design and Configuration Management of Business Processes. BPM 2007: 246-261
%% %% exploring goal graphs

%% %% Yuan An, John Mylopoulos, Alexander Borgida: Building Semantic Mappings from Databases to Ontologies. AAAI 2006
%% %% golog framework

%SEESAW incrementally grows solutions
%from unconstrained (where all choices can take any value in $\{Low,High\}$)
%to fully constrained (where all choices are set to a single value).
%This is unlike
%simulated annealing or MaxWalkSat, which simultaneously offer settings to all choices
%at every step of their reasoning.
%\fig{run} shows a single run of SEESAW: each dot marks one selection  (lines 16,17,18 of \fig{MWF}).
%In our terminology, selecting  one value from a range  is a {\em decision} on all  
%values in the range. For example, if \fig{cases} includes a process maturity of (3,4,5)
%and SEESAW selects ``5'', then that is three decisions.
%(this is why the 29 dots of
%\fig{run} result in 100 decisions).
%Note how, as decisions are made, the score is minimized. Score minimization is desirable since
%our scores
%are calculated 
%from  a combination of project predictions that we want to reduce 
%(total effort, defects, development time). 
%\begin{figure}
%\centering
%\includegraphics[width=3in]{default.pdf}
%\caption{Single run of SEESAW, score normalized min..max to 0..1}
%\label{fig:run}
%\end{figure}
%
%Incremental decision making  is an important property of SEESAW.
%Observe how, in 
%\fig{run},
%50\% of the score reduction
%arises from around 15\% of the decisions.
%If a manager cannot implement all SEESAW's recommendations and a 50\% reduction
%is adequate,
%she might elect to  use just these top 15\% decisions.
%That is, SEESAW not only makes $R$ recommendations, it also reports on the value of
%just applying just some subset of $r \subseteq R$.
%
%
%In order to support incremental decision making, SEESAW uses
%a very large $P$ value to control its reasoning.   This
%$P$ value is used to delay ``lose-lose'' decisions until the
%later iterations of the algorithm.  Suppose our search has only constrained (say)
%$pcap=5$ so far.  Our next randomly chosen choice might be  $data$,
%so SEESAW tries $\{Low, High\}$ values of $data=2$ and
%$data=5$.  It is possible that both of these values will result in scores
%that are worse than the current score (using $pcap=5$ alone).
%This means that
%$data$ is not a valuable choice to control,
%and picking either of the $\{Low,High\}$ values would be a ``lose-lose'' decision.
%Using $P=0.95$,
%SEESAW will very likely (95\% of the time) leave $data$ unconstrained
%and pick another, potentially better,  choice to constrain on the next iteration.  The algorithm
%will eventually come back to $data$, but if SEESAW delays making
%that lose-lose decision, it has a better chance of making good
%decisions early in the search.
%
%.
\section{Experiments}

\subsection{Goals}
In terms of trusting SEESAW, the key question is how much SEESAW's estimates differ from those generated by conventional methods such as LC. Formally, this can be expressed as two hypotheses:
\bi
\item[H0:] 
The effort estimates generated after constraining the tuning options $T$
are different from those generated after {\em not} constraining them.
\item[H1:] 
The effort estimates
generated by constraining just the project options $P$ are similar to those seen after constraining
just the
tuning options $T$.
\ei
Having stated our goal formally in this way, we  add that the
following conclusions will not be based solely on  statistical
significance tests of H0 vs H1.  Cohen~\cite{cohen88} is scathing
about such tests when he writes that  the significance hypothesis
testing of H0 vs H1 is a ``potent but sterile intellectual rake who
leaves ... no
viable scientific offspring''. In support of Cohen's thesis, we
offer the following salutary lesson. Writing in the field of
marketing, Armstrong~\cite{armstrong05} reviews one study that,
using significance testing, concludes that  estimates generated
from multiple sources do no better than those generated from a
single source.  He then demolishes this conclusion by listing 31
studies where multiple source prediction consistently
 out-performs single source prediction by 3.4\% to 23.4\%
 (average=12.5\%). These improvements, in every
study surveyed by Armstrong, are the exact opposite of what would
be predicted from the significance test results.

Accordingly, in the following, we will use simple visualizations
to assess H0 vs H1 and demote significance tests to the role of a
``reasonableness test'' for the conclusions drawn from the
visualizations.

\subsection{Method}

\begin{figure}[!h]
\begin{center}
\includegraphics[width=3.5in]{experiment.png}
\end{center}
\caption{Experimental design. ``LC'' denotes Boehm's 1981 regression procedure.}
\label{fig:design}
\end{figure}
The experimental design described below (and illustrated in \fig{design}) lets us reflect over two distributions:
\bi
\item MRE1: the difference between predicted and actual effort when the tuning options $T$ 
were changed by LC;
\item MRE2: the difference between the predictions generated by LC and those generated by SEESAW.
\ei
Note that LC
changes only the tuning options $T$ while SEESAW changes project options $P$.
Hence, by comparing the two
distributions, we can assess H0 vs H1.

Our experiment executes as follows:
\be
\item 5*2=10 different case studies were created:
\bi \item Two kinds of control policies were explored; i.e., the strategic and tactical choices marked
in \fig{emsf2}.
\item Five kinds of projects were created; i.e., the four projects
from \fig{cases} and one using an imaginary
project whose project choices included the entire range of all COCOMO variables.
\ei
\item
For these 2*5=10 case studies, we ran SEESAW to find the constraints that led
to minimum effort, threats, defects, and development time. 
\item
From each of the 10 sets of constraints,
we generated projects consistent with those constraints.
\item
Effort estimates were then added to the above randomly generated projects.
Each estimate was the median estimate value seen in 1000 simulations  
% SCW: The data I generated for Ous had 1000 iterations of SEESAW for each case study.  Is that what this number is referring to?
% TM: yes
 of SEESAW, for a particular set of project constraints.
That is, these estimates were generated from constrained project choices but unconstrained tuning variables.
\item
SEESAW's predicted effort estimates were then compared to those generated by conventional means;
i.e., LC learning on some historical NASA data\footnote{NASA93, available for download at
\url{http://promisedata.org/repository/data/nasa93/nasa93.arff}.}
then applied to the projects generated from the constraints
found by SEESAW.
\item The delta in SEESAW's and LC's estimates was computed using
$\Delta=100*\frac{abs(SEESAW - LC)}{SEESAW}$.  
\item
Steps 4, 5, and 6 were repeated 20 times to generate the set ``MRE2'' (a sketch of this loop appears after this list).
\ee
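Below is a minimal sketch of that loop; {\tt generate\_project}, {\tt seesaw\_estimate}, and {\tt lc\_estimate} are hypothetical placeholders standing in for our actual implementations:
{\scriptsize\begin{verbatim}
import random, statistics

# hypothetical stand-ins for the real models (placeholders only)
def generate_project(constraints):                   # step 3
    return {k: random.randint(lo, hi) for k, (lo, hi) in constraints.items()}

def seesaw_estimate(project):                        # one SEESAW simulation
    return random.uniform(800, 1200)                 # placeholder value

def lc_estimate(project):                            # LC-calibrated estimate
    return random.uniform(700, 1100)                 # placeholder value

def mre2(constraints, repeats=20, simulations=1000):
    # repeat the generate/estimate/compare loop to build MRE2
    out = []
    for _ in range(repeats):
        project = generate_project(constraints)
        s = statistics.median(seesaw_estimate(project)
                              for _ in range(simulations))
        l = lc_estimate(project)
        out.append(100.0 * abs(s - l) / s)           # the delta of step 6
    return out

print(statistics.median(mre2({"acap": (3, 5), "pmat": (2, 3)})))
\end{verbatim}}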

\subsection{Results}

\begin{figure}
\begin{center}
\includegraphics[width=2.4in]{comparesReport_lc.pdf}\includegraphics[width=2.4in]{comparesReport_star.pdf}
\end{center}
\caption{MRE2 results for \protect\fig{design}, for
ten case studies (one study per line).
The MRE1 results (left-hand-side) come from \protect\fig{dan}.
}\label{fig:mre2}
\end{figure}

\fig{mre2} shows the MRE values seen in 10 case studies (one per line).
The left-hand-side shows the sorted MRE1 results from LC and has median values
\[
median(MRE)= \{25,26,27,28,29,30,31,35,37,38\}\%
\]
The right-hand-side shows the sorted MRE2 results and has median values
\[
median(\Delta)= \{20,20,21,22,23,23,23,23,24,25\}\%
\]
Three features of \fig{mre2} are noteworthy:
\bi
\item 
The SEESAW errors (in MRE2) are never greater than the LC errors (in MRE1).
\item
SEESAW's MRE2 errors are small
(within 25\% or less of the LC results). This is a surprising result, given that
SEESAW samples effort from the unconstrained space of possible tunings.
The only explanation for this effect is that, within the USC models, the project
options $P$ dominate the 
tuning options $T$.
The project choices found by SEESAW forced this process model
into a narrow space of behaviors. In this narrow space, the impact of the tuning variance
becomes unimportant.
\item
The median differences between SEESAW and LC, as shown in MRE2, are quite small when compared
to the range of values seen with just LC (MRE1). 
These deltas are very small  when
 superimposed over the MRE1 curve (which can 
range to over
400\%).
\ei
That is, a visual inspection of these results favors H1.

\subsection{Significance Tests}
For the sake of completeness, we performed a statistical significance test on the MRE1 and MRE2 distributions. 
Recalling Cohen and Armstrong (discussed above), we believe that such significance tests should be a secondary,
not primary, evaluation criteria.
For example, we use them
as a reasonableness check on conclusions gathered via intuitive visual means.
\bi
\item Since we are comparing distributions, paired tests were deemed  inappropriate. 
\item Since \fig{dan} and \fig{mre2} have a small number of large outliers, tests that make a Gaussian assumption
were also deemed inappropriate.
\item
Therefore, we compared the left and right sides
of \fig{mre2}. 
\item
In this case, Mann-Whitney tests did not contradict our visual impressions.
At the 99\% confidence level,
the distributions were different in only a minority of cases (four out of ten).
\ei
\subsection{Summary of Results}
In summary, the visualizations  of \fig{mre2} make us endorse H1 (and
statistical significance tests do not challenge that endorsement).
Hence, we say that
the estimates generated after constraining project
options $P$  (using SEESAW) are about the same as those
generated after constraining tuning options $T$ (using LC). 


%\section{Discussion}
%How are we to  explain the remarkable effectiveness of SEESAW in
%managing uncertainty?
%Researchers in planning and theorem proving have
%recently shown that as model complexity grows, other constraining
%effects may appear such as ``master variables''; i.e. a small number
%of settings that control all other
%settings~\cite{crawford94,williams03}. Such master variables can
%greatly reduce the search space within large models.  
%
% We hypothesize that software process models also contain master variables;
%i.e. much 
% much of uncertainty in a model is due to the influence of a 
%small subset
%of model variables. If so, then after (a)~ranking variables
%by their ability to constrain the output; and
%(b)~applying a small number of the top-ranked variables;
%then 
%it should be possible to (c)~make
%stable predictions
%in the constrained space.

\section{Related Work}
To the best of our knowledge, this work is the first to try controlling
 the $project$ options while leaving the $tuning$ options unconstrained. 
 Much of the related work on uncertainty in software engineering uses a Bayesian analysis.
For example,
Pendharkar et al.~\cite{pendharkar05} demonstrate the utility of Bayes networks in effort
estimation while Fenton and Neil explore Bayes nets and defect prediction~\cite{fenton99,fenton08}
(but unlike this paper,
neither of these teams links defect models to effort models).
We elect to take a non-Bayesian approach since most of the industrial and government
contractors we work with use parametric models like
COCOMO. 

Even in the field
 of {\em search-based software engineering} (SBSE), we have not seen anything like
 this study. 
Typically, SBSE hunts
for near
optimal solutions to complex and over-constrained
software engineering problems. 
This  approach has been applied to many
problems in software engineering (e.g., requirements engineering~\cite{jalali08b})
 but most often in the field of software testing~\cite{andrews07}.
A recent review of 123 search-based SE
 papers~\cite{rela04} showed that much of that work relates to testing
 (e.g., simulated annealing  to minimize test suites for regression
 testing) while only a handful of those papers related to the kinds of early
 project process planning discussed here.  For example, Aguilar-Ruiz
 et al.~\cite{aguilarevolutionary} and Alvarez
 et al.~\cite{alvarez-data} apply search-based methods for effort
 estimation.  
One facet that distinguishes SEESAW from other methods is
 that we are searching over more than just the
 effort models explored by the 
 Aguilar-Ruiz \& Alvarez teams. 


The SBSE literature initially inspired us to try simulated annealing to search the what-ifs in untuned COCOMO
models~\cite{me07f}. However, we found that SEESAW ran much faster and produced results with far less
variance than simulated annealing.

As to our own work, 
for years, we have struggled with the data drought
problem and have recommended elaborate feature subset selection
methods to prune uninformative data~\cite{chen05,me06d}. Here, we take
a radically different approach and explore the value of models whose internal
tunings have not been constrained with local data.

\section{Conclusion}

We are concerned with using models in domains suffering from a  data drought.
If  a particular domain is data rich, the techniques described in this paper are
not required. Otherwise, AI search methods like SEESAW can be used to
reuse models from other sites {\em without} requiring local data.

To make that case, this report
showed ten case studies comparing the deltas
between ranges of estimates generated by SEESAW (without constraining
the tuning options) and those found
by traditional estimation methods (that constrain the tuning
options). The distributions of the two estimates were
found to be similar.
From a technical perspective, this means that  if estimation
variance arises from a tuning variance $T$ and project variance
$P$, then  there exist models such as COCOMO / COQUALMO where $P \gg T$;
i.e., project choices
dominate tuning options.  

When $P \gg T$,
the estimates found by constraining project choices
will be close to estimates found via tuning on historical data.
For such models:
\bi
\item
Using local data to reduce tuning variance is optional;
\item 
It is possible to compute accurate estimates without local data;
\item
Decision making need not wait for detailed local data collection.
\ei

\bibliographystyle{plain}
\bibliography{refs}
%% \newpage
%% \appendix
%% \section{Simulated Annealing}
%% A {\em Monte Carlo} algorithms randomly sample the space
%% of possible controllable model states. 
%% A  {\em Metropolis} Monte Carlo algorithm~\cite{metropolis53}
%% creates new states
%% by  small mutations to some
%% {\em current} state. 
%% If a new state is ``better'' (as assessed via an
%% {\em energy function}),  it becomes the new {\em current}
%% state used  for future
%% mutations.  Otherwise, a Boltzmann acceptance
%% criteria is used to probabilistically decide to
%% assess the new state: the worse the new state, the less
%% likely that it becomes the new current state.
%% The algorithm is silent on the mutation mechanism.
%% In our implementation of this algorithm, we only randomized $\frac{1}{3}$ of the
%% attributes being sampled during each iteration of the algorithm. Note that
%% the attributes being randomized are different for consecutive iterations.

%% In 1983, Kirkpatrick et.el.~\cite{kirkpatrick83} 
%% proposed a modification that was
%% inspired
%% by a procedure used to make the strongest possible glass. Initially,
%% glass is heated to allow atoms to move freely. The temperature
%% is then slowly lowered such that the atoms can find the stablest
%% configuration with lowest energy. 
%% A {\em simulated annealing} (SA) algorithm adds
%% a ``temperature'' variable to the Boltzmann accept criteria
%% such that, at high temperatures, it is more likely that the
%% algorithm will jump to a  new worst current state.
%% This allows the algorithm to jump out of local minima while
%% sampling the space of options. As the temperature cools,
%% such jumps become less likely and the algorithm reverts
%% to a simple hill climber.

%% Formally, the acceptance criteria $P$ for a new state is defined using
%% the current temperature $T$, 
%% the energy of the current solution ($e$),
%% and the energy of the new mutation ($en$):
%% \begin{equation}\small
%% P(e,en,T) = e^{(e -en)/T}
%% \end{equation}
%% $T$ is defined to $decrease$ as the simulator loops through $k= 1\ldots kmax$ iterations.
%% We use
%% $T = e^{-100*k/kmax}$, where $kmax$ used in our implementation is 1000, which is a number of iterations determined to produce stable solutions in our implementation of the algorithm and for our intents and purposes.

%% The following pseudo-code {\tt sample} describes our simulated
%% annealer. 
%% A new solution $sn$ (with new energy $en$) replaces the current solution                                  
%% if (a)~it has a lower energy or (b)~the acceptance predicate $P$
%% endorses it. Only in the case of (a)
%% should the new solution
%% replaces the current best solution.

%% {\scriptsize\begin{verbatim}
%% function sample 
%%   s := s0; e := E(s)                       // Init state, energy.
%%   for t in 1 to tmax                       // temperature control
%%   do   sn = randomly change 1/3 of 's'     // Pick neighbor.
%%        en := E(sn)                         // Compute energy.
%%        for x in sn   
%%             out[x] += en                   // record project settings
%%        if P(e,en,temp(t/tmax)) > random()  // is it better?
%%        then 
%%              s := sn;                      // new solution
%%              e := en                       // new best energy
%%   done                                		 
%%   return out, sorted ascending  by en
%% \end{verbatim}}

%% The {\tt sample} function returns an array whose keys are the $Project$
%% settings and the values are the sum of the energies seen when that
%% setting was used. This array is sorted in increasing order so
%% the first entries were associated with lower (and better) energies while
%% the later entries are associated with higher (and less desirable) energies.

%% The {\tt rank} function  takes this array and experiments with 
%% using just the first $I$ items:

%% {\scriptsize\begin{verbatim}
%% function rank(order)
%%    for I in order
%%    do  cache=();                           // clear old results
%%        for n = 1 to 1000
%%        do  sn = random settings            // pick anything at all
%%       	    for j = 1 to I                  // insert the first I things
%%                sn[I] = order[I]
%%            cache[n]= E(sn)                 // score it
%%        done
%%        sort cache                          // sort
%%        print  cache[500]                   // print 50% (median)
%%        print  (cache[750] - cache[500])    // print 75% (spread)
%%    done
%% \end{verbatim}}
%% By plotting the {\tt print} statements in {\tt rank}, \fig{u} is generated.

\section*{Appendix: Inside SEESAW}
Recall our formalization of the model-based management task from the introduction:
find the smallest subset of the project options $P' \subseteq P$
that most improves model estimates. This section describes the SEESAW algorithm and its
methods for reducing the model estimates (as computed by \eq{score}).

SEESAW was designed after observing experimentally
that
the most interesting part of the project options $P$
was generally the minimum and maximum values for each variable.  
For example, if the project options of \fig{cases} say that analyst
capability ranges from $low$ to {\em very high}, we kept finding that the best effects
were seen at either $low$ or {\em very high}, and nowhere in between.

The reason for this is simple:
all the functions in COCOMO / COQUALMO / THREAT are  monotonic, causing
the most dramatic effects to occur at the extreme ends of the ranges.
In fact,
SEESAW takes its name from the way earlier versions
of this  algorithm tended to seesaw between extreme values.
We have conducted experiments with other approaches that allow
intermediate values.
On comparison with the simulated annealing method used
in a prior publication~\cite{me07f}, we found that
seesawing between
$\{Low,High\}$ values was adequate for our purposes.

SEESAW is an adaptation
of Kautz \& Selman's MaxWalkSat local search procedure~\cite{kautz97}. 
Its pseudocode is presented in \fig{MWF}.
Each solution  is scored via a Monte Carlo  procedure (see {\tt score} in \fig{MWF}), and
SEESAW seeks to {\em minimize} that score.


\begin{figure}
{\scriptsize
\begin{alltt}
 1 {\bf function} run (AllRanges, ProjectConstraints) \{
 2   OutScore = -1
 3   Prob = 0.95
 4   Out = combine(AllRanges, ProjectConstraints)
 5   Options = all Out choices with ranges low < high
 6   {\bf while} Options \{
 7     X = any member of Options, picked at random
 8     \{Low, High\} = low, high ranges of X
 9     LowScore = score(X, Low)
10     HighScore = score(X, High)
11     {\bf if} LowScore < HighScore
12       {\bf then} Maybe = Low; MaybeScore = LowScore
13       {\bf else} Maybe = High; MaybeScore = HighScore
14     {\bf fi}
15     {\bf if} MaybeScore < OutScore {\bf or} Prob < rand()
16       {\bf then} delete all ranges of X except Maybe from Out
17         delete X from Options
18         OutScore = MaybeScore
19     {\bf fi} \}
20  {\bf return} backSelect(Out) \}
21
22 {\bf function} score(X, Value) \{
23   Temp = copy(Out)  ;; don't mess up the Out global 
24   from Temp, remove all ranges of X except Value 
25   run monte carlo on Temp for 100 simulations, scoring each run using \eq{score}.
26   {\bf return} median score of the 100  \} \end{alltt} }
\caption{Pseudocode for SEESAW.}
\label{fig:MWF}
\end{figure}


SEESAW
first combines the ranges for 
all the COCOMO project choices
with the known project constraints of \fig{cases}.
These constraints 
range from $Low$ to $High$ values.
If a case study does not mention a project choice, then there
are no constraints on that choice, and the $combine$ function (line 4)
returns the entire range of that choice. Otherwise, $combine$
returns only the values from $Low$ to $High$.

In the case where a choice is $fixed$ to a single
value, 
then $Low = High$. Since there is no decision to be made for this
choice, SEESAW ignores it. The algorithm
 explores only those
 choices with a range of $Options$ where
 $Low < High$ (line 5).
In each iteration of the algorithm, it is possible that one acceptable value
for a choice $X$ will be discovered. If so, the range for $X$
is reduced to that single
value, and the choice is not examined again (line 17).

SEESAW then prunes the final  recommendations (line 20).
The {\em backSelect} function removes the $N$
selections added last that do not significantly change the final score (t-tests, 95\% confidence). This culls
any final irrelevancies in the selections.

The $score$ function shown at the bottom of
\fig{MWF} calls COCOMO / COQUALMO / THREAT models 100 times\footnote{The value ``100'' was
set after experimentation showed that  this was big enough to reduce our variance and not
large enough to slow down the runtimes.},
each time selecting
random values for the project choices (from the  $Options$ set);
and
random values for the tuning options (as described 
in \fig{linear}).
The median value
of the \eq{score} values seen in these runs  is the $score$ for those project choices. As SEESAW executes,
the ranges in $Options$ are removed and replaced by single values (lines 16-17),
thus constraining the space of possible simulations.
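For readers who prefer running code to pseudocode, the sketch below is a compact Python rendering of \fig{MWF}. The {\tt model} argument is a placeholder for the combined score of \eq{score}, and the back-selection of line 20 is omitted; this is an illustration of the control flow, not our actual implementation:
{\scriptsize\begin{verbatim}
import random, statistics

def score(out, x, value, model, runs=100):
    # Monte Carlo score: fix option x at value, sample all other
    # options at random from their ranges, return the median score
    temp = dict(out)
    temp[x] = (value, value)
    return statistics.median(
        model({k: random.randint(lo, hi) for k, (lo, hi) in temp.items()})
        for _ in range(runs))

def seesaw(all_ranges, constraints, model, prob=0.95):
    # greedy stochastic search over the project options
    out = dict(all_ranges); out.update(constraints)       # line 4: combine
    options = [k for k, (lo, hi) in out.items() if lo < hi]
    out_score = -1                                         # line 2
    while options:
        x = random.choice(options)                         # line 7
        lo, hi = out[x]
        lo_s, hi_s = score(out, x, lo, model), score(out, x, hi, model)
        maybe, maybe_s = (lo, lo_s) if lo_s < hi_s else (hi, hi_s)
        if maybe_s < out_score or random.random() > prob:  # line 15
            out[x] = (maybe, maybe)                        # lines 16-18
            options.remove(x)
            out_score = maybe_s
    return out

# toy stand-in for the combined COCOMO/COQUALMO/THREAT score
toy = lambda settings: sum(settings.values())
print(seesaw({"acap": (1, 5), "pmat": (1, 5)}, {"sced": (3, 3)}, toy))
\end{verbatim}}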



SEESAW is a stochastic algorithm:
the selection of the next choice to explore is completely random (line 7).
We use this stochastic approach
 since much research from the 1990s showed the benefit of such search methods.
Not only can stochastic algorithms solve non-linear problems and escape from local minima/maxima, 
but they can also find solutions faster than complete search, and for larger problems~\cite{motwani95}.
For example, we have implemented a deterministic version of SEESAW that
replaces the random selection of {\em one} choice in line 7 with a search
through {\em all} choices for
the best $\{Low,High\}$ value.  That algorithm ran much slower (runtimes were 12 times greater)
with nearly identical results to those of the stochastic search. Crawford and Baker~\cite{crawford94} offer one explanation for the strange success of stochastic search.
For models
where the solutions are a small part of the total space, a complete search
wastes much time exploring uninformative areas of the problem. A stochastic search, 
on the other hand, does not get stuck in such  uninformative areas.

\end{document}
