% Driver script for the within-company (WC) vs. cross-company (CC)
% effort-estimation experiments.
%
% For every dataset listed in allMyDatasets, each of its subsets is used in
% turn as the WC dataset, while the concatenation of all subsets of the same
% dataset forms the CC dataset. Predictions come from treeK/secondGac, are
% repeated numRuns times in a leave-one-out manner, and win/tie/loss
% statistics are appended to LaTeX-formatted text files.
%
% Relies on project functions not defined in this file: myNormalizer,
% randomizeDataset, treeK, secondGac, findMyClassStatistics,
% winTieLossCalculatorMMRE, winTieLossCalculatorMdmre,
% winTieLossCalculatorPred25 and winTieLossCalculatorAr.

% before everything, do a clean-up of the workspace
clear;
clc;

% number of times the WC and CC experiments are repeated
numRuns = 20;

% define dataset names for different within-company like features
cocomoProjType     = {'cocomo81e','cocomo81o','cocomo81s'};
nasaCenterType     = {'nasa93_center_1','nasa93_center_2','nasa93_center_5'};
desharnaisLangType = {'desharnaisL1','desharnaisL2','desharnaisL3'};
chinaResource      = {'chinaResource1','chinaResource2','chinaResource3','chinaResource4'};
finnishAppArea     = {'finnishAppArea2','finnishAppArea7','finnishAppArea10'};
finnishAppType     = {'finnishAppType1','finnishAppType2345'};
kemererHardware    = {'kemererHardware1','kemererHardware23456'};
maxwellAppType     = {'maxwellAppType1','maxwellAppType2','maxwellAppType3'};
maxwellHardware    = {'maxwellHardware2','maxwellHardware3','maxwellHardware5'};
maxwellSource      = {'maxwellSource1','maxwellSource2'};

% combine above datasets into a single variable; each entry is the NAME of
% one of the cell arrays defined above.
% 'chinaResource' and 'finnishAppArea' are currently excluded.
allMyDatasets = {...
    'cocomoProjType',...
    'nasaCenterType',...
    'desharnaisLangType',...
    'finnishAppType',...
    'kemererHardware',...
    'maxwellAppType',...
    'maxwellHardware',...
    'maxwellSource'...
    };

% output files, in a fixed order that is relied upon further below:
%   1: number of instances selected from each center
%   2: all win,tie,loss and actual mar, mmre, mdmre, pred25 combined
%   3: MMRE   4: MdMRE   5: Pred(25)   6: MAR win, tie, loss values
outputFileNames = {...
    'resultsFile.txt',...
    'winTieLossFileAll.txt',...
    'winTieLossFileMMRE.txt',...
    'winTieLossFileMdMRE.txt',...
    'winTieLossFilePred25.txt',...
    'winTieLossFileMAR.txt'...
    };

% open and close every output file once in 'w' mode to truncate it
for fileCounter = 1:numel(outputFileNames)
    fid = fopen(outputFileNames{fileCounter}, 'w');
    if fid < 0
        error('Could not open %s for writing.', outputFileNames{fileCounter});
    end
    fclose(fid);
end

% start doing cross-company and within company experiments for
% all the datasets and their subsets
for datasetCounter = 1:size(allMyDatasets,2)

    % load subsets of the dataset: look up the cell array whose variable
    % name is stored in allMyDatasets (names are hard-coded above, so eval
    % is not fed external input here)
    currentDatasetName = allMyDatasets{datasetCounter};
    subsetNames = eval(currentDatasetName);

    % get subset size
    subsetSize = size(subsetNames,2);

    % do for each subset
    for subsetCounter = 1:subsetSize

        % define the within company and cross company datasets.
        % The WC subset is NOT removed from the CC indices (that removal is
        % intentionally disabled), so the CC pool contains every subset;
        % the column of the WC subset is flagged with {\G} in resultsFile.
        ccIndex = 1:subsetSize;

        wcFileName = subsetNames{subsetCounter};
        wcDataset = csvread([wcFileName '.csv']);   % read the wc dataset
        wcDataset = myNormalizer(wcDataset);
        numWcRows = size(wcDataset,1);

        % build the cc dataset and keep the normalized subsets around so
        % they need not be re-read from disk for every row of every run
        % (assumes myNormalizer returns the same result for the same input)
        numCcSubsets = size(ccIndex,2);
        ccDataset = [];
        ccDatasetSizes = zeros(1,numCcSubsets);
        ccSubsets = cell(1,numCcSubsets);
        % ccInstances{k}(run,row): how many instances of CC subset k were
        % found in the prediction zone; -1 means "not filled yet"
        ccInstances = cell(1,numCcSubsets);
        for tmp1 = 1:numCcSubsets
            tmpDataset = csvread([subsetNames{ccIndex(tmp1)} '.csv']);
            ccDatasetSizes(tmp1) = size(tmpDataset,1);
            tmpDataset = myNormalizer(tmpDataset);
            ccSubsets{tmp1} = tmpDataset;
            ccDataset = [ccDataset; tmpDataset]; %#ok<AGROW>
            ccInstances{tmp1} = -1 * ones(numRuns,numWcRows);
        end

        % size of the prediction zone returned by secondGac for CC
        predictionZoneSizeCC = -1 * ones(numRuns,numWcRows);

        % below variables keep track of stuff related to wc experiments
        pred0WC    = -1 * ones(numRuns,numWcRows);
        predGac2WC = -1 * ones(numRuns,numWcRows);

        % below variables keep track of stuff related to cc experiments
        pred0CC    = -1 * ones(numRuns,numWcRows);
        predGac2CC = -1 * ones(numRuns,numWcRows);

        % randomize dataset
        wcDataset = randomizeDataset(wcDataset);
        ccDataset = randomizeDataset(ccDataset);

        % actual effort is the last column of the wc dataset; it does not
        % change between runs
        actualEffortValues = wcDataset(:,end);

        for runCounter = 1:numRuns  % do WC and CC experiments numRuns times

            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            %%%%%%%%%%%%%%%%%%%% below are predictions for within-company%%
            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            i = 0;
            while i < numWcRows
                % increment i by 1
                i = i + 1;

                % pick up the row
                myRow = wcDataset(i,:);

                % wcTrain is the within-company dataset minus the selected row
                wcTrain = wcDataset;
                wcTrain(i,:) = [];

                % build the GAC tree with treeK ...
                [pred0WC(runCounter,i), gac2TreeWC, gac2RootWC] = treeK(myRow,wcTrain);

                % ... and check the instance with the second tree
                [predGac2WC(runCounter,i), predictionZoneWC] = secondGac(gac2TreeWC, gac2RootWC, myRow); %#ok<NASGU>

                if predGac2WC(runCounter,i) == -1
                    % meaning there were not enough instances for gac2 tree:
                    % retry the same row.
                    % NOTE(review): this never terminates if treeK/secondGac
                    % are deterministic for a given input - confirm.
                    i = i-1;
                    continue;
                end
            end

            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            %%%%%%%%%%%%%%%%%%%% below are predictions for cross-company%%%
            %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
            i = 0;
            while i < numWcRows
                % increment i by 1
                i = i + 1;

                % pick up the row
                myRow = wcDataset(i,:);

                % ccTrain is the cross-company train dataset and it is used
                % in an as-is manner
                ccTrain = ccDataset;

                % build the GAC tree with treeK; store the prediction per
                % run/row, exactly like pred0WC in the WC loop
                [pred0CC(runCounter,i), gac2TreeCC, gac2RootCC] = treeK(myRow,ccTrain);

                % check the instance with the second tree
                [predGac2CC(runCounter,i), predictionZone] = secondGac(gac2TreeCC, gac2RootCC, myRow);

                if predGac2CC(runCounter,i) == -1
                    % meaning there were not enough instances for gac2 tree:
                    % retry the same row (same termination caveat as above)
                    i = i-1;
                    continue;
                end

                % now get statistics from the prediction zone
                predictionZoneSizeCC(runCounter,i) = size(predictionZone,1);
                for tmp1 = 1:numCcSubsets
                    howManySelected = findMyClassStatistics(predictionZone, ccSubsets{tmp1});
                    if howManySelected < 0
                        % echo unexpected negative counts to the console
                        howManySelected %#ok<NOPTS>
                    end
                    % keep track of how many instances are selected from
                    % each CC center
                    ccInstances{tmp1}(runCounter,i) = howManySelected;
                end
            end

            % progress indicator on the console
            runCounter %#ok<NOPTS>
        end

        % now that we have done WC and CC experiments for one WC subset
        % we need to derive the win,tie,loss statistics out of them
        [winMmre, tieMmre, lossMmre, medMmre] = winTieLossCalculatorMMRE(actualEffortValues',predGac2WC,predGac2CC);
        [winMdmre, tieMdmre, lossMdmre, medMdmre] = winTieLossCalculatorMdmre(actualEffortValues',predGac2WC,predGac2CC);
        [winPred25, tiePred25, lossPred25, medPred25] = winTieLossCalculatorPred25(actualEffortValues',predGac2WC,predGac2CC);
        [winAr, tieAr, lossAr, medAr] = winTieLossCalculatorAr(actualEffortValues',predGac2WC,predGac2CC);

        % open all output files for appending; they are closed again at the
        % end of this subset so that no handle is leaked and results are on
        % disk after every subset
        fids = zeros(1,numel(outputFileNames));
        for fileCounter = 1:numel(outputFileNames)
            fids(fileCounter) = fopen(outputFileNames{fileCounter}, 'a');
            if fids(fileCounter) < 0
                for openedCounter = 1:fileCounter-1
                    fclose(fids(openedCounter));
                end
                error('Could not open %s for appending.', outputFileNames{fileCounter});
            end
        end
        resultsFile          = fids(1);
        winTieLossFileAll    = fids(2);
        winTieLossFileMMRE   = fids(3);
        winTieLossFileMdMRE  = fids(4);
        winTieLossFilePred25 = fids(5);
        winTieLossFileMAR    = fids(6);

        % row headers; data-derived text is passed as an argument, never as
        % the format string
        fprintf(resultsFile, '\\textbf{S%s: }%s (%s) & ', num2str(subsetCounter), wcFileName, num2str(numWcRows));
        fprintf(winTieLossFileAll,    '%s & ', wcFileName);
        fprintf(winTieLossFileMMRE,   '%s & ', wcFileName);
        fprintf(winTieLossFileMdMRE,  '%s & ', wcFileName);
        fprintf(winTieLossFilePred25, '%s & ', wcFileName);
        fprintf(winTieLossFileMAR,    '%s & ', wcFileName);

        %%%%% now write the avg. instance sizes of GAC and subsets
        fprintf(resultsFile, '%s', num2str(mean(mean(predictionZoneSizeCC')),'%6.1f'));
        for tmp1 = 1:numCcSubsets
            avgSubsetSize = mean(mean(ccInstances{tmp1}'));
            myTmpPercentage = (avgSubsetSize*100)/ccDatasetSizes(tmp1);
            fprintf(resultsFile, ' & ');
            if subsetCounter == tmp1
                % mark the column that belongs to the WC subset itself
                fprintf(resultsFile, ' {\\G} %s', num2str(avgSubsetSize,'%6.1f'));
            else
                fprintf(resultsFile, '%s', num2str(avgSubsetSize,'%6.1f'));
            end
            % NOTE(review): 'asdf' looks like a leftover placeholder; it is
            % kept because downstream tooling may depend on it - confirm.
            % '%%' emits the literal percent sign of the LaTeX '\%'.
            fprintf(resultsFile, ' (%sasdf\\%%)', num2str(myTmpPercentage,'%6.1f'));
        end
        fprintf(resultsFile, '\\\\ \n');

        % write win tie loss values of within company w.r.t. MAR
        fprintf(winTieLossFileMAR, '%s&%s&%s& ', num2str(winAr(1)), num2str(tieAr(1)), num2str(lossAr(1)));
        fprintf(winTieLossFileMAR, '%s&%s', num2str(medAr(1),'%6.1f'), num2str(medAr(2),'%6.1f'));
        fprintf(winTieLossFileMAR, ' \\\\ \n');

        % write win tie loss values of within company w.r.t. MMRE
        fprintf(winTieLossFileMMRE, '%s&%s&%s& ', num2str(winMmre(1)), num2str(tieMmre(1)), num2str(lossMmre(1)));
        fprintf(winTieLossFileMMRE, '%s&%s', num2str(medMmre(1),'%6.1f'), num2str(medMmre(2),'%6.1f'));
        fprintf(winTieLossFileMMRE, ' \\\\ \n');

        % write win tie loss values of within company w.r.t. MdMRE
        % (both medians come from medMdmre; the first one used to be taken
        % from medMmre by mistake)
        fprintf(winTieLossFileMdMRE, '%s&%s&%s& ', num2str(winMdmre(1)), num2str(tieMdmre(1)), num2str(lossMdmre(1)));
        fprintf(winTieLossFileMdMRE, '%s&%s', num2str(medMdmre(1),'%6.1f'), num2str(medMdmre(2),'%6.1f'));
        fprintf(winTieLossFileMdMRE, ' \\\\ \n');

        % write win tie loss values of within company w.r.t. Pred25
        fprintf(winTieLossFilePred25, '%s&%s&%s& ', num2str(winPred25(1)), num2str(tiePred25(1)), num2str(lossPred25(1)));
        fprintf(winTieLossFilePred25, '%s&%s', num2str(medPred25(1),'%6.1f'), num2str(medPred25(2),'%6.1f'));
        fprintf(winTieLossFilePred25, ' \\\\ \n');

        % now write all above statistics into winTieLossAll file
        % (MAR medians use exponent notation here, the others fixed point)
        fprintf(winTieLossFileAll, '%s&%s&%s& ', num2str(winAr(1)), num2str(tieAr(1)), num2str(lossAr(1)));
        fprintf(winTieLossFileAll, '%s&%s& ', num2str(medAr(1),'%6.1e'), num2str(medAr(2),'%6.1e'));
        fprintf(winTieLossFileAll, '%s&%s&%s& ', num2str(winMmre(1)), num2str(tieMmre(1)), num2str(lossMmre(1)));
        fprintf(winTieLossFileAll, '%s&%s& ', num2str(medMmre(1),'%6.1f'), num2str(medMmre(2),'%6.1f'));
        fprintf(winTieLossFileAll, '%s&%s&%s& ', num2str(winMdmre(1)), num2str(tieMdmre(1)), num2str(lossMdmre(1)));
        fprintf(winTieLossFileAll, '%s&%s& ', num2str(medMdmre(1),'%6.1f'), num2str(medMdmre(2),'%6.1f'));
        fprintf(winTieLossFileAll, '%s&%s&%s& ', num2str(winPred25(1)), num2str(tiePred25(1)), num2str(lossPred25(1)));
        fprintf(winTieLossFileAll, '%s&%s', num2str(medPred25(1),'%6.1f'), num2str(medPred25(2),'%6.1f'));
        fprintf(winTieLossFileAll, ' \\\\ \n');

        % close the files of this subset
        for fileCounter = 1:numel(fids)
            fclose(fids(fileCounter));
        end
    end

    % place a straight line at the end of each dataset
    for fileCounter = 1:numel(outputFileNames)
        fid = fopen(outputFileNames{fileCounter}, 'a');
        if fid < 0
            error('Could not open %s for appending.', outputFileNames{fileCounter});
        end
        fprintf(fid, ' \\hline \n');
        fclose(fid);
    end
end