#.H1 Processing Data
#.H2 Motivation
#.P
#The experiment is over: you've run multiple data subsets through multiple learners 
#with different treatments and you've collected multiple performance measures. Your data might look
#like this:
#.SMALL
#.PRE
#    #data, repeat, bin, learner,            goal,  a,  b,  c,  d,   acc,    pd,   pf,  prec,   bal
#    autos,      1,   1,     j48,              -2, 22,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,   1,      nb,              -2, 22,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,   1,    oner,              -2, 22,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,  10,     j48,              -2, 24,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,  10,      nb,              -2, 24,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,  10,    oner,              -2, 24,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,   2,     j48,              -2, 20,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,   2,      nb,              -2, 20,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    autos,      1,   2,    oner,              -2, 20,  0,  0,  0, 100.0,   0.0,  0.0,   0.0,   0.0
#    ...
#    etc for 8000 lines.
#./PRE
#./SMALL
#.P
#Now, how to make sense of it all? Well, read on...
#.H2 Install
#.PRE
#svn export http://unbox.org/wisp/var/timm/09/ai/lib/bash/stats stats
#cd stats
#bash --init-file stat.sh
#./PRE
#.P
#If that works, you should now be looking at a prompt:
#.PRE
#Stats> 
#./PRE
#If you now:
#.UL
#.LI
#Edit and save "yourcode.sh"
#.LI
#At the prompt type "reload ENTER"
#./UL
#.P 
#Now you have some simple visualization and statistical tools which you can tune
#using "yourcode.sh". A sample "yourcode.sh" is offered below.
#.H2 Preparing the data
#.P
#Usually there is a data file, say "data.dat", with quirks that you need to fix.
#For example, you may want to get rid of comment lines and you want to add columns
#derived from other columns:
#.PRE
get() { cat data.dat | grep -v \# | f ;  }

# f: append an f-measure column to each CSV record read from stdin.
# f-measure = harmonic mean of pd (column 11) and prec (column 13).
# The tiny epsilon in the denominator avoids division by zero when
# pd + prec == 0.  (gawk reads stdin itself; the original "cat - |"
# was a useless use of cat.)
f() { gawk ' BEGIN {FS=OFS=","}
	{pd=$11;
	 prec=$13; 
	 print $0,   2*pd*prec/ (0.000000001 + pd +prec)
	}'
}
#./PRE
#.P
#In the above, we remove comment lines and add the f-measure (harmonic mean of pd and prec) to the
#end of each line.
#In the code that follows, we will never talk to the raw "data.dat" file again. Instead, we will
#only ever use data that comes from "get".
#.P
#To see what comes out of the above, we write a "demo" function:
#.PRE
demo1a() { get | gawk 'NR>100 && NR<111'; }
#./PRE
#.P which we execute using:
#.PRE
#Stats> reload; demo1a
#./PRE
#.P which generates:
#.SMALL
#.PRE
# diabetes,      1,   9,    oner, tested_negative,  9,  7, 20, 42,  65.4,  85.7, 69.0,  67.7,  50.2,75.6439
# diabetes,      1,   9,    oner, tested_positive, 42, 20,  7,  9,  65.4,  31.0, 14.3,  56.2,  50.2,39.9587
#    autos,      2,   8,      nb,               1,  9,  4,  6,  3,  54.5,  42.9, 40.0,  33.3,  50.7,37.4953
#    autos,      2,   7,      nb,               0, 14,  2,  2,  1,  78.9,  33.3, 12.5,  33.3,  52.0,33.3
# diabetes,      2,   4,    oner, tested_negative,  8,  5, 16, 44,  71.2,  89.8, 66.7,  73.3,  52.3,80.7154
# diabetes,      2,   4,    oner, tested_positive, 44, 16,  5,  8,  71.2,  33.3, 10.2,  61.5,  52.3,43.2057
# diabetes,      2,   5,    oner, tested_negative, 10,  8, 19, 41,  65.4,  83.7, 65.5,  68.3,  52.3,75.2199
# diabetes,      2,   5,    oner, tested_positive, 41, 19,  8, 10,  65.4,  34.5, 16.3,  55.6,  52.3,42.5794
#    autos,      1,   9,      nb,              -1, 19,  2,  2,  1,  83.3,  33.3,  9.5,  33.3,  52.4,33.3
#    autos,      1,   7,      nb,               1,  5,  2,  7,  4,  50.0,  66.7, 58.3,  36.4,  52.5,47.0976
#./PRE
#./SMALL
#.P Note the new numbers at the end.
#.P Before continuing, we note that it is useful to write feature extractors that report aspects of your code. For example:
#.PRE
learners()  { get | gawk -F, ' {print $4","} ' | sort | uniq; }
data()      { get | gawk -F, ' {print $1","} ' | sort | uniq; }
#./PRE
#.P This code reports the unique symbols in columns one and four:
#.PRE
# demo1b: show the distinct learners and data sets found by the
# feature extractors above.
demo1b() {
	# Pass command output through %s so it is never interpreted as a
	# printf format string (the original embedded backticks directly
	# in the format, which breaks on any '%' in the output).
	printf 'learners:\n %s\n' "$(learners)"
	printf 'data:\n %s\n' "$(data)"
}
#./PRE
#.P This is run in the usual way:
#.PRE
#Stats> reload; demo1b
#learners:
#      nb,
#     j48,
#    oner,
#data:
#    autos,
# diabetes,
#./PRE	
#.P Note that we return the learner/data name followed by a comma. This is a safety trick to ensure
#names that are substrings of other names do not collide with each other.
#.H2 Quartiles
#.P
#Data can be divided into quartiles, i.e. sorted then divided into their lower quarter, next quarter, third quarter
# and top quarter. The "median" of the values is the number between quarters two and three.
#.P
#A subtle feature of quartiles is that they can show any amount of data. 1,000,000 numbers can be reported in
#the same space as 20.
#.P
#To generate one line of a quartile graph, we extract one column of data and ask it to be displayed.
#For example, to see the "f-measures" generated above we add the following to "yourcode.sh":
#.PRE
# demo2: draw one quartile chart of every f-measure in the data.
demo2() {
	# $NF is the last column -- the f-measure added by "f".
	get | gawk -F, '{ print $NF }' | quartile
}
#./PRE
#.P Then we run it:
#.PRE
#Stats> reload; demo2
#  0.0, 42.9, 66.7, 80.7,100.0,[---------------------           |      ++++++++++ ]
#./PRE
#.P
#The numbers on the left show the 0th, 25th, 50th, 75th, and 100th percentiles.
# The horizontal bar chart on the right shows the same information, graphically.
#.P
#Now we can generate the f-measure quartiles for each learner in each data set.  
#Observe how, in "demo3",
#we use grep to collect the results just for one learner in one data file.
#.PRE
# demo3: one quartile line of f-measures per (data set, learner) pair.
# The trailing commas produced by data/learners keep the greps from
# matching names that are substrings of other names.
demo3() {
	local datum learner
	for datum in $(data); do
		for learner in $(learners); do
			echo -n "$datum $learner" 
			get | 
			grep "$learner"	| 
			grep "$datum" | 
			gawk -F, '{print $NF}' |
			quartile
		done
	done 
}
# demo4: demo3's output, sorted on the median column and with the
# columns lined up by malign.
demo4() {
	demo3 | sort -n -t, -k 4 | malign
}
#./PRE
#.P
#"Demo4" pretty prints "demo3" by 
#sorting on the median value, then passing the whole output to "malign", which lines up columns.
#.P And the output?
#.SMALL
#.PRE
#Stats> reload; demo3; echo "" ; demo4
#autos, nb,  0.0, 22.2, 50.0, 72.7,100.0,[-----------             |          ++++++++++++++ ]
#autos, j48,  0.0, 66.7, 85.7,100.0,100.0,[---------------------------------        |       +]
#autos, oner,  0.0,  0.0, 50.0, 75.0,100.0,[                        |           +++++++++++++ ]
#diabetes, nb, 50.0, 64.3, 75.3, 80.8, 90.4,[                       --------     |  +++++      ]
#diabetes, j48, 49.0, 57.1, 72.4, 80.4, 90.7,[                       -----       |   ++++++     ]
#diabetes, oner, 36.8, 50.0, 65.3, 79.6, 86.4,[                 -------       |      ++++        ]
#
#    autos,  oner,   0.0,   0.0,  50.0,  75.0, 100.0, [                        |           +++++++++++++ ]
#    autos,    nb,   0.0,  22.2,  50.0,  72.7, 100.0, [-----------             |          ++++++++++++++ ]
# diabetes,  oner,  36.8,  50.0,  65.3,  79.6,  86.4, [                 -------       |      ++++        ]
# diabetes,   j48,  49.0,  57.1,  72.4,  80.4,  90.7, [                       -----       |   ++++++     ]
# diabetes,    nb,  50.0,  64.3,  75.3,  80.8,  90.4, [                       --------     |  +++++      ]
#    autos,   j48,   0.0,  66.7,  85.7, 100.0, 100.0, [---------------------------------        |       +]
#Now, if we repeat that 
#./PRE
#./SMALL
#.P Note how the "demo4" output is prettier. 
#.H2 Significance Tests
#.P
#Observe how in the above the f-measures come from a wide range — so wide, in fact, that we might
#doubt that the results are significantly different.
#To test that, we apply the Mann-Whitney test to our output. We can run Mann-Whitney on every subset of our data
#("auto" and "diabetes") as well as on the entire data set (this is why we do the funny for loop at the start of
#"demo5").
#.PRE
# demo5: quartile charts plus Mann-Whitney win/loss/tie tables, first
# for all the data and then once per data set.
demo5() {
	local datum learner
	# The leading " " entry makes the grep below match every line,
	# giving an "all data" pass before the per-data-set passes.
	for datum in " " $(data); do
		printf '\n[%s]\n\n' "$datum"
		# Cache the subset once so we don't rerun get per learner.
		get | grep "$datum" > data.tmp
		for learner in $(learners); do
			echo -n "$learner" 
			grep "$learner" data.tmp | gawk -F, '{print $NF}'  |  quartile
		done | sort -n -t, -k 4 | malign
		echo 
		winLossTie --input data.tmp --fields 15 --perform 15 --95 --key 4 --high                    
	done
}
#./PRE
#.P
#Breaking down that last call:
#.PRE
# winLossTie --input data.tmp --fields 15 # how wide is the data file
#            --perform 15                 # we want the last column f-measures
#            --95                         # 95% confidence (we can use 95 or 99)
#            --key 4                      # collect stats for each column 4 item
#            --high                       # larger values are better (and --low is the reverse)
#./PRE
#.P
#This produces one quartile chart for all the data, then separate ones for "auto" and "diabetes":
#.SMALL
#.PRE
#[]
#
# oner,   0.0,  33.3,  56.5,  76.2, 100.0, [----------------           |         ++++++++++++ ]
#   nb,   0.0,  40.0,  62.2,  79.2, 100.0, [-------------------           |       +++++++++++ ]
#  j48,   0.0,  66.7,  80.0,  90.9, 100.0, [---------------------------------      |    +++++ ]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[autos,]
#
#   nb,   0.0,  22.2,  50.0,  72.7, 100.0, [-----------             |          ++++++++++++++ ]
# oner,   0.0,   0.0,  50.0,  75.0, 100.0, [                        |           +++++++++++++ ]
#  j48,   0.0,  66.7,  85.7, 100.0, 100.0, [---------------------------------        |       +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[diabetes,]
#
# oner,  36.8,  50.0,  65.3,  79.6,  86.4, [                 -------       |      ++++        ]
#  j48,  49.0,  57.1,  72.4,  80.4,  90.7, [                       -----       |   ++++++     ]
#   nb,  50.0,  64.3,  75.3,  80.8,  90.4, [                       --------     |  +++++      ]
#
# #key, ties, win, loss, win-loss
#   nb,    1,   1,    0,        1
#  j48,    2,   0,    0,        0
# oner,    1,   0,    1,       -1
#./PRE
#./SMALL
#.P  
#The win/loss/tie values compare the range of values for each treatment to every other.
#The interesting statistic here is how often a method "loses" to everything else. In the "all"
#case (at top), j48 never loses. It is good to see the general pattern (in "all") then look
#for exceptions (in the specific data sets). In this case, the observation that j48 never
#loses holds in all the specific data sets. Yes, nb also never loses in "diabetes" but
#then it does lose once in "autos". So j48 is the treatment that yields the highest f-measures.
#.H2 Multiple Assessment Criteria
#.P
#It is usually not enough to evaluate on a single criteria. Sometimes it is good to combine
#many measures into one (say, as with the f-measure like we did above) but combination rules have
#their own complexities (e.g. how to weight the different parts of the combination). So
#it is best to look at all the measurements separately. 
#.P
#Our last demo does just that. It is a lot of output but the thing to learn is how to summarize
# the results.
#.PRE
# demo6: run the full report (worker6) once per performance measure.
# Arguments to worker6: title, performance column, key column,
# confidence level, and which direction counts as "better".
demo6() {
	worker6 accuracy  10 4 95 --high
	worker6 pd        11 4 95 --high
	worker6 pf        12 4 95 --low   # for false alarms, lower values are better
	worker6 precision 13 4 95 --high
}
# worker6: print quartile charts plus win/loss/tie tables for one
# performance measure, over all the data and then per data set.
# Arguments:
#   $1 title      - label for the report banner
#   $2 perform    - column number of the performance measure
#   $3 key        - column to group the statistics by
#   $4 confidence - 95 or 99
#   $5 better     - --high or --low (which direction is "better")
worker6() {
	local title=$1
	local perform=$2
	local key=$3
	local confidence=$4
	local better=$5
	local d10="==================="
	local datum learner
	printf '\n\n====| %s |%s%s\n' "$title" "$d10" "$d10"
	# The leading " " entry makes the grep match every line ("all data").
	for datum in " " $(data); do
		printf '\n[%s %s]\n\n' "$datum" "$title"
		get | grep "$datum" > data.tmp
		for learner in $(learners); do
			echo -n "$learner" 
			grep "$learner" data.tmp | cut -d, -f "$perform" |  quartile
		done | sort -n -t, -k 4 | malign
		echo 
		winLossTie --input data.tmp --fields 15 \
	               --perform "$perform" --"$confidence" --key "$key" "$better"
	done
}
#./PRE
#.P This is run as usual...
#.SMALL
#.PRE
#Stats> demo6
#
#====| accuracy |======================================
#
#[  accuracy]
#
#   nb,  50.0,  74.4,  83.3,  90.5, 100.0, [                        -------------   |   +++++ ]
# oner,  33.3,  73.7,  83.3,  90.9, 100.0, [               ---------------------    |   +++++ ]
#  j48,  65.3,  80.3,  91.7, 100.0, 100.0, [                               --------     |    +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[autos, accuracy]
#
#   nb,  50.0,  76.5,  85.0,  95.2, 100.0, [                        --------------   |    +++ ]
# oner,  33.3,  81.0,  87.5,  94.4, 100.0, [               ------------------------   |   +++ ]
#  j48,  69.6,  90.0,  95.2, 100.0, 100.0, [                                 -----------  |  +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[diabetes, accuracy]
#
# oner,  63.2,  67.9,  71.2,  74.7,  79.5, [                              --- | +++           ]
#  j48,  65.3,  69.7,  73.4,  78.2,  85.7, [                               --- |  ++++        ]
#   nb,  67.9,  71.8,  74.7,  78.1,  85.9, [                                --  | ++++        ]
#
# #key, ties, win, loss, win-loss
#   nb,    1,   1,    0,        1
#  j48,    1,   1,    0,        1
# oner,    0,   0,    2,       -2
#
#
#====| pd |======================================
#
#[  pd]
#
# oner,   0.0,  25.0,  50.0,  85.7, 100.0, [------------            |                ++++++++ ]
#   nb,   0.0,  33.3,  63.0,  84.8, 100.0, [----------------              |          ++++++++ ]
#  j48,   0.0,  60.0,  83.0, 100.0, 100.0, [------------------------------          |        +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[autos, pd]
#
# oner,   0.0,   0.0,  50.0,  83.3, 100.0, [                        |               +++++++++ ]
#   nb,   0.0,  20.0,  55.6,  81.8, 100.0, [----------                |            ++++++++++ ]
#  j48,   0.0,  63.6,  87.5, 100.0, 100.0, [-------------------------------           |      +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[diabetes, pd]
#
# oner,  25.9,  41.4,  60.0,  86.4,  93.5, [           --------          |            ++++    ]
#  j48,  37.5,  55.6,  72.4,  83.7,  95.9, [                 ----------        |    +++++++   ]
#   nb,  41.4,  62.1,  76.7,  84.8,  93.9, [                   -----------       |   +++++    ]
#
# #key, ties, win, loss, win-loss
# oner,    2,   0,    0,        0
#   nb,    2,   0,    0,        0
#  j48,    2,   0,    0,        0
#
#
#====| pf |======================================
#
#[  pf]
#
#  j48,   0.0,   0.0,   5.3,  15.8,  62.5, [ |    ++++++++++++++++++++++++                    ]
# oner,   0.0,   0.0,   8.7,  18.2,  75.0, [   |    +++++++++++++++++++++++++++++             ]
#   nb,   0.0,   0.0,  11.1,  21.4,  69.2, [    |    ++++++++++++++++++++++++                 ]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[autos, pf]
#
#  j48,   0.0,   0.0,   0.0,   6.7,  21.4, [  ++++++++                                        ]
# oner,   0.0,   0.0,   5.6,  13.3,  75.0, [ |   +++++++++++++++++++++++++++++++              ]
#   nb,   0.0,   0.0,   6.2,  15.8,  69.2, [  |   +++++++++++++++++++++++++++                 ]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[diabetes, pf]
#
# oner,   6.5,  13.5,  21.7,  56.5,  74.1, [  ----   |                 +++++++++              ]
#   nb,   6.1,  14.6,  22.4,  37.9,  58.6, [  -----   |      +++++++++++                      ]
#  j48,   4.1,  16.3,  26.4,  44.0,  62.5, [ -------    |        ++++++++++                   ]
#
# #key, ties, win, loss, win-loss
# oner,    2,   0,    0,        0
#   nb,    2,   0,    0,        0
#  j48,    2,   0,    0,        0
#
#
#====| precision |======================================
#
#[  precision]
#
# oner,   0.0,  37.5,  63.2,  77.8, 100.0, [------------------            |      ++++++++++++ ]
#   nb,   0.0,  33.3,  66.7,  80.0, 100.0, [----------------                |      +++++++++++]
#  j48,   0.0,  66.7,  80.0, 100.0, 100.0, [---------------------------------      |         +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[autos, precision]
#
#   nb,   0.0,  33.3,  50.0,  80.0, 100.0, [----------------        |              +++++++++++]
# oner,   0.0,   0.0,  50.0,  80.0, 100.0, [                        |              +++++++++++]
#  j48,   0.0,  66.7,  88.9, 100.0, 100.0, [---------------------------------          |     +]
#
# #key, ties, win, loss, win-loss
#  j48,    0,   2,    0,        2
# oner,    1,   0,    1,       -1
#   nb,    1,   0,    1,       -1
#
#[diabetes, precision]
#
# oner,  52.6,  60.0,  70.2,  75.0,  86.4, [                         ----     | ++++++        ]
#  j48,  45.5,  65.6,  73.2,  80.0,  90.0, [                     -----------   |   ++++++     ]
#   nb,  48.1,  66.7,  74.6,  80.8,  92.9, [                       ----------   |  +++++++    ]
#
# #key, ties, win, loss, win-loss
#   nb,    1,   1,    0,        1
#  j48,    2,   0,    0,        0
# oner,    1,   0,    1,       -1
#./PRE
#./SMALL
#.P 
#Now that seems like a lot of data but we can summarize it quite simply. Note that in the general
#cases j48 always loses least. And in the specific cases there are no counter-examples to
#the general statement that j48 loses least across all criteria.
#Therefore, in this happy case, there is no extra information associated with showing all the
#data. We need only write down the general cases. Also, we will add a column, on the left hand side,
#that "ranks" each treatment by the number of losses. Row "i" gets a different rank to row "i+1"
#if they lose at different rates:
#So we need only report five little tables (for pd,pf,accuracy,f-measures, precision)
#of the following form:
#.PRE
#pf:
#         0  25  50  75 100
#         ------------------
#1   j48  0   0   5  16  63 [ |    ++++++++++++++++++++++++                    ]
#2  oner  0   0   9  18  75 [   |    +++++++++++++++++++++++++++++             ]
#2    nb  0   0  11  21  62 [    |    ++++++++++++++++++++++++                 ]
#./PRE
#.P
#Note that we sort the rows such that the top row has the best rank. So for "false alarms",
#we sort ascending on median pf.
#.H2 Discussion
#.P
#Note that your experimental reports may not be exactly like the above. The point of this tool is that
#you can quickly tune your own reports.
#.P
#Enjoy!
