# ---------------------------------------------------------------------------------
# NOTE(review): the physical lines of this file no longer match its statements.
#   * The original newlines appear to have been collapsed: end-of-line "#" comments
#     now run straight into the code that used to follow them.
#   * Several spans that began with "<" and ended with ">" are missing, e.g.
#     "for (k=1; k=1;i--)", "for (k=1; k 0)", "for(j=start;j meanU ? -0.5 : 0.5",
#     "for(i=1;i\"/dev/stderr\"" and the final "return n1".  Presumably a markup
#     stripper removed them -- TODO confirm and restore from a clean copy.
#   Nothing below this header has been altered.
#
# The code calls asort() and sets IGNORECASE, so it presumably targets gawk.
#
# Roadmap of what is still readable, in order of appearance:
#   BEGIN (two blocks)   defaults Samples=20 K1=5 K2=15 Seed=1 Klass=-1 Tests=0.33,
#                        then OFS="," IGNORECASE=1 Inf=10^32 _=SUBSEP CONVFMT="%.8g".
#   main()               calls worker(Samples,K1,K2).
#   worker()             calls train(), then hands train()'s return value and the
#                        `ranked` array to test().
#   train()              projects() -> neighbors() -> knn(k1+k2,...) -> bestRest()
#                        -> rank(); returns whatever rank() returned.
#   test()               same projects()/neighbors()/knn() sequence on Test; copies
#                        every row found in `knearest` into `data`, asort()s the
#                        int(Test[row,Klass]) values and prints the entries at
#                        round(m*0.25), round(m*0.5), round(m*0.75).  The tail of the
#                        function and the header of the next one (it pipes lines to
#                        `com` and then close()s it) are lost.
#   drunkadvisor()       draws rand(): below 0.5 it picks a random index in
#                        1..rankeds not yet flagged in `usedconstraint`, below 0.75
#                        the highest unflagged index, otherwise the lowest; flags it
#                        and calls addNextConstraint2().
#                        NOTE(review): the random branch loops while
#                        usedconstraint[n] is set, so it cannot exit once every index
#                        in 1..rankeds is flagged -- confirm callers prevent that.
#   report2(), report()  split `constraint` on SUBSEP into attribute and range,
#                        collect int(data[row,Klass]) for each row in `selected`,
#                        asort() them and print; report2() prints sorted[1], report()
#                        prints the three quartile positions, "Better"/"Worse", the
#                        count, and the value list when at most 30 rows are selected.
#                        Part of report()'s comparison loop is lost.
#   addNextConstraint2(), addNextConstraint()
#                        extendConstraint(), selectRows(), then report2()/report().
#   selects()            calls addNextConstraint() for n = 1..rankeds.
#   extendConstraint()   keeps one "(a|b|...)" string per attribute in `constraints`;
#                        a new range is prepended to an existing entry.
#   selectRows()         sets selected[row]=1 for rows 1..data[0] accepted by
#                        selectRow().
#   selectRow()          returns 0 on the first failing constraint, else 1.
#                        NOTE(review): the test is `constraints[col] !~ data[row,col]`,
#                        i.e. the data value is the regex operand -- confirm this
#                        operand order is intended.
#   mwRank() ...         Mann-Whitney section; large parts are lost.
#   criticalValue()      2.326 / 1.960 / 1.645 for conf 99 / 95 / 90 (default 95);
#                        no explicit return for any other value.
#   s2a(), push2(), push(), as100(), abs(), max()
#                        small helpers; s2a() and max() are incomplete here.
# ---------------------------------------------------------------------------------
# no more From To. need to pull at random BEGIN { # command-line options Samples = 20 K1 = 5 K2 = 15 Seed = 1 Klass = -1 Tests = 0.33 } BEGIN { # internal options OFS="," IGNORECASE=1 Inf = 10^32 _ = SUBSEP CONVFMT="%.8g" } ################################################################## # main program function main() { worker(Samples,K1,K2) } function worker(samples, k1,k2, rankeds, ranked) { # print "samples : " samples # print "k1 : " k1 # print "k2 : " k2 # print "%test : " Tests*100 "\n" # print "Training results on " Train[0] " historical examples (what looks useful):" rankeds = train(samples,k1,k2,ranked) #saya(ranked,"ranked") # print "Test results on " Test[0] " new projects (applying the training results to new data):\n" test( samples,k1,k2,rankeds,ranked) } function train(samples,k1,k2,ranked, \ projects, neighbors, memos, best, rest,\ knearest,rankeds) { # inputs outputs # ------ ------- projects(Train,samples, projects) # example1 projects neighbors(samples,projects,Train[0],Train, neighbors,memos) # distances example1 to Train set knn(k1+k2,samples,neighbors,memos, knearest) # knearest Train instance row numbers to example1 projects bestRest(knearest,k1, best,rest) # divide knearest into best/worst rankeds = rank(k1,k2,best,rest, ranked) # contrast set between best/worst return rankeds } function test(samples,k1,k2,rankeds,ranked, \ i,projects,neighbors,memos,knearest,\ m,n,sorted,kloc,row,col,data) { projects(Test,samples, projects) # different example2 projects neighbors(samples,projects,Test[0],Test, neighbors,memos) # distances example2 to Test set knn(k1+k2,samples,neighbors,memos, knearest) # knearest Test instances row numbers to example2 projects for(row=1;row<=Test[0];row++) if (row in knearest) { data[0]++ kloc[++n]= int(Test[row,Klass]) for(col=1;col<=Cols;col++) data[data[0],col]=Test[row,col] # convert row numbers to their data rows } m=asort(kloc,sorted) # report baseline distributions # print "Baseline (estimates without any project 
changes): " for(i=1;i<=m;i++) printf("%s ", sorted[i]) print "\n\t\t\t\t25%\t50%\t75%" print "\t Baseline:\t\t"sorted[round(m*0.25)] "\t" sorted[round(m*0.5)] "\t" sorted[round(m*0.75)] split("",Previous,"") Previous[0] = m for (k=1; k=1;i--) { # highest score must be forst range = memo[sorted[i]] split(range,tmp,_) print sprintf("%5.2f",sorted[i]) "\t" Eman[tmp[1]] " = " tmp[2] | com } close(com) print "" } ######################################################################## # ICBR functions function drunkadvisor (k1,data,rankeds,ranked,usedconstraint, constraints,n,pick) { pick = rand() if (pick < 0.5) { #Here we determine what constraint to add. for starters, we will add constraints randomly from the ranked list print "adding random constraint\n" do n = int(rand() * rankeds) + 1 while (usedconstraint[n]) usedconstraint[n] = 1 addNextConstraint2(k1,n,data,ranked[n],constraints) } else if (pick < 0.75) { #Add the worst one print "adding worst constraint\n" n = rankeds + 1 do n-- while (usedconstraint[n]) usedconstraint[n] = 1 addNextConstraint2(k1,n,data,ranked[n],constraints) } else { #Add the best one print "adding best constraint\n" n = 0 do n++ while (usedconstraint[n]) usedconstraint[n] = 1 addNextConstraint2(k1,n,data,ranked[n],constraints) } } function report2(k1,n,data,constraint, constraints, selected, \ all,attr,range,attrange,row,tmp,i,str,sep,j,max,sorted) { split(constraint, attrange,_); attr = Eman[attrange[1]] range = attrange[2] for(row in selected) { all++ tmp[++i] = int(data[row,Klass]) } max=asort(tmp,sorted) for(j=1;j<=max;j++) { str = str sep sorted[j] sep=" " } #printf(n==1 ? " " : "and ") printf("n="n ": " attr "= " constraints[attrange[1]] " :\t\t ") #printf sorted[round(i*0.25)] "\t" sorted[round(i*0.5)] "\t" sorted[round(i*0.75)] "\t" printf sorted[1] ##begin ugly code## #store for later # Previous[0] = max # for (k=1; k 0) # printf "Better" # else # printf "Worse" #printf "\t" ##end ugly code## #printf max #print (all <= 30) ? 
"*{" str "}" : "*{..}" } function addNextConstraint2(k1,n,data,constraint,constraints, selected) { extendConstraint(constraint, constraints) selectRows(data,constraints,selected) report2(k1,n,data,constraint, constraints, selected) } ######################################################################### # select and report subset of relevant rows that satisfy constraints 1..n function selects(k1,data,rankeds,ranked, constraints,n) { for(n=1;n<=rankeds;n++) addNextConstraint(k1,n,data,ranked[n],constraints) } function addNextConstraint(k1,n,data,constraint,constraints, selected) { extendConstraint(constraint, constraints) selectRows(data,constraints,selected) report(k1,n,data,constraint, constraints, selected) } function extendConstraint(constraint,constraints, attrange,attr,range) { split(constraint, attrange,_); attr = attrange[1] range = attrange[2] if (attr in constraints) constraints[attr] = "(" range "|" substr(constraints[attr],2) else { constraints[attr] = "(" range ")" } if (SelectsDebug) saya(constraints,"constrants") } function selectRows(data,constraints,selected, row) { for(row=1;row<=data[0];row++) if ( selectRow(data,row,constraints) ) selected[row]=1 } function selectRow(data,row,constraints, col) { for(col in constraints) if ( constraints[col] !~ data[row,col] ) return 0 return 1 } function report(k1,n,data,constraint, constraints, selected, \ all,attr,range,attrange,row,tmp,i,str,sep,j,max,sorted) { split(constraint, attrange,_); attr = Eman[attrange[1]] range = attrange[2] for(row in selected) { all++ tmp[++i] = int(data[row,Klass]) } max=asort(tmp,sorted) for(j=1;j<=max;j++) { str = str sep sorted[j] sep=" " } printf(n==1 ? 
" " : "and ") printf("n="n ": " attr "= " constraints[attrange[1]] " :\t\t ") printf sorted[round(i*0.25)] "\t" sorted[round(i*0.5)] "\t" sorted[round(i*0.75)] "\t" ##begin ugly code## #store for later Previous[0] = max for (k=1; k 0) printf "Better" else printf "Worse" printf "\t" ##end ugly code## printf max print (all <= 30) ? "*{" str "}" : "*{..}" } ################################################################## # mann-whitney tests function mwRank(data0,ranks, data,starter,n,old,start,skipping,sum,i,j,r) { starter="someCraZYsymBOL"; n = asort(data0,data) old = starter start = 1; for(i=1;i<=n;i++) { skipping = (old == starter) || (data[i] == old); if (skipping) { sum += i } else { r = sum/(i - start) for(j=start;j meanU ? -0.5 : 0.5 z = abs((sum1 - meanU + correction )/sdU) if (z >= 0 && z <= critical) return 0 if (up) return median(ranks1,n1) - median(ranks2,n2) # positive if ranks1 wins else return median(ranks2,n2) - median(ranks1,n1) # positive if ranks2 wins } function criticalValue(conf) { conf = conf ? conf : 95 if (conf==99) return 2.326 if (conf==95) return 1.960 if (conf==90) return 1.645 } function s2a(s,a, tmp,i,n) { n=split(s,tmp,/ /) for(i=1;i"/dev/stderr"; fflush("/dev/stderr") } function push2(v,a,i) { a[i,++a[i,0]] = v; return v } function push(v,a) { a[++a[0]] = v; return v } function as100(n) { return (n*100) + rand()/10 } function abs(n) { return n < 0 ? -1* n : n } function max(n1,n2) { return n1