# /* vim: set filetype=awk : */ -*- awk -*-
###############################################################
# 42.awK : generic naive bayes classifier
# (c) 2007 Tim Menzies tim@menzies.us
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
###############################################################
# url: http://unbox.org/wisp/branches/tims-our/minerc.lib/nbd.awk
# usage: gawk -f nbd.awk Pass=1 train.arff Pass=1 test.arff

## warning: hastily written code! use with great care!

BEGIN {
    #### command line options
    Klass = -1           # Position of class attribute. if negative,
                         # counts in from the right.
                         # e.g "-2" means second most right column
                         # e.g "3" means third most left column
    File = "/dev/stdin"
    #### internal stuff
    OFS = ","            # Simplifies printing output
    IGNORECASE = 1       # make case comparison case insensitive
}

# Entry point: everything runs from BEGIN, reading the ARFF data from File.
BEGIN { main(File, Klass) }

# klass(n): returns 1 when the class value n equals "yes", else 0.
function klass(n) { return n == "yes" }

# empty(a): returns 1 when array a has no elements, else 0.
#   i is a local scratch index.
function empty(a,    i) {
    for (i in a) return 0
    return 1
}

# main(file, klassPos): load the ARFF file, then start the rule search.
#   file     - path handed to readArff
#   klassPos - class column position (see Klass above)
#   rows, data, cols, goal are locals filled in by readArff.
# The parameter is called klassPos (not klass) because POSIX awk does not
# allow a parameter to share its name with a function.
function main(file, klassPos, rows, data, cols,    goal) {
    goal = readArff(file, klassPos, rows, data, cols)
    #saya("rows",rows);
    #saya("data",data);
    #saya("cols",cols);
    loop(1, rows, data, cols, goal)
}

# loop(n, rows0, data, cols, goal, rule, used):
#   One round of the search: count values over rows0, let better() pick the
#   best not-yet-used (column,value) pair, keep only the rows that match it,
#   then recurse on that subset.
#   n      - round number; the whole program exits once n exceeds 3
#   rows0  - set of row ids still under consideration
#   rule   - rule[1..rule[0]] holds the chosen "col SUBSEP value" keys
#   used   - set of keys already chosen
#   heaven, hell, rows are per-call locals.  rows MUST be local: when it was
#   an (undeclared) global, the recursive call received the same array as both
#   rows0 and rows, so select() could never shrink the row set.
# Returns 0 when rows0 is empty; otherwise returns nothing explicit.
function loop(n, rows0, data, cols, goal, rule, used,    heaven, hell, rows) {
    if (n > 3) exit
    if (empty(rows0)) return 0
    counts(rows0, data, cols, goal, heaven, hell)
    if (better(rows0, cols, heaven, hell, rule, used)) {
        print 1                      # numeric progress markers (debug trace)
        saya(n "_rule", rule)
        saya(n "_used", used)
        print 2
        select(rows0, data, rule, rows)
        print 3
        saya(n "_rows", rows)
        loop(++n, rows, data, cols, goal, rule, used)
    }
}

# counts(rows, data, cols, goal, heaven, hell):
#   For every row in rows and every column other than goal, increment
#   heaven[col,val] when klass() accepts the row's goal value, otherwise
#   hell[col,val].  Values containing "?" are skipped.
#   isBest, row, col, val are locals (isBest used to leak out as a global
#   called best).
function counts(rows, data, cols, goal, heaven, hell,    isBest, row, col, val) {
    for (row in rows) {
        isBest = klass(data[row, goal])
        for (col in cols)
            if (col != goal) {
                val = data[row, col]
                if (val !~ /\?/) {
                    if (isBest) heaven[col, val]++
                    else        hell[col, val]++
                }
            }
    }
}

# better(rows, cols, heaven, hell, rule, used):
#   Scan the keys of heaven that are not in used, score each one as
#   a^2/(a+b) with a = heaven[key] and b = hell[key], and take the highest.
#   The winner is appended to rule (rule[0] is the element count) and marked
#   in used.  Returns 1 if a key was chosen, 0 if none was available.
#   rows and cols are accepted but not read here.
function better(rows, cols, heaven, hell, rule, used, \
                best, i, a, b, score, what) {
    best = -1
    for (i in heaven)
        if (!(i in used)) {
            a = heaven[i]
            b = hell[i]              # missing key reads as 0
            score = a^2 / (a + b)
            if (score > best) { best = score; what = i }
        }
    if (!what) return 0
    rule[++rule[0]] = what
    used[what] = 1
    return 1
}

# select(rows0, data, rule, rows):
#   Split the most recently added rule into (column, wanted value) and copy
#   into rows every row id of rows0 whose value in that column either equals
#   the wanted value or contains "?".
#   tmp, col, want, got, row are locals (row used to leak out as a global).
function select(rows0, data, rule, rows,    tmp, col, want, got, row) {
    split(rule[rule[0]], tmp, SUBSEP)
    col  = tmp[1]
    want = tmp[2]
    for (row in rows0) {
        got = data[row, col]
        if (got ~ /\?/ || got == want) rows[row] = row
    }
}

# readArff(f, klassPos, rows, d, cols):
#   Read file f line by line.  Text from "%" to end of line is removed and
#   blank lines are skipped.  Each line matching @attribute stores its second
#   field in cols[1..]; a line matching @data switches FS to "," and turns on
#   data mode; any other line containing "@" is ignored.  In data mode spaces
#   are stripped, rows[j]=j is recorded and the fields go to d[j,1..NF].
#   Returns the class column: klassPos if positive, otherwise
#   (number of attributes)+1+klassPos, i.e. counted from the right.
#   attr, i, data, j are locals (j used to leak out as a global).
function readArff(f, klassPos, rows, d, cols,    attr, i, data, j) {
    # "> 0" matters: getline returns -1 on an unreadable file, which a bare
    # while(getline < f) would treat as true and loop on forever.
    while ((getline < f) > 0) {
        sub(/%.*/, "")
        if ($0 ~ /^[ \t]*$/) continue
        if ($0 ~ /@attribute/) cols[++attr] = $2
        if ($0 ~ /@data/) { data = 1; FS = "," }
        if ($0 ~ /@/) continue
        if (data) {
            gsub(/ /, "")
            j++
            rows[j] = j
            for (i = 1; i <= NF; i++) d[j, i] = $i
        }
    }
    close(f)
    return klassPos > 0 ? klassPos : attr + 1 + klassPos
}

# array(a): empty the array a (or make a fresh variable into an empty array).
function array(a) { split("", a) }

# saya(str, a): print every element of a as  str[index]=value  through sort.
#   SUBSEP inside a multi-dimensional index is shown as ",".
#   The command string ends in "#" plus a random number; presumably this keeps
#   each call's pipe distinct while the shell ignores the text after "#".
#   com, i, j are locals.
function saya(str, a,    com, i, j) {
    com = "sort #" rand()
    for (i in a) {
        j = i
        gsub(SUBSEP, ",", j)
        print str "[" j "]=" a[i] | com
    }
    close(com)
}