#|
Take a space of actual examples and map them into the space of possible examples. What happens? What can we say about any future examples we might see? If you understand that, then you understand data mining.
First, we'll need some examples of actual data. We'll store those in rows of a table. A row of data has two parts: inputs (also called independent variables), which select for some outputs (called the dependent variables).
|#

#+SBCL (DECLAIM (SB-EXT:MUFFLE-CONDITIONS CL:STYLE-WARNING))

(defun sqrd (x)
  "Return X multiplied by itself."
  (* x x))

(defun norm (x min max)
  "Scale X linearly so that MIN maps to 0 and MAX maps to 1.
An X of zero is replaced by the midpoint of MIN and MAX before scaling
(presumably zero marks an unknown value -- confirm with the data's author).
Returns 0 when MAX equals MIN, since such a range has no spread."
  ;; Substitute the midpoint once instead of recursing: recursion never
  ;; terminates when the midpoint itself is zero (e.g. MIN = -MAX).
  (let ((value (if (zerop x) (/ (+ max min) 2) x))
        (range (- max min)))
    (if (zerop range)
        0
        (/ (- value min) range))))

(defun distance (one two dims)
  "Distance between the coordinate lists ONE and TWO.
DIMS holds the upper bound of each coordinate; the lower bound is the
local MIN below.  Each coordinate is scaled with NORM, the squared
differences are summed, and the square root of that sum is divided by
the square root of the summed squared ranges.
Returns 0.0 when the summed range is zero (e.g. DIMS is empty)."
  (let ((min 1) ; set to 0 for standard geometry
        (sum 0)
        (max 0))
    (mapc #'(lambda (a b dim)
              (incf sum (sqrd (- (norm a min dim) (norm b min dim))))
              (incf max (sqrd (- dim min))))
          one two dims)
    ;; Guard the final division: with no usable dimensions MAX stays 0.
    (if (zerop max)
        0.0
        (/ (sqrt sum) (sqrt max)))))

(defun when-we-played-golf ()
  "Build the example table: observed rows, the size of each column's
range, the symbol-to-number synonyms, and the candidate possible rows."
  (deftable
      :columns '(outlook temperature humidity windy play)
      :ranges '(3 3 2 2 2)
      :observed '((sunny    hot  high   FALSE no)
                  (sunny    hot  high   TRUE  no)
                  (overcast hot  high   FALSE yes)
                  (rainy    mild high   FALSE yes)
                  (rainy    cool normal FALSE yes)
                  (rainy    cool normal TRUE  no)
                  (overcast cool normal TRUE  yes)
                  (sunny    mild high   FALSE no)
                  (sunny    cool normal FALSE yes)
                  (rainy    mild normal FALSE yes)
                  (sunny    mild normal TRUE  yes)
                  (overcast mild high   TRUE  yes)
                  (overcast hot  normal FALSE yes)
                  (rainy    mild high   TRUE  no))
      :synonyms '((rainy . 1) (overcast . 2) (sunny . 3)
                  (cool . 1) (mild . 2) (hot . 3)
                  (normal . 1) (high . 2)
                  (FALSE . 1) (TRUE . 2)
                  (no . 1) (yes . 2))
      :possibles '((3 3 1 1 0)
                   (3 2 2 2 0)
                   (3 1 1 2 0)
                   (2 3 2 1 0)
                   (2 2 1 1 0)
                   (2 1 2 1 0)
                   (1 3 1 2 0)
                   (1 2 2 1 0)
                   (1 1 0 0 0)
                   (2 0 0 2 0))))

(defstruct table
  columns
  independent-dimensions
  dependent-dimensions
  actuals
  possibles
  nearests)

(defstruct row
  id
  independents
  dependents)

;; N is shared by every call to DEFROW, so row ids are unique across all
;; rows created in this image (actuals and possibles alike).
(let ((n 0))
  (defun defrow (new)
    "Make a ROW from the list NEW: the last element becomes the
dependent value, everything before it the independents.  Each row gets
the next id from a counter shared by all calls."
    (make-row :id (incf n)
              :independents (butlast new)
              :dependents (first (last new)))))

(defun deftable (&key columns ranges synonyms observed possibles)
  "Make a TABLE.
RANGES has one entry per column; its last entry describes the dependent
column.  OBSERVED rows are translated through the alist SYNONYMS (via
SUBLIS) before being turned into rows; POSSIBLES are used as given."
  (make-table
   :columns columns
   :independent-dimensions (butlast ranges)
   :dependent-dimensions (first (last ranges))
   :actuals (mapcar #'defrow (sublis synonyms observed))
   :possibles (mapcar #'defrow possibles)))

(defun nearest (tbl actual)
  "Return (ACTUAL-ID . POSSIBLE-ID) for the possible row of TBL closest
to the row ACTUAL according to DISTANCE.  On ties the earliest possible
row wins.  POSSIBLE-ID is NIL when TBL has no possible rows."
  (let ((dims (table-independent-dimensions tbl))
        (best nil)
        (best-distance nil))
    (dolist (possible (table-possibles tbl))
      (let ((d (distance (row-independents actual)
                         (row-independents possible)
                         dims)))
        ;; Strict < keeps the first of several equally-near rows.
        (when (or (null best-distance) (< d best-distance))
          (setf best-distance d
                best possible))))
    (cons (row-id actual) (and best (row-id best)))))

(defun nearests (tbl)
  "Return the list of NEAREST results, one per actual row of TBL."
  (mapcar #'(lambda (actual) (nearest tbl actual))
          (table-actuals tbl)))

(defmacro geta (key alist &optional default)
  "Expand to a lookup of KEY in ALIST that yields the associated value,
or DEFAULT when the key is absent or its value is NIL."
  ;; Must build a form: without the backquote ASSOC would run at
  ;; macro-expansion time on the unevaluated argument forms.
  `(or (cdr (assoc ,key ,alist)) ,default))

(defun statistics (tbl)
  "Print, for every possible row of TBL, the percentage of actual rows
whose nearest possible row it is, together with the sorted dependent
values of those actual rows.  Lines are ordered by descending
percentage.  Dependent values are compared with <, so they must be
numbers.  Returns T."
  (let ((report '())
        (all (length (table-actuals tbl)))
        (h (make-hash-table)))
    ;; Group the dependent value of each actual row under the id of its
    ;; nearest possible row.
    (dolist (pair (nearests tbl))
      (let* ((actual (car pair))
             (possible (cdr pair))
             (klass (row-dependents
                     (find actual (table-actuals tbl) :key #'row-id))))
        (push klass (gethash possible h))))
    (dolist (possible (mapcar #'row-id (table-possibles tbl)))
      (let ((tmp (gethash possible h)))
        (if tmp
            ;; The percentage is computed only here: TMP non-empty
            ;; implies ALL > 0, so the division cannot be 0/0.
            (push (list possible
                        (sort tmp #'<)
                        (round (* 100 (/ (length tmp) all))))
                  report)
            (push (list possible '() 0) report))))
    (dolist (one (stable-sort report #'> :key #'third))
      (format t "at ~a% possibility ~a has ~a~%"
              (third one) (first one) (second one)))
    t))

#|
|# #||#