(ns code.utils.pruners.sbbr
  (:use (code.utils utils EqualFrequencyBinning))
  (:use (incanter core stats)))

; Note - datasets are stored as matrices

; The story goes...
; Given the dataset, bin each column,
; so the new dataset is the binned version.
; If there are more than 2 classes,
; let one represent best and the others
; represent rest.

; 1. Bin dataset (have 2 versions stored)
;    a. replace original dataset with binned values (discretize1 and discretizer2)
;    b. store cols separately with binned vals

(defn discretize1
  "Replaces every value of col with the bin paired with it and returns the
  bins as a vector in col's original order.

  efb is called on the sorted column; its result is paired positionally with
  the sorted values (assumes efb yields one bin per sorted value - confirm
  against EqualFrequencyBinning). When a value occurs several times, the bin
  paired with its first sorted occurrence is used. A value without a paired
  bin maps to nil."
  [col]
  (let [sorted (sort col)
        bins (efb sorted)
        ; value -> bin table built in one pass, so each element of col is a
        ; single lookup instead of a scan over all sorted pairs; the
        ; contains? check keeps the first pairing for repeated values
        bin-of (reduce (fn [table [v bin]]
                         (if (contains? table v)
                           table
                           (assoc table v bin)))
                       {}
                       (map vector sorted bins))]
    (vec (map bin-of col))))

(defn discretizer2
  "Discretizes every column of data except the last one (the class column).

  Returns [a b] where
    a - data with each non-class column replaced by its bins, class column
        appended unchanged,
    b - a seq with one matrix per non-class column; each row of such a
        matrix is [bin original-value]."
  [data]
  (let [columns (Transpose data)
        features (butlast columns)
        a (bind-columns (matrix (Transpose (map discretize1 features)))
                        (matrix (last columns)))
        ; pair each binned column of a with the raw column it came from
        b (map (fn [binned raw] (matrix (map vector binned raw)))
               (butlast (Transpose a))
               features)]
    [a b]))

; 2. Group the new dataset by the class col
;    then choose one at a time to be best and
;    others, rest. So output will be a series
;    of best-rest folds

(defn my-best-rest
  "Groups data by its last column and returns a vector of [best rest] folds,
  one per group: best is the group itself, rest is a matrix holding the rows
  of every other group. A group with exactly one row is wrapped in a vector.

  NOTE(review): group-by is called as (group-by matrix column-index), which
  is not clojure.core/group-by's (f coll) order - presumably the matrix
  group-by from the :use'd namespaces; confirm it is the one in scope."
  [data]
  (let [groups (map #(if (= (nrow %) 1) [%] %)
                    (group-by data (- (ncol data) 1)))]
    (vec (map (fn [best]
                [best (matrix (apply concat (remove #(= best %) groups)))])
              groups))))

; 3.
; Get ranks for each val in each col -
; but first - sort cols and compress (maybe not)
; Anyway, return list of best bins
;(rank-vals-for-all-cols iris (first (my-best-rest (first (discretizer2 iris)))) (Transpose (first (discretizer2 iris))) (range 0 (- (ncol iris) 1)) 2)

(defn bin-rank
  "Scores bin value val of column col for one best/rest fold.

  With pbest = rows(best)/rows(D), prest = rows(rest)/rows(D) and freq(g) =
  share of rows in group g whose column col equals val:
    b = freq(best) * pbest
    r = freq(rest) * prest
    rank = b^2 / (b + r)
  Returns [val (* rank 100)]. When val occurs in neither group (b + r is
  zero) the rank is 0.0 rather than a divide-by-zero error."
  [D val col best rest]
  (let [total (nrow D)
        pbest (/ (nrow best) total)
        prest (/ (nrow rest) total)
        ; share of rows holding val; yields 0 without dividing when nothing
        ; matches, so an empty group never causes a division by zero here
        freq (fn [rows]
               (let [hits (count (filter #(= (nth % col) val) rows))]
                 (if (zero? hits)
                   0
                   (/ hits (nrow rows)))))
        likebestE (* (freq best) pbest)
        likerestE (* (freq rest) prest)
        denom (+ likebestE likerestE)
        rank (if (zero? denom)
               0.0
               (/ (Math/pow likebestE 2) denom))]
    [val (* rank 100)]))

(defn rank-val1
  "Ranks val of column col like rank-val, but builds the fold itself: D is
  discretized and the first best/rest fold from my-best-rest is used."
  [D val col]
  (let [br (first (my-best-rest (first (discretizer2 D))))] ;*
    (bin-rank D val col (first br) (second br))))

(defn rank-val
  "Ranks val of column col against the fold br, where (first br) is the best
  group and (second br) the rest group. Returns bin-rank's [val rank]."
  [D br val col]
  (bin-rank D val col (first br) (second br)))

(defn rank-vals
  "Ranks every value in vals for column col and returns the [val rank] pairs
  that share the highest rank. Returns an empty seq when vals is empty."
  [D br vals col]
  (let [ranks (map #(rank-val D br % col) vals)]
    (if (empty? ranks)
      ()
      ; the maximum is computed once, not once per candidate pair
      (let [top (apply max (map second ranks))]
        (filter #(= top (second %)) ranks)))))

(defn rank-vals-for-all-cols
  "For each (values, column) pair taken positionally from all-vals and cols,
  finds the first top-ranked [val rank] via rank-vals, then keeps the k
  entries with the highest rank. Returns a seq of [val idx], highest rank
  first, where idx is the position of the pair in all-vals/cols (not the
  column value itself)."
  [D br all-vals cols k] ;*
  (let [; best [val rank] per column - rank-vals runs once per column
        tops (map #(first (rank-vals D br %1 %2)) all-vals cols)
        indexed (map vector tops (iterate inc 0))
        ; ascending stable sort on rank, reversed below, so ties come out
        ; with the later index first
        by-rank (sort-by #(second (first %)) indexed)]
    (map (fn [[top idx]] [(first top) idx])
         (take k (reverse by-rank)))))

; 4. Convert bin vals into real vals from original dataset
; (convert-bin-to-actual '([0.3 3] [1.5 2]) (second (discretizer2 iris)))

(defn convert-bin-to-actual
  "Maps chosen bins back to ranges of original values.

  bin-vals1 - seq of [bin-val idx] pairs.
  b         - indexable seq; (nth b idx) must be a collection of
              [bin original-value] rows (the second element returned by
              discretizer2 has this shape).
  Returns a seq of [[min max] idx], where min/max are taken over the
  original values of all rows whose bin equals bin-val. Works for a bin
  matched by a single row as well (min = max = that row's value). Throws if
  no row matches, since min/max of nothing is undefined."
  [bin-vals1 b] ;*
  (map (fn [[bin-val idx]]
         (let [rows (filter #(= bin-val (first %)) (nth b idx))
               ; raw value of each matching row, read row by row
               raws (map second rows)]
           [[(apply min raws) (apply max raws)] idx]))
       bin-vals1))

; 5. Get the instances

(defn get-instances
  "Keeps the rows of actual-best that satisfy every range in minmaxs and
  returns them wrapped with matrix.

  minmaxs - seq of [[mn mx] col]; a row passes when mn <= (nth row col) <= mx
            (bounds inclusive). Ranges are applied in order, each one
            filtering the survivors of the previous one."
  [actual-best minmaxs] ;*
  (matrix
    (reduce (fn [rows [[mn mx] col]]
              (filter (fn [row] (<= mn (nth row col) mx)) rows))
            actual-best
            minmaxs)))

; 6. Putting it all together

(defn csl-prototypes
  "Runs the whole pipeline on D0 (class in the last column) and returns a
  matrix of the selected rows.

  Groups with exactly one row are dropped first. The remaining data is
  discretized and split into best/rest folds; for every fold the k
  best-ranked bins are converted to value ranges and the rows of the
  matching group of the undiscretized data that fall inside all ranges are
  collected.

  NOTE(review): fold i from my-best-rest is paired with group i of the
  undiscretized data, i.e. both group-by calls are assumed to yield groups
  in the same order - confirm."
  [D0 k] ;*
  (let [last-col (fn [m] (- (ncol m) 1))
        D1 (group-by D0 (last-col D0))
        D (matrix (apply concat (filter #(not= (nrow %) 1) D1)))
        ; leave (matrix (filter #(= (nrow %) 1) D1))
        [binned per-col] (discretizer2 D)
        brs (my-best-rest binned)
        realD (group-by D (last-col D))
        ; identical for every fold, so computed once up front
        binned-cols (Transpose binned)
        cols (range 0 (last-col D))
        pick (fn [i]
               (let [group (nth realD i)
                     actual-best (if (= (nrow group) 1) [group] group)
                     bins (rank-vals-for-all-cols D (nth brs i) binned-cols cols k)
                     ans1 (get-instances actual-best
                                         (convert-bin-to-actual bins per-col))]
                 (cond
                   ; a single row is wrapped so concat sees one row
                   (= (nrow ans1) 1) [ans1]
                   ; nothing selected for this fold: fall back to all of D0
                   (nil? ans1) D0
                   :else ans1)))]
    ; (bind-rows leave (matrix (apply concat ...)))
    (matrix (apply concat (map pick (range (count brs)))))))