(ns code.utils.pruners.my_csl
  (:use (code.utils utils EqualFrequencyBinning))
  (:use (incanter core stats)))

; Note - datasets are stored as matrices.
;
; The story goes...
; Given the dataset, bin each column, so the new dataset is the binned
; version. If there are more than 2 classes, let one represent "best" and
; the others represent "rest".

; 1. Bin the dataset (two versions are kept):
;    a. the original dataset with its values replaced by binned values
;       (discretize1 and discretizer2)
;    b. each column stored separately alongside its binned values

(defn discretize1
  "Returns a vector holding, for each element of `col` (in the original
  order), the bin assigned to that element. Bins are produced by calling
  `bin1` with 0.001 on the sorted column; an element is matched to the
  first sorted value that is `=` to it. Elements with no match map to nil."
  [col]
  (let [sortcol (sort col)
        bincol  (bin1 0.001 sortcol)
        ; pair every sorted value with the bin at the same position
        mush    (map vector sortcol bincol)
        ; first-match lookup, kept as a linear scan so that the `=`-based
        ; matching is exactly what it has always been
        bin-of  (fn [v]
                  (second (first (filter #(= v (first %)) mush))))]
    (vec (map bin-of col))))

(defn discretizer2
  "Returns [a b] where
    a = binned version of the data set (the last column is carried over
        unbinned)
    b = a seq with one matrix per binned column, each row pairing a binned
        value with the corresponding original value"
  [data]
  (let [issuper  (Transpose data)
        ; every column except the last one gets discretized
        unsuper  (butlast issuper)
        a        (bind-columns (matrix (Transpose (map discretize1 unsuper)))
                               (matrix (last issuper)))
        ; [binned-col original-col] pairs
        b0       (map vector (butlast (Transpose a)) unsuper)
        together (fn [one]
                   (matrix (map vector (first one) (second one))))
        b        (map together b0)]
    [a b]))

; 2. Group the new dataset by the class col, then choose one group at a
;    time to be "best" and the others "rest". The output is a series of
;    best-rest folds.

(defn my-best-rest
  "data = binned version = a

  Groups `data` by its last column and returns a vector of [best rest]
  pairs, one per group: `best` is the group itself and `rest` is a matrix
  built from the concatenation of every group that is not `=` to it."
  [data]
  (let [group-it0 (group-by data (- (ncol data) 1))
        ; a one-row group is wrapped in a vector
        ; NOTE(review): presumably so that seq-ing it yields the row
        ; rather than its elements - confirm against incanter's matrix seq
        group-it  (map #(if (= (nrow %) 1) [%] %) group-it0)]
    (vec (map (fn [best]
                [best
                 (matrix (apply concat (filter #(not= best %) group-it)))])
              group-it))))

; 3.
;    Get ranks for each val in each col -
;    but first - sort cols and compress (maybe not).
;    Anyway, return list of best bins.

(defn bin-rank
  "D = binned data set

  Ranks the value `val` of column index `col` for the `best` partition
  against the `rest` partition and returns [val rank], where

    pbest     = (nrow best) / (nrow D)
    prest     = (nrow rest) / (nrow D)
    likebestE = (fraction of `best` rows with `val` at `col`) * pbest
    likerestE = (fraction of `rest` rows with `val` at `col`) * prest
    rank      = likebestE^2 / (likebestE + likerestE)

  When `val` occurs in neither partition the denominator is zero; the rank
  is then 0.0 instead of failing on the division."
  [D val col best rest]
  (let [total     (nrow D)
        pbest     (/ (nrow best) total)
        prest     (/ (nrow rest) total)
        ; fraction of `rows` whose entry at `col` equals `val`; yields 0
        ; without dividing when nothing matches (covers empty `rows` too)
        freq-in   (fn [rows]
                    (let [hits (count (filter #(= (nth % col) val) rows))]
                      (if (zero? hits)
                        0
                        (/ hits (nrow rows)))))
        likebestE (* (freq-in best) pbest)
        likerestE (* (freq-in rest) prest)
        denom     (+ likebestE likerestE)
        rank      (if (zero? denom)
                    0.0
                    (/ (Math/pow likebestE 2) denom))]
    ; the rank is returned unscaled (an earlier variant used (* rank 100))
    [val rank]))

; 4. Get ranks for vals in one col for one class (best)

(defn ranks-for-one-col
  "Applies `bin-rank` to every value in `vals` for column index `col`,
  returning a lazy seq of [val rank] pairs."
  [D vals col best rest]
  (map #(bin-rank D % col best rest) vals))

; 5. Get ranks for vals in all cols except class col for one class (best)

(defn ranks-for-all-cols
  "all-vals = each col of vals
  D = binned data set

  Pairs each entry of `all-vals` with the column index at the same position
  in `cols` and returns a lazy seq with one `ranks-for-one-col` result per
  pair."
  [D all-vals cols best rest]
  (map #(ranks-for-one-col D %1 %2 best rest) all-vals cols))