(ns code.utils.pruners.sbbr1
  (:use (code.utils utils EqualFrequencyBinning))
  (:use (incanter core stats)))

; Note - datasets are stored as matrices

; The story goes...
; Given the dataset, bin each column,
; so new dataset is the binned version.
; If there are more than 2 classes
; let one represent best and the others
; represent rest.

;1111111111111111111111111111111111111111111111111111111111111111111111111
; 1. Bin dataset (have 2 versions stored)
;    a. replace original dataset with binned values (discretize1 and discretizer2)
;    b. store cols separately with binned vals

(defn discretize1
  "Replaces every value of `col` with the bin that `efb` pairs with it.

  `col` is sorted, the sorted values are passed to `efb`, and the two
  sequences are paired position by position. Each original value is then
  looked up in those pairs, so the result keeps the element order of `col`.
  When a value occurs several times, the bin paired with its first
  occurrence in sorted order is used for all of them. A value with no pair
  maps to nil.

  Returns a vector with one entry per element of `col`.
  NOTE(review): assumes `efb` returns one bin per sorted value - confirm."
  [col]
  (let [sorted (sort col)
        bins   (efb sorted)
        ;; value -> bin table that keeps the FIRST pairing of each value,
        ;; replacing a linear scan of the pairs for every element
        bin-of (reduce (fn [table [value bin]]
                         (if (contains? table value)
                           table
                           (assoc table value bin)))
                       {}
                       (map vector sorted bins))]
    (mapv bin-of col)))

(defn discretizer2
  "Bins every column of `data` except the last one with `discretize1`.

  Returns [a b]:
    a = binned version of the data set: the binned columns with the
        untouched last column of `data` bound on as the final column
    b = one two-column matrix per binned column, pairing each binned
        value (first column) with the actual value (second column)"
  [data]
  (let [columns  (Transpose data)
        features (butlast columns)
        a        (bind-columns (matrix (Transpose (map discretize1 features)))
                               (matrix (last columns)))
        b        (map (fn [binned actual]
                        (matrix (map vector binned actual)))
                      (butlast (Transpose a))
                      features)]
    [a b]))

;22222222222222222222222222222222222222222222222222222222222222222222222222
; 2. Group the new dataset by the class col
;    then choose one at a time to be best and
;    others, rest. So output will be a series
;    of best-rest folds

(defn my-best-rest
  "Splits `data` (the binned data set, i.e. `a` from `discretizer2`) into
  best-rest folds.

  The rows are grouped on the last column. Each group in turn becomes
  `best`, and all groups that are not equal to it are concatenated into one
  `rest` matrix. Returns a vector of [best rest] pairs, one per group.

  NOTE(review): `group-by` is called as (group-by matrix column-index),
  which is the matrix-grouping signature, not clojure.core/group-by -
  confirm it resolves to the Incanter function."
  [data]
  (let [class-col (- (ncol data) 1)
        ;; a group with a single row is wrapped in a vector, presumably so
        ;; it is handled as one row when groups are concatenated
        groups    (map #(if (= (nrow %) 1) [%] %)
                       (group-by data class-col))]
    (mapv (fn [best]
            [best (matrix (apply concat (filter #(not= best %) groups)))])
          groups)))

;3333333333333333333333333333333333333333333333333333333333333333333333333333333
; 3.
; Get ranks for each val in each col -
; but first - sort cols and compress (maybe not)
; Anyway, return list of best bins

(defn bin-rank
  "Scores how strongly `val` in column `col` points at the `best` rows.

  D    = data set; only its row count is used here
  val  = value to rank
  col  = index of the column the value is looked up in
  best = rows of the class currently treated as best
  rest = rows of all other classes

  For each of `best` and `rest` the fraction of rows whose `col` entry
  equals `val` is multiplied by that group's share of the rows of D. With
  b and r the two products, the rank is b^2 / (b + r).

  Returns [val rank]. When `val` occurs in neither group the rank is 0.0,
  which avoids a division by zero."
  [D val col best rest]
  (let [total     (nrow D)
        p-best    (/ (nrow best) total)
        p-rest    (/ (nrow rest) total)
        ;; fraction of `rows` whose `col` entry equals `val`; returning 0
        ;; for zero hits also keeps an empty group from being a divisor
        freq-in   (fn [rows]
                    (let [hits (count (filter #(= (nth % col) val) rows))]
                      (if (zero? hits)
                        0
                        (/ hits (nrow rows)))))
        like-best (* (freq-in best) p-best)
        like-rest (* (freq-in rest) p-rest)
        evidence  (+ like-best like-rest)
        rank      (if (zero? evidence)
                    0.0
                    (/ (Math/pow like-best 2) evidence))]
    ;; the original author left a scaled variant disabled: (* rank 100)
    [val rank]))

;4444444444444444444444444444444444444444444444444444444444444444444444444444444
; 4. Get ranks for vals in one col for one class (best)

(defn ranks-for-one-col
  "Ranks every value of `vals` in column `col` with `bin-rank`.
  Returns a lazy seq of [val rank] pairs in the order of `vals`."
  [D vals col best rest]
  (map (fn [v] (bin-rank D v col best rest)) vals))

;5555555555555555555555555555555555555555555555555555555555555555555555555555555
; 5.
; Get ranks for vals in all cols except class col for one class (best)

(defn ranks-for-all-cols
  "Ranks the values of several columns for one best-rest split.

  all-vals = one seq of values per column
  cols     = the column index belonging to each entry of all-vals
  D        = data set handed through to `bin-rank`

  Returns a lazy seq with one seq of [val rank] pairs per column."
  [D all-vals cols best rest]
  (map (fn [col-vals col] (ranks-for-one-col D col-vals col best rest))
       all-vals
       cols))

(defn ranks-for-one-class
  "Computes value ranks for one best-rest fold of `D`.

  D   = original data set
  idx = index of the best-rest fold to use

  Intermediate values:
    vals     = transposed binned data without the class col
    matched1 = per column, matrix of binned-val and actual-val
    matched2 = per column, matrix of binned-val and rank

  Returns [col-importance result]:
    col-importance = column indices ordered by the highest rank found in
                     each column, highest first
    result         = per column, the second column of matched1 bound to
                     the second column of matched2 (actual-val with rank)"
  [D idx]
  (let [[binned-D matched1] (discretizer2 D)
        my-cols   (range 0 (- (ncol D) 1))
        br        (nth (my-best-rest binned-D) idx)
        vals      (Transpose (sel binned-D :cols my-cols))
        matched2  (map matrix
                       (ranks-for-all-cols D vals my-cols (first br) (second br)))
        result    (map #(bind-columns (sel %1 :cols 1) (sel %2 :cols 1))
                       matched1
                       matched2)
        rank-cols (map second (map Transpose matched2))
        best-rank (map #(apply max %) rank-cols)
        ;; ascending sort followed by reverse, kept as is so that the order
        ;; of columns with equal rank does not change
        col-importance (map first
                            (reverse (sort-by second
                                              (map vector my-cols best-rank))))]
    [col-importance result]))

(defn top-k-ranks
  "Picks the `k` highest distinct ranks of every column.

  Returns [result max-k-ranks], where `result` is the second element of
  (ranks-for-one-class D idx) and `max-k-ranks` holds, per column, the `k`
  largest entries of the de-duplicated rank list. A column with fewer than
  `k` such entries prints an error message and contributes nil instead.

  NOTE(review): `compress` and `unique-compress` are defined outside this
  file; they are assumed to remove duplicate ranks - confirm."
  [D idx k]
  ;* if any errors k might be the problem - don't know how many unique
  ;  ranks present - k = 1 is safe
  (let [result       (second (ranks-for-one-class D idx))
        rank-cols    (map sort (map second (map Transpose result)))
        unique-ranks (map #(unique-compress (compress %)) rank-cols)
        max-k-ranks  (map (fn [ranks]
                            (if (< (count ranks) k)
                              (println "Error: your k var is to big")
                              (take k ranks)))
                          (map reverse unique-ranks))]
    [result max-k-ranks]))

(defn top-vals
  "For every column, collects the attribute values whose rank is one of that
  column's top `k` ranks (see `top-k-ranks`).

  Returns a vector with one seq of values per column. Values are grouped
  by rank in the order the ranks are listed, and keep row order within
  each group."
  [D idx k]
  (let [[att-vals top-ranks] (top-k-ranks D idx k)
        ;; first entries of the rows whose second entry equals `rank`
        vals-with-rank (fn [rows rank]
                         (keep (fn [row]
                                 (when (= rank (second row))
                                   (first row)))
                               rows))]
    (mapv (fn [rows ranks]
            (mapcat #(vals-with-rank rows %) ranks))
          att-vals
          top-ranks)))

(defn csl-for-one-class
  "Selects the rows of `D` that survive filtering on the `n` most important
  columns for fold `idx`, then keeps only rows whose last entry equals `idx`.

  The columns are visited in the order given by `ranks-for-one-class`. At
  each column only rows whose entry is one of that column's top values
  (from `top-vals` with `k`) are kept.

  NOTE(review): `idx` is used both as a position in the best-rest folds and
  as a class label compared with the last column - confirm that labels and
  fold positions coincide."
  [D idx k n]
  (let [col-order (first (ranks-for-one-class D idx))
        col-vals  (top-vals D idx k)
        rows      (Transpose (Transpose D))
        ;; rows of `data` whose `col` entry equals one of `wanted`,
        ;; grouped in the order of `wanted`
        keep-matching (fn [wanted col data]
                        (mapcat (fn [v] (filter #(= (nth % col) v) data))
                                wanted))
        narrowed  (reduce (fn [data col]
                            (keep-matching (nth col-vals col) col data))
                          rows
                          (take n col-order))]
    (filter #(= (last %) idx) narrowed)))

(defn extract-unique
  "Returns the elements of `lst` without repeats, keeping the first
  occurrence of each and the original order. Membership is decided by
  `member?`. Elements equal to the symbol none are dropped from the result,
  as before."
  [lst]
  (->> lst
       (reduce (fn [seen x]
                 (if (member? x seen)
                   seen
                   (conj seen x)))
               [])
       (remove #(= 'none %))))

(defn run-csl
  "Runs `csl-for-one-class` for every class label found in the last column
  of `D` and returns the selected rows, without repeats, as one matrix.

  k = number of top ranks per column, n = number of columns to filter on."
  [D k n] ; n = 1 - 4, k = 1
  (let [class-labels (unique-compress (compress (sort (last (Transpose D)))))
        per-class    (map #(csl-for-one-class D % k n) class-labels)
        selected     (remove nil? (apply concat per-class))]
    (matrix (extract-unique selected))))