;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; LDA and QDA classifiers from chapter 4 of Elements of Statistical Learning ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; LDA ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (use '(incanter core stats charts io)) (def training (to-matrix (read-dataset "http://bit.ly/464h4h" :header true))) (def testing (to-matrix (read-dataset "http://bit.ly/1btCei" :header true))) (def K 11) (def p 10) (def N (nrow training)) (def group-counts (map nrow (group-by training 1))) ;; estimate the prior probabilities for each cluster (def prior-probs (div group-counts N)) ;; estimate the centroids for each cluster (def cluster-centroids (matrix (for [x_k (group-by training 1 :cols (range 2 12))] (map mean (trans x_k))))) ;; estimate the covariance matrix to be used for all clusters (def cluster-cov-mat (let [groups (group-by training 1 :cols (range 2 12))] (reduce plus (map (fn [group centroid n] (reduce plus (map #(div (mmult (trans (minus % centroid)) (minus % centroid)) (- N K)) group))) groups cluster-centroids group-counts)))) ;; calculate the inverses of the cluster covariance matrices (def inv-cluster-cov-mat (solve cluster-cov-mat)) ;; define the linear discriminant function (ldf) (defn ldf [x Sigma-inv mu_k pi_k] (+ (mmult x Sigma-inv (trans mu_k)) (- (mult 1/2 (mmult mu_k Sigma-inv (trans mu_k)))) (log pi_k))) ;; define a function to calculate the linear quadratic scores. (defn calculate-scores ([data inv-cov-mat centroids priors] (matrix (pmap (fn [row] (pmap (partial ldf row inv-cov-mat) centroids priors)) (sel data :cols (range 2 12)))))) ;; calculate the scores for the training data (def training-lda-scores (calculate-scores training inv-cluster-cov-mat cluster-centroids prior-probs)) ;; calculate the scores for the testing data (def testing-lda-scores (calculate-scores testing inv-cluster-cov-mat cluster-centroids prior-probs)) ;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-lda-scores))) ;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-lda-scores))) (defn max-index "Returns the index of the maximum value in the given sequence." ([x] (let [max-x (reduce max x) n (length x)] (loop [i 0] (if (= (nth x i) max-x) i (recur (inc i))))))) ;; define a function to calculate the error rate (defn error-rate [data scores] (/ (sum (map #(if (= %1 %2) 0 1) (sel data :cols 1) (plus 1 (map max-index scores)))) (nrow data))) ;; calculate the error rate for the training data (0.316) (error-rate training training-lda-scores) ;; calculate the error rate for the testing data (0.56) (error-rate testing testing-lda-scores) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; QDA ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (use '(incanter core stats charts io)) ;(def training (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.train.txt" :header true))) (def training (to-matrix (read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.train" :header true))) ;(def testing (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.test.txt" :header true))) (def testing (to-matrix (read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.test" :header true))) (def K 11) (def p 10) (def N (nrow training)) (def group-counts (map nrow (group-by training 1))) ;; estimate the prior probabilities for each cluster (def prior-probs (div group-counts N)) ;; estimate the centroids for each cluster (def cluster-centroids (matrix (for [x_k (group-by training 1 :cols (range 2 12))] (map mean (trans x_k))))) ;;------------------------------------------------------------------------------ ;; CALCULATE THE K COVARIANCE MATRICES NECESSARY FOR QDA ;;------------------------------------------------------------------------------ ;; estimate the covariance matrices for each cluster (def cluster-cov-mats (let [groups (group-by training 1 :cols (range 2 12))] (map (fn [group centroid n] (reduce plus (map #(div (mmult (trans (minus % centroid)) (minus % centroid)) (dec n)) group))) groups cluster-centroids group-counts))) ;; calculate the inverses of the cluster covariance matrices (def inv-cluster-cov-mats (map solve cluster-cov-mats)) ;; define the quadratic discriminant function (defn qdf [x Sigma_k Sigma-inv_k mu_k pi_k] (+ (- (mult 1/2 (log (det Sigma_k)))) (- (mult 1/2 (mmult (minus x mu_k) Sigma-inv_k (trans (minus x mu_k))))) (log pi_k))) (defn calculate-scores ([data cov-mats inv-cov-mats centroids priors] (matrix (pmap (fn [row] (pmap (partial qdf row) cov-mats inv-cov-mats centroids priors)) (sel data :cols (range 2 12)))))) ;; calculate the scores for the training data (def training-qda-scores (calculate-scores training cluster-cov-mats inv-cluster-cov-mats cluster-centroids prior-probs)) ;; calculate the scores for the testing data (def testing-qda-scores (calculate-scores testing cluster-cov-mats inv-cluster-cov-mats cluster-centroids prior-probs)) ;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-qda-scores))) ;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-qda-scores))) (defn max-index "Returns the index of the maximum value in the given sequence." ([x] (let [max-x (reduce max x) n (length x)] (loop [i 0] (if (= (nth x i) max-x) i (recur (inc i))))))) ;; define a function to calculate the error rate (defn error-rate [data scores] (/ (sum (map #(if (= %1 %2) 0 1) (sel data :cols 1) (plus 1 (map max-index scores)))) (nrow data))) ;; calculate the error rate for the training data (error-rate training training-qda-scores) ;; calculate the error rate for the testing data (error-rate testing testing-qda-scores) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; QDA w/ Eigenvalue Decomposition ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (use '(incanter core stats charts io)) ;(def training (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.train.txt" :header true))) (def training (to-matrix (read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.train" :header true))) ;(def testing (to-matrix (read-dataset "/Users/dliebke/Desktop/esl/vowel.test.txt" :header true))) (def testing (to-matrix (read-dataset "http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/vowel.test" :header true))) (def K 11) (def p 10) (def N (nrow training)) (def group-counts (map nrow (group-by training 1))) ;; estimate the prior probabilities for each cluster (def prior-probs (div group-counts N)) ;; estimate the centroids for each cluster (def cluster-centroids (matrix (for [x_k (group-by training 1 :cols (range 2 12))] (map mean (trans x_k))))) ;; estimate the covariance matrices for each cluster (def cluster-cov-matrices (let [groups (group-by training 1 :cols (range 2 12))] (map (fn [group centroid n] (reduce plus (map #(div (mmult (trans (minus % centroid)) (minus % centroid)) (dec n)) group))) groups cluster-centroids group-counts))) ;;------------------------------------------------------------------------------ ;; ADDED TO QDA EXAMPLE TO IMPROVE PERFORMANCE ;;------------------------------------------------------------------------------ ;; extract the eigenvalues and eigenvectors from the covariance matrices ;; for each cluster, to improve performance (def Sigma-decomp (map decomp-eigenvalue cluster-cov-matrices )) (def D (map #(diag (:values %)) Sigma-decomp)) (def U (map #(:vectors %) Sigma-decomp)) ;;------------------------------------------------------------------------------ ;; CHANGED FROM QDA EXAMPLE ;;------------------------------------------------------------------------------ ;; define the quadratic discriminant function using the eigenvalues and eigenvectors (defn qdf [x D_k U_k mu_k pi_k] (+ (- (mult 1/2 (sum (map log (diag D_k))))) (- (mult 1/2 (mmult (trans (mmult (trans U_k) (trans (minus x mu_k)))) (solve D_k) (mmult (trans U_k) (trans (minus x mu_k)))))) (log pi_k))) ;;------------------------------------------------------------------------------ ;; END OF CHANGES ;;------------------------------------------------------------------------------ ;; define a function to calculate the quadratic discriminant scores (defn calculate-scores ([data D U centroids priors] (matrix (pmap (fn [row] (pmap (partial qdf row) D U centroids priors)) (sel data :cols (range 2 12)))))) ;; calculate the scores for each row of the training data set across all 11 clusters (def training-qda-scores (calculate-scores training D U cluster-centroids prior-probs)) ;; calculate the scores for each row of the testing data set across all 11 clusters (def testing-qda-scores (calculate-scores testing D U cluster-centroids prior-probs)) ;;(bind-columns (sel training :cols 1) (plus 1 (map max-index training-qda-scores))) ;;(bind-columns (sel testing :cols 1) (plus 1 (map max-index testing-qda-scores))) (defn max-index "Returns the index of the maximum value in the given sequence." ([x] (let [max-x (reduce max x) n (length x)] (loop [i 0] (if (= (nth x i) max-x) i (recur (inc i))))))) ;; define a function to calculate the error rate (defn error-rate [data scores] (/ (sum (map #(if (= %1 %2) 0 1) (sel data :cols 1) (plus 1 (map max-index scores)))) (nrow data))) ;; calculate the error rate for the training data (error-rate training training-qda-scores) ;; calculate the error rate for the testing data (error-rate testing testing-qda-scores)