|
| 1 | +(ns clj-tumblr-summarizer.ml |
| 2 | + (:require |
| 3 | + [clj-tumblr-summarizer.storage.fs :as fs] |
| 4 | + [clj-tumblr-summarizer.time :as time] |
| 5 | + [scicloj.ml.core :as ml] |
| 6 | + [scicloj.ml.metamorph :as mm] |
| 7 | + [scicloj.ml.dataset :as ds])) |
| 8 | + |
;; Dataset built from the `:tags` of the posts returned by
;; `fs/posts-in-time-range` for `(time/previous-month)`.
;; Every other key of a post is dropped before the dataset is created.
;; Each tag list is turned into a `{tag 1}` map and `:tags` is separated into
;; columns inferred from those maps (presumably one column per distinct tag --
;; confirm against the `separate-column` docs); missing values in all columns
;; are then replaced with 0.
;; The leading `@` derefs the var returned by `def`, so evaluating this form
;; yields the dataset itself rather than the var.
@(def tag-lists-ds
   (let [tag-rows (map (fn [post] (select-keys post [:tags]))
                       (fs/posts-in-time-range (time/previous-month)))]
     (-> (ds/dataset tag-rows)
         (ds/separate-column :tags :infer (fn [tags] (zipmap tags (repeat 1))))
         (ds/replace-missing :all :value 0))))
| 15 | + |
;; First DBSCAN argument handed to the clustering step in `pipe-fn`.
;; Author's note from experimenting: 1 => 4 clusters, 2 => 2 clusters.
(def min-nr-neighbors 1)

;; Second DBSCAN argument handed to the clustering step in `pipe-fn`.
;; Author's note from experimenting: any value > 1 => a single cluster.
(def radius 1)
| 18 | + |
| 19 | +;; (def ds (-> [{:tags ["app" "mobile" "travel"]} |
| 20 | +;; {:tags ["business"]} |
| 21 | +;; {:tags ["clojure" "library" "data processing"]} |
| 22 | +;; {:tags ["clojure" "tool" "docker" "devops"]} |
| 23 | +;; {:tags ["tool" "automation" "macos"]}] |
| 24 | +;; (ds/dataset) |
| 25 | +;; (ds/separate-column :tags :infer #(zipmap % (repeat 1))) |
| 26 | +;; (ds/replace-missing :all :value 0))) |
| 27 | + |
;; Metamorph pipeline with a single step: DBSCAN clustering whose result is
;; stored under `:cluster-id`. The map in front of the step gives it the
;; metamorph id `:tag-cluster`.
;; NOTE(review): the argument vector is assumed to be [min-points radius] in
;; that order -- confirm against the clustering step's documentation.
(def pipe-fn
  (ml/pipeline
   {:metamorph/id :tag-cluster}
   ;; `mm` is the alias for scicloj.ml.metamorph declared in the ns form.
   (mm/cluster :dbscan
               [min-nr-neighbors radius]
               :cluster-id)))
| 32 | + |
;; Context returned by running the pipeline over `tag-lists-ds` in `:fit` mode.
(def trained-ctx
  (pipe-fn {:metamorph/data tag-lists-ds
            :metamorph/mode :fit}))

;; Exploratory check: the distinct values of `:cluster-id` in the fitted data.
;; The context is threaded through the bare keyword `:metamorph/data`.
;; The earlier `(:metamorph/data trained-ctx)` step expanded to
;; `(:metamorph/data ctx trained-ctx)`, i.e. it used the context itself as the
;; not-found default, so a missing key would have passed the whole context map
;; on to `ds/select-columns`.
(-> trained-ctx
    :metamorph/data
    (ds/select-columns :cluster-id)
    (ds/unique-by :cluster-id))
| 41 | + |
| 42 | +;; (-> (ds/dataset tag-lists) (ds/fit-one-hot :tags)) |
| 43 | +;; (-> (ds/dataset tag-lists) |
| 44 | +;; ;(ds/add-column :id (range)) |
| 45 | +;; (ds/separate-column :tags :infer #(zipmap % (repeat 1)))) |
| 46 | + |
| 47 | + |
| 48 | +;; EXAMPLE CODE |
| 49 | +;; (def titanic-test |
| 50 | +;; (-> "https://github.com/scicloj/metamorph-examples/raw/main/data/titanic/test.csv" |
| 51 | +;; (ds/dataset {:key-fn keyword :parser-fn :string}) |
| 52 | +;; (ds/add-column :Survived [""] :cycle))) |
| 53 | + |
| 54 | +;; ;; construct pipeline function including Logistic Regression model |
| 55 | +;; (def pipe-fn |
| 56 | +;; (ml/pipeline |
| 57 | +;; (mm/select-columns [:Survived :Pclass]) |
| 58 | +;; (mm/add-column :Survived (fn [ds] (map #(case % "1" "yes" "0" "no" nil "") (:Survived ds)))) |
| 59 | +;; (mm/categorical->number [:Survived :Pclass]) |
| 60 | +;; (mm/set-inference-target :Survived) |
| 61 | +;; {:metamorph/id :model} |
| 62 | +;; (mm/model {:model-type :smile.classification/logistic-regression}))) |
| 63 | + |
| 64 | +;; ;; execute pipeline with train data including model in mode :fit |
| 65 | +;; (def trained-ctx |
| 66 | +;; (pipe-fn {:metamorph/data titanic-train |
| 67 | +;; :metamorph/mode :fit})) |
0 commit comments