Merge pull request #7 from innerNULL/dev

innerNULL · web-flow · commit b385826f145d · 2025-08-02T14:10:45.000+08:00
Fix Distributed Training and Metrics Calculation
diff --git a/README.md b/README.md
@@ -12,11 +12,9 @@ to make this as a general program for text multi-label classification task.
 ## Usage
 ### Python Env
 ```sh
-python -m venv ./_venv --copies
-source ./_venv/bin/activate
-python -m pip install --upgrade pip
-python -m pip install -r requirements.txt
-# deactivate
+micromamba env create -f environment.yaml -p ./_pyenv --yes
+micromamba activate ./_pyenv
+pip install -r requirements.txt
 ```
 ### Run Tests
 ```sh
diff --git a/environment.yaml b/environment.yaml
@@ -0,0 +1,11 @@
+name: pyenv
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.11
+  - setuptools<65
+  - gfortran_linux-64>=11.2.0   
+  - openblas>=0.3.18          
+  - ninja>=1.10.2
+  - openmpi>=5.0.8
diff --git a/requirements.txt b/requirements.txt
@@ -1,12 +1,12 @@
-mypy==0.982
-torch==2.0.0
-#torchmetrics==1.2.0
-sentencepiece==0.1.96
-duckdb==0.9.1
-pandas==2.0.0
-numpy==1.24.2
-scikit-learn==1.2.2
-transformers==4.28.1
-onnx==1.14.0
-onnxruntime==1.15.0
-ray[train]==2.7.0
+mypy==1.17.0
+torch>=2.0.0,<=2.7.1
+sentencepiece>=0.1.96
+duckdb>=0.9.1,<=1.3.2
+pandas>=2.0.0,<=2.3.1
+numpy>=1.24.2,<=2.3.1
+scikit-learn>=1.2.2,<=1.7.1
+transformers>=4.53.1,<=4.54.1
+onnx>=1.14.0,<=1.18.0
+onnxruntime>=1.15.0,<=1.22.1
+ray[train]>=2.7.0,<=2.48.0
+xformers==0.0.31.post1
diff --git a/src/plm_icd_multi_label_classifier/metrics.py b/src/plm_icd_multi_label_classifier/metrics.py
@@ -5,20 +5,41 @@
 
 import pdb
 import torch
-from typing import Dict, Optional
+from typing import List, Dict, Optional
 from torch import Tensor, IntTensor, FloatTensor
 
 
 def metrics_func(
     preds_one_hot: IntTensor, label_one_hot: IntTensor, bias: float=1e-6
 ) -> float:
+    pred_nonzero_idx: List[int] | int = torch.nonzero(preds_one_hot.sum(dim=0))\
+        .squeeze()\
+        .tolist()
+    gt_nonzero_idx: List[int] | int = torch.nonzero(label_one_hot.sum(dim=0))\
+        .squeeze()\
+        .tolist()
+    pred_nonzero_idx = (
+        [pred_nonzero_idx] if isinstance(pred_nonzero_idx, int) 
+        else pred_nonzero_idx
+    )
+    gt_nonzero_idx = (
+        [gt_nonzero_idx] if isinstance(gt_nonzero_idx, int)
+        else gt_nonzero_idx
+    )
+    target_label_idx: List[int] = sorted(  # label columns with a non-zero column sum in preds or ground truth
+        list(set(pred_nonzero_idx + gt_nonzero_idx))
+    )
+    preds_one_hot = preds_one_hot[:, target_label_idx]
+    label_one_hot = label_one_hot[:, target_label_idx]
+
     # 1 represents correctly predicted positive class
     pred_pos_correctness: IntTensor = preds_one_hot.mul(label_one_hot)
     
-    correct_pos_pred_cnt: IntTensor = pred_pos_correctness.sum(dim=1)
-    sample_label_cnt: IntTensor = label_one_hot.sum(dim=1) + bias
-    pred_label_cnt: IntTensor = preds_one_hot.sum(dim=1) + bias
-
+    # Label level statistics
+    correct_pos_pred_cnt: IntTensor = pred_pos_correctness.sum(dim=0)
+    sample_label_cnt: IntTensor = label_one_hot.sum(dim=0) + bias
+    pred_label_cnt: IntTensor = preds_one_hot.sum(dim=0) + bias
+    
     macro_recall: FloatTensor = correct_pos_pred_cnt.div(sample_label_cnt).mean()
     macro_precision: FloatTensor = correct_pos_pred_cnt.div(pred_label_cnt).mean()
     macro_f1: FloatTensor = 2 * macro_recall * macro_precision / (macro_recall + macro_precision + bias)
diff --git a/src/plm_icd_multi_label_classifier/model.py b/src/plm_icd_multi_label_classifier/model.py
@@ -21,7 +21,10 @@ def __init__(self,
         super().__init__()
         
         # Language model
-        self._lm: Module = AutoModel.from_pretrained(lm) if isinstance(lm, str) else lm
+        self._lm: Module = (  # NOTE(review): trust_remote_code=True lets from_pretrained run Python code shipped with the checkpoint repo; only pass trusted model ids
+            AutoModel.from_pretrained(lm, trust_remote_code=True) if isinstance(lm, str) 
+            else lm  # a non-str argument is used directly as the language model
+        )
 
         # Dimension info
         self._label_num: int = label_num
diff --git a/train.py b/train.py
@@ -9,8 +9,10 @@
 import tempfile
 import json
 import torch
+import random
 import ray.train
 import torch.nn.functional as F
+import numpy as np
 from typing import Dict
 from transformers import AutoTokenizer
 from torch import device
@@ -28,6 +30,9 @@
 from src.plm_icd_multi_label_classifier.metrics import metrics_func, topk_metrics_func
 
 
+THRESHOLD: float = 0.6
+
+
 def init_with_ckpt(net: PlmMultiLabelEncoder, ckpt_root_path: str, engine: str) -> None:
     ckpts: List[str] = [x for x in os.listdir(ckpt_root_path) if x != "bak"]
     if len(ckpts) == 0:
@@ -94,7 +99,7 @@ def eval(
 
         logits: FloatTensor = torch.concat(all_logits, dim=0)
         output_label_probs: FloatTensor = torch.sigmoid(logits)
-        output_one_hot: FloatTensor = (output_label_probs > 0.5).float()
+        output_one_hot: FloatTensor = (output_label_probs > THRESHOLD).float()
         label_one_hot: FloatTensor = torch.concat(all_label_one_hots, dim=0)
         # Loss
         loss: float = float(
@@ -104,9 +109,9 @@ def eval(
         prob50_metrics: Dict[str, float] = metrics_func(
             output_one_hot.int(), label_one_hot.int()
         )
-        top5_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=5) 
-        top8_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=8)
-        top15_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=15)
+        #top5_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=5) 
+        #top8_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=8)
+        #top15_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=15)
 
         out = {
             "loss": round(loss, 8),  
@@ -116,29 +121,31 @@ def eval(
             "macro_recall": round(prob50_metrics["macro_recall"], 4), 
             "macro_precision": round(prob50_metrics["macro_precision"], 4),
             "macro_f1": round(prob50_metrics["macro_f1"], 4), 
-            "micro_recall@5": round(top5_metrics["micro_recall@5"], 4), 
-            "micro_precision@5": round(top5_metrics["micro_precision@5"], 4), 
-            "micro_f1@5": round(top5_metrics["micro_f1@5"], 4), 
-            "macro_recall@5": round(top5_metrics["macro_recall@5"], 4), 
-            "macro_precision@5": round(top5_metrics["macro_precision@5"], 4), 
-            "macro_f1@5": round(top5_metrics["macro_f1@5"], 4), 
-            "micro_recall@8": round(top8_metrics["micro_recall@8"], 4), 
-            "micro_precision@8": round(top8_metrics["micro_precision@8"], 4), 
-            "micro_f1@8": round(top8_metrics["micro_f1@8"], 4), 
-            "macro_recall@8": round(top8_metrics["macro_recall@8"], 4), 
-            "macro_precision@8": round(top8_metrics["macro_precision@8"], 4), 
-            "macro_f1@8": round(top8_metrics["macro_f1@8"], 4), 
-            "micro_recall@15": round(top15_metrics["micro_recall@15"], 4), 
-            "micro_precision@15": round(top15_metrics["micro_precision@15"], 4), 
-            "micro_f1@15": round(top15_metrics["micro_f1@15"], 4), 
-            "macro_recall@15": round(top15_metrics["macro_recall@15"], 4), 
-            "macro_precision@15": round(top15_metrics["macro_precision@15"], 4), 
-            "macro_f1@15": round(top15_metrics["macro_f1@15"], 4) 
+            #"micro_recall@5": round(top5_metrics["micro_recall@5"], 4), 
+            #"micro_precision@5": round(top5_metrics["micro_precision@5"], 4), 
+            #"micro_f1@5": round(top5_metrics["micro_f1@5"], 4), 
+            #"macro_recall@5": round(top5_metrics["macro_recall@5"], 4), 
+            #"macro_precision@5": round(top5_metrics["macro_precision@5"], 4), 
+            #"macro_f1@5": round(top5_metrics["macro_f1@5"], 4), 
+            #"micro_recall@8": round(top8_metrics["micro_recall@8"], 4), 
+            #"micro_precision@8": round(top8_metrics["micro_precision@8"], 4), 
+            #"micro_f1@8": round(top8_metrics["micro_f1@8"], 4), 
+            #"macro_recall@8": round(top8_metrics["macro_recall@8"], 4), 
+            #"macro_precision@8": round(top8_metrics["macro_precision@8"], 4), 
+            #"macro_f1@8": round(top8_metrics["macro_f1@8"], 4), 
+            #"micro_recall@15": round(top15_metrics["micro_recall@15"], 4), 
+            #"micro_precision@15": round(top15_metrics["micro_precision@15"], 4), 
+            #"micro_f1@15": round(top15_metrics["micro_f1@15"], 4), 
+            #"macro_recall@15": round(top15_metrics["macro_recall@15"], 4), 
+            #"macro_precision@15": round(top15_metrics["macro_precision@15"], 4), 
+            #"macro_f1@15": round(top15_metrics["macro_f1@15"], 4) 
         }
     return out
 
 def train_func(configs: Dict) -> None:
     torch.manual_seed(configs["random_seed"])
+    random.seed(configs["random_seed"])
+    np.random.seed(configs["random_seed"])
 
     device: device = None
     if configs["training_engine"] == "torch":
@@ -233,8 +240,10 @@ def train_func(configs: Dict) -> None:
                 elif configs["training_engine"] == "ray":
                     if ray.train.get_context().get_world_rank() == 0:
                         open(os.path.join(ckpt_dir, "train.json"), "w").write(json.dumps(configs))
-                        torch.save(model.module.state_dict(), os.path.join(ckpt_dir, "model.pt"))
-
+                        try:
+                            torch.save(model.module.state_dict(), os.path.join(ckpt_dir, "model.pt"))
+                        except AttributeError:  # model is not DDP-wrapped, so it has no .module; real save errors must propagate
+                            torch.save(model.state_dict(), os.path.join(ckpt_dir, "model.pt"))
             global_step_id += 1
 
     final_ckpt_dir: str = os.path.join(configs["ckpt_dir"], "final")
@@ -246,7 +255,10 @@ def train_func(configs: Dict) -> None:
     elif configs["training_engine"] == "ray":
         if ray.train.get_context().get_world_rank() == 0:
             open(os.path.join(final_ckpt_dir, "train.json"), "w").write(json.dumps(configs))
-            torch.save(model.module.state_dict(), os.path.join(final_ckpt_dir, "model.pt"))
+            try:
+                torch.save(model.module.state_dict(), os.path.join(final_ckpt_dir, "model.pt"))
+            except AttributeError:  # model is not DDP-wrapped, so it has no .module; fall back to the bare model
+                torch.save(model.state_dict(), os.path.join(final_ckpt_dir, "model.pt"))
 
 
 if __name__ == "__main__":
@@ -256,7 +268,9 @@ def train_func(configs: Dict) -> None:
     if os.path.exists(train_conf["hf_lm"]):
         train_conf["hf_lm"] = os.path.abspath(train_conf["hf_lm"])
     print("Training config:\n{}".format(train_conf))
-
+    
+    if train_conf.get("hf_key"):  # optional key: configs without a token (or with "") must still run
+        os.environ["HF_TOKEN"] = train_conf["hf_key"]
     os.system("mkdir -p %s" % train_conf["ckpt_dir"])
 
     if train_conf["training_engine"] == "torch":
diff --git a/train_mimic3_icd.json b/train_mimic3_icd.json
@@ -2,6 +2,7 @@
   "chunk_size": 256,
   "chunk_num": 3,
   "hf_lm": "distilbert-base-uncased",
+  "hf_key": "",
   "lm_hidden_dim": 768,
   "data_dir": "./_data/etl/mimic3",
   "training_engine": "ray",