Commit 9b667b8

Merge pull request #9 from innerNULL/dev
Batch Fixs and Upgrades
2 parents bba88b7 + a17e753

File tree

7 files changed: +238 −108 lines changed


README.md

Lines changed: 49 additions & 1 deletion
````diff
@@ -21,7 +21,55 @@ pip install -r requirements.txt
 python -m pytest ./test --cov=./src/plm_icd_multi_label_classifier --durations=0 -v
 ```
 
-### ETL
+### Custom Dataset Preparation
+The training dataset should be a directory with the following structure:
+```
+├── dev.jsonl
+├── dict.json
+└── train.jsonl
+
+0 directories, 3 files
+```
+
+`train.jsonl` and `dev.jsonl` are the training and validation datasets, both in JSON Lines
+format. Each JSON record should contain at least 2 fields, holding the input text and the
+output label names respectively. The following is an example:
+```
+{
+    "input_text": "...",
+    "labels": "label_1, label_5, ..., label_m"
+}
+```
+Based on this sample data format, you should have the following settings in your config:
+```
+{
+    ...
+    "data_dir": "your dataset directory path",
+    "text_col": "input_text",
+    "label_col": "labels",
+    ...
+}
+```
+
+`dict.json` holds the bidirectional mapping between label names and IDs; its format is:
+```
+{
+    "label2id": {
+        "label_0": 0,
+        "label_1": 1,
+        ...
+        "label_n": n
+    },
+    "id2label": {
+        "0": "label_0",
+        "1": "label_1",
+        "2": "label_2",
+        ...
+        "n": "label_n"
+    }
+}
+```
+Since each label ID is also used as an index into the one-hot label vector, the IDs must
+start from 0.
+
+
+### (MIMIC3 Dataset Preparation)
 The ETL contains the following steps:
 * Origin JSON line dataset preparation
 * Transform the JSON line file to a **limited** JSON line file, which means all `list` or `dict`
````
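For reference, `dict.json` can be derived directly from `train.jsonl`. The following is a minimal sketch, not part of this commit; the `build_label_dict` helper name and the comma delimiter are assumptions (use whatever splitter your labels need):

```python
# Hypothetical helper (not part of this commit): derive dict.json from train.jsonl.
# Assumes each record keeps its label names under "labels", separated by commas.
import json

def build_label_dict(train_path: str, out_path: str,
                     label_col: str = "labels", splitter: str = ",") -> None:
    names: set = set()
    with open(train_path, "r") as f:
        for line in f:
            record = json.loads(line)
            names.update(x.strip() for x in record[label_col].split(splitter))
    ordered = sorted(names)
    # Label IDs double as one-hot indices, so they start at 0 and stay contiguous.
    payload = {
        "label2id": {name: i for i, name in enumerate(ordered)},
        "id2label": {str(i): name for i, name in enumerate(ordered)},
    }
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)

build_label_dict("train.jsonl", "dict.json")
```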

src/plm_icd_multi_label_classifier/data.py

Lines changed: 16 additions & 6 deletions
```diff
@@ -20,18 +20,27 @@
 
 class TextOnlyDataset(Dataset):
     def __init__(self,
-        data_path: str, data_dict_path: str, tokenizer: AutoTokenizer,
-        text_col: str="text", label_col: str="label", data_format: int="csv",
-        chunk_size: int=512, chunk_num: int=2
+        data_path: str,
+        data_dict_path: str,
+        tokenizer: AutoTokenizer,
+        text_col: str="text",
+        label_col: str="label",
+        data_format: str="csv",
+        chunk_size: int=512,
+        chunk_num: int=2,
+        label_splitter: str=","
     ):
         self.data_path: str = data_path
         self.data_dict: Dict[str, Dict] = json.loads(open(data_dict_path, "r").read())
         self.text_col: str = text_col
         self.label_col: str = label_col
         self.data: List[Dict] = []
         self.model_ctx: PlmIcdCtx = PlmIcdCtx().init(
-            data_dict_path=data_dict_path, lm_tokenizer=tokenizer,
-            chunk_size=chunk_size, chunk_num=chunk_num
+            data_dict_path=data_dict_path,
+            lm_tokenizer=tokenizer,
+            chunk_size=chunk_size,
+            chunk_num=chunk_num,
+            label_splitter=label_splitter
         )
 
         if data_format == "csv":
@@ -45,7 +54,8 @@ def __init__(self,
         # using customized dev/test data to do evaluation.
         for i, record in enumerate(self.data):
             curr_filtered_label: List[str] = [
-                x for x in record[label_col].split(",") if x in self.data_dict["label2id"]
+                x for x in record[label_col].split(label_splitter)
+                if x in self.data_dict["label2id"]
             ]
             if len(curr_filtered_label) == 0:
                 self.data[i] = None
```
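As a quick illustration of the new `label_splitter` parameter, here is a hypothetical construction of the dataset; the file paths, the model name, and the `"jsonl"` format value are assumptions, not taken from the repo:

```python
# Hypothetical usage sketch; paths, model name, and the "jsonl" format value
# are illustrative assumptions.
from transformers import AutoTokenizer
from plm_icd_multi_label_classifier.data import TextOnlyDataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = TextOnlyDataset(
    data_path="data/train.jsonl",
    data_dict_path="data/dict.json",
    tokenizer=tokenizer,
    text_col="input_text",
    label_col="labels",
    data_format="jsonl",    # assumed value for the JSON Lines branch
    chunk_size=512,
    chunk_num=2,
    label_splitter=","      # new in this commit: configurable label delimiter
)
```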
src/plm_icd_multi_label_classifier/eval.py

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
```python
# -*- coding: utf-8 -*-
# file: eval.py
# date: 2025-08-04


import json
import torch
import torch.nn.functional as F
from typing import Dict, List
from torch import device
from torch import LongTensor, FloatTensor, IntTensor
from torch.utils.data import DataLoader

from .model import PlmMultiLabelEncoder
from .metrics import metrics_func, topk_metrics_func


THRESHOLD: float = 0.6


def evaluation(
    model: PlmMultiLabelEncoder,
    dataloader: DataLoader,
    device: device=None,
    max_sample: int=10000,
    label_confidence_threshold: float=THRESHOLD,
    verbose: bool=False
) -> Dict[str, float]:
    out: Dict[str, float] = {}
    total_cnt: int = 0
    all_logits: List[FloatTensor] = []
    all_label_one_hots: List[FloatTensor] = []

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            curr_label_one_hot: FloatTensor = None
            curr_text_ids: LongTensor = None
            curr_attn_masks: LongTensor = None

            curr_text_ids, curr_attn_masks, curr_label_one_hot = batch

            if device is not None:
                curr_label_one_hot = curr_label_one_hot.to(device)
                curr_text_ids = curr_text_ids.to(device)
                curr_attn_masks = curr_attn_masks.to(device)

            curr_logits: FloatTensor = model(curr_text_ids, curr_attn_masks)
            all_logits.append(curr_logits)
            all_label_one_hots.append(curr_label_one_hot)

            total_cnt += curr_text_ids.shape[0]
            if total_cnt >= max_sample:
                break

    logits: FloatTensor = torch.concat(all_logits, dim=0)
    output_label_probs: FloatTensor = torch.sigmoid(logits)
    output_one_hot: FloatTensor = (
        (output_label_probs > label_confidence_threshold).float()
    )
    label_one_hot: FloatTensor = torch.concat(all_label_one_hots, dim=0)
    # Loss
    loss: float = float(
        F.binary_cross_entropy(output_label_probs, label_one_hot).cpu()
    )
    # Metrics
    prob50_metrics: Dict[str, float] = metrics_func(
        output_one_hot.int(), label_one_hot.int()
    )
    #top5_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=5)
    #top8_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=8)
    #top15_metrics: Dict[str, float] = topk_metrics_func(logits, label_one_hot, top_k=15)

    out = {
        "loss": round(loss, 8),
        "micro_recall": round(prob50_metrics["micro_recall"], 4),
        "micro_precision": round(prob50_metrics["micro_precision"], 4),
        "micro_f1": round(prob50_metrics["micro_f1"], 4),
        "macro_recall": round(prob50_metrics["macro_recall"], 4),
        "macro_precision": round(prob50_metrics["macro_precision"], 4),
        "macro_f1": round(prob50_metrics["macro_f1"], 4),
        #"micro_recall@5": round(top5_metrics["micro_recall@5"], 4),
        #"micro_precision@5": round(top5_metrics["micro_precision@5"], 4),
        #"micro_f1@5": round(top5_metrics["micro_f1@5"], 4),
        #"macro_recall@5": round(top5_metrics["macro_recall@5"], 4),
        #"macro_precision@5": round(top5_metrics["macro_precision@5"], 4),
        #"macro_f1@5": round(top5_metrics["macro_f1@5"], 4),
        #"micro_recall@8": round(top8_metrics["micro_recall@8"], 4),
        #"micro_precision@8": round(top8_metrics["micro_precision@8"], 4),
        #"micro_f1@8": round(top8_metrics["micro_f1@8"], 4),
        #"macro_recall@8": round(top8_metrics["macro_recall@8"], 4),
        #"macro_precision@8": round(top8_metrics["macro_precision@8"], 4),
        #"macro_f1@8": round(top8_metrics["macro_f1@8"], 4),
        #"micro_recall@15": round(top15_metrics["micro_recall@15"], 4),
        #"micro_precision@15": round(top15_metrics["micro_precision@15"], 4),
        #"micro_f1@15": round(top15_metrics["micro_f1@15"], 4),
        #"macro_recall@15": round(top15_metrics["macro_recall@15"], 4),
        #"macro_precision@15": round(top15_metrics["macro_precision@15"], 4),
        #"macro_f1@15": round(top15_metrics["macro_f1@15"], 4)
    }
    if verbose:
        out["verbose"] = {}
        out["verbose"]["pred_one_hot"] = output_one_hot.int().tolist()
        out["verbose"]["gt_one_hot"] = label_one_hot.int().tolist()
    return out
```
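A hypothetical call site for the new `evaluation` function; the trained model and dev dataset here are stand-ins, not code from the repo:

```python
# Hypothetical usage sketch; `model` is assumed to be a trained
# PlmMultiLabelEncoder and `dev_dataset` a TextOnlyDataset built as shown above.
import torch
from torch.utils.data import DataLoader
from plm_icd_multi_label_classifier.eval import evaluation

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)
metrics = evaluation(
    model.to(device),
    dev_loader,
    device=device,
    max_sample=10000,
    label_confidence_threshold=0.6,
    verbose=False,
)
print(metrics["micro_f1"], metrics["macro_f1"], metrics["loss"])
```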

src/plm_icd_multi_label_classifier/model.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -15,8 +15,12 @@
 class PlmMultiLabelEncoder(Module):
     def __init__(self,
         label_num: int,
-        lm: Union[str, Module], lm_embd_dim: int, chunk_size: int=128, chunk_num: int=5,
-        first_attn_hidden_dim: int=512
+        lm: Union[str, Module],
+        lm_embd_dim: int,
+        chunk_size: int=128,
+        chunk_num: int=5,
+        first_attn_hidden_dim: int=512,
+        freeze_lm: bool=False
     ):
         super().__init__()
 
@@ -25,6 +29,8 @@ def __init__(self,
             AutoModel.from_pretrained(lm, trust_remote_code=True) if isinstance(lm, str)
             else lm
         )
+        if freeze_lm:
+            self._lm.requires_grad_(False)
 
         # Dimension info
         self._label_num: int = label_num
```
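With `freeze_lm=True`, gradients are disabled for every backbone parameter, so only the layers above the LM are updated. A hypothetical construction (the label count and model name are illustrative):

```python
# Hypothetical usage sketch; label_num and the HF model name are illustrative.
from plm_icd_multi_label_classifier.model import PlmMultiLabelEncoder

model = PlmMultiLabelEncoder(
    label_num=50,
    lm="bert-base-uncased",
    lm_embd_dim=768,
    chunk_size=128,
    chunk_num=5,
    first_attn_hidden_dim=512,
    freeze_lm=True,   # new in this commit: keep the backbone LM fixed
)
# Sanity check: no backbone parameter should require gradients.
assert not any(p.requires_grad for p in model._lm.parameters())
```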

src/plm_icd_multi_label_classifier/model_ctx.py

Lines changed: 15 additions & 4 deletions
```diff
@@ -21,20 +21,30 @@ def __init__(self):
         self.label2id: Dict[str, int] = {}
         self.chunk_size: int = -1
         self.chunk_num: int = -1
+        self.label_splitter: str = ","
 
     def init_by_train_config(self, train_conf_path: str):
         train_conf: Dict = json.loads(open(train_conf_path, "r").read())
         data_dict_path: str = os.path.join(train_conf["data_dir"], "dict.json")
         lm_tokenizer: str = train_conf["hf_lm"]
         chunk_size: int = train_conf["chunk_size"]
         chunk_num: int = train_conf["chunk_num"]
-
-        return self.init(data_dict_path, lm_tokenizer, chunk_size, chunk_num)
+        label_splitter: str = train_conf["label_splitter"]
+
+        return self.init(
+            data_dict_path,
+            lm_tokenizer,
+            chunk_size,
+            chunk_num,
+            label_splitter
+        )
 
     def init(self,
         data_dict_path: str,
         lm_tokenizer: Union[str, AutoTokenizer],
-        chunk_size: int, chunk_num: int
+        chunk_size: int,
+        chunk_num: int,
+        label_splitter: str
     ):
         self.data_dict = json.loads(open(data_dict_path, "r").read())
         self.id2label = {int(k): v for k, v in self.data_dict["id2label"].items()}
@@ -44,6 +54,7 @@ def init(self,
             else lm_tokenizer
         self.chunk_size = chunk_size
         self.chunk_num = chunk_num
+        self.label_splitter = label_splitter
         return self
 
     def json_inputs2model_inf_inputs(self,
@@ -64,7 +75,7 @@ def json_inputs2model_train_inputs(self,
         model_inputs: Dict[str, Tensor] = self.json_inputs2model_inf_inputs(
             json_inputs, text_fields
         )
-        label_names: List[str] = json_inputs[label_field].split(",")
+        label_names: List[str] = json_inputs[label_field].split(self.label_splitter)
         label_ids: List[int] = [
             self.label2id[x] for x in label_names if x in self.label2id
         ]
```
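Note that `init_by_train_config` reads `label_splitter` with direct indexing, so existing training configs need the new key. A sketch of the relevant config fields follows; the keys are the ones the code reads, while the values are illustrative:

```
{
    ...
    "data_dir": "your dataset directory path",
    "hf_lm": "bert-base-uncased",
    "chunk_size": 512,
    "chunk_num": 2,
    "label_splitter": ","
}
```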
