aggregating confusion matrices to calculate accuracy #17789

taloy42 · 2023-06-08T09:37:19Z

taloy42
Jun 8, 2023

Goal

I want to calculate the confusion matrix $C_g$ on each gpu, add it all to $C=\sum C_g$, use $C$ to calculate the accuracy and log it using self.log_dict(accuracies_from_confmat(C))

Setup

pytorch-lightning=2.0.1
python 3.8
running on Amazon SageMaker
using an instance with 4 GPUs

Current Situation

Right now I am using the following code:

 

class Model(pl.LightningModule):

 

    def __init__(self, ..., num_grades, **kwargs):
        super().__init__())
        
        self.num_grades=num_grades
        self.confusion_matrix = ConfusionMatrix(num_classes=self.num_grades, task="multiclass")
        
        stages = ('train','valid','test',)
        self.outputs = { stage:[] for stage in stages }

 

    def shared_step(self, batch, stage):
        
       ...
        image, label = batch
        logits_mask, pred_label = self.forward(image)
        

 

        loss = ...

 

        acc = (torch.argmax(pred_label, 1) == torch.argmax(label, 1))
        
        ret = {
            "loss": loss,
            "pred_class": torch.argmax(pred_label, 1),
            "true_class": torch.argmax(label, 1),
            "acc": acc,
        }
        self.outputs[stage].append(ret)
        return ret
    def shared_epoch_end(self, stage):
        # aggregate step metrics
        loss = torch.Tensor([x["loss"] for x in self.outputs[stage]])
        pred_class = torch.cat([x["pred_class"] for x in self.outputs[stage]])
        true_class = torch.cat([x["true_class"] for x in self.outputs[stage]])
        acc = torch.cat([x["acc"] for x in self.outputs[stage]])
        
        conf_matrix = self.confusion_matrix(pred_class.cuda(), true_class.cuda())
        
        print(f"{stage} Confusion matrix from GPU {torch.distributed.get_rank()}:\n{conf_matrix}")

 

        accs = conf_matrix.diagonal()/conf_matrix.sum(axis=1)
        print(f'{stage} accuracies in gpu {torch.distributed.get_rank()}:\n{accs}')
        acc_per_class = dict(zip([f'{stage}_acc_label_{i}' for i in range(1,self.num_grades+1)], accs))
    
        metrics = {
            f"{stage}_accuracy": acc.float().mean(),
        }        
        self.log_dict({**metrics,**acc_per_class,'step':self.current_epoch}, prog_bar=True, sync_dist=True)
        self.outputs[stage].clear()
    def training_step(self, batch, batch_idx):
        return self.shared_step(batch,"train")
    
    def on_training_epoch_end(self):
        return self.shared_epoch_end("train")

 

    
    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch,"valid")

 

    def on_validation_epoch_end(self):
        return self.shared_epoch_end("valid")
  
    def test_step(self, batch, batch_idx):
        return self.shared_step(batch, "test")  

 

    def on_test_epoch_end(self):
        return self.shared_epoch_end("test")

Wanted Behaviour

I would like to do something like

def shared_epoch_end(self, stage):
    # Sketch of the desired behaviour, not runnable as written:
    # `add_matrices` is a placeholder for "sum conf_mat over all GPUs", and
    # `accs` / `acc` are not assigned in the lines shown (presumably they come
    # from the elided part).
    ...
    conf_mat = ...
    add_matrices(conf_mat,gpus=[...])
    # NOTE(review): log_dict is called with sync_dist=True inside a branch that
    # only rank 0 enters; a cross-process sync that the other ranks never join
    # may hang -- confirm before using this pattern.
    if torch.distributed.get_rank() == 0:
        acc_per_class = dict(zip([f'{stage}_acc_label_{i}' for i in range(1,self.num_grades+1)], accs))
        metrics = {
            f"{stage}_accuracy": acc.float().mean(),
        }        
        self.log_dict({**metrics,**acc_per_class,'step':self.current_epoch}, prog_bar=True, sync_dist=True)

That is, I want to sum the matrices from all the GPUs into one, and then log the data only on rank 0.

Attempts

I have tried to use torch.distributed.all_reduce, but I got a CUDA memory error:

def shared_epoch_end(self, stage):
    # Attempt: reduce the per-process confusion matrix across all processes
    # before logging. `accs` and `acc` are not assigned in the lines shown
    # (presumably computed in the elided part).
    conf_matrix = ...
    # all_reduce is called with its default reduction and no return value is
    # used, i.e. conf_matrix is expected to be updated in place -- presumably
    # with the element-wise sum over ranks; verify against the torch docs.
    torch.distributed.all_reduce(conf_matrix)
    # NOTE(review): log_dict is called with sync_dist=True inside a branch that
    # only rank 0 enters; a cross-process sync that the other ranks never join
    # may hang -- confirm before using this pattern.
    if torch.distributed.get_rank() == 0:
        acc_per_class = dict(zip([f'{stage}_acc_label_{i}' for i in range(1,self.num_grades+1)], accs))
        metrics = {
            f"{stage}_accuracy": acc.float().mean(),
        }        
        self.log_dict({**metrics,**acc_per_class,'step':self.current_epoch}, prog_bar=True, sync_dist=True)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

aggregating confusion matrices to calculate accuracy #17789

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

aggregating confusion matrices to calculate accuracy #17789

Uh oh!

taloy42 Jun 8, 2023

Goal

Setup

Current Situation

Wanted Behaviour

Attempts

Replies: 0 comments

taloy42
Jun 8, 2023