DDP Hangs with TORCH_DISTRIBUTED_DEBUG = DETAIL #13503
-
I'm not certain whether this is user error or a PyTorch/Lightning issue, so am posting a discussion instead. Adding the line `os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"` to the BoringModel script makes DDP training hang. To reproduce:

```python
import argparse
import os

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run(cl_args):
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()

    # Start changed code
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # set to DETAIL for runtime logging.
    parser = argparse.ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args(cl_args.split() if cl_args else None)
    trainer = Trainer.from_argparse_args(args)
    # End changed code

    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run('--gpus 2 --strategy ddp')
```
-
I confirmed the hang with my script, too: https://github.com/akihironitta/gist/blob/repro/13503-torch-dist-debug-detail/pl_boring_model/main.py

Environment:

```
$ pip list | grep torch
torch              1.12.0+cu116
torchaudio         0.12.0+cu116
torchmetrics       0.9.2
torchvision        0.13.0+cu116
```

Giving the doc page a read and trying out a few runs, I think the env var is supposed to be set on rank 0 only, so instead, you might want to set the env var outside the script.
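For example, exporting the variable in the shell, e.g. `TORCH_DISTRIBUTED_DEBUG=DETAIL python main.py --gpus 2 --strategy ddp`, should mean every launched rank inherits the same value. If you want to double-check what each rank actually sees, a minimal sketch (not from the thread; the callback name is made up, the hooks and attributes are standard Lightning API) would be:

```python
# Minimal sketch: log what each rank sees for TORCH_DISTRIBUTED_DEBUG once the
# distributed processes are up. DebugEnvCheck is a made-up name for illustration.
import os

from pytorch_lightning import Callback


class DebugEnvCheck(Callback):
    def setup(self, trainer, pl_module, stage=None):
        value = os.environ.get("TORCH_DISTRIBUTED_DEBUG", "<unset>")
        print(f"[rank {trainer.global_rank}] TORCH_DISTRIBUTED_DEBUG={value}")


# Hypothetical usage with the repro above:
# trainer = Trainer.from_argparse_args(args, callbacks=[DebugEnvCheck()])
```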
-
Reading @akihironitta's response and looking at the documentation again, I noticed that they set the environment variable prior to calling `mp.spawn`. Moving the `os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'DETAIL'` line outside of the main function prevented hanging:

```python
import argparse

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


# Start changed code
import os

os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'DETAIL'
# End changed code


def run(cl_args):
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    test_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    model = BoringModel()

    # Start changed code
    parser = argparse.ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args(cl_args.split() if cl_args else None)
    trainer = Trainer.from_argparse_args(args)
    # End changed code

    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)


if __name__ == "__main__":
    run('--gpus 2 --strategy ddp')
```

I presume this has to do with where the Trainer is forking the process. In summary, it seems one can either set the environment variable outside the script, as @akihironitta suggested, or set it at module level so that it is in place before the Trainer launches the other ranks.
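Not from the original posts, but if anyone wants to confirm that the ranks really end up with different debug settings, one option is to compare the value across ranks once the process group exists, e.g. from `on_fit_start`. This is a sketch only; `check_debug_env` is a made-up helper, and if a mismatch is already breaking collectives the check itself can hang rather than raise:

```python
# Sketch of a cross-rank consistency check for TORCH_DISTRIBUTED_DEBUG.
# Assumes an initialized process group (all_gather_object needs torch >= 1.8).
import os

import torch.distributed as dist


def check_debug_env():
    if not dist.is_available() or not dist.is_initialized():
        return
    value = os.environ.get("TORCH_DISTRIBUTED_DEBUG", "<unset>")
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, value)
    if len(set(gathered)) > 1:
        raise RuntimeError(f"TORCH_DISTRIBUTED_DEBUG differs across ranks: {gathered}")
```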