Problem
Multi-GPU DDP로 딥러닝 학습을 하던 중, loss를 deepcopy하는 부분에서 문제가 발생한 것으로 보인다.
에러 전문은 다음과 같다. (DDP의 여러 프로세스가 동시에 출력한 traceback이라 줄 순서가 서로 섞여 있다.)
Traceback (most recent call last):
File "/home/ubuntu/jini1114/ddp_test/maskrcnn_trainer.py", line 54, in <module>
trainer.fit(ddpmaskrcnn, datamodule=dl)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
Traceback (most recent call last):
self._run(model)
File "/home/ubuntu/jini1114/ddp_test/maskrcnn_trainer.py", line 54, in <module>
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
trainer.fit(ddpmaskrcnn, datamodule=dl)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
self.dispatch()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
self._run(model)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self.dispatch()
self._results = trainer.run_stage()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
return self.run_train()
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self.train_loop.run_training_epoch()
self._results = trainer.run_stage()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
return self.run_train()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
y = copier(x, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
Traceback (most recent call last):
File "maskrcnn_trainer.py", line 54, in <module>
trainer.fit(ddpmaskrcnn, datamodule=dl)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
self._run(model)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
self.dispatch()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
self.accelerator.start_training(self)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self._results = trainer.run_stage()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
return self.run_train()
y = copier(x, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
self.train_loop.run_training_epoch()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
self.train_loop.run_training_epoch()
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
y = copier(x, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
y = copier(x, memo)
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
y = copier(memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
y = copier(x, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
y = copier(x, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
y = copier(memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
y = copier(memo)
File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
그리고 이때의 loss_dict는 다음과 같다.
{'loss_classifier': tensor(0.4936, device='cuda:1', grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.1806, device='cuda:1', grad_fn=<DivBackward0>), 'loss_mask': tensor(1.7224, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_objectness': tensor(0.0041, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0051, device='cuda:1', grad_fn=<DivBackward0>)}
Solution
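찾아보니 원인은 device가 아니라, loss tensor들이 grad_fn이 붙어 있는 non-leaf tensor(autograd 연산 그래프에 연결된 중간 결과)라는 점이었다. 에러 메시지에 나온 것처럼 deepcopy는 사용자가 직접 만든 tensor(graph leaf)만 지원한다.
따라서 detach()를 해서 loss tensor들을 연산 그래프에서 분리해줬다. detach()는 device를 바꾸지 않으므로 결과 tensor는 여전히 같은 GPU에 남아 있고, grad_fn만 사라진다. 단, detach한 값은 logging 용도로만 쓰고, backward에 사용하는 loss는 detach하지 않아야 한다.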
loss_dict = {t:loss_dict[t].detach() for t in loss_dict}
'인공지능 > Python' 카테고리의 다른 글
Python 그래프 부드럽게 그리기 (Python plot line smooth) (0) | 2022.05.24 |
---|---|
nvidia-smi ; Failed to initialize NVML: Driver/library version mismatch 에러 해결 (0) | 2022.05.19 |
model 학습 시 loss대신 predict 결과가 나올 경우 (0) | 2022.05.17 |
Dataloader dictionary에서 batch를 밖으로 빼내기 (0) | 2022.05.17 |
RuntimeError: stack expects each tensor to be equal size 에러 해결 (0) | 2022.05.16 |