본문 바로가기

인공지능/Python

RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment 해결

Problem

Multi-GPU DDP로 딥러닝 학습을 하던 중, loss를 copy(deepcopy)하는 부분에서 문제가 발생한 것으로 보인다.

에러 전문은 다음과 같다. (DDP의 여러 프로세스가 동시에 출력하기 때문에 여러 개의 traceback이 서로 섞여 있다.)

더보기

Traceback (most recent call last):
  File "/home/ubuntu/jini1114/ddp_test/maskrcnn_trainer.py", line 54, in <module>
    trainer.fit(ddpmaskrcnn, datamodule=dl)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
Traceback (most recent call last):
    self._run(model)
  File "/home/ubuntu/jini1114/ddp_test/maskrcnn_trainer.py", line 54, in <module>
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
    trainer.fit(ddpmaskrcnn, datamodule=dl)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
    self.dispatch()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
    self._run(model)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
    self.dispatch()
    self._results = trainer.run_stage()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
    return self.run_train()
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
    self.train_loop.run_training_epoch()
    self._results = trainer.run_stage()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
    monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
    return self.run_train()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
    y = copier(x, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
Traceback (most recent call last):
  File "maskrcnn_trainer.py", line 54, in <module>
    trainer.fit(ddpmaskrcnn, datamodule=dl)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 460, in fit
    self._run(model)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 758, in _run
    self.dispatch()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 799, in dispatch
    self.accelerator.start_training(self)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
    self._results = trainer.run_stage()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in run_stage
    return self.run_train()
    y = copier(x, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    self.train_loop.run_training_epoch()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 871, in run_train
    self.train_loop.run_training_epoch()
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 535, in run_training_epoch
    monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
    y = copier(x, memo)
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
    monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics)
    y = copier(memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
    raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
    y = copier(x, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
    y = copier(memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
    raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/copy.py", line 153, in deepcopy
    y = copier(memo)
  File "/home/ubuntu/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 55, in __deepcopy__
    raise RuntimeError("Only Tensors created explicitly by the user "
RuntimeError: Only Tensors created explicitly by the user (graph leaves) support the deepcopy protocol at the moment

그리고 이때의 loss_dict는 다음과 같다.

{'loss_classifier': tensor(0.4936, device='cuda:1', grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.1806, device='cuda:1', grad_fn=<DivBackward0>), 'loss_mask': tensor(1.7224, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_objectness': tensor(0.0041, device='cuda:1', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0051, device='cuda:1', grad_fn=<DivBackward0>)}

 

Solution

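처음에는 device 때문에 deepcopy가 실패하는 것 같았지만, 에러 메시지에 나와 있듯이 실제 원인은 loss 텐서들이 grad_fn을 가진(즉, graph leaf가 아닌) 텐서라는 점이다. 연산 그래프에 연결된 non-leaf 텐서는 deepcopy를 지원하지 않는다.

따라서 detach()를 해서 loss 텐서를 연산 그래프에서 분리해줬다. (detach()는 device를 바꾸지 않으며, 텐서는 그대로 cuda에 남아 있다.)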

loss_dict = {t:loss_dict[t].detach() for t in loss_dict}