Hugging Face - RuntimeError: Caught RuntimeError in replica 0 on device 0 on Azure Databricks
How do I run Hugging Face's run_language_modeling.py script with the pretrained roberta-base model to fine-tune it on my own data on Azure Databricks with a GPU cluster?
I am using Transformers 2.9.1 and 3.0, Python 3.6, Torch 1.5.0, torchvision 0.6.
This is how I ran the script on Azure Databricks:
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm
This is the error I get after running the above command:
/dbfs/FileStore/tables/dev/run_language_modeling.py in <module>
279
280 if __name__ == "__main__":
--> 281 main()
/dbfs/FileStore/tables/dev/run_language_modeling.py in main()
243 else None
244 )
--> 245 trainer.train(model_path=model_path)
246 trainer.save_model()
247 # For convenience, we also re-save the tokenizer to the same directory,
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in train(self, model_path)
497 continue
498
--> 499 tr_loss += self._training_step(model, inputs, optimizer)
500
501 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
620 inputs["mems"] = self._past
621
--> 622 outputs = model(**inputs)
623 loss = outputs[0] # model outputs are always tuple in transformers (see doc)
624
/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
153 return self.module(*inputs[0], **kwargs[0])
154 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155 outputs = self.parallel_apply(replicas, inputs, kwargs)
156 return self.gather(outputs, self.output_device)
157
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
163
164 def parallel_apply(self, replicas, inputs, kwargs):
--> 165 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
166
167 def gather(self, outputs, output_device):
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
/databricks/python/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
393 # (https://bugs.python.org/issue2651), so we work around it.
394 msg = KeyErrorMessage(msg)
--> 395 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_roberta.py", line 239, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 762, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 439, in forward
output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 371, in forward
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 315, in forward
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 240, in forward
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.17 GiB total capacity; 10.68 GiB already allocated; 95.31 MiB free; 10.77 GiB reserved in total by PyTorch)
Please, how do I resolve this?
1 Answer
The out-of-memory error is most likely caused by not clearing the session and/or not freeing GPU memory between runs.
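One thing worth trying first is to free whatever an earlier run left on the GPU before training again. A minimal sketch for a Databricks notebook cell, assuming the leftover objects are still referenced in the driver process (the names `model` and `trainer` below are placeholders, not something the script defines):

```python
import gc
import torch

# Drop references to large objects left over from a previous run.
# "model" and "trainer" are placeholder names for whatever the notebook created.
for name in ("model", "trainer"):
    if name in globals():
        del globals()[name]

gc.collect()               # reclaim the Python-side objects
torch.cuda.empty_cache()   # return cached CUDA blocks to the driver

# Check how much GPU memory is still held by this process.
print(torch.cuda.memory_allocated() / 1024 ** 2, "MiB allocated")
print(torch.cuda.memory_reserved() / 1024 ** 2, "MiB reserved")
```

If the reserved memory stays high even after this, detaching and reattaching the cluster (or restarting the Python process) is the blunt way to get a clean GPU.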
From a similar GitHub issue:
It is because your mini-batch of data does not fit in GPU memory. Just decrease the batch size. When I set batch size = 256 for the cifar10 dataset, I got the same error; then I set batch size = 128, and it was solved.
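In this script the batch size comes from the Trainer's training arguments, so the same fix can be applied on the command line. A sketch of the adjusted call, assuming your transformers version still accepts --per_gpu_train_batch_size (later releases rename it to --per_device_train_batch_size); --gradient_accumulation_steps is added so the effective batch size stays similar while each forward pass uses less GPU memory, and the script's --block_size argument caps the tokenized sequence length:

```
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm \
--per_gpu_train_batch_size 2 \
--gradient_accumulation_steps 4 \
--block_size 128
```

If it still runs out of memory with a per-GPU batch size of 1, lowering --block_size further is the next lever to try.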