
cuda out of memory #67

Open · ppzhr opened this issue Apr 1, 2024 · 0 comments

ppzhr commented Apr 1, 2024

"Traceback (most recent call last):
File "run_entity.py", line 259, in
evaluate(model, test_batches, test_ner)
File "run_entity.py", line 88, in evaluate
output_dict = model.run_batch(batches[i], training=False)
File "/storage/zhr/PURE-main/entity/models.py", line 309, in run_batch
ner_logits, spans_embedding, last_hidden = self.bert_model(
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/modules/module.py", line 532, in call
result = self.forward(*input, **kwargs)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 148, in forward
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 159, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 37, in scatter_kwargs
kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 28, in scatter
res = scatter_map(inputs)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 19, in scatter_map
return list(map(type(obj), zip(map(scatter_map, obj.items()))))
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 15, in scatter_map
return list(zip(map(scatter_map, obj)))
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 13, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 89, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/cuda/comm.py", line 147, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: CUDA error: out of memory (malloc at /pytorch/c10/cuda/CUDACachingAllocator.cpp:260)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x33 (0x7f439110d193 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: + 0x1ba1a (0x7f439134ea1a in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #2: + 0x1cd5e (0x7f439134fd5e in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #3: THCStorage_resize + 0xa3 (0x7f42ae8fd6f3 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #4: at::native::empty_strided_cuda(c10::ArrayRef<long>, c10::ArrayRef<long>, c10::TensorOptions const&) + 0x636 (0x7f42afecb856 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #5: + 0x45bcd2a (0x7f42ae80ed2a in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #6: + 0x1f4fc81 (0x7f42ac1a1c81 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #7: + 0x3aadfb0 (0x7f42adcfffb0 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #8: + 0x1f4fc81 (0x7f42ac1a1c81 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #9: + 0x1cb869e (0x7f42abf0a69e in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #10: at::native::to(at::Tensor const&, c10::TensorOptions const&, bool, bool, c10::optional<c10::MemoryFormat>) + 0x245 (0x7f42abf0b6f5 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #11: + 0x1ffdb9a (0x7f42ac24fb9a in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #12: + 0x3ce3866 (0x7f42adf35866 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #13: + 0x20485e2 (0x7f42ac29a5e2 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #14: torch::cuda::scatter(at::Tensor const&, c10::ArrayRef<long>, c10::optional<std::vector<long, std::allocator<long> > > const&, long, c10::optional<std::vector<c10::optional<c10::cuda::CUDAStream>, std::allocator<c10::optional<c10::cuda::CUDAStream> > > > const&) + 0x710 (0x7f42aec08f60 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch.so)
frame #15: + 0x9c5dcf (0x7f4392112dcf in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #16: + 0x295928 (0x7f43919e2928 in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch_python.so)

frame #25: THPFunction_apply(_object*, _object*) + 0xaff (0x7f4391db1f3f in /home/hyy/anaconda3/envs/PURE/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
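Reading the trace: the OOM is raised inside torch.nn.DataParallel's scatter step, while the input batch is being copied to the GPUs and before any forward pass runs. A quick sanity check (assuming nvidia-smi is available on the machine) is to look at how much memory is already occupied on each card before launching the job; if another process holds most of a device, even a small batch can fail at this copy:

    # Show per-GPU memory already in use; an OOM this early usually means
    # the devices were nearly full before the model inputs were scattered.
    nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv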

I executed this command:

    python run_entity.py --do_eval --eval_test --context_window 0 --task scierc --data_dir /storage/zhr/PURE-main/scierc_data/processed_data/json --model allenai/scibert_scivocab_uncased --output_dir /storage/zhr/PURE-main/scierc_models/ent-scib-ctx0

I changed eval_batch_size from 32 to 8, but I still get the error. Is there any good solution?
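Since the failure happens in the cross-GPU scatter itself, one workaround worth trying (a sketch, not a confirmed fix; it assumes run_entity.py does not hard-code device IDs, which standard PyTorch DataParallel code does not) is to expose only a single GPU via CUDA_VISIBLE_DEVICES, so inputs are copied to one card instead of being scattered across several:

    # Pin the run to GPU 0; PyTorch only enumerates the devices listed in
    # CUDA_VISIBLE_DEVICES, so DataParallel runs single-device and no
    # multi-GPU scatter is attempted.
    CUDA_VISIBLE_DEVICES=0 python run_entity.py --do_eval --eval_test \
        --context_window 0 --task scierc \
        --data_dir /storage/zhr/PURE-main/scierc_data/processed_data/json \
        --model allenai/scibert_scivocab_uncased \
        --output_dir /storage/zhr/PURE-main/scierc_models/ent-scib-ctx0

If a single card still runs out of memory at eval_batch_size 8, halving the batch size again is the next lever to try.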
