You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
error:
Node: 'cond/IteratorGetNext'
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
End of sequence
[[{{node cond/IteratorGetNext}}]]
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1705051450.146141045","description":"Error received from peer ipv4:127.0.0.1:12345","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":" End of sequence\n\t [[{{node cond/IteratorGetNext}}]]","grpc_status":11} [Op:__inference_train_function_2453]
2024-01-12 09:24:10.396242: I tensorflow/core/framework/local_rendezvous.cc:409] Local rendezvous send item cancelled. Key hash: 8765810659132640250
2024-01-12 09:24:10.396490: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396592: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396623: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396692: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396719: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396880: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396905: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.397131: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 14709069076869533544
2024-01-12 09:24:10.397237: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 12806972282902040695
2024-01-12 09:24:10.397251: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 7099544718350412265
2024-01-12 09:24:10.397262: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 13550495496366309674
2024-01-12 09:24:10.397271: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 7470721310366749220
2024-01-12 09:24:10.397282: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 178911781484057798
2024-01-12 09:24:10.397292: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 11001711794723579528
ERROR:tensorflow:Start cancelling closures due to error OutOfRangeError(): Graph execution error:
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
End of sequence
[[{{node cond/IteratorGetNext}}]]
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1705051450.146141045","description":"Error received from peer ipv4:127.0.0.1:12345","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":" End of sequence\n\t [[{{node cond/IteratorGetNext}}]]","grpc_status":11} [Op:__inference_train_function_2453]
E0112 09:24:10.397781 140489348497856 cluster_coordinator.py:412] Start cancelling closures due to error OutOfRangeError(): Graph execution error:
How can I resolve this OutOfRangeError?
The text was updated successfully, but these errors were encountered:
environment:
CUDA 1.17
TensorFlow 2.14
code:
https://github.com/tensorflow/models/blob/master/official/recommendation/ncf_keras_main.py
command:
python3 /LLM/models/official/recommendation/ncf_keras_main.py --distribution_strategy parameter_server --model_dir /LLM/models/dataset/ncf_model --data_dir /LLM/models/dataset/ --dataset ml-1m --train_epochs 3 --batch_size 8000 --learning_rate 0.00382059 --beta1 0.783529 --beta2 0.909003 --epsilon 1.45439e-07 --layers 256,256,128,64 --num_factors 64 --hr_threshold 0.635 --eval_batch_size 8000
error:
Node: 'cond/IteratorGetNext'
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
End of sequence
[[{{node cond/IteratorGetNext}}]]
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1705051450.146141045","description":"Error received from peer ipv4:127.0.0.1:12345","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":" End of sequence\n\t [[{{node cond/IteratorGetNext}}]]","grpc_status":11} [Op:__inference_train_function_2453]
2024-01-12 09:24:10.396242: I tensorflow/core/framework/local_rendezvous.cc:409] Local rendezvous send item cancelled. Key hash: 8765810659132640250
2024-01-12 09:24:10.396490: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396592: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396623: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396692: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396719: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396880: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.396905: W tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc:510] RecvTensor cancelled for -197545566131642667
2024-01-12 09:24:10.397131: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 14709069076869533544
2024-01-12 09:24:10.397237: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 12806972282902040695
2024-01-12 09:24:10.397251: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 7099544718350412265
2024-01-12 09:24:10.397262: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 13550495496366309674
2024-01-12 09:24:10.397271: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 7470721310366749220
2024-01-12 09:24:10.397282: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 178911781484057798
2024-01-12 09:24:10.397292: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 11001711794723579528
ERROR:tensorflow:Start cancelling closures due to error OutOfRangeError(): Graph execution error:
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
Detected at node 'cond/IteratorGetNext' defined at (most recent call last):
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 576, in <module>
app.run(main)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/root/.local/lib/python3.10/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 571, in main
logging.info("Result is %s", run_ncf(FLAGS))
File "/LLM/models/official/recommendation/ncf_keras_main.py", line 330, in run_ncf
history = keras_model.fit(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1804, in fit
tmp_logs = self.train_function(iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1408, in <lambda>
lambda it: self._cluster_coordinator.schedule(
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function
return step_function(self, iterator)
File "/root/.local/lib/python3.10/site-packages/tf_keras/src/engine/training.py", line 1380, in step_function
data = next(iterator)
Node: 'cond/IteratorGetNext'
End of sequence
[[{{node cond/IteratorGetNext}}]]
Additional GRPC error information from remote target /job:worker/replica:0/task:0:
:{"created":"@1705051450.146141045","description":"Error received from peer ipv4:127.0.0.1:12345","file":"external/com_github_grpc_grpc/src/core/lib/surface/call.cc","file_line":1056,"grpc_message":" End of sequence\n\t [[{{node cond/IteratorGetNext}}]]","grpc_status":11} [Op:__inference_train_function_2453]
E0112 09:24:10.397781 140489348497856 cluster_coordinator.py:412] Start cancelling closures due to error OutOfRangeError(): Graph execution error:
How can I resolve this OutOfRangeError?
The text was updated successfully, but these errors were encountered: