Tensorflow
Guys, I'm launching several jobs (hundreds) on our GPU cluster.
On some hosts I'm getting the following error as soon as estimator.train
is triggered.
Have you guys faced a similar issue?
I'm using tensorflow-gpu 1.8
ping @andre.anjos, @amohammadi
thanks
totalMemory: 11.17GiB freeMemory: 11.11GiB
2018-11-23 14:18:50.403387: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1435] Adding visible gpu devices: 0
2018-11-23 14:18:50.403643: E tensorflow/core/common_runtime/direct_session.cc:154] Internal: cudaGetDevice() failed. Status: CUDA driver version is insufficient for CUDA runtime version
Traceback (most recent call last):
File "/remote/idiap.svm/user.active/tpereira/gitlab/bob/bob.bio.htface/bin/bob", line 33, in <module>
sys.exit(bob.extension.scripts.main_cli())
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 722, in __call__
return self.main(*args, **kwargs)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 697, in main
rv = self.invoke(ctx)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 1066, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 895, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/core.py", line 535, in invoke
return callback(*args, **kwargs)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/click/decorators.py", line 17, in new_func
return f(get_current_context(), *args, **kwargs)
File "/remote/idiap.svm/user.active/tpereira/gitlab/bob/bob.bio.htface/bob/bio/htface/script/domain_specic_units.py", line 86, in htface_train_dsu
steps=200000)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 363, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 859, in _train_model_default
saving_listeners)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/estimator/estimator.py", line 1056, in _train_with_estimator_spec
log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 405, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 816, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 539, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1002, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 1007, in _create_session
return self._sess_creator.create_session()
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 696, in create_session
self.tf_sess = self._session_creator.create_session()
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/monitored_session.py", line 467, in create_session
init_fn=self._scaffold.init_fn)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/session_manager.py", line 279, in prepare_session
config=config)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/training/session_manager.py", line 180, in _restore_checkpoint
sess = session.Session(self._target, graph=self._graph, config=config)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1560, in __init__
super(Session, self).__init__(target, graph, config=config)
File "/idiap/user/tpereira/conda/envs/bob.bio.htface/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 633, in __init__
self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
tensorflow.python.framework.errors_impl.InternalError: Failed to create session.