Skip to content

Commit

Permalink
feature: reduce training job wait time (#295)
Browse files (browse the repository at this point in the history)
  • Loading branch information
sasha-gitg committed Apr 7, 2021
1 parent 89e3212 commit 0d9dccf
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions google/cloud/aiplatform/training_jobs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -623,17 +623,22 @@ def _block_until_complete(self):

# Used these numbers so failures surface fast
wait = 5 # start at five seconds
log_wait = 5
max_wait = 60 * 5 # 5 minute wait
multiplier = 2 # scale wait by 2 every iteration

previous_time = time.time()
while self.state not in _PIPELINE_COMPLETE_STATES:
self._sync_gca_resource()
current_time = time.time()
if current_time - previous_time >= log_wait:
_LOGGER.info(
"Training %s current state:\n%s"
% (self._gca_resource.name, self._gca_resource.state)
)
log_wait = min(log_wait * multiplier, max_wait)
previous_time = current_time
time.sleep(wait)
_LOGGER.info(
"Training %s current state:\n%s"
% (self._gca_resource.name, self._gca_resource.state)
)
wait = min(wait * multiplier, max_wait)

self._raise_failure()

Expand Down

0 comments on commit 0d9dccf

Please sign in to comment.