Skip to content
This repository has been archived by the owner on Jul 6, 2023. It is now read-only.

Commit

Permalink
apps: only treat errors as fatal if brick destroy fails on a working …
Browse files Browse the repository at this point in the history
…host

We make this same fix, but only to the happy path. Add a helper function
and change all call sites to have similar behavior.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
(cherry picked from commit 4641d73)
  • Loading branch information
phlogistonjohn committed Sep 29, 2020
1 parent b362691 commit dd067b7
Showing 1 changed file with 29 additions and 21 deletions.
50 changes: 29 additions & 21 deletions apps/glusterfs/operations_device.go
Expand Up @@ -590,25 +590,8 @@ func (beo *BrickEvictOperation) execReplaceBrick(
return err
}

beo.reclaimed, err = old.bhmap().destroy(executor)
if err != nil {
// If the destroy failed because the node is not running (ideally,
// permanently down) we want to continue with the operation . But if
// the node works we failed for a "real" reason we don't want to
// blindly ignore errors. So we do a probe to see if the host is
// responsive. If the destroy has failed we will use the node's
// responsiveness to decide if // this should be treated as a failure
// or not. It is bit hacky but should work around the lack of proper
// error handling in most common cases.
destroyErr := logger.LogError("Error destroying old brick: %v", err)
err := executor.GlusterdCheck(old.node.ManageHostName())
if err == nil {
logger.Warning("node responds to probe: failing operation")
return destroyErr
}
logger.Warning("node does not respond to probe: ignoring error destroying bricks")
}
return nil
beo.reclaimed, err = tryDestroyBrickMap(old.bhmap(), executor)
return err
}

func (beo *BrickEvictOperation) Exec(executor executors.Executor) error {
Expand Down Expand Up @@ -739,13 +722,13 @@ func (beo *BrickEvictOperation) Clean(executor executors.Executor) error {
switch beo.replaceResult {
case replaceComplete:
logger.Info("Destroying old brick contents")
beo.reclaimed, err = old.bhmap().destroy(executor)
beo.reclaimed, err = tryDestroyBrickMap(old.bhmap(), executor)
if err != nil {
return logger.LogError("Error destroying old brick: %v", err)
}
case replaceIncomplete:
logger.Info("Destroying unused new brick contents")
beo.reclaimed, err = newBHMap.destroy(executor)
beo.reclaimed, err = tryDestroyBrickMap(newBHMap, executor)
if err != nil {
return logger.LogError("Error destroying new brick: %v", err)
}
Expand Down Expand Up @@ -1033,3 +1016,28 @@ func getWorkingNode(n *NodeEntry, db wdb.RODB, executor executors.Executor) (str
}
return node, err
}

func tryDestroyBrickMap(bhmap brickHostMap, executor executors.Executor) (ReclaimMap, error) {
reclaimed, err := bhmap.destroy(executor)
if err == nil {
return reclaimed, nil
}

// If the destroy failed because the node is not running (ideally,
// permanently down) we want to continue with the operation . But if the
// node works we failed for a "real" reason we don't want to blindly ignore
// errors. So we do a probe to see if the host is responsive. If the
// destroy has failed we will use the node's responsiveness to decide if
// this should be treated as a failure or not. It is bit hacky but should
// work around the lack of proper error handling in most common cases.
for b, node := range bhmap {
destroyErr := logger.LogError("Error destroying brick [%v]: %v", b, err)
err := executor.GlusterdCheck(node)
if err == nil {
logger.Warning("node [%v] responds to probe: failing operation", node)
return reclaimed, destroyErr
}
logger.Warning("node [%v] does not respond to probe: ignoring error destroying bricks", node)
}
return reclaimed, nil
}

0 comments on commit dd067b7

Please sign in to comment.