Skip to content

Commit

Permalink
vdev probe to slow disk can stall mmp write checker
Browse files Browse the repository at this point in the history
Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for long durations.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.

Signed-off-by: Don Brady <don.brady@klarasystems.com>
  • Loading branch information
Don Brady authored and behlendorf committed Mar 1, 2024
1 parent fb6d532 commit b45960d
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 29 deletions.
2 changes: 1 addition & 1 deletion include/sys/spa.h
Expand Up @@ -773,7 +773,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
Expand Down
16 changes: 8 additions & 8 deletions include/sys/uberblock_impl.h
Expand Up @@ -50,20 +50,20 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04

#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
ubp->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
(ubp)->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_INTERVAL_VALID_BIT))
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_SEQ_VALID_BIT))
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))

#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
>> 8)
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
>> 32)
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
>> 48)

#define MMP_INTERVAL_SET(write) \
Expand Down
2 changes: 1 addition & 1 deletion include/sys/vdev_impl.h
Expand Up @@ -292,7 +292,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
Expand Down
17 changes: 9 additions & 8 deletions module/zfs/spa.c
Expand Up @@ -8391,15 +8391,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}

for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}

static void
Expand Down Expand Up @@ -8487,11 +8488,11 @@ spa_async_thread(void *arg)
}

/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}

Expand Down
22 changes: 13 additions & 9 deletions module/zfs/vdev.c
Expand Up @@ -1574,6 +1574,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;

Expand Down Expand Up @@ -1617,6 +1618,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);

/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}

mutex_enter(&vd->vdev_probe_lock);
Expand Down Expand Up @@ -1668,6 +1680,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);

if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
Expand All @@ -1694,15 +1707,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}

if (zio != NULL)
Expand Down
6 changes: 4 additions & 2 deletions module/zfs/zio.c
Expand Up @@ -2376,8 +2376,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));

cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
}

(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
Expand Down

0 comments on commit b45960d

Please sign in to comment.