Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Documentation/ABI/stable/sysfs-block
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,32 @@ Description:
unsigned integer, but only "0" and "1" are valid values.


What: /sys/block/<disk>/read_err_retry_sec
What: /sys/block/<disk>/<partition>/read_err_retry_sec
Date: May 2026
Contact: linux-block@vger.kernel.org
Description:
(RW) Configure the fail-fast window, in seconds, for repeated
buffer_head reads after read I/O errors.

The default value is 0, which disables the fail-fast behavior and
preserves the existing retry behavior. When this value is non-zero,
a non-readahead read of a buffer_head whose earlier non-readahead
read failed with an I/O error less than this many seconds ago is
failed immediately, instead of submitting another bio for the same
buffer_head. The window is measured from the first recorded failure;
a successful read or write of the buffer_head clears it.

This only applies to buffer_head reads submitted through submit_bh().
It is not a generic block layer read retry policy, and it does not
affect direct I/O or non-buffer_head bio submissions.

Disk and partition attributes are independent. Setting the disk
attribute does not change the value for existing or future
partition block devices.

The maximum accepted value is MAX_JIFFY_OFFSET / HZ. Larger values
are rejected with -ERANGE.


What: /sys/block/<disk>/<partition>/alignment_offset
Date: April 2009
Contact: Martin K. Petersen <martin.petersen@oracle.com>
Expand Down
24 changes: 24 additions & 0 deletions block/genhd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,28 @@ static ssize_t partscan_show(struct device *dev,
return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}

static ssize_t read_err_retry_sec_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%lu\n",
READ_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec));
}

/*
 * sysfs store handler for the whole-disk read_err_retry_sec attribute.
 *
 * Parses @buf as an unsigned long (base 0, so kstrtoul() picks the radix
 * from the input prefix) and stores it as the block device's fail-fast
 * window in seconds.
 *
 * Returns @count on success. A parse failure returns kstrtoul()'s own error
 * code, so malformed input yields -EINVAL and a value too large for an
 * unsigned long yields -ERANGE. A value above MAX_JIFFY_OFFSET / HZ, the
 * documented maximum, also yields -ERANGE.
 */
static ssize_t read_err_retry_sec_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	unsigned long sec;
	int ret;

	/* Do not flatten the error: overflow must surface as -ERANGE. */
	ret = kstrtoul(buf, 0, &sec);
	if (ret)
		return ret;
	if (sec > MAX_JIFFY_OFFSET / HZ)
		return -ERANGE;

	/* Paired with the READ_ONCE() in the show handler and in readers. */
	WRITE_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec, sec);
	return count;
}

static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
Expand All @@ -1173,6 +1195,7 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
static DEVICE_ATTR_RW(read_err_retry_sec);

#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
Expand Down Expand Up @@ -1224,6 +1247,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
&dev_attr_partscan.attr,
&dev_attr_read_err_retry_sec.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
Expand Down
24 changes: 24 additions & 0 deletions block/partitions/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,28 @@ static ssize_t part_discard_alignment_show(struct device *dev,
return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
}

static ssize_t read_err_retry_sec_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%lu\n",
READ_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec));
}

/*
 * sysfs store handler for the per-partition read_err_retry_sec attribute.
 *
 * Parses @buf as an unsigned long (base 0, so kstrtoul() picks the radix
 * from the input prefix) and stores it as the partition block device's
 * fail-fast window in seconds.
 *
 * Returns @count on success. A parse failure returns kstrtoul()'s own error
 * code, so malformed input yields -EINVAL and a value too large for an
 * unsigned long yields -ERANGE. A value above MAX_JIFFY_OFFSET / HZ, the
 * documented maximum, also yields -ERANGE.
 */
static ssize_t read_err_retry_sec_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	unsigned long sec;
	int ret;

	/* Do not flatten the error: overflow must surface as -ERANGE. */
	ret = kstrtoul(buf, 0, &sec);
	if (ret)
		return ret;
	if (sec > MAX_JIFFY_OFFSET / HZ)
		return -ERANGE;

	/* Paired with the READ_ONCE() in the show handler and in readers. */
	WRITE_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec, sec);
	return count;
}

static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
static DEVICE_ATTR(start, 0444, part_start_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
Expand All @@ -213,6 +235,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR_RW(read_err_retry_sec);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
Expand All @@ -227,6 +250,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_read_err_retry_sec.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
Expand Down
65 changes: 65 additions & 0 deletions fs/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@ static sector_t folio_init_buffers(struct folio *folio,
bh->b_private = NULL;
bh->b_bdev = bdev;
bh->b_blocknr = block;
clear_buffer_read_io_error_state(bh);
if (uptodate)
set_buffer_uptodate(bh);
if (block < end_block)
Expand Down Expand Up @@ -1503,6 +1504,7 @@ static void discard_buffer(struct buffer_head * bh)
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
clear_buffer_read_io_error_state(bh);
b_state = READ_ONCE(bh->b_state);
do {
} while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
Expand Down Expand Up @@ -1997,6 +1999,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh);
clear_buffer_read_io_error_state(bh);
return 0;
default:
WARN_ON_ONCE(1);
Expand Down Expand Up @@ -2663,17 +2666,71 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
}
EXPORT_SYMBOL(generic_block_bmap);

/*
 * bh_update_io_error_state - track read I/O failures on a buffer_head
 * @bh:  buffer whose bio just completed
 * @bio: the completed bio
 *
 * Only REQ_OP_READ and REQ_OP_WRITE completions are considered. A successful
 * completion of either kind clears the read-error flag and its timestamp.
 * A failed non-readahead read sets the flag and stamps it with jiffies so
 * submit_bh() can fail repeated reads fast; if the flag is already set the
 * state is left alone, so the window starts at the first failure. Failed
 * writes and failed readahead reads do not touch the state.
 */
static void bh_update_io_error_state(struct buffer_head *bh, const struct bio *bio)
{
	const enum req_op op = bio_op(bio);
	unsigned long now;

	if (op != REQ_OP_READ && op != REQ_OP_WRITE)
		return;

	/* A successful read or rewrite clears the state. */
	if (!bio->bi_status) {
		clear_buffer_read_io_error(bh);
		bh->b_err_timestamp = 0;
		return;
	}

	/* Record the first failure; don't extend the window on repeats. */
	if (op != REQ_OP_READ || (bio->bi_opf & REQ_RAHEAD) ||
	    buffer_read_io_error(bh))
		return;

	set_buffer_read_io_error(bh);

	/*
	 * bh_failfast_read() treats a zero timestamp as "nothing recorded"
	 * and drops the error state. jiffies can legitimately read 0, so
	 * nudge that one value to 1 rather than losing the failure.
	 */
	now = jiffies;
	bh->b_err_timestamp = now ? now : 1;
}

/*
 * bio completion callback for buffer_head I/O. The buffer_head is taken from
 * bio->bi_private. Propagates BIO_QUIET to the buffer, refreshes the
 * read-error bookkeeping, invokes the buffer's own completion handler with
 * the success/failure result, and finally drops the bio reference.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
	struct buffer_head *bh = bio->bi_private;
	const int uptodate = !bio->bi_status;

	if (unlikely(bio_flagged(bio, BIO_QUIET)))
		set_bit(BH_Quiet, &bh->b_state);

	/* Update the error state before the buffer goes to its end_io handler. */
	bh_update_io_error_state(bh, bio);

	bh->b_end_io(bh, uptodate);
	bio_put(bio);
}

/*
 * bh_failfast_read - decide whether a read of @bh should be failed at once
 * @bh: buffer about to be read
 *
 * Returns true when the read was completed here with an error (the buffer's
 * b_end_io handler has already been called with uptodate == 0) and the
 * caller must not submit a bio. Returns false when the caller should submit
 * the read as usual.
 *
 * The fast-fail path is taken only if the block device has a non-zero
 * bd_read_err_retry_sec, the buffer carries the read-error flag, and the
 * recorded failure is younger than that many seconds. A flagged buffer with
 * no timestamp, or with an expired one, has its error state dropped so the
 * next failure re-arms the window.
 */
static bool bh_failfast_read(struct buffer_head *bh)
{
	unsigned long window_sec = READ_ONCE(bh->b_bdev->bd_read_err_retry_sec);
	unsigned long failed_at;
	unsigned long deadline;

	if (!window_sec)
		return false;
	if (!buffer_read_io_error(bh))
		return false;

	failed_at = bh->b_err_timestamp;

	/* No timestamp: treat as stale state and re-arm on the next failure. */
	if (!failed_at) {
		clear_buffer_read_io_error(bh);
		return false;
	}

	deadline = failed_at + secs_to_jiffies(window_sec);
	if (!time_before(jiffies, deadline)) {
		/* Window has elapsed: forget the failure and retry for real. */
		clear_buffer_read_io_error(bh);
		bh->b_err_timestamp = 0;
		return false;
	}

	/* Still inside the window: complete the read as failed, no bio. */
	test_set_buffer_req(bh);
	bh->b_end_io(bh, 0);
	return true;
}

static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh,
gfp_t gfp_mask)
{
Expand Down Expand Up @@ -2702,6 +2759,14 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
BUG_ON(buffer_delay(bh));
BUG_ON(buffer_unwritten(bh));

/*
* Fail fast for repeated non-readahead buffer_head reads after a recent
* I/O error. This avoids serializing many callers on BH_Lock while
* re-submitting the same failing read.
*/
if (op == REQ_OP_READ && !(opf & REQ_RAHEAD) && bh_failfast_read(bh))
return;

/*
* Only clear out a write error when rewriting
*/
Expand Down
3 changes: 3 additions & 0 deletions include/linux/blk_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ struct block_device {
atomic_t bd_fsfreeze_count; /* number of freeze requests */
struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */

/* Seconds; 0 disables read fail-fast window for submit_bh(READ). */
unsigned long bd_read_err_retry_sec;

struct partition_meta_info *bd_meta_info;
int bd_writers;
#ifdef CONFIG_SECURITY
Expand Down
10 changes: 10 additions & 0 deletions include/linux/buffer_head.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enum bh_state_bits {
BH_Delay, /* Buffer is not yet allocated on disk */
BH_Boundary, /* Block is followed by a discontiguity */
BH_Write_EIO, /* I/O error on write */
BH_Read_EIO, /* I/O error on read */
BH_Unwritten, /* Buffer is allocated on disk but not written */
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
Expand Down Expand Up @@ -79,6 +80,7 @@ struct buffer_head {
spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to
* serialise IO completion of other
* buffers in the page */
unsigned long b_err_timestamp; /* timestamp of last I/O error */
};

/*
Expand Down Expand Up @@ -132,11 +134,18 @@ BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Read_EIO, read_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * Forget any recorded read I/O error on @bh: clear the read_io_error buffer
 * flag (BH_Read_EIO) and zero the b_err_timestamp stored alongside it.
 */
static inline void clear_buffer_read_io_error_state(struct buffer_head *bh)
{
clear_buffer_read_io_error(bh);
bh->b_err_timestamp = 0;
}

static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
/*
Expand Down Expand Up @@ -411,6 +420,7 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
bh->b_size = sb->s_blocksize;
clear_buffer_read_io_error_state(bh);
}

static inline void wait_on_buffer(struct buffer_head *bh)
Expand Down