diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 900b3fc4c72d0..b850f96fa048e 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -185,6 +185,32 @@ Description: unsigned integer, but only "0" and "1" are valid values. +What: /sys/block/<disk>/read_err_retry_sec +What: /sys/block/<disk>/<partition>/read_err_retry_sec +Date: May 2026 +Contact: linux-block@vger.kernel.org +Description: + (RW) Configure the fail-fast window, in seconds, for repeated + buffer_head reads after read I/O errors. + + The default value is 0, which disables the fail-fast behavior and + preserves the existing retry behavior. When this value is non-zero, + a buffer_head that has recently seen a non-readahead read I/O error + can fail another read immediately within the configured window, + instead of submitting another bio for the same buffer_head. + + This only applies to buffer_head reads submitted through submit_bh(). + It is not a generic block layer read retry policy, and it does not + affect direct I/O or non-buffer_head bio submissions. + + Disk and partition attributes are independent. Setting the disk + attribute does not change the value for existing or future + partition block devices. + + The maximum accepted value is MAX_JIFFY_OFFSET / HZ. Larger values + are rejected with -ERANGE. + + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K.
Petersen diff --git a/block/genhd.c b/block/genhd.c index 7d6854fd28e95..302dce67d685c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1159,6 +1159,28 @@ static ssize_t partscan_show(struct device *dev, return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } +/* sysfs show handler: print the bd_read_err_retry_sec value of the block device behind dev as an unsigned decimal plus newline. */ static ssize_t read_err_retry_sec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", + READ_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec)); +} + +/* sysfs store handler: parse buf as an unsigned long, refuse values above MAX_JIFFY_OFFSET / HZ with -ERANGE, otherwise store the value and return count. NOTE(review): every kstrtoul() failure is mapped to -EINVAL here; confirm whether an overflowing input was meant to surface as -ERANGE. */ static ssize_t read_err_retry_sec_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long sec; + + if (kstrtoul(buf, 0, &sec)) + return -EINVAL; + if (sec > MAX_JIFFY_OFFSET / HZ) + return -ERANGE; + + WRITE_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec, sec); + return count; +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1173,6 +1195,7 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); +static DEVICE_ATTR_RW(read_err_retry_sec); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1224,6 +1247,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, + &dev_attr_read_err_retry_sec.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/block/partitions/core.c b/block/partitions/core.c index 5d5332ce586b6..62b4c2f70709f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -205,6 +205,28 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } +/* Same show handler as in block/genhd.c, defined again for the partition attribute list. */ static ssize_t 
read_err_retry_sec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", + READ_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec)); +} + +/* Same store handler as in block/genhd.c, defined again for partitions: -EINVAL on any kstrtoul() failure, -ERANGE above MAX_JIFFY_OFFSET / HZ, otherwise the value is stored and count is returned. */ static ssize_t read_err_retry_sec_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long sec; + + if (kstrtoul(buf, 0, &sec)) + return -EINVAL; + if (sec > MAX_JIFFY_OFFSET / HZ) + return -ERANGE; + + WRITE_ONCE(dev_to_bdev(dev)->bd_read_err_retry_sec, sec); + return count; +} + static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); @@ -213,6 +235,7 @@ static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR_RW(read_err_retry_sec); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); @@ -227,6 +250,7 @@ static struct attribute *part_attrs[] = { &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_read_err_retry_sec.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/fs/buffer.c b/fs/buffer.c index b0b3792b1496e..2a28ab6a51f0e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -920,6 +920,7 @@ static sector_t folio_init_buffers(struct folio *folio, bh->b_private = NULL; bh->b_bdev = bdev; bh->b_blocknr = block; + clear_buffer_read_io_error_state(bh); /* bh was just pointed at bdev/block: forget read-error state recorded earlier */ if (uptodate) set_buffer_uptodate(bh); if (block < end_block) @@ -1503,6 +1504,7 @@ static void discard_buffer(struct buffer_head * bh) lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; + clear_buffer_read_io_error_state(bh); b_state = READ_ONCE(bh->b_state); do { } while 
(!try_cmpxchg_relaxed(&bh->b_state, &b_state, @@ -1997,6 +1999,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); + clear_buffer_read_io_error_state(bh); return 0; default: WARN_ON_ONCE(1); @@ -2663,6 +2666,33 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, } EXPORT_SYMBOL(generic_block_bmap); +/* Fold the outcome of a completed bio into the read-error tracking of bh. Ops other than READ and WRITE are ignored. A bio with no bi_status clears the flag and zeroes b_err_timestamp; a failed READ without REQ_RAHEAD sets the flag and stamps jiffies, unless the flag is already set. */ static void bh_update_io_error_state(struct buffer_head *bh, const struct bio *bio) +{ + const enum req_op op = bio_op(bio); + + if (op != REQ_OP_READ && op != REQ_OP_WRITE) + return; + + /* + * Track non-readahead read failures (timestamped) so submit_bh() can + * fail repeated reads fast. A successful read or rewrite clears the + * state. + */ + if (!bio->bi_status) { + clear_buffer_read_io_error(bh); + bh->b_err_timestamp = 0; + return; + } + + /* Record the first failure; don't extend the window on repeats. */ + if (op != REQ_OP_READ || (bio->bi_opf & REQ_RAHEAD) || + buffer_read_io_error(bh)) + return; + + set_buffer_read_io_error(bh); + bh->b_err_timestamp = jiffies; +} + static void end_bio_bh_io_sync(struct bio *bio) { struct buffer_head *bh = bio->bi_private; @@ -2670,10 +2700,37 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); + bh_update_io_error_state(bh, bio); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } +/* Decide whether a read of bh is completed as failed without submitting a bio. Returns false when bd_read_err_retry_sec of bh->b_bdev is 0 or bh has no recorded read error. Returns true, after calling bh->b_end_io(bh, 0), while jiffies is still before the recorded timestamp plus the configured seconds. Otherwise the recorded state is cleared and false is returned. NOTE(review): b_err_timestamp == 0 doubles as the unset marker, so a failure stamped when jiffies == 0 would be treated as stale; confirm this is acceptable. */ static bool bh_failfast_read(struct buffer_head *bh) +{ + unsigned long retry_sec = READ_ONCE(bh->b_bdev->bd_read_err_retry_sec); + + if (!retry_sec || !buffer_read_io_error(bh)) + return false; + + /* No timestamp: treat as stale state and re-arm on the next failure. 
*/ + if (!bh->b_err_timestamp) { + clear_buffer_read_io_error(bh); + return false; + } + + if (time_before(jiffies, + bh->b_err_timestamp + secs_to_jiffies(retry_sec))) { + test_set_buffer_req(bh); + bh->b_end_io(bh, 0); /* same second argument end_bio_bh_io_sync() passes for a failed bio */ + return true; + } + + clear_buffer_read_io_error(bh); + bh->b_err_timestamp = 0; + return false; +} + static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, gfp_t gfp_mask) { @@ -2702,6 +2759,14 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, BUG_ON(buffer_delay(bh)); BUG_ON(buffer_unwritten(bh)); + /* + * Fail fast for repeated non-readahead buffer_head reads after a recent + * I/O error. This avoids serializing many callers on BH_Lock while + * re-submitting the same failing read. + */ + if (op == REQ_OP_READ && !(opf & REQ_RAHEAD) && bh_failfast_read(bh)) + return; + /* * Only clear out a write error when rewriting */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8808ee76e73c0..9437c471ee7d7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -69,6 +69,9 @@ struct block_device { atomic_t bd_fsfreeze_count; /* number of freeze requests */ struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ + /* Seconds; 0 disables read fail-fast window for submit_bh(READ). 
*/ + unsigned long bd_read_err_retry_sec; + struct partition_meta_info *bd_meta_info; int bd_writers; #ifdef CONFIG_SECURITY diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e4939e33b4b51..3ab36429f8f38 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -29,6 +29,7 @@ enum bh_state_bits { BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ + BH_Read_EIO, /* I/O error on read */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_Meta, /* Buffer contains metadata */ @@ -79,6 +80,7 @@ struct buffer_head { spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to * serialise IO completion of other * buffers in the page */ + unsigned long b_err_timestamp; /* jiffies when a read I/O error was recorded; 0 when none is recorded */ }; /* @@ -132,11 +134,18 @@ BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) +BUFFER_FNS(Read_EIO, read_io_error) BUFFER_FNS(Unwritten, unwritten) BUFFER_FNS(Meta, meta) BUFFER_FNS(Prio, prio) BUFFER_FNS(Defer_Completion, defer_completion) +static inline void clear_buffer_read_io_error_state(struct buffer_head *bh) +{ + clear_buffer_read_io_error(bh); + bh->b_err_timestamp = 0; +} + static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { /* @@ -411,6 +420,7 @@ map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) bh->b_bdev = sb->s_bdev; bh->b_blocknr = block; bh->b_size = sb->s_blocksize; + clear_buffer_read_io_error_state(bh); } static inline void wait_on_buffer(struct buffer_head *bh)