The original behaviour is to refuse to add a new page if the maximum number
of segments has been reached, regardless of whether the page we are
going to add can be merged into the last segment.
Unfortunately, when the system runs under heavy memory fragmentation conditions,
a driver may try to add multiple pages to the last segment.
The original code won't accept them and EBUSY will be reported to
userspace.
This patch modifies the function so that it refuses to add a page
only if that page starts a new segment and the maximum number
of segments has already been reached.
The bug can be easily reproduced with the st driver:
1) set CONFIG_SCSI_MPT2SAS_MAX_SGE or CONFIG_SCSI_MPT3SAS_MAX_SGE to 16
2) modprobe st buffer_kbs=1024
3) #dd if=/dev/zero of=/dev/st0 bs=1M count=10
dd: error writing ‘/dev/st0’: Device or resource busy
Signed-off-by: Maurizio Lombardi <[email protected]>
---
fs/bio.c | 50 ++++++++++++++++++++++++++++----------------------
1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/fs/bio.c b/fs/bio.c
index 6f0362b..9a3a0b1 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -750,29 +750,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
return 0;
/*
- * we might lose a segment or two here, but rather that than
- * make this too complex.
+ * setup the new entry, we might clear it again later if we
+ * cannot add the page
+ */
+ bvec = &bio->bi_io_vec[bio->bi_vcnt];
+ bvec->bv_page = page;
+ bvec->bv_len = len;
+ bvec->bv_offset = offset;
+ bio->bi_vcnt++;
+ bio->bi_phys_segments++;
+
+ /*
+ * Perform a recount if the number of segments is greater
+ * than queue_max_segments(q).
*/
- while (bio->bi_phys_segments >= queue_max_segments(q)) {
+ while (bio->bi_phys_segments > queue_max_segments(q)) {
if (retried_segments)
- return 0;
+ goto failed;
retried_segments = 1;
blk_recount_segments(q, bio);
}
/*
- * setup the new entry, we might clear it again later if we
- * cannot add the page
- */
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
-
- /*
* if queue has other restrictions (eg varying max sector size
* depending on offset), it can specify a merge_bvec_fn in the
* queue to get further control
@@ -789,23 +791,27 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
- bvec->bv_page = NULL;
- bvec->bv_len = 0;
- bvec->bv_offset = 0;
- return 0;
- }
+ if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
+ goto failed;
}
/* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+ if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio->bi_flags &= ~(1 << BIO_SEG_VALID);
- bio->bi_vcnt++;
- bio->bi_phys_segments++;
done:
bio->bi_iter.bi_size += len;
return len;
+
+ failed:
+ bvec->bv_page = NULL;
+ bvec->bv_len = 0;
+ bvec->bv_offset = 0;
+ bio->bi_vcnt--;
+ if (!retried_segments)
+ bio->bi_phys_segments--;
+
+ return 0;
}
/**
--
Maurizio Lombardi
Sorry, I made a mistake in this patch: on failure I should restore the original value
of bi_phys_segments.
I'm going to send a new version.
Maurizio Lombardi
On Tue, Apr 29, 2014 at 04:58:18PM +0200, Maurizio Lombardi wrote:
> The original behaviour is to refuse to add a new page if the maximum number
> of segments has been reached, regardless of the fact the page we are
> going to add can be merged into the last segment or not.
>
> Unfortunately, when the system runs under heavy memory fragmentation conditions,
> a driver may try to add multiple pages to the last segment.
> The original code won't accept them and EBUSY will be reported to
> userspace.
>
> This patch modifies the function so it refuses to add a page
> only in case the latter starts a new segment and the maximum number
> of segments has already been reached.
>
> The bug can be easily reproduced with the st driver:
>
> 1) set CONFIG_SCSI_MPT2SAS_MAX_SGE or CONFIG_SCSI_MPT3SAS_MAX_SGE to 16
> 2) modprobe st buffer_kbs=1024
> 3) #dd if=/dev/zero of=/dev/st0 bs=1M count=10
> dd: error writing ‘/dev/st0’: Device or resource busy
>
> Signed-off-by: Maurizio Lombardi <[email protected]>
> ---
> fs/bio.c | 50 ++++++++++++++++++++++++++++----------------------
> 1 file changed, 28 insertions(+), 22 deletions(-)
>
> diff --git a/fs/bio.c b/fs/bio.c
> index 6f0362b..9a3a0b1 100644
> --- a/fs/bio.c
> +++ b/fs/bio.c
> @@ -750,29 +750,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
> return 0;
>
> /*
> - * we might lose a segment or two here, but rather that than
> - * make this too complex.
> + * setup the new entry, we might clear it again later if we
> + * cannot add the page
> + */
> + bvec = &bio->bi_io_vec[bio->bi_vcnt];
> + bvec->bv_page = page;
> + bvec->bv_len = len;
> + bvec->bv_offset = offset;
> + bio->bi_vcnt++;
> + bio->bi_phys_segments++;
> +
> + /*
> + * Perform a recount if the number of segments is greater
> + * than queue_max_segments(q).
> */
>
> - while (bio->bi_phys_segments >= queue_max_segments(q)) {
> + while (bio->bi_phys_segments > queue_max_segments(q)) {
>
> if (retried_segments)
> - return 0;
> + goto failed;
>
> retried_segments = 1;
> blk_recount_segments(q, bio);
> }
>
> /*
> - * setup the new entry, we might clear it again later if we
> - * cannot add the page
> - */
> - bvec = &bio->bi_io_vec[bio->bi_vcnt];
> - bvec->bv_page = page;
> - bvec->bv_len = len;
> - bvec->bv_offset = offset;
> -
> - /*
> * if queue has other restrictions (eg varying max sector size
> * depending on offset), it can specify a merge_bvec_fn in the
> * queue to get further control
> @@ -789,23 +791,27 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
> * merge_bvec_fn() returns number of bytes it can accept
> * at this offset
> */
> - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
> - bvec->bv_page = NULL;
> - bvec->bv_len = 0;
> - bvec->bv_offset = 0;
> - return 0;
> - }
> + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
> + goto failed;
> }
>
> /* If we may be able to merge these biovecs, force a recount */
> - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
> + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
> bio->bi_flags &= ~(1 << BIO_SEG_VALID);
>
> - bio->bi_vcnt++;
> - bio->bi_phys_segments++;
> done:
> bio->bi_iter.bi_size += len;
> return len;
> +
> + failed:
> + bvec->bv_page = NULL;
> + bvec->bv_len = 0;
> + bvec->bv_offset = 0;
> + bio->bi_vcnt--;
> + if (!retried_segments)
> + bio->bi_phys_segments--;
> +
> + return 0;
> }
>
> /**
> --
> Maurizio Lombardi
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html