Hi,
Interest[1] has been shown in multipage bvecs, so this patchset
tries to prepare for that support and does two things:
1) the first 4 patches use the bvec iterator to implement iterate_bvec(),
so that we can drop the non-standard way of iterating over bvecs
2) remove BIO_MAX_SECTORS & BIO_MAX_SIZE, since there is now only
one user of each. Once multipage bvecs are introduced, one bio
may hold lots of sectors, and we should always use something like BIO_MAX_VECS,
which should be introduced in the future and is similar to the current
BIO_MAX_PAGES.
xfstests (-a auto) has been run with this patchset applied on top of
Linus' v4.5+ tree, and no regressions were found.
drivers/block/drbd/drbd_int.h | 4 +--
fs/xfs/xfs_buf.c | 2 +-
include/linux/bio.h | 55 +------------------------------
include/linux/blk_types.h | 4 +--
include/linux/bvec_iter.h | 77 +++++++++++++++++++++++++++++++++++++++++++
lib/iov_iter.c | 31 +++++++----------
6 files changed, 94 insertions(+), 79 deletions(-)
[1], http://marc.info/?w=2&r=1&s=++[LSF%2FMM+ATTEND]+block%3A+multipage+bvecs&q=t
Thanks,
Ming
The bvec iterator helpers should also be used to implement
iterate_bvec() in lib/iov_iter.c, so move them into
their own header; that way the bvec iterator header can be kept
outside of CONFIG_BLOCK. Then we can stop reinventing the
wheel in iterate_bvec().
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/bio.h | 55 +--------------------------------
include/linux/bvec_iter.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 79 insertions(+), 54 deletions(-)
create mode 100644 include/linux/bvec_iter.h
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 88bc64f..4abc129 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -31,6 +31,7 @@
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#include <linux/blk_types.h>
+#include <linux/bvec_iter.h>
#define BIO_DEBUG
@@ -40,10 +41,6 @@
#define BIO_BUG_ON
#endif
-#define BIO_MAX_PAGES 256
-#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
-#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
-
/*
* upper 16 bits of bi_rw define the io priority of this bio
*/
@@ -57,29 +54,6 @@
(bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
} while (0)
-/*
- * various member access, note that bio_data should of course not be used
- * on highmem page vectors
- */
-#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx])
-
-#define bvec_iter_page(bvec, iter) \
- (__bvec_iter_bvec((bvec), (iter))->bv_page)
-
-#define bvec_iter_len(bvec, iter) \
- min((iter).bi_size, \
- __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
-
-#define bvec_iter_offset(bvec, iter) \
- (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
-
-#define bvec_iter_bvec(bvec, iter) \
-((struct bio_vec) { \
- .bv_page = bvec_iter_page((bvec), (iter)), \
- .bv_len = bvec_iter_len((bvec), (iter)), \
- .bv_offset = bvec_iter_offset((bvec), (iter)), \
-})
-
#define bio_iter_iovec(bio, iter) \
bvec_iter_bvec((bio)->bi_io_vec, (iter))
@@ -193,33 +167,6 @@ static inline void *bio_data(struct bio *bio)
#define bio_for_each_segment_all(bvl, bio, i) \
for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
-static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter,
- unsigned bytes)
-{
- WARN_ONCE(bytes > iter->bi_size,
- "Attempted to advance past end of bvec iter\n");
-
- while (bytes) {
- unsigned len = min(bytes, bvec_iter_len(bv, *iter));
-
- bytes -= len;
- iter->bi_size -= len;
- iter->bi_bvec_done += len;
-
- if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) {
- iter->bi_bvec_done = 0;
- iter->bi_idx++;
- }
- }
-}
-
-#define for_each_bvec(bvl, bio_vec, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
- bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
-
-
static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
unsigned bytes)
{
diff --git a/include/linux/bvec_iter.h b/include/linux/bvec_iter.h
new file mode 100644
index 0000000..cc43055
--- /dev/null
+++ b/include/linux/bvec_iter.h
@@ -0,0 +1,78 @@
+/*
+ * bvec iterator
+ *
+ * Copyright (C) 2001 Ming Lei <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef __LINUX_BVEC_ITER_H
+#define __LINUX_BVEC_ITER_H
+
+#include <linux/blk_types.h>
+
+#define BIO_MAX_PAGES 256
+#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
+#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
+
+/*
+ * various member access, note that bio_data should of course not be used
+ * on highmem page vectors
+ */
+#define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx])
+
+#define bvec_iter_page(bvec, iter) \
+ (__bvec_iter_bvec((bvec), (iter))->bv_page)
+
+#define bvec_iter_len(bvec, iter) \
+ min((iter).bi_size, \
+ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
+
+#define bvec_iter_offset(bvec, iter) \
+ (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
+
+#define bvec_iter_bvec(bvec, iter) \
+((struct bio_vec) { \
+ .bv_page = bvec_iter_page((bvec), (iter)), \
+ .bv_len = bvec_iter_len((bvec), (iter)), \
+ .bv_offset = bvec_iter_offset((bvec), (iter)), \
+})
+
+static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter,
+ unsigned bytes)
+{
+ WARN_ONCE(bytes > iter->bi_size,
+ "Attempted to advance past end of bvec iter\n");
+
+ while (bytes) {
+ unsigned len = min(bytes, bvec_iter_len(bv, *iter));
+
+ bytes -= len;
+ iter->bi_size -= len;
+ iter->bi_bvec_done += len;
+
+ if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) {
+ iter->bi_bvec_done = 0;
+ iter->bi_idx++;
+ }
+ }
+}
+
+#define for_each_bvec(bvl, bio_vec, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \
+ bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len))
+
+#endif /* __LINUX_BVEC_ITER_H */
--
1.9.1
bvec_iter_advance() only writes to its iterator parameter,
so the base address of the bvec array can safely be marked as const.
Without this change, a compiler warning is triggered by the
following patch, which implements iterate_bvec() in lib/iov_iter.c
with the bvec iterator.
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/bvec_iter.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/linux/bvec_iter.h b/include/linux/bvec_iter.h
index cc43055..5798c21 100644
--- a/include/linux/bvec_iter.h
+++ b/include/linux/bvec_iter.h
@@ -49,7 +49,8 @@
.bv_offset = bvec_iter_offset((bvec), (iter)), \
})
-static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter,
+static inline void bvec_iter_advance(const struct bio_vec *bv,
+ struct bvec_iter *iter,
unsigned bytes)
{
WARN_ONCE(bytes > iter->bi_size,
--
1.9.1
bvec already provides an iterator, so it is not necessary
to reinvent the wheel for this job.
Signed-off-by: Ming Lei <[email protected]>
---
lib/iov_iter.c | 31 +++++++++++--------------------
1 file changed, 11 insertions(+), 20 deletions(-)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5fecddc..5e1b224 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -3,6 +3,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/bvec_iter.h>
#include <net/checksum.h>
#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
@@ -57,35 +58,25 @@
}
#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
- size_t wanted = n; \
+ struct bvec_iter __bi, __start; \
+ __start.bi_size = n; \
+ __start.bi_bvec_done = skip; \
+ __start.bi_idx = 0; \
__p = i->bvec; \
- __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
- if (likely(__v.bv_len)) { \
- __v.bv_page = __p->bv_page; \
- __v.bv_offset = __p->bv_offset + skip; \
+ for_each_bvec(__v, __p, __bi, __start) { \
(void)(STEP); \
- skip += __v.bv_len; \
- n -= __v.bv_len; \
} \
- while (unlikely(n)) { \
- __p++; \
- __v.bv_len = min_t(size_t, n, __p->bv_len); \
- if (unlikely(!__v.bv_len)) \
- continue; \
- __v.bv_page = __p->bv_page; \
- __v.bv_offset = __p->bv_offset; \
- (void)(STEP); \
+ if (!__bi.bi_idx) \
+ skip += __v.bv_len; \
+ else \
skip = __v.bv_len; \
- n -= __v.bv_len; \
- } \
- n = wanted; \
}
#define iterate_all_kinds(i, n, v, I, B, K) { \
size_t skip = i->iov_offset; \
if (unlikely(i->type & ITER_BVEC)) { \
const struct bio_vec *bvec; \
- struct bio_vec v; \
+ struct bio_vec v = { 0 }; \
iterate_bvec(i, n, v, bvec, skip, (B)) \
} else if (unlikely(i->type & ITER_KVEC)) { \
const struct kvec *kvec; \
@@ -102,7 +93,7 @@
size_t skip = i->iov_offset; \
if (unlikely(i->type & ITER_BVEC)) { \
const struct bio_vec *bvec; \
- struct bio_vec v; \
+ struct bio_vec v = { 0 }; \
iterate_bvec(i, n, v, bvec, skip, (B)) \
if (skip == bvec->bv_len) { \
bvec++; \
--
1.9.1
BIO_MAX_PAGES is used as maximum count of bvecs, so
replace BIO_MAX_SECTORS with BIO_MAX_PAGES since
BIO_MAX_SECTORS is to be removed.
Signed-off-by: Ming Lei <[email protected]>
---
fs/xfs/xfs_buf.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9a2191b..01ef6d2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1161,7 +1161,7 @@ xfs_buf_ioapply_map(
next_chunk:
atomic_inc(&bp->b_io_remaining);
- nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+ nr_pages = BIO_MAX_PAGES;
if (nr_pages > total_nr_pages)
nr_pages = total_nr_pages;
--
1.9.1
We will use the bvec iterator to implement iterate_bvec() in lib/iov_iter.c, so struct bvec_iter must be available even without CONFIG_BLOCK; move it out of the #ifdef.
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/blk_types.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 86a38ea..fd8527b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -26,8 +26,6 @@ struct bio_vec {
unsigned int bv_offset;
};
-#ifdef CONFIG_BLOCK
-
struct bvec_iter {
sector_t bi_sector; /* device address in 512 byte
sectors */
@@ -39,6 +37,8 @@ struct bvec_iter {
current bvec */
};
+#ifdef CONFIG_BLOCK
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
--
1.9.1
No one needs this macro now, so remove it. The motivation is
to support multipage bvecs, in which case we only know
the max count of bvecs supported in a bio,
not the max bio size.
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/bvec_iter.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/include/linux/bvec_iter.h b/include/linux/bvec_iter.h
index a979690..8ddaacf 100644
--- a/include/linux/bvec_iter.h
+++ b/include/linux/bvec_iter.h
@@ -23,7 +23,6 @@
#include <linux/blk_types.h>
#define BIO_MAX_PAGES 256
-#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
/*
* various member access, note that bio_data should of course not be used
--
1.9.1
drbd is the only user of BIO_MAX_SIZE, so use BIO_MAX_PAGES
instead.
Signed-off-by: Ming Lei <[email protected]>
---
drivers/block/drbd/drbd_int.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index c227fd4..10bfff1 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1327,14 +1327,14 @@ struct bm_extent {
#endif
#endif
-/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+/* Estimate max bio size as 256 * PAGE_CACHE_SIZE,
* so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
* Since we may live in a mixed-platform cluster,
* we limit us to a platform agnostic constant here for now.
* A followup commit may allow even bigger BIO sizes,
* once we thought that through. */
#define DRBD_MAX_BIO_SIZE (1U << 20)
-#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#endif
#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
--
1.9.1
No one needs this macro, so remove it. The motivation is
to support multipage bvecs, in which case we only know
the max count of bvecs supported in a bio,
not the max size or max sectors.
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/bvec_iter.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/include/linux/bvec_iter.h b/include/linux/bvec_iter.h
index 5798c21..a979690 100644
--- a/include/linux/bvec_iter.h
+++ b/include/linux/bvec_iter.h
@@ -24,7 +24,6 @@
#define BIO_MAX_PAGES 256
#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
-#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
/*
* various member access, note that bio_data should of course not be used
--
1.9.1
On Tue, Mar 22, 2016 at 2:12 PM, Ming Lei <[email protected]> wrote:
> Hi,
>
> Interest[1] has been shown in multipage bvecs, so this patchset
> tries to prepare for that support and does two things:
>
> 1) the first 4 patches use the bvec iterator to implement iterate_bvec(),
> so that we can drop the non-standard way of iterating over bvecs
>
> 2) remove BIO_MAX_SECTORS & BIO_MAX_SIZE, since there is now only
> one user of each. Once multipage bvecs are introduced, one bio
> may hold lots of sectors, and we should always use something like BIO_MAX_VECS,
> which should be introduced in the future and is similar to the current
> BIO_MAX_PAGES.
>
> xfstests(-a auto) have been run and no regression found by this
> patchset against linus v4.5+.
Hi Jens,
It looks like no one objects to this patchset, and the change to iov_iter.c
is a good cleanup too, so what do you think of the patchset?
Thanks,
Ming
>
> drivers/block/drbd/drbd_int.h | 4 +--
> fs/xfs/xfs_buf.c | 2 +-
> include/linux/bio.h | 55 +------------------------------
> include/linux/blk_types.h | 4 +--
> include/linux/bvec_iter.h | 77 +++++++++++++++++++++++++++++++++++++++++++
> lib/iov_iter.c | 31 +++++++----------
> 6 files changed, 94 insertions(+), 79 deletions(-)
>
>
> [1], http://marc.info/?w=2&r=1&s=++[LSF%2FMM+ATTEND]+block%3A+multipage+bvecs&q=t
>
>
> Thanks,
> Ming
>
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -31,6 +31,7 @@
>
> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
> #include <linux/blk_types.h>
> +#include <linux/bvec_iter.h>
>
> #define BIO_DEBUG
>
> @@ -40,10 +41,6 @@
> #define BIO_BUG_ON
> #endif
>
> -#define BIO_MAX_PAGES 256
> -#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
> -#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
I see no reason why these should be moved out of bio.h.
Otherwise this looks fine to me.
Looks fine,
Reviewed-by: Christoph Hellwig <[email protected]>
Looks fine,
Reviewed-by: Christoph Hellwig <[email protected]>
This looks fine to me, but I'd really like to see Al review it as well.
> + nr_pages = BIO_MAX_PAGES;
> if (nr_pages > total_nr_pages)
> nr_pages = total_nr_pages;
Looks reasonable, but the whole thing could simply become:
nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
Looks fine,
Reviewed-by: Christoph Hellwig <[email protected]>
On Tue, Mar 22, 2016 at 02:12:28PM +0800, Ming Lei wrote:
> drbd is the only user of BIO_MAX_SIZE, so use BIO_MAX_PAGES
> instead.
That whole code block looks completely bogus to me, although your patch
doesn't make it any worse.
I/O size for a network protocol shouldn't depend on the number of
vectors in a kernel internal structure.
Well, getting rid of BIO_MAX_SIZE is worth it, so:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks fine,
Reviewed-by: Christoph Hellwig <[email protected]>
On Tue, Mar 29, 2016 at 12:31:24AM -0700, Christoph Hellwig wrote:
> On Tue, Mar 22, 2016 at 02:12:28PM +0800, Ming Lei wrote:
> > drbd is the only user of BIO_MAX_SIZE, so use BIO_MAX_PAGES
> > instead.
>
> That whole code block looks completely bogus to me, although your patch
> doesn't make it any worse.
>
> I/O size for a network protocol shouldn't depend on the number of
> vectors in a kernel internal structure.
That's correct. But we needed some limit there.
Initially, up until I changed it like six years ago iirc,
the receiving side would receive into a single bio.
So limiting us to what a single bio could usually handle
seemed like a good idea at the time.
Today, we should be able to handle 128 MiB easily,
maybe more. But that would require a protocol bump
to stay backwards compatible.
The part about "architecture not supported",
if our limit (1 MiB) is bigger than the "system" limit:
Never met that in real life. Probably not even possible.
Just a paranoia on my side: what if.
If that would have happened somewhere,
on some strange architecture or configuration,
I wanted to know about that.
Best way: don't even compile.
> Well, getting rid of BIO_MAX_SIZE is worth it, so:
>
> Reviewed-by: Christoph Hellwig <[email protected]>
Thanks,
Lars Ellenberg
On Tue, Mar 29, 2016 at 3:26 PM, Christoph Hellwig <[email protected]> wrote:
>> --- a/include/linux/bio.h
>> +++ b/include/linux/bio.h
>> @@ -31,6 +31,7 @@
>>
>> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
>> #include <linux/blk_types.h>
>> +#include <linux/bvec_iter.h>
>>
>> #define BIO_DEBUG
>>
>> @@ -40,10 +41,6 @@
>> #define BIO_BUG_ON
>> #endif
>>
>> -#define BIO_MAX_PAGES 256
>> -#define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
>> -#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
>
> I see no reason why these should be moved out of bio.h.
You are right, these should be kept in bio.h.
Thanks for your review!
>
> Otherwise this looks fine to me.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html