On Thu, Apr 28, 2022 at 5:44 PM NeilBrown <[email protected]> wrote:
>
> Pages passed to swap_readpage()/swap_writepage() are not necessarily all
> the same size - there may be transparent-huge-pages involved.
>
> The BIO paths of swap_*page() handle this correctly, but the SWP_FS_OPS
> path does not.
>
> So we need to use thp_size() to find the size, not just assume
> PAGE_SIZE, and we need to track the total length of the request, not
> just assume it is "pages * PAGE_SIZE".
Swap-over-nfs doesn't support THP swap IIUC. So SWP_FS_OPS should not
see THP at all. But I agree to remove the assumption about page size
in this path.
>
> Reported-by: Miaohe Lin <[email protected]>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> mm/page_io.c | 23 +++++++++++++----------
> 1 file changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index c132511f521c..d636a3531cad 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -239,6 +239,7 @@ struct swap_iocb {
> struct kiocb iocb;
> struct bio_vec bvec[SWAP_CLUSTER_MAX];
> int pages;
> + int len;
> };
> static mempool_t *sio_pool;
>
> @@ -261,7 +262,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
> struct page *page = sio->bvec[0].bv_page;
> int p;
>
> - if (ret != PAGE_SIZE * sio->pages) {
> + if (ret != sio->len) {
> /*
> * In the case of swap-over-nfs, this can be a
> * temporary failure if the system has limited
> @@ -301,7 +302,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
> sio = *wbc->swap_plug;
> if (sio) {
> if (sio->iocb.ki_filp != swap_file ||
> - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
> + sio->iocb.ki_pos + sio->len != pos) {
> swap_write_unplug(sio);
> sio = NULL;
> }
> @@ -312,10 +313,12 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
> sio->iocb.ki_complete = sio_write_complete;
> sio->iocb.ki_pos = pos;
> sio->pages = 0;
> + sio->len = 0;
> }
> sio->bvec[sio->pages].bv_page = page;
> - sio->bvec[sio->pages].bv_len = PAGE_SIZE;
> + sio->bvec[sio->pages].bv_len = thp_size(page);
> sio->bvec[sio->pages].bv_offset = 0;
> + sio->len += thp_size(page);
> sio->pages += 1;
> if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
> swap_write_unplug(sio);
> @@ -371,8 +374,7 @@ void swap_write_unplug(struct swap_iocb *sio)
> struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
> int ret;
>
> - iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages,
> - PAGE_SIZE * sio->pages);
> + iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
> ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
> if (ret != -EIOCBQUEUED)
> sio_write_complete(&sio->iocb, ret);
> @@ -383,7 +385,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
> struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> int p;
>
> - if (ret == PAGE_SIZE * sio->pages) {
> + if (ret == sio->len) {
> for (p = 0; p < sio->pages; p++) {
> struct page *page = sio->bvec[p].bv_page;
>
> @@ -415,7 +417,7 @@ static void swap_readpage_fs(struct page *page,
> sio = *plug;
> if (sio) {
> if (sio->iocb.ki_filp != sis->swap_file ||
> - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
> + sio->iocb.ki_pos + sio->len != pos) {
> swap_read_unplug(sio);
> sio = NULL;
> }
> @@ -426,10 +428,12 @@ static void swap_readpage_fs(struct page *page,
> sio->iocb.ki_pos = pos;
> sio->iocb.ki_complete = sio_read_complete;
> sio->pages = 0;
> + sio->len = 0;
> }
> sio->bvec[sio->pages].bv_page = page;
> - sio->bvec[sio->pages].bv_len = PAGE_SIZE;
> + sio->bvec[sio->pages].bv_len = thp_size(page);
> sio->bvec[sio->pages].bv_offset = 0;
> + sio->len += thp_size(page);
> sio->pages += 1;
> if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
> swap_read_unplug(sio);
> @@ -521,8 +525,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
> struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
> int ret;
>
> - iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
> - PAGE_SIZE * sio->pages);
> + iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
> ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
> if (ret != -EIOCBQUEUED)
> sio_read_complete(&sio->iocb, ret);
>
>
>
On Sat, 30 Apr 2022, Yang Shi wrote:
> On Thu, Apr 28, 2022 at 5:44 PM NeilBrown <[email protected]> wrote:
> >
> > Pages passed to swap_readpage()/swap_writepage() are not necessarily all
> > the same size - there may be transparent-huge-pages involved.
> >
> > The BIO paths of swap_*page() handle this correctly, but the SWP_FS_OPS
> > path does not.
> >
> > So we need to use thp_size() to find the size, not just assume
> > PAGE_SIZE, and we need to track the total length of the request, not
> > just assume it is "pages * PAGE_SIZE".
>
> Swap-over-nfs doesn't support THP swap IIUC. So SWP_FS_OPS should not
> see THP at all. But I agree to remove the assumption about page size
> in this path.
Can you help me understand this, please?  How would the swap code know
that swap-over-NFS doesn't support THP swap?  There is no reason that
NFS wouldn't be able to handle 2MB writes.  Even 1GB should work, though
NFS would have to split it into several smaller WRITE requests.
Thanks,
NeilBrown
>
> >
> > Reported-by: Miaohe Lin <[email protected]>
> > Signed-off-by: NeilBrown <[email protected]>
> > ---
> > mm/page_io.c | 23 +++++++++++++----------
> > 1 file changed, 13 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index c132511f521c..d636a3531cad 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -239,6 +239,7 @@ struct swap_iocb {
> > struct kiocb iocb;
> > struct bio_vec bvec[SWAP_CLUSTER_MAX];
> > int pages;
> > + int len;
> > };
> > static mempool_t *sio_pool;
> >
> > @@ -261,7 +262,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
> > struct page *page = sio->bvec[0].bv_page;
> > int p;
> >
> > - if (ret != PAGE_SIZE * sio->pages) {
> > + if (ret != sio->len) {
> > /*
> > * In the case of swap-over-nfs, this can be a
> > * temporary failure if the system has limited
> > @@ -301,7 +302,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
> > sio = *wbc->swap_plug;
> > if (sio) {
> > if (sio->iocb.ki_filp != swap_file ||
> > - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
> > + sio->iocb.ki_pos + sio->len != pos) {
> > swap_write_unplug(sio);
> > sio = NULL;
> > }
> > @@ -312,10 +313,12 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
> > sio->iocb.ki_complete = sio_write_complete;
> > sio->iocb.ki_pos = pos;
> > sio->pages = 0;
> > + sio->len = 0;
> > }
> > sio->bvec[sio->pages].bv_page = page;
> > - sio->bvec[sio->pages].bv_len = PAGE_SIZE;
> > + sio->bvec[sio->pages].bv_len = thp_size(page);
> > sio->bvec[sio->pages].bv_offset = 0;
> > + sio->len += thp_size(page);
> > sio->pages += 1;
> > if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
> > swap_write_unplug(sio);
> > @@ -371,8 +374,7 @@ void swap_write_unplug(struct swap_iocb *sio)
> > struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
> > int ret;
> >
> > - iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages,
> > - PAGE_SIZE * sio->pages);
> > + iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
> > ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
> > if (ret != -EIOCBQUEUED)
> > sio_write_complete(&sio->iocb, ret);
> > @@ -383,7 +385,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
> > struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> > int p;
> >
> > - if (ret == PAGE_SIZE * sio->pages) {
> > + if (ret == sio->len) {
> > for (p = 0; p < sio->pages; p++) {
> > struct page *page = sio->bvec[p].bv_page;
> >
> > @@ -415,7 +417,7 @@ static void swap_readpage_fs(struct page *page,
> > sio = *plug;
> > if (sio) {
> > if (sio->iocb.ki_filp != sis->swap_file ||
> > - sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
> > + sio->iocb.ki_pos + sio->len != pos) {
> > swap_read_unplug(sio);
> > sio = NULL;
> > }
> > @@ -426,10 +428,12 @@ static void swap_readpage_fs(struct page *page,
> > sio->iocb.ki_pos = pos;
> > sio->iocb.ki_complete = sio_read_complete;
> > sio->pages = 0;
> > + sio->len = 0;
> > }
> > sio->bvec[sio->pages].bv_page = page;
> > - sio->bvec[sio->pages].bv_len = PAGE_SIZE;
> > + sio->bvec[sio->pages].bv_len = thp_size(page);
> > sio->bvec[sio->pages].bv_offset = 0;
> > + sio->len += thp_size(page);
> > sio->pages += 1;
> > if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
> > swap_read_unplug(sio);
> > @@ -521,8 +525,7 @@ void __swap_read_unplug(struct swap_iocb *sio)
> > struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
> > int ret;
> >
> > - iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
> > - PAGE_SIZE * sio->pages);
> > + iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
> > ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
> > if (ret != -EIOCBQUEUED)
> > sio_read_complete(&sio->iocb, ret);
> >
> >
> >
>