Taken standalone, this patchset is an optimization that leverages the
FAULT_FLAG_ALLOW_RETRY page fault flag, which allows the page fault
paths to drop the mmap_sem before starting I/O.
For the userfaultfd patchset this series is instead a dependency: that
flag must always be set the first time any thread attempts a page
fault, so the mmap_sem can be released before the page fault is
stopped (while waiting for a later userland wakeup).
http://thread.gmane.org/gmane.linux.kernel.mm/123575
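For reference, here is a rough sketch (not part of this series) of the
retry contract that FAULT_FLAG_ALLOW_RETRY establishes between a gup
caller and the fault path; the handle_mm_fault() arguments follow the
in-tree fault API and the snippet is illustrative only:

	/* illustrative only: the caller holds mmap_sem for read */
	ret = handle_mm_fault(mm, vma, address, FAULT_FLAG_ALLOW_RETRY);
	if (ret & VM_FAULT_RETRY) {
		/*
		 * The fault path released the mmap_sem before waiting on
		 * the I/O.  Retake it and retry once, this time telling
		 * the fault path not to drop the lock again.
		 */
		down_read(&mm->mmap_sem);
		ret = handle_mm_fault(mm, vma, address, FAULT_FLAG_TRIED);
	}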
Andrea Arcangeli (5):
mm: gup: add get_user_pages_locked and get_user_pages_unlocked
mm: gup: add __get_user_pages_unlocked to customize gup_flags
mm: gup: use get_user_pages_unlocked within get_user_pages_fast
mm: gup: use get_user_pages_unlocked
mm: gup: kvm use get_user_pages_unlocked
arch/mips/mm/gup.c | 8 +-
arch/powerpc/mm/gup.c | 6 +-
arch/s390/mm/gup.c | 6 +-
arch/sh/mm/gup.c | 6 +-
arch/sparc/mm/gup.c | 6 +-
arch/x86/mm/gup.c | 7 +-
drivers/iommu/amd_iommu_v2.c | 6 +-
drivers/media/pci/ivtv/ivtv-udma.c | 6 +-
drivers/scsi/st.c | 7 +-
drivers/video/fbdev/pvr2fb.c | 6 +-
include/linux/kvm_host.h | 11 --
include/linux/mm.h | 11 ++
mm/gup.c | 203 ++++++++++++++++++++++++++++++++++---
mm/nommu.c | 33 ++++++
mm/process_vm_access.c | 7 +-
mm/util.c | 10 +-
net/ceph/pagevec.c | 6 +-
virt/kvm/async_pf.c | 2 +-
virt/kvm/kvm_main.c | 50 +--------
19 files changed, 265 insertions(+), 132 deletions(-)
This allows those get_user_pages calls to pass FAULT_FLAG_ALLOW_RETRY
to the page fault in order to release the mmap_sem during the I/O.
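In each caller the change is the same mechanical substitution
(illustrative form of the hunks below, with the actual arguments kept
per call site):

	-	down_read(&mm->mmap_sem);
	-	ret = get_user_pages(tsk, mm, start, nr, write, force, pages, NULL);
	-	up_read(&mm->mmap_sem);
	+	ret = get_user_pages_unlocked(tsk, mm, start, nr, write, force, pages);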
Signed-off-by: Andrea Arcangeli <[email protected]>
---
drivers/iommu/amd_iommu_v2.c | 6 ++----
drivers/media/pci/ivtv/ivtv-udma.c | 6 ++----
drivers/scsi/st.c | 7 ++-----
drivers/video/fbdev/pvr2fb.c | 6 ++----
mm/process_vm_access.c | 7 ++-----
net/ceph/pagevec.c | 6 ++----
6 files changed, 12 insertions(+), 26 deletions(-)
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 90d734b..4cd8a87 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -521,10 +521,8 @@ static void do_fault(struct work_struct *work)
write = !!(fault->flags & PPR_FAULT_WRITE);
- down_read(&fault->state->mm->mmap_sem);
- npages = get_user_pages(NULL, fault->state->mm,
- fault->address, 1, write, 0, &page, NULL);
- up_read(&fault->state->mm->mmap_sem);
+ npages = get_user_pages_unlocked(NULL, fault->state->mm,
+ fault->address, 1, write, 0, &page);
if (npages == 1) {
put_page(page);
diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c
index 7338cb2..96d866b 100644
--- a/drivers/media/pci/ivtv/ivtv-udma.c
+++ b/drivers/media/pci/ivtv/ivtv-udma.c
@@ -124,10 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
}
/* Get user pages for DMA Xfer */
- down_read(&current->mm->mmap_sem);
- err = get_user_pages(current, current->mm,
- user_dma.uaddr, user_dma.page_count, 0, 1, dma->map, NULL);
- up_read(&current->mm->mmap_sem);
+ err = get_user_pages_unlocked(current, current->mm,
+ user_dma.uaddr, user_dma.page_count, 0, 1, dma->map);
if (user_dma.page_count != err) {
IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 4daa372..a98e00b 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4538,18 +4538,15 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
return -ENOMEM;
/* Try to fault in all of the necessary pages */
- down_read(&current->mm->mmap_sem);
/* rw==READ means read from drive, write into memory area */
- res = get_user_pages(
+ res = get_user_pages_unlocked(
current,
current->mm,
uaddr,
nr_pages,
rw == READ,
0, /* don't force */
- pages,
- NULL);
- up_read(&current->mm->mmap_sem);
+ pages);
/* Errors and no page mapped should return here */
if (res < nr_pages)
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index 7c74f58..0e24eb9 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,10 +686,8 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
if (!pages)
return -ENOMEM;
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, current->mm, (unsigned long)buf,
- nr_pages, WRITE, 0, pages, NULL);
- up_read(&current->mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, current->mm, (unsigned long)buf,
+ nr_pages, WRITE, 0, pages);
if (ret < nr_pages) {
nr_pages = ret;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afc..b159769 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
size_t bytes;
/* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages = get_user_pages(task, mm, pa, pages,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
+ pages = get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages);
if (pages <= 0)
return -EFAULT;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 5550130..096d914 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -23,17 +23,15 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
if (!pages)
return ERR_PTR(-ENOMEM);
- down_read(&current->mm->mmap_sem);
while (got < num_pages) {
- rc = get_user_pages(current, current->mm,
+ rc = get_user_pages_unlocked(current, current->mm,
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, 0, pages + got, NULL);
+ num_pages - got, write_page, 0, pages + got);
if (rc < 0)
break;
BUG_ON(rc == 0);
got += rc;
}
- up_read(&current->mm->mmap_sem);
if (rc < 0)
goto fail;
return pages;
Some callers (like KVM) may want to set gup_flags like
FOLL_HWPOISON to get a proper -EHWPOISON retval instead of -EFAULT, to
take a more appropriate action if get_user_pages runs into a memory
failure.
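For example, a caller could then do something along these lines (an
illustrative call mirroring the KVM conversion later in this series;
handle_memory_failure() is a hypothetical helper):

	npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
					   write_fault, 0, &page,
					   FOLL_TOUCH | FOLL_HWPOISON);
	if (npages == -EHWPOISON)
		return handle_memory_failure(addr); /* hypothetical */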
Signed-off-by: Andrea Arcangeli <[email protected]>
---
include/linux/mm.h | 4 ++++
mm/gup.c | 44 ++++++++++++++++++++++++++++++++------------
mm/nommu.c | 16 +++++++++++++---
3 files changed, 49 insertions(+), 15 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 99831d9..9a5ada3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1222,6 +1222,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked);
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags);
long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages);
diff --git a/mm/gup.c b/mm/gup.c
index a8521f1..01534ff 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -591,9 +591,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
int write, int force,
struct page **pages,
struct vm_area_struct **vmas,
- int *locked, bool notify_drop)
+ int *locked, bool notify_drop,
+ unsigned int flags)
{
- int flags = FOLL_TOUCH;
long ret, pages_done;
bool lock_dropped;
@@ -707,11 +707,37 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, NULL, locked, true);
+ pages, NULL, locked, true, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
/*
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
+ * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ *
+ * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
+ * caller if required (just like with __get_user_pages). "FOLL_GET",
+ * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
+ * according to the parameters "pages", "write", "force"
+ * respectively.
+ */
+__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
+{
+ long ret;
+ int locked = 1;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, &locked, false, gup_flags);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+/*
* get_user_pages_unlocked() is suitable to replace the form:
*
* down_read(&mm->mmap_sem);
@@ -732,14 +758,8 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- long ret;
- int locked = 1;
- down_read(&mm->mmap_sem);
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, NULL, &locked, false);
- if (locked)
- up_read(&mm->mmap_sem);
- return ret;
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -803,7 +823,7 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int force, struct page **pages, struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, vmas, NULL, false);
+ pages, vmas, NULL, false, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2a63d1f..6f6c752 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -223,9 +223,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- int write, int force, struct page **pages)
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -234,6 +235,15 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, 0);
+}
EXPORT_SYMBOL(get_user_pages_unlocked);
/**
This allows the get_user_pages_fast slow path to release the mmap_sem
before blocking.
Signed-off-by: Andrea Arcangeli <[email protected]>
---
arch/mips/mm/gup.c | 8 +++-----
arch/powerpc/mm/gup.c | 6 ++----
arch/s390/mm/gup.c | 6 ++----
arch/sh/mm/gup.c | 6 ++----
arch/sparc/mm/gup.c | 6 ++----
arch/x86/mm/gup.c | 7 +++----
mm/gup.c | 6 ++----
mm/util.c | 10 ++--------
8 files changed, 18 insertions(+), 37 deletions(-)
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 06ce17c..20884f5 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -301,11 +301,9 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT,
- write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT,
+ write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d874668..b70c34a 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -215,10 +215,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 639fce46..5c586c7 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -235,10 +235,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0)
ret = (ret < 0) ? nr : ret + nr;
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 37458f3..e15f52a 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -257,10 +257,8 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index ae6ce38..2e5c4fc 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -249,10 +249,8 @@ slow:
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 207d9aef..2ab183b 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -388,10 +388,9 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT,
+ write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/gup.c b/mm/gup.c
index 01534ff..a349ae3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1187,10 +1187,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/util.c b/mm/util.c
index fec39d4..f3ef639 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
- int ret;
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start, nr_pages,
- write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
-
- return ret;
+ return get_user_pages_unlocked(current, mm, start, nr_pages,
+ write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
Use the more generic get_user_pages_unlocked which has the additional
benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault
(which allows the first page fault in an unmapped area to be always
able to block indefinitely by being allowed to release the mmap_sem).
Signed-off-by: Andrea Arcangeli <[email protected]>
---
include/linux/kvm_host.h | 11 -----------
virt/kvm/async_pf.c | 2 +-
virt/kvm/kvm_main.c | 50 ++++--------------------------------------------
3 files changed, 5 insertions(+), 58 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea53b04..82c67da 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -199,17 +199,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
-/*
- * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
- * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
- * controls whether we retry the gup one more time to completion in that case.
- * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
- * handler.
- */
-int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, bool write_fault,
- struct page **pagep);
-
enum {
OUTSIDE_GUEST_MODE,
IN_GUEST_MODE,
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 5ff7f7f..44660ae 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work)
might_sleep();
- kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
+ get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
kvm_async_page_present_sync(vcpu, apf);
spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25ffac9..78236ad 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1134,43 +1134,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}
-int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, bool write_fault,
- struct page **pagep)
-{
- int npages;
- int locked = 1;
- int flags = FOLL_TOUCH | FOLL_HWPOISON |
- (pagep ? FOLL_GET : 0) |
- (write_fault ? FOLL_WRITE : 0);
-
- /*
- * If retrying the fault, we get here *not* having allowed the filemap
- * to wait on the page lock. We should now allow waiting on the IO with
- * the mmap semaphore released.
- */
- down_read(&mm->mmap_sem);
- npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
- &locked);
- if (!locked) {
- VM_BUG_ON(npages);
-
- if (!pagep)
- return 0;
-
- /*
- * The previous call has now waited on the IO. Now we can
- * retry and complete. Pass TRIED to ensure we do not re
- * schedule async IO (see e.g. filemap_fault).
- */
- down_read(&mm->mmap_sem);
- npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
- pagep, NULL, NULL);
- }
- up_read(&mm->mmap_sem);
- return npages;
-}
-
static inline int check_user_page_hwpoison(unsigned long addr)
{
int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
@@ -1233,15 +1196,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
up_read(&current->mm->mmap_sem);
- } else {
- /*
- * By now we have tried gup_fast, and possibly async_pf, and we
- * are certainly not atomic. Time to retry the gup, allowing
- * mmap semaphore to be relinquished in the case of IO.
- */
- npages = kvm_get_user_page_io(current, current->mm, addr,
- write_fault, page);
- }
+ } else
+ npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
+ write_fault, 0, page,
+ FOLL_TOUCH|FOLL_HWPOISON);
if (npages != 1)
return npages;
We can leverage the VM_FAULT_RETRY functionality in the page fault
paths better by using either get_user_pages_locked or
get_user_pages_unlocked.
The former allows conversion of get_user_pages invocations that will
have to pass a "&locked" parameter to know if the mmap_sem was dropped
during the call. Example from:
down_read(&mm->mmap_sem);
do_something()
get_user_pages(tsk, mm, ..., pages, NULL);
up_read(&mm->mmap_sem);
to:
int locked = 1;
down_read(&mm->mmap_sem);
do_something()
get_user_pages_locked(tsk, mm, ..., pages, &locked);
if (locked)
up_read(&mm->mmap_sem);
The latter is suitable only as a drop in replacement of the form:
down_read(&mm->mmap_sem);
get_user_pages(tsk, mm, ..., pages, NULL);
up_read(&mm->mmap_sem);
into:
get_user_pages_unlocked(tsk, mm, ..., pages);
Where tsk, mm, the intermediate "..." parameters and "pages" can be any
value as before. Just the last parameter of get_user_pages (vmas) must
be NULL for get_user_pages_locked|unlocked to be usable (the latter
original form wouldn't have been safe anyway if vmas wasn't NULL; for
the former we just make it explicit by dropping the parameter).
If vmas is not NULL these two methods cannot be used.
Signed-off-by: Andrea Arcangeli <[email protected]>
Reviewed-by: Andres Lagar-Cavilla <[email protected]>
Reviewed-by: Peter Feiner <[email protected]>
---
include/linux/mm.h | 7 +++
mm/gup.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++----
mm/nommu.c | 23 +++++++
3 files changed, 196 insertions(+), 11 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27eb1bf..99831d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1218,6 +1218,13 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked);
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
struct kvec;
diff --git a/mm/gup.c b/mm/gup.c
index cd62c8c..a8521f1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -584,6 +584,165 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ int write, int force,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ int *locked, bool notify_drop)
+{
+ int flags = FOLL_TOUCH;
+ long ret, pages_done;
+ bool lock_dropped;
+
+ if (locked) {
+ /* if VM_FAULT_RETRY can be returned, vmas become invalid */
+ BUG_ON(vmas);
+ /* check caller initialized locked */
+ BUG_ON(*locked != 1);
+ }
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ pages_done = 0;
+ lock_dropped = false;
+ for (;;) {
+ ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ vmas, locked);
+ if (!locked)
+ /* VM_FAULT_RETRY couldn't trigger, bypass */
+ return ret;
+
+ /* VM_FAULT_RETRY cannot return errors */
+ if (!*locked) {
+ BUG_ON(ret < 0);
+ BUG_ON(ret >= nr_pages);
+ }
+
+ if (!pages)
+ /* If it's a prefault don't insist harder */
+ return ret;
+
+ if (ret > 0) {
+ nr_pages -= ret;
+ pages_done += ret;
+ if (!nr_pages)
+ break;
+ }
+ if (*locked) {
+ /* VM_FAULT_RETRY didn't trigger */
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+ pages += ret;
+ start += ret << PAGE_SHIFT;
+
+ /*
+ * Repeat on the address that fired VM_FAULT_RETRY
+ * without FAULT_FLAG_ALLOW_RETRY but with
+ * FAULT_FLAG_TRIED.
+ */
+ *locked = 1;
+ lock_dropped = true;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ pages, NULL, NULL);
+ if (ret != 1) {
+ BUG_ON(ret > 1);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ nr_pages--;
+ pages_done++;
+ if (!nr_pages)
+ break;
+ pages++;
+ start += PAGE_SIZE;
+ }
+ if (notify_drop && lock_dropped && *locked) {
+ /*
+ * We must let the caller know we temporarily dropped the lock
+ * and so the critical section protected by it was lost.
+ */
+ up_read(&mm->mmap_sem);
+ *locked = 0;
+ }
+ return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * to:
+ *
+ * int locked = 1;
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * if (locked)
+ * up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, locked, true);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * with:
+ *
+ * get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead, if the two parameters
+ * "tsk" and "mm" are respectively equal to current and current->mm,
+ * or if "force" shall be set to 1 (get_user_pages_fast misses the
+ * "force" parameter).
+ */
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ long ret;
+ int locked = 1;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, &locked, false);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/*
* get_user_pages() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
@@ -633,22 +792,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, int write,
int force, struct page **pages, struct vm_area_struct **vmas)
{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, vmas, NULL, false);
}
EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/nommu.c b/mm/nommu.c
index bd1808e..2a63d1f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -213,6 +213,29 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ long ret;
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
On Wed Oct 29 2014 at 9:35:34 AM Andrea Arcangeli <[email protected]> wrote:
>
> Use the more generic get_user_pages_unlocked which has the additional
> benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault
> (which allows the first page fault in an unmapped area to be always
> able to block indefinitely by being allowed to release the mmap_sem).
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> ---
> include/linux/kvm_host.h | 11 -----------
> virt/kvm/async_pf.c | 2 +-
> virt/kvm/kvm_main.c | 50 ++++--------------------------------------------
> 3 files changed, 5 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea53b04..82c67da 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -199,17 +199,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
> int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> #endif
>
> -/*
> - * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
> - * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
> - * controls whether we retry the gup one more time to completion in that case.
> - * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
> - * handler.
> - */
> -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> - unsigned long addr, bool write_fault,
> - struct page **pagep);
> -
> enum {
> OUTSIDE_GUEST_MODE,
> IN_GUEST_MODE,
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index 5ff7f7f..44660ae 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work)
>
> might_sleep();
>
> - kvm_get_user_page_io(NULL, mm, addr, 1, NULL);
> + get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL);
> kvm_async_page_present_sync(vcpu, apf);
>
> spin_lock(&vcpu->async_pf.lock);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 25ffac9..78236ad 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1134,43 +1134,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
> return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
> }
>
> -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
> - unsigned long addr, bool write_fault,
> - struct page **pagep)
> -{
> - int npages;
> - int locked = 1;
> - int flags = FOLL_TOUCH | FOLL_HWPOISON |
> - (pagep ? FOLL_GET : 0) |
> - (write_fault ? FOLL_WRITE : 0);
> -
> - /*
> - * If retrying the fault, we get here *not* having allowed the filemap
> - * to wait on the page lock. We should now allow waiting on the IO with
> - * the mmap semaphore released.
> - */
> - down_read(&mm->mmap_sem);
> - npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
> - &locked);
> - if (!locked) {
> - VM_BUG_ON(npages);
> -
> - if (!pagep)
> - return 0;
> -
> - /*
> - * The previous call has now waited on the IO. Now we can
> - * retry and complete. Pass TRIED to ensure we do not re
> - * schedule async IO (see e.g. filemap_fault).
> - */
> - down_read(&mm->mmap_sem);
> - npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
> - pagep, NULL, NULL);
> - }
> - up_read(&mm->mmap_sem);
> - return npages;
> -}
> -
> static inline int check_user_page_hwpoison(unsigned long addr)
> {
> int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
> @@ -1233,15 +1196,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> npages = get_user_page_nowait(current, current->mm,
> addr, write_fault, page);
> up_read(&current->mm->mmap_sem);
> - } else {
> - /*
> - * By now we have tried gup_fast, and possibly async_pf, and we
> - * are certainly not atomic. Time to retry the gup, allowing
> - * mmap semaphore to be relinquished in the case of IO.
> - */
> - npages = kvm_get_user_page_io(current, current->mm, addr,
> - write_fault, page);
> - }
> + } else
Braces here, per coding style.
Other than that:
Reviewed-by: Andres Lagar-Cavilla <[email protected]>
>
> + npages = __get_user_pages_unlocked(current, current->mm, addr, 1,
> + write_fault, 0, page,
> + FOLL_TOUCH|FOLL_HWPOISON);
> if (npages != 1)
> return npages;
>
On Wed, Oct 29, 2014 at 05:35:16PM +0100, Andrea Arcangeli wrote:
> We can leverage the VM_FAULT_RETRY functionality in the page fault
> paths better by using either get_user_pages_locked or
> get_user_pages_unlocked.
>
> The former allows conversion of get_user_pages invocations that will
> have to pass a "&locked" parameter to know if the mmap_sem was dropped
> during the call. Example from:
>
> down_read(&mm->mmap_sem);
> do_something()
> get_user_pages(tsk, mm, ..., pages, NULL);
> up_read(&mm->mmap_sem);
>
> to:
>
> int locked = 1;
> down_read(&mm->mmap_sem);
> do_something()
> get_user_pages_locked(tsk, mm, ..., pages, &locked);
> if (locked)
> up_read(&mm->mmap_sem);
>
> The latter is suitable only as a drop in replacement of the form:
>
> down_read(&mm->mmap_sem);
> get_user_pages(tsk, mm, ..., pages, NULL);
> up_read(&mm->mmap_sem);
>
> into:
>
> get_user_pages_unlocked(tsk, mm, ..., pages);
>
> Where tsk, mm, the intermediate "..." parameters and "pages" can be any
> value as before. Just the last parameter of get_user_pages (vmas) must
> be NULL for get_user_pages_locked|unlocked to be usable (the latter
> original form wouldn't have been safe anyway if vmas wasn't NULL; for
> the former we just make it explicit by dropping the parameter).
>
> If vmas is not NULL these two methods cannot be used.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Reviewed-by: Andres Lagar-Cavilla <[email protected]>
> Reviewed-by: Peter Feiner <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Wed, Oct 29, 2014 at 05:35:17PM +0100, Andrea Arcangeli wrote:
> Some callers (like KVM) may want to set gup_flags like
> FOLL_HWPOISON to get a proper -EHWPOISON retval instead of -EFAULT, to
> take a more appropriate action if get_user_pages runs into a memory
> failure.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> ---
> include/linux/mm.h | 4 ++++
> mm/gup.c | 44 ++++++++++++++++++++++++++++++++------------
> mm/nommu.c | 16 +++++++++++++---
> 3 files changed, 49 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 99831d9..9a5ada3 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1222,6 +1222,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
> unsigned long start, unsigned long nr_pages,
> int write, int force, struct page **pages,
> int *locked);
> +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
> + unsigned long start, unsigned long nr_pages,
> + int write, int force, struct page **pages,
> + unsigned int gup_flags);
> long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
> unsigned long start, unsigned long nr_pages,
> int write, int force, struct page **pages);
> diff --git a/mm/gup.c b/mm/gup.c
> index a8521f1..01534ff 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -591,9 +591,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
> int write, int force,
> struct page **pages,
> struct vm_area_struct **vmas,
> - int *locked, bool notify_drop)
> + int *locked, bool notify_drop,
> + unsigned int flags)
Argument list getting too long. Should we consider packing them into a
struct?
--
Kirill A. Shutemov
On Wed, Oct 29, 2014 at 05:35:18PM +0100, Andrea Arcangeli wrote:
> This allows the get_user_pages_fast slow path to release the mmap_sem
> before blocking.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Wed, Oct 29, 2014 at 05:35:19PM +0100, Andrea Arcangeli wrote:
> This allows those get_user_pages calls to pass FAULT_FLAG_ALLOW_RETRY
> to the page fault in order to release the mmap_sem during the I/O.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> ---
> drivers/iommu/amd_iommu_v2.c | 6 ++----
> drivers/media/pci/ivtv/ivtv-udma.c | 6 ++----
> drivers/scsi/st.c | 7 ++-----
> drivers/video/fbdev/pvr2fb.c | 6 ++----
> mm/process_vm_access.c | 7 ++-----
> net/ceph/pagevec.c | 6 ++----
> 6 files changed, 12 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> index 90d734b..4cd8a87 100644
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -521,10 +521,8 @@ static void do_fault(struct work_struct *work)
>
> write = !!(fault->flags & PPR_FAULT_WRITE);
>
> - down_read(&fault->state->mm->mmap_sem);
> - npages = get_user_pages(NULL, fault->state->mm,
> - fault->address, 1, write, 0, &page, NULL);
> - up_read(&fault->state->mm->mmap_sem);
> + npages = get_user_pages_unlocked(NULL, fault->state->mm,
> + fault->address, 1, write, 0, &page);
>
> if (npages == 1) {
> put_page(page);
> diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c
> index 7338cb2..96d866b 100644
> --- a/drivers/media/pci/ivtv/ivtv-udma.c
> +++ b/drivers/media/pci/ivtv/ivtv-udma.c
> @@ -124,10 +124,8 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr,
> }
>
> /* Get user pages for DMA Xfer */
> - down_read(&current->mm->mmap_sem);
> - err = get_user_pages(current, current->mm,
> - user_dma.uaddr, user_dma.page_count, 0, 1, dma->map, NULL);
> - up_read(&current->mm->mmap_sem);
> + err = get_user_pages_unlocked(current, current->mm,
> + user_dma.uaddr, user_dma.page_count, 0, 1, dma->map);
>
> if (user_dma.page_count != err) {
> IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
> diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
> index 4daa372..a98e00b 100644
> --- a/drivers/scsi/st.c
> +++ b/drivers/scsi/st.c
> @@ -4538,18 +4538,15 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
> return -ENOMEM;
>
> /* Try to fault in all of the necessary pages */
> - down_read(&current->mm->mmap_sem);
> /* rw==READ means read from drive, write into memory area */
Consolidate the two one-line comments into one?
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Thu, Oct 30, 2014 at 02:17:37PM +0200, Kirill A. Shutemov wrote:
> On Wed, Oct 29, 2014 at 05:35:17PM +0100, Andrea Arcangeli wrote:
> > diff --git a/mm/gup.c b/mm/gup.c
> > index a8521f1..01534ff 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -591,9 +591,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
> > int write, int force,
> > struct page **pages,
> > struct vm_area_struct **vmas,
> > - int *locked, bool notify_drop)
> > + int *locked, bool notify_drop,
> > + unsigned int flags)
>
> Argument list getting too long. Should we consider packing them into a
> struct?
It's __always_inline, so it's certainly not a runtime concern. The
whole point of using __always_inline is to optimize away certain
branches at build time.
If this is about cleaning it up and not for changing the runtime (which I
think couldn't get any better because of the __always_inline), we
should at least make certain gcc can still see through the structure
offsets to delete the same code blocks at build time if possible,
before doing the change.
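(A minimal generic illustration of the build-time folding, not from the
patch: because the helper is __always_inline, the constant arguments at
each call site let gcc discard the dead branches per wrapper.)

	static __always_inline long helper(int *locked, bool notify_drop)
	{
		if (!locked)
			return 0;	/* folded away when a non-NULL pointer is passed */
		if (notify_drop)
			*locked = 0;	/* folded away when notify_drop is false */
		return 1;
	}

	long wrapper_with_lock(void)
	{
		int locked = 1;

		return helper(&locked, true);	/* only the notify path survives */
	}

	long wrapper_prefault(void)
	{
		return helper(NULL, false);	/* collapses to "return 0" */
	}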
On Thu, Oct 30, 2014 at 06:43:09PM +0100, Andrea Arcangeli wrote:
> On Thu, Oct 30, 2014 at 02:17:37PM +0200, Kirill A. Shutemov wrote:
> > On Wed, Oct 29, 2014 at 05:35:17PM +0100, Andrea Arcangeli wrote:
> > > diff --git a/mm/gup.c b/mm/gup.c
> > > index a8521f1..01534ff 100644
> > > --- a/mm/gup.c
> > > +++ b/mm/gup.c
> > > @@ -591,9 +591,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
> > > int write, int force,
> > > struct page **pages,
> > > struct vm_area_struct **vmas,
> > > - int *locked, bool notify_drop)
> > > + int *locked, bool notify_drop,
> > > + unsigned int flags)
> >
> > Argument list getting too long. Should we consider packing them into a
> > struct?
>
> It's __always_inline, so it's certainly not a runtime concern. The
> whole point of using __always_inline is to optimize away certain
> branches at build time.
It's also exported:
+EXPORT_SYMBOL(__get_user_pages_unlocked);
Note that __always_inline is only valid within the same translation unit
(unless you get LTO working).
On Wed, Oct 29, 2014 at 05:35:19PM +0100, Andrea Arcangeli wrote:
> This allows those get_user_pages calls to pass FAULT_FLAG_ALLOW_RETRY
> to the page fault in order to release the mmap_sem during the I/O.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
Reviewed-by: Peter Feiner <[email protected]>
> diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
> index 5550130..096d914 100644
> --- a/net/ceph/pagevec.c
> +++ b/net/ceph/pagevec.c
> @@ -23,17 +23,15 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
> if (!pages)
> return ERR_PTR(-ENOMEM);
>
> - down_read(&current->mm->mmap_sem);
> while (got < num_pages) {
> - rc = get_user_pages(current, current->mm,
> + rc = get_user_pages_unlocked(current, current->mm,
> (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
> - num_pages - got, write_page, 0, pages + got, NULL);
> + num_pages - got, write_page, 0, pages + got);
> if (rc < 0)
> break;
> BUG_ON(rc == 0);
> got += rc;
> }
> - up_read(&current->mm->mmap_sem);
> if (rc < 0)
> goto fail;
> return pages;
I spent a while looking at this to make sure that BUG_ON(rc == 0) won't
trigger. AFAICT, __get_user_pages_locked can't return 0 since __get_user_pages
only returns 0 when nonblocking is not NULL; when __get_user_pages_locked
calls __get_user_pages with nonblocking != NULL (i.e., the first call in the
body of the for(;;) loop) and __get_user_pages returns 0, then
__get_user_pages_locked will call __get_user_pages again with nonblocking ==
NULL.
On Wed, Oct 29, 2014 at 05:35:18PM +0100, Andrea Arcangeli wrote:
> This allows the get_user_pages_fast slow path to release the mmap_sem
> before blocking.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
Reviewed-by: Peter Feiner <[email protected]>
On Wed, Oct 29, 2014 at 05:35:17PM +0100, Andrea Arcangeli wrote:
> Some callers (like KVM) may want to set gup_flags like
> FOLL_HWPOISON to get a proper -EHWPOISON retval instead of -EFAULT, to
> take a more appropriate action if get_user_pages runs into a memory
> failure.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
Reviewed-by: Peter Feiner <[email protected]>
On Wed, Oct 29, 2014 at 05:35:20PM +0100, Andrea Arcangeli wrote:
> Use the more generic get_user_pages_unlocked which has the additional
> benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault
> (which allows the first page fault in an unmapped area to be always
> able to block indefinitely by being allowed to release the mmap_sem).
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
Reviewed-by: Peter Feiner <[email protected]>