Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.
The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.
That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
for mmu notifiers that can set a bit to indicate that these callbacks do
block.
The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
Signed-off-by: David Rientjes <[email protected]>
---
arch/powerpc/platforms/powernv/npu-dma.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 +
drivers/gpu/drm/i915/i915_gem_userptr.c | 1 +
drivers/gpu/drm/radeon/radeon_mn.c | 5 +++--
drivers/infiniband/core/umem_odp.c | 1 +
drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
drivers/iommu/amd_iommu_v2.c | 1 +
drivers/iommu/intel-svm.c | 1 +
drivers/misc/mic/scif/scif_dma.c | 1 +
drivers/misc/sgi-gru/grutlbpurge.c | 1 +
drivers/xen/gntdev.c | 1 +
include/linux/mmu_notifier.h | 13 +++++++++++++
mm/hmm.c | 1 +
mm/mmu_notifier.c | 25 +++++++++++++++++++++++++
virt/kvm/kvm_main.c | 1 +
16 files changed, 54 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -710,6 +710,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
mm->context.npu_context = npu_context;
npu_context->mm = mm;
+ npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
npu_context->mn.ops = &nv_nmmu_notifier_ops;
__mmu_notifier_register(&npu_context->mn, mm);
kref_init(&npu_context->kref);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -276,6 +276,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
rmn->adev = adev;
rmn->mm = mm;
+ rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
rmn->mn.ops = &amdgpu_mn_ops;
init_rwsem(&rmn->lock);
rmn->objects = RB_ROOT_CACHED;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -282,6 +282,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
process->mm = thread->mm;
/* register notifier */
+ process->mmu_notifier.flags = 0;
process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
if (err)
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -170,6 +170,7 @@ i915_mmu_notifier_create(struct mm_struct *mm)
return ERR_PTR(-ENOMEM);
spin_lock_init(&mn->lock);
+ mn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
mn->mn.ops = &i915_gem_userptr_notifier;
mn->objects = RB_ROOT_CACHED;
mn->wq = alloc_workqueue("i915-userptr-release",
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -164,7 +164,7 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
radeon_bo_unreserve(bo);
}
}
-
+
mutex_unlock(&rmn->lock);
}
@@ -203,10 +203,11 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
rmn->rdev = rdev;
rmn->mm = mm;
+ rmn->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
rmn->mn.ops = &radeon_mn_ops;
mutex_init(&rmn->lock);
rmn->objects = RB_ROOT_CACHED;
-
+
r = __mmu_notifier_register(&rmn->mn, mm);
if (r)
goto free_rmn;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -411,6 +411,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
*/
atomic_set(&context->notifier_count, 0);
INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
context->mn.ops = &ib_umem_notifiers;
/*
* Lock-dep detects a false positive for mmap_sem vs.
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -110,6 +110,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
handlr->ops_arg = ops_arg;
INIT_HLIST_NODE(&handlr->mn.hlist);
spin_lock_init(&handlr->lock);
+ handlr->mn.flags = 0;
handlr->mn.ops = &mn_opts;
handlr->mm = mm;
INIT_WORK(&handlr->del_work, handle_remove);
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -671,6 +671,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
pasid_state->pasid = pasid;
pasid_state->invalid = true; /* Mark as valid only if we are
done with setting up the pasid */
+ pasid_state->mn.flags = 0;
pasid_state->mn.ops = &iommu_mn;
if (pasid_state->mm == NULL)
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -382,6 +382,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
goto out;
}
svm->pasid = ret;
+ svm->notifier.flags = 0;
svm->notifier.ops = &intel_mmuops;
svm->mm = mm;
svm->flags = flags;
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -249,6 +249,7 @@ static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn,
{
mmn->ep = ep;
mmn->mm = mm;
+ mmn->ep_mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops;
INIT_LIST_HEAD(&mmn->list);
INIT_LIST_HEAD(&mmn->tc_reg_list);
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
return ERR_PTR(-ENOMEM);
STAT(gms_alloc);
spin_lock_init(&gms->ms_asid_lock);
+ gms->ms_notifier.flags = 0;
gms->ms_notifier.ops = &gru_mmuops;
atomic_set(&gms->ms_refcnt, 1);
init_waitqueue_head(&gms->ms_wait_queue);
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -539,6 +539,7 @@ static int gntdev_open(struct inode *inode, struct file *flip)
kfree(priv);
return -ENOMEM;
}
+ priv->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
priv->mn.ops = &gntdev_mmu_ops;
ret = mmu_notifier_register(&priv->mn, priv->mm);
mmput(priv->mm);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
struct mmu_notifier;
struct mmu_notifier_ops;
+/* This mmu notifier's invalidate_{start,end}() callbacks may block */
+#define MMU_INVALIDATE_MAY_BLOCK (0x01)
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -137,6 +140,9 @@ struct mmu_notifier_ops {
* page. Pages will no longer be referenced by the linux
* address space but may still be referenced by sptes until
* the last refcount is dropped.
+ *
+ * If either of these callbacks can block, the mmu_notifier.flags
+ * must have MMU_INVALIDATE_MAY_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -182,6 +188,7 @@ struct mmu_notifier_ops {
* 3. No other concurrent thread can access the list (release)
*/
struct mmu_notifier {
+ int flags;
struct hlist_node hlist;
const struct mmu_notifier_ops *ops;
};
@@ -218,6 +225,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -457,6 +465,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ return 0;
+}
+
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/mm/hmm.c b/mm/hmm.c
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -104,6 +104,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
* We should only get here if hold the mmap_sem in write mode ie on
* registration of first mirror through hmm_mirror_register()
*/
+ hmm->mmu_notifier.flags = MMU_INVALIDATE_MAY_BLOCK;
hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
kfree(hmm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,31 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+ int ret = 0;
+
+ WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+ if (!mm_has_notifiers(mm))
+ return ret;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+ if (mn->flags & MMU_INVALIDATE_MAY_BLOCK) {
+ ret = 1;
+ break;
+ }
+ srcu_read_unlock(&srcu, id);
+ return ret;
+}
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -487,6 +487,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
+ kvm->mmu_notifier.flags = 0;
kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}
This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks and, if so, skips oom reaping.
Otherwise, the notifier callbacks are used around unmap_page_range().
Signed-off-by: David Rientjes <[email protected]>
---
mm/oom_kill.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
On 11/12/2017 23:11, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> for mmu notifiers that can set a bit to indicate that these callbacks do
> block.
Why not put the flag in the ops, since the same ops should always be
either blockable or unblockable?
Paolo
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <[email protected]>
On Mon, 11 Dec 2017, Paolo Bonzini wrote:
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> >
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> >
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> > for mmu notifiers that can set a bit to indicate that these callbacks do
> > block.
>
> Why not put the flag in the ops, since the same ops should always be
> either blockable or unblockable?
>
Hi Paolo,
It certainly can be in mmu_notifier_ops, the only rationale for putting
the flags member in mmu_notifier was that it may become generally useful
later for things other than callbacks. I'm indifferent to where it is
placed and will happily move it if that's desired, absent any other
feedback on other parts of the patchset.
Thanks.
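For illustration, a minimal sketch of the two placements under discussion;
the foo_* names are hypothetical, and the ops-level "flags" member in the
second variant assumes the move Paolo suggests (which the v2 patch below
eventually makes):

#include <linux/mmu_notifier.h>

/* Hypothetical driver callbacks; bodies omitted in this sketch. */
static void foo_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start, unsigned long end);
static void foo_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start, unsigned long end);

static const struct mmu_notifier_ops foo_mn_ops = {
	.invalidate_range_start	= foo_invalidate_range_start,
	.invalidate_range_end	= foo_invalidate_range_end,
};

/* v1 as posted: the flag lives in struct mmu_notifier, so every
 * registration site has to remember to set it. */
struct foo_ctx {
	struct mmu_notifier mn;
};

static int foo_register(struct foo_ctx *ctx, struct mm_struct *mm)
{
	ctx->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
	ctx->mn.ops = &foo_mn_ops;
	return __mmu_notifier_register(&ctx->mn, mm); /* mmap_sem held for write */
}

/* Paolo's suggestion: the flag becomes a property of the (shared, const)
 * ops, declared once next to the callbacks it describes. */
static const struct mmu_notifier_ops foo_mn_ops_in_ops = {
	.flags			= MMU_INVALIDATE_MAY_BLOCK,
	.invalidate_range_start	= foo_invalidate_range_start,
	.invalidate_range_end	= foo_invalidate_range_end,
};

The second form also keeps the annotation in read-only data alongside the
callbacks it describes, so it cannot be forgotten or mis-set at individual
registration sites.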
On Mon, Dec 11, 2017 at 02:11:55PM -0800, David Rientjes wrote:
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
> return ERR_PTR(-ENOMEM);
> STAT(gms_alloc);
> spin_lock_init(&gms->ms_asid_lock);
> + gms->ms_notifier.flags = 0;
> gms->ms_notifier.ops = &gru_mmuops;
> atomic_set(&gms->ms_refcnt, 1);
> init_waitqueue_head(&gms->ms_wait_queue);
> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
There is a kzalloc() just above this:
gms = kzalloc(sizeof(*gms), GFP_KERNEL);
Is that not sufficient to clear the 'flags' field?
On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
> > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
> > return ERR_PTR(-ENOMEM);
> > STAT(gms_alloc);
> > spin_lock_init(&gms->ms_asid_lock);
> > + gms->ms_notifier.flags = 0;
> > gms->ms_notifier.ops = &gru_mmuops;
> > atomic_set(&gms->ms_refcnt, 1);
> > init_waitqueue_head(&gms->ms_wait_queue);
> > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>
> There is a kzalloc() just above this:
> gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>
> Is that not sufficient to clear the 'flags' field?
>
Absolutely, but whether it is better to explicitly document that the mmu
notifier has cleared flags, i.e. there are no blockable callbacks, is
another story. I can change it if preferred.
Am 12.12.2017 um 22:28 schrieb David Rientjes:
> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>
>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>> return ERR_PTR(-ENOMEM);
>>> STAT(gms_alloc);
>>> spin_lock_init(&gms->ms_asid_lock);
>>> + gms->ms_notifier.flags = 0;
>>> gms->ms_notifier.ops = &gru_mmuops;
>>> atomic_set(&gms->ms_refcnt, 1);
>>> init_waitqueue_head(&gms->ms_wait_queue);
>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>> There is a kzalloc() just above this:
>> gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>
>> Is that not sufficient to clear the 'flags' field?
>>
> Absolutely, but whether it is better to explicitly document that the mmu
> notifier has cleared flags, i.e. there are no blockable callbacks, is
> another story. I can change it if preferred.
Actually I would invert the new flag, in other words specify that an MMU
notifier will never sleep.
The first reason is that we have 8 blocking notifiers and 5 not blocking
if I counted right. So it is actually more common to sleep than not to.
The second reason is to be conservative and assume the worst, e.g. that
the flag is forgotten when a new notifier is added.
Regards,
Christian.
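To make that concrete, with the inverted flag (and assuming it also moves
into the ops as discussed above) the per-notifier test reduces to something
like the following sketch; the helper name is made up, but the check matches
what the v2 patch later in this thread ends up doing:

/* Conservative default: a notifier that does not explicitly claim it
 * never blocks is treated as one that may block, so a forgotten
 * annotation disables reaping instead of risking a blocking callback
 * from the oom reaper path. */
static bool mn_invalidate_may_block(const struct mmu_notifier *mn)
{
	return !(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK);
}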
On 2017/12/13 18:34, Christian König wrote:
> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>
>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>> return ERR_PTR(-ENOMEM);
>>>> STAT(gms_alloc);
>>>> spin_lock_init(&gms->ms_asid_lock);
>>>> + gms->ms_notifier.flags = 0;
>>>> gms->ms_notifier.ops = &gru_mmuops;
>>>> atomic_set(&gms->ms_refcnt, 1);
>>>> init_waitqueue_head(&gms->ms_wait_queue);
>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>> There is a kzalloc() just above this:
>>> gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>
>>> Is that not sufficient to clear the 'flags' field?
>>>
>> Absolutely, but whether it is better to explicitly document that the mmu
>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>> another story. I can change it if preferred.
>
> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
>
> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
>
> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.
I agree. Some out-of-tree module might forget to set the flags.
Although you don't need to fix out-of-tree modules, as a member of the
troubleshooting staff at a support center I want to be able to identify
the careless module.
I guess specifying the flags at the register function would be best, since
an attempt to call the register function without knowing about this change
would then simply result in a build failure.
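A rough sketch of that idea (not what the thread ends up adopting; the
function name and prototype are hypothetical): threading the flags through
the registration call makes the decision impossible to omit, because old
call sites no longer compile.

/* Hypothetical registration API that takes the flags explicitly. */
int mmu_notifier_register_flags(struct mmu_notifier *mn,
				struct mm_struct *mm, int flags);

/* Every caller, in-tree or out-of-tree, would then have to spell out
 * its blocking behaviour at the call site, e.g.: */
static int foo_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
	return mmu_notifier_register_flags(mn, mm, MMU_INVALIDATE_MAY_BLOCK);
}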
On 13/12/2017 11:26, Tetsuo Handa wrote:
> On 2017/12/13 18:34, Christian König wrote:
>> Am 12.12.2017 um 22:28 schrieb David Rientjes:
>>> On Tue, 12 Dec 2017, Dimitri Sivanich wrote:
>>>
>>>>> --- a/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
>>>>> @@ -298,6 +298,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
>>>>> return ERR_PTR(-ENOMEM);
>>>>> STAT(gms_alloc);
>>>>> spin_lock_init(&gms->ms_asid_lock);
>>>>> + gms->ms_notifier.flags = 0;
>>>>> gms->ms_notifier.ops = &gru_mmuops;
>>>>> atomic_set(&gms->ms_refcnt, 1);
>>>>> init_waitqueue_head(&gms->ms_wait_queue);
>>>>> diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
>>>> There is a kzalloc() just above this:
>>>> gms = kzalloc(sizeof(*gms), GFP_KERNEL);
>>>>
>>>> Is that not sufficient to clear the 'flags' field?
>>>>
>>> Absolutely, but whether it is better to explicitly document that the mmu
>>> notifier has cleared flags, i.e. there are no blockable callbacks, is
>>> another story. I can change it if preferred.
>>
>> Actually I would invert the new flag, in other words specify that an MMU notifier will never sleep.
>>
>> The first reason is that we have 8 blocking notifiers and 5 not blocking if I counted right. So it is actually more common to sleep than not to.
>>
>> The second reason is to be conservative and assume the worst, e.g. that the flag is forgotten when a new notifier is added.
>
> I agree. Some out-of-tree module might forget to set the flags.
>
> Although you don't need to fix out-of-tree modules, as a member of the
> troubleshooting staff at a support center I want to be able to identify
> the careless module.
>
> I guess specifying the flags at the register function would be best, since
> an attempt to call the register function without knowing about this change
> would then simply result in a build failure.
Specifying them in the ops would have the same effect and it would be
even better, as you don't have to split the information across two places.
Paolo
On Wed, 13 Dec 2017, Christian König wrote:
> > > > --- a/drivers/misc/sgi-gru/grutlbpurge.c
> > > > +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> > > > @@ -298,6 +298,7 @@ struct gru_mm_struct
> > > > *gru_register_mmu_notifier(void)
> > > > return ERR_PTR(-ENOMEM);
> > > > STAT(gms_alloc);
> > > > spin_lock_init(&gms->ms_asid_lock);
> > > > + gms->ms_notifier.flags = 0;
> > > > gms->ms_notifier.ops = &gru_mmuops;
> > > > atomic_set(&gms->ms_refcnt, 1);
> > > > init_waitqueue_head(&gms->ms_wait_queue);
> > > > diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
> > > There is a kzalloc() just above this:
> > > gms = kzalloc(sizeof(*gms), GFP_KERNEL);
> > >
> > > Is that not sufficient to clear the 'flags' field?
> > >
> > Absolutely, but whether it is better to explicitly document that the mmu
> > notifier has cleared flags, i.e. there are no blockable callbacks, is
> > another story. I can change it if preferred.
>
> Actually I would invert the new flag, in other words specify that an MMU
> notifier will never sleep.
>
Very good idea, I'll do that. I'll also move the flags member to ops as
Paolo suggested.
Thanks both!
Hi David,
I love your patch! Yet something to improve:
[auto build test ERROR on drm/drm-next]
[also build test ERROR on v4.15-rc3 next-20171214]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/David-Rientjes/mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks/20171214-173044
base: git://people.freedesktop.org/~airlied/linux.git drm-next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc
All errors (new ones prefixed by >>):
arch/powerpc/platforms/powernv/npu-dma.c: In function 'pnv_npu2_init_context':
>> arch/powerpc/platforms/powernv/npu-dma.c:713:3: error: 'npu_content' undeclared (first use in this function); did you mean 'npu_context'?
npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
^~~~~~~~~~~
npu_context
arch/powerpc/platforms/powernv/npu-dma.c:713:3: note: each undeclared identifier is reported only once for each function it appears in
vim +713 arch/powerpc/platforms/powernv/npu-dma.c
639
640 /*
641 * Call into OPAL to setup the nmmu context for the current task in
642 * the NPU. This must be called to setup the context tables before the
643 * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
644 *
645 * A release callback should be registered to allow a device driver to
646 * be notified that it should not launch any new translation requests
647 * as the final TLB invalidate is about to occur.
648 *
649 * Returns an error if there no contexts are currently available or a
650 * npu_context which should be passed to pnv_npu2_handle_fault().
651 *
652 * mmap_sem must be held in write mode.
653 */
654 struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
655 unsigned long flags,
656 struct npu_context *(*cb)(struct npu_context *, void *),
657 void *priv)
658 {
659 int rc;
660 u32 nvlink_index;
661 struct device_node *nvlink_dn;
662 struct mm_struct *mm = current->mm;
663 struct pnv_phb *nphb;
664 struct npu *npu;
665 struct npu_context *npu_context;
666
667 /*
668 * At present we don't support GPUs connected to multiple NPUs and I'm
669 * not sure the hardware does either.
670 */
671 struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
672
673 if (!firmware_has_feature(FW_FEATURE_OPAL))
674 return ERR_PTR(-ENODEV);
675
676 if (!npdev)
677 /* No nvlink associated with this GPU device */
678 return ERR_PTR(-ENODEV);
679
680 if (!mm || mm->context.id == 0) {
681 /*
682 * Kernel thread contexts are not supported and context id 0 is
683 * reserved on the GPU.
684 */
685 return ERR_PTR(-EINVAL);
686 }
687
688 nphb = pci_bus_to_host(npdev->bus)->private_data;
689 npu = &nphb->npu;
690
691 /*
692 * Setup the NPU context table for a particular GPU. These need to be
693 * per-GPU as we need the tables to filter ATSDs when there are no
694 * active contexts on a particular GPU.
695 */
696 rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
697 PCI_DEVID(gpdev->bus->number, gpdev->devfn));
698 if (rc < 0)
699 return ERR_PTR(-ENOSPC);
700
701 /*
702 * We store the npu pci device so we can more easily get at the
703 * associated npus.
704 */
705 npu_context = mm->context.npu_context;
706 if (!npu_context) {
707 npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
708 if (!npu_context)
709 return ERR_PTR(-ENOMEM);
710
711 mm->context.npu_context = npu_context;
712 npu_context->mm = mm;
> 713 npu_content->mn.flags = MMU_INVALIDATE_MAY_BLOCK;
714 npu_context->mn.ops = &nv_nmmu_notifier_ops;
715 __mmu_notifier_register(&npu_context->mn, mm);
716 kref_init(&npu_context->kref);
717 } else {
718 kref_get(&npu_context->kref);
719 }
720
721 npu_context->release_cb = cb;
722 npu_context->priv = priv;
723 nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
724 if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
725 &nvlink_index)))
726 return ERR_PTR(-ENODEV);
727 npu_context->npdev[npu->index][nvlink_index] = npdev;
728
729 if (!nphb->npu.nmmu_flush) {
730 /*
731 * If we're not explicitly flushing ourselves we need to mark
732 * the thread for global flushes
733 */
734 npu_context->nmmu_flush = false;
735 mm_context_add_copro(mm);
736 } else
737 npu_context->nmmu_flush = true;
738
739 return npu_context;
740 }
741 EXPORT_SYMBOL(pnv_npu2_init_context);
742
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
prevented the oom reaper from unmapping private anonymous memory with the
oom reaper when the oom victim mm had mmu notifiers registered.
The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
around the unmap_page_range(), which is needed, can block and the oom
killer will stall forever waiting for the victim to exit, which may not
be possible without reaping.
That concern is real, but only true for mmu notifiers that have blockable
invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
to mmu notifier ops that can set a bit to indicate that these callbacks do
not block.
The implementation is steered toward an expensive slowpath, such as after
the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
Signed-off-by: David Rientjes <[email protected]>
---
v2:
- specifically exclude mmu_notifiers without invalidate callbacks
- move flags to mmu_notifier_ops per Paolo
- reverse flag from blockable -> not blockable per Christian
drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
drivers/iommu/amd_iommu_v2.c | 1 +
drivers/iommu/intel-svm.c | 1 +
drivers/misc/sgi-gru/grutlbpurge.c | 1 +
include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 1 +
7 files changed, 57 insertions(+)
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = mmu_notifier_range_start,
};
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops iommu_mn = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = mn_release,
.clear_flush_young = mn_clear_flush_young,
.invalidate_range = mn_invalidate_range,
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static const struct mmu_notifier_ops intel_mmuops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.release = intel_mm_release,
.change_pte = intel_change_pte,
.invalidate_range = intel_invalidate_range,
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
static const struct mmu_notifier_ops gru_mmuops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end = gru_invalidate_range_end,
.release = gru_release,
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,9 @@
struct mmu_notifier;
struct mmu_notifier_ops;
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -26,6 +29,15 @@ struct mmu_notifier_mm {
};
struct mmu_notifier_ops {
+ /*
+ * Flags to specify behavior of callbacks for this MMU notifier.
+ * Used to determine which context an operation may be called.
+ *
+ * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
+ * block
+ */
+ int flags;
+
/*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
@@ -137,6 +149,9 @@ struct mmu_notifier_ops {
* page. Pages will no longer be referenced by the linux
* address space but may still be referenced by sptes until
* the last refcount is dropped.
+ *
+ * If both of these callbacks cannot block, mmu_notifier_ops.flags
+ * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
{
}
+static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ return 0;
+}
+
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+ int ret = 0;
+
+ WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+
+ if (!mm_has_notifiers(mm))
+ return ret;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (!mn->ops->invalidate_range &&
+ !mn->ops->invalidate_range_start &&
+ !mn->ops->invalidate_range_end)
+ continue;
+
+ if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+ ret = 1;
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+ return ret;
+}
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
This uses the new annotation to determine if an mm has mmu notifiers with
blockable invalidate range callbacks and, if so, skips oom reaping.
Otherwise, the notifier callbacks are used around unmap_page_range().
Signed-off-by: David Rientjes <[email protected]>
---
mm/oom_kill.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * If the mm has notifiers then we would need to invalidate them around
- * unmap_page_range and that is risky because notifiers can sleep and
- * what they do is basically undeterministic. So let's have a short
+ * If the mm has invalidate_{start,end}() notifiers that could block,
* sleep to give the oom victim some more time.
* TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time and add
- * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+ * notifiers cannot block for unbounded amount of time
*/
- if (mm_has_notifiers(mm)) {
+ if (mm_has_blockable_invalidate_notifiers(mm)) {
up_read(&mm->mmap_sem);
schedule_timeout_idle(HZ);
goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
- unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
- NULL);
- tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ const unsigned long start = vma->vm_start;
+ const unsigned long end = vma->vm_end;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ unmap_page_range(&tlb, vma, start, end, NULL);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
}
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
On 14/12/2017 22:30, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
>
Acked-by: Paolo Bonzini <[email protected]>
Am 14.12.2017 um 22:30 schrieb David Rientjes:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <[email protected]>
Acked-by: Christian König <[email protected]>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <[email protected]>
Acked-by: Dimitri Sivanich <[email protected]>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
> static void handle_remove(struct work_struct *work);
>
> static const struct mmu_notifier_ops mn_opts = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = mmu_notifier_range_start,
> };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops iommu_mn = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = mn_release,
> .clear_flush_young = mn_clear_flush_young,
> .invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> }
>
> static const struct mmu_notifier_ops intel_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .release = intel_mm_release,
> .change_pte = intel_change_pte,
> .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
> static const struct mmu_notifier_ops gru_mmuops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = gru_invalidate_range_start,
> .invalidate_range_end = gru_invalidate_range_end,
> .release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
> +
> /*
> * Called either by mmu_notifier_unregister or when the mm is
> * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
> * page. Pages will no longer be referenced by the linux
> * address space but may still be referenced by sptes until
> * the last refcount is dropped.
> + *
> + * If both of these callbacks cannot block, mmu_notifier_ops.flags
> + * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
> */
> void (*invalidate_range_start)(struct mmu_notifier *mn,
> struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> bool only_end);
> extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
> static inline void mmu_notifier_release(struct mm_struct *mm)
> {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
> {
> }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + return 0;
> +}
> +
> static inline void mmu_notifier_mm_init(struct mm_struct *mm)
> {
> }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + int id;
> + int ret = 0;
> +
> + WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> + if (!mm_has_notifiers(mm))
> + return ret;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (!mn->ops->invalidate_range &&
> + !mn->ops->invalidate_range_start &&
> + !mn->ops->invalidate_range_end)
> + continue;
> +
> + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> + ret = 1;
> + break;
> + }
> + }
> + srcu_read_unlock(&srcu, id);
> + return ret;
> +}
> +
> static int do_mmu_notifier_register(struct mmu_notifier *mn,
> struct mm_struct *mm,
> int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
On Thu 14-12-17 13:30:56, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> Signed-off-by: David Rientjes <[email protected]>
Yes, this makes sense. I haven't checked all the existing mmu notifiers,
but those that you have marked seem to be OK.
I just think that the semantics of the flag should be described in more
detail. See below.
Acked-by: Michal Hocko <[email protected]>
> ---
> v2:
> - specifically exclude mmu_notifiers without invalidate callbacks
> - move flags to mmu_notifier_ops per Paolo
> - reverse flag from blockable -> not blockable per Christian
>
> drivers/infiniband/hw/hfi1/mmu_rb.c | 1 +
> drivers/iommu/amd_iommu_v2.c | 1 +
> drivers/iommu/intel-svm.c | 1 +
> drivers/misc/sgi-gru/grutlbpurge.c | 1 +
> include/linux/mmu_notifier.h | 21 +++++++++++++++++++++
> mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 1 +
> 7 files changed, 57 insertions(+)
>
[...]
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
> struct mmu_notifier;
> struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01)
> +
> #ifdef CONFIG_MMU_NOTIFIER
>
> /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
> };
>
> struct mmu_notifier_ops {
> + /*
> + * Flags to specify behavior of callbacks for this MMU notifier.
> + * Used to determine which context an operation may be called.
> + *
> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> + * block
> + */
> + int flags;
This should be more specific IMHO. What do you think about the following
wording?
invalidate_{start,end,range} doesn't block on any locks which depend
directly or indirectly (via lock chain or resources e.g. worker context)
on a memory allocation.
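For concreteness, with that wording the flags comment would read roughly as
follows (just a sketch of the suggestion, not the committed text):

	/*
	 * Flags to specify behavior of callbacks for this MMU notifier.
	 * Used to determine which context an operation may be called.
	 *
	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end,range} do not
	 *	block on any locks which depend directly or indirectly (via
	 *	lock chain or resources, e.g. worker context) on a memory
	 *	allocation.
	 */
	int flags;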
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> }
>
> static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
> .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
> .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
> .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
--
Michal Hocko
SUSE Labs
On Thu 14-12-17 13:31:00, David Rientjes wrote:
> This uses the new annotation to determine if an mm has mmu notifiers with
> blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> callbacks are used around unmap_page_range().
Do you have any example where this helped? KVM guest oom killed I guess?
> Signed-off-by: David Rientjes <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/oom_kill.c | 21 +++++++++++----------
> 1 file changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> }
>
> /*
> - * If the mm has notifiers then we would need to invalidate them around
> - * unmap_page_range and that is risky because notifiers can sleep and
> - * what they do is basically undeterministic. So let's have a short
> + * If the mm has invalidate_{start,end}() notifiers that could block,
> * sleep to give the oom victim some more time.
> * TODO: we really want to get rid of this ugly hack and make sure that
> - * notifiers cannot block for unbounded amount of time and add
> - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
> + * notifiers cannot block for unbounded amount of time
> */
> - if (mm_has_notifiers(mm)) {
> + if (mm_has_blockable_invalidate_notifiers(mm)) {
> up_read(&mm->mmap_sem);
> schedule_timeout_idle(HZ);
> goto unlock_oom;
> @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
> * count elevated without a good reason.
> */
> if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
> - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
> - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
> - NULL);
> - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
> + const unsigned long start = vma->vm_start;
> + const unsigned long end = vma->vm_end;
> +
> + tlb_gather_mmu(&tlb, mm, start, end);
> + mmu_notifier_invalidate_range_start(mm, start, end);
> + unmap_page_range(&tlb, vma, start, end, NULL);
> + mmu_notifier_invalidate_range_end(mm, start, end);
> + tlb_finish_mmu(&tlb, start, end);
> }
> }
> pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
--
Michal Hocko
SUSE Labs
On Fri, 15 Dec 2017, Michal Hocko wrote:
> > This uses the new annotation to determine if an mm has mmu notifiers with
> > blockable invalidate range callbacks to avoid oom reaping. Otherwise, the
> > callbacks are used around unmap_page_range().
>
> Do you have any example where this helped? KVM guest oom killed I guess?
>
KVM is the most significant one that we are interested in, but we haven't
had sufficient time to quantify how much of an impact this has, other than
to say it will certainly be non-zero.
The motivation was to exclude only those mmu notifiers that have a reason
to be excluded, rather than a blanket exemption, and thereby make the oom
reaper more robust.
On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <[email protected]> wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory with the
> oom reaper when the oom victim mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
some tweakage, please review.
From: Andrew Morton <[email protected]>
Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Cc: Alex Deucher <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Cc: Christian König <[email protected]>
Cc: David Airlie <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Dimitri Sivanich <[email protected]>
Cc: Doug Ledford <[email protected]>
Cc: Jani Nikula <[email protected]>
Cc: Jérôme Glisse <[email protected]>
Cc: Joerg Roedel <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Marciniszyn <[email protected]>
Cc: Oded Gabbay <[email protected]>
Cc: Paolo Bonzini <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Radim Krčmář <[email protected]>
Cc: Sean Hefty <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
include/linux/mmu_notifier.h | 7 ++++---
mm/mmu_notifier.c | 8 ++++----
2 files changed, 8 insertions(+), 7 deletions(-)
diff -puN include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/include/linux/mmu_notifier.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
+#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
@@ -233,7 +234,7 @@ extern void __mmu_notifier_invalidate_ra
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
-extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -473,9 +474,9 @@ static inline void mmu_notifier_invalida
{
}
-static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
- return 0;
+ return false;
}
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
diff -puN mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix mm/mmu_notifier.c
--- a/mm/mmu_notifier.c~mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
+++ a/mm/mmu_notifier.c
@@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
* Must be called while holding mm->mmap_sem for either read or write.
* The result is guaranteed to be valid until mm->mmap_sem is dropped.
*/
-int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
{
struct mmu_notifier *mn;
int id;
- int ret = 0;
+ bool ret = false;
- WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
+ WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
if (!mm_has_notifiers(mm))
return ret;
@@ -259,7 +259,7 @@ int mm_has_blockable_invalidate_notifier
continue;
if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = 1;
+ ret = true;
break;
}
}
_
On 2017/12/16 1:25, Michal Hocko wrote:
>> struct mmu_notifier_ops {
>> + /*
>> + * Flags to specify behavior of callbacks for this MMU notifier.
>> + * Used to determine which context an operation may be called.
>> + *
>> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
>> + * block
>> + */
>> + int flags;
>
> This should be more specific IMHO. What do you think about the following
> wording?
>
> invalidate_{start,end,range} doesn't block on any locks which depend
> directly or indirectly (via lock chain or resources e.g. worker context)
> on a memory allocation.
I disagree. It needlessly complicates validating the correctness.
What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
schedule_timeout_idle() will not block on any locks which depend directly or
indirectly on a memory allocation, but we are already blocking other memory
allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
This is essentially the same as the "sleeping forever due to schedule_timeout_killable(1)
by a SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
by all other allocating threads" lockup problem. The OOM reaper does not want to get
blocked for that long.
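For example (a hypothetical notifier callback, purely to illustrate the point):

/*
 * This callback takes no allocation-dependent locks, so it would satisfy the
 * proposed wording, yet it still stalls the OOM reaper (and every thread
 * spinning on mutex_trylock(&oom_lock)) for 10 seconds per invalidation.
 */
static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	schedule_timeout_idle(10 * HZ);
}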
On 2017/12/16 8:04, Andrew Morton wrote:
>> The implementation is steered toward an expensive slowpath, such as after
>> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <[email protected]>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
>
> @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> * Must be called while holding mm->mmap_sem for either read or write.
> * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> */
> -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> {
> struct mmu_notifier *mn;
> int id;
> - int ret = 0;
> + bool ret = false;
>
> - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
>
> if (!mm_has_notifiers(mm))
> return ret;
The rwsem_is_locked() test isn't equivalent to a __mutex_owner() == current test, is it?
If rwsem_is_locked() returns true because somebody else has locked it, there is
no guarantee that the current thread has locked it before calling this function.
The down_write_trylock() test isn't equivalent to a __mutex_owner() == current test
either, is it? If somebody else holds it for read or write (the worst case being the
registration path), down_write_trylock() will return false even though the current
thread has not locked it for read or write.
I think this WARN_ON_ONCE() cannot detect incorrect calls to this function.
On Fri 15-12-17 15:04:29, Andrew Morton wrote:
> On Thu, 14 Dec 2017 13:30:56 -0800 (PST) David Rientjes <[email protected]> wrote:
>
> > Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> > prevented the oom reaper from unmapping private anonymous memory with the
> > oom reaper when the oom victim mm had mmu notifiers registered.
> >
> > The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> > around the unmap_page_range(), which is needed, can block and the oom
> > killer will stall forever waiting for the victim to exit, which may not
> > be possible without reaping.
> >
> > That concern is real, but only true for mmu notifiers that have blockable
> > invalidate_range_{start,end}() callbacks. This patch adds a "flags" field
> > to mmu notifier ops that can set a bit to indicate that these callbacks do
> > not block.
> >
> > The implementation is steered toward an expensive slowpath, such as after
> > the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
>
> some tweakage, please review.
>
> From: Andrew Morton <[email protected]>
> Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
>
> make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
Yes, that makes sense to me.
--
Michal Hocko
SUSE Labs
On Sat 16-12-17 15:21:51, Tetsuo Handa wrote:
> On 2017/12/16 1:25, Michal Hocko wrote:
> >> struct mmu_notifier_ops {
> >> + /*
> >> + * Flags to specify behavior of callbacks for this MMU notifier.
> >> + * Used to determine which context an operation may be called.
> >> + *
> >> + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> >> + * block
> >> + */
> >> + int flags;
> >
> > This should be more specific IMHO. What do you think about the following
> > wording?
> >
> > invalidate_{start,end,range} doesn't block on any locks which depend
> > directly or indirectly (via lock chain or resources e.g. worker context)
> > on a memory allocation.
>
> I disagree. It needlessly complicates validating the correctness.
But it makes it clear what the actual semantics are.
> What if the invalidate_{start,end} calls schedule_timeout_idle(10 * HZ) ?
Let's talk seriously about real code. Any mmu notifier doing this is
just crazy and should be fixed.
> schedule_timeout_idle() will not block on any locks which depend directly or
> indirectly on a memory allocation, but we are already blocking other memory
> allocating threads at mutex_trylock(&oom_lock) in __alloc_pages_may_oom().
Then the reaper will block and progress will be slower.
> This is essentially the same as the "sleeping forever due to schedule_timeout_killable(1)
> by a SCHED_IDLE thread with oom_lock held" versus "looping due to mutex_trylock(&oom_lock)
> by all other allocating threads" lockup problem. The OOM reaper does not want to get
> blocked for that long.
Yes, it absolutely doesn't want to do that. MMU notifiers should be
reasonable because they are called from performance-sensitive call
paths.
--
Michal Hocko
SUSE Labs
On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> On 2017/12/16 8:04, Andrew Morton wrote:
> >> The implementation is steered toward an expensive slowpath, such as after
> >> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> >
> > some tweakage, please review.
> >
> > From: Andrew Morton <[email protected]>
> > Subject: mm-mmu_notifier-annotate-mmu-notifiers-with-blockable-invalidate-callbacks-fix
> >
> > make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()
> >
>
> > @@ -240,13 +240,13 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalid
> > * Must be called while holding mm->mmap_sem for either read or write.
> > * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> > */
> > -int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> > {
> > struct mmu_notifier *mn;
> > int id;
> > - int ret = 0;
> > + bool ret = false;
> >
> > - WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> > + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
> >
> > if (!mm_has_notifiers(mm))
> > return ret;
>
> The rwsem_is_locked() test isn't equivalent to a __mutex_owner() == current test, is it?
> If rwsem_is_locked() returns true because somebody else has locked it, there is
> no guarantee that the current thread has locked it before calling this function.
>
> The down_write_trylock() test isn't equivalent to a __mutex_owner() == current test
> either, is it? If somebody else holds it for read or write (the worst case being the
> registration path), down_write_trylock() will return false even though the current
> thread has not locked it for read or write.
>
> I think this WARN_ON_ONCE() cannot detect incorrect calls to this function.
Yes, it cannot catch _all_ cases. This is an inherent problem of
rwsem_is_locked() because semaphores do not really have an owner concept.
The core idea behind this, I guess, is to catch obviously incorrect
usage, and as such it gives us reasonable coverage. I could live without
the annotation, but rwsem_is_locked() looks better than down_write_trylock()
to me.
--
Michal Hocko
SUSE Labs
Michal Hocko wrote:
> On Sat 16-12-17 16:14:07, Tetsuo Handa wrote:
> > The rwsem_is_locked() test isn't equivalent to a __mutex_owner() == current test, is it?
> > If rwsem_is_locked() returns true because somebody else has locked it, there is
> > no guarantee that the current thread has locked it before calling this function.
> >
> > The down_write_trylock() test isn't equivalent to a __mutex_owner() == current test
> > either, is it? If somebody else holds it for read or write (the worst case being the
> > registration path), down_write_trylock() will return false even though the current
> > thread has not locked it for read or write.
> >
> > I think this WARN_ON_ONCE() cannot detect incorrect calls to this function.
>
> Yes, it cannot catch _all_ cases. This is an inherent problem of
> rwsem_is_locked() because semaphores do not really have an owner concept.
> The core idea behind this, I guess, is to catch obviously incorrect
> usage, and as such it gives us reasonable coverage. I could live without
> the annotation, but rwsem_is_locked() looks better than down_write_trylock()
> to me.
I agree that rwsem_is_locked() is better than down_write_trylock(), because
the former has no side effect when nobody is holding the rwsem.
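To spell out that side effect (a sketch, not proposed code): when nobody holds
mmap_sem, the trylock variant actually takes the lock and the caller has to
drop it again, whereas rwsem_is_locked() only inspects the lock state:

	/* the assertion itself can end up acquiring the lock */
	if (WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem)))
		up_write(&mm->mmap_sem);	/* undo the accidental acquisition */

	/* no such side effect here, but also no ownership information */
	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));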
Looking at how rwsem_is_locked() is used for sanity checks in the mm/
directory, only VM_BUG_ON(), VM_BUG_ON_MM(), or VM_BUG_ON_VMA() are used.
Therefore, this WARN_ON_ONCE() usage might be irregular.
Also, regarding the problem that semaphores do not really have an owner
concept, we could add "struct task_struct *owner_of_mmap_sem_for_write" to
"struct mm_struct" and replace direct down_write_killable() etc. calls with
corresponding wrapper functions like
int __must_check get_mmap_sem_write_killable(struct mm_struct *mm)
{
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	mm->owner_of_mmap_sem_for_write = current;
	return 0;
}
and make the rwsem_is_locked() test more robust by doing something like
bool mmap_sem_is_held_for_write_by_current(struct mm_struct *mm)
{
	return mm->owner_of_mmap_sem_for_write == current;
}
. If there is a guarantee that no thread is allowed to hold multiple
mmap_sems at once, wrapper functions which manipulate a per-"struct
task_struct" flag would also work.
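A usage sketch of those hypothetical wrappers (frob_mm() and the
put_mmap_sem_write() counterpart that clears the recorded owner are invented
here for illustration):

int frob_mm(struct mm_struct *mm)
{
	int err = get_mmap_sem_write_killable(mm);

	if (err)
		return err;
	/* callees can now assert ownership precisely */
	WARN_ON_ONCE(!mmap_sem_is_held_for_write_by_current(mm));
	/* ... modify the address space under the write lock ... */
	put_mmap_sem_write(mm);		/* clear the owner, then up_write() */
	return 0;
}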
But the fundamental problem is that we are relying heavily on runtime
testing (e.g. lockdep / syzkaller). Since there are a lot of factors which
prevent sanity checks from being called (e.g. conditional calls based on
threshold checks), we cannot exercise all paths, and everybody is making
changes without understanding all the dependencies. Consider that nobody
noticed that relying on __GFP_DIRECT_RECLAIM with oom_lock held may cause
lockups. We are introducing unsafe dependencies too easily. I think that we
need to describe all the dependencies without relying on runtime testing.
Back to the MMU_INVALIDATE_DOES_NOT_BLOCK flag, I worry that we will fail
to notice when somebody in the future changes an mmu notifier which
currently does not rely on __GFP_DIRECT_RECLAIM so that it mistakenly
starts relying on __GFP_DIRECT_RECLAIM. Staying at "whether the callback
might sleep" granularity will help prevent such unnoticed dependency bugs
from being introduced.
Fix mmu_notifier.h comments in "mm, mmu_notifier: annotate mmu notifiers
with blockable invalidate callbacks".
mmu_notifier_invalidate_range_end() can also call the invalidate_range()
callback, so it must not block for MMU_INVALIDATE_DOES_NOT_BLOCK to be
set.
Also remove a bogus comment about invalidate_range() always being called
under the ptl spinlock.
Signed-off-by: David Rientjes <[email protected]>
---
include/linux/mmu_notifier.h | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -34,8 +34,8 @@ struct mmu_notifier_ops {
* Flags to specify behavior of callbacks for this MMU notifier.
* Used to determine which context an operation may be called.
*
- * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
- * block
+ * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+ * block
*/
int flags;
@@ -151,8 +151,9 @@ struct mmu_notifier_ops {
* address space but may still be referenced by sptes until
* the last refcount is dropped.
*
- * If both of these callbacks cannot block, mmu_notifier_ops.flags
- * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
+ * If both of these callbacks cannot block, and invalidate_range
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range_start)(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -175,12 +176,13 @@ struct mmu_notifier_ops {
* external TLB range needs to be flushed. For more in depth
* discussion on this see Documentation/vm/mmu_notifier.txt
*
- * The invalidate_range() function is called under the ptl
- * spin-lock and not allowed to sleep.
- *
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
+ *
+ * If this callback cannot block, and invalidate_range_{start,end}
+ * cannot block, mmu_notifier_ops.flags should have
+ * MMU_INVALIDATE_DOES_NOT_BLOCK set.
*/
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end);