2015-07-23 21:55:42

by Spencer Baugh

Subject: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

From: Joern Engel <[email protected]>

Mapping large memory spaces can be slow and prevent high-priority
realtime threads from preempting lower-priority threads for a long time.
In my case it was a 256GB mapping causing at least 950ms scheduler
delay. Problem detection is ratelimited and depends on interrupts
happening at the right time, so actual delay is likely worse.

------------[ cut here ]------------
WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
Thread not rescheduled for 36 jiffies
CPU: 14 PID: 6684 Comm: foo Tainted: G O 3.10.59+
0000000000000009 ffff883f7fbc3ee0 ffffffff8163a12c ffff883f7fbc3f18
ffffffff8103f131 ffff887f48275ac0 0000000000000012 000000000000007c
0000000000000000 ffff887f5bc11fd8 ffff883f7fbc3f78 ffffffff8103f19c
Call Trace:
<IRQ> [<ffffffff8163a12c>] dump_stack+0x19/0x1b
[<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
[<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
[<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
[<ffffffff8164a556>] do_IRQ+0x126/0x140
[<ffffffff816407ef>] common_interrupt+0x6f/0x6f
<EOI> [<ffffffff810fde68>] ? set_pageblock_migratetype+0x28/0x30
[<ffffffff8126da37>] ? clear_page_c_e+0x7/0x10
[<ffffffff811004b3>] ? get_page_from_freelist+0x5b3/0x880
[<ffffffff81100863>] __alloc_pages_nodemask+0xe3/0x810
[<ffffffff8126f48b>] ? trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff81138206>] alloc_pages_current+0x86/0x120
[<ffffffff810fc02e>] __get_free_pages+0xe/0x50
[<ffffffff81034e85>] pte_alloc_one_kernel+0x15/0x20
[<ffffffff8111b6cd>] __pte_alloc_kernel+0x1d/0xf0
[<ffffffff8126531c>] ioremap_page_range+0x2cc/0x320
[<ffffffff81031619>] __ioremap_caller+0x1e9/0x2b0
[<ffffffff810316f7>] ioremap_nocache+0x17/0x20
[<ffffffff81275b45>] pci_iomap+0x55/0xb0
[<ffffffffa007f29a>] vfio_pci_mmap+0x1ea/0x210 [vfio_pci]
[<ffffffffa0025173>] vfio_device_fops_mmap+0x23/0x30 [vfio]
[<ffffffff81124ed8>] mmap_region+0x3d8/0x5e0
[<ffffffff811253e5>] do_mmap_pgoff+0x305/0x3c0
[<ffffffff8126f3f3>] ? call_rwsem_down_write_failed+0x13/0x20
[<ffffffff81111677>] vm_mmap_pgoff+0x67/0xa0
[<ffffffff811237e2>] SyS_mmap_pgoff+0x272/0x2e0
[<ffffffff810067e2>] SyS_mmap+0x22/0x30
[<ffffffff81648c59>] system_call_fastpath+0x16/0x1b
---[ end trace 6b0a8d2341444bdd ]---
------------[ cut here ]------------
WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
Thread not rescheduled for 95 jiffies
CPU: 14 PID: 6684 Comm: foo Tainted: G W O 3.10.59+
0000000000000009 ffff883f7fbc3ee0 ffffffff8163a12c ffff883f7fbc3f18
ffffffff8103f131 ffff887f48275ac0 000000000000002f 000000000000007c
0000000000000000 00007fadd1e00000 ffff883f7fbc3f78 ffffffff8103f19c
Call Trace:
<IRQ> [<ffffffff8163a12c>] dump_stack+0x19/0x1b
[<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
[<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
[<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
[<ffffffff8164a556>] do_IRQ+0x126/0x140
[<ffffffff816407ef>] common_interrupt+0x6f/0x6f
<EOI> [<ffffffff81640483>] ? _raw_spin_lock+0x13/0x30
[<ffffffff8111b621>] __pte_alloc+0x31/0xc0
[<ffffffff8111feac>] remap_pfn_range+0x45c/0x470
[<ffffffffa007f1f8>] vfio_pci_mmap+0x148/0x210 [vfio_pci]
[<ffffffffa0025173>] vfio_device_fops_mmap+0x23/0x30 [vfio]
[<ffffffff81124ed8>] mmap_region+0x3d8/0x5e0
[<ffffffff811253e5>] do_mmap_pgoff+0x305/0x3c0
[<ffffffff8126f3f3>] ? call_rwsem_down_write_failed+0x13/0x20
[<ffffffff81111677>] vm_mmap_pgoff+0x67/0xa0
[<ffffffff811237e2>] SyS_mmap_pgoff+0x272/0x2e0
[<ffffffff810067e2>] SyS_mmap+0x22/0x30
[<ffffffff81648c59>] system_call_fastpath+0x16/0x1b
---[ end trace 6b0a8d2341444bde ]---
------------[ cut here ]------------
WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
Thread not rescheduled for 45 jiffies
CPU: 18 PID: 21726 Comm: foo Tainted: G O 3.10.59+
0000000000000009 ffff88203f203ee0 ffffffff8163a13c ffff88203f203f18
ffffffff8103f131 ffff881ec5f1ad60 0000000000000016 000000000000006e
0000000000000000 ffffc939a6dd8000 ffff88203f203f78 ffffffff8103f19c
Call Trace:
<IRQ> [<ffffffff8163a13c>] dump_stack+0x19/0x1b
[<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
[<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
[<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
[<ffffffff8164a556>] do_IRQ+0x126/0x140
[<ffffffff816407ef>] common_interrupt+0x6f/0x6f
<EOI> [<ffffffff81640861>] ? retint_restore_args+0x13/0x13
[<ffffffff810346c7>] ? free_memtype+0x87/0x150
[<ffffffff8112bb46>] ? vunmap_page_range+0x1e6/0x2a0
[<ffffffff8112c5e1>] remove_vm_area+0x51/0x70
[<ffffffff810318a7>] iounmap+0x67/0xa0
[<ffffffff812757e5>] pci_iounmap+0x35/0x40
[<ffffffffa00973da>] vfio_pci_release+0x9a/0x150 [vfio_pci]
[<ffffffffa0065cbc>] vfio_device_fops_release+0x1c/0x40 [vfio]
[<ffffffff8114d82b>] __fput+0xdb/0x220
[<ffffffff8114d97e>] ____fput+0xe/0x10
[<ffffffff810614ac>] task_work_run+0xbc/0xe0
[<ffffffff81043d0e>] do_exit+0x3ce/0xe50
[<ffffffff8104557f>] do_group_exit+0x3f/0xa0
[<ffffffff81054769>] get_signal_to_deliver+0x1a9/0x5b0
[<ffffffff810023f8>] do_signal+0x48/0x5e0
[<ffffffff81056778>] ? k_getrusage+0x368/0x3d0
[<ffffffff810736e2>] ? default_wake_function+0x12/0x20
[<ffffffff816471c0>] ? kprobe_flush_task+0xc0/0x150
[<ffffffff81070684>] ? finish_task_switch+0xc4/0xe0
[<ffffffff810029f5>] do_notify_resume+0x65/0x80
[<ffffffff8164098e>] retint_signal+0x4d/0x9f
---[ end trace 3506c05e4a0af3e5 ]---

Signed-off-by: Joern Engel <[email protected]>
Signed-off-by: Spencer Baugh <[email protected]>
---
lib/ioremap.c | 1 +
mm/memory.c | 1 +
mm/vmalloc.c | 1 +
3 files changed, 3 insertions(+)

diff --git a/lib/ioremap.c b/lib/ioremap.c
index 86c8911..d38e46d 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -90,6 +90,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,

if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
return -ENOMEM;
+ cond_resched();
} while (pmd++, addr = next, addr != end);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 388dcf9..1541880 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1656,6 +1656,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
if (remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot))
return -ENOMEM;
+ cond_resched();
} while (pmd++, addr = next, addr != end);
return 0;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2faaa29..d503c8e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -80,6 +80,7 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
+ cond_resched();
} while (pmd++, addr = next, addr != end);
}

--
2.5.0.rc3


2015-07-23 23:33:22

by Toshi Kani

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu, 2015-07-23 at 14:54 -0700, Spencer Baugh wrote:
> From: Joern Engel <[email protected]>
>
> Mapping large memory spaces can be slow and prevent high-priority
> realtime threads from preempting lower-priority threads for a long time.

Yes, and one of the goals of large page ioremap support is to address such
problems.

> In my case it was a 256GB mapping causing at least 950ms scheduler
> delay. Problem detection is ratelimited and depends on interrupts
> happening at the right time, so actual delay is likely worse.

ioremap supports 1GB and 2MB mappings now. If you create 1GB mappings, you
only need to initialize 256 pud entries, which should not take a long time.

Is the 256GB range aligned to 1GB (or 2MB)? From the log below, it appears
that you ended up with 4KB mappings, which is the problem.
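
(Back-of-the-envelope: 256GB mapped with 4KB pages means 2^26 = 64M pte
entries to fill and ~128K pte pages to allocate, versus only 256 pud
entries for 1GB mappings -- roughly a factor of 262144 fewer entries.)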

> ------------[ cut here ]------------
> WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
> Thread not rescheduled for 36 jiffies
> CPU: 14 PID: 6684 Comm: foo Tainted: G O 3.10.59+
> 0000000000000009 ffff883f7fbc3ee0 ffffffff8163a12c ffff883f7fbc3f18
> ffffffff8103f131 ffff887f48275ac0 0000000000000012 000000000000007c
> 0000000000000000 ffff887f5bc11fd8 ffff883f7fbc3f78 ffffffff8103f19c
> Call Trace:
> <IRQ> [<ffffffff8163a12c>] dump_stack+0x19/0x1b
> [<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
> [<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
> [<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
> [<ffffffff8164a556>] do_IRQ+0x126/0x140
> [<ffffffff816407ef>] common_interrupt+0x6f/0x6f
> <EOI> [<ffffffff810fde68>] ? set_pageblock_migratetype+0x28/0x30
> [<ffffffff8126da37>] ? clear_page_c_e+0x7/0x10
> [<ffffffff811004b3>] ? get_page_from_freelist+0x5b3/0x880
> [<ffffffff81100863>] __alloc_pages_nodemask+0xe3/0x810
> [<ffffffff8126f48b>] ? trace_hardirqs_on_thunk+0x3a/0x3c
> [<ffffffff81138206>] alloc_pages_current+0x86/0x120
> [<ffffffff810fc02e>] __get_free_pages+0xe/0x50
> [<ffffffff81034e85>] pte_alloc_one_kernel+0x15/0x20
> [<ffffffff8111b6cd>] __pte_alloc_kernel+0x1d/0xf0

This shows that you created 4KB (pte) mappings.

> [<ffffffff8126531c>] ioremap_page_range+0x2cc/0x320
> [<ffffffff81031619>] __ioremap_caller+0x1e9/0x2b0
> [<ffffffff810316f7>] ioremap_nocache+0x17/0x20
> [<ffffffff81275b45>] pci_iomap+0x55/0xb0
> [<ffffffffa007f29a>] vfio_pci_mmap+0x1ea/0x210 [vfio_pci]
> [<ffffffffa0025173>] vfio_device_fops_mmap+0x23/0x30 [vfio]
> [<ffffffff81124ed8>] mmap_region+0x3d8/0x5e0
> [<ffffffff811253e5>] do_mmap_pgoff+0x305/0x3c0
> [<ffffffff8126f3f3>] ? call_rwsem_down_write_failed+0x13/0x20
> [<ffffffff81111677>] vm_mmap_pgoff+0x67/0xa0
> [<ffffffff811237e2>] SyS_mmap_pgoff+0x272/0x2e0
> [<ffffffff810067e2>] SyS_mmap+0x22/0x30
> [<ffffffff81648c59>] system_call_fastpath+0x16/0x1b
> ---[ end trace 6b0a8d2341444bdd ]---
> ------------[ cut here ]------------
> WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
> Thread not rescheduled for 95 jiffies
> CPU: 14 PID: 6684 Comm: foo Tainted: G W O 3.10.59+
> 0000000000000009 ffff883f7fbc3ee0 ffffffff8163a12c ffff883f7fbc3f18
> ffffffff8103f131 ffff887f48275ac0 000000000000002f 000000000000007c
> 0000000000000000 00007fadd1e00000 ffff883f7fbc3f78 ffffffff8103f19c
> Call Trace:
> <IRQ> [<ffffffff8163a12c>] dump_stack+0x19/0x1b
> [<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
> [<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
> [<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
> [<ffffffff8164a556>] do_IRQ+0x126/0x140
> [<ffffffff816407ef>] common_interrupt+0x6f/0x6f
> <EOI> [<ffffffff81640483>] ? _raw_spin_lock+0x13/0x30
> [<ffffffff8111b621>] __pte_alloc+0x31/0xc0
> [<ffffffff8111feac>] remap_pfn_range+0x45c/0x470

remap_pfn_range() does not have large page mapping support yet. So, yes,
this can still take a long time at this point. We can extend large page
support for this interface if necessary.

> [<ffffffffa007f1f8>] vfio_pci_mmap+0x148/0x210 [vfio_pci]
> [<ffffffffa0025173>] vfio_device_fops_mmap+0x23/0x30 [vfio]
> [<ffffffff81124ed8>] mmap_region+0x3d8/0x5e0
> [<ffffffff811253e5>] do_mmap_pgoff+0x305/0x3c0
> [<ffffffff8126f3f3>] ? call_rwsem_down_write_failed+0x13/0x20
> [<ffffffff81111677>] vm_mmap_pgoff+0x67/0xa0
> [<ffffffff811237e2>] SyS_mmap_pgoff+0x272/0x2e0
> [<ffffffff810067e2>] SyS_mmap+0x22/0x30
> [<ffffffff81648c59>] system_call_fastpath+0x16/0x1b
> ---[ end trace 6b0a8d2341444bde ]---
> ------------[ cut here ]------------
> WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
> Thread not rescheduled for 45 jiffies
> CPU: 18 PID: 21726 Comm: foo Tainted: G O 3.10.59+
> 0000000000000009 ffff88203f203ee0 ffffffff8163a13c ffff88203f203f18
> ffffffff8103f131 ffff881ec5f1ad60 0000000000000016 000000000000006e
> 0000000000000000 ffffc939a6dd8000 ffff88203f203f78 ffffffff8103f19c
> Call Trace:
> <IRQ> [<ffffffff8163a13c>] dump_stack+0x19/0x1b
> [<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
> [<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
> [<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
> [<ffffffff8164a556>] do_IRQ+0x126/0x140
> [<ffffffff816407ef>] common_interrupt+0x6f/0x6f
> <EOI> [<ffffffff81640861>] ? retint_restore_args+0x13/0x13
> [<ffffffff810346c7>] ? free_memtype+0x87/0x150
> [<ffffffff8112bb46>] ? vunmap_page_range+0x1e6/0x2a0
> [<ffffffff8112c5e1>] remove_vm_area+0x51/0x70
> [<ffffffff810318a7>] iounmap+0x67/0xa0

iounmap() should be fast if you created 1GB mappings.

Thanks,
-Toshi

> [<ffffffff812757e5>] pci_iounmap+0x35/0x40
> [<ffffffffa00973da>] vfio_pci_release+0x9a/0x150 [vfio_pci]
> [<ffffffffa0065cbc>] vfio_device_fops_release+0x1c/0x40 [vfio]
> [<ffffffff8114d82b>] __fput+0xdb/0x220
> [<ffffffff8114d97e>] ____fput+0xe/0x10
> [<ffffffff810614ac>] task_work_run+0xbc/0xe0
> [<ffffffff81043d0e>] do_exit+0x3ce/0xe50
> [<ffffffff8104557f>] do_group_exit+0x3f/0xa0
> [<ffffffff81054769>] get_signal_to_deliver+0x1a9/0x5b0
> [<ffffffff810023f8>] do_signal+0x48/0x5e0
> [<ffffffff81056778>] ? k_getrusage+0x368/0x3d0
> [<ffffffff810736e2>] ? default_wake_function+0x12/0x20
> [<ffffffff816471c0>] ? kprobe_flush_task+0xc0/0x150
> [<ffffffff81070684>] ? finish_task_switch+0xc4/0xe0
> [<ffffffff810029f5>] do_notify_resume+0x65/0x80
> [<ffffffff8164098e>] retint_signal+0x4d/0x9f
> ---[ end trace 3506c05e4a0af3e5 ]---

2015-07-24 07:04:28

by Michal Hocko

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> From: Joern Engel <[email protected]>
>
> Mapping large memory spaces can be slow and prevent high-priority
> realtime threads from preempting lower-priority threads for a long time.

How can a lower priority task block the high priority one? Do you have
preemption disabled?

[...]
--
Michal Hocko
SUSE Labs

2015-07-24 16:56:33

by Jörn Engel

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Fri, Jul 24, 2015 at 09:04:21AM +0200, Michal Hocko wrote:
> On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> > From: Joern Engel <[email protected]>
> >
> > Mapping large memory spaces can be slow and prevent high-priority
> > realtime threads from preempting lower-priority threads for a long time.
>
> How can a lower priority task block the high priority one? Do you have
> preemption disabled?

Yes.

Jörn

--
If you're willing to restrict the flexibility of your approach,
you can almost always do something better.
-- John Carmack

2015-07-24 17:00:42

by Jörn Engel

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu, Jul 23, 2015 at 05:32:03PM -0600, Toshi Kani wrote:
> On Thu, 2015-07-23 at 14:54 -0700, Spencer Baugh wrote:
> > From: Joern Engel <[email protected]>
> >
> > Mapping large memory spaces can be slow and prevent high-priority
> > realtime threads from preempting lower-priority threads for a long time.
>
> Yes, and one of the goals of large page ioremap support is to address such
> problems.

Nice! Once we upgrade we should retest this one then.

> > ------------[ cut here ]------------
> > WARNING: at arch/x86/kernel/irq.c:182 do_IRQ+0x126/0x140()
> > Thread not rescheduled for 95 jiffies
> > CPU: 14 PID: 6684 Comm: foo Tainted: G W O 3.10.59+
> > 0000000000000009 ffff883f7fbc3ee0 ffffffff8163a12c ffff883f7fbc3f18
> > ffffffff8103f131 ffff887f48275ac0 000000000000002f 000000000000007c
> > 0000000000000000 00007fadd1e00000 ffff883f7fbc3f78 ffffffff8103f19c
> > Call Trace:
> > <IRQ> [<ffffffff8163a12c>] dump_stack+0x19/0x1b
> > [<ffffffff8103f131>] warn_slowpath_common+0x61/0x80
> > [<ffffffff8103f19c>] warn_slowpath_fmt+0x4c/0x50
> > [<ffffffff810bd917>] ? rcu_irq_exit+0x77/0xc0
> > [<ffffffff8164a556>] do_IRQ+0x126/0x140
> > [<ffffffff816407ef>] common_interrupt+0x6f/0x6f
> > <EOI> [<ffffffff81640483>] ? _raw_spin_lock+0x13/0x30
> > [<ffffffff8111b621>] __pte_alloc+0x31/0xc0
> > [<ffffffff8111feac>] remap_pfn_range+0x45c/0x470
>
> remap_pfn_range() does not have large page mapping support yet. So, yes,
> this can still take a long time at this point. We can extend large page
> support for this interface if necessary.

A cond_resched() is enough to solve the latency impact. But I suspect
large pages will perform better as well, so having that support would be
appreciated.

> > [<ffffffffa007f1f8>] vfio_pci_mmap+0x148/0x210 [vfio_pci]
> > [<ffffffffa0025173>] vfio_device_fops_mmap+0x23/0x30 [vfio]
> > [<ffffffff81124ed8>] mmap_region+0x3d8/0x5e0
> > [<ffffffff811253e5>] do_mmap_pgoff+0x305/0x3c0
> > [<ffffffff8126f3f3>] ? call_rwsem_down_write_failed+0x13/0x20
> > [<ffffffff81111677>] vm_mmap_pgoff+0x67/0xa0
> > [<ffffffff811237e2>] SyS_mmap_pgoff+0x272/0x2e0
> > [<ffffffff810067e2>] SyS_mmap+0x22/0x30
> > [<ffffffff81648c59>] system_call_fastpath+0x16/0x1b
> > ---[ end trace 6b0a8d2341444bde ]---

Jörn

--
A defeated army first battles and then seeks victory.
-- Sun Tzu

2015-07-27 07:08:46

by Michal Hocko

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Fri 24-07-15 09:56:27, Jörn Engel wrote:
> On Fri, Jul 24, 2015 at 09:04:21AM +0200, Michal Hocko wrote:
> > On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> > > From: Joern Engel <[email protected]>
> > >
> > > Mapping large memory spaces can be slow and prevent high-priority
> > > realtime threads from preempting lower-priority threads for a long time.
> >
> > How can a lower priority task block the high priority one? Do you have
> > preemption disabled?
>
> Yes.

Yes what? PREEMPT enabled and still a low priority task starving a high
priority one? What is your exact setup?
--
Michal Hocko
SUSE Labs

2015-07-27 08:29:58

by Mike Galbraith

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Mon, 2015-07-27 at 09:08 +0200, Michal Hocko wrote:
> On Fri 24-07-15 09:56:27, Jörn Engel wrote:
> > On Fri, Jul 24, 2015 at 09:04:21AM +0200, Michal Hocko wrote:
> > > On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> > > > From: Joern Engel <[email protected]>
> > > >
> > > > Mapping large memory spaces can be slow and prevent high-priority
> > > > realtime threads from preempting lower-priority threads for a long time.
> > >
> > > How can a lower priority task block the high priority one? Do you have
> > > preemption disabled?
> >
> > Yes.
>
> Yes what? PREEMPT enabled and still a low priority task starving a high
> priority one? What is your exact setup?

There are other places that are pretty horrible too if you don't run a
PREEMPT kernel. Spending milliseconds in kernel kinda takes the real
outta realtime, even for the most casual of users.

(ponder: preempt kernel for rt only, rt could have decent latency
without driving normal task throughput through the floor)

kbuild make -j8 + cyclictest -Smp99

PREEMPT_VOLUNTARY Before:
T: 0 ( 6459) P:99 I:1000 C: 286022 Min: 1 Act: 1 Avg: 5 Max: 1718
T: 1 ( 6460) P:99 I:1500 C: 190701 Min: 1 Act: 1 Avg: 5 Max: 1639
T: 2 ( 6461) P:99 I:2000 C: 143024 Min: 1 Act: 2 Avg: 5 Max: 2504
T: 3 ( 6462) P:99 I:2500 C: 114420 Min: 1 Act: 1 Avg: 5 Max: 1922
T: 4 ( 6463) P:99 I:3000 C: 95350 Min: 1 Act: 1 Avg: 5 Max: 1482
T: 5 ( 6464) P:99 I:3500 C: 81728 Min: 1 Act: 2 Avg: 5 Max: 1496
T: 6 ( 6465) P:99 I:4000 C: 71511 Min: 1 Act: 1 Avg: 5 Max: 1813
T: 7 ( 6466) P:99 I:4500 C: 63566 Min: 1 Act: 1 Avg: 5 Max: 1901

PREEMPT_VOLUNTARY After:
T: 0 ( 6997) P:99 I:1000 C: 286032 Min: 1 Act: 2 Avg: 3 Max: 125
T: 1 ( 6998) P:99 I:1500 C: 190687 Min: 1 Act: 1 Avg: 4 Max: 130
T: 2 ( 6999) P:99 I:2000 C: 143015 Min: 1 Act: 1 Avg: 4 Max: 97
T: 3 ( 7000) P:99 I:2500 C: 114411 Min: 1 Act: 2 Avg: 4 Max: 90
T: 4 ( 7001) P:99 I:3000 C: 95341 Min: 1 Act: 1 Avg: 4 Max: 139
T: 5 ( 7002) P:99 I:3500 C: 81722 Min: 1 Act: 2 Avg: 4 Max: 112
T: 6 ( 7003) P:99 I:4000 C: 71506 Min: 1 Act: 2 Avg: 4 Max: 137
T: 7 ( 7004) P:99 I:4500 C: 63561 Min: 1 Act: 2 Avg: 4 Max: 109

---
mm/memory.c | 8 ++++++--
mm/page_alloc.c | 1 +
2 files changed, 7 insertions(+), 2 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1174,8 +1174,10 @@ static unsigned long zap_pte_range(struc
force_flush = 0;
tlb_flush_mmu_free(tlb);

- if (addr != end)
+ if (addr != end) {
+ cond_resched();
goto again;
+ }
}

return addr;
@@ -1336,8 +1338,10 @@ void unmap_vmas(struct mmu_gather *tlb,
struct mm_struct *mm = vma->vm_mm;

mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+ cond_resched();
+ }
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1934,6 +1934,7 @@ void free_hot_cold_page_list(struct list
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
+ cond_resched();
}
}




2015-07-27 15:18:19

by Jörn Engel

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Mon, Jul 27, 2015 at 09:08:42AM +0200, Michal Hocko wrote:
> On Fri 24-07-15 09:56:27, Jörn Engel wrote:
> > On Fri, Jul 24, 2015 at 09:04:21AM +0200, Michal Hocko wrote:
> > > On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> > > > From: Joern Engel <[email protected]>
> > > >
> > > > Mapping large memory spaces can be slow and prevent high-priority
> > > > realtime threads from preempting lower-priority threads for a long time.
> > >
> > > How can a lower priority task block the high priority one? Do you have
> > > preemption disabled?
> >
> > Yes.

We have kernel preemption disabled. A lower-priority task in a system
call will block higher-priority tasks.

Jörn

--
After Iraq, if a PM said there are trees in the rainforest you’d need
to send for proof.
-- Alex Thompson

2015-07-28 13:33:01

by Michal Hocko

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Mon 27-07-15 08:18:14, Jörn Engel wrote:
> On Mon, Jul 27, 2015 at 09:08:42AM +0200, Michal Hocko wrote:
> On Fri 24-07-15 09:56:27, Jörn Engel wrote:
> > > On Fri, Jul 24, 2015 at 09:04:21AM +0200, Michal Hocko wrote:
> > > > On Thu 23-07-15 14:54:33, Spencer Baugh wrote:
> > > > > From: Joern Engel <[email protected]>
> > > > >
> > > > > Mapping large memory spaces can be slow and prevent high-priority
> > > > > realtime threads from preempting lower-priority threads for a long time.
> > > >
> > > > How can a lower priority task block the high priority one? Do you have
> > > > preemption disabled?
> > >
> > > Yes.
>
> We have kernel preemption disabled. A lower-priority task in a system
> call will block higher-priority tasks.

This is an inherent problem of !PREEMPT, though. There are many
loops which can take quite some time but we do not want to sprinkle
cond_resched all over the kernel. On the other hand these ioremap/remap
and vunmap page table walks do not have any cond_resched points AFAICS,
so we can at least mimic zap_pmd_range, which does cond_resched.
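
For reference, the zap_pmd_range loop looks roughly like this (a sketch
from memory; details vary by kernel version):

	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct zap_details *details)
	{
		pmd_t *pmd = pmd_offset(pud, addr);
		unsigned long next;

		do {
			next = pmd_addr_end(addr, end);
			/* ... transparent huge pmd handling elided ... */
			next = zap_pte_range(tlb, vma, pmd, addr, next, details);
			/* one voluntary preemption point per pmd, i.e. per 2MB */
			cond_resched();
		} while (pmd++, addr = next, addr != end);

		return addr;
	}

That once-per-pmd cadence is exactly what the patch adds to the remap
and vunmap walks.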
--
Michal Hocko
SUSE Labs

2015-07-28 17:08:51

by Jörn Engel

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Tue, Jul 28, 2015 at 03:32:55PM +0200, Michal Hocko wrote:
> >
> > We have kernel preemption disabled. A lower-priority task in a system
> > call will block higher-priority tasks.
>
> This is an inherent problem of !PREEMPT, though. There are many
> loops which can take quite some time but we do not want to sprinkle
> cond_resched all over the kernel. On the other hand these ioremap/remap
> and vunmap page table walks do not have any cond_resched points AFAICS,
> so we can at least mimic zap_pmd_range, which does cond_resched.

Even for !PREEMPT we don't want infinite scheduler latencies. Real
question is how much we are willing to accept and at what point we
should start sprinkling cond_resched. I would pick 100ms, but that is
just a personal choice. If we decide on 200ms or 500ms, I can live with
that too.

But whatever value we pick, I suspect these resched points need to go in
eventually. As memory sizes grow, people will also start mapping bigger
regions and the scheduler latency will eventually exceed whatever value
we picked.

Jörn

--
Fools ignore complexity. Pragmatists suffer it.
Some can avoid it. Geniuses remove it.
-- Perlis's Programming Proverb #58, SIGPLAN Notices, Sept. 1982

2015-07-29 09:54:45

by Michal Hocko

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Tue 28-07-15 10:08:44, Jörn Engel wrote:
> On Tue, Jul 28, 2015 at 03:32:55PM +0200, Michal Hocko wrote:
> > >
> > > We have kernel preemption disabled. A lower-priority task in a system
> > > call will block higher-priority tasks.
> >
> > This is an inherent problem of !PREEMPT, though. There are many
> > loops which can take quite some time but we do not want to sprinkle
> > cond_resched all over the kernel. On the other hand these ioremap/remap
> > and vunmap page table walks do not have any cond_resched points AFAICS,
> > so we can at least mimic zap_pmd_range, which does cond_resched.
>
> Even for !PREEMPT we don't want infinite scheduler latencies. Real
> question is how much we are willing to accept and at what point we
> should start sprinkling cond_resched. I would pick 100ms, but that is
> just a personal choice. If we decide on 200ms or 500ms, I can live with
> that too.

I do not think this is about a magic value. It is more about natural
places for a scheduling point. As I've written above, cond_resched at pmd
level of the page table walk sounds reasonable to me as we do that
already in zap_pmd_range, and consistency would make sense to me.

--
Michal Hocko
SUSE Labs

2015-07-30 15:23:00

by Mike Galbraith

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Wed, 2015-07-29 at 11:54 +0200, Michal Hocko wrote:
> On Tue 28-07-15 10:08:44, Jörn Engel wrote:
> > On Tue, Jul 28, 2015 at 03:32:55PM +0200, Michal Hocko wrote:
> > > >
> > > > We have kernel preemption disabled. A lower-priority task in a system
> > > > call will block higher-priority tasks.
> > >
> > > This is an inherent problem of !PREEMPT, though. There are many
> > > loops which can take quite some time but we do not want to sprinkle
> > > cond_resched all over the kernel. On the other hand these ioremap/remap
> > > and vunmap page table walks do not have any cond_resched points AFAICS,
> > > so we can at least mimic zap_pmd_range, which does cond_resched.
> >
> > Even for !PREEMPT we don't want infinite scheduler latencies. Real
> > question is how much we are willing to accept and at what point we
> > should start sprinkling cond_resched. I would pick 100ms, but that is
> > just a personal choice. If we decide on 200ms or 500ms, I can live with
> > that too.
>
> I do not think this is about a magic value. It is more about natural
> places for a scheduling point. As I've written above, cond_resched at pmd
> level of the page table walk sounds reasonable to me as we do that
> already in zap_pmd_range, and consistency would make sense to me.

I piddled about with the thought that it might be nice to be able to
sprinkle cond_resched() about to cut rt latencies without wrecking
normal load throughput, cobbled together a cond_resched_rt().

On my little box that was a waste of time, as the biggest hits are block
softirq and free_hot_cold_page_list().

-Mike

2015-07-30 16:58:10

by Jörn Engel

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu, Jul 30, 2015 at 05:22:55PM +0200, Mike Galbraith wrote:
>
> I piddled about with the thought that it might be nice to be able to
> sprinkle cond_resched() about to cut rt latencies without wrecking
> normal load throughput, cobbled together a cond_resched_rt().
>
> On my little box that was a waste of time, as the biggest hits are block
> softirq and free_hot_cold_page_list().

Block softirq is one of our problems as well. It is a bit of a joke
that __do_softirq() moves work to ksoftirqd after 2ms, but block softirq
can take several 100ms in bad cases.

We could give individual softirqs a time budget. If they exceed the
budget they should complete, but reassert themselves. Not sure about
the rest, but that would be pretty simple to implement for block
softirq.
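
A minimal sketch of that for the block case, assuming an invented
BLK_SOFTIRQ_BUDGET_NS knob (illustration only, not a tested patch):

	static void blk_done_softirq(struct softirq_action *h)
	{
		struct list_head *cpu_list, local_list;
		u64 deadline = local_clock() + BLK_SOFTIRQ_BUDGET_NS;

		local_irq_disable();
		cpu_list = this_cpu_ptr(&blk_cpu_done);
		list_replace_init(cpu_list, &local_list);
		local_irq_enable();

		while (!list_empty(&local_list)) {
			struct request *rq;

			/*
			 * Budget exceeded: requeue the remainder and
			 * reassert the softirq so it runs again later.
			 */
			if (local_clock() > deadline) {
				local_irq_disable();
				list_splice(&local_list, cpu_list);
				__raise_softirq_irqoff(BLOCK_SOFTIRQ);
				local_irq_enable();
				break;
			}
			rq = list_entry(local_list.next, struct request, ipi_list);
			list_del_init(&rq->ipi_list);
			rq->q->softirq_done_fn(rq);
		}
	}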

Jörn

--
Happiness isn't having what you want, it's wanting what you have.
-- unknown

2015-07-30 18:55:27

by Mike Galbraith

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu, 2015-07-30 at 09:58 -0700, Jörn Engel wrote:
> On Thu, Jul 30, 2015 at 05:22:55PM +0200, Mike Galbraith wrote:
> >
> > I piddled about with the thought that it might be nice to be able to
> > sprinkle cond_resched() about to cut rt latencies without wrecking
> > normal load throughput, cobbled together a cond_resched_rt().
> >
> > On my little box that was a waste of time, as the biggest hits are block
> > softirq and free_hot_cold_page_list().
>
> Block softirq is one of our problems as well. It is a bit of a joke
> that __do_softirq() moves work to ksoftirqd after 2ms, but block softirq
> can take several 100ms in bad cases.
>
> We could give individual softirqs a time budget. If they exceed the
> budget they should complete, but reassert themselves. Not sure about
> the rest, but that would be pretty simple to implement for block
> softirq.

Yeah, it wants something, not sure what though. Fix up every spot that
hinders rt performance, you'll end up with PREEMPT_RT, and generic
performance falls straight through the floor. Darn.

-Mike

2015-08-06 08:56:05

by Mike Galbraith

Subject: Re: [PATCH] mm: add resched points to remap_pmd_range/ioremap_pmd_range

On Thu, 2015-07-30 at 09:58 -0700, Jörn Engel wrote:
> On Thu, Jul 30, 2015 at 05:22:55PM +0200, Mike Galbraith wrote:
> >
> > I piddled about with the thought that it might be nice to be able to
> > sprinkle cond_resched() about to cut rt latencies without wrecking
> > normal load throughput, cobbled together a cond_resched_rt().
> >
> > On my little box that was a waste of time, as the biggest hits are block
> > softirq and free_hot_cold_page_list().
>
> Block softirq is one of our problems as well. It is a bit of a joke
> that __do_softirq() moves work to ksoftirqd after 2ms, but block softirq
> can take several 100ms in bad cases.

On my little desktop box, one blk_done_softirq() loop iteration can take
up to a few milliseconds, leaving me wondering if breaking that loop
will help a studly box much. IOW, I'd like to know how bad it gets: if
one iteration can be huge, loop breaking there is fairly pointless, and
I can stop fiddling. Do you happen to know the iteration time during a
huge block softirq hit? On my little box, loop break/re-raise and
whatnot improves the general case substantially, but doesn't do much at
all for the worst case.. or rather the next worst case in a list of
unknown length ;-)

-Mike

2015-08-09 09:25:46

by Mike Galbraith

Subject: [hack] sched: create PREEMPT_VOLUNTARY_RT and some RT specific resched points

On Thu, 2015-07-30 at 20:55 +0200, Mike Galbraith wrote:
> On Thu, 2015-07-30 at 09:58 -0700, Jörn Engel wrote:
> > On Thu, Jul 30, 2015 at 05:22:55PM +0200, Mike Galbraith wrote:
> > >
> > > I piddled about with the thought that it might be nice to be able to
> > > sprinkle cond_resched() about to cut rt latencies without wrecking
> > > normal load throughput, cobbled together a cond_resched_rt().
> > >
> > > On my little box that was a waste of time, as the biggest hits are block
> > > softirq and free_hot_cold_page_list().
> >
> > Block softirq is one of our problems as well. It is a bit of a joke
> > that __do_softirq() moves work to ksoftirqd after 2ms, but block softirq
> > can take several 100ms in bad cases.
> >
> > We could give individual softirqs a time budget. If they exceed the
> > budget they should complete, but reassert themselves. Not sure about
> > the rest, but that would be pretty simple to implement for block
> > softirq.
>
> Yeah, it wants something, not sure what though. Fix up every spot that
> hinders rt performance, you'll end up with PREEMPT_RT, and generic
> performance falls straight through the floor. Darn.

So back to that cond_resched_rt() thingy...

The below isn't a cure-all, isn't intended to be one, nor will it win
any beauty contests. Happily, experiments only have to produce interesting
results. This one is simple, works better than I expected it to on my
little desktop box, and hypothetically speaking shouldn't wreck
throughput, so what the heck, let's see if anybody with casual-use RT
latency woes wants to play with it...

Warranty: "You get to keep the pieces."

...after reading the fine print.

Some numbers:

make clean;make -j8;sync;sudo killall cyclictest

master PREEMPT_NONE
cyclictest -Smp99
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 6.76 7.00 3.95 2/353 15702

T: 0 ( 3524) P:99 I:1000 C: 576934 Min: 1 Act: 4 Avg: 5 Max: 1650
T: 1 ( 3525) P:99 I:1500 C: 384683 Min: 1 Act: 1 Avg: 5 Max: 1386
T: 2 ( 3526) P:99 I:2000 C: 288512 Min: 1 Act: 1 Avg: 6 Max: 1463
T: 3 ( 3527) P:99 I:2500 C: 230809 Min: 1 Act: 1 Avg: 6 Max: 1459
T: 4 ( 3528) P:99 I:3000 C: 192340 Min: 1 Act: 1 Avg: 6 Max: 1381
T: 5 ( 3529) P:99 I:3500 C: 164863 Min: 1 Act: 1 Avg: 6 Max: 1970
T: 6 ( 3530) P:99 I:4000 C: 144254 Min: 1 Act: 1 Avg: 6 Max: 1389
T: 7 ( 3531) P:99 I:4500 C: 128226 Min: 1 Act: 1 Avg: 6 Max: 1360

master PREEMPT_VOLUNTARY_RT/COND_RESCHED_RT_ALL (PREEMPT_NONE for normal tasks)
cyclictest -Smp99
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 7.44 7.30 4.07 1/355 15627

T: 0 ( 3458) P:99 I:1000 C: 578181 Min: 1 Act: 1 Avg: 3 Max: 70
T: 1 ( 3459) P:99 I:1500 C: 385453 Min: 1 Act: 2 Avg: 4 Max: 109
T: 2 ( 3460) P:99 I:2000 C: 289089 Min: 1 Act: 1 Avg: 4 Max: 80
T: 3 ( 3461) P:99 I:2500 C: 231271 Min: 1 Act: 2 Avg: 4 Max: 55
T: 4 ( 3462) P:99 I:3000 C: 192725 Min: 1 Act: 8 Avg: 4 Max: 122
T: 5 ( 3463) P:99 I:3500 C: 165193 Min: 1 Act: 2 Avg: 4 Max: 58
T: 6 ( 3464) P:99 I:4000 C: 144543 Min: 1 Act: 1 Avg: 4 Max: 309
T: 7 ( 3465) P:99 I:4500 C: 128483 Min: 1 Act: 2 Avg: 4 Max: 66

master PREEMPT
cyclictest -Smp99
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 6.57 7.13 4.07 2/356 15714

T: 0 ( 3513) P:99 I:1000 C: 585356 Min: 1 Act: 1 Avg: 4 Max: 121
T: 1 ( 3514) P:99 I:1500 C: 390236 Min: 1 Act: 1 Avg: 4 Max: 119
T: 2 ( 3515) P:99 I:2000 C: 292676 Min: 1 Act: 1 Avg: 4 Max: 106
T: 3 ( 3516) P:99 I:2500 C: 234140 Min: 1 Act: 1 Avg: 4 Max: 85
T: 4 ( 3517) P:99 I:3000 C: 195116 Min: 1 Act: 2 Avg: 4 Max: 90
T: 5 ( 3518) P:99 I:3500 C: 167242 Min: 1 Act: 1 Avg: 4 Max: 76
T: 6 ( 3519) P:99 I:4000 C: 146336 Min: 1 Act: 1 Avg: 5 Max: 519
T: 7 ( 3520) P:99 I:4500 C: 130076 Min: 1 Act: 1 Avg: 4 Max: 136

/me adds git pulling repositories to the kbuild load...

master PREEMPT
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 7.99 9.10 6.75 4/358 676

T: 0 (15788) P:99 I:1000 C: 605208 Min: 1 Act: 2 Avg: 4 Max: 603
T: 1 (15789) P:99 I:1500 C: 403464 Min: 1 Act: 3 Avg: 4 Max: 1622
T: 2 (15790) P:99 I:2000 C: 302602 Min: 1 Act: 5 Avg: 4 Max: 1205
T: 3 (15791) P:99 I:2500 C: 242081 Min: 1 Act: 4 Avg: 4 Max: 1432
T: 4 (15792) P:99 I:3000 C: 201734 Min: 1 Act: 3 Avg: 5 Max: 1510
T: 5 (15793) P:99 I:3500 C: 172914 Min: 1 Act: 4 Avg: 4 Max: 75
T: 6 (15794) P:99 I:4000 C: 151299 Min: 1 Act: 4 Avg: 5 Max: 1474
T: 7 (15795) P:99 I:4500 C: 134488 Min: 1 Act: 4 Avg: 5 Max: 92

master PREEMPT_VOLUNTARY_RT/COND_RESCHED_RT_ALL
cyclictest -Smp99
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 9.13 9.56 5.76 2/359 26297

T: 0 ( 3671) P:99 I:1000 C: 788852 Min: 0 Act: 1 Avg: 3 Max: 1417
T: 1 ( 3672) P:99 I:1500 C: 525895 Min: 0 Act: 1 Avg: 3 Max: 2404
T: 2 ( 3673) P:99 I:2000 C: 394425 Min: 1 Act: 1 Avg: 3 Max: 313
T: 3 ( 3674) P:99 I:2500 C: 315540 Min: 0 Act: 1 Avg: 3 Max: 475
T: 4 ( 3675) P:99 I:3000 C: 262949 Min: 0 Act: 1 Avg: 4 Max: 155
T: 5 ( 3676) P:99 I:3500 C: 225385 Min: 0 Act: 2 Avg: 4 Max: 457
T: 6 ( 3677) P:99 I:4000 C: 197211 Min: 0 Act: 2 Avg: 3 Max: 2408
T: 7 ( 3678) P:99 I:4500 C: 175299 Min: 0 Act: 1 Avg: 4 Max: 767

master PREEMPT_NONE
# /dev/cpu_dma_latency set to 0us
policy: fifo: loadavg: 8.48 9.23 7.03 3/383 6748

T: 0 (20952) P:99 I:1000 C: 608365 Min: 0 Act: 2 Avg: 6 Max: 2334
T: 1 (20953) P:99 I:1500 C: 405738 Min: 0 Act: 3 Avg: 6 Max: 1850
T: 2 (20954) P:99 I:2000 C: 304308 Min: 0 Act: 13 Avg: 7 Max: 2137
T: 3 (20955) P:99 I:2500 C: 243446 Min: 0 Act: 4 Avg: 6 Max: 2012
T: 4 (20956) P:99 I:3000 C: 202870 Min: 0 Act: 3 Avg: 6 Max: 2918
T: 5 (20957) P:99 I:3500 C: 173890 Min: 0 Act: 3 Avg: 6 Max: 1754
T: 6 (20958) P:99 I:4000 C: 152153 Min: 1 Act: 4 Avg: 7 Max: 1560
T: 7 (20959) P:99 I:4500 C: 135247 Min: 1 Act: 4 Avg: 6 Max: 2058


sched: create PREEMPT_VOLUNTARY_RT and some RT specific resched points

Steal might_resched() voluntary resched points, and apply them to
PREEMPT_NONE kernels only if an RT task is waiting, thus the name.
Add a few RT specific resched points, and get RT tasks to the CPU a tad
sooner by breaking out of softirq processing loops.

Bend-spindle-mutilate-by: Mike Galbraith <[email protected]>
---
block/blk-iopoll.c | 4 ++-
block/blk-softirq.c | 8 ++++++
drivers/md/dm-bufio.c | 8 ++++++
fs/dcache.c | 4 ++-
include/linux/kernel.h | 22 ++++++++++++++++++-
include/linux/sched.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/Kconfig.preempt | 47 +++++++++++++++++++++++++++++++++++++++--
kernel/rcu/tree.c | 4 +++
kernel/sched/core.c | 19 ++++++++++++++++
kernel/sched/deadline.c | 2 +
kernel/sched/rt.c | 2 +
kernel/sched/sched.h | 15 +++++++++++++
kernel/softirq.c | 38 ++++++++++++++++++++++++++++++++-
kernel/trace/trace.c | 2 -
lib/ioremap.c | 1
mm/memory.c | 15 ++++++++++++-
mm/page_alloc.c | 1
mm/vmalloc.c | 1
net/core/dev.c | 7 ++++++
19 files changed, 246 insertions(+), 9 deletions(-)

--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -79,6 +79,7 @@ static void blk_iopoll_softirq(struct so
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = blk_iopoll_budget;
unsigned long start_time = jiffies;
+ u64 __maybe_unused timeout = 0;

local_irq_disable();

@@ -89,7 +90,8 @@ static void blk_iopoll_softirq(struct so
/*
* If softirq window is exhausted then punt.
*/
- if (budget <= 0 || time_after(jiffies, start_time)) {
+ if (budget <= 0 || time_after(jiffies, start_time) ||
+ _need_resched_rt_delayed(&timeout, 100)) {
rearm = 1;
break;
}
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -21,6 +21,7 @@ static DEFINE_PER_CPU(struct list_head,
static void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
+ u64 __maybe_unused timeout = 0;

local_irq_disable();
cpu_list = this_cpu_ptr(&blk_cpu_done);
@@ -30,6 +31,13 @@ static void blk_done_softirq(struct soft
while (!list_empty(&local_list)) {
struct request *rq;

+ if (_need_resched_rt_delayed(&timeout, 100)) {
+ local_irq_disable();
+ list_splice(&local_list, cpu_list);
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+ break;
+ }
rq = list_entry(local_list.next, struct request, ipi_list);
list_del_init(&rq->ipi_list);
rq->q->softirq_done_fn(rq);
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -188,12 +188,18 @@ static void dm_bufio_unlock(struct dm_bu
/*
* FIXME Move to sched.h?
*/
-#ifdef CONFIG_PREEMPT_VOLUNTARY
+#if defined(CONFIG_PREEMPT_VOLUNTARY)
# define dm_bufio_cond_resched() \
do { \
if (unlikely(need_resched())) \
_cond_resched(); \
} while (0)
+#elif defined(CONFIG_PREEMPT_VOLUNTARY_RT)
+# define dm_bufio_cond_resched() \
+do { \
+ if (unlikely(need_resched())) \
+ _cond_resched_rt(); \
+} while (0)
#else
# define dm_bufio_cond_resched() do { } while (0)
#endif
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -311,7 +311,7 @@ static void dentry_free(struct dentry *d
struct external_name *p = external_name(dentry);
if (likely(atomic_dec_and_test(&p->u.count))) {
call_rcu(&dentry->d_u.d_rcu, __d_free_external);
- return;
+ goto out;
}
}
/* if dentry was never visible to RCU, immediate free is OK */
@@ -319,6 +319,8 @@ static void dentry_free(struct dentry *d
__d_free(&dentry->d_u.d_rcu);
else
call_rcu(&dentry->d_u.d_rcu, __d_free);
+out:
+ cond_resched_rt();
}

/**
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -166,11 +166,28 @@ struct completion;
struct pt_regs;
struct user;

-#ifdef CONFIG_PREEMPT_VOLUNTARY
+/*
+ * PREEMPT_VOLUNTARY receives might_sleep() annotated reschedule points.
+ * PREEMPT_VOLUNTARY_RT receives might_sleep() and might_sleep_rt(), but
+ * both reschedule only when an RT task wants the CPU.
+ * PREEMPT_VOLUNTARY + COND_RESCHED_RT receives normal might_sleep() plus
+ * RT specific might_sleep_rt().
+ */
+#if (defined(CONFIG_PREEMPT_VOLUNTARY) && !defined(CONFIG_COND_RESCHED_RT))
+extern int _cond_resched(void);
+# define might_resched() _cond_resched()
+#elif defined(CONFIG_PREEMPT_VOLUNTARY_RT)
+extern int _cond_resched_rt(void);
+# define might_resched() _cond_resched_rt()
+# define might_resched_rt() _cond_resched_rt()
+#elif defined(CONFIG_COND_RESCHED_RT)
extern int _cond_resched(void);
+extern int _cond_resched_rt(void);
# define might_resched() _cond_resched()
+# define might_resched_rt() _cond_resched_rt()
#else
# define might_resched() do { } while (0)
+# define might_resched_rt() do { } while (0)
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -188,6 +205,8 @@ extern int _cond_resched(void);
*/
# define might_sleep() \
do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
+# define might_sleep_rt() \
+ do { __might_sleep(__FILE__, __LINE__, 0); might_resched_rt(); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
#else
static inline void ___might_sleep(const char *file, int line,
@@ -195,6 +214,7 @@ extern int _cond_resched(void);
static inline void __might_sleep(const char *file, int line,
int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
+# define might_sleep_rt() do { might_resched_rt(); } while (0)
# define sched_annotate_sleep() do { } while (0)
#endif

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3014,6 +3014,61 @@ static __always_inline bool need_resched
return unlikely(tif_need_resched());
}

+#ifdef CONFIG_COND_RESCHED_RT
+DECLARE_PER_CPU(unsigned int, sched_rt_queued);
+extern int _cond_resched_rt(void);
+extern int _cond_resched_softirq_rt(void);
+
+static inline bool sched_rt_active(void)
+{
+ /* Yes, take a racy oportunistic peek */
+ return raw_cpu_read(sched_rt_queued) != 0;
+}
+
+static inline bool _need_resched_rt(void)
+{
+ return need_resched() && sched_rt_active();
+}
+
+static inline bool _need_resched_rt_delayed(u64 *timeout, unsigned int usecs)
+{
+ if (!*timeout) {
+ *timeout = local_clock() + usecs * 1000UL;
+ return false;
+ }
+ return _need_resched_rt() && local_clock() > *timeout;
+}
+
+#ifdef CONFIG_COND_RESCHED_RT_ALL
+/*
+ * These two are for use in sometimes preemptible context,
+ * therefore require and select CONFIG_PREEMPT_COUNT.
+ */
+static inline bool need_resched_rt(void)
+{
+ return _need_resched_rt() && !in_atomic();
+}
+
+static inline int cond_resched_rt(void)
+{
+ return need_resched_rt() && _cond_resched_rt();
+}
+#else /* !CONFIG_COND_RESCHED_RT_ALL */
+static inline bool need_resched_rt(void) { return false; }
+static inline int cond_resched_rt(void) { return 0; }
+#endif /* CONFIG_COND_RESCHED_RT_ALL */
+#else /* !CONFIG_COND_RESCHED_RT */
+static inline bool sched_rt_active(void) { return false; }
+static inline bool _need_resched_rt(void) { return false; }
+static inline bool _need_resched_rt_delayed(u64 *timeout, unsigned int usecs)
+{
+ return false;
+}
+static inline bool need_resched_rt(void) { return false; }
+static inline int _cond_resched_rt(void) { return 0; }
+static inline int cond_resched_rt(void) { return 0; }
+#endif /* CONFIG_COND_RESCHED_RT */
+
/*
* Thread group CPU time accounting.
*/
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,4 +1,3 @@
-
choice
prompt "Preemption Model"
default PREEMPT_NONE
@@ -16,6 +15,22 @@ config PREEMPT_NONE
raw processing power of the kernel, irrespective of scheduling
latencies.

+config PREEMPT_VOLUNTARY_RT
+ bool "Voluntary Kernel Preemption for RT tasks only (Server)"
+ select COND_RESCHED_RT
+ help
+ This option reduces the RT latency of the kernel by adding more
+ "explicit preemption points" to the kernel code. These new
+ preemption points have been selected to reduce the maximum
+ latency of rescheduling, providing faster application reactions,
+ at the cost of slightly lower throughput.
+
+ This allows reaction to realtime events by allowing a
+ low priority process to voluntarily preempt itself even if it
+ is in kernel mode executing a system call. This allows
+ RT applications to run more 'smoothly' even when the system is
+ under load.
+
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
help
@@ -54,5 +69,33 @@ config PREEMPT

endchoice

+if PREEMPT_VOLUNTARY || PREEMPT_VOLUNTARY_RT
+
+menu "Voluntary preemption extensions"
+
+config COND_RESCHED_RT
+ bool "Enable RT specific preemption points"
+ default n
+ help
+ This option further reduces RT scheduling latencies by adding
+ more "explicit preemption points" for RT tasks only.
+
+
+config COND_RESCHED_RT_ALL
+ bool "Enable PREEMPT_COUNT dependent RT preemption points"
+ depends on COND_RESCHED_RT
+ select PREEMPT_COUNT
+ select DEBUG_ATOMIC_SLEEP
+ help
+ This option further reduces RT scheduling latency by adding
+ more "explicit preemption points", in code which may or may
+ not be called in a preemptible context, thus we must enable
+ PREEMPT_COUNT to make such contexts visible. Note that this
+ option adds some overhead to kernel locking primitives.
+
+endmenu
+
+endif
+
config PREEMPT_COUNT
- bool
\ No newline at end of file
+ bool
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2617,6 +2617,7 @@ static void rcu_do_batch(struct rcu_stat
unsigned long flags;
struct rcu_head *next, *list, **tail;
long bl, count, count_lazy;
+ u64 __maybe_unused timeout = 0;
int i;

/* If no callbacks are ready, just return. */
@@ -2648,6 +2649,9 @@ static void rcu_do_batch(struct rcu_stat
/* Invoke callbacks. */
count = count_lazy = 0;
while (list) {
+ /* Budget 100us per flavor and hope for the best */
+ if (_need_resched_rt_delayed(&timeout, 100))
+ break;
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4551,6 +4551,25 @@ int __sched __cond_resched_softirq(void)
}
EXPORT_SYMBOL(__cond_resched_softirq);

+#ifdef CONFIG_COND_RESCHED_RT
+DEFINE_PER_CPU(unsigned int, sched_rt_queued);
+
+int __sched _cond_resched_rt(void)
+{
+ if (!_need_resched_rt() || !should_resched(0))
+ return 0;
+
+ do {
+ preempt_active_enter();
+ __schedule();
+ preempt_active_exit();
+ } while (_need_resched_rt());
+
+ return 1;
+}
+EXPORT_SYMBOL(_cond_resched_rt);
+#endif
+
/**
* yield - yield the current processor to other threads.
*
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -984,12 +984,14 @@ static void enqueue_task_dl(struct rq *r

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+ sched_rt_active_inc();
}

static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
dequeue_dl_entity(&p->dl);
dequeue_pushable_dl_task(rq, p);
+ sched_rt_active_dec();
}

static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1274,6 +1274,7 @@ enqueue_task_rt(struct rq *rq, struct ta

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+ sched_rt_active_inc();
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1284,6 +1285,7 @@ static void dequeue_task_rt(struct rq *r
dequeue_rt_entity(rt_se);

dequeue_pushable_task(rq, p);
+ sched_rt_active_dec();
}

/*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1770,3 +1770,18 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_COND_RESCHED_RT
+static inline void sched_rt_active_inc(void)
+{
+ __this_cpu_inc(sched_rt_queued);
+}
+
+static inline void sched_rt_active_dec(void)
+{
+ __this_cpu_dec(sched_rt_queued);
+}
+#else
+static inline void sched_rt_active_inc(void) { }
+static inline void sched_rt_active_dec(void) { }
+#endif
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -280,6 +280,8 @@ asmlinkage __visible void __do_softirq(v
}
h++;
pending >>= softirq_bit;
+ if (need_resched_rt() && current != this_cpu_ksoftirqd())
+ break;
}

rcu_bh_qs();
@@ -299,6 +301,12 @@ asmlinkage __visible void __do_softirq(v
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+
+ if (need_resched_rt() && current != this_cpu_ksoftirqd()) {
+ local_irq_enable();
+ _cond_resched_rt();
+ local_irq_disable();
+ }
}

asmlinkage __visible void do_softirq(void)
@@ -340,7 +348,7 @@ void irq_enter(void)

static inline void invoke_softirq(void)
{
- if (!force_irqthreads) {
+ if (!force_irqthreads && !sched_rt_active()) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
@@ -485,6 +493,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule_firs
static void tasklet_action(struct softirq_action *a)
{
struct tasklet_struct *list;
+ u64 __maybe_unused timeout = 0;

local_irq_disable();
list = __this_cpu_read(tasklet_vec.head);
@@ -495,6 +504,19 @@ static void tasklet_action(struct softir
while (list) {
struct tasklet_struct *t = list;

+ if (t && _need_resched_rt_delayed(&timeout, 100)) {
+ local_irq_disable();
+ while (list->next)
+ list = list->next;
+ list->next = __this_cpu_read(tasklet_vec.head);
+ __this_cpu_write(tasklet_vec.head, t);
+ if (!__this_cpu_read(tasklet_vec.tail))
+ __this_cpu_write(tasklet_vec.tail, &(list->next));
+ __raise_softirq_irqoff(TASKLET_SOFTIRQ);
+ local_irq_enable();
+ return;
+ }
+
list = list->next;

if (tasklet_trylock(t)) {
@@ -521,6 +543,7 @@ static void tasklet_action(struct softir
static void tasklet_hi_action(struct softirq_action *a)
{
struct tasklet_struct *list;
+ u64 __maybe_unused timeout = 0;

local_irq_disable();
list = __this_cpu_read(tasklet_hi_vec.head);
@@ -531,6 +554,19 @@ static void tasklet_hi_action(struct sof
while (list) {
struct tasklet_struct *t = list;

+ if (t && _need_resched_rt_delayed(&timeout, 100)) {
+ local_irq_disable();
+ while (list->next)
+ list = list->next;
+ list->next = __this_cpu_read(tasklet_hi_vec.head);
+ __this_cpu_write(tasklet_hi_vec.head, t);
+ if (!__this_cpu_read(tasklet_hi_vec.tail))
+ __this_cpu_write(tasklet_hi_vec.tail, &(list->next));
+ __raise_softirq_irqoff(HI_SOFTIRQ);
+ local_irq_enable();
+ return;
+ }
+
list = list->next;

if (tasklet_trylock(t)) {
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2624,7 +2624,7 @@ print_trace_header(struct seq_file *m, s
entries,
total,
buf->cpu,
-#if defined(CONFIG_PREEMPT_NONE)
+#if defined(CONFIG_PREEMPT_NONE) || defined(CONFIG_PREEMPT_VOLUNTARY_RT)
"server",
#elif defined(CONFIG_PREEMPT_VOLUNTARY)
"desktop",
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -90,6 +90,7 @@ static inline int ioremap_pmd_range(pud_

if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
return -ENOMEM;
+ might_sleep_rt();
} while (pmd++, addr = next, addr != end);
return 0;
}
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1075,7 +1075,7 @@ static unsigned long zap_pte_range(struc
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
+ int force_flush = 0, resched_rt = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
@@ -1132,6 +1132,10 @@ static unsigned long zap_pte_range(struc
addr += PAGE_SIZE;
break;
}
+ if (_need_resched_rt()) {
+ resched_rt = 1;
+ break;
+ }
continue;
}
/* If details->check_mapping, we leave swap entries. */
@@ -1178,6 +1182,14 @@ static unsigned long zap_pte_range(struc
goto again;
}

+ if (resched_rt) {
+ resched_rt = 0;
+ might_sleep_rt();
+
+ if (addr != end)
+ goto again;
+ }
+
return addr;
}

@@ -1656,6 +1668,7 @@ static inline int remap_pmd_range(struct
if (remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot))
return -ENOMEM;
+ might_sleep_rt();
} while (pmd++, addr = next, addr != end);
return 0;
}
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1934,6 +1934,7 @@ void free_hot_cold_page_list(struct list
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
+ cond_resched_rt();
}
}

--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -80,6 +80,7 @@ static void vunmap_pmd_range(pud_t *pud,
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
+ might_sleep_rt();
} while (pmd++, addr = next, addr != end);
}

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3549,6 +3549,7 @@ int netif_rx_ni(struct sk_buff *skb)
if (local_softirq_pending())
do_softirq();
preempt_enable();
+ cond_resched_rt();

return err;
}
@@ -4787,6 +4788,7 @@ static void net_rx_action(struct softirq
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
+ u64 __maybe_unused timeout = 0;
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
@@ -4804,6 +4806,11 @@ static void net_rx_action(struct softirq
break;
}

+ if (unlikely(_need_resched_rt_delayed(&timeout, 100))) {
+ sd->time_squeeze++;
+ break;
+ }
+
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);


2015-08-09 10:51:53

by Mike Galbraith

Subject: Re: [hack] sched: create PREEMPT_VOLUNTARY_RT and some RT specific resched points

Damn, the hunk below was supposed to go away before the hack escaped.

The whole thing is just a "could we maybe...", but just in case anybody
plays with it, that hunk proved to be a bad idea, kill it.

> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -280,6 +280,8 @@ asmlinkage __visible void __do_softirq(v
> }
> h++;
> pending >>= softirq_bit;
> + if (need_resched_rt() && current != this_cpu_ksoftirqd())
> + break;