Minimal functionality providing a write-rare version of
atomic_long_t data.
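A minimal usage sketch (assuming the __wr_after_init attribute from the
prmem patches; the variable and function names below are only
illustrative):

static struct pratomic_long_t events __wr_after_init =
        PRATOMIC_LONG_INIT(0);

void record_event(void)
{
        /* increment through a temporary writable alias of &events */
        pratomic_long_inc(&events);
}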
Signed-off-by: Igor Stoppa <[email protected]>
CC: Will Deacon <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Boqun Feng <[email protected]>
CC: Arnd Bergmann <[email protected]>
CC: [email protected]
CC: [email protected]
---
MAINTAINERS | 1 +
include/linux/pratomic-long.h | 73 +++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+)
create mode 100644 include/linux/pratomic-long.h
diff --git a/MAINTAINERS b/MAINTAINERS
index e7f7cb1682a6..9d72688d00a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9466,6 +9466,7 @@ F: mm/test_pmalloc.c
F: Documentation/core-api/prmem.rst
F: include/linux/prlist.h
F: lib/test_prlist.c
+F: include/linux/pratomic-long.h
MEMORY MANAGEMENT
L: [email protected]
diff --git a/include/linux/pratomic-long.h b/include/linux/pratomic-long.h
new file mode 100644
index 000000000000..8f1408593733
--- /dev/null
+++ b/include/linux/pratomic-long.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Atomic operations for write rare memory */
+#ifndef _LINUX_PRATOMIC_LONG_H
+#define _LINUX_PRATOMIC_LONG_H
+#include <linux/prmem.h>
+#include <linux/compiler.h>
+#include <asm-generic/atomic-long.h>
+
+struct pratomic_long_t {
+ atomic_long_t l __aligned(sizeof(atomic_long_t));
+} __aligned(sizeof(atomic_long_t));
+
+#define PRATOMIC_LONG_INIT(i) { \
+ .l = ATOMIC_LONG_INIT((i)), \
+}
+
+static __always_inline
+bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
+{
+ struct page *page;
+ uintptr_t base;
+ uintptr_t offset;
+ unsigned long flags;
+ size_t size = sizeof(*l);
+ bool is_virt = __is_wr_after_init(l, size);
+
+ if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
+ WR_ERR_RANGE_MSG))
+ return false;
+ local_irq_save(flags);
+ if (is_virt)
+ page = virt_to_page(l);
+ else
+ page = vmalloc_to_page(l);
+ offset = (~PAGE_MASK) & (uintptr_t)l;
+ base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+ if (WARN(!base, WR_ERR_PAGE_MSG)) {
+ local_irq_restore(flags);
+ return false;
+ }
+ if (inc)
+ atomic_long_inc((atomic_long_t *)(base + offset));
+ else
+ atomic_long_dec((atomic_long_t *)(base + offset));
+ vunmap((void *)base);
+ local_irq_restore(flags);
+ return true;
+
+}
+
+/**
+ * pratomic_long_inc - atomic increment of a write-rare long
+ * @l: address of the variable of type struct pratomic_long_t
+ *
+ * Return: true on success, false otherwise
+ */
+static __always_inline bool pratomic_long_inc(struct pratomic_long_t *l)
+{
+ return __pratomic_long_op(true, l);
+}
+
+/**
+ * pratomic_long_dec - atomic decrement of a write-rare long
+ * @l: address of the variable of type struct pratomic_long_t
+ *
+ * Return: true on success, false otherwise
+ */
+static __always_inline bool pratomic_long_dec(struct pratomic_long_t *l)
+{
+ return __pratomic_long_op(false, l);
+}
+
+#endif
--
2.17.1
On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> +static __always_inline
> +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> +{
> + struct page *page;
> + uintptr_t base;
> + uintptr_t offset;
> + unsigned long flags;
> + size_t size = sizeof(*l);
> + bool is_virt = __is_wr_after_init(l, size);
> +
> + if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> + WR_ERR_RANGE_MSG))
> + return false;
> + local_irq_save(flags);
> + if (is_virt)
> + page = virt_to_page(l);
> + else
> + page = vmalloc_to_page(l);
> + offset = (~PAGE_MASK) & (uintptr_t)l;
> + base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> + if (WARN(!base, WR_ERR_PAGE_MSG)) {
> + local_irq_restore(flags);
> + return false;
> + }
> + if (inc)
> + atomic_long_inc((atomic_long_t *)(base + offset));
> + else
> + atomic_long_dec((atomic_long_t *)(base + offset));
> + vunmap((void *)base);
> + local_irq_restore(flags);
> + return true;
> +
> +}
That's just hideously nasty.. and horribly broken.
We're not going to duplicate all these kernel interfaces wrapped in gunk
like that. Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
you've never tested this with debug bits enabled.
On 25/10/2018 01:13, Peter Zijlstra wrote:
> On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
>> +static __always_inline
>> +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
>> +{
>> + struct page *page;
>> + uintptr_t base;
>> + uintptr_t offset;
>> + unsigned long flags;
>> + size_t size = sizeof(*l);
>> + bool is_virt = __is_wr_after_init(l, size);
>> +
>> + if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
>> + WR_ERR_RANGE_MSG))
>> + return false;
>> + local_irq_save(flags);
>> + if (is_virt)
>> + page = virt_to_page(l);
>> + else
>> + page = vmalloc_to_page(l);
>> + offset = (~PAGE_MASK) & (uintptr_t)l;
>> + base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
>> + if (WARN(!base, WR_ERR_PAGE_MSG)) {
>> + local_irq_restore(flags);
>> + return false;
>> + }
>> + if (inc)
>> + atomic_long_inc((atomic_long_t *)(base + offset));
>> + else
>> + atomic_long_dec((atomic_long_t *)(base + offset));
>> + vunmap((void *)base);
>> + local_irq_restore(flags);
>> + return true;
>> +
>> +}
>
> That's just hideously nasty.. and horribly broken.
>
> We're not going to duplicate all these kernel interfaces wrapped in gunk
> like that.
one possibility would be to have macros which use typeof() on the
parameter being passed, to decide what implementation to use: regular or
write-rare
This means that type punning would still be needed, to select the
implementation.
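Just as a sketch of what I mean (macro name made up, untested):

#define any_long_inc(l)                                                \
        __builtin_choose_expr(                                         \
                __builtin_types_compatible_p(typeof(l),                \
                                             struct pratomic_long_t *),\
                pratomic_long_inc((struct pratomic_long_t *)(l)),      \
                atomic_long_inc((atomic_long_t *)(l)))

The casts are the type punning mentioned above.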
Would this be enough? Is there some better way?
> Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
> you've never tested this with debug bits enabled.
I thought I had them. And I _did_ have them enabled, at some point.
But I must have messed up with the configuration and I failed to notice
this.
I can think of a way it might work, albeit it's not going to be very
pretty (rough sketch after the two points below):
* for the vmap(): if I understand correctly, it might sleep while
obtaining memory for creating the mapping. This part could be executed
before disabling interrupts. The rest of the function, instead, would be
executed after interrupts are disabled.
* for vunmap(): after the writing is done, change also the alternate
mapping to read only, then enable interrupts and destroy the alternate
mapping. Making also the secondary mapping read only makes it equally
secure as the primary, which means that it can be visible also with
interrupts enabled.
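The tail of __pratomic_long_op() would then become something like this
(untested, and assuming set_memory_ro() can be applied to the vmap()
alias):

        base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL); /* may sleep */
        if (WARN(!base, WR_ERR_PAGE_MSG))
                return false;
        local_irq_save(flags);
        if (inc)
                atomic_long_inc((atomic_long_t *)(base + offset));
        else
                atomic_long_dec((atomic_long_t *)(base + offset));
        /* make the alias read-only before re-enabling interrupts */
        set_memory_ro(base, 1);
        local_irq_restore(flags);
        vunmap((void *)base);   /* interrupts are enabled again here */
        return true;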
--
igor
On Mon, Oct 29, 2018 at 11:17:14PM +0200, Igor Stoppa wrote:
>
>
> On 25/10/2018 01:13, Peter Zijlstra wrote:
> > On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> > > +static __always_inline
> > > +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> > > +{
> > > + struct page *page;
> > > + uintptr_t base;
> > > + uintptr_t offset;
> > > + unsigned long flags;
> > > + size_t size = sizeof(*l);
> > > + bool is_virt = __is_wr_after_init(l, size);
> > > +
> > > + if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> > > + WR_ERR_RANGE_MSG))
> > > + return false;
> > > + local_irq_save(flags);
> > > + if (is_virt)
> > > + page = virt_to_page(l);
> > > + else
> > > + page = vmalloc_to_page(l);
> > > + offset = (~PAGE_MASK) & (uintptr_t)l;
> > > + base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> > > + if (WARN(!base, WR_ERR_PAGE_MSG)) {
> > > + local_irq_restore(flags);
> > > + return false;
> > > + }
> > > + if (inc)
> > > + atomic_long_inc((atomic_long_t *)(base + offset));
> > > + else
> > > + atomic_long_dec((atomic_long_t *)(base + offset));
> > > + vunmap((void *)base);
> > > + local_irq_restore(flags);
> > > + return true;
> > > +
> > > +}
> >
> > That's just hideously nasty.. and horribly broken.
> >
> > We're not going to duplicate all these kernel interfaces wrapped in gunk
> > like that.
>
> one possibility would be to have macros which use typeof() on the parameter
> being passed, to decide what implementation to use: regular or write-rare
>
> This means that type punning would still be needed, to select the
> implementation.
>
> Would this be enough? Is there some better way?
Like mentioned elsewhere; if you do write_enable() + write_disable()
thingies, it all becomes:
write_enable();
atomic_foo(&bar);
write_disable();
No magic gunk infested duplication at all. Of course, ideally you'd then
teach objtool about this (or a GCC plugin I suppose) to ensure any
enable reached a disable.
The alternative is something like:
#define ALLOW_WRITE(stmt) do { write_enable(); do { stmt; } while (0); write_disable(); } while (0)
which then allows you to write:
ALLOW_WRITE(atomic_foo(&bar));
No duplication.
> > Also, you _cannot_ call vunmap() with IRQs disabled. Clearly
> > you've never tested this with debug bits enabled.
>
> I thought I had them. And I _did_ have them enabled, at some point.
> But I must have messed up with the configuration and I failed to notice
> this.
>
> I can think of a way it might work, albeit it's not going to be very pretty:
>
> * for the vmap(): if I understand correctly, it might sleep while obtaining
> memory for creating the mapping. This part could be executed before
> disabling interrupts. The rest of the function, instead, would be executed
> after interrupts are disabled.
>
> * for vunmap(): after the writing is done, change also the alternate mapping
> to read only, then enable interrupts and destroy the alternate mapping.
> Making also the secondary mapping read only makes it equally secure as the
> primary, which means that it can be visible also with interrupts enabled.
That doesn't work if you wanted to do this write while you already have
IRQs disabled, for example.
On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
> On Mon, Oct 29, 2018 at 11:17:14PM +0200, Igor Stoppa wrote:
> >
> >
> > On 25/10/2018 01:13, Peter Zijlstra wrote:
> > > On Wed, Oct 24, 2018 at 12:35:03AM +0300, Igor Stoppa wrote:
> > > > +static __always_inline
> > > > +bool __pratomic_long_op(bool inc, struct pratomic_long_t *l)
> > > > +{
> > > > + struct page *page;
> > > > + uintptr_t base;
> > > > + uintptr_t offset;
> > > > + unsigned long flags;
> > > > + size_t size = sizeof(*l);
> > > > + bool is_virt = __is_wr_after_init(l, size);
> > > > +
> > > > + if (WARN(!(is_virt || likely(__is_wr_pool(l, size))),
> > > > + WR_ERR_RANGE_MSG))
> > > > + return false;
> > > > + local_irq_save(flags);
> > > > + if (is_virt)
> > > > + page = virt_to_page(l);
> > > > + else
> > > > + page = vmalloc_to_page(l);
> > > > + offset = (~PAGE_MASK) & (uintptr_t)l;
> > > > + base = (uintptr_t)vmap(&page, 1, VM_MAP, PAGE_KERNEL);
> > > > + if (WARN(!base, WR_ERR_PAGE_MSG)) {
> > > > + local_irq_restore(flags);
> > > > + return false;
> > > > + }
> > > > + if (inc)
> > > > + atomic_long_inc((atomic_long_t *)(base + offset));
> > > > + else
> > > > + atomic_long_dec((atomic_long_t *)(base + offset));
> > > > + vunmap((void *)base);
> > > > + local_irq_restore(flags);
> > > > + return true;
> > > > +
> > > > +}
> > >
> > > That's just hideously nasty.. and horribly broken.
> > >
> > > We're not going to duplicate all these kernel interfaces wrapped in gunk
> > > like that.
> >
> > one possibility would be to have macros which use typeof() on the parameter
> > being passed, to decide what implementation to use: regular or write-rare
> >
> > This means that type punning would still be needed, to select the
> > implementation.
> >
> > Would this be enough? Is there some better way?
>
> Like mentioned elsewhere; if you do write_enable() + write_disable()
> thingies, it all becomes:
>
> write_enable();
> atomic_foo(&bar);
> write_disable();
>
> No magic gunk infested duplication at all. Of course, ideally you'd then
> teach objtool about this (or a GCC plugin I suppose) to ensure any
> enable reached a disable.
Isn't the issue here that we don't want to change the page tables for the
mapping of &bar, but instead want to create a temporary writable alias
at a random virtual address?
So you'd want:
wbar = write_enable(&bar);
atomic_foo(wbar);
write_disable(wbar);
which is probably better expressed as a map/unmap API. I suspect this
would also be the only way to do things for cmpxchg() loops, where you
want to create the mapping outside of the loop to minimise your time in
the critical section.
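Something like this, say (wr_map()/wr_unmap() are made-up names for that
map/unmap API):

        long old, new;
        atomic_long_t *wbar;

        wbar = wr_map(&bar);            /* mapping created outside the loop */
        old = atomic_long_read(wbar);
        do {
                new = old + 1;
        } while (!atomic_long_try_cmpxchg(wbar, &old, new));
        wr_unmap(wbar);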
Will
On Tue, Oct 30, 2018 at 04:28:16PM +0000, Will Deacon wrote:
> On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
> > Like mentioned elsewhere; if you do write_enable() + write_disable()
> > thingies, it all becomes:
> >
> > write_enable();
> > atomic_foo(&bar);
> > write_disable();
> >
> > No magic gunk infested duplication at all. Of course, ideally you'd then
> > teach objtool about this (or a GCC plugin I suppose) to ensure any
> > enable reached a disable.
>
> Isn't the issue here that we don't want to change the page tables for the
> mapping of &bar, but instead want to create a temporary writable alias
> at a random virtual address?
>
> So you'd want:
>
> wbar = write_enable(&bar);
> atomic_foo(wbar);
> write_disable(wbar);
>
> which is probably better expressed as a map/unmap API. I suspect this
> would also be the only way to do things for cmpxchg() loops, where you
> want to create the mapping outside of the loop to minimise your time in
> the critical section.
Ah, so I was thinking that the alternative mm would have stuff in the
same location, just RW instead of RO.
But yes, if we, like Andy suggests, use the userspace address range for
the aliases, then we need to do as you suggest.
On Wed, Oct 31, 2018 at 2:10 AM, Peter Zijlstra <[email protected]> wrote:
> On Tue, Oct 30, 2018 at 04:28:16PM +0000, Will Deacon wrote:
>> On Tue, Oct 30, 2018 at 04:58:41PM +0100, Peter Zijlstra wrote:
>> > Like mentioned elsewhere; if you do write_enable() + write_disable()
>> > thingies, it all becomes:
>> >
>> > write_enable();
>> > atomic_foo(&bar);
>> > write_disable();
>> >
>> > No magic gunk infested duplication at all. Of course, ideally you'd then
>> > teach objtool about this (or a GCC plugin I suppose) to ensure any
>> > enable reached a disable.
>>
>> Isn't the issue here that we don't want to change the page tables for the
>> mapping of &bar, but instead want to create a temporary writable alias
>> at a random virtual address?
>>
>> So you'd want:
>>
>> wbar = write_enable(&bar);
>> atomic_foo(wbar);
>> write_disable(wbar);
>>
>> which is probably better expressed as a map/unmap API. I suspect this
>> would also be the only way to do things for cmpxchg() loops, where you
>> want to create the mapping outside of the loop to minimise your time in
>> the critical section.
>
> Ah, so I was thinking that the alternative mm would have stuff in the
> same location, just RW instead of RO.
I was hoping for the same location too. That allows us to use a gcc
plugin to mark, say, function pointer tables as read-only, and
annotate their rare updates with write_rare() without any
recalculation.
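For example (table and helper names made up, just to illustrate the
annotation):

        /* the plugin would mark this table read-only */
        static int (*hooks[8])(void *data);

        void set_hook(int idx, int (*fn)(void *data))
        {
                /* rare update through whatever write_rare() ends up being */
                write_rare(&hooks[idx], fn);
        }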
-Kees
--
Kees Cook