2022-12-12 13:08:46

by David Hildenbrand

[permalink] [raw]
Subject: [PATCH v1] sparc/mm: don't unconditionally set HW writable bit when setting PTE dirty on 64bit

On sparc64, there is no HW modified bit, therefore, SW tracks via a SW
bit if the PTE is dirty via pte_mkdirty(). However, pte_mkdirty()
currently also unconditionally sets the HW writable bit, which is wrong.

pte_mkdirty() is not supposed to make a PTE actually writable, unless the
SW writable bit (pte_write()) indicates that the PTE is not
write-protected. Fortunately, sparc64 also defines a SW writable bit.

For example, this already turned into a problem in the context of
THP splitting as documented in commit 624a2c94f5b7 ("Partly revert "mm/thp:
carry over dirty bit when thp splits on pmd") and might be an issue during
page migration in mm/migrate.c:remove_migration_pte() as well where we:
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);

But more general, anything like:
maybe_mkwrite(pte_mkdirty(pte), vma)
code is broken on sparc64, because it will unconditionally set the HW
writable bit even if the SW writable bit is not set.

Simple reproducer that will result in a writable PTE after ptrace
access, to highlight the problem and as an easy way to verify if it has
been fixed:

--------------------------------------------------------------------------
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/mman.h>

static void signal_handler(int sig)
{
if (sig == SIGSEGV)
printf("[PASS] SIGSEGV generated\n");
else
printf("[FAIL] wrong signal generated\n");
exit(0);
}

int main(void)
{
size_t pagesize = getpagesize();
char data = 1;
off_t offs;
int mem_fd;
char *map;
int ret;

mem_fd = open("/proc/self/mem", O_RDWR);
if (mem_fd < 0) {
fprintf(stderr, "open(/proc/self/mem) failed: %d\n", errno);
return 1;
}

map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1 ,0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed: %d\n", errno);
return 1;
}

printf("original: %x\n", *map);

/* debug access */
offs = lseek(mem_fd, (uintptr_t) map, SEEK_SET);
ret = write(mem_fd, &data, 1);
if (ret != 1) {
fprintf(stderr, "pwrite(/proc/self/mem) failed with %d: %d\n", ret, errno);
return 1;
}
if (*map != data) {
fprintf(stderr, "pwrite(/proc/self/mem) not visible\n");
return 1;
}

printf("ptrace: %x\n", *map);

/* Install signal handler. */
if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
fprintf(stderr, "signal() failed\n");
return 1;
}

/* Ordinary access. */
*map = 2;

printf("access: %x\n", *map);

printf("[FAIL] SIGSEGV not generated\n");

return 0;
}
--------------------------------------------------------------------------

Without this commit (sun4u in QEMU):
# ./reproducer
original: 0
ptrace: 1
access: 2
[FAIL] SIGSEGV not generated

Let's fix this by setting the HW writable bit only if both, the SW dirty
bit and the SW writable bit are set. This matches, for example, how
s390x handles pte_mkwrite() and pte_mkdirty() -- except, that they have
to clear the _PAGE_PROTECT bit.

We have to move pte_dirty() and pte_dirty() up. The code patching
mechanism and handling constants > 22bit is a bit special on sparc64.

With this commit (sun4u in QEMU):
# ./reproducer
original: 0
ptrace: 1
[PASS] SIGSEGV generated

This handling seems to have been in place forever.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: Andrew Morton <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Hev <[email protected]>
Cc: Anatoly Pugachev <[email protected]>
Cc: Raghavendra K T <[email protected]>
Cc: Thorsten Leemhuis <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: "Kirill A. Shutemov" <[email protected]>
Cc: Juergen Gross <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---

Only tested under QEMU with sun4u, as I cannot seem to get sun4v running
in QEMU. Survives a simple debian 10 boot.

This also tackles what's documented in:
https://lkml.kernel.org/r/[email protected]
and once loongarch also has been fixed, we might be able to remove all
that special-casing.

---
arch/sparc/include/asm/pgtable_64.h | 117 ++++++++++++++++------------
1 file changed, 67 insertions(+), 50 deletions(-)

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 3bc9736bddb1..7f2e57747563 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -354,6 +354,42 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
*/
#define pgprot_noncached pgprot_noncached

+static inline unsigned long pte_dirty(pte_t pte)
+{
+ unsigned long mask;
+
+ __asm__ __volatile__(
+ "\n661: mov %1, %0\n"
+ " nop\n"
+ " .section .sun4v_2insn_patch, \"ax\"\n"
+ " .word 661b\n"
+ " sethi %%uhi(%2), %0\n"
+ " sllx %0, 32, %0\n"
+ " .previous\n"
+ : "=r" (mask)
+ : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));
+
+ return (pte_val(pte) & mask);
+}
+
+static inline unsigned long pte_write(pte_t pte)
+{
+ unsigned long mask;
+
+ __asm__ __volatile__(
+ "\n661: mov %1, %0\n"
+ " nop\n"
+ " .section .sun4v_2insn_patch, \"ax\"\n"
+ " .word 661b\n"
+ " sethi %%uhi(%2), %0\n"
+ " sllx %0, 32, %0\n"
+ " .previous\n"
+ : "=r" (mask)
+ : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));
+
+ return (pte_val(pte) & mask);
+}
+
#if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
@@ -415,28 +451,44 @@ static inline bool is_hugetlb_pte(pte_t pte)
}
#endif

-static inline pte_t pte_mkdirty(pte_t pte)
+static inline pte_t __pte_mkhwwrite(pte_t pte)
{
- unsigned long val = pte_val(pte), tmp;
+ unsigned long val = pte_val(pte);

+ /*
+ * Note: we only want to set the HW writable bit if the SW writable bit
+ * and the SW dirty bit are set.
+ */
__asm__ __volatile__(
- "\n661: or %0, %3, %0\n"
+ "\n661: or %0, %2, %0\n"
" nop\n"
- "\n662: nop\n"
+ " .section .sun4v_1insn_patch, \"ax\"\n"
+ " .word 661b\n"
+ " or %0, %3, %0\n"
+ " .previous\n"
+ : "=r" (val)
+ : "0" (val), "i" (_PAGE_W_4U), "i" (_PAGE_W_4V));
+
+ return __pte(val);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ unsigned long val = pte_val(pte), mask;
+
+ __asm__ __volatile__(
+ "\n661: mov %1, %0\n"
" nop\n"
" .section .sun4v_2insn_patch, \"ax\"\n"
" .word 661b\n"
- " sethi %%uhi(%4), %1\n"
- " sllx %1, 32, %1\n"
- " .word 662b\n"
- " or %1, %%lo(%4), %1\n"
- " or %0, %1, %0\n"
+ " sethi %%uhi(%2), %0\n"
+ " sllx %0, 32, %0\n"
" .previous\n"
- : "=r" (val), "=r" (tmp)
- : "0" (val), "i" (_PAGE_MODIFIED_4U | _PAGE_W_4U),
- "i" (_PAGE_MODIFIED_4V | _PAGE_W_4V));
+ : "=r" (mask)
+ : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));

- return __pte(val);
+ pte = __pte(val | mask);
+ return pte_write(pte) ? __pte_mkhwwrite(pte) : pte;
}

static inline pte_t pte_mkclean(pte_t pte)
@@ -478,7 +530,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
: "=r" (mask)
: "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));

- return __pte(val | mask);
+ pte = __pte(val | mask);
+ return pte_dirty(pte) ? __pte_mkhwwrite(pte) : pte;
}

static inline pte_t pte_wrprotect(pte_t pte)
@@ -581,42 +634,6 @@ static inline unsigned long pte_young(pte_t pte)
return (pte_val(pte) & mask);
}

-static inline unsigned long pte_dirty(pte_t pte)
-{
- unsigned long mask;
-
- __asm__ __volatile__(
- "\n661: mov %1, %0\n"
- " nop\n"
- " .section .sun4v_2insn_patch, \"ax\"\n"
- " .word 661b\n"
- " sethi %%uhi(%2), %0\n"
- " sllx %0, 32, %0\n"
- " .previous\n"
- : "=r" (mask)
- : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));
-
- return (pte_val(pte) & mask);
-}
-
-static inline unsigned long pte_write(pte_t pte)
-{
- unsigned long mask;
-
- __asm__ __volatile__(
- "\n661: mov %1, %0\n"
- " nop\n"
- " .section .sun4v_2insn_patch, \"ax\"\n"
- " .word 661b\n"
- " sethi %%uhi(%2), %0\n"
- " sllx %0, 32, %0\n"
- " .previous\n"
- : "=r" (mask)
- : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));
-
- return (pte_val(pte) & mask);
-}
-
static inline unsigned long pte_exec(pte_t pte)
{
unsigned long mask;

base-commit: 830b3c68c1fb1e9176028d02ef86f3cf76aa2476
--
2.38.1


2023-01-31 08:53:42

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1] sparc/mm: don't unconditionally set HW writable bit when setting PTE dirty on 64bit

On 12.12.22 14:02, David Hildenbrand wrote:
> On sparc64, there is no HW modified bit, therefore, SW tracks via a SW
> bit if the PTE is dirty via pte_mkdirty(). However, pte_mkdirty()
> currently also unconditionally sets the HW writable bit, which is wrong.
>
> pte_mkdirty() is not supposed to make a PTE actually writable, unless the
> SW writable bit (pte_write()) indicates that the PTE is not
> write-protected. Fortunately, sparc64 also defines a SW writable bit.
>
> For example, this already turned into a problem in the context of
> THP splitting as documented in commit 624a2c94f5b7 ("Partly revert "mm/thp:
> carry over dirty bit when thp splits on pmd") and might be an issue during
> page migration in mm/migrate.c:remove_migration_pte() as well where we:
> if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
> pte = pte_mkdirty(pte);
>
> But more general, anything like:
> maybe_mkwrite(pte_mkdirty(pte), vma)
> code is broken on sparc64, because it will unconditionally set the HW
> writable bit even if the SW writable bit is not set.
>
> Simple reproducer that will result in a writable PTE after ptrace
> access, to highlight the problem and as an easy way to verify if it has
> been fixed:
>
> --------------------------------------------------------------------------
> #include <fcntl.h>
> #include <signal.h>
> #include <unistd.h>
> #include <string.h>
> #include <errno.h>
> #include <stdlib.h>
> #include <sys/mman.h>
>
> static void signal_handler(int sig)
> {
> if (sig == SIGSEGV)
> printf("[PASS] SIGSEGV generated\n");
> else
> printf("[FAIL] wrong signal generated\n");
> exit(0);
> }
>
> int main(void)
> {
> size_t pagesize = getpagesize();
> char data = 1;
> off_t offs;
> int mem_fd;
> char *map;
> int ret;
>
> mem_fd = open("/proc/self/mem", O_RDWR);
> if (mem_fd < 0) {
> fprintf(stderr, "open(/proc/self/mem) failed: %d\n", errno);
> return 1;
> }
>
> map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1 ,0);
> if (map == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %d\n", errno);
> return 1;
> }
>
> printf("original: %x\n", *map);
>
> /* debug access */
> offs = lseek(mem_fd, (uintptr_t) map, SEEK_SET);
> ret = write(mem_fd, &data, 1);
> if (ret != 1) {
> fprintf(stderr, "pwrite(/proc/self/mem) failed with %d: %d\n", ret, errno);
> return 1;
> }
> if (*map != data) {
> fprintf(stderr, "pwrite(/proc/self/mem) not visible\n");
> return 1;
> }
>
> printf("ptrace: %x\n", *map);
>
> /* Install signal handler. */
> if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
> fprintf(stderr, "signal() failed\n");
> return 1;
> }
>
> /* Ordinary access. */
> *map = 2;
>
> printf("access: %x\n", *map);
>
> printf("[FAIL] SIGSEGV not generated\n");
>
> return 0;
> }
> --------------------------------------------------------------------------
>
> Without this commit (sun4u in QEMU):
> # ./reproducer
> original: 0
> ptrace: 1
> access: 2
> [FAIL] SIGSEGV not generated
>
> Let's fix this by setting the HW writable bit only if both, the SW dirty
> bit and the SW writable bit are set. This matches, for example, how
> s390x handles pte_mkwrite() and pte_mkdirty() -- except, that they have
> to clear the _PAGE_PROTECT bit.
>
> We have to move pte_dirty() and pte_dirty() up. The code patching
> mechanism and handling constants > 22bit is a bit special on sparc64.
>
> With this commit (sun4u in QEMU):
> # ./reproducer
> original: 0
> ptrace: 1
> [PASS] SIGSEGV generated
>
> This handling seems to have been in place forever.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: Andrew Morton <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Hev <[email protected]>
> Cc: Anatoly Pugachev <[email protected]>
> Cc: Raghavendra K T <[email protected]>
> Cc: Thorsten Leemhuis <[email protected]>
> Cc: Mike Kravetz <[email protected]>
> Cc: "Kirill A. Shutemov" <[email protected]>
> Cc: Juergen Gross <[email protected]>
> Signed-off-by: David Hildenbrand <[email protected]>
> ---

Ping

--
Thanks,

David / dhildenb


2023-02-01 00:51:02

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v1] sparc/mm: don't unconditionally set HW writable bit when setting PTE dirty on 64bit

On Tue, Jan 31, 2023 at 09:47:01AM +0100, David Hildenbrand wrote:
> On 12.12.22 14:02, David Hildenbrand wrote:
> > On sparc64, there is no HW modified bit, therefore, SW tracks via a SW
> > bit if the PTE is dirty via pte_mkdirty(). However, pte_mkdirty()
> > currently also unconditionally sets the HW writable bit, which is wrong.
> >
> > pte_mkdirty() is not supposed to make a PTE actually writable, unless the
> > SW writable bit (pte_write()) indicates that the PTE is not
> > write-protected. Fortunately, sparc64 also defines a SW writable bit.
> >
> > For example, this already turned into a problem in the context of
> > THP splitting as documented in commit 624a2c94f5b7 ("Partly revert "mm/thp:
> > carry over dirty bit when thp splits on pmd") and might be an issue during
> > page migration in mm/migrate.c:remove_migration_pte() as well where we:
> > if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
> > pte = pte_mkdirty(pte);
> >
> > But more general, anything like:
> > maybe_mkwrite(pte_mkdirty(pte), vma)
> > code is broken on sparc64, because it will unconditionally set the HW
> > writable bit even if the SW writable bit is not set.
> >
> > Simple reproducer that will result in a writable PTE after ptrace
> > access, to highlight the problem and as an easy way to verify if it has
> > been fixed:
> >
> > --------------------------------------------------------------------------
> > #include <fcntl.h>
> > #include <signal.h>
> > #include <unistd.h>
> > #include <string.h>
> > #include <errno.h>
> > #include <stdlib.h>
> > #include <sys/mman.h>
> >
> > static void signal_handler(int sig)
> > {
> > if (sig == SIGSEGV)
> > printf("[PASS] SIGSEGV generated\n");
> > else
> > printf("[FAIL] wrong signal generated\n");
> > exit(0);
> > }
> >
> > int main(void)
> > {
> > size_t pagesize = getpagesize();
> > char data = 1;
> > off_t offs;
> > int mem_fd;
> > char *map;
> > int ret;
> >
> > mem_fd = open("/proc/self/mem", O_RDWR);
> > if (mem_fd < 0) {
> > fprintf(stderr, "open(/proc/self/mem) failed: %d\n", errno);
> > return 1;
> > }
> >
> > map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1 ,0);
> > if (map == MAP_FAILED) {
> > fprintf(stderr, "mmap() failed: %d\n", errno);
> > return 1;
> > }
> >
> > printf("original: %x\n", *map);
> >
> > /* debug access */
> > offs = lseek(mem_fd, (uintptr_t) map, SEEK_SET);
> > ret = write(mem_fd, &data, 1);
> > if (ret != 1) {
> > fprintf(stderr, "pwrite(/proc/self/mem) failed with %d: %d\n", ret, errno);
> > return 1;
> > }
> > if (*map != data) {
> > fprintf(stderr, "pwrite(/proc/self/mem) not visible\n");
> > return 1;
> > }
> >
> > printf("ptrace: %x\n", *map);
> >
> > /* Install signal handler. */
> > if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
> > fprintf(stderr, "signal() failed\n");
> > return 1;
> > }
> >
> > /* Ordinary access. */
> > *map = 2;
> >
> > printf("access: %x\n", *map);
> >
> > printf("[FAIL] SIGSEGV not generated\n");
> >
> > return 0;
> > }
> > --------------------------------------------------------------------------
> >
> > Without this commit (sun4u in QEMU):
> > # ./reproducer
> > original: 0
> > ptrace: 1
> > access: 2
> > [FAIL] SIGSEGV not generated
> >
> > Let's fix this by setting the HW writable bit only if both, the SW dirty
> > bit and the SW writable bit are set. This matches, for example, how
> > s390x handles pte_mkwrite() and pte_mkdirty() -- except, that they have
> > to clear the _PAGE_PROTECT bit.
> >
> > We have to move pte_dirty() and pte_dirty() up. The code patching
> > mechanism and handling constants > 22bit is a bit special on sparc64.
> >
> > With this commit (sun4u in QEMU):
> > # ./reproducer
> > original: 0
> > ptrace: 1
> > [PASS] SIGSEGV generated
> >
> > This handling seems to have been in place forever.
> >
> > Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> > Cc: Andrew Morton <[email protected]>
> > Cc: "David S. Miller" <[email protected]>
> > Cc: Peter Xu <[email protected]>
> > Cc: Hev <[email protected]>
> > Cc: Anatoly Pugachev <[email protected]>
> > Cc: Raghavendra K T <[email protected]>
> > Cc: Thorsten Leemhuis <[email protected]>
> > Cc: Mike Kravetz <[email protected]>
> > Cc: "Kirill A. Shutemov" <[email protected]>
> > Cc: Juergen Gross <[email protected]>
> > Signed-off-by: David Hildenbrand <[email protected]>
> > ---
>
> Ping

I agree with David that the current sparc64 impl of pte_mkdirty is
suspecious.

What David mentioned on page migration above is correct and has another
report here from Nick recently:

https://lore.kernel.org/all/CADyTPEzsvdRC15+Z5T3oryofwRYqHmHzwqRmJKJoHB3d7Tdayw@mail.gmail.com/

If this patch is hopefully correct (which I cannot tell as I know little on
sparc64) and can be merged, it'll be the cleanest solution, comparing to
what I provided here:

https://lore.kernel.org/all/Y9bvwz4FIOQ+D8c4@x1n/

And I assume it'll also fix things like the reproducer being attached on
wrongly applying write bit with FOLL_FORCE, so it fixes more than that.

I plan to keep posting that fix I referenced above for the breakage because
that'll still be the safest so far, but that can change if someone from
sparc64 can have a look at this and ack it.

Thanks,

--
Peter Xu


2023-02-16 15:37:24

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1] sparc/mm: don't unconditionally set HW writable bit when setting PTE dirty on 64bit

Ping. We are seeing more issues [1] with that handling that require
workarounds.

@Andrew, if David is unavailable for sparc64 activity, can we route this
through your tree?

[1] https://lkml.kernel.org/r/[email protected]


On 12.12.22 14:02, David Hildenbrand wrote:
> On sparc64, there is no HW modified bit, therefore, SW tracks via a SW
> bit if the PTE is dirty via pte_mkdirty(). However, pte_mkdirty()
> currently also unconditionally sets the HW writable bit, which is wrong.
>
> pte_mkdirty() is not supposed to make a PTE actually writable, unless the
> SW writable bit (pte_write()) indicates that the PTE is not
> write-protected. Fortunately, sparc64 also defines a SW writable bit.
>
> For example, this already turned into a problem in the context of
> THP splitting as documented in commit 624a2c94f5b7 ("Partly revert "mm/thp:
> carry over dirty bit when thp splits on pmd") and might be an issue during
> page migration in mm/migrate.c:remove_migration_pte() as well where we:
> if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
> pte = pte_mkdirty(pte);
>
> But more general, anything like:
> maybe_mkwrite(pte_mkdirty(pte), vma)
> code is broken on sparc64, because it will unconditionally set the HW
> writable bit even if the SW writable bit is not set.
>
> Simple reproducer that will result in a writable PTE after ptrace
> access, to highlight the problem and as an easy way to verify if it has
> been fixed:
>
> --------------------------------------------------------------------------
> #include <fcntl.h>
> #include <signal.h>
> #include <unistd.h>
> #include <string.h>
> #include <errno.h>
> #include <stdlib.h>
> #include <sys/mman.h>
>
> static void signal_handler(int sig)
> {
> if (sig == SIGSEGV)
> printf("[PASS] SIGSEGV generated\n");
> else
> printf("[FAIL] wrong signal generated\n");
> exit(0);
> }
>
> int main(void)
> {
> size_t pagesize = getpagesize();
> char data = 1;
> off_t offs;
> int mem_fd;
> char *map;
> int ret;
>
> mem_fd = open("/proc/self/mem", O_RDWR);
> if (mem_fd < 0) {
> fprintf(stderr, "open(/proc/self/mem) failed: %d\n", errno);
> return 1;
> }
>
> map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANON, -1 ,0);
> if (map == MAP_FAILED) {
> fprintf(stderr, "mmap() failed: %d\n", errno);
> return 1;
> }
>
> printf("original: %x\n", *map);
>
> /* debug access */
> offs = lseek(mem_fd, (uintptr_t) map, SEEK_SET);
> ret = write(mem_fd, &data, 1);
> if (ret != 1) {
> fprintf(stderr, "pwrite(/proc/self/mem) failed with %d: %d\n", ret, errno);
> return 1;
> }
> if (*map != data) {
> fprintf(stderr, "pwrite(/proc/self/mem) not visible\n");
> return 1;
> }
>
> printf("ptrace: %x\n", *map);
>
> /* Install signal handler. */
> if (signal(SIGSEGV, signal_handler) == SIG_ERR) {
> fprintf(stderr, "signal() failed\n");
> return 1;
> }
>
> /* Ordinary access. */
> *map = 2;
>
> printf("access: %x\n", *map);
>
> printf("[FAIL] SIGSEGV not generated\n");
>
> return 0;
> }
> --------------------------------------------------------------------------
>
> Without this commit (sun4u in QEMU):
> # ./reproducer
> original: 0
> ptrace: 1
> access: 2
> [FAIL] SIGSEGV not generated
>
> Let's fix this by setting the HW writable bit only if both, the SW dirty
> bit and the SW writable bit are set. This matches, for example, how
> s390x handles pte_mkwrite() and pte_mkdirty() -- except, that they have
> to clear the _PAGE_PROTECT bit.
>
> We have to move pte_dirty() and pte_dirty() up. The code patching
> mechanism and handling constants > 22bit is a bit special on sparc64.
>
> With this commit (sun4u in QEMU):
> # ./reproducer
> original: 0
> ptrace: 1
> [PASS] SIGSEGV generated
>
> This handling seems to have been in place forever.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: Andrew Morton <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Hev <[email protected]>
> Cc: Anatoly Pugachev <[email protected]>
> Cc: Raghavendra K T <[email protected]>
> Cc: Thorsten Leemhuis <[email protected]>
> Cc: Mike Kravetz <[email protected]>
> Cc: "Kirill A. Shutemov" <[email protected]>
> Cc: Juergen Gross <[email protected]>
> Signed-off-by: David Hildenbrand <[email protected]>
> ---
>
> Only tested under QEMU with sun4u, as I cannot seem to get sun4v running
> in QEMU. Survives a simple debian 10 boot.
>
> This also tackles what's documented in:
> https://lkml.kernel.org/r/[email protected]
> and once loongarch also has been fixed, we might be able to remove all
> that special-casing.
>
> ---
> arch/sparc/include/asm/pgtable_64.h | 117 ++++++++++++++++------------
> 1 file changed, 67 insertions(+), 50 deletions(-)
>
> diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> index 3bc9736bddb1..7f2e57747563 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -354,6 +354,42 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
> */
> #define pgprot_noncached pgprot_noncached
>
> +static inline unsigned long pte_dirty(pte_t pte)
> +{
> + unsigned long mask;
> +
> + __asm__ __volatile__(
> + "\n661: mov %1, %0\n"
> + " nop\n"
> + " .section .sun4v_2insn_patch, \"ax\"\n"
> + " .word 661b\n"
> + " sethi %%uhi(%2), %0\n"
> + " sllx %0, 32, %0\n"
> + " .previous\n"
> + : "=r" (mask)
> + : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));
> +
> + return (pte_val(pte) & mask);
> +}
> +
> +static inline unsigned long pte_write(pte_t pte)
> +{
> + unsigned long mask;
> +
> + __asm__ __volatile__(
> + "\n661: mov %1, %0\n"
> + " nop\n"
> + " .section .sun4v_2insn_patch, \"ax\"\n"
> + " .word 661b\n"
> + " sethi %%uhi(%2), %0\n"
> + " sllx %0, 32, %0\n"
> + " .previous\n"
> + : "=r" (mask)
> + : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));
> +
> + return (pte_val(pte) & mask);
> +}
> +
> #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
> pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
> #define arch_make_huge_pte arch_make_huge_pte
> @@ -415,28 +451,44 @@ static inline bool is_hugetlb_pte(pte_t pte)
> }
> #endif
>
> -static inline pte_t pte_mkdirty(pte_t pte)
> +static inline pte_t __pte_mkhwwrite(pte_t pte)
> {
> - unsigned long val = pte_val(pte), tmp;
> + unsigned long val = pte_val(pte);
>
> + /*
> + * Note: we only want to set the HW writable bit if the SW writable bit
> + * and the SW dirty bit are set.
> + */
> __asm__ __volatile__(
> - "\n661: or %0, %3, %0\n"
> + "\n661: or %0, %2, %0\n"
> " nop\n"
> - "\n662: nop\n"
> + " .section .sun4v_1insn_patch, \"ax\"\n"
> + " .word 661b\n"
> + " or %0, %3, %0\n"
> + " .previous\n"
> + : "=r" (val)
> + : "0" (val), "i" (_PAGE_W_4U), "i" (_PAGE_W_4V));
> +
> + return __pte(val);
> +}
> +
> +static inline pte_t pte_mkdirty(pte_t pte)
> +{
> + unsigned long val = pte_val(pte), mask;
> +
> + __asm__ __volatile__(
> + "\n661: mov %1, %0\n"
> " nop\n"
> " .section .sun4v_2insn_patch, \"ax\"\n"
> " .word 661b\n"
> - " sethi %%uhi(%4), %1\n"
> - " sllx %1, 32, %1\n"
> - " .word 662b\n"
> - " or %1, %%lo(%4), %1\n"
> - " or %0, %1, %0\n"
> + " sethi %%uhi(%2), %0\n"
> + " sllx %0, 32, %0\n"
> " .previous\n"
> - : "=r" (val), "=r" (tmp)
> - : "0" (val), "i" (_PAGE_MODIFIED_4U | _PAGE_W_4U),
> - "i" (_PAGE_MODIFIED_4V | _PAGE_W_4V));
> + : "=r" (mask)
> + : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));
>
> - return __pte(val);
> + pte = __pte(val | mask);
> + return pte_write(pte) ? __pte_mkhwwrite(pte) : pte;
> }
>
> static inline pte_t pte_mkclean(pte_t pte)
> @@ -478,7 +530,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
> : "=r" (mask)
> : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));
>
> - return __pte(val | mask);
> + pte = __pte(val | mask);
> + return pte_dirty(pte) ? __pte_mkhwwrite(pte) : pte;
> }
>
> static inline pte_t pte_wrprotect(pte_t pte)
> @@ -581,42 +634,6 @@ static inline unsigned long pte_young(pte_t pte)
> return (pte_val(pte) & mask);
> }
>
> -static inline unsigned long pte_dirty(pte_t pte)
> -{
> - unsigned long mask;
> -
> - __asm__ __volatile__(
> - "\n661: mov %1, %0\n"
> - " nop\n"
> - " .section .sun4v_2insn_patch, \"ax\"\n"
> - " .word 661b\n"
> - " sethi %%uhi(%2), %0\n"
> - " sllx %0, 32, %0\n"
> - " .previous\n"
> - : "=r" (mask)
> - : "i" (_PAGE_MODIFIED_4U), "i" (_PAGE_MODIFIED_4V));
> -
> - return (pte_val(pte) & mask);
> -}
> -
> -static inline unsigned long pte_write(pte_t pte)
> -{
> - unsigned long mask;
> -
> - __asm__ __volatile__(
> - "\n661: mov %1, %0\n"
> - " nop\n"
> - " .section .sun4v_2insn_patch, \"ax\"\n"
> - " .word 661b\n"
> - " sethi %%uhi(%2), %0\n"
> - " sllx %0, 32, %0\n"
> - " .previous\n"
> - : "=r" (mask)
> - : "i" (_PAGE_WRITE_4U), "i" (_PAGE_WRITE_4V));
> -
> - return (pte_val(pte) & mask);
> -}
> -
> static inline unsigned long pte_exec(pte_t pte)
> {
> unsigned long mask;
>
> base-commit: 830b3c68c1fb1e9176028d02ef86f3cf76aa2476

--
Thanks,

David / dhildenb