2024-04-19 18:37:27

by Guillaume Morin

[permalink] [raw]
Subject: [RFC][PATCH] uprobe: support for private hugetlb mappings

Both libhugetlbfs and the Intel iodlr code allow remapping .text onto a
private hugetlb mapping. It's also pretty easy to do it manually.
One drawback of using this functionality is the lack of support for
uprobes (note that uprobes ignore shareable VMAs).

This patch adds support for private hugetlb mappings. It does require
exposing some hugetlbfs innards and relies on copy_user_large_folio(),
which is only available when CONFIG_HUGETLBFS is set, so I had to use an
ugly #ifdef.

If there is some interest in applying this patch in some form or
another, I am open to any refactoring suggestions (esp. getting rid of
the #ifdef in uprobes.c). I tried to limit the amount of branching.
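
For context, this is roughly what the manual remap looks like in
userspace (illustrative sketch only, not part of the patch; it assumes
the text range is hugepage-aligned, that enough hugetlb pages are
reserved, and that the caller runs from code outside the range being
replaced):

#define _GNU_SOURCE
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

/* Illustrative helper: copy [text, text + len) into a hugetlb-backed
 * memfd and map it back MAP_PRIVATE over the original range. */
static int remap_text_hugetlb(void *text, size_t len)
{
	int fd = memfd_create("text", MFD_HUGETLB);
	void *staging;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return -1;

	/* hugetlbfs does not implement write(); populate via a mapping */
	staging = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (staging == MAP_FAILED)
		return -1;
	memcpy(staging, text, len);
	munmap(staging, len);

	/* replace the original .text with a private hugetlb mapping */
	if (mmap(text, len, PROT_READ | PROT_EXEC,
		 MAP_PRIVATE | MAP_FIXED, fd, 0) == MAP_FAILED)
		return -1;
	close(fd);
	return 0;
}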

---
fs/hugetlbfs/inode.c | 4 ++
include/linux/hugetlb.h | 12 ++++
kernel/events/uprobes.c | 133 ++++++++++++++++++++++++++++++----------
mm/hugetlb.c | 2 +-
4 files changed, 117 insertions(+), 34 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6502c7e776d1..63bc5d0ddcf7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};

+bool hugetlbfs_mapping(struct address_space *mapping) {
+ return mapping->a_ops == &hugetlbfs_aops;
+}
+
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..49fd47b79bed 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -174,6 +174,8 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud);
+pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable);

struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);

@@ -492,6 +494,13 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

+static inline pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable) {
+ pte_t ret;
+ BUG();
+ return ret;
+}
+
#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
@@ -539,6 +548,8 @@ struct hugetlbfs_sb_info {
umode_t mode;
};

+bool hugetlbfs_mapping(struct address_space *mapping);
+
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -585,6 +596,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
{
return NULL;
}
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e4834d23e1d1..cf4716e73f54 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@

#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
@@ -119,7 +120,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;

if (is_register)
flags |= VM_WRITE;
@@ -159,9 +160,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
int err;
struct mmu_notifier_range range;
+ unsigned long page_size = is_vm_hugetlb_page(vma) ?
+ huge_page_size(hstate_vma(vma)):
+ PAGE_SIZE;

mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + PAGE_SIZE);
+ addr + page_size);

if (new_page) {
new_folio = page_folio(new_page);
@@ -181,26 +185,46 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,

if (new_page) {
folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr);
- folio_add_lru_vma(new_folio, vma);
- } else
+ if (folio_test_hugetlb(new_folio)) {
+ hugetlb_add_new_anon_rmap(new_folio, vma, addr);
+ } else {
+ folio_add_new_anon_rmap(new_folio, vma, addr);
+ folio_add_lru_vma(new_folio, vma);
+ }
+ } else if (!folio_test_hugetlb(old_folio))
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);

- if (!folio_test_anon(old_folio)) {
+ if (!folio_test_anon(old_folio) && !folio_test_hugetlb(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_folio));
inc_mm_counter(mm, MM_ANONPAGES);
}

flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
- ptep_clear_flush(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at_notify(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
+ if (folio_test_hugetlb(old_folio))
+ huge_ptep_clear_flush(vma, addr, pvmw.pte);
+ else
+ ptep_clear_flush(vma, addr, pvmw.pte);
+ if (new_page) {
+ if (folio_test_hugetlb(new_folio)) {
+ pte_t pte = make_huge_pte(vma, new_page,
+ vma->vm_flags & VM_WRITE);
+ mmu_notifier_change_pte(mm, addr, pte);
+ set_huge_pte_at(mm, addr, pvmw.pte, pte,
+ page_size);
+ } else {
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
+ }
+ }

- folio_remove_rmap_pte(old_folio, old_page, vma);
- if (!folio_mapped(old_folio))
- folio_free_swap(old_folio);
+ if (folio_test_hugetlb(old_folio))
+ hugetlb_remove_rmap(old_folio);
+ else {
+ folio_remove_rmap_pte(old_folio, old_page, vma);
+ if (!folio_mapped(old_folio))
+ folio_free_swap(old_folio);
+ }
page_vma_mapped_walk_done(&pvmw);
folio_put(old_folio);

@@ -236,21 +260,25 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}

-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+static void copy_from_page(struct page *page, unsigned long vaddr,
+ void *dst, int len, size_t page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ memcpy(dst, kaddr + (vaddr & ~page_mask), len);
kunmap_atomic(kaddr);
}

-static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+static void copy_to_page(struct page *page, unsigned long vaddr,
+ const void *src, int len, size_t page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ memcpy(kaddr + (vaddr & ~page_mask), src, len);
kunmap_atomic(kaddr);
}

-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr,
+ uprobe_opcode_t *new_opcode,
+ unsigned long page_mask)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -264,7 +292,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE,
+ page_mask);
is_swbp = is_swbp_insn(&old_opcode);

if (is_swbp_insn(new_opcode)) {
@@ -464,7 +493,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
bool orig_page_huge = false;
+ struct hstate *h = NULL;
unsigned int gup_flags = FOLL_FORCE;
+ unsigned long page_mask = PAGE_MASK;

is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
@@ -477,11 +508,18 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
if (IS_ERR(old_page))
return PTR_ERR(old_page);

- ret = verify_opcode(old_page, vaddr, &opcode);
+ if (is_vm_hugetlb_page(vma)) {
+ h = hstate_vma(vma);
+ page_mask = huge_page_mask(h);
+
+ BUG_ON(!PageCompound(old_page));
+ old_page = compound_head(old_page);
+ }
+ ret = verify_opcode(old_page, vaddr, &opcode, page_mask);
if (ret <= 0)
goto put_old;

- if (WARN(!is_register && PageCompound(old_page),
+ if (WARN(!is_register && PageCompound(old_page) && !h,
"uprobe unregister should never work on compound page\n")) {
ret = -EINVAL;
goto put_old;
@@ -505,13 +543,30 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
goto put_old;

ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ if (h) {
+ struct folio *new_folio = alloc_hugetlb_folio(vma, vaddr, 0);
+ if (IS_ERR(new_folio))
+ goto put_old;
+ new_page = &new_folio->page;
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
+ }
if (!new_page)
goto put_old;

__SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (h) {
+#if defined(CONFIG_HUGETLBFS)
+ copy_user_large_folio(page_folio(new_page),
+ page_folio(old_page), vaddr, vma);
+#else
+ BUG();
+#endif
+ } else {
+ copy_highpage(new_page, old_page);
+ }
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ page_mask);

if (!is_register) {
struct page *orig_page;
@@ -519,7 +574,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,

VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+ index = vaddr_to_offset(vma, vaddr & page_mask) >> PAGE_SHIFT;
orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
index);

@@ -530,14 +585,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
put_page(new_page);
new_page = NULL;

- if (PageCompound(orig_page))
+ if (!h && PageCompound(orig_page))
orig_page_huge = true;
}
put_page(orig_page);
}
}

- ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
+ ret = __replace_page(vma, vaddr & page_mask, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
@@ -785,6 +840,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
+ unsigned long mask = PAGE_MASK;
/*
* Ensure that the page that has the original instruction is populated
* and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
@@ -792,12 +848,20 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
*/
if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
- else
+ else if (!is_file_hugepages(filp))
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+ else {
+ struct hstate *h = hstate_file(filp);
+ mask = huge_page_mask(h);
+
+ page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+ WARN_ON_ONCE(!page);
+ }
+
if (IS_ERR(page))
return PTR_ERR(page);

- copy_from_page(page, offset, insn, nbytes);
+ copy_from_page(page, offset, insn, nbytes, mask);
put_page(page);

return 0;
@@ -1142,9 +1206,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (!uc->handler && !uc->ret_handler)
return -EINVAL;

- /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs specific
+ * code
+ */
if (!inode->i_mapping->a_ops->read_folio &&
- !shmem_mapping(inode->i_mapping))
+ !shmem_mapping(inode->i_mapping) &&
+ !hugetlbfs_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
@@ -1665,7 +1732,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len)
{
/* Initialize the slot */
- copy_to_page(page, vaddr, src, len);
+ copy_to_page(page, vaddr, src, len, PAGE_MASK);

/*
* We probably need flush_icache_user_page() but it needs vma.
@@ -2029,7 +2096,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;

- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, PAGE_MASK);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 23ef240ba48a..c8f26c112b81 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5310,7 +5310,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.pagesize = hugetlb_vm_op_pagesize,
};

-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
--
2.39.1


--
Guillaume Morin <[email protected]>


2024-04-22 09:39:50

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 19.04.24 20:37, Guillaume Morin wrote:
> libhugetlbfs, the Intel iodlr code both allow to remap .text onto a
> hugetlb private mapping. It's also pretty easy to do it manually.
> One drawback of using this functionality is the lack of support for
> uprobes (NOTE uprobe ignores shareable vmas)
>
> This patch adds support for private hugetlb mappings. It does require exposing
> some hugetlbfs innards and relies on copy_user_large_folio which is only
> available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef
>
> If there is some interest in applying this patch in some form or
> another, I am open to any refactoring suggestions (esp getting rid the
> #ifdef in uprobes.c) . I tried to limit the
> amount of branching.

All that hugetlb special casing .... oh my. What's the benefit why we
should be interested in making that code less clean -- to phrase it in a
nice way ;) ?

Yes, libhugetlbfs exists. But why do we have to support uprobes with it?
Nobody cared until now, why care now?

--
Cheers,

David / dhildenb


2024-04-22 18:12:24

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

(Dropping Mike Kravetz as CC since he has retired and his email is no
longer valid, adding Muchun since he's the current hugetlb maintainer,
as well as linux-trace-kernel)

On 22 Apr 11:39, David Hildenbrand wrote:
>
> On 19.04.24 20:37, Guillaume Morin wrote:
> > libhugetlbfs, the Intel iodlr code both allow to remap .text onto a
> > hugetlb private mapping. It's also pretty easy to do it manually.
> > One drawback of using this functionality is the lack of support for
> > uprobes (NOTE uprobe ignores shareable vmas)
> >
> > This patch adds support for private hugetlb mappings. It does require exposing
> > some hugetlbfs innards and relies on copy_user_large_folio which is only
> > available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef
> >
> > If there is some interest in applying this patch in some form or
> > another, I am open to any refactoring suggestions (esp getting rid the
> > #ifdef in uprobes.c) . I tried to limit the
> > amount of branching.
>
> All that hugetlb special casing .... oh my. What's the benefit why we should
> be interested in making that code less clean -- to phrase it in a nice way
> ;) ?

I do appreciate the nice phrasing. Believe me, I did try to limit the
special casing to a minimum :-).

Outside of __replace_page, I added only 3-ish branches so I do not think
it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I
had to add calls to retrieve these for the hugetlb vmas.

__replace_page has a lot of special casing. I certainly agree (and
unfortunately for me it's at the beginning of the patch :)). It's doing
something pretty uncommon outside of the mm code so it has to make a
bunch of specific hugetlb calls. I am not quite sure how to improve it
but if you have suggestions, I'd be happy to refactor.

The benefit - to me - is very clear. People do use hugetlb mappings to
run code in production environments. The perf benefits are there for some
workloads. Intel has published a whitepaper about it etc.
Uprobes are a very good tool to do live tracing. If you can restart the
process and reproduce, you should be able to disable hugetlb remapping
but if you need to look at a live process, there are not many options.
Not being able to use uprobes is crippling.

> Yes, libhugetlbfs exists. But why do we have to support uprobes with it?
> Nobody cared until now, why care now?

I think you could ask the same question for every new feature patch :)

Since the removal a few releases ago of the __morecore() hook in glibc,
the main feature of libhugetlbfs is ELF segments remapping. I think
there are definitely a lot of users that simply deal with this
unnecessary limitation.

I am certainly not shoving this patch through anyone's throat if there
is no interest. But we definitely find it a very useful feature ...

Guillaume.

--
Guillaume Morin <[email protected]>

2024-04-22 19:06:06

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 22.04.24 20:11, Guillaume Morin wrote:
> (Dropping Mike Kravetz as CC since he has retired and his email is no
> longer valid, adding Muchun since he's the current hugetlb maintainer,
> as well as linux-trace-kernel)
>
> On 22 Apr 11:39, David Hildenbrand wrote:
>>
>> On 19.04.24 20:37, Guillaume Morin wrote:
>>> libhugetlbfs, the Intel iodlr code both allow to remap .text onto a
>>> hugetlb private mapping. It's also pretty easy to do it manually.
>>> One drawback of using this functionality is the lack of support for
>>> uprobes (NOTE uprobe ignores shareable vmas)
>>>
>>> This patch adds support for private hugetlb mappings. It does require exposing
>>> some hugetlbfs innards and relies on copy_user_large_folio which is only
>>> available when CONFIG_HUGETLBFS is used so I had to use an ugly #ifdef
>>>
>>> If there is some interest in applying this patch in some form or
>>> another, I am open to any refactoring suggestions (esp getting rid the
>>> #ifdef in uprobes.c) . I tried to limit the
>>> amount of branching.
>>
>> All that hugetlb special casing .... oh my. What's the benefit why we should
>> be interested in making that code less clean -- to phrase it in a nice way
>> ;) ?
>
> I do appreciate the nice phrasing. Believe me, I did try to limit the
> special casing to a minimum :-).
>
> Outside of __replace_page, I added only 3-ish branches so I do not think
> it's *too* bad. The uprobe code is using PAGE_{SHIFT,MASK} quite liberally so I
> had to add calls to retrieve these for the hugetlb vmas.
>
> __replace_page has a lot of special casing. I certainly agree (and
> unfortunately for me it's at the beginning of the patch :)). It's doing
> something pretty uncommon outside of the mm code so it has to make a
> bunch of specific hugetlb calls. I am not quite sure how to improve it
> but if you have suggestions, I'd be happy to refactor.

See below.

>
> The benefit - to me - is very clear. People do use hugetlb mappings to
> run code in production environments. The perf benefits are there for some
> workloads. Intel has published a whitepaper about it etc.
> Uprobes are a very good tool to do live tracing. If you can restart the
> process and reproduce, you should be able to disable hugetlb remapping
> but if you need to look at a live process, there are not many options.
> Not being able to use uprobes is crippling.

Please add all that as motivation to the patch description or cover letter.

>
>> Yes, libhugetlbfs exists. But why do we have to support uprobes with it?
>> Nobody cared until now, why care now?
>
> I think you could ask the same question for every new feature patch :)

I have to, because it usually indicates a lack of motivation in the
cover-letter/patch description :P

People will have to maintain that code, and maintaining hugetlb code in
odd places is no fun ...

>
> Since the removal a few releases ago of the __morecore() hook in glibc,
> the main feature of libhugetlbfs is ELF segments remapping. I think
> there are definitely a lot of users that simply deal with this
> unnecessary limitation.
>
> I am certainly not shoving this patch through anyone's throat if there
> is no interest. But we definitely find it a very useful feature ...

Let me try to see if we can get this done cleaner.

One ugly part (in general here) is the custom page replacement in the
registration part.

We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing
pages ourselves (which we likely shouldn't do ...) ... maybe we could
use FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio
populated. (like KSM does nowadays)

Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing,
but using FOLL_WRITE would not work on many file systems. So maybe we
have to trigger an unsharing fault ourselves.

That would do the page replacement for us and we "should" be able to
lookup an anonymous folio that we can then just modify, like ptrace would.

But then, there is also the unregistration part, with weird conditional
page replacement. Zapping the anon page if the content matches the
content of the original page is one thing. But why are we replacing an
existing anonymous page with a new anonymous page when the content from
the original page differs (but matches the one from the just-copied page)?

I'll have to further think about that one. It's all a bit nasty.


One thing to note is that hugetlb folios don't grow on trees. Likely,
many setups *don't* reserve extra hugetlb folios, and you might easily
run out of free hugetlb folios that you can use to break COW here
(replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy
to make register or unregister fail.

--
Cheers,

David / dhildenb


2024-04-22 20:53:37

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 22 Apr 20:59, David Hildenbrand wrote:
> > The benefit - to me - is very clear. People do use hugetlb mappings to
> > run code in production environments. The perf benefits are there for some
> > workloads. Intel has published a whitepaper about it etc.
> > Uprobes are a very good tool to do live tracing. If you can restart the
> > process and reproduce, you should be able to disable hugetlb remapping
> > but if you need to look at a live process, there are not many options.
> > Not being able to use uprobes is crippling.
>
> Please add all that as motivation to the patch description or cover letter.
>
> > > Yes, libhugetlbfs exists. But why do we have to support uprobes with it?
> > > Nobody cared until now, why care now?
> >
> > I think you could ask the same question for every new feature patch :)
>
> I have to, because it usually indicates a lack of motivation in the
> cover-letter/patch description :P

My cover letter was indeed lacking. I will make sure to add this kind of
detail next time.

> > Since the removal a few releases ago of the __morecore() hook in glibc,
> > the main feature of libhugetlbfs is ELF segments remapping. I think
> > there are definitely a lot of users that simply deal with this
> > unnecessary limitation.
> >
> > I am certainly not shoving this patch through anyone's throat if there
> > is no interest. But we definitely find it a very useful feature ...
>
> Let me try to see if we can get this done cleaner.
>
> One ugly part (in general here) is the custom page replacement in the
> registration part.
>
> We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages
> ourselves (which we likely shouldn't do ...) ... maybe we could use
> FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio
> populated. (like KSM does nowadays)
>
> Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but
> using FOLL_WRITE would not work on many file systems. So maybe we have to
> trigger an unsharing fault ourselves.
>
> That would do the page replacement for us and we "should" be able to lookup
> an anonymous folio that we can then just modify, like ptrace would.
>
> But then, there is also unregistration part, with weird conditional page
> replacement. Zapping the anon page if the content matches the content of the
> original page is one thing. But why are we placing an existing anonymous
> page by a new anonymous page when the content from the original page differs
> (but matches the one from the just copied page?)?
>
> I'll have to further think about that one. It's all a bit nasty.

Sounds good to me. I am willing to help with the code once you have a
plan, or with testing, as you see fit. Let me know.

> One thing to note is that hugetlb folios don't grow on trees. Likely, Many
> setups *don't* reserve extra hugetlb folios and you might just easily be
> running out of free hugetlb folios that you can use to break COW here
> (replace a file hugetlb by a fresh anon hugetlb page). Likely it's easy to
> make register or unregister fail.

Agreed.

--
Guillaume Morin <[email protected]>

2024-04-24 20:31:53

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 22.04.24 22:53, Guillaume Morin wrote:
> On 22 Apr 20:59, David Hildenbrand wrote:
>>> The benefit - to me - is very clear. People do use hugetlb mappings to
>>> run code in production environments. The perf benefits are there for some
>>> workloads. Intel has published a whitepaper about it etc.
>>> Uprobes are a very good tool to do live tracing. If you can restart the
>>> process and reproduce, you should be able to disable hugetlb remapping
>>> but if you need to look at a live process, there are not many options.
>>> Not being able to use uprobes is crippling.
>>
>> Please add all that as motivation to the patch description or cover letter.
>>
>>>> Yes, libhugetlbfs exists. But why do we have to support uprobes with it?
>>>> Nobody cared until now, why care now?
>>>
>>> I think you could ask the same question for every new feature patch :)
>>
>> I have to, because it usually indicates a lack of motivation in the
>> cover-letter/patch description :P
>
> My cover letter was indeed lacking. I will make sure to add this kind of
> details next time.
>
>>> Since the removal a few releases ago of the __morecore() hook in glibc,
>>> the main feature of libhugetlbfs is ELF segments remapping. I think
>>> there are definitely a lot of users that simply deal with this
>>> unnecessary limitation.
>>>
>>> I am certainly not shoving this patch through anyone's throat if there
>>> is no interest. But we definitely find it a very useful feature ...
>>
>> Let me try to see if we can get this done cleaner.
>>
>> One ugly part (in general here) is the custom page replacement in the
>> registration part.
>>
>> We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages
>> ourselves (which we likely shouldn't do ...) ... maybe we could use
>> FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio
>> populated. (like KSM does nowadays)
>>
>> Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but
>> using FOLL_WRITE would not work on many file systems. So maybe we have to
>> trigger an unsharing fault ourselves.

^ realizing that we already use FOLL_FORCE, so we can just use
FOLL_WRITE to break COW.
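
Roughly, on the registration side (inside uprobe_write_opcode()),
something like the following -- sketch only, written against current
mainline helpers, error handling trimmed; for hugetlb it would
additionally need the FOLL_FORCE support discussed below:

	struct vm_area_struct *vma;
	struct page *page;

	/*
	 * Sketch: break COW the way ptrace does.  A FOLL_WRITE|FOLL_FORCE
	 * fault on the MAP_PRIVATE mapping leaves an exclusive anonymous
	 * copy mapped, which we can then modify in place instead of doing
	 * the page replacement manually.
	 */
	page = get_user_page_vma_remote(mm, vaddr, FOLL_WRITE | FOLL_FORCE,
					&vma);
	if (IS_ERR(page))
		return PTR_ERR(page);

	WARN_ON_ONCE(!PageAnon(page));
	/* ... write the breakpoint into the anon page ... */
	put_page(page);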

>>
>> That would do the page replacement for us and we "should" be able to lookup
>> an anonymous folio that we can then just modify, like ptrace would.
>>
>> But then, there is also unregistration part, with weird conditional page
>> replacement. Zapping the anon page if the content matches the content of the
>> original page is one thing. But why are we placing an existing anonymous
>> page by a new anonymous page when the content from the original page differs
>> (but matches the one from the just copied page?)?
>>
>> I'll have to further think about that one. It's all a bit nasty.
>
> Sounds good to me. I am willing to help with the code when you have a
> plan or testing as you see fit. Let me know.

I'm hacking on a redesign that removes the manual COW breaking logic and
*might* make it easier to integrate hugetlb. (very likely, but until I
have the redesign running I cannot promise anything :) )

I'll let you know once I have something ready so you could integrate the
hugetlb portion.

--
Cheers,

David / dhildenb


2024-04-24 20:45:04

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 24 Apr 22:09, David Hildenbrand wrote:
> > > Let me try to see if we can get this done cleaner.
> > >
> > > One ugly part (in general here) is the custom page replacement in the
> > > registration part.
> > >
> > > We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages
> > > ourselves (which we likely shouldn't do ...) ... maybe we could use
> > > FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio
> > > populated. (like KSM does nowadays)
> > >
> > > Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but
> > > using FOLL_WRITE would not work on many file systems. So maybe we have to
> > > trigger an unsharing fault ourselves.
>
> ^ realizing that we already use FOLL_FORCE, so we can just use FOLL_WRITE to
> break COW.

It was never clear to me why uprobes was not doing FOLL_WRITE in the
first place, I must say.

One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
hugetlb mappings. However this was also on my TODO and I have a draft
patch that implements it.

>
> > >
> > > That would do the page replacement for us and we "should" be able to lookup
> > > an anonymous folio that we can then just modify, like ptrace would.
> > >
> > > But then, there is also unregistration part, with weird conditional page
> > > replacement. Zapping the anon page if the content matches the content of the
> > > original page is one thing. But why are we placing an existing anonymous
> > > page by a new anonymous page when the content from the original page differs
> > > (but matches the one from the just copied page?)?
> > >
> > > I'll have to further think about that one. It's all a bit nasty.
> >
> > Sounds good to me. I am willing to help with the code when you have a
> > plan or testing as you see fit. Let me know.
>
> I'm hacking on a redesign that removes the manual COW breaking logic and
> *might* make it easier to integrate hugetlb. (very likely, but until I have
> the redesign running I cannot promise anything :) )
>
> I'll let you know once I have something ready so you could integrate the
> hugetlb portion.

Sounds good.

--
Guillaume Morin <[email protected]>

2024-04-24 21:00:26

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 24.04.24 22:44, Guillaume Morin wrote:
> On 24 Apr 22:09, David Hildenbrand wrote:
>>>> Let me try to see if we can get this done cleaner.
>>>>
>>>> One ugly part (in general here) is the custom page replacement in the
>>>> registration part.
>>>>
>>>> We are guaranteed to have a MAP_PRIVATE mapping. Instead of replacing pages
>>>> ourselves (which we likely shouldn't do ...) ... maybe we could use
>>>> FAULT_FLAG_UNSHARE faults such that we will get an anonymous folio
>>>> populated. (like KSM does nowadays)
>>>>
>>>> Punching FOLL_PIN|FOLL_LONGTERM into GUP would achieve the same thing, but
>>>> using FOLL_WRITE would not work on many file systems. So maybe we have to
>>>> trigger an unsharing fault ourselves.
>>
>> ^ realizing that we already use FOLL_FORCE, so we can just use FOLL_WRITE to
>> break COW.
>
> It was never clear to me why uprobes was not doing FOLL_WRITE in the
> first place, I must say.

It's quite dated code ...

The use of FOLL_FORCE really is ugly here. When registering, we require
VM_WRITE but ... when unregistering, we don't ...

>
> One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
> hugetlb mappings. However this was also on my TODO and I have a draft
> patch that implements it.

Yes, I documented it back then and added sanity checks in GUP code to
fence it off. Shouldn't be too hard to implement (famous last words) and
would be the cleaner thing to use here once I manage to switch over to
FOLL_WRITE|FOLL_FORCE to break COW.

--
Cheers,

David / dhildenb


2024-04-25 15:19:45

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 24 Apr 23:00, David Hildenbrand wrote:
> > One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
> > hugetlb mappings. However this was also on my TODO and I have a draft
> > patch that implements it.
>
> Yes, I documented it back then and added sanity checks in GUP code to fence
> it off. Shouldn't be too hard to implement (famous last words) and would be
> the cleaner thing to use here once I manage to switch over to
> FOLL_WRITE|FOLL_FORCE to break COW.

Yes, my patch seems to be working. The hugetlb code is pretty simple.
And it allows ptrace and the proc pid mem file to work on the executable
private hugetlb mappings.

There is one thing I am unclear about though. hugetlb enforces that
huge_pte_write() is true on FOLL_WRITE in both the fault and
follow_page_mask paths. I am not sure if we can simply assume in the
hugetlb code that if the pte is not writable and this is a write fault
then we're in the FOLL_FORCE|FOLL_WRITE case. Or do we want to keep the
checks and simply not enforce them for FOLL_FORCE|FOLL_WRITE?

The latter is more complicated in the fault path because there is no
FAULT_FLAG_FORCE flag.

--
Guillaume Morin <[email protected]>

2024-04-25 15:42:38

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 25.04.24 17:19, Guillaume Morin wrote:
> On 24 Apr 23:00, David Hildenbrand wrote:
>>> One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
>>> hugetlb mappings. However this was also on my TODO and I have a draft
>>> patch that implements it.
>>
>> Yes, I documented it back then and added sanity checks in GUP code to fence
>> it off. Shouldn't be too hard to implement (famous last words) and would be
>> the cleaner thing to use here once I manage to switch over to
>> FOLL_WRITE|FOLL_FORCE to break COW.
>
> Yes, my patch seems to be working. The hugetlb code is pretty simple.
> And it allows ptrace and the proc pid mem file to work on the executable
> private hugetlb mappings.
>
> There is one thing I am unclear about though. hugetlb enforces that
> huge_pte_write() is true on FOLL_WRITE in both the fault and
> follow_page_mask paths. I am not sure if we can simply assume in the
> hugetlb code that if the pte is not writable and this is a write fault
> then we're in the FOLL_FORCE|FOLL_WRITE case. Or do we want to keep the
> checks simply not enforce it for FOLL_FORCE|FOLL_WRITE?
>
> The latter is more complicated in the fault path because there is no
> FAULT_FLAG_FORCE flag.
>

handle_mm_fault()->sanitize_fault_flags() makes sure that we'll only
proceed with a fault either if
* we have VM_WRITE set
* we are in a COW mapping (MAP_PRIVATE with at least VM_MAYWRITE)

Once you see FAULT_FLAG_WRITE and you do have VM_WRITE, you don't care
about FOLL_FORCE, it's simply a write fault.

Once you see FAULT_FLAG_WRITE and you *don't* have VM_WRITE, you must
have VM_MAYWRITE and are essentially in FOLL_FORCE.

In a VMA without VM_WRITE, you must never map a PTE writable. In
ordinary COW code, that's done in wp_page_copy(), where we *always* use
maybe_mkwrite(), to do exactly what a write fault would do, but without
mapping the PTE writable.

That's what the whole can_follow_write_pmd()/can_follow_write_pte() is
about: writing to PTEs that are not writable.

You'll have to follow the exact same model in hugetlb
(can_follow_write_pmd(), hugetlb_maybe_mkwrite(), ...).
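
For the maybe_mkwrite() part, something as simple as this should do
(sketch only, name and placement to taste):

/* sketch: hugetlb counterpart of maybe_mkwrite(); only make the PTE
 * writable if the VMA actually has VM_WRITE */
static inline pte_t hugetlb_maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_WRITE)
		pte = huge_pte_mkwrite(pte);

	return pte;
}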

--
Cheers,

David / dhildenb


2024-04-25 19:56:18

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 25.04.24 17:19, Guillaume Morin wrote:
> On 24 Apr 23:00, David Hildenbrand wrote:
>>> One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
>>> hugetlb mappings. However this was also on my TODO and I have a draft
>>> patch that implements it.
>>
>> Yes, I documented it back then and added sanity checks in GUP code to fence
>> it off. Shouldn't be too hard to implement (famous last words) and would be
>> the cleaner thing to use here once I manage to switch over to
>> FOLL_WRITE|FOLL_FORCE to break COW.
>
> Yes, my patch seems to be working. The hugetlb code is pretty simple.
> And it allows ptrace and the proc pid mem file to work on the executable
> private hugetlb mappings.
>
> There is one thing I am unclear about though. hugetlb enforces that
> huge_pte_write() is true on FOLL_WRITE in both the fault and
> follow_page_mask paths. I am not sure if we can simply assume in the
> hugetlb code that if the pte is not writable and this is a write fault
> then we're in the FOLL_FORCE|FOLL_WRITE case. Or do we want to keep the
> checks simply not enforce it for FOLL_FORCE|FOLL_WRITE?
>
> The latter is more complicated in the fault path because there is no
> FAULT_FLAG_FORCE flag.
>

I just pushed something to
https://github.com/davidhildenbrand/linux/tree/uprobes_cow

Only very lightly tested so far. Expect the worst :)

I still detest having the zapping logic there, but to get it all right I
don't see a clean way around that.


For hugetlb, we'd primarily have to implement the
mm_walk_ops->hugetlb_entry() callback (well, and FOLL_FORCE).

Likely vaddr and PAGE_SIZE in uprobe_write_opcode() would have to be
expanded to cover the full hugetlb page.
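
The callback would have roughly this shape (sketch only, names made up;
the body would mirror whatever the small-page handler in the branch
does, just operating on the hmask-aligned huge page):

static int __write_opcode_hugetlb(pte_t *ptep, unsigned long hmask,
				  unsigned long addr, unsigned long next,
				  struct mm_walk *walk)
{
	pte_t pte = huge_ptep_get(ptep);

	if (!pte_present(pte))
		return -EAGAIN;	/* fault it in and retry */

	/*
	 * Same steps as the small-page case: verify we have an exclusive
	 * anonymous folio, kmap it and poke the opcode at the offset
	 * within the huge page.
	 */
	return 0;
}

static const struct mm_walk_ops hugetlb_write_opcode_ops = {
	.hugetlb_entry	= __write_opcode_hugetlb,
};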

--
Cheers,

David / dhildenb


2024-04-26 00:10:19

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 25 Apr 21:56, David Hildenbrand wrote:
>
> On 25.04.24 17:19, Guillaume Morin wrote:
> > On 24 Apr 23:00, David Hildenbrand wrote:
> > > > One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
> > > > hugetlb mappings. However this was also on my TODO and I have a draft
> > > > patch that implements it.
> > >
> > > Yes, I documented it back then and added sanity checks in GUP code to fence
> > > it off. Shouldn't be too hard to implement (famous last words) and would be
> > > the cleaner thing to use here once I manage to switch over to
> > > FOLL_WRITE|FOLL_FORCE to break COW.
> >
> > Yes, my patch seems to be working. The hugetlb code is pretty simple.
> > And it allows ptrace and the proc pid mem file to work on the executable
> > private hugetlb mappings.
> >
> > There is one thing I am unclear about though. hugetlb enforces that
> > huge_pte_write() is true on FOLL_WRITE in both the fault and
> > follow_page_mask paths. I am not sure if we can simply assume in the
> > hugetlb code that if the pte is not writable and this is a write fault
> > then we're in the FOLL_FORCE|FOLL_WRITE case. Or do we want to keep the
> > checks simply not enforce it for FOLL_FORCE|FOLL_WRITE?
> >
> > The latter is more complicated in the fault path because there is no
> > FAULT_FLAG_FORCE flag.
> >
>
> I just pushed something to
> https://github.com/davidhildenbrand/linux/tree/uprobes_cow
>
> Only very lightly tested so far. Expect the worst :)


I'll try it out and send you the hugetlb bits

>
> I still detest having the zapping logic there, but to get it all right I
> don't see a clean way around that.
>
>
> For hugetlb, we'd primarily have to implement the
> mm_walk_ops->hugetlb_entry() callback (well, and FOLL_FORCE).

For FOLL_FORCE, here is my draft. Let me know if this is what you had in
mind.


diff --git a/mm/gup.c b/mm/gup.c
index 1611e73b1121..ac60e0ae64e8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1056,9 +1056,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
- /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
- if (is_vm_hugetlb_page(vma))
- return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3548eae42cf9..73f86eddf888 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5941,7 +5941,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct folio *pagecache_folio, spinlock_t *ptl,
struct vm_fault *vmf)
{
- const bool unshare = flags & FAULT_FLAG_UNSHARE;
+ const bool make_writable = !(flags & FAULT_FLAG_UNSHARE) &&
+ (vma->vm_flags & VM_WRITE);
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
@@ -5959,16 +5960,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
* can trigger this, because hugetlb_fault() will always resolve
* uffd-wp bit first.
*/
- if (!unshare && huge_pte_uffd_wp(pte))
+ if (make_writable && huge_pte_uffd_wp(pte))
return 0;

- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, haddr, ptep);
@@ -5989,7 +5983,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
folio_move_anon_rmap(old_folio, vma);
SetPageAnonExclusive(&old_folio->page);
}
- if (likely(!unshare))
+ if (likely(make_writable))
set_huge_ptep_writable(vma, haddr, ptep);

delayacct_wpcopy_end();
@@ -6094,7 +6088,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(ptl);
ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ pte_t newpte = make_huge_pte(vma, &new_folio->page,
+ make_writable);

/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -6883,6 +6878,17 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
}
#endif /* CONFIG_USERFAULTFD */

+static bool is_force_follow(struct vm_area_struct* vma, unsigned int flags,
+ struct page* page) {
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ return page && PageAnon(page) && page_mapcount(page) == 1;
+}
+
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned int *page_mask)
@@ -6907,11 +6913,11 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,

if (!huge_pte_write(entry)) {
if (flags & FOLL_WRITE) {
- page = NULL;
- goto out;
- }
-
- if (gup_must_unshare(vma, flags, page)) {
+ if (!is_force_follow(vma, flags, page)) {
+ page = NULL;
+ goto out;
+ }
+ } else if (gup_must_unshare(vma, flags, page)) {
/* Tell the caller to do unsharing */
page = ERR_PTR(-EMLINK);
goto out;

>
> Likely vaddr and PAGE_SIZE in uprobe_write_opcode() would have to be
> expanded to cover the full hugetlb page.
>
> --
> Cheers,
>
> David / dhildenb
>

--
Guillaume Morin <[email protected]>

2024-04-26 07:19:28

by David Hildenbrand

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 26.04.24 02:09, Guillaume Morin wrote:
> On 25 Apr 21:56, David Hildenbrand wrote:
>>
>> On 25.04.24 17:19, Guillaume Morin wrote:
>>> On 24 Apr 23:00, David Hildenbrand wrote:
>>>>> One issue here is that FOLL_FORCE|FOLL_WRITE is not implemented for
>>>>> hugetlb mappings. However this was also on my TODO and I have a draft
>>>>> patch that implements it.
>>>>
>>>> Yes, I documented it back then and added sanity checks in GUP code to fence
>>>> it off. Shouldn't be too hard to implement (famous last words) and would be
>>>> the cleaner thing to use here once I manage to switch over to
>>>> FOLL_WRITE|FOLL_FORCE to break COW.
>>>
>>> Yes, my patch seems to be working. The hugetlb code is pretty simple.
>>> And it allows ptrace and the proc pid mem file to work on the executable
>>> private hugetlb mappings.
>>>
>>> There is one thing I am unclear about though. hugetlb enforces that
>>> huge_pte_write() is true on FOLL_WRITE in both the fault and
>>> follow_page_mask paths. I am not sure if we can simply assume in the
>>> hugetlb code that if the pte is not writable and this is a write fault
>>> then we're in the FOLL_FORCE|FOLL_WRITE case. Or do we want to keep the
>>> checks simply not enforce it for FOLL_FORCE|FOLL_WRITE?
>>>
>>> The latter is more complicated in the fault path because there is no
>>> FAULT_FLAG_FORCE flag.
>>>
>>
>> I just pushed something to
>> https://github.com/davidhildenbrand/linux/tree/uprobes_cow
>>
>> Only very lightly tested so far. Expect the worst :)
>
>
> I'll try it out and send you the hugetlb bits
>
>>
>> I still detest having the zapping logic there, but to get it all right I
>> don't see a clean way around that.
>>
>>
>> For hugetlb, we'd primarily have to implement the
>> mm_walk_ops->hugetlb_entry() callback (well, and FOLL_FORCE).
>
> For FOLL_FORCE, heer is my draft. Let me know if this is what you had in
> mind.
>
>
> diff --git a/mm/gup.c b/mm/gup.c
> index 1611e73b1121..ac60e0ae64e8 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1056,9 +1056,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
> if (!(gup_flags & FOLL_FORCE))
> return -EFAULT;
> - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
> - if (is_vm_hugetlb_page(vma))
> - return -EFAULT;
> /*
> * We used to let the write,force case do COW in a
> * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3548eae42cf9..73f86eddf888 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5941,7 +5941,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> struct folio *pagecache_folio, spinlock_t *ptl,
> struct vm_fault *vmf)
> {
> - const bool unshare = flags & FAULT_FLAG_UNSHARE;
> + const bool make_writable = !(flags & FAULT_FLAG_UNSHARE) &&
> + (vma->vm_flags & VM_WRITE);
> pte_t pte = huge_ptep_get(ptep);
> struct hstate *h = hstate_vma(vma);
> struct folio *old_folio;
> @@ -5959,16 +5960,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> * can trigger this, because hugetlb_fault() will always resolve
> * uffd-wp bit first.
> */
> - if (!unshare && huge_pte_uffd_wp(pte))
> + if (make_writable && huge_pte_uffd_wp(pte))
> return 0;
>
> - /*
> - * hugetlb does not support FOLL_FORCE-style write faults that keep the
> - * PTE mapped R/O such as maybe_mkwrite() would do.
> - */
> - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
> - return VM_FAULT_SIGSEGV;
> -
> /* Let's take out MAP_SHARED mappings first. */
> if (vma->vm_flags & VM_MAYSHARE) {
> set_huge_ptep_writable(vma, haddr, ptep);
> @@ -5989,7 +5983,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> folio_move_anon_rmap(old_folio, vma);
> SetPageAnonExclusive(&old_folio->page);
> }
> - if (likely(!unshare))
> + if (likely(make_writable))
> set_huge_ptep_writable(vma, haddr, ptep);

Maybe we want to refactor that similarly into a
set_huge_ptep_maybe_writable, and handle the VM_WRITE check internally.

Then, here you'd do

if (unshare)
set_huge_ptep(vma, haddr, ptep);
else
set_huge_ptep_maybe_writable(vma, haddr, ptep);

Something like that.



> /* Break COW or unshare */
> huge_ptep_clear_flush(vma, haddr, ptep);
> @@ -6883,6 +6878,17 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
> }
> #endif /* CONFIG_USERFAULTFD */
>
> +static bool is_force_follow(struct vm_area_struct* vma, unsigned int flags,
> + struct page* page) {
> + if (vma->vm_flags & VM_WRITE)
> + return false;
> +
> + if (!(flags & FOLL_FORCE))
> + return false;
> +
> + return page && PageAnon(page) && page_mapcount(page) == 1;
> +}

A couple of points:

a) Don't use page_mapcount(). Either folio_mapcount(), but likely you
want to check PageAnonExclusive.

b) If you're not following the can_follow_write_pte/_pmd model, you are
doing something wrong :)

c) The code was heavily changed in mm/mm-unstable. It was merged with
the common code.

Likely, in mm/mm-unstable, the existing can_follow_write_pte and
can_follow_write_pmd checks will already cover what you want in most cases.

We'd need a can_follow_write_pud() to cover follow_huge_pud() and
(unfortunately) something to handle follow_hugepd() as well similarly.

Copy-pasting what we do in can_follow_write_pte() and adjusting for
different PTE types is the right thing to do. Maybe now it's time to
factor out the common checks into a separate helper.


--
Cheers,

David / dhildenb


2024-04-26 20:03:54

by Guillaume Morin

[permalink] [raw]
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 26 Apr 9:19, David Hildenbrand wrote:
> A couple of points:
>
> a) Don't use page_mapcount(). Either folio_mapcount(), but likely you want
> to check PageAnonExclusive.
>
> b) If you're not following the can_follow_write_pte/_pmd model, you are
> doing something wrong :)
>
> c) The code was heavily changed in mm/mm-unstable. It was merged with t
> the common code.
>
> Likely, in mm/mm-unstable, the existing can_follow_write_pte and
> can_follow_write_pmd checks will already cover what you want in most cases.
>
> We'd need a can_follow_write_pud() to cover follow_huge_pud() and
> (unfortunately) something to handle follow_hugepd() as well similarly.
>
> Copy-pasting what we do in can_follow_write_pte() and adjusting for
> different PTE types is the right thing to do. Maybe now it's time to factor
> out the common checks into a separate helper.

I tried to get the hugepd stuff right but this was the first I heard
about it :-) Afaict follow_huge_pmd and friends were already DTRT

diff --git a/mm/gup.c b/mm/gup.c
index 2f7baf96f65..9df97ea555f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -517,6 +517,34 @@ static int record_subpages(struct page *page, unsigned long sz,
}
#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */

+/* Common code for can_follow_write_* */
+static inline bool can_follow_write_common(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -525,6 +553,17 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
return (__boundary - 1 < end - 1) ? __boundary : end;
}

+static inline bool can_follow_write_hugepd(pte_t pte, struct page* page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pte is writable, we can write to the page. */
+ if (pte_access_permitted(pte, flags & FOLL_WRITE))
+ return true;
+
+ return can_follow_write_common(page, vma, flags);
+}
+
static int gup_fast_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, unsigned int flags, struct page **pages,
int *nr)
@@ -541,13 +580,13 @@ static int gup_fast_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,

pte = huge_ptep_get(ptep);

- if (!pte_access_permitted(pte, flags & FOLL_WRITE))
- return 0;
-
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

page = pte_page(pte);
+ if (!can_follow_write_hugepd(pte, page, vma, flags))
+ return 0;
+
refs = record_subpages(page, sz, addr, end, pages + *nr);

folio = try_grab_folio(page, refs, flags);
@@ -668,7 +707,25 @@ static struct page *no_page_table(struct vm_area_struct *vma,
return NULL;
}

+
+
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
+static inline bool can_follow_write_pud(pud_t pud, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pud is writable, we can write to the page. */
+ if (pud_write(pud))
+ return true;
+
+ if (!can_follow_write_common(page, vma, flags))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ return !vma_soft_dirty_enabled(vma) || pud_soft_dirty(pud);
+}
+
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
int flags, struct follow_page_context *ctx)
@@ -681,7 +738,8 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,

assert_spin_locked(pud_lockptr(mm, pudp));

- if ((flags & FOLL_WRITE) && !pud_write(pud))
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pud(pud, page, vma, flags))
return NULL;

if (!pud_present(pud))
@@ -733,27 +791,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
if (pmd_write(pmd))
return true;

- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;

/* ... and a write-fault isn't required for other reasons. */
@@ -854,27 +892,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
if (pte_write(pte))
return true;

- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;

/* ... and a write-fault isn't required for other reasons. */
@@ -1374,9 +1392,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
- /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
- if (is_vm_hugetlb_page(vma))
- return -EFAULT;
+
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 417fc5cdb6e..099a71ee028 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5320,6 +5320,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}

+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (vma->vm_flags & VM_WRITE)
+ set_huge_ptep_writable(vma, address, ptep);
+}
+
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
@@ -5942,13 +5949,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
if (!unshare && huge_pte_uffd_wp(pte))
return 0;

- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5970,7 +5970,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+ set_huge_ptep_maybe_writable(vma, vmf->address, vmf->pte);

delayacct_wpcopy_end();
return 0;
@@ -6076,7 +6076,8 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
spin_lock(vmf->ptl);
vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
- pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
+ const bool writable = !unshare && (vma->vm_flags & VM_WRITE);
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, writable);

/* Break COW or unshare */
huge_ptep_clear_flush(vma, vmf->address, vmf->pte);


--
Guillaume Morin <[email protected]>

2024-04-30 15:22:41

by Guillaume Morin

Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 26 Apr 21:55, Guillaume Morin wrote:
>
> On 26 Apr 9:19, David Hildenbrand wrote:
> > A couple of points:
> >
> > a) Don't use page_mapcount(). Either folio_mapcount(), but likely you want
> > to check PageAnonExclusive.
> >
> > b) If you're not following the can_follow_write_pte/_pmd model, you are
> > doing something wrong :)
> >
> > c) The code was heavily changed in mm/mm-unstable. It was merged with t
> > the common code.
> >
> > Likely, in mm/mm-unstable, the existing can_follow_write_pte and
> > can_follow_write_pmd checks will already cover what you want in most cases.
> >
> > We'd need a can_follow_write_pud() to cover follow_huge_pud() and
> > (unfortunately) something to handle follow_hugepd() as well similarly.
> >
> > Copy-pasting what we do in can_follow_write_pte() and adjusting for
> > different PTE types is the right thing to do. Maybe now it's time to factor
> > out the common checks into a separate helper.
>
> I tried to get the hugepd stuff right but this was the first I heard
> about it :-) Afaict follow_huge_pmd and friends were already DTRT

I got it working on top of your uprobes-cow branch with the FOLL_FORCE
patch sent on Friday. Still pretty lightly tested.

I went with using one write uprobe function with some additional
branches. I went back and forth between that and making them two
different functions.
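
To give an idea of the shape of those extra branches without pasting the
whole diff: at each step the code picks either the hugetlb or the regular
helper, e.g. (rough sketch only, the wrapper below is just for illustration
and is not part of the patch, which open-codes the test):

        /* illustration only: the patch open-codes this folio_test_hugetlb() test */
        static pte_t uwo_clear_flush(struct vm_area_struct *vma, struct folio *folio,
                                     unsigned long vaddr, pte_t *ptep)
        {
                if (folio_test_hugetlb(folio))
                        return huge_ptep_clear_flush(vma, vaddr, ptep);
                return ptep_clear_flush(vma, vaddr, ptep);
        }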

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 2f4e88552d3f..8a33e380f7ea 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};

+bool hugetlbfs_mapping(struct address_space *mapping) {
+ return mapping->a_ops == &hugetlbfs_aops;
+}
+
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68244bb3637a..66fdcc0b5db2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
umode_t mode;
};

+bool hugetlbfs_mapping(struct address_space *mapping);
+
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
{
return NULL;
}
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4237a7477cca..e6e93a574c39 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@

#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;

if (is_register)
flags |= VM_WRITE;
@@ -163,21 +164,21 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}

-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len, unsigned long page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ memcpy(dst, kaddr + (vaddr & ~page_mask), len);
kunmap_atomic(kaddr);
}

-static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len, unsigned long page_mask)
{
void *kaddr = kmap_atomic(page);
- memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
+ memcpy(kaddr + (vaddr & ~page_mask), src, len);
kunmap_atomic(kaddr);
}

-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, unsigned long page_mask)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -191,7 +192,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE,
+ page_mask);
is_swbp = is_swbp_insn(&old_opcode);

if (is_swbp_insn(new_opcode)) {
@@ -376,8 +378,8 @@ struct uwo_data {
uprobe_opcode_t opcode;
};

-static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
- unsigned long next, struct mm_walk *walk)
+static int __write_opcode(pte_t *ptep, unsigned long vaddr,
+ unsigned long page_mask, struct mm_walk *walk)
{
struct uwo_data *data = walk->private;;
const bool is_register = !!is_swbp_insn(&data->opcode);
@@ -415,9 +417,12 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,

/* Unmap + flush the TLB, such that we can write atomically .*/
flush_cache_page(vma, vaddr, pte_pfn(pte));
- pte = ptep_clear_flush(vma, vaddr, ptep);
+ if (folio_test_hugetlb(folio))
+ pte = huge_ptep_clear_flush(vma, vaddr, ptep);
+ else
+ pte = ptep_clear_flush(vma, vaddr, ptep);
copy_to_page(page, data->opcode_vaddr, &data->opcode,
- UPROBE_SWBP_INSN_SIZE);
+ UPROBE_SWBP_INSN_SIZE, page_mask);

/* When unregistering, we may only zap a PTE if uffd is disabled ... */
if (is_register || userfaultfd_missing(vma))
@@ -443,13 +448,18 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
if (!identical || folio_maybe_dma_pinned(folio))
goto remap;

- /* Zap it and try to reclaim swap space. */
- dec_mm_counter(mm, MM_ANONPAGES);
- folio_remove_rmap_pte(folio, page, vma);
- if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
- folio_trylock(folio)) {
- folio_free_swap(folio);
- folio_unlock(folio);
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_remove_rmap(folio);
+ large = false;
+ } else {
+ /* Zap it and try to reclaim swap space. */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
}
folio_put(folio);

@@ -461,11 +471,29 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
*/
smp_wmb();
/* We modified the page. Make sure to mark the PTE dirty. */
- set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, vaddr, ptep, huge_pte_mkdirty(pte),
+ (~page_mask) + 1);
+ else
+ set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
return UWO_DONE;
}

+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long hmask,
+ unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return __write_opcode(ptep, vaddr, hmask, walk);
+}
+
+static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return __write_opcode(ptep, vaddr, PAGE_MASK, walk);
+}
+
static const struct mm_walk_ops write_opcode_ops = {
+ .hugetlb_entry = __write_opcode_hugetlb,
.pte_entry = __write_opcode_pte,
.walk_lock = PGWALK_WRLOCK,
};
@@ -492,7 +520,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
struct uprobe *uprobe = container_of(auprobe, struct uprobe, arch);
- const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ unsigned long vaddr = opcode_vaddr & PAGE_MASK;
const bool is_register = !!is_swbp_insn(&opcode);
struct uwo_data data = {
.opcode = opcode,
@@ -503,6 +531,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
struct mmu_notifier_range range;
int ret, ref_ctr_updated = 0;
struct page *page;
+ unsigned long page_size = PAGE_SIZE;

if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
return -EINVAL;
@@ -521,7 +550,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
if (ret != 1)
goto out;

- ret = verify_opcode(page, opcode_vaddr, &opcode);
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+ page_size = huge_page_size(h);
+ vaddr &= huge_page_mask(h);
+ page = compound_head(page);
+ }
+ ret = verify_opcode(page, opcode_vaddr, &opcode, ~(page_size - 1));
put_page(page);
if (ret <= 0)
goto out;
@@ -541,12 +577,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* be able to do it under PTL.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- vaddr, vaddr + PAGE_SIZE);
+ vaddr, vaddr + page_size);
mmu_notifier_invalidate_range_start(&range);
}

/* Walk the page tables and perform the actual magic. */
- ret = walk_page_range_vma(vma, vaddr, vaddr + PAGE_SIZE,
+ ret = walk_page_range_vma(vma, vaddr, vaddr + page_size,
&write_opcode_ops, &data);

if (!is_register)
@@ -816,6 +852,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
+ unsigned long mask = PAGE_MASK;
/*
* Ensure that the page that has the original instruction is populated
* and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
@@ -823,12 +860,17 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
*/
if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
- else
+ else if (!is_file_hugepages(filp))
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+ else {
+ struct hstate *h = hstate_file(filp);
+ mask = huge_page_mask(h);
+ page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+ }
if (IS_ERR(page))
return PTR_ERR(page);

- copy_from_page(page, offset, insn, nbytes);
+ copy_from_page(page, offset, insn, nbytes, mask);
put_page(page);

return 0;
@@ -1175,9 +1217,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (!uc->handler && !uc->ret_handler)
return -EINVAL;

- /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs specific
+ * logic
+ */
if (!inode->i_mapping->a_ops->read_folio &&
- !shmem_mapping(inode->i_mapping))
+ !shmem_mapping(inode->i_mapping) &&
+ !hugetlbfs_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
@@ -1698,7 +1743,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len)
{
/* Initialize the slot */
- copy_to_page(page, vaddr, src, len);
+ copy_to_page(page, vaddr, src, len, PAGE_MASK);

/*
* We probably need flush_icache_user_page() but it needs vma.
@@ -2062,7 +2107,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;

- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, PAGE_MASK);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */

--
Guillaume Morin <[email protected]>

2024-04-30 18:21:25

by David Hildenbrand

Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 30.04.24 17:22, Guillaume Morin wrote:
> On 26 Apr 21:55, Guillaume Morin wrote:
>>
>> On 26 Apr 9:19, David Hildenbrand wrote:
>>> A couple of points:
>>>
>>> a) Don't use page_mapcount(). Either folio_mapcount(), but likely you want
>>> to check PageAnonExclusive.
>>>
>>> b) If you're not following the can_follow_write_pte/_pmd model, you are
>>> doing something wrong :)
>>>
>>> c) The code was heavily changed in mm/mm-unstable. It was merged with t
>>> the common code.
>>>
>>> Likely, in mm/mm-unstable, the existing can_follow_write_pte and
>>> can_follow_write_pmd checks will already cover what you want in most cases.
>>>
>>> We'd need a can_follow_write_pud() to cover follow_huge_pud() and
>>> (unfortunately) something to handle follow_hugepd() as well similarly.
>>>
>>> Copy-pasting what we do in can_follow_write_pte() and adjusting for
>>> different PTE types is the right thing to do. Maybe now it's time to factor
>>> out the common checks into a separate helper.
>>
>> I tried to get the hugepd stuff right but this was the first I heard
>> about it :-) Afaict follow_huge_pmd and friends were already DTRT
>
> I got it working on top of your uprobes-cow branch with the FOLL_FORCE
> patch sent on Friday. Still pretty lightly tested

Sorry for not replying earlier, was busy with other stuff. I'll try
getting that stuff into shape and send it out soonish.

>
> I went with using one write uprobe function with some additional
> branches. I went back and forth between that and making them 2 different
> functions.

All the folio_test_hugetlb() special casing is a bit suboptimal. Likely
we want a separate variant, because we should be using hugetlb PTE
functions consistently (e.g., huge_pte_uffd_wp() vs pte_uffd_wp(),
soft-dirty does not exist for hugetlb, etc.).
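
Concretely, the write-protection check in such a variant boils down to
something like this (just a sketch, reusing the UWO_* retry codes from the
uprobes-cow branch; the helper name and its exact shape are made up):

        /*
         * Sketch only: a dedicated hugetlb variant sticks to the huge_pte_*()
         * helpers. Returns 0 when the opcode may be written in place, or a
         * UWO_* retry code otherwise.
         */
        static int uwo_hugetlb_can_write(struct vm_area_struct *vma,
                                         struct page *page, pte_t pte)
        {
                /*
                 * Like can_follow_write_pte(): we'd prefer a writable PTE, but
                 * when unregistering VM_WRITE may already be gone.
                 */
                if (huge_pte_write(pte))
                        return 0;
                if (!PageAnonExclusive(page))
                        return UWO_RETRY_WRITE_FAULT;
                if (unlikely(userfaultfd_wp(vma) && huge_pte_uffd_wp(pte)))
                        return UWO_RETRY_WRITE_FAULT;
                /* soft-dirty does not exist for hugetlb, nothing else to check */
                return 0;
        }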

>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 2f4e88552d3f..8a33e380f7ea 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> {}
> };
>
> +bool hugetlbfs_mapping(struct address_space *mapping) {
> + return mapping->a_ops == &hugetlbfs_aops;

is_vm_hugetlb_page() might be what you are looking for.

[...]

> }
>
> -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
> +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len, unsigned long page_mask)
> {
> void *kaddr = kmap_atomic(page);
> - memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
> + memcpy(dst, kaddr + (vaddr & ~page_mask), len);
> kunmap_atomic(kaddr);
> }

>
> -static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
> +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len, unsigned long page_mask)
> {
> void *kaddr = kmap_atomic(page);
> - memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
> + memcpy(kaddr + (vaddr & ~page_mask), src, len);
> kunmap_atomic(kaddr);
> }

These two changes really are rather ugly ...

And why are they even required? We get a PAGE_SIZE-sized subpage of a
hugetlb page. We only kmap that one and copy within that one.

In other words, I don't think the copy_from_page() and copy_to_page()
changes are even required when we consistently work on subpages and not
suddenly on head pages.
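
A tiny helper would be enough to keep them untouched, e.g. (sketch, the
name is made up; compound_head(), nth_page() and huge_page_mask() are
existing helpers):

        /*
         * Sketch: map vaddr to the PAGE_SIZE subpage of a hugetlb page, so the
         * PAGE_MASK-based copy helpers keep working unmodified.
         */
        static struct page *hugetlb_subpage(struct page *page, struct hstate *h,
                                            unsigned long vaddr)
        {
                return nth_page(compound_head(page),
                                (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT);
        }

copy_to_page(hugetlb_subpage(page, h, vaddr), vaddr, ...) would then do the
right thing with the unmodified helpers.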

>
> -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
> +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, unsigned long page_mask)
> {
> uprobe_opcode_t old_opcode;
> bool is_swbp;
> @@ -191,7 +192,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
> * is a trap variant; uprobes always wins over any other (gdb)
> * breakpoint.
> */
> - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
> + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE,
> + page_mask);
> is_swbp = is_swbp_insn(&old_opcode);
>
> if (is_swbp_insn(new_opcode)) {
> @@ -376,8 +378,8 @@ struct uwo_data {
> uprobe_opcode_t opcode;
> };
>
> -static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> - unsigned long next, struct mm_walk *walk)
> +static int __write_opcode(pte_t *ptep, unsigned long vaddr,
> + unsigned long page_mask, struct mm_walk *walk)


Unrelated alignment change.

> {
> struct uwo_data *data = walk->private;;
> const bool is_register = !!is_swbp_insn(&data->opcode);
> @@ -415,9 +417,12 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
>
> /* Unmap + flush the TLB, such that we can write atomically .*/
> flush_cache_page(vma, vaddr, pte_pfn(pte));
> - pte = ptep_clear_flush(vma, vaddr, ptep);
> + if (folio_test_hugetlb(folio))
> + pte = huge_ptep_clear_flush(vma, vaddr, ptep);
> + else
> + pte = ptep_clear_flush(vma, vaddr, ptep);
> copy_to_page(page, data->opcode_vaddr, &data->opcode,
> - UPROBE_SWBP_INSN_SIZE);
> + UPROBE_SWBP_INSN_SIZE, page_mask);
>
> /* When unregistering, we may only zap a PTE if uffd is disabled ... */
> if (is_register || userfaultfd_missing(vma))
> @@ -443,13 +448,18 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> if (!identical || folio_maybe_dma_pinned(folio))
> goto remap;
>
> - /* Zap it and try to reclaim swap space. */
> - dec_mm_counter(mm, MM_ANONPAGES);
> - folio_remove_rmap_pte(folio, page, vma);
> - if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> - folio_trylock(folio)) {
> - folio_free_swap(folio);
> - folio_unlock(folio);
> + if (folio_test_hugetlb(folio)) {
> + hugetlb_remove_rmap(folio);
> + large = false;
> + } else {
> + /* Zap it and try to reclaim swap space. */
> + dec_mm_counter(mm, MM_ANONPAGES);
> + folio_remove_rmap_pte(folio, page, vma);
> + if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> + folio_trylock(folio)) {
> + folio_free_swap(folio);
> + folio_unlock(folio);
> + }
> }
> folio_put(folio);
>
> @@ -461,11 +471,29 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> */
> smp_wmb();
> /* We modified the page. Make sure to mark the PTE dirty. */
> - set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
> + if (folio_test_hugetlb(folio))
> + set_huge_pte_at(mm , vaddr, ptep, huge_pte_mkdirty(pte),
> + (~page_mask) + 1);
> + else
> + set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
> return UWO_DONE;
> }
>
> +static int __write_opcode_hugetlb(pte_t *ptep, unsigned long hmask,
> + unsigned long vaddr,
> + unsigned long next, struct mm_walk *walk)
> +{
> + return __write_opcode(ptep, vaddr, hmask, walk);
> +}
> +
> +static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> + unsigned long next, struct mm_walk *walk)
> +{
> + return __write_opcode(ptep, vaddr, PAGE_MASK, walk);
> +}
> +
> static const struct mm_walk_ops write_opcode_ops = {
> + .hugetlb_entry = __write_opcode_hugetlb,
> .pte_entry = __write_opcode_pte,
> .walk_lock = PGWALK_WRLOCK,
> };
> @@ -492,7 +520,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> unsigned long opcode_vaddr, uprobe_opcode_t opcode)
> {
> struct uprobe *uprobe = container_of(auprobe, struct uprobe, arch);
> - const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> + unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> const bool is_register = !!is_swbp_insn(&opcode);
> struct uwo_data data = {
> .opcode = opcode,
> @@ -503,6 +531,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> struct mmu_notifier_range range;
> int ret, ref_ctr_updated = 0;
> struct page *page;
> + unsigned long page_size = PAGE_SIZE;
>
> if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
> return -EINVAL;
> @@ -521,7 +550,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> if (ret != 1)
> goto out;
>
> - ret = verify_opcode(page, opcode_vaddr, &opcode);
> +
> + if (is_vm_hugetlb_page(vma)) {
> + struct hstate *h = hstate_vma(vma);
> + page_size = huge_page_size(h);
> + vaddr &= huge_page_mask(h);
> + page = compound_head(page);

I think we should only adjust the range we pass to the mmu notifier and
the range we use for walking the VMA. But we should not adjust vaddr.

Further, we should not adjust the page if possible ... ideally, we'll
treat hugetlb folios just like large folios here and operate on subpages.

Inside __write_opcode(), we can derive the page of interest from
data->opcode_vaddr.

find_get_page() might need some thought, if it won't return a subpage of
a hugetlb folio. Should be solvable by a wrapper, though.
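
Such a wrapper could look roughly like this (sketch, the name is made up,
and whether find_get_page() wants the base-page index of the
hugepage-aligned offset like this needs double-checking):

        /*
         * Hypothetical wrapper: look up the hugetlb page backing @offset and
         * return the PAGE_SIZE subpage that actually contains @offset.
         */
        static struct page *hugetlb_find_get_subpage(struct address_space *mapping,
                                                     struct hstate *h, loff_t offset)
        {
                struct page *page;

                page = find_get_page(mapping,
                                     (offset & huge_page_mask(h)) >> PAGE_SHIFT);
                if (!page)
                        return NULL;
                return nth_page(page,
                                (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
        }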

> + }
> + ret = verify_opcode(page, opcode_vaddr, &opcode, ~(page_size - 1));
> put_page(page);
> if (ret <= 0)
> goto out;


--
Cheers,

David / dhildenb


2024-04-30 18:58:51

by Guillaume Morin

Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 30 Apr 20:21, David Hildenbrand wrote:
> Sorry for not replying earlier, was busy with other stuff. I'll try getting
> that stuff into shape and send it out soonish.

No worries. Let me know what you think of the FOLL_FORCE patch when you
have a sec.

> > I went with using one write uprobe function with some additional
> > branches. I went back and forth between that and making them 2 different
> > functions.
>
> All the folio_test_hugetlb() special casing is a bit suboptimal. Likely we
> want a separate variant, because we should be using hugetlb PTE functions
> consistently (e.g., huge_pte_uffd_wp() vs pte_uffd_wp(), softdirty does not
> exist etc.)

Ok, I'll give this a whirl and send something probably tomorrow.

> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index 2f4e88552d3f..8a33e380f7ea 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
> > {}
> > };
> > +bool hugetlbfs_mapping(struct address_space *mapping) {
> > + return mapping->a_ops == &hugetlbfs_aops;
>
> is_vm_hugetlb_page() might be what you are looking for.

I use hugetlbfs_mapping() in __uprobe_register(), which does an early
return based only on the mapping when the mapping type is not supported.
There is no vma that I can get at that point (afaict).

I could refactor so we do this check once we have a vma, but it looked
cleaner to introduce the helper since there is already shmem_mapping().
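
For reference, this is the check that ends up in __uprobe_register() (same
as the hunk in the patch):

        /* bail out early when there is no way to read the original instruction */
        if (!inode->i_mapping->a_ops->read_folio &&
            !shmem_mapping(inode->i_mapping) &&
            !hugetlbfs_mapping(inode->i_mapping))
                return -EIO;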

> > }
> > -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
> > +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len, unsigned long page_mask)
> > {
> > void *kaddr = kmap_atomic(page);
> > - memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
> > + memcpy(dst, kaddr + (vaddr & ~page_mask), len);
> > kunmap_atomic(kaddr);
> > }
>
> > -static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
> > +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len, unsigned long page_mask)
> > {
> > void *kaddr = kmap_atomic(page);
> > - memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
> > + memcpy(kaddr + (vaddr & ~page_mask), src, len);
> > kunmap_atomic(kaddr);
> > }
>
> These two changes really are rather ugly ...
>
> And why are they even required? We get a PAGE_SIZE-sized subpage of a
> hugetlb page. We only kmap that one and copy within that one.
>
> In other words, I don't think the copy_from_page() and copy_to_page()
> changes are even required when we consistently work on subpages and not
> suddenly on head pages.

The main reason is that the previous __replace_page() worked directly on
the full hugetlb page, so adjusting right after gup seemed to make more
sense to me. But now I guess it's not that useful (especially since we're
going with a different version of write_uprobe). I'll fix it.

(...)
> > {
> > struct uwo_data *data = walk->private;;
> > const bool is_register = !!is_swbp_insn(&data->opcode);
> > @@ -415,9 +417,12 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> > /* Unmap + flush the TLB, such that we can write atomically .*/
> > flush_cache_page(vma, vaddr, pte_pfn(pte));
> > - pte = ptep_clear_flush(vma, vaddr, ptep);
> > + if (folio_test_hugetlb(folio))
> > + pte = huge_ptep_clear_flush(vma, vaddr, ptep);
> > + else
> > + pte = ptep_clear_flush(vma, vaddr, ptep);
> > copy_to_page(page, data->opcode_vaddr, &data->opcode,
> > - UPROBE_SWBP_INSN_SIZE);
> > + UPROBE_SWBP_INSN_SIZE, page_mask);
> > /* When unregistering, we may only zap a PTE if uffd is disabled ... */
> > if (is_register || userfaultfd_missing(vma))
> > @@ -443,13 +448,18 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> > if (!identical || folio_maybe_dma_pinned(folio))
> > goto remap;
> > - /* Zap it and try to reclaim swap space. */
> > - dec_mm_counter(mm, MM_ANONPAGES);
> > - folio_remove_rmap_pte(folio, page, vma);
> > - if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> > - folio_trylock(folio)) {
> > - folio_free_swap(folio);
> > - folio_unlock(folio);
> > + if (folio_test_hugetlb(folio)) {
> > + hugetlb_remove_rmap(folio);
> > + large = false;
> > + } else {
> > + /* Zap it and try to reclaim swap space. */
> > + dec_mm_counter(mm, MM_ANONPAGES);
> > + folio_remove_rmap_pte(folio, page, vma);
> > + if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> > + folio_trylock(folio)) {
> > + folio_free_swap(folio);
> > + folio_unlock(folio);
> > + }
> > }
> > folio_put(folio);
> > @@ -461,11 +471,29 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> > */
> > smp_wmb();
> > /* We modified the page. Make sure to mark the PTE dirty. */
> > - set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
> > + if (folio_test_hugetlb(folio))
> > + set_huge_pte_at(mm , vaddr, ptep, huge_pte_mkdirty(pte),
> > + (~page_mask) + 1);
> > + else
> > + set_pte_at(mm, vaddr, ptep, pte_mkdirty(pte));
> > return UWO_DONE;
> > }
> > +static int __write_opcode_hugetlb(pte_t *ptep, unsigned long hmask,
> > + unsigned long vaddr,
> > + unsigned long next, struct mm_walk *walk)
> > +{
> > + return __write_opcode(ptep, vaddr, hmask, walk);
> > +}
> > +
> > +static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
> > + unsigned long next, struct mm_walk *walk)
> > +{
> > + return __write_opcode(ptep, vaddr, PAGE_MASK, walk);
> > +}
> > +
> > static const struct mm_walk_ops write_opcode_ops = {
> > + .hugetlb_entry = __write_opcode_hugetlb,
> > .pte_entry = __write_opcode_pte,
> > .walk_lock = PGWALK_WRLOCK,
> > };
> > @@ -492,7 +520,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > unsigned long opcode_vaddr, uprobe_opcode_t opcode)
> > {
> > struct uprobe *uprobe = container_of(auprobe, struct uprobe, arch);
> > - const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> > + unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> > const bool is_register = !!is_swbp_insn(&opcode);
> > struct uwo_data data = {
> > .opcode = opcode,
> > @@ -503,6 +531,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > struct mmu_notifier_range range;
> > int ret, ref_ctr_updated = 0;
> > struct page *page;
> > + unsigned long page_size = PAGE_SIZE;
> > if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
> > return -EINVAL;
> > @@ -521,7 +550,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> > if (ret != 1)
> > goto out;
> > - ret = verify_opcode(page, opcode_vaddr, &opcode);
> > +
> > + if (is_vm_hugetlb_page(vma)) {
> > + struct hstate *h = hstate_vma(vma);
> > + page_size = huge_page_size(h);
> > + vaddr &= huge_page_mask(h);
> > + page = compound_head(page);
>
> I think we should only adjust the range we pass to the mmu notifier and for
> walking the VMA range. But we should not adjust vaddr.
>
> Further, we should not adjust the page if possible ... ideally, we'll treat
> hugetlb folios just like large folios here and operate on subpages.
>
> Inside __write_opcode(), we can derive the page of interest from
> data->opcode_vaddr.

Here you mean __write_opcode_hugetlb(), right? Since we're going with
the two independent variants. Just want to be 100% sure I am following.

> find_get_page() might need some thought, if it won't return a subpage of a
> hugetlb folio. Should be solvable by a wrapper, though.

We can zero out the sub-hugepage bits with the huge page mask in the
vaddr_to_offset() computation in the hugetlb variant, like I do in
__copy_insn(), and that should work, no? Or would you prefer a wrapper?
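
The masking I have in mind is roughly this, in the hugetlb variant of
orig_page_is_identical() (sketch only; h would be hstate_vma(vma)):

        /* sketch: mask off the sub-hugepage bits of the offset, as in __copy_insn() */
        const pgoff_t index = (vaddr_to_offset(vma, vaddr) & huge_page_mask(h))
                                        >> PAGE_SHIFT;
        struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
                                               index);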

Guillaume.

--
Guillaume Morin <[email protected]>

2024-04-30 19:25:47

by David Hildenbrand

Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 26.04.24 21:55, Guillaume Morin wrote:
> On 26 Apr 9:19, David Hildenbrand wrote:
>> A couple of points:
>>
>> a) Don't use page_mapcount(). Either folio_mapcount(), but likely you want
>> to check PageAnonExclusive.
>>
>> b) If you're not following the can_follow_write_pte/_pmd model, you are
>> doing something wrong :)
>>
>> c) The code was heavily changed in mm/mm-unstable. It was merged with t
>> the common code.
>>
>> Likely, in mm/mm-unstable, the existing can_follow_write_pte and
>> can_follow_write_pmd checks will already cover what you want in most cases.
>>
>> We'd need a can_follow_write_pud() to cover follow_huge_pud() and
>> (unfortunately) something to handle follow_hugepd() as well similarly.
>>
>> Copy-pasting what we do in can_follow_write_pte() and adjusting for
>> different PTE types is the right thing to do. Maybe now it's time to factor
>> out the common checks into a separate helper.
>
> I tried to get the hugepd stuff right but this was the first I heard
> about it :-) Afaict follow_huge_pmd and friends were already DTRT

I'll have to have a closer look at some details (the hugepd writability
check looks a bit odd), but it's mostly what I would have expected!

--
Cheers,

David / dhildenb


2024-05-02 04:00:18

by Guillaume Morin

Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

On 30 Apr 21:25, David Hildenbrand wrote:
> > I tried to get the hugepd stuff right but this was the first I heard
> > about it :-) Afaict follow_huge_pmd and friends were already DTRT
>
> I'll have to have a closer look at some details (the hugepd writability
> check looks a bit odd), but it's mostly what I would have expected!

Ok, in the meantime here is the uprobe change on top of your current
uprobes_cow branch, trying to address the comments you made in your
previous message. Some of them were not 100% clear to me, so it's a
best-effort patch :-) Again, only lightly tested.

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};

+bool hugetlbfs_mapping(struct address_space *mapping) {
+ return mapping->a_ops == &hugetlbfs_aops;
+}
+
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
umode_t mode;
};

+bool hugetlbfs_mapping(struct address_space *mapping);
+
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
{
return NULL;
}
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@

#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;

if (is_register)
flags |= VM_WRITE;
@@ -177,6 +178,19 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}

+static bool compare_pages(struct page *page1, struct page *page2, unsigned long page_size)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_local_page(page1);
+ addr2 = kmap_local_page(page2);
+ ret = memcmp(addr1, addr2, page_size);
+ kunmap_local(addr2);
+ kunmap_local(addr1);
+ return ret == 0;
+}
+
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
uprobe_opcode_t old_opcode;
@@ -366,7 +380,9 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
}

static bool orig_page_is_identical(struct vm_area_struct *vma,
- unsigned long vaddr, struct page *page, bool *large)
+ unsigned long vaddr, struct page *page,
+ unsigned long page_size,
+ bool *large)
{
const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
@@ -380,7 +396,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,

*large = folio_test_large(orig_folio);
identical = folio_test_uptodate(orig_folio) &&
- pages_identical(page, orig_page);
+ compare_pages(page, orig_page, page_size);
folio_put(orig_folio);
return identical;
}
@@ -396,6 +412,81 @@ struct uwo_data {
uprobe_opcode_t opcode;
};

+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long page_mask,
+ unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct uwo_data *data = walk->private;
+ const bool is_register = !!is_swbp_insn(&data->opcode);
+ pte_t pte = huge_ptep_get(ptep);
+ struct folio *folio;
+ struct page *page;
+ bool large;
+ struct hstate *h = hstate_vma(walk->vma);
+ unsigned subpage_index = (vaddr & (huge_page_size(h) - 1)) >>
+ PAGE_SHIFT;
+
+ if (!pte_present(pte))
+ return UWO_RETRY;
+ page = vm_normal_page(walk->vma, vaddr, pte);
+ if (!page)
+ return UWO_RETRY;
+ folio = page_folio(page);
+
+ /* When unregistering and there is no anon folio anymore, we're done. */
+ if (!folio_test_anon(folio))
+ return is_register ? UWO_RETRY_WRITE_FAULT : UWO_DONE;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer requiring a
+ * writable PTE here, but when unregistering we might no longer have
+ * VM_WRITE ...
+ */
+ if (!huge_pte_write(pte)) {
+ if (!PageAnonExclusive(page))
+ return UWO_RETRY_WRITE_FAULT;
+ if (unlikely(userfaultfd_wp(walk->vma) && huge_pte_uffd_wp(pte)))
+ return UWO_RETRY_WRITE_FAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /* Unmap + flush the TLB, such that we can write atomically .*/
+ flush_cache_page(walk->vma, vaddr & page_mask, pte_pfn(pte));
+ pte = huge_ptep_clear_flush(walk->vma, vaddr & page_mask, ptep);
+ copy_to_page(nth_page(page, subpage_index), data->opcode_vaddr,
+ &data->opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * the folio is not pinned ...
+ */
+ if (is_register || userfaultfd_missing(walk->vma) ||
+ folio_maybe_dma_pinned(folio))
+ goto remap;
+
+ /*
+ * ... the mapped anon page is identical to the original page (that
+ * will get faulted in on next access), and we don't have GUP pins.
+ */
+ if (!orig_page_is_identical(walk->vma, vaddr & page_mask, page,
+ huge_page_size(h), &large))
+ goto remap;
+
+ hugetlb_remove_rmap(folio);
+ folio_put(folio);
+ return UWO_DONE;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_huge_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_huge_pte_at(walk->mm, vaddr & page_mask, ptep,
+ huge_pte_mkdirty(pte), huge_page_size(h));
+ return UWO_DONE;
+}
+
static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
unsigned long next, struct mm_walk *walk)
{
@@ -447,7 +538,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
* ... the mapped anon page is identical to the original page (that
* will get faulted in on next access), and we don't have GUP pins.
*/
- if (!orig_page_is_identical(walk->vma, vaddr, page, &large))
+ if (!orig_page_is_identical(walk->vma, vaddr, page, PAGE_SIZE, &large))
goto remap;

/* Zap it and try to reclaim swap space. */
@@ -473,6 +564,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
}

static const struct mm_walk_ops write_opcode_ops = {
+ .hugetlb_entry = __write_opcode_hugetlb,
.pte_entry = __write_opcode_pte,
.walk_lock = PGWALK_WRLOCK,
};
@@ -510,6 +602,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
struct mmu_notifier_range range;
int ret, ref_ctr_updated = 0;
struct page *page;
+ unsigned long page_size = PAGE_SIZE;
+ unsigned long page_mask = PAGE_MASK;

if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
return -EINVAL;
@@ -528,6 +622,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
if (ret != 1)
goto out;

+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+ page_size = huge_page_size(h);
+ page_mask = huge_page_mask(h);
+ }
ret = verify_opcode(page, opcode_vaddr, &opcode);
put_page(page);
if (ret <= 0)
@@ -547,8 +646,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* unregistering. So trigger MMU notifiers now, as we won't
* be able to do it under PTL.
*/
+ const unsigned long start = vaddr & page_mask;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- vaddr, vaddr + PAGE_SIZE);
+ start, start + page_size);
mmu_notifier_invalidate_range_start(&range);
}

@@ -830,8 +930,16 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
*/
if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
- else
+ else if (!is_file_hugepages(filp))
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+ else {
+ struct hstate *h = hstate_file(filp);
+ unsigned long mask = huge_page_mask(h);
+ page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ page = nth_page(page, (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
+ }
if (IS_ERR(page))
return PTR_ERR(page);

@@ -1182,9 +1290,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (!uc->handler && !uc->ret_handler)
return -EINVAL;

- /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs specific
+ * logic
+ */
if (!inode->i_mapping->a_ops->read_folio &&
- !shmem_mapping(inode->i_mapping))
+ !shmem_mapping(inode->i_mapping) &&
+ !hugetlbfs_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))

--
Guillaume Morin <[email protected]>