2012-06-12 23:13:42

by Andi Kleen

[permalink] [raw]
Subject: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

From: Andi Kleen <[email protected]>

There was some desire in large applications using MAP_HUGETLB/SHM_HUGETLB
to use 1GB huge pages on some mappings, and stay with 2MB on others. This
is useful together with NUMA policy: use 2MB interleaving on some mappings,
but 1GB on local mappings.

This patch extends the IPC/SHM syscall interfaces slightly to allow specifying
the page size.

It borrows some upper bits in the existing flag arguments and allows encoding
the log2 of the desired page size in addition to the *_HUGETLB flag.
When 0 is specified the default size is used; this makes the change fully
backward compatible.

Extending the internal hugetlb code to handle this is straightforward. Instead
of a single mount it just keeps an array of them and selects the right
mount based on the specified page size.

I also exported the new flags to the user headers
(they were previously under __KERNEL__). Right now only symbols
for x86 and some other architecture for 1GB and 2MB are defined.
The interface should already work for all other architectures
though.

v2: Port to new tree. Fix unmount.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/mman.h | 3 ++
fs/hugetlbfs/inode.c | 62 ++++++++++++++++++++++++++++++++++---------
include/asm-generic/mman.h | 13 +++++++++
include/linux/hugetlb.h | 17 ++++++++++-
include/linux/shm.h | 19 +++++++++++++
ipc/shm.c | 3 +-
mm/hugetlb.c | 5 +---
mm/mmap.c | 5 ++-
8 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 593e51d..513b05f 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -3,6 +3,9 @@

#define MAP_32BIT 0x40 /* only give out 32bit addresses */

+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+
#include <asm-generic/mman.h>

#endif /* _ASM_X86_MMAN_H */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cc9281b..b5b6a1d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -920,16 +920,29 @@ static struct file_system_type hugetlbfs_fs_type = {
.kill_sb = kill_litter_super,
};

-static struct vfsmount *hugetlbfs_vfsmount;
+static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}

+static int get_hstate_idx(int page_size_log)
+{
+ struct hstate *h;
+
+ if (!page_size_log)
+ return default_hstate_idx;
+ h = size_to_hstate(1 << page_size_log);
+ if (!h)
+ return -1;
+ return h - hstates;
+}
+
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acctflag,
- struct user_struct **user, int creat_flags)
+ struct user_struct **user,
+ int creat_flags, int page_size_log)
{
int error = -ENOMEM;
struct file *file;
@@ -939,9 +952,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
struct qstr quick_string;
struct hstate *hstate;
unsigned long num_pages;
+ int hstate_idx;
+
+ hstate_idx = get_hstate_idx(page_size_log);
+ if (hstate_idx < 0)
+ return ERR_PTR(-ENODEV);

*user = NULL;
- if (!hugetlbfs_vfsmount)
+ if (!hugetlbfs_vfsmount[hstate_idx])
return ERR_PTR(-ENOENT);

if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -958,7 +976,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
}
}

- root = hugetlbfs_vfsmount->mnt_root;
+ root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
quick_string.name = name;
quick_string.len = strlen(quick_string.name);
quick_string.hash = 0;
@@ -966,7 +984,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
if (!path.dentry)
goto out_shm_unlock;

- path.mnt = mntget(hugetlbfs_vfsmount);
+ path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
error = -ENOSPC;
inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
@@ -1006,8 +1024,9 @@ out_shm_unlock:

static int __init init_hugetlbfs_fs(void)
{
+ struct hstate *h;
int error;
- struct vfsmount *vfsmount;
+ int i;

error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
@@ -1024,14 +1043,26 @@ static int __init init_hugetlbfs_fs(void)
if (error)
goto out;

- vfsmount = kern_mount(&hugetlbfs_fs_type);
+ i = 0;
+ for_each_hstate (h) {
+ char buf[50];
+ unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

- if (!IS_ERR(vfsmount)) {
- hugetlbfs_vfsmount = vfsmount;
- return 0;
- }
+ snprintf(buf, sizeof buf, "pagesize=%uK", ps_kb);
+ hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
+ buf);

- error = PTR_ERR(vfsmount);
+ if (IS_ERR(hugetlbfs_vfsmount[i])) {
+ pr_err(
+ "hugetlb: Cannot mount internal hugetlbfs for page size %uK",
+ ps_kb);
+ error = PTR_ERR(hugetlbfs_vfsmount[i]);
+ }
+ i++;
+ }
+ /* Non default hstates are optional */
+ if (hugetlbfs_vfsmount[default_hstate_idx])
+ return 0;

out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1042,8 +1073,13 @@ static int __init init_hugetlbfs_fs(void)

static void __exit exit_hugetlbfs_fs(void)
{
+ struct hstate *h;
+ int i;
+
kmem_cache_destroy(hugetlbfs_inode_cachep);
- kern_unmount(hugetlbfs_vfsmount);
+ i = 0;
+ for_each_hstate (h)
+ kern_unmount(hugetlbfs_vfsmount[i++]);
unregister_filesystem(&hugetlbfs_fs_type);
bdi_destroy(&hugetlbfs_backing_dev_info);
}
diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
index 32c8bd6..d2f35d8 100644
--- a/include/asm-generic/mman.h
+++ b/include/asm-generic/mman.h
@@ -13,6 +13,19 @@
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */

+/* Bits [26:31] are reserved */
+
+/*
+ * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all power of twos.
+ * When 0 use the default page size.
+ */
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK 0x3f
+
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d5d6bbe..78618a3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -154,7 +154,13 @@ extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acct,
- struct user_struct **user, int creat_flags);
+ struct user_struct **user, int creat_flags,
+ int page_size_log);
+int hugetlb_get_quota(struct address_space *mapping, long delta);
+void hugetlb_put_quota(struct address_space *mapping, long delta);
+
+int hugetlb_get_quota(struct address_space *mapping, long delta);
+void hugetlb_put_quota(struct address_space *mapping, long delta);

static inline int is_file_hugepages(struct file *file)
{
@@ -166,12 +172,19 @@ static inline int is_file_hugepages(struct file *file)
return 0;
}

+
+extern int max_hstate;
+
+#define for_each_hstate(h) \
+ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
+
#else /* !CONFIG_HUGETLBFS */

#define is_file_hugepages(file) 0
static inline struct file *
hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
- vm_flags_t acctflag, struct user_struct **user, int creat_flags)
+ vm_flags_t acctflag, struct user_struct **user, int creat_flags,
+ int page_size_log)
{
return ERR_PTR(-ENOSYS);
}
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 92808b8..41aa305 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -100,12 +100,31 @@ struct shmid_kernel /* private to the kernel */
struct task_struct *shm_creator;
};

+#endif
+
/* shm_mode upper byte flags */
#define SHM_DEST 01000 /* segment will be destroyed on last detach */
#define SHM_LOCKED 02000 /* segment will not be swapped */
#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
#define SHM_NORESERVE 010000 /* don't check for reservations */

+/* Bits [26:31] are reserved */
+
+/*
+ * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
+ * This gives us 6 bits, which is enough until someone invents 128 bit address
+ * spaces.
+ *
+ * Assume these are all power of twos.
+ * When 0 use the default page size.
+ */
+#define SHM_HUGE_SHIFT 26
+#define SHM_HUGE_MASK 0x3f
+#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+
+#ifdef __KERNEL__
+
#ifdef CONFIG_SYSVIPC
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
extern int is_file_shm_hugepages(struct file *file);
diff --git a/ipc/shm.c b/ipc/shm.c
index 5e2cbfd..2515004 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -483,7 +483,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, 0, size, acctflag,
- &shp->mlock_user, HUGETLB_SHMFS_INODE);
+ &shp->mlock_user, HUGETLB_SHMFS_INODE,
+ (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else {
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831..bcae924 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,7 +34,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

-static int max_hstate;
+int max_hstate;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

@@ -45,9 +45,6 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;

-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdf..cc417ee 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1125,8 +1125,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
- VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE);
+ VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE,
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
--
1.7.7.6


2012-06-13 20:25:40

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

On 06/12/2012 07:13 PM, Andi Kleen wrote:
> From: Andi Kleen<[email protected]>

Acked-by: Rik van Riel <[email protected]>

> There was some desire in large applications using MAP_HUGETLB/SHM_HUGETLB
> to use 1GB huge pages on some mappings, and stay with 2MB on others. This
> is useful together with NUMA policy: use 2MB interleaving on some mappings,
> but 1GB on local mappings.
>
> This patch extends the IPC/SHM syscall interfaces slightly to allow specifying
> the page size.

This would also be useful for emulators such as qemu-kvm,
which want the guest memory to be 2MB aligned.

That would require extending mmap to specify the desired
alignment, which may be possible using the upper bits of
the mmap flags, like you did for the shm interface.

> +#define MAP_HUGE_2MB (21<< MAP_HUGE_SHIFT)
> +#define MAP_HUGE_1GB (30<< MAP_HUGE_SHIFT)

Nice idea, that way each architecture can define the
names for possible offsets, yet the numeric values
will always line up between all of them.

> + * Assume these are all power of twos.
> + * When 0 use the default page size.
> + */
> +#define SHM_HUGE_SHIFT 26
> +#define SHM_HUGE_MASK 0x3f
> +#define SHM_HUGE_2MB (21<< SHM_HUGE_SHIFT)
> +#define SHM_HUGE_1GB (30<< SHM_HUGE_SHIFT)
> +
> +#ifdef __KERNEL__

Excellent, this is very similar to what I was
thinking about implementing myself, in order
to pass "desired alignment" information to my
implementation of arch_get_unmapped_area(_topdown) :)

2012-06-13 20:31:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

On Wed, Jun 13, 2012 at 04:24:47PM -0400, Rik van Riel wrote:
> This would also be useful for emulators such as qemu-kvm,
> which want the guest memory to be 2MB aligned.

hugetlbfs aligns implicitly, so right now I mash
the two together and use up many of the remaining bits.

If you want an alignment different from the page size you may need
to go to 64 bits with the flags.

Is there a use case for alignment independent of page sizes?

-Andi
--
[email protected] -- Speaking for myself only.

2012-06-13 20:39:07

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

On 06/13/2012 04:31 PM, Andi Kleen wrote:
> On Wed, Jun 13, 2012 at 04:24:47PM -0400, Rik van Riel wrote:
>> This would also be useful for emulators such as qemu-kvm,
>> which want the guest memory to be 2MB aligned.
>
> hugetlbfs does implicit align, so right now I mash
> the two together and use up many of the remaining bits
>
> If you want align different than page sizes you may need
> to go 64bits with the flags.

All alignments are powers of two, so six bits should
be enough for up to 2^64 pages :)

> Is there a use case for alignment independent of page sizes?

No, but page size differs per architecture and it
would be nice if we could share arch_get_unmapped_area
and related code in mm/, instead of every architecture
having its own.

In fact, that is what I am working on right now, and
my current road block is the page colouring code :)

2012-06-15 10:35:22

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

On Tue 12-06-12 16:13:36, Andi Kleen wrote:
> From: Andi Kleen <[email protected]>
>
> There was some desire in large applications using MAP_HUGETLB/SHM_HUGETLB
> to use 1GB huge pages on some mappings, and stay with 2MB on others. This
> is useful together with NUMA policy: use 2MB interleaving on some mappings,
> but 1GB on local mappings.
>
> This patch extends the IPC/SHM syscall interfaces slightly to allow specifying
> the page size.
>
> It borrows some upper bits in the existing flag arguments and allows encoding
> the log of the desired page size in addition to the *_HUGETLB flag.
> When 0 is specified the default size is used, this makes the change fully
> compatible.
>
> Extending the internal hugetlb code to handle this is straight forward. Instead
> of a single mount it just keeps an array of them and selects the right
> mount based on the specified page size.
>
> I also exported the new flags to the user headers
> (they were previously under __KERNEL__). Right now only symbols
> for x86 and some other architecture for 1GB and 2MB are defined.
> The interface should already work for all other architectures
> though.

I like the idea!

Please note that some parts of the patch clash with the hugetlb cgroup
controller[1] (CCing Aneesh), which made it into the -mm tree recently.
Nothing big though.

---
[1] http://thread.gmane.org/gmane.linux.kernel.cgroups/2637 -- there is
also a -v9 out there, but Google doesn't seem to show it to me (that one is
just a bugfix release)

>
> v2: Port to new tree. Fix unmount.
> Signed-off-by: Andi Kleen <[email protected]>
> ---
> arch/x86/include/asm/mman.h | 3 ++
> fs/hugetlbfs/inode.c | 62 ++++++++++++++++++++++++++++++++++---------
> include/asm-generic/mman.h | 13 +++++++++
> include/linux/hugetlb.h | 17 ++++++++++-
> include/linux/shm.h | 19 +++++++++++++
> ipc/shm.c | 3 +-
> mm/hugetlb.c | 5 +---
> mm/mmap.c | 5 ++-
> 8 files changed, 105 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
> index 593e51d..513b05f 100644
> --- a/arch/x86/include/asm/mman.h
> +++ b/arch/x86/include/asm/mman.h
> @@ -3,6 +3,9 @@
>
> #define MAP_32BIT 0x40 /* only give out 32bit addresses */
>
> +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
> +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
> +
> #include <asm-generic/mman.h>
>
> #endif /* _ASM_X86_MMAN_H */
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index cc9281b..b5b6a1d 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -920,16 +920,29 @@ static struct file_system_type hugetlbfs_fs_type = {
> .kill_sb = kill_litter_super,
> };
>
> -static struct vfsmount *hugetlbfs_vfsmount;
> +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
>
> static int can_do_hugetlb_shm(void)
> {
> return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
> }
>
> +static int get_hstate_idx(int page_size_log)
> +{
> + struct hstate *h;
> +
> + if (!page_size_log)
> + return default_hstate_idx;
> + h = size_to_hstate(1 << page_size_log);
> + if (!h)
> + return -1;
> + return h - hstates;
> +}
> +
> struct file *hugetlb_file_setup(const char *name, unsigned long addr,
> size_t size, vm_flags_t acctflag,
> - struct user_struct **user, int creat_flags)
> + struct user_struct **user,
> + int creat_flags, int page_size_log)
> {
> int error = -ENOMEM;
> struct file *file;
> @@ -939,9 +952,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
> struct qstr quick_string;
> struct hstate *hstate;
> unsigned long num_pages;
> + int hstate_idx;
> +
> + hstate_idx = get_hstate_idx(page_size_log);
> + if (hstate_idx < 0)
> + return ERR_PTR(-ENODEV);
>
> *user = NULL;
> - if (!hugetlbfs_vfsmount)
> + if (!hugetlbfs_vfsmount[hstate_idx])
> return ERR_PTR(-ENOENT);
>
> if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
> @@ -958,7 +976,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
> }
> }
>
> - root = hugetlbfs_vfsmount->mnt_root;
> + root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
> quick_string.name = name;
> quick_string.len = strlen(quick_string.name);
> quick_string.hash = 0;
> @@ -966,7 +984,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
> if (!path.dentry)
> goto out_shm_unlock;
>
> - path.mnt = mntget(hugetlbfs_vfsmount);
> + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
> error = -ENOSPC;
> inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
> if (!inode)
> @@ -1006,8 +1024,9 @@ out_shm_unlock:
>
> static int __init init_hugetlbfs_fs(void)
> {
> + struct hstate *h;
> int error;
> - struct vfsmount *vfsmount;
> + int i;
>
> error = bdi_init(&hugetlbfs_backing_dev_info);
> if (error)
> @@ -1024,14 +1043,26 @@ static int __init init_hugetlbfs_fs(void)
> if (error)
> goto out;
>
> - vfsmount = kern_mount(&hugetlbfs_fs_type);
> + i = 0;
> + for_each_hstate (h) {
> + char buf[50];
> + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
>
> - if (!IS_ERR(vfsmount)) {
> - hugetlbfs_vfsmount = vfsmount;
> - return 0;
> - }
> + snprintf(buf, sizeof buf, "pagesize=%uK", ps_kb);
> + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
> + buf);
>
> - error = PTR_ERR(vfsmount);
> + if (IS_ERR(hugetlbfs_vfsmount[i])) {
> + pr_err(
> + "hugetlb: Cannot mount internal hugetlbfs for page size %uK",
> + ps_kb);
> + error = PTR_ERR(hugetlbfs_vfsmount[i]);
> + }
> + i++;
> + }
> + /* Non default hstates are optional */
> + if (hugetlbfs_vfsmount[default_hstate_idx])
> + return 0;
>
> out:
> kmem_cache_destroy(hugetlbfs_inode_cachep);
> @@ -1042,8 +1073,13 @@ static int __init init_hugetlbfs_fs(void)
>
> static void __exit exit_hugetlbfs_fs(void)
> {
> + struct hstate *h;
> + int i;
> +
> kmem_cache_destroy(hugetlbfs_inode_cachep);
> - kern_unmount(hugetlbfs_vfsmount);
> + i = 0;
> + for_each_hstate (h)
> + kern_unmount(hugetlbfs_vfsmount[i++]);
> unregister_filesystem(&hugetlbfs_fs_type);
> bdi_destroy(&hugetlbfs_backing_dev_info);
> }
> diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
> index 32c8bd6..d2f35d8 100644
> --- a/include/asm-generic/mman.h
> +++ b/include/asm-generic/mman.h
> @@ -13,6 +13,19 @@
> #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
> #define MAP_HUGETLB 0x40000 /* create a huge page mapping */
>
> +/* Bits [26:31] are reserved */
> +
> +/*
> + * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
> + * This gives us 6 bits, which is enough until someone invents 128 bit address
> + * spaces.
> + *
> + * Assume these are all power of twos.
> + * When 0 use the default page size.
> + */
> +#define MAP_HUGE_SHIFT 26
> +#define MAP_HUGE_MASK 0x3f
> +
> #define MCL_CURRENT 1 /* lock all current mappings */
> #define MCL_FUTURE 2 /* lock all future mappings */
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d5d6bbe..78618a3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -154,7 +154,13 @@ extern const struct file_operations hugetlbfs_file_operations;
> extern const struct vm_operations_struct hugetlb_vm_ops;
> struct file *hugetlb_file_setup(const char *name, unsigned long addr,
> size_t size, vm_flags_t acct,
> - struct user_struct **user, int creat_flags);
> + struct user_struct **user, int creat_flags,
> + int page_size_log);
> +int hugetlb_get_quota(struct address_space *mapping, long delta);
> +void hugetlb_put_quota(struct address_space *mapping, long delta);
> +
> +int hugetlb_get_quota(struct address_space *mapping, long delta);
> +void hugetlb_put_quota(struct address_space *mapping, long delta);
>
> static inline int is_file_hugepages(struct file *file)
> {
> @@ -166,12 +172,19 @@ static inline int is_file_hugepages(struct file *file)
> return 0;
> }
>
> +
> +extern int max_hstate;
> +
> +#define for_each_hstate(h) \
> + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
> +
> #else /* !CONFIG_HUGETLBFS */
>
> #define is_file_hugepages(file) 0
> static inline struct file *
> hugetlb_file_setup(const char *name, unsigned long addr, size_t size,
> - vm_flags_t acctflag, struct user_struct **user, int creat_flags)
> + vm_flags_t acctflag, struct user_struct **user, int creat_flags,
> + int page_size_log)
> {
> return ERR_PTR(-ENOSYS);
> }
> diff --git a/include/linux/shm.h b/include/linux/shm.h
> index 92808b8..41aa305 100644
> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -100,12 +100,31 @@ struct shmid_kernel /* private to the kernel */
> struct task_struct *shm_creator;
> };
>
> +#endif
> +
> /* shm_mode upper byte flags */
> #define SHM_DEST 01000 /* segment will be destroyed on last detach */
> #define SHM_LOCKED 02000 /* segment will not be swapped */
> #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
> #define SHM_NORESERVE 010000 /* don't check for reservations */
>
> +/* Bits [26:31] are reserved */
> +
> +/*
> + * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size.
> + * This gives us 6 bits, which is enough until someone invents 128 bit address
> + * spaces.
> + *
> + * Assume these are all power of twos.
> + * When 0 use the default page size.
> + */
> +#define SHM_HUGE_SHIFT 26
> +#define SHM_HUGE_MASK 0x3f
> +#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
> +#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
> +
> +#ifdef __KERNEL__
> +
> #ifdef CONFIG_SYSVIPC
> long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
> extern int is_file_shm_hugepages(struct file *file);
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 5e2cbfd..2515004 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -483,7 +483,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
> if (shmflg & SHM_NORESERVE)
> acctflag = VM_NORESERVE;
> file = hugetlb_file_setup(name, 0, size, acctflag,
> - &shp->mlock_user, HUGETLB_SHMFS_INODE);
> + &shp->mlock_user, HUGETLB_SHMFS_INODE,
> + (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
> } else {
> /*
> * Do not allow no accounting for OVERCOMMIT_NEVER, even
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e198831..bcae924 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -34,7 +34,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
> static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
> unsigned long hugepages_treat_as_movable;
>
> -static int max_hstate;
> +int max_hstate;
> unsigned int default_hstate_idx;
> struct hstate hstates[HUGE_MAX_HSTATE];
>
> @@ -45,9 +45,6 @@ static struct hstate * __initdata parsed_hstate;
> static unsigned long __initdata default_hstate_max_huge_pages;
> static unsigned long __initdata default_hstate_size;
>
> -#define for_each_hstate(h) \
> - for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
> -
> /*
> * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
> */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3edfcdf..cc417ee 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1125,8 +1125,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
> * memory so no accounting is necessary
> */
> file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
> - VM_NORESERVE, &user,
> - HUGETLB_ANONHUGE_INODE);
> + VM_NORESERVE,
> + &user, HUGETLB_ANONHUGE_INODE,
> + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
> if (IS_ERR(file))
> return PTR_ERR(file);
> }
> --
> 1.7.7.6
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

--
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9
Czech Republic

2012-06-16 07:30:24

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH] MM: Support more pagesizes for MAP_HUGETLB/SHM_HUGETLB v2

(2012/06/13 8:13), Andi Kleen wrote:
> From: Andi Kleen<[email protected]>
>
> There was some desire in large applications using MAP_HUGETLB/SHM_HUGETLB
> to use 1GB huge pages on some mappings, and stay with 2MB on others. This
> is useful together with NUMA policy: use 2MB interleaving on some mappings,
> but 1GB on local mappings.
>
> This patch extends the IPC/SHM syscall interfaces slightly to allow specifying
> the page size.
>
> It borrows some upper bits in the existing flag arguments and allows encoding
> the log of the desired page size in addition to the *_HUGETLB flag.
> When 0 is specified the default size is used, this makes the change fully
> compatible.
>
> Extending the internal hugetlb code to handle this is straight forward. Instead
> of a single mount it just keeps an array of them and selects the right
> mount based on the specified page size.
>
> I also exported the new flags to the user headers
> (they were previously under __KERNEL__). Right now only symbols
> for x86 and some other architecture for 1GB and 2MB are defined.
> The interface should already work for all other architectures
> though.
>
> v2: Port to new tree. Fix unmount.
> Signed-off-by: Andi Kleen<[email protected]>

I like this.

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

BTW, do you have any plans to implement a 1GB page allocator?
I wonder whether the recent contiguous-memory-allocator work could be used for that...