There is currently no way to set the transparent hugepage defrag options
at boot time. It is useful to be able to configure them from the kernel
command line instead of relying on a systemd/upstart service or putting
the command that sets up defrag inside /etc/rc.local.
Signed-off-by: Gavin Guo <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 18 ++++++++
mm/huge_memory.c | 43 +++++++++++++++++++
mm/khugepaged.c | 21 +++++++++
3 files changed, 82 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6253849afac2..a9fd020d78db 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2149,6 +2149,16 @@
kgdbwait [KGDB] Stop kernel execution and enter the
kernel debugger at the earliest opportunity.
+ khugepaged_defrag=
+ [KNL]
+ Format: { "0" | "1" }
+ 0 - disable the defrag
+ 1 - enable the defrag
+ Control the defrag efforts when generating the
+ transparent hugepages through khugepaged.
+ See Documentation/admin-guide/mm/transhuge.rst
+ for more details.
+
kmac= [MIPS] korina ethernet MAC address.
Configure the RouterBoard 532 series on-chip
Ethernet adapter MAC address.
@@ -5146,6 +5156,14 @@
See Documentation/admin-guide/mm/transhuge.rst
for more details.
+ transparent_hugepage_defrag=
+ [KNL]
+ Format: [always|defer|defer+madvise|madvise|never]
+ Control the defrag efforts when generating the
+ transparent hugepages.
+ See Documentation/admin-guide/mm/transhuge.rst
+ for more details.
+
tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8091b780cd7a..86b20a3a1aac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -481,6 +481,49 @@ static int __init setup_transparent_hugepage(char *str)
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
+static int __init setup_transparent_hugepage_defrag(char *str)
+{ /* Apply the defrag mode named by str to the DEFRAG_* bits of transparent_hugepage_flags. */
+ int ret = 0; /* stays 0 unless str matches one of the five modes below */
+ if (!str)
+ goto out; /* no value given: skip parsing and emit the warning */
+ if (!strcmp(str, "always")) { /* only the DIRECT bit ends up set */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "defer+madvise")) { /* only the KSWAPD_OR_MADV bit ends up set */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "defer")) { /* only the KSWAPD bit ends up set */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) { /* only the REQ_MADV bit ends up set */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) { /* all four defrag bits are cleared */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret) /* NULL or unrecognised string: flags were left untouched */
+ pr_warn("transparent_hugepage_defrag= cannot parse, ignored\n");
+ return ret; /* 1 when a mode was applied, 0 when the value was rejected */
+}
+__setup("transparent_hugepage_defrag=", setup_transparent_hugepage_defrag);
+
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b043c40a21d4..39bbf2107a23 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -394,6 +394,27 @@ int __init khugepaged_init(void)
return 0;
}
+static int __init setup_khugepaged_defrag(char *str)
+{ /* Set or clear the KHUGEPAGED defrag bit of transparent_hugepage_flags from str ("0" or "1"). */
+ int ret = 0; /* stays 0 unless str is exactly "0" or "1" */
+ if (!str)
+ goto out; /* no value given: skip parsing and emit the warning */
+ if (!strcmp(str, "0")) { /* "0": clear the bit */
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "1")) { /* "1": set the bit */
+ set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret) /* NULL or any other string: the flag was left untouched */
+ pr_warn("khugepaged_defrag= cannot parse, ignored\n");
+ return ret; /* 1 when the value was applied, 0 when it was rejected */
+}
+__setup("khugepaged_defrag=", setup_khugepaged_defrag);
+
void __init khugepaged_destroy(void)
{
kmem_cache_destroy(mm_slot_cache);
--
2.17.1
On 6/3/20 8:50 AM, Gavin Guo wrote:
> There is no way to set up the defrag options in boot time. And it's
> useful to set it up by default instead of making it work by a
> systemd/upstart service or put the command to set up defrag inside
> /etc/rc.local.
>
> Signed-off-by: Gavin Guo <[email protected]>
Well, maybe instead of adding these handlers, we could extend the new boot
parameter sysctl support (handling procfs /proc/sys/) to sysfs (/sys) as well,
as Eric already suggested? [1]
[1] https://lore.kernel.org/linux-api/[email protected]/
> ---
> .../admin-guide/kernel-parameters.txt | 18 ++++++++
> mm/huge_memory.c | 43 +++++++++++++++++++
> mm/khugepaged.c | 21 +++++++++
> 3 files changed, 82 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 6253849afac2..a9fd020d78db 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2149,6 +2149,16 @@
> kgdbwait [KGDB] Stop kernel execution and enter the
> kernel debugger at the earliest opportunity.
>
> + khugepaged_defrag=
> + [KNL]
> + Format: { "0" | "1" }
> + 0 - disable the defrag
> + 1 - enable the defrag
> + Control the defrag efforts when generating the
> + transparent hugepages through khugepaged.
> + See Documentation/admin-guide/mm/transhuge.rst
> + for more details.
> +
> kmac= [MIPS] korina ethernet MAC address.
> Configure the RouterBoard 532 series on-chip
> Ethernet adapter MAC address.
> @@ -5146,6 +5156,14 @@
> See Documentation/admin-guide/mm/transhuge.rst
> for more details.
>
> + transparent_hugepage_defrag=
> + [KNL]
> + Format: [always|defer|defer+madvise|madvise|never]
> + Control the defrag efforts when generating the
> + transparent hugepages.
> + See Documentation/admin-guide/mm/transhuge.rst
> + for more details.
> +
> tsc= Disable clocksource stability checks for TSC.
> Format: <string>
> [x86] reliable: mark tsc clocksource as reliable, this
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8091b780cd7a..86b20a3a1aac 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -481,6 +481,49 @@ static int __init setup_transparent_hugepage(char *str)
> }
> __setup("transparent_hugepage=", setup_transparent_hugepage);
>
> +static int __init setup_transparent_hugepage_defrag(char *str)
> +{
> + int ret = 0;
> + if (!str)
> + goto out;
> + if (!strcmp(str, "always")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
> + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
> + ret = 1;
> + } else if (!strcmp(str, "defer+madvise")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
> + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
> + ret = 1;
> + } else if (!strcmp(str, "defer")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
> + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
> + ret = 1;
> + } else if (!strcmp(str, "madvise")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
> + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
> + ret = 1;
> + } else if (!strcmp(str, "never")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
> + ret = 1;
> + }
> +out:
> + if (!ret)
> + pr_warn("transparent_hugepage_defrag= cannot parse, ignored\n");
> + return ret;
> +}
> +__setup("transparent_hugepage_defrag=", setup_transparent_hugepage_defrag);
> +
> pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
> {
> if (likely(vma->vm_flags & VM_WRITE))
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index b043c40a21d4..39bbf2107a23 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -394,6 +394,27 @@ int __init khugepaged_init(void)
> return 0;
> }
>
> +static int __init setup_khugepaged_defrag(char *str)
> +{
> + int ret = 0;
> + if (!str)
> + goto out;
> + if (!strcmp(str, "0")) {
> + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
> + &transparent_hugepage_flags);
> + ret = 1;
> + } else if (!strcmp(str, "1")) {
> + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
> + &transparent_hugepage_flags);
> + ret = 1;
> + }
> +out:
> + if (!ret)
> + pr_warn("khugepaged_defrag= cannot parse, ignored\n");
> + return ret;
> +}
> +__setup("khugepaged_defrag=", setup_khugepaged_defrag);
> +
> void __init khugepaged_destroy(void)
> {
> kmem_cache_destroy(mm_slot_cache);
>
On Wed, 3 Jun 2020, Vlastimil Babka wrote:
> > There is no way to set up the defrag options in boot time. And it's
> > useful to set it up by default instead of making it work by a
> > systemd/upstart service or put the command to set up defrag inside
> > /etc/rc.local.
> >
> > Signed-off-by: Gavin Guo <[email protected]>
>
> Well, maybe instead of adding these handlers, we could extend the new boot
> parameter sysctl support (handling procfs /proc/sys/) to sysfs (/sys) as well,
> as Eric already suggested? [1]
>
> [1] https://lore.kernel.org/linux-api/[email protected]/
>
Fully agreed, I think the solution needs to be more generic since thp
defrag isn't special here. With the generic support to tune sysctls and
sysfs tunables from the command line it seems like this patch would be
redundant.
On Thu, Jun 4, 2020 at 3:27 AM David Rientjes <[email protected]> wrote:
>
> On Wed, 3 Jun 2020, Vlastimil Babka wrote:
>
> > > There is no way to set up the defrag options in boot time. And it's
> > > useful to set it up by default instead of making it work by a
> > > systemd/upstart service or put the command to set up defrag inside
> > > /etc/rc.local.
> > >
> > > Signed-off-by: Gavin Guo <[email protected]>
> >
> > > Well, maybe instead of adding these handlers, we could extend the new boot
> > parameter sysctl support (handling procfs /proc/sys/) to sysfs (/sys) as well,
> > as Eric already suggested? [1]
> >
> > [1] https://lore.kernel.org/linux-api/[email protected]/
> >
>
> Fully agreed, I think the solution needs to be more generic since thp
> defrag isn't special here. With the generic support to tune sysctls and
> sysfs tunables from the command line it seems like this patch would be
> redundant.
Agreed, I'll try to investigate more on how to do that in a generic way.