[PATCH 06/06]
The following kernel components register a tunable structure and call the
auto-tuning routine:
. file system
. shared memory (per namespace)
. semaphore (per namespace)
. message queues (per namespace)
Signed-off-by: Nadia Derbey <[email protected]>
---
fs/file_table.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/akt.h | 1
include/linux/ipc.h | 6 +++
init/main.c | 1
ipc/msg.c | 19 ++++++++++++
ipc/sem.c | 41 ++++++++++++++++++++++++++
ipc/shm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++---
7 files changed, 218 insertions(+), 5 deletions(-)
Index: linux-2.6.20-rc4/fs/file_table.c
===================================================================
--- linux-2.6.20-rc4.orig/fs/file_table.c 2007-01-15 13:08:14.000000000 +0100
+++ linux-2.6.20-rc4/fs/file_table.c 2007-01-15 15:44:39.000000000 +0100
@@ -21,6 +21,8 @@
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/akt.h>
+#include <linux/akt_ops.h>
#include <asm/atomic.h>
@@ -34,6 +36,71 @@ __cacheline_aligned_in_smp DEFINE_SPINLO
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
+#ifdef CONFIG_AKT
+
+static int get_nr_files(void);
+
+/********** automatic tuning **********/
+#define FILPTHRESH 80 /* threshold = 80% */
+
+/*
+ * FUNCTION: This is the routine called to accomplish auto tuning for the
+ * max_files tunable (files_stat.max_files).
+ *
+ * Upwards adjustment:
+ * Adjustment is needed if nr_files has reached
+ * (threshold / 100 * max_files) and max_files is still below params->max.
+ * In that case, max_files is set to
+ * (max_files + max_files * (100 - threshold) / 100), capped at params->max
+ *
+ * Downwards adjustment:
+ * Adjustment is needed if nr_files has fallen under
+ * (threshold / 100 * max_files previous value) and max_files is above params->min.
+ * In that case max_files is set back to its previous value,
+ * i.e. to (max_files * 100 / (200 - threshold)), never below params->min
+ *
+ * PARAMETERS: cmd: controls the adjustment direction (AKT_UP = up; any other value = down)
+ * params: pointer to the registered tunable structure
+ *
+ * EXECUTION ENVIRONMENT: This routine should be called with the
+ * params->tunable_lck lock held
+ *
+ * RETURN VALUE: 1 if tunable has been adjusted
+ * 0 otherwise (max_files is left untouched)
+ */
+static inline int maxfiles_auto_tuning(int cmd, struct auto_tune *params)
+{
+ int thr = params->threshold; /* trigger level, used as a percentage below */
+ int min = params->min.value.val_int; /* NOTE(review): local named like min() used below - confirm intended */
+ int max = params->max.value.val_int; /* NOTE(review): local named like max() used below - confirm intended */
+ int tun = files_stat.max_files; /* current value of the tunable */
+
+ if (cmd == AKT_UP) {
+ if (get_nr_files() >= tun * thr / 100 && tun < max) { /* NOTE(review): int product tun * thr may overflow for very large max_files - confirm range */
+ int new = tun * (200 - thr) / 100; /* grow by (100 - thr) percent */
+
+ files_stat.max_files = min(max, new); /* never exceed the registered maximum */
+ return 1;
+ } else
+ return 0;
+ }
+
+ if (get_nr_files() < tun * thr / (200 - thr) && tun > min) { /* usage is below thr percent of the pre-growth limit */
+ int new = tun * 100 / (200 - thr); /* inverse of the upward step (integer division) */
+
+ files_stat.max_files = max(min, new); /* never drop below the registered minimum */
+ return 1;
+ } else
+ return 0;
+}
+
+#endif /* CONFIG_AKT */
+
+/* The maximum value will be known later on */
+DEFINE_TUNABLE(maxfiles_akt, FILPTHRESH, 0, 0, &files_stat.max_files,
+ &nr_files, int);
+
+
static inline void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
@@ -44,6 +111,8 @@ static inline void file_free(struct file
{
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+
+ activate_auto_tuning(AKT_DOWN, &maxfiles_akt);
}
/*
@@ -91,6 +160,8 @@ struct file *get_empty_filp(void)
static int old_max;
struct file * f;
+ activate_auto_tuning(AKT_UP, &maxfiles_akt);
+
/*
* Privileged users can go above max_files
*/
@@ -299,6 +370,16 @@ void __init files_init(unsigned long mem
files_stat.max_files = n;
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
+
+ set_tunable_min_max(maxfiles_akt, n, n * 2, int);
+ set_autotuning_routine(&maxfiles_akt, maxfiles_auto_tuning);
+
files_defer_init();
percpu_counter_init(&nr_files, 0);
}
+
+void __init files_late_init(void) /* registers the maxfiles_akt tunable */
+{
+ if (register_tunable(&maxfiles_akt)) /* a non-zero return is treated as failure */
+ printk(KERN_WARNING "Failed registering tunable file-max\n"); /* only warns; there is no other error handling here */
+}
Index: linux-2.6.20-rc4/include/linux/akt.h
===================================================================
--- linux-2.6.20-rc4.orig/include/linux/akt.h 2007-01-15 15:31:44.000000000 +0100
+++ linux-2.6.20-rc4/include/linux/akt.h 2007-01-15 15:45:29.000000000 +0100
@@ -295,5 +295,6 @@ static inline void init_auto_tuning(void
#endif /* CONFIG_AKT */
extern void fork_late_init(void);
+extern void files_late_init(void);
#endif /* AKT_H */
Index: linux-2.6.20-rc4/init/main.c
===================================================================
--- linux-2.6.20-rc4.orig/init/main.c 2007-01-15 15:09:27.000000000 +0100
+++ linux-2.6.20-rc4/init/main.c 2007-01-15 15:46:09.000000000 +0100
@@ -616,6 +616,7 @@ asmlinkage void __init start_kernel(void
page_writeback_init();
init_auto_tuning();
fork_late_init();
+ files_late_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
Index: linux-2.6.20-rc4/ipc/msg.c
===================================================================
--- linux-2.6.20-rc4.orig/ipc/msg.c 2007-01-15 13:08:15.000000000 +0100
+++ linux-2.6.20-rc4/ipc/msg.c 2007-01-15 15:48:16.000000000 +0100
@@ -36,6 +36,8 @@
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
+#include <linux/akt.h>
+#include <linux/akt_ops.h>
#include <asm/current.h>
#include <asm/uaccess.h>
@@ -94,6 +96,11 @@ static void __ipc_init __msg_init_ns(str
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;
ipc_init_ids(ids, ns->msg_ctlmni);
+
+#define MSGTHRESH 80
+
+ init_tunable_ipcns(ns, msgmni_akt, MSGTHRESH, MSGMNI, IPCMNI,
+ &ns->msg_ctlmni, &ids->in_use, int);
}
#ifdef CONFIG_IPC_NS
@@ -133,6 +140,10 @@ void msg_exit_ns(struct ipc_namespace *n
void __init msg_init(void)
{
__msg_init_ns(&init_ipc_ns, &init_msg_ids);
+
+ if (register_tunable(&init_ipc_ns.msgmni_akt))
+ printk(KERN_WARNING " Failed registering tunable msgmni\n");
+
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
@@ -262,6 +273,8 @@ asmlinkage long sys_msgget(key_t key, in
ns = current->nsproxy->ipc_ns;
+ activate_auto_tuning(AKT_UP, &ns->msgmni_akt);
+
mutex_lock(&msg_ids(ns).mutex);
if (key == IPC_PRIVATE)
ret = newque(ns, key, msgflg);
@@ -391,6 +404,7 @@ asmlinkage long sys_msgctl(int msqid, in
struct msg_queue *msq;
int err, version;
struct ipc_namespace *ns;
+ int destroyed = 0;
if (msqid < 0 || cmd < 0)
return -EINVAL;
@@ -555,11 +569,16 @@ asmlinkage long sys_msgctl(int msqid, in
}
case IPC_RMID:
freeque(ns, msq, msqid);
+ destroyed = 1;
break;
}
err = 0;
out_up:
mutex_unlock(&msg_ids(ns).mutex);
+
+ if (destroyed)
+ activate_auto_tuning(AKT_DOWN, &ns->msgmni_akt);
+
return err;
out_unlock_up:
msg_unlock(msq);
Index: linux-2.6.20-rc4/ipc/shm.c
===================================================================
--- linux-2.6.20-rc4.orig/ipc/shm.c 2007-01-15 13:08:15.000000000 +0100
+++ linux-2.6.20-rc4/ipc/shm.c 2007-01-15 15:49:00.000000000 +0100
@@ -37,6 +37,8 @@
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
+#include <linux/akt.h>
+#include <linux/akt_ops.h>
#include <asm/uaccess.h>
@@ -75,17 +77,27 @@ static void __ipc_init __shm_init_ns(str
ns->shm_ctlmni = SHMMNI;
ns->shm_tot = 0;
ipc_init_ids(ids, 1);
+
+#define SHMTHRESH 80
+ init_tunable_ipcns(ns, shmmni_akt, SHMTHRESH, SHMMNI, IPCMNI,
+ &ns->shm_ctlmni, &ids->in_use, int);
+ init_tunable_ipcns(ns, shmall_akt, SHMTHRESH, SHMALL,
+ SHMMAX / PAGE_SIZE * (IPCMNI / 16), &ns->shm_ctlall,
+ &ns->shm_tot, size_t);
}
-static void do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp)
+static int do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
if (shp->shm_nattch){
shp->shm_perm.mode |= SHM_DEST;
/* Do not find it any more */
shp->shm_perm.key = IPC_PRIVATE;
shm_unlock(shp);
- } else
+ return 0;
+ } else {
shm_destroy(ns, shp);
+ return 1;
+ }
}
#ifdef CONFIG_IPC_NS
@@ -125,6 +137,13 @@ void shm_exit_ns(struct ipc_namespace *n
void __init shm_init (void)
{
__shm_init_ns(&init_ipc_ns, &init_shm_ids);
+
+ if (register_tunable(&init_ipc_ns.shmmni_akt))
+ printk(KERN_WARNING "Failed registering tunable shmmni\n");
+
+ if (register_tunable(&init_ipc_ns.shmall_akt))
+ printk(KERN_WARNING "Failed registering tunable shmall\n");
+
ipc_init_proc_interface("sysvipc/shm",
" key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime\n",
IPC_SHM_IDS, sysvipc_shm_proc_show);
@@ -206,6 +225,7 @@ static void shm_close (struct vm_area_st
int id = file->f_path.dentry->d_inode->i_ino;
struct shmid_kernel *shp;
struct ipc_namespace *ns;
+ int destroyed = 0;
ns = shm_file_ns(file);
@@ -217,11 +237,27 @@ static void shm_close (struct vm_area_st
shp->shm_dtim = get_seconds();
shp->shm_nattch--;
if(shp->shm_nattch == 0 &&
- shp->shm_perm.mode & SHM_DEST)
+ shp->shm_perm.mode & SHM_DEST) {
shm_destroy(ns, shp);
- else
+ destroyed = 1;
+ } else
shm_unlock(shp);
mutex_unlock(&shm_ids(ns).mutex);
+
+ if (destroyed) {
+ int rc;
+
+ rc = activate_auto_tuning(AKT_DOWN, &ns->shmmni_akt);
+ if (rc)
+ /*
+ * shm_ctlmni has been adjusted ==> change
+ * shm_ctlall value
+ */
+ ns->shm_ctlall = ns->shm_ctlmax / PAGE_SIZE *
+ (ns->shm_ctlmni / 16);
+
+ activate_auto_tuning(AKT_DOWN, &ns->shmall_akt);
+ }
}
static int shm_mmap(struct file * file, struct vm_area_struct * vma)
@@ -355,9 +391,20 @@ asmlinkage long sys_shmget (key_t key, s
struct shmid_kernel *shp;
int err, id = 0;
struct ipc_namespace *ns;
+ int rc;
ns = current->nsproxy->ipc_ns;
+ rc = activate_auto_tuning(AKT_UP, &ns->shmmni_akt);
+ if (rc)
+ /*
+ * shm_ctlmni has been adjusted ==> change shm_ctlall value
+ */
+ ns->shm_ctlall = ns->shm_ctlmax / PAGE_SIZE
+ * (ns->shm_ctlmni / 16);
+
+ activate_auto_tuning(AKT_UP, &ns->shmall_akt);
+
mutex_lock(&shm_ids(ns).mutex);
if (key == IPC_PRIVATE) {
err = newseg(ns, key, shmflg, size);
@@ -516,6 +563,7 @@ asmlinkage long sys_shmctl (int shmid, i
struct shmid_kernel *shp;
int err, version;
struct ipc_namespace *ns;
+ int destroyed;
if (cmd < 0 || shmid < 0) {
err = -EINVAL;
@@ -701,8 +749,24 @@ asmlinkage long sys_shmctl (int shmid, i
if (err)
goto out_unlock_up;
- do_shm_rmid(ns, shp);
+ destroyed = do_shm_rmid(ns, shp);
mutex_unlock(&shm_ids(ns).mutex);
+
+ if (destroyed) {
+ int rc;
+
+ rc = activate_auto_tuning(AKT_DOWN, &ns->shmmni_akt);
+ if (rc)
+ /*
+ * shm_ctlmni has been adjusted ==> change
+ * shm_ctlall value
+ */
+ ns->shm_ctlall = ns->shm_ctlmax / PAGE_SIZE *
+ (ns->shm_ctlmni / 16);
+
+ activate_auto_tuning(AKT_DOWN, &ns->shmall_akt);
+ }
+
goto out;
}
Index: linux-2.6.20-rc4/ipc/sem.c
===================================================================
--- linux-2.6.20-rc4.orig/ipc/sem.c 2007-01-15 13:08:15.000000000 +0100
+++ linux-2.6.20-rc4/ipc/sem.c 2007-01-15 15:49:41.000000000 +0100
@@ -83,6 +83,8 @@
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
+#include <linux/akt.h>
+#include <linux/akt_ops.h>
#include <asm/uaccess.h>
#include "util.h"
@@ -131,6 +133,12 @@ static void __ipc_init __sem_init_ns(str
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
ipc_init_ids(ids, ns->sc_semmni);
+
+#define SEMTHRESH 80
+ init_tunable_ipcns(ns, semmni_akt, SEMTHRESH, SEMMNI, IPCMNI,
+ &(ns->sc_semmni), &ids->in_use, int);
+ init_tunable_ipcns(ns, semmns_akt, SEMTHRESH, SEMMNS,
+ IPCMNI * SEMMSL, &(ns->sc_semmns), &ns->used_sems, int);
}
#ifdef CONFIG_IPC_NS
@@ -170,6 +178,13 @@ void sem_exit_ns(struct ipc_namespace *n
void __init sem_init (void)
{
__sem_init_ns(&init_ipc_ns, &init_sem_ids);
+
+ if (register_tunable(&init_ipc_ns.semmni_akt))
+ printk(KERN_WARNING "Failed registering tunable semmni\n");
+
+ if (register_tunable(&init_ipc_ns.semmns_akt))
+ printk(KERN_WARNING "Failed registering tunable semmns\n");
+
ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
@@ -263,11 +278,22 @@ asmlinkage long sys_semget (key_t key, i
int id, err = -EINVAL;
struct sem_array *sma;
struct ipc_namespace *ns;
+ int rc;
ns = current->nsproxy->ipc_ns;
if (nsems < 0 || nsems > ns->sc_semmsl)
return -EINVAL;
+
+ rc = activate_auto_tuning(AKT_UP, &ns->semmni_akt);
+ if (rc)
+ /*
+ * sc_semmni has been adjusted ==> change sc_semmns value
+ */
+ ns->sc_semmns = ns->sc_semmni * ns->sc_semmsl;
+
+ activate_auto_tuning(AKT_UP, &ns->semmns_akt);
+
mutex_lock(&sem_ids(ns).mutex);
if (key == IPC_PRIVATE) {
@@ -899,6 +925,21 @@ static int semctl_down(struct ipc_namesp
case IPC_RMID:
freeary(ns, sma, semid);
err = 0;
+
+ {
+ int rc;
+
+ rc = activate_auto_tuning(AKT_DOWN, &ns->semmni_akt);
+ if (rc)
+ /*
+ * sc_semmni has been adjusted ==>
+ * change sc_semmns value
+ */
+ ns->sc_semmns = ns->sc_semmni * ns->sc_semmsl;
+
+ activate_auto_tuning(AKT_DOWN, &ns->semmns_akt);
+ }
+
break;
case IPC_SET:
ipcp->uid = setbuf.uid;
Index: linux-2.6.20-rc4/include/linux/ipc.h
===================================================================
--- linux-2.6.20-rc4.orig/include/linux/ipc.h 2007-01-15 13:08:15.000000000 +0100
+++ linux-2.6.20-rc4/include/linux/ipc.h 2007-01-15 15:52:19.000000000 +0100
@@ -52,6 +52,7 @@ struct ipc_perm
#ifdef __KERNEL__
#include <linux/kref.h>
+#include <linux/akt.h>
#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
@@ -77,15 +78,20 @@ struct ipc_namespace {
int sem_ctls[4];
int used_sems;
+ DECLARE_TUNABLE(semmni_akt);
+ DECLARE_TUNABLE(semmns_akt);
int msg_ctlmax;
int msg_ctlmnb;
int msg_ctlmni;
+ DECLARE_TUNABLE(msgmni_akt);
size_t shm_ctlmax;
size_t shm_ctlall;
int shm_ctlmni;
int shm_tot;
+ DECLARE_TUNABLE(shmmni_akt);
+ DECLARE_TUNABLE(shmall_akt);
};
extern struct ipc_namespace init_ipc_ns;
--
> On Tue, 16 Jan 2007 07:15:22 +0100 [email protected] wrote:
> The following kernel components register a tunable structure and call the
> auto-tuning routine:
> . file system
> . shared memory (per namespace)
> . semaphore (per namespace)
> . message queues (per namespace)
This is the part of the patch series which really matters, and I just don't
understand it :(
Why do we want to autotune these things? What problem is this patch series
solving? Please describe this part of the work much, much more completely,
so we can understand the need to add such a large amount of code to the
kernel.
It seems strange that the whole feature is Kconfigurable. Please also
explain the thinking behind that.
I suspect the patches would be much simpler if you simply required that all
these new tunables be of type `long'. About seven eighths of the code
would go away. As would most of those eye-popping macros.
Andrew Morton wrote:
>>On Tue, 16 Jan 2007 07:15:22 +0100 [email protected] wrote:
>>The following kernel components register a tunable structure and call the
>>auto-tuning routine:
>> . file system
>> . shared memory (per namespace)
>> . semaphore (per namespace)
>> . message queues (per namespace)
>
>
> This is the part of the patch series which really matters, and I just don't
> understand it :(
>
> Why do we want to autotune these things? What problem is this patch series
> solving? Please describe this part of the work much, much more completely,
> so we can understand the need to add such a large amount of code to the
> kernel.
1) why these tunables?
The ipc tunables have been selected as "guinea-pig" tunables for the AKT
framework because they are likely to be often used in data bases. This
applies to file-max too.
Now, if the framework itself is accepted, the set of impacted tunables
can easily be enhanced.
2) why autotuning:
There are at least 3 cases where it can be useful
. for workloads that are known to need a big amount of a given resource
type (say shared memories), but we don't know what the maximum amount
needed will be
. to solve the case of multiple applications running on a single system
that all need the same tunable to be adjusted to fit their needs
. to make a system correctly react to possible peak loads for a given
resource usage, i.e. make it tune up *and down* as needed.
In all these cases, the akt framework will enable the kernel to adapt to
increasing / decreasing resource consumption:
1) avoid allocating "a priori" a big amount of memory that will be used
only in extreme cases. This is the effect of doing an "echo <huge_value>
> /proc/sys/kernel/shmmni"
2) the system will come back to the default values as soon as the peak
load is over.
>
> It seems strange that the whole feature is Kconfigurable. Please also
> explain the thinking behind that.
We wanted to make it configurable because it adds some overhead in terms of
1) generated kernel size
2) instructions added to the resource creation / removal code paths even
if auto-tuning is not activated for the corresponding tunable ->
performance impact.
>
> I suspect the patches would be much simpler if you simply required that all
> these new tunables be of type `long'. About seven eighths of the code
> would go away. As would most of those eye-popping macros.
>
Yes, agree with you: the idea here was to make the framework more
generic. But I can change that.
Regards,
Nadia
Nadia Derbey <[email protected]> writes:
>
> 2) why autotuning:
> There are at least 3 cases where it can be useful
> . for workloads that are known to need a big amount of a given resource type
> (say shared memories), but we don't know what the maximum amount needed will be
> . to solve the case of multiple applications running on a single system, and
> that need the same tunable to be adjusted to feet their needs
> . to make a system correctly react to eventual peak loads for a given resource
> usage, i.e. make it tune up *and down* as needed.
>
> In all these cases, the akt framework will enable the kernel to adapt to
> increasing / decreasing resource consumption:
> 1) avoid allocating "a priori" a big amount of memory that will be used only in
> extreme cases. This is the effect of doing an "echo <huge_value>
>> /proc/sys/kernel/shmmni"
>
> 2) the system will come back to the default values as soon as the peak load is
> over.
At least the ipc ones are supposed to be DOS limits not behavior
modifiers. I do admit from looking at the code that there are some
consequences of increasing things like shmmni. However I think we
would be better off with better data structures and implementations
that remove these consequences than this autotuning of
denial-of-service limits.
i.e. I think you are treating the symptom not the problem.
Does this make sense?
Eric
Eric W. Biederman wrote:
> Nadia Derbey <[email protected]> writes:
>
>
>>2) why autotuning:
>>There are at least 3 cases where it can be useful
>>. for workloads that are known to need a big amount of a given resource type
>>(say shared memories), but we don't know what the maximum amount needed will be
>>. to solve the case of multiple applications running on a single system, and
>>that need the same tunable to be adjusted to feet their needs
>>. to make a system correctly react to eventual peak loads for a given resource
>>usage, i.e. make it tune up *and down* as needed.
>
>
>>In all these cases, the akt framework will enable the kernel to adapt to
>>increasing / decreasing resource consumption:
>>1) avoid allocating "a priori" a big amount of memory that will be used only in
>>extreme cases. This is the effect of doing an "echo <huge_value>
>>
>>>/proc/sys/kernel/shmmni"
>>
>>2) the system will come back to the default values as soon as the peak load is
>>over.
>
>
> At least the ipc ones are supposed to be DOS limits not behavior
> modifiers. I do admit from looking at the code that there are some
> consequences of increasing things like shmmni. However I think we
> would be better off with better data structures and implementations
> that remove these consequences than this autotuning of
> denial-of-service limits.
>
I do not fully agree with you:
It is true that some ipc tunables play the role of DoS limits.
But IMHO the *mni ones (semmni, msgmni, shmmni) are used by the ipc
subsystem to adapt its data structures sizes to what is being asked for
through the tunable value. I think this is how they manage to take into
account a new tunable value without a need for rebooting the system:
reallocate some more memory on demand.
Now, what the akt framework does, is that it takes advantage of this
concept of "on demand memory allocation" to replace a user (or a daemon)
that would periodically check its ipcs consumptions and manually adjust
the ipcs tunables: Doing this from the user space would imply a latency
that makes it difficult to react fast enough to resources running out.
Now, talking about DoS limits, akt implements them in a sense: each
tunable managed by akt has 3 attributes exported to sysfs:
. autotune: enable / disable auto-tuning
. min: min value the tunable can ever reach
. max: max value the tunable can ever reach
Enabling a sysadmin to play with these min and max values makes it
possible to refine the dynamic adjustment, and to prevent the tunable
from reaching really huge values.
Regards,
Nadia
Nadia Derbey <[email protected]> writes:
> I do not fully agree with you:
> It is true that some ipc tunables play the role of DoS limits.
> But IMHO the *mni ones (semmni, msgmni, shmmni) are used by the ipc subsystem to
> adapt its data structures sizes to what is being asked for through the tunable
> value. I think this is how they manage to take into account a new tunable value
> without a need for rebooting the system: reallocate some more memory on demand.
Yes, they do. However if you are constantly having to play with shmmni or
the others that is the problem and the array should be replaced with
a hash table or some form of radix tree, so it changes its size to fit
the need. Once that is done, shmmni does become a simple DOS limit.
So what I'm asking is please fix the problem at the source don't plaster over
it.
> Now, what the akt framework does, is that it takes advantage of this concept of
> "on demand memory allocation" to replace a user (or a daemon) that would
> periodically check its ipcs consumptions and manually adjust the ipcs tunables:
> Doing this from the user space would imply a latency that makes it difficult to
> react fast enough to resources running out.
There may be some sense in this but you haven't found something that inherently
needs tuning. You have found something that has a poor data structure,
and can more easily be fixed by simply fixing the data structure.
I'm guessing that we have a disconnect somewhere with kernel developers thinking
shm is an old legacy api and doing minimal maintenance, expecting serious users
to use tmpfs or hugetlbfs and users not used to the old stuff using the SYSV apis.
If we have serious users it makes sense to fix these things properly, in a backwards
compatible way, so existing users and applications don't need to be changed.
> Now, talking about DoS limits, akt implements them in a sense: each tunable
> managed by akt has 3 attributes exported to sysfs:
> . autotune: enable / disable auto-tuning
> . min: min value the tunable can ever reach
> . max: max value the tunable can ever reach
>
> Enabling a sysadmin to play with these min and max values makes it possible to
> refine the dynamic adjustment, and avoid that the tunable reaches really huge
> values.
This just shifts the location where you have your DOS limit and could
be done transparently under the covers with shmmni being the maximum.
If we can't get users to switch to something that doesn't need tuning
that has been available for years, I doubt even more user tunables
that tune the tunables will make the situation any better. I suspect
your changes would just confuse the landscape even more and give us
more weird legacy cases to support that we can never get rid of?
Eric
Eric W. Biederman wrote:
> Nadia Derbey <[email protected]> writes:
>
>
>>I do not fully agree with you:
>>It is true that some ipc tunables play the role of DoS limits.
>>But IMHO the *mni ones (semmni, msgmni, shmmni) are used by the ipc subsystem to
>>adapt its data structures sizes to what is being asked for through the tunable
>>value. I think this is how they manage to take into account a new tunable value
>>without a need for rebooting the system: reallocate some more memory on demand.
>
>
> Yes, they do. However if you are constantly having to play with shmmni or
> the others that is the problem and the array should be replaced with
> a hash table or some form of radix tree, so it changes it's size to fit
> the need. Once that is done, shmmni does become a simple DOS limit.
>
> So what I'm asking is please fix the problem at the source don't plaster over
> it.
>
>
>>Now, what the akt framework does, is that it takes advantage of this concept of
>>"on demand memory allocation" to replace a user (or a daemon) that would
>>periodically check its ipcs consumptions and manually adjust the ipcs tunables:
>>Doing this from the user space would imply a latency that makes it difficult to
>>react fast enough to resources running out.
>
>
> There may be some sense in this but you haven't found something that inherently
> needs tuning. You have found something that has a poor data structure,
> and can more easily be fixed by simply fixing the data structure.
So, should I understand from this that automatic tuning and the AKT
framework itself would make sense, provided that I find the right tunables
it should be applied to?
Actually, I don't know if you had the opportunity to read all the patches,
but there are 2 other tunables AKT is proposed to be applied to:
. max_threads, the tunable limit on nr_threads
. max_files, the tunable limit on nr_files
Regards,
Nadia
Nadia Derbey <[email protected]> writes:
> So, should I understand from this that automatic tuning and the AKT framework
> itself would make sense, given that I find the rigth tunables it should be
> applied to?
Sort of. The concept of things tuning themselves automatically makes
a lot of sense.
I'm not at all certain about tunables being exported just to be hidden
again. Ideally you don't even want the fact that these things are
varying visible to the user.
So I think that if you can find a good example that cannot be solved
better another way, you can build a case for your framework.
Currently I doubt you can find such a case.
> Actually, dont' know if you had the opportunity to read all the patches, but
> there are 2 other tunables AKT is proposed to be applied to:
> . max_threads, the tunable limit on nr_threads
> . max_files, the tunable limit on nr_files
At a quick glance max_threads and max_files appear even more to be
DOS limits and not tunables and even less applicable to needing any
tuning at all. My gut feel is at worst these values may need a little
better boot time defaults but otherwise they should be good.
Eric
ebiederm wrote:
> At a quick glance max_threads and max_files appear even more to be
> DOS limits and not tunables and even less applicable to needing any
> tuning at all. My gut feel is at worst these values may need a little
> better boot time defaults but otherwise they the should be good.
Autotuning max_threads and max_files by using some sort of rate-limiter could
possibly be more useful than any kind of fixed default.
Thanks!
--
Al
Eric W. Biederman wrote:
> Nadia Derbey <[email protected]> writes:
>
>
>>So, should I understand from this that automatic tuning and the AKT framework
>>itself would make sense, given that I find the rigth tunables it should be
>>applied to?
>
>
> Sort of. The concept of things tuning themselves automatically makes
> a lot of sense.
>
> I'm not at all certain about tunables being exported just to be hidden
> again. Ideally you don't even want the fact that these things are
> varying visible to the user.
>
> So I think that if you can find a good example that cannot be solved
> better another way, you can build a case for your framework.
> Currently I am doubt you can find such a case.
>
>
>>Actually, dont' know if you had the opportunity to read all the patches, but
>>there are 2 other tunables AKT is proposed to be applied to:
>>. max_threads, the tunable limit on nr_threads
>>. max_files, the tunable limit on nr_files
>
>
> At a quick glance max_threads and max_files appear even more to be
> DOS limits and not tunables and even less applicable to needing any
> tuning at all. My gut feel is at worst these values may need a little
> better boot time defaults but otherwise they the should be good.
>
But what do you do with Oracle, which asks for maxfiles to be set to
0x10000, while the default value might be enough for a system that's not
running Oracle?
I'm afraid that by giving boot-time values to the max_* tunables we will
lose all the benefits of /proc (or /sys): it is impossible to
anticipate what an OS will be used for. So allowing such things to be
changed without having to reboot the machine is in my mind quite a
powerful feature we should keep taking advantage of.
Regards,
Nadia
Nadia Derbey <[email protected]> writes:
> But, what do you do with Oracle that's asking maxfiles to be set to 0x10000,
> while the default value might be enough for a system that's not running Oracle.
> I'm afraid that giving boot time values to the max_* tunables we will loose all
> the benefits from /proc (or /sys): it is impossible to anticipate what an OS
> will be used for. So allowing such things to be changed without having to reboot
> the machine is in my mind quite a powerful feature we should keep taking
> adavntage of.
I'm not saying remove user space's ability to set the
denial-of-service limits. I'm saying if they need to be frequently
changed we need to update the default so they are higher by default.
There really is no cost in moving those values up and down it is just
an arbitrary integer used in comparisons. But if we can make a good
guess that still catches runaway programs before they kill the machine
but also allows more programs to work out of the box we are in better
shape.
Eric
Eric W. Biederman wrote:
> Nadia Derbey <[email protected]> writes:
>
>
>>But, what do you do with Oracle that's asking maxfiles to be set to 0x10000,
>>while the default value might be enough for a system that's not running Oracle.
>>I'm afraid that giving boot time values to the max_* tunables we will loose all
>>the benefits from /proc (or /sys): it is impossible to anticipate what an OS
>>will be used for. So allowing such things to be changed without having to reboot
>>the machine is in my mind quite a powerful feature we should keep taking
>>adavntage of.
>
>
> I'm not saying remove user spaces' ability to set the
> denial-of-service limits. I'm saying if they need to be frequently
> changed we need to update the default so they are higher by default.
>
> There really is no cost in moving those values up and down it is just
> an arbitrary integer used in comparisons. But if we can make a good
> guess that still catches runaway programs before they kill the machine
> but also allows more programs to work out of the box we are in better
> shape.
>
OK, happy to see we are on the same wavelength (and sorry for
misunderstanding what you were saying ;-) )
Regards,
Nadia