From: Pierre Peiffer <[email protected]>
This patch provides three new API to change the ID of an existing
System V IPCs.
These APIs are:
long msg_chid(struct ipc_namespace *ns, int id, int newid);
long sem_chid(struct ipc_namespace *ns, int id, int newid);
long shm_chid(struct ipc_namespace *ns, int id, int newid);
They return 0 or an error code in case of failure.
They may be useful for setting a specific ID for an IPC when preparing
a restart operation.
To be successful, the following rules must be respected:
- the IPC exists (of course...)
- the new ID must satisfy the ID computation rule.
- the entry in the idr corresponding to the new ID must be free.
Signed-off-by: Pierre Peiffer <[email protected]>
Acked-by: Serge Hallyn <[email protected]>
---
include/linux/msg.h | 2 ++
include/linux/sem.h | 2 ++
include/linux/shm.h | 3 +++
ipc/msg.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
ipc/sem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
ipc/shm.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
ipc/util.h | 1 +
8 files changed, 197 insertions(+)
Index: b/include/linux/msg.h
===================================================================
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -63,6 +63,7 @@ struct msginfo {
#ifdef __KERNEL__
#include <linux/list.h>
+#include <linux/ipc_namespace.h>
/* one msg_msg structure for each message */
struct msg_msg {
@@ -96,6 +97,7 @@ extern long do_msgsnd(int msqid, long mt
size_t msgsz, int msgflg);
extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
size_t msgsz, long msgtyp, int msgflg);
+long msg_chid(struct ipc_namespace *ns, int id, int newid);
#endif /* __KERNEL__ */
Index: b/include/linux/sem.h
===================================================================
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -138,9 +138,11 @@ struct sysv_sem {
};
#ifdef CONFIG_SYSVIPC
+#include <linux/ipc_namespace.h>
extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
extern void exit_sem(struct task_struct *tsk);
+long sem_chid(struct ipc_namespace *ns, int id, int newid);
#else
static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
Index: b/include/linux/shm.h
===================================================================
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -104,8 +104,11 @@ struct shmid_kernel /* private to the ke
#define SHM_NORESERVE 010000 /* don't check for reservations */
#ifdef CONFIG_SYSVIPC
+#include <linux/ipc_namespace.h>
+
long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
extern int is_file_shm_hugepages(struct file *file);
+long shm_chid(struct ipc_namespace *ns, int id, int newid);
#else
static inline long do_shmat(int shmid, char __user *shmaddr,
int shmflg, unsigned long *addr)
Index: b/ipc/msg.c
===================================================================
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -291,6 +291,51 @@ asmlinkage long sys_msgget(key_t key, in
return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
}
+/* must be called with mutex and msq locks held */
+static long msg_chid_nolock(struct ipc_namespace *ns, struct msg_queue *msq,
+ int newid)
+{
+ long err;
+
+ err = ipc_chid(&msg_ids(ns), msq->q_perm.id, newid);
+ if (!err)
+ msq->q_ctime = get_seconds();
+
+ return err;
+}
+
+/* API to use for changing an id from kernel space, not from the syscall, as
+ there is no permission check done here */
+long msg_chid(struct ipc_namespace *ns, int id, int newid)
+{
+ long err;
+ struct msg_queue *msq;
+
+retry:
+ err = idr_pre_get(&msg_ids(ns).ipcs_idr, GFP_KERNEL);
+ if (!err)
+ return -ENOMEM;
+
+ down_write(&msg_ids(ns).rw_mutex);
+ msq = msg_lock_check(ns, id);
+
+ if (IS_ERR(msq)) {
+ up_write(&msg_ids(ns).rw_mutex);
+ return PTR_ERR(msq);
+ }
+
+ err = msg_chid_nolock(ns, msq, newid);
+
+ msg_unlock(msq);
+ up_write(&msg_ids(ns).rw_mutex);
+
+ /* ipc_chid may return -EAGAIN in case of memory requirement */
+ if (err == -EAGAIN)
+ goto retry;
+
+ return err;
+}
+
static inline unsigned long
copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
{
Index: b/ipc/sem.c
===================================================================
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -564,6 +564,57 @@ static void freeary(struct ipc_namespace
ipc_rcu_putref(sma);
}
+/* must be called with rw_mutex and sma locks held */
+static long sem_chid_nolock(struct ipc_namespace *ns, struct sem_array *sma,
+ int newid)
+{
+ long err;
+
+ err = ipc_chid(&sem_ids(ns), sma->sem_perm.id, newid);
+
+ if (!err) {
+ struct sem_undo *un;
+ for (un = sma->undo; un; un = un->id_next)
+ un->semid = newid;
+
+ sma->sem_ctime = get_seconds();
+ }
+
+ return err;
+}
+
+/* API to use for changing an id from kernel space, not from the syscall, as
+ there is no permission check done here */
+long sem_chid(struct ipc_namespace *ns, int id, int newid)
+{
+ long err;
+ struct sem_array *sma;
+
+retry:
+ err = idr_pre_get(&sem_ids(ns).ipcs_idr, GFP_KERNEL);
+ if (!err)
+ return -ENOMEM;
+
+ down_write(&sem_ids(ns).rw_mutex);
+ sma = sem_lock_check(ns, id);
+
+ if (IS_ERR(sma)) {
+ up_write(&sem_ids(ns).rw_mutex);
+ return PTR_ERR(sma);
+ }
+
+ err = sem_chid_nolock(ns, sma, newid);
+
+ sem_unlock(sma);
+ up_write(&sem_ids(ns).rw_mutex);
+
+ /* ipc_chid may return -EAGAIN in case of memory requirement */
+ if (err == -EAGAIN)
+ goto retry;
+
+ return err;
+}
+
static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
switch(version) {
Index: b/ipc/shm.c
===================================================================
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -162,7 +162,52 @@ static inline int shm_addid(struct ipc_n
return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
}
+/* must be called with mutex and shp locks held */
+static long shm_chid_nolock(struct ipc_namespace *ns, struct shmid_kernel *shp,
+ int newid)
+{
+ long err;
+
+ err = ipc_chid(&shm_ids(ns), shp->shm_perm.id, newid);
+ if (!err) {
+ shp->shm_file->f_dentry->d_inode->i_ino = newid;
+ shp->shm_ctim = get_seconds();
+ }
+
+ return err;
+}
+
+/* API to use for changing an id from kernel space, not from the syscall, as
+ there is no permission check done here */
+long shm_chid(struct ipc_namespace *ns, int id, int newid)
+{
+ long err;
+ struct shmid_kernel *shp;
+
+retry:
+ err = idr_pre_get(&shm_ids(ns).ipcs_idr, GFP_KERNEL);
+ if (!err)
+ return -ENOMEM;
+
+ down_write(&shm_ids(ns).rw_mutex);
+ shp = shm_lock_check(ns, id);
+
+ if (IS_ERR(shp)) {
+ up_write(&shm_ids(ns).rw_mutex);
+ return PTR_ERR(shp);
+ }
+
+ err = shm_chid_nolock(ns, shp, newid);
+ shm_unlock(shp);
+ up_write(&shm_ids(ns).rw_mutex);
+
+ /* ipc_chid may return -EAGAIN in case of memory requirement */
+ if (err == -EAGAIN)
+ goto retry;
+
+ return err;
+}
/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
Index: b/ipc/util.c
===================================================================
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -363,6 +363,54 @@ retry:
/**
+ * ipc_chid - change an IPC identifier
+ * @ids: IPC identifier set
+ * @oldid: ID of the IPC permission set to move
+ * @newid: new ID of the IPC permission set to move
+ *
+ * Move an entry in the IPC idr from the 'oldid' place to the
+ * 'newid' place. The seq number of the entry is updated to match the
+ * 'newid' value.
+ *
+ * Called with the ipc lock and ipc_ids.rw_mutex held.
+ */
+int ipc_chid(struct ipc_ids *ids, int oldid, int newid)
+{
+ struct kern_ipc_perm *p;
+ int old_lid = oldid % SEQ_MULTIPLIER;
+ int new_lid = newid % SEQ_MULTIPLIER;
+
+ if (newid != (new_lid + (newid/SEQ_MULTIPLIER)*SEQ_MULTIPLIER))
+ return -EINVAL;
+
+ p = idr_find(&ids->ipcs_idr, old_lid);
+
+ if (!p)
+ return -EINVAL;
+
+ /* The idx in the idr may be the same but not the seq number. */
+ if (new_lid != old_lid) {
+ int id, err;
+
+ err = idr_get_new_above(&ids->ipcs_idr, p, new_lid, &id);
+ if (err)
+ return err;
+
+ /* do we get our wished id ? */
+ if (id == new_lid) {
+ idr_remove(&ids->ipcs_idr, old_lid);
+ } else {
+ idr_remove(&ids->ipcs_idr, id);
+ return -EBUSY;
+ }
+ }
+
+ p->id = newid;
+ p->seq = newid/SEQ_MULTIPLIER;
+ return 0;
+}
+
+/**
* ipc_rmid - remove an IPC identifier
* @ids: IPC identifier set
* @ipcp: ipc perm structure containing the identifier to remove
Index: b/ipc/util.h
===================================================================
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -85,6 +85,7 @@ int ipc_get_maxid(struct ipc_ids *);
void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);
/* must be called with ipcp locked */
+int ipc_chid(struct ipc_ids *ids, int oldid, int newid);
int ipcperms(struct kern_ipc_perm *ipcp, short flg);
/* for rare, potentially huge allocations.
--
Pierre Peiffer
On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
> This patch provides three new API to change the ID of an existing
> System V IPCs.
>
> These APIs are:
> long msg_chid(struct ipc_namespace *ns, int id, int newid);
> long sem_chid(struct ipc_namespace *ns, int id, int newid);
> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>
> They return 0 or an error code in case of failure.
>
> They may be useful for setting a specific ID for an IPC when preparing
> a restart operation.
>
> To be successful, the following rules must be respected:
> - the IPC exists (of course...)
> - the new ID must satisfy the ID computation rule.
> - the entry in the idr corresponding to the new ID must be free.
> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
> ipc/util.h | 1 +
> 8 files changed, 197 insertions(+)
For the record, OpenVZ uses "create with predefined ID" method which
leads to less code. For example, change at the end is all we want from
ipc/util.c .
Also, if ids were A and B at the moment of checkpoint, and during
restart they became B and A you'll get collision in both ways which you
techically can avoid by classic "tmp = A, A = B, B = tmp" but you also
can avoid all other loops just by creating with ID you need.
Alexey Dobriyan wrote:
> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>> This patch provides three new API to change the ID of an existing
>> System V IPCs.
>>
>> These APIs are:
>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>
>> They return 0 or an error code in case of failure.
>>
>> They may be useful for setting a specific ID for an IPC when preparing
>> a restart operation.
>>
>> To be successful, the following rules must be respected:
>> - the IPC exists (of course...)
>> - the new ID must satisfy the ID computation rule.
>> - the entry in the idr corresponding to the new ID must be free.
>
>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>> ipc/util.h | 1 +
>> 8 files changed, 197 insertions(+)
>
> For the record, OpenVZ uses "create with predefined ID" method which
> leads to less code. For example, change at the end is all we want from
> ipc/util.c .
>
Yes, indeed, I saw that. The idea here is, at the end, to propose a more
"userspace oriented" solution.
As we can't use msgget(), etc, API to specify an ID, I think we can at least
change it afterwards
> Also, if ids were A and B at the moment of checkpoint, and during
> restart they became B and A you'll get collision in both ways which you
> techically can avoid by classic "tmp = A, A = B, B = tmp"
In the general case, yes, you're right.
In the case of the checkpoint/restart, this is not necessarily a problem, as we
will probably restart an application in an empty "container"/"namespace"; Thus
we can create all needed IPCs in an empty IPC namespace like this:
1. create first IPC
2. change its ID
3. create the second IPC
4. change its ID
5. etc..
But yes, I agree that if we can directly create an IPC with the right ID, it
would be better; may be with an IPC_CREATE command or something like that if the
direction is to do that from userspace.
--
Pierre Peiffer
Hi again,
Thinking more about this, I think I must clarify why I choose this way.
In fact, the idea of these patches is to provide the missing user APIs (or
extend the existing ones) that allow to set or update _all_ properties of all
IPCs, as needed in the case of the checkpoint/restart of an application (the
current user API does not allow to specify an ID for a created IPC, for
example). And this, without changing the existing API of course.
And msgget(), semget() and shmget() does not have any parameter we can use to
specify an ID.
That's why I've decided to not change these routines and add a new control
command, IP_SETID, with which we can can change the ID of an IPC. (that looks to
me more straightforward and logical)
Now, this patch is, in fact, only a preparation for the patch 10/15 which
really complete the user API by adding this IPC_SETID command.
(... continuing below ...)
Alexey Dobriyan wrote:
> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>> This patch provides three new API to change the ID of an existing
>> System V IPCs.
>>
>> These APIs are:
>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>
>> They return 0 or an error code in case of failure.
>>
>> They may be useful for setting a specific ID for an IPC when preparing
>> a restart operation.
>>
>> To be successful, the following rules must be respected:
>> - the IPC exists (of course...)
>> - the new ID must satisfy the ID computation rule.
>> - the entry in the idr corresponding to the new ID must be free.
>
>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>> ipc/util.h | 1 +
>> 8 files changed, 197 insertions(+)
>
> For the record, OpenVZ uses "create with predefined ID" method which
> leads to less code. For example, change at the end is all we want from
> ipc/util.c .
And in fact, you do that from kernel space, you don't have the constraint to fit
the existing user API.
Again, this patch, even if it presents a new kernel API, is in fact a
preparation for the next patch which introduces a new user API.
Do you think that this could fit your need ?
--
Pierre
Why user space can need this API? for checkpointing only?
Then I would not consider it for inclusion until it is clear how to implement checkpointing.
As for me personally - I'm against exporting such APIs, since they are not needed in real-life user space applications and maintaining it forever for compatibility doesn't worth it.
Also such APIs allow creation of non-GPL checkpointing in user-space, which can be of concern as well.
Kirill
Pierre Peiffer wrote:
> Hi again,
>
> Thinking more about this, I think I must clarify why I choose this way.
> In fact, the idea of these patches is to provide the missing user APIs (or
> extend the existing ones) that allow to set or update _all_ properties of all
> IPCs, as needed in the case of the checkpoint/restart of an application (the
> current user API does not allow to specify an ID for a created IPC, for
> example). And this, without changing the existing API of course.
>
> And msgget(), semget() and shmget() does not have any parameter we can use to
> specify an ID.
> That's why I've decided to not change these routines and add a new control
> command, IP_SETID, with which we can can change the ID of an IPC. (that looks to
> me more straightforward and logical)
>
> Now, this patch is, in fact, only a preparation for the patch 10/15 which
> really complete the user API by adding this IPC_SETID command.
>
> (... continuing below ...)
>
> Alexey Dobriyan wrote:
>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>>> This patch provides three new API to change the ID of an existing
>>> System V IPCs.
>>>
>>> These APIs are:
>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>
>>> They return 0 or an error code in case of failure.
>>>
>>> They may be useful for setting a specific ID for an IPC when preparing
>>> a restart operation.
>>>
>>> To be successful, the following rules must be respected:
>>> - the IPC exists (of course...)
>>> - the new ID must satisfy the ID computation rule.
>>> - the entry in the idr corresponding to the new ID must be free.
>>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>> ipc/util.h | 1 +
>>> 8 files changed, 197 insertions(+)
>> For the record, OpenVZ uses "create with predefined ID" method which
>> leads to less code. For example, change at the end is all we want from
>> ipc/util.c .
>
> And in fact, you do that from kernel space, you don't have the constraint to fit
> the existing user API.
> Again, this patch, even if it presents a new kernel API, is in fact a
> preparation for the next patch which introduces a new user API.
>
> Do you think that this could fit your need ?
>
Kirill Korotaev wrote:
> Why user space can need this API? for checkpointing only?
I would say "at least for checkpointing"... ;) May be someone else may find an
interest about this for something else.
In fact, I'm sure that you have some interest in checkpointing; and thus, you
have probably some ideas in mind; but whatever the solution you will propose,
I'm pretty sure that I could say the same thing for your solution.
And what I finally think is: even if it's for "checkpointing only", if many
people are interested by this, it may be sufficient to push this ?
> Then I would not consider it for inclusion until it is clear how to implement checkpointing.
> As for me personally - I'm against exporting such APIs, since they are not needed in real-life user space applications and maintaining it forever for compatibility doesn't worth it.
Maintaining these patches is not a big deal, really, but this is not the main
point; the "need in real life" (1) is in fact the main one, and then, the "is
this solution the best one ?" (2) the second one.
About (1), as said in my first mail, as the namespaces and containers are being
integrated into the mainline kernel, checkpoint/restart is (or will be) the next
need.
About (2), my solution propose to do that, as much as possible from userspace,
to minimize the kernel impact. Of course, this is subject to discussion. My
opinion is that doing a full checkpoint/restart from kernel space will need lot
of new specific and intrusive code; I'm not sure that this will be acceptable by
the community. But this is my opinion only. Discusion is opened.
> Also such APIs allow creation of non-GPL checkpointing in user-space, which can be of concern as well.
Honestly, I don't think this really a concern at all. I mean: I've never seen
"this allows non-GPL binary and thus, this is bad" as an argument to reject a
functionality, but I may be wrong, and thus, it can be discussed as well.
I think the points (1) and (2) as stated above are the key ones.
Pierre
> Kirill
>
>
> Pierre Peiffer wrote:
>> Hi again,
>>
>> Thinking more about this, I think I must clarify why I choose this way.
>> In fact, the idea of these patches is to provide the missing user APIs (or
>> extend the existing ones) that allow to set or update _all_ properties of all
>> IPCs, as needed in the case of the checkpoint/restart of an application (the
>> current user API does not allow to specify an ID for a created IPC, for
>> example). And this, without changing the existing API of course.
>>
>> And msgget(), semget() and shmget() does not have any parameter we can use to
>> specify an ID.
>> That's why I've decided to not change these routines and add a new control
>> command, IP_SETID, with which we can can change the ID of an IPC. (that looks to
>> me more straightforward and logical)
>>
>> Now, this patch is, in fact, only a preparation for the patch 10/15 which
>> really complete the user API by adding this IPC_SETID command.
>>
>> (... continuing below ...)
>>
>> Alexey Dobriyan wrote:
>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>>>> This patch provides three new API to change the ID of an existing
>>>> System V IPCs.
>>>>
>>>> These APIs are:
>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>
>>>> They return 0 or an error code in case of failure.
>>>>
>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>> a restart operation.
>>>>
>>>> To be successful, the following rules must be respected:
>>>> - the IPC exists (of course...)
>>>> - the new ID must satisfy the ID computation rule.
>>>> - the entry in the idr corresponding to the new ID must be free.
>>>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>> ipc/util.h | 1 +
>>>> 8 files changed, 197 insertions(+)
>>> For the record, OpenVZ uses "create with predefined ID" method which
>>> leads to less code. For example, change at the end is all we want from
>>> ipc/util.c .
>> And in fact, you do that from kernel space, you don't have the constraint to fit
>> the existing user API.
>> Again, this patch, even if it presents a new kernel API, is in fact a
>> preparation for the next patch which introduces a new user API.
>>
>> Do you think that this could fit your need ?
>>
>
>
--
Pierre Peiffer
Pierre,
my point is that after you've added interface "set IPCID", you'll need more and more for checkpointing:
- "create/setup conntrack" (otherwise connections get dropped),
- "set task start time" (needed for Oracle checkpointing BTW),
- "set some statistics counters (e.g. networking or taskstats)"
- "restore inotify"
and so on and so forth.
Exporting such intimate kernel interfaces to user space doesn't look sane.
Exactly from compatibility and maintenance POV. You'll be burden with supporting them for a long time.
Remember recent story with SLUB and /proc/slabinfo?
Hope I made my argument more clear this time.
Thanks,
Kirill
Pierre Peiffer wrote:
>
> Kirill Korotaev wrote:
>> Why user space can need this API? for checkpointing only?
>
> I would say "at least for checkpointing"... ;) May be someone else may find an
> interest about this for something else.
> In fact, I'm sure that you have some interest in checkpointing; and thus, you
> have probably some ideas in mind; but whatever the solution you will propose,
> I'm pretty sure that I could say the same thing for your solution.
> And what I finally think is: even if it's for "checkpointing only", if many
> people are interested by this, it may be sufficient to push this ?
>
>> Then I would not consider it for inclusion until it is clear how to implement checkpointing.
>> As for me personally - I'm against exporting such APIs, since they are not needed in real-life user space applications and maintaining it forever for compatibility doesn't worth it.
>
> Maintaining these patches is not a big deal, really, but this is not the main
> point; the "need in real life" (1) is in fact the main one, and then, the "is
> this solution the best one ?" (2) the second one.
>
> About (1), as said in my first mail, as the namespaces and containers are being
> integrated into the mainline kernel, checkpoint/restart is (or will be) the next
> need.
> About (2), my solution propose to do that, as much as possible from userspace,
> to minimize the kernel impact. Of course, this is subject to discussion. My
> opinion is that doing a full checkpoint/restart from kernel space will need lot
> of new specific and intrusive code; I'm not sure that this will be acceptable by
> the community. But this is my opinion only. Discusion is opened.
>
>> Also such APIs allow creation of non-GPL checkpointing in user-space, which can be of concern as well.
>
> Honestly, I don't think this really a concern at all. I mean: I've never seen
> "this allows non-GPL binary and thus, this is bad" as an argument to reject a
> functionality, but I may be wrong, and thus, it can be discussed as well.
> I think the points (1) and (2) as stated above are the key ones.
>
> Pierre
>
>> Kirill
>>
>>
>> Pierre Peiffer wrote:
>>> Hi again,
>>>
>>> Thinking more about this, I think I must clarify why I choose this way.
>>> In fact, the idea of these patches is to provide the missing user APIs (or
>>> extend the existing ones) that allow to set or update _all_ properties of all
>>> IPCs, as needed in the case of the checkpoint/restart of an application (the
>>> current user API does not allow to specify an ID for a created IPC, for
>>> example). And this, without changing the existing API of course.
>>>
>>> And msgget(), semget() and shmget() does not have any parameter we can use to
>>> specify an ID.
>>> That's why I've decided to not change these routines and add a new control
>>> command, IP_SETID, with which we can can change the ID of an IPC. (that looks to
>>> me more straightforward and logical)
>>>
>>> Now, this patch is, in fact, only a preparation for the patch 10/15 which
>>> really complete the user API by adding this IPC_SETID command.
>>>
>>> (... continuing below ...)
>>>
>>> Alexey Dobriyan wrote:
>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>>>>> This patch provides three new API to change the ID of an existing
>>>>> System V IPCs.
>>>>>
>>>>> These APIs are:
>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>
>>>>> They return 0 or an error code in case of failure.
>>>>>
>>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>>> a restart operation.
>>>>>
>>>>> To be successful, the following rules must be respected:
>>>>> - the IPC exists (of course...)
>>>>> - the new ID must satisfy the ID computation rule.
>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> ipc/util.h | 1 +
>>>>> 8 files changed, 197 insertions(+)
>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>> leads to less code. For example, change at the end is all we want from
>>>> ipc/util.c .
>>> And in fact, you do that from kernel space, you don't have the constraint to fit
>>> the existing user API.
>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>> preparation for the next patch which introduces a new user API.
>>>
>>> Do you think that this could fit your need ?
>>>
>>
>
Hello Kirill !
Kirill Korotaev wrote:
> Pierre,
>
> my point is that after you've added interface "set IPCID", you'll need more and more for checkpointing:
> - "create/setup conntrack" (otherwise connections get dropped),
> - "set task start time" (needed for Oracle checkpointing BTW),
> - "set some statistics counters (e.g. networking or taskstats)"
> - "restore inotify"
> and so on and so forth.
right. we know that we will have to handle a lot of these
and more and we will need an API for it :) so how should we
handle it ?
through a dedicated syscall that would be able to checkpoint
and/or restart a process, an ipc object, an ipc namespace, a
full container ? will it take a fd or a big binary blob ?
I personally really liked Pavel idea's of filesystem. but we
dropped the thread.
that's for the user API but we will need also kernel services
to expose (checkpoint) states and restore them. If it's too
early to talk about the user API, we could try first to refactor
the kernel internals to expose correctly what we need.
That's what Pierre's patchset is trying to do.
Cheers,
C.
Cedric Le Goater wrote:
> Hello Kirill !
>
> Kirill Korotaev wrote:
>> Pierre,
>>
>> my point is that after you've added interface "set IPCID", you'll need
>> more and more for checkpointing:
>> - "create/setup conntrack" (otherwise connections get dropped),
>> - "set task start time" (needed for Oracle checkpointing BTW),
>> - "set some statistics counters (e.g. networking or taskstats)"
>> - "restore inotify"
>> and so on and so forth.
>
> right. we know that we will have to handle a lot of these
> and more and we will need an API for it :) so how should we handle it ?
> through a dedicated syscall that would be able to checkpoint and/or
> restart a process, an ipc object, an ipc namespace, a full container ?
> will it take a fd or a big binary blob ?
> I personally really liked Pavel idea's of filesystem. but we dropped the
> thread.
Imho having a file system interface means having all its problems.
Imagine you have some information about tasks exported with a file system interface.
Obviously to collect the information you have to hold some spinlock like tasklist_lock or similar.
Obviously, you have to drop the lock between sys_read() syscalls.
So interface gets much more complicated - you have to rescan the objects and somehow find the place where
you stopped previous read. Or you have to to force reader to read everything at once.
> that's for the user API but we will need also kernel services to expose
> (checkpoint) states and restore them. If it's too
> early to talk about the user API, we could try first to refactor
> the kernel internals to expose correctly what we need.
That's what I would start with.
> That's what Pierre's patchset is trying to do.
Not exactly. For checkpointing/restoring we actually need only one new API call for each
subsystem - create some object with given ID (and maybe parameters, if they are not dynamically changeable by user).
While Pierre's patchset adds different API call - change object ID.
Thanks,
Kirill
Kirill Korotaev wrote:
>
> Cedric Le Goater wrote:
>> Hello Kirill !
>>
>> Kirill Korotaev wrote:
>>> Pierre,
>>>
>>> my point is that after you've added interface "set IPCID", you'll need
>>> more and more for checkpointing:
>>> - "create/setup conntrack" (otherwise connections get dropped),
>>> - "set task start time" (needed for Oracle checkpointing BTW),
>>> - "set some statistics counters (e.g. networking or taskstats)"
>>> - "restore inotify"
>>> and so on and so forth.
>> right. we know that we will have to handle a lot of these
>> and more and we will need an API for it :) so how should we handle it ?
>> through a dedicated syscall that would be able to checkpoint and/or
>> restart a process, an ipc object, an ipc namespace, a full container ?
>> will it take a fd or a big binary blob ?
>> I personally really liked Pavel idea's of filesystem. but we dropped the
>> thread.
>
> Imho having a file system interface means having all its problems.
> Imagine you have some information about tasks exported with a file system interface.
> Obviously to collect the information you have to hold some spinlock like tasklist_lock or similar.
> Obviously, you have to drop the lock between sys_read() syscalls.
> So interface gets much more complicated - you have to rescan the objects and somehow find the place where
> you stopped previous read. Or you have to to force reader to read everything at once.
To remember the place when we stopped previous read we have a "pos" counter
on the struct file.
Actually, tar utility, that I propose to perform the most simple migration
reads the directory contents with 4Kb buffer - that's enough for ~500 tasks.
Besides, is this a real problem for a frozen container?
>> that's for the user API but we will need also kernel services to expose
>> (checkpoint) states and restore them. If it's too
>> early to talk about the user API, we could try first to refactor
>> the kernel internals to expose correctly what we need.
>
> That's what I would start with.
>
>> That's what Pierre's patchset is trying to do.
>
> Not exactly. For checkpointing/restoring we actually need only one new API call for each
> subsystem - create some object with given ID (and maybe parameters, if they are not dynamically changeable by user).
> While Pierre's patchset adds different API call - change object ID.
>
> Thanks,
> Kirill
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
> _______________________________________________
> Devel mailing list
> [email protected]
> https://openvz.org/mailman/listinfo/devel
>
Pavel Emelyanov wrote:
> Kirill Korotaev wrote:
>> Cedric Le Goater wrote:
>>> Hello Kirill !
>>>
>>> Kirill Korotaev wrote:
>>>> Pierre,
>>>>
>>>> my point is that after you've added interface "set IPCID", you'll need
>>>> more and more for checkpointing:
>>>> - "create/setup conntrack" (otherwise connections get dropped),
>>>> - "set task start time" (needed for Oracle checkpointing BTW),
>>>> - "set some statistics counters (e.g. networking or taskstats)"
>>>> - "restore inotify"
>>>> and so on and so forth.
>>> right. we know that we will have to handle a lot of these
>>> and more and we will need an API for it :) so how should we handle it ?
>>> through a dedicated syscall that would be able to checkpoint and/or
>>> restart a process, an ipc object, an ipc namespace, a full container ?
>>> will it take a fd or a big binary blob ?
>>> I personally really liked Pavel idea's of filesystem. but we dropped the
>>> thread.
>> Imho having a file system interface means having all its problems.
>> Imagine you have some information about tasks exported with a file system interface.
>> Obviously to collect the information you have to hold some spinlock like tasklist_lock or similar.
>> Obviously, you have to drop the lock between sys_read() syscalls.
>> So interface gets much more complicated - you have to rescan the objects and somehow find the place where
>> you stopped previous read. Or you have to to force reader to read everything at once.
>
> To remember the place when we stopped previous read we have a "pos" counter
> on the struct file.
>
> Actually, tar utility, that I propose to perform the most simple migration
> reads the directory contents with 4Kb buffer - that's enough for ~500 tasks.
>
> Besides, is this a real problem for a frozen container?
I like the idea of a C/R filesystem. Does it implies a specific user
space program to orchestrate the checkpoint/restart of the different
subsystems ? I mean the checkpoint is easy but what about the restart ?
We must ensure, for example to restore a process before restoring the fd
associated to it, or restore a deleted file before restoring the fd
opened to it, no ?
Daniel Lezcano wrote:
> Pavel Emelyanov wrote:
>> Kirill Korotaev wrote:
>>> Cedric Le Goater wrote:
>>>> Hello Kirill !
>>>>
>>>> Kirill Korotaev wrote:
>>>>> Pierre,
>>>>>
>>>>> my point is that after you've added interface "set IPCID", you'll need
>>>>> more and more for checkpointing:
>>>>> - "create/setup conntrack" (otherwise connections get dropped),
>>>>> - "set task start time" (needed for Oracle checkpointing BTW),
>>>>> - "set some statistics counters (e.g. networking or taskstats)"
>>>>> - "restore inotify"
>>>>> and so on and so forth.
>>>> right. we know that we will have to handle a lot of these
>>>> and more and we will need an API for it :) so how should we handle it ?
>>>> through a dedicated syscall that would be able to checkpoint and/or
>>>> restart a process, an ipc object, an ipc namespace, a full container ?
>>>> will it take a fd or a big binary blob ?
>>>> I personally really liked Pavel idea's of filesystem. but we dropped the
>>>> thread.
>>> Imho having a file system interface means having all its problems.
>>> Imagine you have some information about tasks exported with a file system interface.
>>> Obviously to collect the information you have to hold some spinlock like tasklist_lock or similar.
>>> Obviously, you have to drop the lock between sys_read() syscalls.
>>> So interface gets much more complicated - you have to rescan the objects and somehow find the place where
>>> you stopped previous read. Or you have to to force reader to read everything at once.
>> To remember the place when we stopped previous read we have a "pos" counter
>> on the struct file.
>>
>> Actually, tar utility, that I propose to perform the most simple migration
>> reads the directory contents with 4Kb buffer - that's enough for ~500 tasks.
>>
>> Besides, is this a real problem for a frozen container?
>
> I like the idea of a C/R filesystem. Does it implies a specific user
> space program to orchestrate the checkpoint/restart of the different
> subsystems ? I mean the checkpoint is easy but what about the restart ?
I though about smth like "writing to this fs causes restore process".
> We must ensure, for example to restore a process before restoring the fd
> associated to it, or restore a deleted file before restoring the fd
This is achieved by tar automatically - it extracts files in the order
of archiving. Thus is we provide them in correct order we'll get them
in correct one as well.
> opened to it, no ?
>
>
>
>
>
>
I strongly second Kirill on this matter.
IMHO, we should _avoid_ as much as possible exposing internal kernel
state to applications, unless a _real_ need for it is _clearly_
demonstrated. The reasons for this are quite obvious.
It isn't strictly necessary to export a new interface in order to
support checkpoint/restart. **. Hence, I think that the speculation
"we may need it in the future" is too abstract and isn't a good
excuse to commit to a new, currently unneeded, interface. Should the
need arise in the future, it will be easy to design a new interface
(also based on aggregated experience until then).
** In fact, the suggested interface may prove problematic (as noted
earlier in this thread): if you first create the resource with some
arbitrary identifier and then modify the identifier (in our case,
IPC id), then the restart procedure is bound to execute sequentially,
because of lack of atomicity.
That said, I suggest the following method instead (this is the method
we use in Zap to determine the desired resource identifier when a new
resource is allocated; I recall that we had discussed it in the past,
perhaps the mini-summit in september ?):
1) The process/thread tells the kernel that it wishes to pre-determine
the resource identifier of a subsequent call (this can be done via a
new syscall, or by writing to /proc/self/...).
2) Each system call that allocates a resource and assigns an identifier
is modified to check this per-thread field first; if it is set then
it will attempt to allocate that particular value (if already taken,
return an error, eg. EBUSY). Otherwise it will proceed as it is today.
(I left out some details - eg. the kernel will keep the desire value
on a per-thread field, when it will be reset, whether we want to also
tag the field with its type and so on, but the idea is now clear).
The main two advantages are that first, we don't need to devise a new
method for every syscall that allocates said resources (sigh... just
think of clone() nightmare to add a new argument); second, the change
is incremental: first code the mechanism to set the field, then add
support in the IPC subsystem, later in the DEVPTS, then in clone and
so forth.
Oren.
Pierre Peiffer wrote:
>
> Kirill Korotaev wrote:
>> Why user space can need this API? for checkpointing only?
>
> I would say "at least for checkpointing"... ;) May be someone else may find an
> interest about this for something else.
> In fact, I'm sure that you have some interest in checkpointing; and thus, you
> have probably some ideas in mind; but whatever the solution you will propose,
> I'm pretty sure that I could say the same thing for your solution.
> And what I finally think is: even if it's for "checkpointing only", if many
> people are interested by this, it may be sufficient to push this ?
>
>> Then I would not consider it for inclusion until it is clear how to implement checkpointing.
>> As for me personally - I'm against exporting such APIs, since they are not needed in real-life user space applications and maintaining it forever for compatibility doesn't worth it.
>
> Maintaining these patches is not a big deal, really, but this is not the main
> point; the "need in real life" (1) is in fact the main one, and then, the "is
> this solution the best one ?" (2) the second one.
>
> About (1), as said in my first mail, as the namespaces and containers are being
> integrated into the mainline kernel, checkpoint/restart is (or will be) the next
> need.
> About (2), my solution propose to do that, as much as possible from userspace,
> to minimize the kernel impact. Of course, this is subject to discussion. My
> opinion is that doing a full checkpoint/restart from kernel space will need lot
> of new specific and intrusive code; I'm not sure that this will be acceptable by
> the community. But this is my opinion only. Discusion is opened.
>
>> Also such APIs allow creation of non-GPL checkpointing in user-space, which can be of concern as well.
>
> Honestly, I don't think this really a concern at all. I mean: I've never seen
> "this allows non-GPL binary and thus, this is bad" as an argument to reject a
> functionality, but I may be wrong, and thus, it can be discussed as well.
> I think the points (1) and (2) as stated above are the key ones.
>
> Pierre
>
>> Kirill
>>
>>
>> Pierre Peiffer wrote:
>>> Hi again,
>>>
>>> Thinking more about this, I think I must clarify why I choose this way.
>>> In fact, the idea of these patches is to provide the missing user APIs (or
>>> extend the existing ones) that allow to set or update _all_ properties of all
>>> IPCs, as needed in the case of the checkpoint/restart of an application (the
>>> current user API does not allow to specify an ID for a created IPC, for
>>> example). And this, without changing the existing API of course.
>>>
>>> And msgget(), semget() and shmget() does not have any parameter we can use to
>>> specify an ID.
>>> That's why I've decided to not change these routines and add a new control
>>> command, IP_SETID, with which we can can change the ID of an IPC. (that looks to
>>> me more straightforward and logical)
>>>
>>> Now, this patch is, in fact, only a preparation for the patch 10/15 which
>>> really complete the user API by adding this IPC_SETID command.
>>>
>>> (... continuing below ...)
>>>
>>> Alexey Dobriyan wrote:
>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected] wrote:
>>>>> This patch provides three new API to change the ID of an existing
>>>>> System V IPCs.
>>>>>
>>>>> These APIs are:
>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>
>>>>> They return 0 or an error code in case of failure.
>>>>>
>>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>>> a restart operation.
>>>>>
>>>>> To be successful, the following rules must be respected:
>>>>> - the IPC exists (of course...)
>>>>> - the new ID must satisfy the ID computation rule.
>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>> ipc/util.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> ipc/util.h | 1 +
>>>>> 8 files changed, 197 insertions(+)
>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>> leads to less code. For example, change at the end is all we want from
>>>> ipc/util.c .
>>> And in fact, you do that from kernel space, you don't have the constraint to fit
>>> the existing user API.
>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>> preparation for the next patch which introduces a new user API.
>>>
>>> Do you think that this could fit your need ?
>>>
>>
>
On Tue, 2008-02-05 at 04:51 -0500, Oren Laadan wrote:
> That said, I suggest the following method instead (this is the method
> we use in Zap to determine the desired resource identifier when a new
> resource is allocated; I recall that we had discussed it in the past,
> perhaps the mini-summit in september ?):
>
> 1) The process/thread tells the kernel that it wishes to pre-determine
> the resource identifier of a subsequent call (this can be done via a
> new syscall, or by writing to /proc/self/...).
>
> 2) Each system call that allocates a resource and assigns an
> identifier
> is modified to check this per-thread field first; if it is set then
> it will attempt to allocate that particular value (if already taken,
> return an error, eg. EBUSY). Otherwise it will proceed as it is today.
You forgot to attach the patch to your mail. ;)
-- Dave
Quoting Oren Laadan ([email protected]):
>
> I strongly second Kirill on this matter.
>
> IMHO, we should _avoid_ as much as possible exposing internal kernel
> state to applications, unless a _real_ need for it is _clearly_
> demonstrated. The reasons for this are quite obvious.
Hmm, sure, but this sentence is designed to make us want to agree. Yes,
we want to avoid exporting kernel internals, but generally that means
things like the precise layout of the task_struct. What Pierre is doing
is in fact the opposite, exporting resource information in a kernel
version invariant way.
In fact, the very reason not to go the route you and Pavel are
advocating is that if we just dump task state to a file or filesystem
from the kernel in one shot, we'll be much more tempted to lay out data
in a way that exports and ends up depending on kernel internals. So
we'll just want to read and write the task_struct verbatim.
So, there are two very different approaches we can start with.
Whichever one we follow, we want to avoid having kernel version
dependencies. They both have their merits to be sure.
But note that in either case we need to deal with a bunch of locking.
So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
doing no matter 1. 9-11 sound like they are contentuous until
we decide whether we want to go with a create_with_id() type approach
or a set_id(). 12 is IMO a good locking cleanup regardless. 13 and
15 are contentous until we decide whether we want userspace-controlled
checkpoint or a one-shot fs. 14 IMO is useful for both c/r approaches.
Is that pretty accurate?
> It isn't strictly necessary to export a new interface in order to
> support checkpoint/restart. **. Hence, I think that the speculation
> "we may need it in the future" is too abstract and isn't a good
> excuse to commit to a new, currently unneeded, interface.
OTOH it did succeed in starting some conversation :)
> Should the
> need arise in the future, it will be easy to design a new interface
> (also based on aggregated experience until then).
What aggregated experience? We have to start somewhere...
> ** In fact, the suggested interface may prove problematic (as noted
> earlier in this thread): if you first create the resource with some
> arbitrary identifier and then modify the identifier (in our case,
> IPC id), then the restart procedure is bound to execute sequentially,
> because of lack of atomicity.
Hmm? Lack of atomicity wrt what? All the tasks being restarted were
checkpointed at the same time so there will be no conflict in the
requested IDs, so I don't know what you're referring to.
> That said, I suggest the following method instead (this is the method
> we use in Zap to determine the desired resource identifier when a new
> resource is allocated; I recall that we had discussed it in the past,
> perhaps the mini-summit in september ?):
>
> 1) The process/thread tells the kernel that it wishes to pre-determine
> the resource identifier of a subsequent call (this can be done via a
> new syscall, or by writing to /proc/self/...).
>
> 2) Each system call that allocates a resource and assigns an identifier
> is modified to check this per-thread field first; if it is set then
> it will attempt to allocate that particular value (if already taken,
> return an error, eg. EBUSY). Otherwise it will proceed as it is today.
But I thought you were just advocating a one-shot filesystem approach
for c/r, so we wouldn't be creating the resources piecemeal?
The /proc/self approach is one way to go, it has been working for LSMs
this long. I'd agree that it would be nice if we could have a
consistent interface to the create_with_id()/set_id() problem. A first
shot addressing ipcs and pids would be a great start.
> (I left out some details - eg. the kernel will keep the desire value
> on a per-thread field, when it will be reset, whether we want to also
> tag the field with its type and so on, but the idea is now clear).
>
> The main two advantages are that first, we don't need to devise a new
> method for every syscall that allocates said resources (sigh... just
Agreed.
> think of clone() nightmare to add a new argument);
Yes, and then there will need to be the clone_with_pid() extension on
top of that.
> second, the change
> is incremental: first code the mechanism to set the field, then add
> support in the IPC subsystem, later in the DEVPTS, then in clone and
> so forth.
>
> Oren.
>
> Pierre Peiffer wrote:
>> Kirill Korotaev wrote:
>>> Why user space can need this API? for checkpointing only?
>> I would say "at least for checkpointing"... ;) May be someone else may
>> find an
>> interest about this for something else.
>> In fact, I'm sure that you have some interest in checkpointing; and thus,
>> you
>> have probably some ideas in mind; but whatever the solution you will
>> propose,
>> I'm pretty sure that I could say the same thing for your solution.
>> And what I finally think is: even if it's for "checkpointing only", if
>> many
>> people are interested by this, it may be sufficient to push this ?
>>> Then I would not consider it for inclusion until it is clear how to
>>> implement checkpointing.
>>> As for me personally - I'm against exporting such APIs, since they are
>>> not needed in real-life user space applications and maintaining it
>>> forever for compatibility doesn't worth it.
>> Maintaining these patches is not a big deal, really, but this is not the
>> main
>> point; the "need in real life" (1) is in fact the main one, and then, the
>> "is
>> this solution the best one ?" (2) the second one.
>> About (1), as said in my first mail, as the namespaces and containers are
>> being
>> integrated into the mainline kernel, checkpoint/restart is (or will be)
>> the next
>> need.
>> About (2), my solution propose to do that, as much as possible from
>> userspace,
>> to minimize the kernel impact. Of course, this is subject to discussion.
>> My
>> opinion is that doing a full checkpoint/restart from kernel space will
>> need lot
>> of new specific and intrusive code; I'm not sure that this will be
>> acceptable by
>> the community. But this is my opinion only. Discusion is opened.
>>> Also such APIs allow creation of non-GPL checkpointing in user-space,
>>> which can be of concern as well.
>> Honestly, I don't think this really a concern at all. I mean: I've never
>> seen
>> "this allows non-GPL binary and thus, this is bad" as an argument to
>> reject a
>> functionality, but I may be wrong, and thus, it can be discussed as well.
>> I think the points (1) and (2) as stated above are the key ones.
>> Pierre
>>> Kirill
>>>
>>>
>>> Pierre Peiffer wrote:
>>>> Hi again,
>>>>
>>>> Thinking more about this, I think I must clarify why I choose this way.
>>>> In fact, the idea of these patches is to provide the missing user APIs
>>>> (or
>>>> extend the existing ones) that allow to set or update _all_ properties
>>>> of all
>>>> IPCs, as needed in the case of the checkpoint/restart of an application
>>>> (the
>>>> current user API does not allow to specify an ID for a created IPC, for
>>>> example). And this, without changing the existing API of course.
>>>>
>>>> And msgget(), semget() and shmget() does not have any parameter we can
>>>> use to
>>>> specify an ID.
>>>> That's why I've decided to not change these routines and add a new
>>>> control
>>>> command, IP_SETID, with which we can can change the ID of an IPC. (that
>>>> looks to
>>>> me more straightforward and logical)
>>>>
>>>> Now, this patch is, in fact, only a preparation for the patch 10/15
>>>> which
>>>> really complete the user API by adding this IPC_SETID command.
>>>>
>>>> (... continuing below ...)
>>>>
>>>> Alexey Dobriyan wrote:
>>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected]
>>>>> wrote:
>>>>>> This patch provides three new API to change the ID of an existing
>>>>>> System V IPCs.
>>>>>>
>>>>>> These APIs are:
>>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>
>>>>>> They return 0 or an error code in case of failure.
>>>>>>
>>>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>>>> a restart operation.
>>>>>>
>>>>>> To be successful, the following rules must be respected:
>>>>>> - the IPC exists (of course...)
>>>>>> - the new ID must satisfy the ID computation rule.
>>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>>> ipc/util.c | 48
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>> ipc/util.h | 1 +
>>>>>> 8 files changed, 197 insertions(+)
>>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>>> leads to less code. For example, change at the end is all we want from
>>>>> ipc/util.c .
>>>> And in fact, you do that from kernel space, you don't have the
>>>> constraint to fit
>>>> the existing user API.
>>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>>> preparation for the next patch which introduces a new user API.
>>>>
>>>> Do you think that this could fit your need ?
>>>>
>>>
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers
Serge E. Hallyn wrote:
> Quoting Oren Laadan ([email protected]):
>> I strongly second Kirill on this matter.
>>
>> IMHO, we should _avoid_ as much as possible exposing internal kernel
>> state to applications, unless a _real_ need for it is _clearly_
>> demonstrated. The reasons for this are quite obvious.
>
> Hmm, sure, but this sentence is designed to make us want to agree. Yes,
> we want to avoid exporting kernel internals, but generally that means
> things like the precise layout of the task_struct. What Pierre is doing
> is in fact the opposite, exporting resource information in a kernel
> version invariant way.
LOL ... a bit of misunderstanding - let me put some order here:
my response what with respect to the new interface that Pierre
suggested, that is - to add a new IPC call to change an identifier
after it has been allocated (and assigned). This is necessary for the
restart because applications expect to see the same resource id's as
they had at the time of the checkpoint.
What you are referring to is the more recent part of the thread, where
the topic became how data should be saved - in other words, the format
of the checkpoint data. This is entirely orthogonal to my argument.
Now please re-read my email :)
That said, I'd advocate for something in between a raw dump and a pure
"parametric" representation of the data. Raw data tends to be, well,
too raw, which makes the task of reading data from older version by
newer kernels harder to maintain. On the other hand, it is impossible
to abstract everything into kernel-independent format.
>
> In fact, the very reason not to go the route you and Pavel are
> advocating is that if we just dump task state to a file or filesystem
> from the kernel in one shot, we'll be much more tempted to lay out data
> in a way that exports and ends up depending on kernel internals. So
> we'll just want to read and write the task_struct verbatim.
>
> So, there are two very different approaches we can start with.
> Whichever one we follow, we want to avoid having kernel version
> dependencies. They both have their merits to be sure.
You will never be able to avoid that completely, simply because new
kernels will require saving more (or less) data per object, because
of new (or dropped) features.
The best solution in this sense is to provide a filter (hopefully
in user space, utility) that would convert a checkpoint image file
from the old format to a newer format.
And you keep a lot of compatibility code of the kernel, too.
>
> But note that in either case we need to deal with a bunch of locking.
> So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
> doing no matter 1. 9-11 sound like they are contentuous until
> we decide whether we want to go with a create_with_id() type approach
> or a set_id(). 12 is IMO a good locking cleanup regardless. 13 and
> 15 are contentous until we decide whether we want userspace-controlled
> checkpoint or a one-shot fs. 14 IMO is useful for both c/r approaches.
>
> Is that pretty accurate?
(context switch back to my original reply)
I prefer not to add a new interface to IPC that will provide a new
functionality that isn't needed, except for the checkpoint - because
there is a better alternative to do the same task; this alternative
is more suitable because (a) it can be applied incrementally, (b) it
provides a consistent method to pre-select identifiers of all syscalls,
(where is the current suggestion suggests one way for IPC and will
suggest other hacks for other resources).
(context switch back to the current reply)
I definitely welcome a cleanup of the (insanely multiplexedd) IPC
code. However I argue that the interface need not be extended.
>
>> It isn't strictly necessary to export a new interface in order to
>> support checkpoint/restart. **. Hence, I think that the speculation
>> "we may need it in the future" is too abstract and isn't a good
>> excuse to commit to a new, currently unneeded, interface.
>
> OTOH it did succeed in starting some conversation :)
>
>> Should the
>> need arise in the future, it will be easy to design a new interface
>> (also based on aggregated experience until then).
>
> What aggregated experience? We have to start somewhere...
:) well, assuming the selection of resource IDs is done as I suggested,
we'll have the restart use it. If someone finds a good reason (other
than checkpoint/restart) to pre-select/modify an identifier, it will
be easy to _then_ add an interface. That (hypothetical) interface is
likely to come out more clever after X months using checkpoint/restart.
>
>> ** In fact, the suggested interface may prove problematic (as noted
>> earlier in this thread): if you first create the resource with some
>> arbitrary identifier and then modify the identifier (in our case,
>> IPC id), then the restart procedure is bound to execute sequentially,
>> because of lack of atomicity.
>
> Hmm? Lack of atomicity wrt what? All the tasks being restarted were
> checkpointed at the same time so there will be no conflict in the
> requested IDs, so I don't know what you're referring to.
Consider that we want to have an ultra-fast restart, so we let processes
restart in parallel (as much as possible) in the same container. Task A
wants to allocate IPC id 256, but the kernel allocates 32; before task A
manages to change it to 256 (with the new interface), task B attempts to
create an IPC id 32; the kernel provides, say, 1024, and task B fails to
change it to 32 because it is still used by task A. So restart fails :(
On the other hand, if a process first tells the kernel "I want 32" and
then calls, for instance, semget(), then the IPC can atomically ensure
that the process gets what it wanted.
>
>> That said, I suggest the following method instead (this is the method
>> we use in Zap to determine the desired resource identifier when a new
>> resource is allocated; I recall that we had discussed it in the past,
>> perhaps the mini-summit in september ?):
>>
>> 1) The process/thread tells the kernel that it wishes to pre-determine
>> the resource identifier of a subsequent call (this can be done via a
>> new syscall, or by writing to /proc/self/...).
>>
>> 2) Each system call that allocates a resource and assigns an identifier
>> is modified to check this per-thread field first; if it is set then
>> it will attempt to allocate that particular value (if already taken,
>> return an error, eg. EBUSY). Otherwise it will proceed as it is today.
>
> But I thought you were just advocating a one-shot filesystem approach
> for c/r, so we wouldn't be creating the resources piecemeal?
I wasn't. That was Pavel. While I think the idea is neat, I'm not
convinced that it's practical and best way to go, however I need to
further think about it.
And as I said, I see this as a separate issue from the problem of
create_with_id()/set_id issue().
>
> The /proc/self approach is one way to go, it has been working for LSMs
> this long. I'd agree that it would be nice if we could have a
> consistent interface to the create_with_id()/set_id() problem. A first
> shot addressing ipcs and pids would be a great start.
>
>> (I left out some details - eg. the kernel will keep the desire value
>> on a per-thread field, when it will be reset, whether we want to also
>> tag the field with its type and so on, but the idea is now clear).
>>
>> The main two advantages are that first, we don't need to devise a new
>> method for every syscall that allocates said resources (sigh... just
>
> Agreed.
>
>> think of clone() nightmare to add a new argument);
>
> Yes, and then there will need to be the clone_with_pid() extension on
> top of that.
Exactly ! With the /proc/self/... approach there will not be a need
for a clone_with_pid() extension in terms of user-visible interface;
makes the clone-flags headache a bit more manageable :p
Ah... ok, long one, hopefully clarifies the confusion. That said, I
suggest that the debate regarding the format of the checkpoint data
shall proceed on a new thread, since IMHO it's orthogonal.
Oren.
>
>> second, the change
>> is incremental: first code the mechanism to set the field, then add
>> support in the IPC subsystem, later in the DEVPTS, then in clone and
>> so forth.
>>
>> Oren.
>>
>> Pierre Peiffer wrote:
>>> Kirill Korotaev wrote:
>>>> Why user space can need this API? for checkpointing only?
>>> I would say "at least for checkpointing"... ;) May be someone else may
>>> find an
>>> interest about this for something else.
>>> In fact, I'm sure that you have some interest in checkpointing; and thus,
>>> you
>>> have probably some ideas in mind; but whatever the solution you will
>>> propose,
>>> I'm pretty sure that I could say the same thing for your solution.
>>> And what I finally think is: even if it's for "checkpointing only", if
>>> many
>>> people are interested by this, it may be sufficient to push this ?
>>>> Then I would not consider it for inclusion until it is clear how to
>>>> implement checkpointing.
>>>> As for me personally - I'm against exporting such APIs, since they are
>>>> not needed in real-life user space applications and maintaining it
>>>> forever for compatibility doesn't worth it.
>>> Maintaining these patches is not a big deal, really, but this is not the
>>> main
>>> point; the "need in real life" (1) is in fact the main one, and then, the
>>> "is
>>> this solution the best one ?" (2) the second one.
>>> About (1), as said in my first mail, as the namespaces and containers are
>>> being
>>> integrated into the mainline kernel, checkpoint/restart is (or will be)
>>> the next
>>> need.
>>> About (2), my solution propose to do that, as much as possible from
>>> userspace,
>>> to minimize the kernel impact. Of course, this is subject to discussion.
>>> My
>>> opinion is that doing a full checkpoint/restart from kernel space will
>>> need lot
>>> of new specific and intrusive code; I'm not sure that this will be
>>> acceptable by
>>> the community. But this is my opinion only. Discusion is opened.
>>>> Also such APIs allow creation of non-GPL checkpointing in user-space,
>>>> which can be of concern as well.
>>> Honestly, I don't think this really a concern at all. I mean: I've never
>>> seen
>>> "this allows non-GPL binary and thus, this is bad" as an argument to
>>> reject a
>>> functionality, but I may be wrong, and thus, it can be discussed as well.
>>> I think the points (1) and (2) as stated above are the key ones.
>>> Pierre
>>>> Kirill
>>>>
>>>>
>>>> Pierre Peiffer wrote:
>>>>> Hi again,
>>>>>
>>>>> Thinking more about this, I think I must clarify why I choose this way.
>>>>> In fact, the idea of these patches is to provide the missing user APIs
>>>>> (or
>>>>> extend the existing ones) that allow to set or update _all_ properties
>>>>> of all
>>>>> IPCs, as needed in the case of the checkpoint/restart of an application
>>>>> (the
>>>>> current user API does not allow to specify an ID for a created IPC, for
>>>>> example). And this, without changing the existing API of course.
>>>>>
>>>>> And msgget(), semget() and shmget() does not have any parameter we can
>>>>> use to
>>>>> specify an ID.
>>>>> That's why I've decided to not change these routines and add a new
>>>>> control
>>>>> command, IP_SETID, with which we can can change the ID of an IPC. (that
>>>>> looks to
>>>>> me more straightforward and logical)
>>>>>
>>>>> Now, this patch is, in fact, only a preparation for the patch 10/15
>>>>> which
>>>>> really complete the user API by adding this IPC_SETID command.
>>>>>
>>>>> (... continuing below ...)
>>>>>
>>>>> Alexey Dobriyan wrote:
>>>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected]
>>>>>> wrote:
>>>>>>> This patch provides three new API to change the ID of an existing
>>>>>>> System V IPCs.
>>>>>>>
>>>>>>> These APIs are:
>>>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>>
>>>>>>> They return 0 or an error code in case of failure.
>>>>>>>
>>>>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>>>>> a restart operation.
>>>>>>>
>>>>>>> To be successful, the following rules must be respected:
>>>>>>> - the IPC exists (of course...)
>>>>>>> - the new ID must satisfy the ID computation rule.
>>>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>>>> ipc/util.c | 48
>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>> ipc/util.h | 1 +
>>>>>>> 8 files changed, 197 insertions(+)
>>>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>>>> leads to less code. For example, change at the end is all we want from
>>>>>> ipc/util.c .
>>>>> And in fact, you do that from kernel space, you don't have the
>>>>> constraint to fit
>>>>> the existing user API.
>>>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>>>> preparation for the next patch which introduces a new user API.
>>>>>
>>>>> Do you think that this could fit your need ?
>>>>>
>> _______________________________________________
>> Containers mailing list
>> [email protected]
>> https://lists.linux-foundation.org/mailman/listinfo/containers
Quoting Oren Laadan ([email protected]):
>
>
> Serge E. Hallyn wrote:
>> Quoting Oren Laadan ([email protected]):
>>> I strongly second Kirill on this matter.
>>>
>>> IMHO, we should _avoid_ as much as possible exposing internal kernel
>>> state to applications, unless a _real_ need for it is _clearly_
>>> demonstrated. The reasons for this are quite obvious.
>> Hmm, sure, but this sentence is designed to make us want to agree. Yes,
>> we want to avoid exporting kernel internals, but generally that means
>> things like the precise layout of the task_struct. What Pierre is doing
>> is in fact the opposite, exporting resource information in a kernel
>> version invariant way.
>
> LOL ... a bit of misunderstanding - let me put some order here:
>
> my response what with respect to the new interface that Pierre
> suggested, that is - to add a new IPC call to change an identifier
> after it has been allocated (and assigned). This is necessary for the
> restart because applications expect to see the same resource id's as
> they had at the time of the checkpoint.
>
> What you are referring to is the more recent part of the thread, where
> the topic became how data should be saved - in other words, the format
> of the checkpoint data. This is entirely orthogonal to my argument.
>
> Now please re-read my email :)
Heh - by the end of my response I was pretty sure that was the case :)
> That said, I'd advocate for something in between a raw dump and a pure
> "parametric" representation of the data. Raw data tends to be, well,
> too raw, which makes the task of reading data from older version by
> newer kernels harder to maintain. On the other hand, it is impossible
> to abstract everything into kernel-independent format.
Well, that's probably getting a little pedantic, but true.
>> In fact, the very reason not to go the route you and Pavel are
>> advocating is that if we just dump task state to a file or filesystem
>> from the kernel in one shot, we'll be much more tempted to lay out data
>> in a way that exports and ends up depending on kernel internals. So
>> we'll just want to read and write the task_struct verbatim.
>> So, there are two very different approaches we can start with.
>> Whichever one we follow, we want to avoid having kernel version
>> dependencies. They both have their merits to be sure.
>
> You will never be able to avoid that completely, simply because new
> kernels will require saving more (or less) data per object, because
> of new (or dropped) features.
Sure.
> The best solution in this sense is to provide a filter (hopefully
> in user space, utility) that would convert a checkpoint image file
> from the old format to a newer format.
Naturally.
> And you keep a lot of compatibility code of the kernel, too.
>
>> But note that in either case we need to deal with a bunch of locking.
>> So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
>> doing no matter 1. 9-11 sound like they are contentuous until
>> we decide whether we want to go with a create_with_id() type approach
>> or a set_id(). 12 is IMO a good locking cleanup regardless. 13 and
>> 15 are contentous until we decide whether we want userspace-controlled
>> checkpoint or a one-shot fs. 14 IMO is useful for both c/r approaches.
>> Is that pretty accurate?
>
> (context switch back to my original reply)
>
> I prefer not to add a new interface to IPC that will provide a new
> functionality that isn't needed, except for the checkpoint - because
> there is a better alternative to do the same task; this alternative
> is more suitable because (a) it can be applied incrementally, (b) it
> provides a consistent method to pre-select identifiers of all syscalls,
> (where is the current suggestion suggests one way for IPC and will
> suggest other hacks for other resources).
>
> (context switch back to the current reply)
>
> I definitely welcome a cleanup of the (insanely multiplexedd) IPC
> code. However I argue that the interface need not be extended.
>
>>> It isn't strictly necessary to export a new interface in order to
>>> support checkpoint/restart. **. Hence, I think that the speculation
>>> "we may need it in the future" is too abstract and isn't a good
>>> excuse to commit to a new, currently unneeded, interface.
>> OTOH it did succeed in starting some conversation :)
>>> Should the
>>> need arise in the future, it will be easy to design a new interface
>>> (also based on aggregated experience until then).
>> What aggregated experience? We have to start somewhere...
>
> :) well, assuming the selection of resource IDs is done as I suggested,
> we'll have the restart use it. If someone finds a good reason (other
> than checkpoint/restart) to pre-select/modify an identifier, it will
> be easy to _then_ add an interface. That (hypothetical) interface is
> likely to come out more clever after X months using checkpoint/restart.
>
>>> ** In fact, the suggested interface may prove problematic (as noted
>>> earlier in this thread): if you first create the resource with some
>>> arbitrary identifier and then modify the identifier (in our case,
>>> IPC id), then the restart procedure is bound to execute sequentially,
>>> because of lack of atomicity.
>> Hmm? Lack of atomicity wrt what? All the tasks being restarted were
>> checkpointed at the same time so there will be no conflict in the
>> requested IDs, so I don't know what you're referring to.
>
> Consider that we want to have an ultra-fast restart, so we let processes
> restart in parallel (as much as possible) in the same container. Task A
> wants to allocate IPC id 256, but the kernel allocates 32; before task A
> manages to change it to 256 (with the new interface), task B attempts to
> create an IPC id 32; the kernel provides, say, 1024, and task B fails to
> change it to 32 because it is still used by task A. So restart fails :(
Bah, it gets -EAGAIN and tries again. I see the biggest plus of your
approach as being the consistent api.
> On the other hand, if a process first tells the kernel "I want 32" and
> then calls, for instance, semget(), then the IPC can atomically ensure
> that the process gets what it wanted.
>
>>> That said, I suggest the following method instead (this is the method
>>> we use in Zap to determine the desired resource identifier when a new
>>> resource is allocated; I recall that we had discussed it in the past,
>>> perhaps the mini-summit in september ?):
>>>
>>> 1) The process/thread tells the kernel that it wishes to pre-determine
>>> the resource identifier of a subsequent call (this can be done via a
>>> new syscall, or by writing to /proc/self/...).
>>>
>>> 2) Each system call that allocates a resource and assigns an identifier
>>> is modified to check this per-thread field first; if it is set then
>>> it will attempt to allocate that particular value (if already taken,
>>> return an error, eg. EBUSY). Otherwise it will proceed as it is today.
>> But I thought you were just advocating a one-shot filesystem approach
>> for c/r, so we wouldn't be creating the resources piecemeal?
>
> I wasn't. That was Pavel. While I think the idea is neat, I'm not
> convinced that it's practical and best way to go, however I need to
> further think about it.
>
> And as I said, I see this as a separate issue from the problem of
> create_with_id()/set_id issue().
>
>> The /proc/self approach is one way to go, it has been working for LSMs
>> this long. I'd agree that it would be nice if we could have a
>> consistent interface to the create_with_id()/set_id() problem. A first
>> shot addressing ipcs and pids would be a great start.
>>> (I left out some details - eg. the kernel will keep the desire value
>>> on a per-thread field, when it will be reset, whether we want to also
>>> tag the field with its type and so on, but the idea is now clear).
>>>
>>> The main two advantages are that first, we don't need to devise a new
>>> method for every syscall that allocates said resources (sigh... just
>> Agreed.
>>> think of clone() nightmare to add a new argument);
>> Yes, and then there will need to be the clone_with_pid() extension on
>> top of that.
>
> Exactly ! With the /proc/self/... approach there will not be a need
> for a clone_with_pid() extension in terms of user-visible interface;
> makes the clone-flags headache a bit more manageable :p
So you say this is how zap does it now? Would it be pretty trivial to
make a small patch consisting of your base procpid code and the clone
plugin to let you clone with a particular pid, and post that?
thanks,
-serge
> Ah... ok, long one, hopefully clarifies the confusion. That said, I
> suggest that the debate regarding the format of the checkpoint data
> shall proceed on a new thread, since IMHO it's orthogonal.
>
> Oren.
>
>>> second, the change
>>> is incremental: first code the mechanism to set the field, then add
>>> support in the IPC subsystem, later in the DEVPTS, then in clone and
>>> so forth.
>>>
>>> Oren.
>>>
>>> Pierre Peiffer wrote:
>>>> Kirill Korotaev wrote:
>>>>> Why user space can need this API? for checkpointing only?
>>>> I would say "at least for checkpointing"... ;) May be someone else may
>>>> find an
>>>> interest about this for something else.
>>>> In fact, I'm sure that you have some interest in checkpointing; and
>>>> thus, you
>>>> have probably some ideas in mind; but whatever the solution you will
>>>> propose,
>>>> I'm pretty sure that I could say the same thing for your solution.
>>>> And what I finally think is: even if it's for "checkpointing only", if
>>>> many
>>>> people are interested by this, it may be sufficient to push this ?
>>>>> Then I would not consider it for inclusion until it is clear how to
>>>>> implement checkpointing.
>>>>> As for me personally - I'm against exporting such APIs, since they are
>>>>> not needed in real-life user space applications and maintaining it
>>>>> forever for compatibility doesn't worth it.
>>>> Maintaining these patches is not a big deal, really, but this is not the
>>>> main
>>>> point; the "need in real life" (1) is in fact the main one, and then,
>>>> the "is
>>>> this solution the best one ?" (2) the second one.
>>>> About (1), as said in my first mail, as the namespaces and containers
>>>> are being
>>>> integrated into the mainline kernel, checkpoint/restart is (or will be)
>>>> the next
>>>> need.
>>>> About (2), my solution propose to do that, as much as possible from
>>>> userspace,
>>>> to minimize the kernel impact. Of course, this is subject to discussion.
>>>> My
>>>> opinion is that doing a full checkpoint/restart from kernel space will
>>>> need lot
>>>> of new specific and intrusive code; I'm not sure that this will be
>>>> acceptable by
>>>> the community. But this is my opinion only. Discusion is opened.
>>>>> Also such APIs allow creation of non-GPL checkpointing in user-space,
>>>>> which can be of concern as well.
>>>> Honestly, I don't think this really a concern at all. I mean: I've never
>>>> seen
>>>> "this allows non-GPL binary and thus, this is bad" as an argument to
>>>> reject a
>>>> functionality, but I may be wrong, and thus, it can be discussed as
>>>> well.
>>>> I think the points (1) and (2) as stated above are the key ones.
>>>> Pierre
>>>>> Kirill
>>>>>
>>>>>
>>>>> Pierre Peiffer wrote:
>>>>>> Hi again,
>>>>>>
>>>>>> Thinking more about this, I think I must clarify why I choose this
>>>>>> way.
>>>>>> In fact, the idea of these patches is to provide the missing user APIs
>>>>>> (or
>>>>>> extend the existing ones) that allow to set or update _all_ properties
>>>>>> of all
>>>>>> IPCs, as needed in the case of the checkpoint/restart of an
>>>>>> application (the
>>>>>> current user API does not allow to specify an ID for a created IPC,
>>>>>> for
>>>>>> example). And this, without changing the existing API of course.
>>>>>>
>>>>>> And msgget(), semget() and shmget() does not have any parameter we
>>>>>> can use to
>>>>>> specify an ID.
>>>>>> That's why I've decided to not change these routines and add a new
>>>>>> control
>>>>>> command, IP_SETID, with which we can can change the ID of an IPC.
>>>>>> (that looks to
>>>>>> me more straightforward and logical)
>>>>>>
>>>>>> Now, this patch is, in fact, only a preparation for the patch 10/15
>>>>>> which
>>>>>> really complete the user API by adding this IPC_SETID command.
>>>>>>
>>>>>> (... continuing below ...)
>>>>>>
>>>>>> Alexey Dobriyan wrote:
>>>>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected]
>>>>>>> wrote:
>>>>>>>> This patch provides three new API to change the ID of an existing
>>>>>>>> System V IPCs.
>>>>>>>>
>>>>>>>> These APIs are:
>>>>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>>>
>>>>>>>> They return 0 or an error code in case of failure.
>>>>>>>>
>>>>>>>> They may be useful for setting a specific ID for an IPC when
>>>>>>>> preparing
>>>>>>>> a restart operation.
>>>>>>>>
>>>>>>>> To be successful, the following rules must be respected:
>>>>>>>> - the IPC exists (of course...)
>>>>>>>> - the new ID must satisfy the ID computation rule.
>>>>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>>>>> ipc/util.c | 48
>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>> ipc/util.h | 1 +
>>>>>>>> 8 files changed, 197 insertions(+)
>>>>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>>>>> leads to less code. For example, change at the end is all we want
>>>>>>> from
>>>>>>> ipc/util.c .
>>>>>> And in fact, you do that from kernel space, you don't have the
>>>>>> constraint to fit
>>>>>> the existing user API.
>>>>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>>>>> preparation for the next patch which introduces a new user API.
>>>>>>
>>>>>> Do you think that this could fit your need ?
>>>>>>
>>> _______________________________________________
>>> Containers mailing list
>>> [email protected]
>>> https://lists.linux-foundation.org/mailman/listinfo/containers
Serge E. Hallyn wrote:
>
> But note that in either case we need to deal with a bunch of locking.
> So getting back to Pierre's patchset, IIRC 1-8 are cleanups worth
> doing no matter 1. 9-11 sound like they are contentuous until
> we decide whether we want to go with a create_with_id() type approach
> or a set_id(). 12 is IMO a good locking cleanup regardless. 13 and
> 15 are contentous until we decide whether we want userspace-controlled
> checkpoint or a one-shot fs. 14 IMO is useful for both c/r approaches.
>
> Is that pretty accurate?
>
Ok, so, so far, the discussion stays opened about the new functionalities for c/r.
As there were no objection about the first patches, which rewrite/enhance the
existing code, Andrew, could you consider them (ie patches 1 to 8 of this
series) for inclusion in -mm ? (I mean, as soon as it is possible, as I guess
you're pretty busy for now with the merge for 2.6.25)
If you prefer, I can resend them separately ?
Thanks,
Pierre
>> It isn't strictly necessary to export a new interface in order to
>> support checkpoint/restart. **. Hence, I think that the speculation
>> "we may need it in the future" is too abstract and isn't a good
>> excuse to commit to a new, currently unneeded, interface.
>
> OTOH it did succeed in starting some conversation :)
>
>> Should the
>> need arise in the future, it will be easy to design a new interface
>> (also based on aggregated experience until then).
>
> What aggregated experience? We have to start somewhere...
>
>> ** In fact, the suggested interface may prove problematic (as noted
>> earlier in this thread): if you first create the resource with some
>> arbitrary identifier and then modify the identifier (in our case,
>> IPC id), then the restart procedure is bound to execute sequentially,
>> because of lack of atomicity.
>
> Hmm? Lack of atomicity wrt what? All the tasks being restarted were
> checkpointed at the same time so there will be no conflict in the
> requested IDs, so I don't know what you're referring to.
>
>> That said, I suggest the following method instead (this is the method
>> we use in Zap to determine the desired resource identifier when a new
>> resource is allocated; I recall that we had discussed it in the past,
>> perhaps the mini-summit in september ?):
>>
>> 1) The process/thread tells the kernel that it wishes to pre-determine
>> the resource identifier of a subsequent call (this can be done via a
>> new syscall, or by writing to /proc/self/...).
>>
>> 2) Each system call that allocates a resource and assigns an identifier
>> is modified to check this per-thread field first; if it is set then
>> it will attempt to allocate that particular value (if already taken,
>> return an error, eg. EBUSY). Otherwise it will proceed as it is today.
>
> But I thought you were just advocating a one-shot filesystem approach
> for c/r, so we wouldn't be creating the resources piecemeal?
>
> The /proc/self approach is one way to go, it has been working for LSMs
> this long. I'd agree that it would be nice if we could have a
> consistent interface to the create_with_id()/set_id() problem. A first
> shot addressing ipcs and pids would be a great start.
>
>> (I left out some details - eg. the kernel will keep the desire value
>> on a per-thread field, when it will be reset, whether we want to also
>> tag the field with its type and so on, but the idea is now clear).
>>
>> The main two advantages are that first, we don't need to devise a new
>> method for every syscall that allocates said resources (sigh... just
>
> Agreed.
>
>> think of clone() nightmare to add a new argument);
>
> Yes, and then there will need to be the clone_with_pid() extension on
> top of that.
>
>> second, the change
>> is incremental: first code the mechanism to set the field, then add
>> support in the IPC subsystem, later in the DEVPTS, then in clone and
>> so forth.
>>
>> Oren.
>>
>> Pierre Peiffer wrote:
>>> Kirill Korotaev wrote:
>>>> Why user space can need this API? for checkpointing only?
>>> I would say "at least for checkpointing"... ;) May be someone else may
>>> find an
>>> interest about this for something else.
>>> In fact, I'm sure that you have some interest in checkpointing; and thus,
>>> you
>>> have probably some ideas in mind; but whatever the solution you will
>>> propose,
>>> I'm pretty sure that I could say the same thing for your solution.
>>> And what I finally think is: even if it's for "checkpointing only", if
>>> many
>>> people are interested by this, it may be sufficient to push this ?
>>>> Then I would not consider it for inclusion until it is clear how to
>>>> implement checkpointing.
>>>> As for me personally - I'm against exporting such APIs, since they are
>>>> not needed in real-life user space applications and maintaining it
>>>> forever for compatibility doesn't worth it.
>>> Maintaining these patches is not a big deal, really, but this is not the
>>> main
>>> point; the "need in real life" (1) is in fact the main one, and then, the
>>> "is
>>> this solution the best one ?" (2) the second one.
>>> About (1), as said in my first mail, as the namespaces and containers are
>>> being
>>> integrated into the mainline kernel, checkpoint/restart is (or will be)
>>> the next
>>> need.
>>> About (2), my solution propose to do that, as much as possible from
>>> userspace,
>>> to minimize the kernel impact. Of course, this is subject to discussion.
>>> My
>>> opinion is that doing a full checkpoint/restart from kernel space will
>>> need lot
>>> of new specific and intrusive code; I'm not sure that this will be
>>> acceptable by
>>> the community. But this is my opinion only. Discusion is opened.
>>>> Also such APIs allow creation of non-GPL checkpointing in user-space,
>>>> which can be of concern as well.
>>> Honestly, I don't think this really a concern at all. I mean: I've never
>>> seen
>>> "this allows non-GPL binary and thus, this is bad" as an argument to
>>> reject a
>>> functionality, but I may be wrong, and thus, it can be discussed as well.
>>> I think the points (1) and (2) as stated above are the key ones.
>>> Pierre
>>>> Kirill
>>>>
>>>>
>>>> Pierre Peiffer wrote:
>>>>> Hi again,
>>>>>
>>>>> Thinking more about this, I think I must clarify why I choose this way.
>>>>> In fact, the idea of these patches is to provide the missing user APIs
>>>>> (or
>>>>> extend the existing ones) that allow to set or update _all_ properties
>>>>> of all
>>>>> IPCs, as needed in the case of the checkpoint/restart of an application
>>>>> (the
>>>>> current user API does not allow to specify an ID for a created IPC, for
>>>>> example). And this, without changing the existing API of course.
>>>>>
>>>>> And msgget(), semget() and shmget() does not have any parameter we can
>>>>> use to
>>>>> specify an ID.
>>>>> That's why I've decided to not change these routines and add a new
>>>>> control
>>>>> command, IP_SETID, with which we can can change the ID of an IPC. (that
>>>>> looks to
>>>>> me more straightforward and logical)
>>>>>
>>>>> Now, this patch is, in fact, only a preparation for the patch 10/15
>>>>> which
>>>>> really complete the user API by adding this IPC_SETID command.
>>>>>
>>>>> (... continuing below ...)
>>>>>
>>>>> Alexey Dobriyan wrote:
>>>>>> On Tue, Jan 29, 2008 at 05:02:38PM +0100, [email protected]
>>>>>> wrote:
>>>>>>> This patch provides three new API to change the ID of an existing
>>>>>>> System V IPCs.
>>>>>>>
>>>>>>> These APIs are:
>>>>>>> long msg_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>> long sem_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>> long shm_chid(struct ipc_namespace *ns, int id, int newid);
>>>>>>>
>>>>>>> They return 0 or an error code in case of failure.
>>>>>>>
>>>>>>> They may be useful for setting a specific ID for an IPC when preparing
>>>>>>> a restart operation.
>>>>>>>
>>>>>>> To be successful, the following rules must be respected:
>>>>>>> - the IPC exists (of course...)
>>>>>>> - the new ID must satisfy the ID computation rule.
>>>>>>> - the entry in the idr corresponding to the new ID must be free.
>>>>>>> ipc/util.c | 48
>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>> ipc/util.h | 1 +
>>>>>>> 8 files changed, 197 insertions(+)
>>>>>> For the record, OpenVZ uses "create with predefined ID" method which
>>>>>> leads to less code. For example, change at the end is all we want from
>>>>>> ipc/util.c .
>>>>> And in fact, you do that from kernel space, you don't have the
>>>>> constraint to fit
>>>>> the existing user API.
>>>>> Again, this patch, even if it presents a new kernel API, is in fact a
>>>>> preparation for the next patch which introduces a new user API.
>>>>>
>>>>> Do you think that this could fit your need ?
>>>>>
>> _______________________________________________
>> Containers mailing list
>> [email protected]
>> https://lists.linux-foundation.org/mailman/listinfo/containers
>
>
--
Pierre Peiffer