LinuxLists.cc - [PATCH] Traffic control cgroups subsystem

2008-07-22 17:45:07

Subject: [PATCH] Traffic control cgroups subsystem

[Take 3] -- Fixed additional comments from Patrick McHardy

This patch provides a simple resource controller (cgroup_tc) based on the
cgroups infrastructure to manage network traffic. The cgroup_tc resource
controller can be used to schedule and shape traffic belonging to the task(s)
in a particular cgroup.

The implementation consists of two parts:

1) A resource controller (cgroup_tc) that is used to associate packets from
a particular task belonging to a cgroup with a traffic control class id (
tc_classid). This tc_classid is propagated to all sockets created by tasks
in the cgroup and will be used for classifying packets at the link layer.

2) A modified traffic control classifier (cls_flow) that can classify packets
based on the tc_classid field in the socket to specific destination classes.

An example of the use of this resource controller would be to limit
the traffic from all tasks from a file_server cgroup to 100Mbps. We could
achieve this by doing:

# make a cgroup of file transfer processes and assign it a uniqe classid
# of 0x10 - this will be used lated to direct packets.
mkdir -p /dev/cgroup
mount -t cgroup tc -otc /dev/cgroup
mkdir /dev/cgroup/file_transfer
echo 0x10 > /dev/cgroup/file_transfer/tc.classid
echo $PID_OF_FILE_XFER_PROCESS > /dev/cgroup/file_transfer/tasks

# Now create a HTB class that rate limits traffic to 100mbits and attach
# a filter to direct all traffic from cgroup file_transfer to this new class.
tc qdisc add dev eth0 root handle 1: htb
tc class add dev eth0 parent 1: classid 1:10 htb rate 100mbit ceil 100mbit
tc filter add dev eth0 parent 1: handle 800 protocol ip prio 1 flow map key cgroup-classid baseclass 1:10

Signed-off-by: Ranjit Manomohan <[email protected]>

---
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e287745..4b12372 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
#endif

/* */
+
+#ifdef CONFIG_CGROUP_TC
+SUBSYS(tc)
+#endif
+
+/* */
diff --git a/include/linux/cgroup_tc.h b/include/linux/cgroup_tc.h
new file mode 100644
index 0000000..e4ba6a1
--- /dev/null
+++ b/include/linux/cgroup_tc.h
@@ -0,0 +1,20 @@
+#ifndef __LINUX_CGROUP_TC_H
+#define __LINUX_CGROUP_TC_H
+
+/* Interface to obtain tasks cgroup identifier. */
+
+#include <linux/cgroup.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_CGROUP_TC
+
+void cgroup_tc_set_sock_classid(struct sock *sk);
+
+#else
+
+#define cgroup_tc_set_sock_classid(sk)
+
+#endif /* CONFIG_CGROUP_TC */
+
+#endif /* __LINUX_CGROUP_TC_H */
diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 99efbed..deead80 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -349,6 +349,7 @@ enum
FLOW_KEY_SKUID,
FLOW_KEY_SKGID,
FLOW_KEY_VLAN_TAG,
+ FLOW_KEY_CGROUP_CLASSID,
__FLOW_KEY_MAX,
};

diff --git a/include/net/sock.h b/include/net/sock.h
index dc42b44..7a4e09c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -271,6 +271,9 @@ struct sock {
int sk_write_pending;
void *sk_security;
__u32 sk_mark;
+#ifdef CONFIG_CGROUP_TC
+ __u32 sk_cgroup_classid;
+#endif
/* XXX 4 bytes hole on 64 bit */
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk, int bytes);
diff --git a/init/Kconfig b/init/Kconfig
index 6135d07..c28fde8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -289,6 +289,17 @@ config CGROUP_DEBUG

Say N if unsure

+config CGROUP_TC
+ bool "Traffic control cgroup subsystem"
+ depends on CGROUPS
+ default n
+ help
+ This option enables a simple cgroup subsystem that
+ allows network traffic to be classified based on the
+ cgroup of the task originating the traffic.
+
+ Say N if unsure
+
config CGROUP_NS
bool "Namespace cgroup subsystem"
depends on CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938a..08b217b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_TC) += tc_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/tc_cgroup.c b/kernel/tc_cgroup.c
new file mode 100644
index 0000000..1c62a6c
--- /dev/null
+++ b/kernel/tc_cgroup.c
@@ -0,0 +1,108 @@
+/*
+ * tc_cgroup.c - traffic control cgroup subsystem
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cgroup_tc.h>
+
+struct tc_cgroup {
+ struct cgroup_subsys_state css;
+ unsigned int classid;
+};
+
+struct cgroup_subsys tc_subsys;
+
+static inline struct tc_cgroup *cgroup_to_tc(
+ struct cgroup *cgroup)
+{
+ return container_of(cgroup_subsys_state(cgroup, tc_subsys_id),
+ struct tc_cgroup, css);
+}
+
+static int cgroup_tc_classid(struct task_struct *tsk)
+{
+ int tc_classid;
+
+ rcu_read_lock();
+ tc_classid = container_of(task_subsys_state(tsk, tc_subsys_id),
+ struct tc_cgroup, css)->classid;
+ rcu_read_unlock();
+ return tc_classid;
+}
+
+void cgroup_tc_set_sock_classid(struct sock *sk)
+{
+ if (sk)
+ sk->sk_cgroup_classid = cgroup_tc_classid(current);
+}
+
+static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ struct tc_cgroup *tc_cgroup;
+
+ tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
+
+ if (!tc_cgroup)
+ return ERR_PTR(-ENOMEM);
+
+ /* Copy parent's class id if present */
+ if (cgroup->parent)
+ tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
+
+ return &tc_cgroup->css;
+}
+
+static void tc_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cgroup)
+{
+ kfree(cgroup_to_tc(cgroup));
+}
+
+static int tc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct tc_cgroup *tc = cgroup_to_tc(cgrp);
+
+ cgroup_lock();
+ if (cgroup_is_removed(cgrp)) {
+ cgroup_unlock();
+ return -ENODEV;
+ }
+
+ tc->classid = (unsigned int) (val & 0xffffffff);
+ cgroup_unlock();
+ return 0;
+}
+
+static u64 tc_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+ struct tc_cgroup *tc = cgroup_to_tc(cont);
+ return tc->classid;
+}
+
+static struct cftype tc_files[] = {
+ {
+ .name = "classid",
+ .read_u64 = tc_read_u64,
+ .write_u64 = tc_write_u64,
+ }
+};
+
+static int tc_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ int err;
+ err = cgroup_add_files(cont, ss, tc_files, ARRAY_SIZE(tc_files));
+ return err;
+}
+
+struct cgroup_subsys tc_subsys = {
+ .name = "tc",
+ .create = tc_create,
+ .destroy = tc_destroy,
+ .populate = tc_populate,
+ .subsys_id = tc_subsys_id,
+};
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 971b867..68b2d2d 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -280,6 +280,15 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
return tag & VLAN_VID_MASK;
}

+static u32 flow_get_cgroup_classid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_CGROUP_TC
+ if (skb->sk)
+ return skb->sk->sk_cgroup_classid;
+#endif
+ return 0;
+}
+
static u32 flow_key_get(const struct sk_buff *skb, int key)
{
switch (key) {
@@ -317,6 +326,8 @@ static u32 flow_key_get(const struct sk_buff *skb, int key)
return flow_get_skgid(skb);
case FLOW_KEY_VLAN_TAG:
return flow_get_vlan_tag(skb);
+ case FLOW_KEY_CGROUP_CLASSID:
+ return flow_get_cgroup_classid(skb);
default:
WARN_ON(1);
return 0;
@@ -359,7 +370,12 @@ static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
classid %= f->divisor;

res->class = 0;
- res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
+
+ if (key == FLOW_KEY_CGROUP_CLASSID)
+ res->classid = TC_H_MAKE(f->baseclass, classid);
+ else
+ res->classid = TC_H_MAKE(f->baseclass,
+ f->baseclass + classid);

r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
diff --git a/net/socket.c b/net/socket.c
index 66c4a8c..b7421ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -93,6 +93,7 @@

#include <net/sock.h>
#include <linux/netfilter.h>
+#include <linux/cgroup_tc.h>

static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
@@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
if (err < 0)
goto out_module_put;

+ cgroup_tc_set_sock_classid(sock->sk);
+
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
@@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
if (err < 0)
goto out_fd;

+ cgroup_tc_set_sock_classid(newsock->sk);
+
if (upeer_sockaddr) {
if (newsock->ops->getname(newsock, (struct sockaddr *)address,
&len, 2) < 0) {

2008-07-23 22:06:21

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

On Tue, 22 Jul 2008 10:44:18 -0700 (PDT)
Ranjit Manomohan <[email protected]> wrote:

> [Take 3] -- Fixed additional comments from Patrick McHardy
>
> This patch provides a simple resource controller (cgroup_tc) based on the
> cgroups infrastructure to manage network traffic. The cgroup_tc resource
> controller can be used to schedule and shape traffic belonging to the task(s)
> in a particular cgroup.
>
> The implementation consists of two parts:
>
> 1) A resource controller (cgroup_tc) that is used to associate packets from
> a particular task belonging to a cgroup with a traffic control class id (
> tc_classid). This tc_classid is propagated to all sockets created by tasks
> in the cgroup and will be used for classifying packets at the link layer.
>
> 2) A modified traffic control classifier (cls_flow) that can classify packets
> based on the tc_classid field in the socket to specific destination classes.
>
> An example of the use of this resource controller would be to limit
> the traffic from all tasks from a file_server cgroup to 100Mbps. We could
> achieve this by doing:
>
> # make a cgroup of file transfer processes and assign it a uniqe classid
> # of 0x10 - this will be used lated to direct packets.
> mkdir -p /dev/cgroup
> mount -t cgroup tc -otc /dev/cgroup
> mkdir /dev/cgroup/file_transfer
> echo 0x10 > /dev/cgroup/file_transfer/tc.classid
> echo $PID_OF_FILE_XFER_PROCESS > /dev/cgroup/file_transfer/tasks
>
> # Now create a HTB class that rate limits traffic to 100mbits and attach
> # a filter to direct all traffic from cgroup file_transfer to this new class.
> tc qdisc add dev eth0 root handle 1: htb
> tc class add dev eth0 parent 1: classid 1:10 htb rate 100mbit ceil 100mbit
> tc filter add dev eth0 parent 1: handle 800 protocol ip prio 1 flow map key cgroup-classid baseclass 1:10
>
> Signed-off-by: Ranjit Manomohan <[email protected]>
>
> ---
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index e287745..4b12372 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -48,3 +48,9 @@ SUBSYS(devices)
> #endif
>
> /* */

Your email client is performing space-stuffing. It's easy enough to
fix at this end (s/^ / /g) but is a bit of a hassle.

> @@ -359,7 +370,12 @@ static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
> classid %= f->divisor;
>
> res->class = 0;
> - res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
> +
> + if (key == FLOW_KEY_CGROUP_CLASSID)
> + res->classid = TC_H_MAKE(f->baseclass, classid);
> + else
> + res->classid = TC_H_MAKE(f->baseclass,
> + f->baseclass + classid);

This causes a warning:

net/sched/cls_flow.c: In function 'flow_classify':
net/sched/cls_flow.c:344: warning: 'key' may be used uninitialized in this function

that warning is a non-issue if we happen to know that f->nkeys can
never be zero. I don't know if that is guaranteed at this code site?

If so, it would be nice to suppress the warning in some fashion.
uninitialized_var() is one way, but suitable code reorganisation is
preferred.

I note that flow_classify() has an on-stack dynamically-sized array:

u32 keys[f->nkeys];

I hope that f->nkeys is a) small and b) not user-controllable.

2008-07-23 22:34:45

by Patrick McHardy

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

Andrew Morton wrote:
> On Tue, 22 Jul 2008 10:44:18 -0700 (PDT)
> Ranjit Manomohan <[email protected]> wrote:
>
>> @@ -359,7 +370,12 @@ static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
>> classid %= f->divisor;
>>
>> res->class = 0;
>> - res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
>> +
>> + if (key == FLOW_KEY_CGROUP_CLASSID)
>> + res->classid = TC_H_MAKE(f->baseclass, classid);
>> + else
>> + res->classid = TC_H_MAKE(f->baseclass,
>> + f->baseclass + classid);
>
> This causes a warning:
>
> net/sched/cls_flow.c: In function 'flow_classify':
> net/sched/cls_flow.c:344: warning: 'key' may be used uninitialized in this function
>
> that warning is a non-issue if we happen to know that f->nkeys can
> never be zero. I don't know if that is guaranteed at this code site?

It is by the flow_change() function, but special casing the
CGROUP_CLASSID is not acceptable anyway. There should be no
need for that, a simple linear mapping to classids is done
by default in mapping mode, the sk_cgroup_classid simply
shouldn't include qdisc IDs.

2008-07-23 23:54:25

by Ranjit Manomohan

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

On Wed, Jul 23, 2008 at 3:34 PM, Patrick McHardy <[email protected]> wrote:
> Andrew Morton wrote:
>>
>> On Tue, 22 Jul 2008 10:44:18 -0700 (PDT)
>> Ranjit Manomohan <[email protected]> wrote:
>>
>>> @@ -359,7 +370,12 @@ static int flow_classify(struct sk_buff *skb, struct
>>> tcf_proto *tp,
>>> classid %= f->divisor;
>>>
>>> res->class = 0;
>>> - res->classid = TC_H_MAKE(f->baseclass, f->baseclass +
>>> classid);
>>> +
>>> + if (key == FLOW_KEY_CGROUP_CLASSID)
>>> + res->classid = TC_H_MAKE(f->baseclass, classid);
>>> + else
>>> + res->classid = TC_H_MAKE(f->baseclass,
>>> + f->baseclass + classid);
>>
>> This causes a warning:
>>
>> net/sched/cls_flow.c: In function 'flow_classify':
>> net/sched/cls_flow.c:344: warning: 'key' may be used uninitialized in this
>> function
>>
>> that warning is a non-issue if we happen to know that f->nkeys can
>> never be zero. I don't know if that is guaranteed at this code site?
>
> It is by the flow_change() function, but special casing the
> CGROUP_CLASSID is not acceptable anyway. There should be no
> need for that, a simple linear mapping to classids is done
> by default in mapping mode, the sk_cgroup_classid simply
> shouldn't include qdisc IDs.

I did not want to special case it but I want an identity mapping not a
linear one. For some reason a baseclass of X:0 is not allowed, and
there does not seem to be a clean way to get a 1-1 mapping (tc.classid
-> X:tc.classid). I would have to workaround it by using a baseclass
of the form X:Y and then subtracting Y from the value written to
tc.classid which seemed very non intuitive.

Any particular reason for this restriction? Am I missing any other
technique of getting a 1-1 mapping using the flow classifier?

-Thanks,
Ranjit

>

2008-07-24 00:18:29

by Patrick McHardy

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

Ranjit Manomohan wrote:
> On Wed, Jul 23, 2008 at 3:34 PM, Patrick McHardy <[email protected]> wrote:
>>
>> It is by the flow_change() function, but special casing the
>> CGROUP_CLASSID is not acceptable anyway. There should be no
>> need for that, a simple linear mapping to classids is done
>> by default in mapping mode, the sk_cgroup_classid simply
>> shouldn't include qdisc IDs.
>
> I did not want to special case it but I want an identity mapping not a
> linear one. For some reason a baseclass of X:0 is not allowed

An ID of X:0 identifies a qdisc, not a class, which is why this
isn't accepted.

> and
> there does not seem to be a clean way to get a 1-1 mapping (tc.classid
> -> X:tc.classid). I would have to workaround it by using a baseclass
> of the form X:Y and then subtracting Y from the value written to
> tc.classid which seemed very non intuitive.
>
> Any particular reason for this restriction? Am I missing any other
> technique of getting a 1-1 mapping using the flow classifier?

My suggestion to replace your classifier by cls_flow was wrong, sorry.

You can't do classification that don't use linear mappings or hash
distribution, but we really should support that. We should keep your
classifier (with the change to use skb->sk), but please extend it by
the standard classifier features like ematches, actions and policers
(see flow_classify() for a simple example). Adding the cgroup to
cls_flow still makes sense, but as regular key without the baseclass
modification.

2008-07-25 00:08:54

by Thomas Graf

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

* Ranjit Manomohan <[email protected]> 2008-07-22 10:44
> The implementation consists of two parts:
>
> 1) A resource controller (cgroup_tc) that is used to associate packets from
> a particular task belonging to a cgroup with a traffic control class id (
> tc_classid). This tc_classid is propagated to all sockets created by
> tasks
> in the cgroup and will be used for classifying packets at the link layer.
>
> 2) A modified traffic control classifier (cls_flow) that can classify
> packets
> based on the tc_classid field in the socket to specific destination
> classes.
>
> An example of the use of this resource controller would be to limit
> the traffic from all tasks from a file_server cgroup to 100Mbps. We could
> achieve this by doing:
>
> # make a cgroup of file transfer processes and assign it a uniqe classid
> # of 0x10 - this will be used lated to direct packets.
> mkdir -p /dev/cgroup
> mount -t cgroup tc -otc /dev/cgroup
> mkdir /dev/cgroup/file_transfer
> echo 0x10 > /dev/cgroup/file_transfer/tc.classid
> echo $PID_OF_FILE_XFER_PROCESS > /dev/cgroup/file_transfer/tasks
>
> # Now create a HTB class that rate limits traffic to 100mbits and attach
> # a filter to direct all traffic from cgroup file_transfer to this new
> class.
> tc qdisc add dev eth0 root handle 1: htb
> tc class add dev eth0 parent 1: classid 1:10 htb rate 100mbit ceil 100mbit
> tc filter add dev eth0 parent 1: handle 800 protocol ip prio 1 flow map key
> cgroup-classid baseclass 1:10

It might have been easier to simply write a classifier which maps pids
to classes. The interface could be as simple as two nested attributes,
ADD_MAPS, REMOVE_MAPS which both take lists of pid->class mappings to
either add or remove from the classifier.

I have been working on this over the past 2 weeks, it includes the
classifier as just stated, a cgroup module which sends notifications
about events as netlink messages and a daemon which creates qdiscs,
classes and filters on the fly according to the configured distribution.
It works both ingress (with some tricks) and egress.

IMHO, there is no point in a cgroup interface if the user has to create
qdiscs, classes and filters manually anyway.

2008-07-25 01:16:51

by Ranjit Manomohan

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

On Thu, Jul 24, 2008 at 4:45 PM, Thomas Graf <[email protected]> wrote:
> * Ranjit Manomohan <[email protected]> 2008-07-22 10:44
>> The implementation consists of two parts:
>>
>> 1) A resource controller (cgroup_tc) that is used to associate packets from
>> a particular task belonging to a cgroup with a traffic control class id (
>> tc_classid). This tc_classid is propagated to all sockets created by
>> tasks
>> in the cgroup and will be used for classifying packets at the link layer.
>>
>> 2) A modified traffic control classifier (cls_flow) that can classify
>> packets
>> based on the tc_classid field in the socket to specific destination
>> classes.
>>
>> An example of the use of this resource controller would be to limit
>> the traffic from all tasks from a file_server cgroup to 100Mbps. We could
>> achieve this by doing:
>>
>> # make a cgroup of file transfer processes and assign it a uniqe classid
>> # of 0x10 - this will be used lated to direct packets.
>> mkdir -p /dev/cgroup
>> mount -t cgroup tc -otc /dev/cgroup
>> mkdir /dev/cgroup/file_transfer
>> echo 0x10 > /dev/cgroup/file_transfer/tc.classid
>> echo $PID_OF_FILE_XFER_PROCESS > /dev/cgroup/file_transfer/tasks
>>
>> # Now create a HTB class that rate limits traffic to 100mbits and attach
>> # a filter to direct all traffic from cgroup file_transfer to this new
>> class.
>> tc qdisc add dev eth0 root handle 1: htb
>> tc class add dev eth0 parent 1: classid 1:10 htb rate 100mbit ceil 100mbit
>> tc filter add dev eth0 parent 1: handle 800 protocol ip prio 1 flow map key
>> cgroup-classid baseclass 1:10
>
> It might have been easier to simply write a classifier which maps pids
> to classes. The interface could be as simple as two nested attributes,
> ADD_MAPS, REMOVE_MAPS which both take lists of pid->class mappings to
> either add or remove from the classifier.
>

The current patch is extremely lightweight and has virtually no
performance overhead compared to a more dynamic technique like you
suggest.

> I have been working on this over the past 2 weeks, it includes the
> classifier as just stated, a cgroup module which sends notifications
> about events as netlink messages and a daemon which creates qdiscs,
> classes and filters on the fly according to the configured distribution.
> It works both ingress (with some tricks) and egress.

I will send a follow up patch that handles ingress as well which
should be a fairly simple addition to the current scheme. IMO it may
be preferable not to tie the implementation to any specific dependency
in user space leaving it maximum flexibility. Our cluster management
component sets up these rules depending upon the configuration and we
have this scheme working in our clusters for quite some time with no
issues.

>
> IMHO, there is no point in a cgroup interface if the user has to create
> qdiscs, classes and filters manually anyway.

In my view it is a trade off to allow more flexibility in the
configuration. I would think someone configuring the current tc setup
in Linux is already pretty knowledgeable about its working and can do
this extra step without much difficulty.

That said I will look for your alternative implementation to compare
the benefits.

-Thanks for your review and comments,
Ranjit

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

2008-07-25 01:18:45

by Paul Menage

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

On Thu, Jul 24, 2008 at 7:45 PM, Thomas Graf <[email protected]> wrote:
>
> It might have been easier to simply write a classifier which maps pids
> to classes. The interface could be as simple as two nested attributes,
> ADD_MAPS, REMOVE_MAPS which both take lists of pid->class mappings to
> either add or remove from the classifier.

You mean as processes fork/exit or move between cgroups you have to
update the pid->class mappings in the kernel's filter? That sounds way
too fragile to me.

>
> I have been working on this over the past 2 weeks, it includes the
> classifier as just stated, a cgroup module which sends notifications
> about events

What types of events? We discussed how to send cgroup notifications to
userspace in the containers mini-summit on Tuesday. Netlink was one of
the options discussed, but suffers from the problem that netlink
sockets are tied to a particular network namespaces. The solution that
seemed most favoured was to have pollable cgroup control files that
represent events (and optionally support event data via a fifo).

> as netlink messages and a daemon which creates qdiscs,
> classes and filters on the fly according to the configured distribution.

> It works both ingress (with some tricks) and egress.
>
> IMHO, there is no point in a cgroup interface if the user has to create
> qdiscs, classes and filters manually anyway.
>

The user can use whatever middleware they want (e.g. your daemon,
libcg, etc) to set up qdiscs and classes. I don't think that requiring
any particular userspace implementation is the right way to go. The
point of this patch was to provide a minimal way to tag
sockets/packets as belonging to a particular cgroup, in order to make
use of the existing traffic controll APIs.

Paul

2008-07-25 09:29:48

by Thomas Graf

[permalink] [raw]

Subject: Re: [PATCH] Traffic control cgroups subsystem

* Ranjit Manomohan <[email protected]> 2008-07-24 18:16
> I will send a follow up patch that handles ingress as well which
> should be a fairly simple addition to the current scheme.

It is not that simple, neither dst nor socket has been looked up
where netfilter gives the possibility to build a queue using ifb.
I chose to shape just before the skb is put on the socket queue
but that also required some tricks and has to be done for every
protocol separately.

> IMO it may
> be preferable not to tie the implementation to any specific dependency
> in user space leaving it maximum flexibility. Our cluster management
> component sets up these rules depending upon the configuration and we
> have this scheme working in our clusters for quite some time with no
> issues.

I never even mentioned a dependency on anything. Whether such a daemon
is being run or not is up to the user. It is absolutely irrelevant what
your cluster management component does unless you open up the code.

> In my view it is a trade off to allow more flexibility in the
> configuration. I would think someone configuring the current tc setup
> in Linux is already pretty knowledgeable about its working and can do
> this extra step without much difficulty.

The one does not exclude the other but even with good documentation,
configuring or adapting tc configurations is a heavy task for many
users and will prevent many from using this feature. Having a daemon
create and modify a tc tree does not hinder the experienced user from
making custom modifications.

> That said I will look for your alternative implementation to compare
> the benefits.

Thanks, I will post my patches as soon as the next feature window opens.

* Paul Menage <[email protected]> 2008-07-24 21:18
> You mean as processes fork/exit or move between cgroups you have to
> update the pid->class mappings in the kernel's filter? That sounds way
> too fragile to me.

No, not at all. The classifier registers as cgroup subsystem and updates
the mappings automatically if the pid has been added by the user.

> What types of events? We discussed how to send cgroup notifications to
> userspace in the containers mini-summit on Tuesday. Netlink was one of
> the options discussed, but suffers from the problem that netlink
> sockets are tied to a particular network namespaces. The solution that
> seemed most favoured was to have pollable cgroup control files that
> represent events (and optionally support event data via a fifo).

Currently I broadcast on all namespaces by iterating over them but I may
remove them again altogether. I only use the notifications to decide
wehther a cgroup has at least one task at the moment.

> The user can use whatever middleware they want (e.g. your daemon,
> libcg, etc) to set up qdiscs and classes. I don't think that requiring
> any particular userspace implementation is the right way to go. The
> point of this patch was to provide a minimal way to tag
> sockets/packets as belonging to a particular cgroup, in order to make
> use of the existing traffic controll APIs.

This patch certainly does have value. Since it won't be a problem for
people to use one over another I see no problem it multiple solutions
to coexist.