2021-08-02 20:52:04

by Peilin Ye

[permalink] [raw]
Subject: [PATCH net-next 1/2] net/sched: sch_ingress: Support clsact egress mini-Qdisc option

From: Peilin Ye <[email protected]>

If the ingress Qdisc is in use, currently it is not possible to add
another clsact egress mini-Qdisc to the same device without taking down
the ingress Qdisc, since both sch_ingress and sch_clsact use the same
handle (0xFFFF0000).

Add a "change" option for sch_ingress, so that users can enable or disable
a clsact egress mini-Qdisc, without suffering from downtime:

$ tc qdisc add dev eth0 ingress
$ tc qdisc change dev eth0 ingress clsact-on

Then users can add filters to the egress mini-Qdisc as usual:

$ tc filter add dev eth0 egress protocol ip prio 10 \
matchall action skbmod swap mac

Deleting the ingress Qdisc removes the egress mini-Qdisc as well. To
remove only the egress mini-Qdisc, use:

$ tc qdisc change dev eth0 ingress clsact-off

Finally, if the egress mini-Qdisc is enabled, the "show" command will
print out a "clsact" flag to indicate it:

$ tc qdisc show ingress
qdisc ingress ffff: dev eth0 parent ffff:fff1 ----------------
$ tc qdisc change dev eth0 ingress clsact-on
$ tc qdisc show ingress
qdisc ingress ffff: dev eth0 parent ffff:fff1 ---------------- clsact

Reviewed-by: Cong Wang <[email protected]>
Signed-off-by: Peilin Ye <[email protected]>
---
include/uapi/linux/pkt_sched.h | 12 +++++
net/sched/sch_ingress.c | 92 ++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 79a699f106b1..cb0eb5dd848a 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -586,6 +586,18 @@ enum {

#define TCA_ATM_MAX (__TCA_ATM_MAX - 1)

+/* INGRESS section */
+
+enum {
+ TCA_INGRESS_UNSPEC,
+ TCA_INGRESS_FLAGS,
+#define TC_INGRESS_CLSACT _BITUL(0) /* enable clsact egress mini-Qdisc */
+#define TC_INGRESS_SUPPORTED_FLAGS TC_INGRESS_CLSACT
+ __TCA_INGRESS_MAX,
+};
+
+#define TCA_INGRESS_MAX (__TCA_INGRESS_MAX - 1)
+
/* Network emulator */

enum {
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 84838128b9c5..96e00e9e727b 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -16,8 +16,12 @@

struct ingress_sched_data {
struct tcf_block *block;
+ struct tcf_block *egress_block;
struct tcf_block_ext_info block_info;
+ struct tcf_block_ext_info egress_block_info;
struct mini_Qdisc_pair miniqp;
+ struct mini_Qdisc_pair miniqp_egress;
+ bool clsact;
};

static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
@@ -27,6 +31,11 @@ static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)

static unsigned long ingress_find(struct Qdisc *sch, u32 classid)
{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ if (q->clsact && TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS))
+ return TC_H_MIN(TC_H_MIN_EGRESS);
+
return TC_H_MIN(classid) + 1;
}

@@ -49,6 +58,9 @@ static struct tcf_block *ingress_tcf_block(struct Qdisc *sch, unsigned long cl,
{
struct ingress_sched_data *q = qdisc_priv(sch);

+ if (q->clsact && cl == TC_H_MIN(TC_H_MIN_EGRESS))
+ return q->egress_block;
+
return q->block;
}

@@ -66,6 +78,14 @@ static void ingress_ingress_block_set(struct Qdisc *sch, u32 block_index)
q->block_info.block_index = block_index;
}

+static void ingress_egress_block_set(struct Qdisc *sch, u32 block_index)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ if (q->clsact)
+ q->egress_block_info.block_index = block_index;
+}
+
static u32 ingress_ingress_block_get(struct Qdisc *sch)
{
struct ingress_sched_data *q = qdisc_priv(sch);
@@ -73,6 +93,13 @@ static u32 ingress_ingress_block_get(struct Qdisc *sch)
return q->block_info.block_index;
}

+static u32 ingress_egress_block_get(struct Qdisc *sch)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+
+ return q->clsact ? q->egress_block_info.block_index : 0;
+}
+
static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -103,16 +130,78 @@ static void ingress_destroy(struct Qdisc *sch)

tcf_block_put_ext(q->block, sch, &q->block_info);
net_dec_ingress_queue();
+
+ if (q->clsact) {
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+ net_dec_egress_queue();
+ }
+}
+
+static const struct nla_policy ingress_policy[TCA_INGRESS_MAX + 1] = {
+ [TCA_INGRESS_FLAGS] = NLA_POLICY_BITFIELD32(TC_INGRESS_SUPPORTED_FLAGS),
+};
+
+static int ingress_change(struct Qdisc *sch, struct nlattr *arg, struct netlink_ext_ack *extack)
+{
+ struct ingress_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_INGRESS_MAX + 1];
+ struct nla_bitfield32 flags;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, TCA_INGRESS_MAX, arg, ingress_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_INGRESS_FLAGS])
+ return -EINVAL;
+
+ flags = nla_get_bitfield32(tb[TCA_INGRESS_FLAGS]);
+
+ if (flags.value & TC_INGRESS_CLSACT) {
+ if (q->clsact)
+ return -EEXIST;
+
+ /* enable clsact egress mini-Qdisc */
+ mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
+
+ q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
+ q->egress_block_info.chain_head_change = clsact_chain_head_change;
+ q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
+
+ err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info, extack);
+ if (err)
+ return err;
+
+ net_inc_egress_queue();
+ q->clsact = true;
+ } else {
+ if (!q->clsact)
+ return -ENOENT;
+
+ /* disable clsact egress mini-Qdisc */
+ tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
+
+ net_dec_egress_queue();
+ q->clsact = false;
+ }
+
+ return 0;
}

static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
+ struct ingress_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;

nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;

+ if (nla_put_bitfield32(skb, TCA_INGRESS_FLAGS, q->clsact ? TC_INGRESS_CLSACT : 0,
+ TC_INGRESS_SUPPORTED_FLAGS))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nest);

nla_put_failure:
@@ -137,9 +226,12 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.static_flags = TCQ_F_CPUSTATS,
.init = ingress_init,
.destroy = ingress_destroy,
+ .change = ingress_change,
.dump = ingress_dump,
.ingress_block_set = ingress_ingress_block_set,
+ .egress_block_set = ingress_egress_block_set,
.ingress_block_get = ingress_ingress_block_get,
+ .egress_block_get = ingress_egress_block_get,
.owner = THIS_MODULE,
};

--
2.20.1



2021-08-02 20:52:15

by Peilin Ye

[permalink] [raw]
Subject: [PATCH net-next 2/2] tc-testing/ingress: Add control-plane selftests for clsact egress mini-Qdisc option

From: Peilin Ye <[email protected]>

Recently we added a new clsact egress mini-Qdisc option for sch_ingress.
Add a few control-plane tdc.py selftests for it.

Depends on kernel patch "net/sched: sch_ingress: Support clsact egress
mini-Qdisc option", as well as iproute2 patch "tc/ingress: Introduce
clsact egress mini-Qdisc option".

Reviewed-by: Cong Wang <[email protected]>
Signed-off-by: Peilin Ye <[email protected]>
---
.../tc-testing/tc-tests/qdiscs/ingress.json | 84 +++++++++++++++++++
1 file changed, 84 insertions(+)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json
index d99dba6e2b1a..2cde11b2ea9b 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json
@@ -98,5 +98,89 @@
"teardown": [
"$IP link del dev $DUMMY type dummy"
]
+ },
+ {
+ "id": "8e8c",
+ "name": "Enable clsact egress mini-qdisc for ingress",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY ingress clsact-on",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:.*clsact",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "3a76",
+ "name": "Disable clsact egress mini-qdisc for ingress",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC qdisc change dev $DUMMY ingress clsact-on"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY ingress clsact-off",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:.*clsact",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7b2b",
+ "name": "Enable clsact egress mini-qdisc for ingress twice",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC qdisc change dev $DUMMY ingress clsact-on"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY ingress clsact-on",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:.*clsact",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "05ab",
+ "name": "Disable clsact egress mini-qdisc for ingress twice",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC qdisc change dev $DUMMY ingress clsact-on",
+ "$TC qdisc change dev $DUMMY ingress clsact-off"
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DUMMY ingress clsact-off",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:.*clsact",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
}
]
--
2.20.1


2021-08-02 21:13:30

by Daniel Borkmann

[permalink] [raw]
Subject: Re: [PATCH net-next 1/2] net/sched: sch_ingress: Support clsact egress mini-Qdisc option

On 8/2/21 10:49 PM, Peilin Ye wrote:
> From: Peilin Ye <[email protected]>
>
> If the ingress Qdisc is in use, currently it is not possible to add
> another clsact egress mini-Qdisc to the same device without taking down
> the ingress Qdisc, since both sch_ingress and sch_clsact use the same
> handle (0xFFFF0000).
>
> Add a "change" option for sch_ingress, so that users can enable or disable
> a clsact egress mini-Qdisc, without suffering from downtime:
>
> $ tc qdisc add dev eth0 ingress
> $ tc qdisc change dev eth0 ingress clsact-on
>
> Then users can add filters to the egress mini-Qdisc as usual:
>
> $ tc filter add dev eth0 egress protocol ip prio 10 \
> matchall action skbmod swap mac
>
> Deleting the ingress Qdisc removes the egress mini-Qdisc as well. To
> remove egress mini-Qdisc only, use:
>
> $ tc qdisc change dev eth0 ingress clsact-off
>
> Finally, if the egress mini-Qdisc is enabled, the "show" command will
> print out a "clsact" flag to indicate it:
>
> $ tc qdisc show ingress
> qdisc ingress ffff: dev eth0 parent ffff:fff1 ----------------
> $ tc qdisc change dev eth0 ingress clsact-on
> $ tc qdisc show ingress
> qdisc ingress ffff: dev eth0 parent ffff:fff1 ---------------- clsact
>
> Reviewed-by: Cong Wang <[email protected]>
> Signed-off-by: Peilin Ye <[email protected]>

NAK, just use clsact qdisc in the first place which has both ingress and egress
support instead of adding such hack. You already need to change your scripts for
clsact-on, so just swap 'tc qdisc add dev eth0 ingress' to 'tc qdisc add dev eth0
clsact' w/o needing to change kernel.

Thanks,
Daniel

2021-08-03 00:10:08

by Cong Wang

[permalink] [raw]
Subject: Re: [PATCH net-next 1/2] net/sched: sch_ingress: Support clsact egress mini-Qdisc option

On Mon, Aug 2, 2021 at 2:11 PM Daniel Borkmann <[email protected]> wrote:
>
> NAK, just use clsact qdisc in the first place which has both ingress and egress
> support instead of adding such hack. You already need to change your scripts for
> clsact-on, so just swap 'tc qdisc add dev eth0 ingress' to 'tc qdisc add dev eth0
> clsact' w/o needing to change kernel.

If we were able to change the "script" as easily as you described,
you would not even see such a patch. The fact is it is not under
our control; the most we can do is change the qdisc after it is
created by the "script", ideally without interfering with its traffic,
hence we have such a patch.

(BTW, it is actually not a script, it is a cloud platform.)

Thanks.

2021-08-03 08:10:42

by Daniel Borkmann

[permalink] [raw]
Subject: Re: [PATCH net-next 1/2] net/sched: sch_ingress: Support clsact egress mini-Qdisc option

On 8/3/21 2:08 AM, Cong Wang wrote:
> On Mon, Aug 2, 2021 at 2:11 PM Daniel Borkmann <[email protected]> wrote:
>>
>> NAK, just use clsact qdisc in the first place which has both ingress and egress
>> support instead of adding such hack. You already need to change your scripts for
>> clsact-on, so just swap 'tc qdisc add dev eth0 ingress' to 'tc qdisc add dev eth0
>> clsact' w/o needing to change kernel.
>
> If we were able to change the "script" as easily as you described,
> you would not even see such a patch. The fact is it is not under
> our control, the most we can do is change the qdisc after it is
> created by the "script", ideally without interfering its traffic,
> hence we have such a patch.
>
> (BTW, it is actually not a script, it is a cloud platform.)

Sigh, so you're trying to solve a non-technical issue with one cloud provider by
taking a detour and unnecessarily extending the kernel with functionality
that already exists in another qdisc (and potentially waiting a few years until they
eventually upgrade). I presume Bytedance should be a big enough entity to make a
case for that provider to change it. After all, swapping ingress with clsact for
such a script is completely transparent and there is nothing that would break. (Fwiw,
with all the major cloud providers we have never seen such an issue in our deployments.)

Thanks,
Daniel

2021-08-04 20:45:32

by Cong Wang

[permalink] [raw]
Subject: Re: [PATCH net-next 1/2] net/sched: sch_ingress: Support clsact egress mini-Qdisc option

On Tue, Aug 3, 2021 at 1:08 AM Daniel Borkmann <[email protected]> wrote:
>
> On 8/3/21 2:08 AM, Cong Wang wrote:
> > On Mon, Aug 2, 2021 at 2:11 PM Daniel Borkmann <[email protected]> wrote:
> >>
> >> NAK, just use clsact qdisc in the first place which has both ingress and egress
> >> support instead of adding such hack. You already need to change your scripts for
> >> clsact-on, so just swap 'tc qdisc add dev eth0 ingress' to 'tc qdisc add dev eth0
> >> clsact' w/o needing to change kernel.
> >
> > If we were able to change the "script" as easily as you described,
> > you would not even see such a patch. The fact is it is not under
> > our control, the most we can do is change the qdisc after it is
> > created by the "script", ideally without interfering its traffic,
> > hence we have such a patch.
> >
> > (BTW, it is actually not a script, it is a cloud platform.)
>
> Sigh, so you're trying to solve a non-technical issue with one cloud provider by
> taking a detour for unnecessarily extending the kernel instead with functionality
> that already exists in another qdisc (and potentially waiting few years until they
> eventually upgrade). I presume Bytedance should be a big enough entity to make a
> case for that provider to change it. After all swapping ingress with clsact for
> such script is completely transparent and there is nothing that would break. (Fwiw,
> from all the major cloud providers we have never seen such issue in our deployments.)

Well, it is both non-technical and technical at the same time.

The non-technical part is that it is really hard to convince people from
other teams to restart their services just for a kernel change; people are just
not happy to take risks.

The technical part is the bad design of clsact. It is too late to complain,
but it should not create two _conceptual_ qdiscs (actually just one struct
Qdisc) at the same time. If it created only the egress one, we would
not even bother changing ingress at all. Sigh.

Thanks.