nftables replaces iptables, but it lacks memcg accounting.
This patch account most of the memory allocation associated with nft
and should protect the host from misusing nft inside a memcg restricted
container.
Signed-off-by: Vasily Averin <[email protected]>
---
net/netfilter/core.c | 2 +-
net/netfilter/nf_tables_api.c | 44 +++++++++++++++++------------------
2 files changed, 23 insertions(+), 23 deletions(-)
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 8a77a3fd69bc..77ae3e8d344c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -58,7 +58,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
if (num == 0)
return NULL;
- e = kvzalloc(alloc, GFP_KERNEL);
+ e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
if (e)
e->num_hook_entries = num;
return e;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d71a33ae39b3..04be94236a34 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1113,16 +1113,16 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
}
err = -ENOMEM;
- table = kzalloc(sizeof(*table), GFP_KERNEL);
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(attr, GFP_KERNEL);
+ table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
if (nla[NFTA_TABLE_USERDATA]) {
- table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
if (table->udata == NULL)
goto err_table_udata;
@@ -1803,7 +1803,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
struct nft_hook *hook;
int err;
- hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+ hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
if (!hook) {
err = -ENOMEM;
goto err_hook_alloc;
@@ -2026,7 +2026,7 @@ static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
if (size > INT_MAX)
return NULL;
- blob = kvmalloc(size, GFP_KERNEL);
+ blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
if (!blob)
return NULL;
@@ -2126,7 +2126,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (err < 0)
return err;
- basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
if (basechain == NULL) {
nft_chain_release_hook(&hook);
return -ENOMEM;
@@ -2156,7 +2156,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (flags & NFT_CHAIN_HW_OFFLOAD)
return -EOPNOTSUPP;
- chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
@@ -2169,7 +2169,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
chain->table = table;
if (nla[NFTA_CHAIN_NAME]) {
- chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
} else {
if (!(flags & NFT_CHAIN_BINDING)) {
err = -EINVAL;
@@ -2177,7 +2177,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
- chain->name = kstrdup(name, GFP_KERNEL);
+ chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
}
if (!chain->name) {
@@ -2186,7 +2186,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
}
if (nla[NFTA_CHAIN_USERDATA]) {
- chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL);
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
if (chain->udata == NULL) {
err = -ENOMEM;
goto err_destroy_chain;
@@ -2349,7 +2349,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
char *name;
err = -ENOMEM;
- name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
goto err;
@@ -2797,7 +2797,7 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
goto err1;
err = -ENOMEM;
- expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
+ expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
if (expr == NULL)
goto err2;
@@ -3405,7 +3405,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
}
err = -ENOMEM;
- rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
if (rule == NULL)
goto err_release_expr;
@@ -3818,7 +3818,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
free_page((unsigned long)inuse);
}
- set->name = kasprintf(GFP_KERNEL, name, min + n);
+ set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
if (!set->name)
return -ENOMEM;
@@ -4382,11 +4382,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
alloc_size = sizeof(*set) + size + udlen;
if (alloc_size < size || alloc_size > INT_MAX)
return -ENOMEM;
- set = kvzalloc(alloc_size, GFP_KERNEL);
+ set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!set)
return -ENOMEM;
- name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
err = -ENOMEM;
goto err_set_name;
@@ -5921,7 +5921,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, elem.data.val.data,
- timeout, expiration, GFP_KERNEL);
+ timeout, expiration, GFP_KERNEL_ACCOUNT);
if (elem.priv == NULL)
goto err_parse_data;
@@ -6165,7 +6165,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, NULL, 0, 0,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (elem.priv == NULL)
goto fail_elem;
@@ -6477,7 +6477,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
}
err = -ENOMEM;
- obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+ obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
if (!obj)
goto err2;
@@ -6643,7 +6643,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
- obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+ obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
if (!obj->key.name) {
err = -ENOMEM;
goto err_strdup;
@@ -7404,7 +7404,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+ flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
if (!flowtable)
return -ENOMEM;
@@ -7412,7 +7412,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
flowtable->handle = nf_tables_alloc_handle(table);
INIT_LIST_HEAD(&flowtable->hook_list);
- flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+ flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
if (!flowtable->name) {
err = -ENOMEM;
goto err1;
--
2.25.1
On 3/28/22 11:15, Pablo Neira Ayuso wrote:
> I think nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
> use this _ACCOUNT flag variant for objects that are dinamically
> allocated from the packet path.
Thank you for the hint.
I think you're right in general, such objects should be accounted too.
Though this requires additional research, because it is not clear for me,
where proper memcg can be found.
In case of nft it was quite easy, memcg was taken from current task.
However, the objects you specified seem can be allocated in other contexts.
Thank you,
Vasily Averin
On Thu, Mar 24, 2022 at 09:05:50PM +0300, Vasily Averin wrote:
> nftables replaces iptables, but it lacks memcg accounting.
>
> This patch account most of the memory allocation associated with nft
> and should protect the host from misusing nft inside a memcg restricted
> container.
Applied to nf, thanks
I think nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
use this _ACCOUNT flag variant for objects that are dinamically
allocated from the packet path.
nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
use __GFP_ACCOUNT flag for objects that are dynamically
allocated from the packet path.
Such objects are allocated inside .init() or .clone() callbacks
of struct nft_expr_ops executed in task context while processing
netlink messages.
In addition, this patch adds accounting to nft_set_elem_expr_clone()
used for the same purposes.
Signed-off-by: Vasily Averin <[email protected]>
---
net/netfilter/nf_tables_api.c | 2 +-
net/netfilter/nft_connlimit.c | 4 ++--
net/netfilter/nft_counter.c | 4 ++--
net/netfilter/nft_last.c | 4 ++--
net/netfilter/nft_limit.c | 4 ++--
net/netfilter/nft_quota.c | 4 ++--
6 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 04be94236a34..e01241151ef7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5447,7 +5447,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
int err, i, k;
for (i = 0; i < set->num_exprs; i++) {
- expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
if (!expr)
goto err_expr;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 3362417ebfdb..9c2146aac59e 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -77,7 +77,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
if (!priv->list)
return -ENOMEM;
@@ -214,7 +214,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC | __GFP_ACCOUNT);
if (!priv_dst->list)
return -ENOMEM;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f179e8c3b0ca..040a697d96b3 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -62,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
@@ -235,7 +235,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
nft_counter_fetch(priv, &total);
- cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC | __GFP_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 4f745a409d34..4cf4f6349855 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -30,7 +30,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
u64 last_jiffies;
int err;
- last = kzalloc(sizeof(*last), GFP_KERNEL);
+ last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
if (!last)
return -ENOMEM;
@@ -105,7 +105,7 @@ static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
{
struct nft_last_priv *priv_dst = nft_expr_priv(dst);
- priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC | __GFP_ACCOUNT);
if (!priv_dst->last)
return -ENOMEM;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index a726b623963d..d7779d5f3cbb 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -90,7 +90,7 @@ static int nft_limit_init(struct nft_limit_priv *priv,
priv->rate);
}
- priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL);
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
if (!priv->limit)
return -ENOMEM;
@@ -144,7 +144,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst,
priv_dst->burst = priv_src->burst;
priv_dst->invert = priv_src->invert;
- priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC | __GFP_ACCOUNT);
if (!priv_dst->limit)
return -ENOMEM;
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index f394a0b562f6..2def4d92a170 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -90,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
- priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL);
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
if (!priv->consumed)
return -ENOMEM;
@@ -236,7 +236,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
{
struct nft_quota *priv_dst = nft_expr_priv(dst);
- priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC | __GFP_ACCOUNT);
if (!priv_dst->consumed)
return -ENOMEM;
--
2.31.1
On Thu, Mar 31, 2022 at 11:40:23AM +0300, Vasily Averin wrote:
> nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
> use __GFP_ACCOUNT flag for objects that are dynamically
> allocated from the packet path.
>
> Such objects are allocated inside .init() or .clone() callbacks
> of struct nft_expr_ops executed in task context while processing
> netlink messages.
>
> In addition, this patch adds accounting to nft_set_elem_expr_clone()
> used for the same purposes.
The patch looks good to me. The only concern I have is a potential
performance regression. Are any of these allocations happening on really
hot paths? From a very brief look it seems like the answer is no,
but I'm not well familiar with the netfilter code. Would be nice to
have a confirmation from somebody working on the netfilter.
We are roughly talking about x2 slower memory allocations, which
is usually not a problem at all, given that allocations are still
very cheap.
Please, feel free to add my
Acked-by: Roman Gushchin <[email protected]>
Thanks!
>
> Signed-off-by: Vasily Averin <[email protected]>
> ---
> net/netfilter/nf_tables_api.c | 2 +-
> net/netfilter/nft_connlimit.c | 4 ++--
> net/netfilter/nft_counter.c | 4 ++--
> net/netfilter/nft_last.c | 4 ++--
> net/netfilter/nft_limit.c | 4 ++--
> net/netfilter/nft_quota.c | 4 ++--
> 6 files changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 04be94236a34..e01241151ef7 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -5447,7 +5447,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
> int err, i, k;
>
> for (i = 0; i < set->num_exprs; i++) {
> - expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
> + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
> if (!expr)
> goto err_expr;
>
> diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
> index 3362417ebfdb..9c2146aac59e 100644
> --- a/net/netfilter/nft_connlimit.c
> +++ b/net/netfilter/nft_connlimit.c
> @@ -77,7 +77,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
> invert = true;
> }
>
> - priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL);
> + priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
> if (!priv->list)
> return -ENOMEM;
>
> @@ -214,7 +214,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
> struct nft_connlimit *priv_dst = nft_expr_priv(dst);
> struct nft_connlimit *priv_src = nft_expr_priv(src);
>
> - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
> + priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC | __GFP_ACCOUNT);
> if (!priv_dst->list)
> return -ENOMEM;
>
> diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
> index f179e8c3b0ca..040a697d96b3 100644
> --- a/net/netfilter/nft_counter.c
> +++ b/net/netfilter/nft_counter.c
> @@ -62,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
> struct nft_counter __percpu *cpu_stats;
> struct nft_counter *this_cpu;
>
> - cpu_stats = alloc_percpu(struct nft_counter);
> + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
> if (cpu_stats == NULL)
> return -ENOMEM;
>
> @@ -235,7 +235,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
>
> nft_counter_fetch(priv, &total);
>
> - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
> + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC | __GFP_ACCOUNT);
> if (cpu_stats == NULL)
> return -ENOMEM;
>
> diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
> index 4f745a409d34..4cf4f6349855 100644
> --- a/net/netfilter/nft_last.c
> +++ b/net/netfilter/nft_last.c
> @@ -30,7 +30,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
> u64 last_jiffies;
> int err;
>
> - last = kzalloc(sizeof(*last), GFP_KERNEL);
> + last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
> if (!last)
> return -ENOMEM;
>
> @@ -105,7 +105,7 @@ static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
> {
> struct nft_last_priv *priv_dst = nft_expr_priv(dst);
>
> - priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
> + priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC | __GFP_ACCOUNT);
> if (!priv_dst->last)
> return -ENOMEM;
>
> diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
> index a726b623963d..d7779d5f3cbb 100644
> --- a/net/netfilter/nft_limit.c
> +++ b/net/netfilter/nft_limit.c
> @@ -90,7 +90,7 @@ static int nft_limit_init(struct nft_limit_priv *priv,
> priv->rate);
> }
>
> - priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL);
> + priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
> if (!priv->limit)
> return -ENOMEM;
>
> @@ -144,7 +144,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst,
> priv_dst->burst = priv_src->burst;
> priv_dst->invert = priv_src->invert;
>
> - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
> + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC | __GFP_ACCOUNT);
> if (!priv_dst->limit)
> return -ENOMEM;
>
> diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
> index f394a0b562f6..2def4d92a170 100644
> --- a/net/netfilter/nft_quota.c
> +++ b/net/netfilter/nft_quota.c
> @@ -90,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
> return -EOPNOTSUPP;
> }
>
> - priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL);
> + priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
> if (!priv->consumed)
> return -ENOMEM;
>
> @@ -236,7 +236,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
> {
> struct nft_quota *priv_dst = nft_expr_priv(dst);
>
> - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
> + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC | __GFP_ACCOUNT);
> if (!priv_dst->consumed)
> return -ENOMEM;
>
> --
> 2.31.1
>
On 4/1/22 22:31, Florian Westphal wrote:
> Vasily Averin <[email protected]> wrote:
>>> Same problem as connlimit, can be called from packet path.
>>> Basically all GFP_ATOMIC are suspicious.
>>>
>>> Not sure how to resolve this, similar mechanics in iptables world (e.g.
>>> connlimit or SET target) don't use memcg accounting.
>>>
>>> Perhaps for now resend with only the GFP_KERNEL parts converted?
>>> Those are safe.
>>
>> It is safe for packet path too, _ACCOUNT allocation will not be able to find memcg
>> in case of "!in_task()" context.
>> On the other hand any additional checks on such path will affect performance.
>
> I'm not sure this works with ksoftirqd serving network stack?
Please take look at memcg_kmem_bypass() called from
memcg_slab_pre_alloc_hook -> get_obj_cgroup_from_current
By default memcg accounting does not work for any kernel threads.
If required thread can use set_active_memcg() but at present it is not widely used.
>> Could you please estimate how often is this code used in the case of nft vs packet path?
>
> It depends on user configuration.
> Update from packet path is used for things like port knocking or other
> dyanamic filter lists, or somehing like Limiting connections to x-per-address/subnet and so on.
Ok, I think we can skip accounting in such cases at the moment.
I doubt it can be misused and consume significant piece of host memory.
So I'm going to resend the patch w/o accounting in all .clone callbacks.
>> If the opposite is the case, then I can add __GFP_ACCOUNT flag depending on in_task() check.
>
> But what task/memcg is used for the accounting in that case?
Thanks to Roman for the explanation in concurrent thread.
Thank you,
Vasily Averin
Roman Gushchin <[email protected]> wrote:
> On Fri, Apr 01, 2022 at 09:31:59PM +0200, Florian Westphal wrote:
> > But what task/memcg is used for the accounting in that case?
>
> Root memcg/no accounting, which is the same.
>
> There is a way to account for a specific memcg in such cases, it's used for
> bpf maps, for example. We save a pointer to the memcg which created the map and
> charge it for all allocations from a !in_task context.
Great, so we could use same scheme later on if its required for some
use case.
> so let's not do without regression tests and a serious need.
Sounds good. Thanks.
On 4/1/22 15:03, Florian Westphal wrote:
> Vasily Averin <[email protected]> wrote:
>> nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
>> use __GFP_ACCOUNT flag for objects that are dynamically
>> allocated from the packet path.
>>
>> Such objects are allocated inside .init() or .clone() callbacks
>> of struct nft_expr_ops executed in task context while processing
>> netlink messages.
>
> They can also be called from packet path.
>> @@ -214,7 +214,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
>> struct nft_connlimit *priv_dst = nft_expr_priv(dst);
>> struct nft_connlimit *priv_src = nft_expr_priv(src);
>>
>> - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
>> + priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC | __GFP_ACCOUNT);
>
> This can be called from packet path, via nft_dynset.c.
>
> nft_do_chain -> nft_dynset_eval -> nft_dynset_new ->
> nft_dynset_expr_setup -> nft_expr_clone -> src->ops->clone()
>
Thank you, I noticed this case but did not understand that it is related to packet path.
>> @@ -235,7 +235,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
>>
>> nft_counter_fetch(priv, &total);
>>
>> - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
>> + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC | __GFP_ACCOUNT);
>> if (cpu_stats == NULL)
>> return -ENOMEM;
>
> Same problem as connlimit, can be called from packet path.
> Basically all GFP_ATOMIC are suspicious.
>
> Not sure how to resolve this, similar mechanics in iptables world (e.g.
> connlimit or SET target) don't use memcg accounting.
>
> Perhaps for now resend with only the GFP_KERNEL parts converted?
> Those are safe.
It is safe for packet path too, _ACCOUNT allocation will not be able to find memcg
in case of "!in_task()" context.
On the other hand any additional checks on such path will affect performance.
Could you please estimate how often is this code used in the case of nft vs packet path?
If packet path is rare case I think we can keep current code as is.
If the opposite is the case, then I can add __GFP_ACCOUNT flag depending on in_task() check.
Thank you,
Vasily Averin
On Fri, Apr 01, 2022 at 09:31:59PM +0200, Florian Westphal wrote:
> Vasily Averin <[email protected]> wrote:
> > > Same problem as connlimit, can be called from packet path.
> > > Basically all GFP_ATOMIC are suspicious.
> > >
> > > Not sure how to resolve this, similar mechanics in iptables world (e.g.
> > > connlimit or SET target) don't use memcg accounting.
> > >
> > > Perhaps for now resend with only the GFP_KERNEL parts converted?
> > > Those are safe.
> >
> > It is safe for packet path too, _ACCOUNT allocation will not be able to find memcg
> > in case of "!in_task()" context.
> > On the other hand any additional checks on such path will affect performance.
>
> I'm not sure this works with ksoftirqd serving network stack?
>
> > Could you please estimate how often is this code used in the case of nft vs packet path?
>
> It depends on user configuration.
> Update from packet path is used for things like port knocking or other
> dyanamic filter lists, or somehing like Limiting connections to x-per-address/subnet and so on.
>
> > If the opposite is the case, then I can add __GFP_ACCOUNT flag depending on in_task() check.
>
> But what task/memcg is used for the accounting in that case?
Root memcg/no accounting, which is the same.
There is a way to account for a specific memcg in such cases, it's used for
bpf maps, for example. We save a pointer to the memcg which created the map and
charge it for all allocations from a !in_task context. But the performance can
be affected, so let's not do without regression tests and a serious need.
Thanks!
Vasily Averin <[email protected]> wrote:
> > Same problem as connlimit, can be called from packet path.
> > Basically all GFP_ATOMIC are suspicious.
> >
> > Not sure how to resolve this, similar mechanics in iptables world (e.g.
> > connlimit or SET target) don't use memcg accounting.
> >
> > Perhaps for now resend with only the GFP_KERNEL parts converted?
> > Those are safe.
>
> It is safe for packet path too, _ACCOUNT allocation will not be able to find memcg
> in case of "!in_task()" context.
> On the other hand any additional checks on such path will affect performance.
I'm not sure this works with ksoftirqd serving network stack?
> Could you please estimate how often is this code used in the case of nft vs packet path?
It depends on user configuration.
Update from packet path is used for things like port knocking or other
dyanamic filter lists, or somehing like Limiting connections to x-per-address/subnet and so on.
> If the opposite is the case, then I can add __GFP_ACCOUNT flag depending on in_task() check.
But what task/memcg is used for the accounting in that case?
nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
use __GFP_ACCOUNT flag for objects that are dynamically
allocated from the packet path.
Such objects are allocated inside nft_expr_ops->init() callbacks
executed in task context while processing netlink messages.
In addition, this patch adds accounting to nft_set_elem_expr_clone()
used for the same purposes.
Signed-off-by: Vasily Averin <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
---
v2: removed accounting in nft_expr_ops->clone() callbacks
---
net/netfilter/nf_tables_api.c | 2 +-
net/netfilter/nft_connlimit.c | 2 +-
net/netfilter/nft_counter.c | 2 +-
net/netfilter/nft_last.c | 2 +-
net/netfilter/nft_limit.c | 2 +-
net/netfilter/nft_quota.c | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 04be94236a34..e01241151ef7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5447,7 +5447,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
int err, i, k;
for (i = 0; i < set->num_exprs; i++) {
- expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
if (!expr)
goto err_expr;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 3362417ebfdb..f5df535bcbd0 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -77,7 +77,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
if (!priv->list)
return -ENOMEM;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f179e8c3b0ca..7691e42f6bbe 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -62,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 4f745a409d34..97d0d09d48d3 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -30,7 +30,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
u64 last_jiffies;
int err;
- last = kzalloc(sizeof(*last), GFP_KERNEL);
+ last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
if (!last)
return -ENOMEM;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index a726b623963d..c1f7e8bb33e6 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -90,7 +90,7 @@ static int nft_limit_init(struct nft_limit_priv *priv,
priv->rate);
}
- priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL);
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
if (!priv->limit)
return -ENOMEM;
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index f394a0b562f6..0d2f55900f7b 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -90,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
- priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL);
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
if (!priv->consumed)
return -ENOMEM;
--
2.31.1
Vasily Averin <[email protected]> wrote:
> nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
> use __GFP_ACCOUNT flag for objects that are dynamically
> allocated from the packet path.
>
> Such objects are allocated inside .init() or .clone() callbacks
> of struct nft_expr_ops executed in task context while processing
> netlink messages.
They can also be called from packet path.
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 04be94236a34..e01241151ef7 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -5447,7 +5447,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
> int err, i, k;
>
> for (i = 0; i < set->num_exprs; i++) {
> - expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL);
> + expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
> if (!expr)
> goto err_expr;
This is ok.
> diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
> index 3362417ebfdb..9c2146aac59e 100644
> --- a/net/netfilter/nft_connlimit.c
> +++ b/net/netfilter/nft_connlimit.c
> @@ -77,7 +77,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
> invert = true;
> }
>
> - priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL);
> + priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
> if (!priv->list)
> return -ENOMEM;
Same.
> @@ -214,7 +214,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
> struct nft_connlimit *priv_dst = nft_expr_priv(dst);
> struct nft_connlimit *priv_src = nft_expr_priv(src);
>
> - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
> + priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC | __GFP_ACCOUNT);
This can be called from packet path, via nft_dynset.c.
nft_do_chain -> nft_dynset_eval -> nft_dynset_new ->
nft_dynset_expr_setup -> nft_expr_clone -> src->ops->clone()
> diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
> index f179e8c3b0ca..040a697d96b3 100644
> --- a/net/netfilter/nft_counter.c
> +++ b/net/netfilter/nft_counter.c
> @@ -62,7 +62,7 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
> struct nft_counter __percpu *cpu_stats;
> struct nft_counter *this_cpu;
>
> - cpu_stats = alloc_percpu(struct nft_counter);
> + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
This is ok.
> @@ -235,7 +235,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
>
> nft_counter_fetch(priv, &total);
>
> - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
> + cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC | __GFP_ACCOUNT);
> if (cpu_stats == NULL)
> return -ENOMEM;
Same problem as connlimit, can be called from packet path.
Basically all GFP_ATOMIC are suspicious.
Not sure how to resolve this, similar mechanics in iptables world (e.g.
connlimit or SET target) don't use memcg accounting.
Perhaps for now resend with only the GFP_KERNEL parts converted?
Those are safe.
Insertion from packet path is limited by set->size (element count) only
at this time.
On Sat, Apr 02, 2022 at 12:50:37PM +0300, Vasily Averin wrote:
> nft_*.c files whose NFT_EXPR_STATEFUL flag is set on need to
> use __GFP_ACCOUNT flag for objects that are dynamically
> allocated from the packet path.
>
> Such objects are allocated inside nft_expr_ops->init() callbacks
> executed in task context while processing netlink messages.
>
> In addition, this patch adds accounting to nft_set_elem_expr_clone()
> used for the same purposes.
Applied to nf, thanks