From c604cc691c10cb23ce7fb4ea2c9beb703d321790 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Mar 2020 14:13:44 +0100 Subject: [PATCH 01/28] netfilter: nf_tables: move nft_expr_clone() to nf_tables_api.c Move the nft_expr_clone() helper function to the core. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 18 ++++++++++++++++++ net/netfilter/nft_dynset.c | 17 ----------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5d80e09f8148..af2ed70d7eed 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -846,6 +846,7 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f92fb6003745..4f6245e7988e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2557,6 +2557,24 @@ err1: return ERR_PTR(err); } +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +{ + int err; + + if (src->ops->clone) { + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; + } else { + memcpy(dst, src, src->ops->size); + } + + __module_get(src->ops->type->owner); + + return 0; +} + void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { nf_tables_expr_destroy(ctx, expr); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 46ab28ec4b53..d1b64c8de585 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -24,23 +24,6 @@ struct nft_dynset { struct nft_set_binding binding; }; -static int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) -{ - int err; - - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } - - __module_get(src->ops->type->owner); - return 0; -} - static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { From 0c2a85edd143162b3a698f31e94bf8cdc041da87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Mar 2020 14:13:45 +0100 Subject: [PATCH 02/28] netfilter: nf_tables: pass context to nft_set_destroy() The follow-up patch that adds support for stateful expressions in set definitions requires this.
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4f6245e7988e..df046cd97fa7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4126,7 +4126,7 @@ err2: return err; } -static void nft_set_destroy(struct nft_set *set) +static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) return; @@ -4271,7 +4271,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) - nft_set_destroy(set); + nft_set_destroy(ctx, set); } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); @@ -7020,7 +7020,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: nf_tables_set_elem_destroy(&trans->ctx, @@ -7451,7 +7451,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), @@ -8177,7 +8177,7 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; - nft_set_destroy(set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); From 65038428b2c6c5be79d3f78a6b79c0cdc3a58a41 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 17 Mar 2020 14:13:46 +0100 Subject: [PATCH 03/28] netfilter: nf_tables: allow to specify stateful expression in set definition This patch allows users to specify the stateful expression for the elements in this set via NFTA_SET_EXPR. This new feature allows you to turn on counters for all of the elements in this set. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 + include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 60 +++++++++++++++++++----- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index af2ed70d7eed..642bc3ef81aa 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -416,6 +416,7 @@ struct nft_set_type { * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data + * @expr: stateful expression * @ops: set ops * @flags: set flags * @genmask: generation mask @@ -444,6 +445,7 @@ struct nft_set { u16 policy; u16 udlen; unsigned char *udata; + struct nft_expr *expr; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:14, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9c3d2d04d6a1..4e3a5971d4ee 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -342,6 +342,7 @@ enum nft_set_field_attributes { * @NFTA_SET_USERDATA: user data (NLA_BINARY) * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*) * @NFTA_SET_HANDLE: set handle (NLA_U64) + * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -361,6 +362,7 @@ enum nft_set_attributes { NFTA_SET_PAD, NFTA_SET_OBJ_TYPE, NFTA_SET_HANDLE, + NFTA_SET_EXPR, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index df046cd97fa7..f1910cd795fd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3394,6 +3394,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, + [NFTA_SET_EXPR] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -3597,8 +3598,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - struct nlattr *desc; u32 portid = ctx->portid; + struct nlattr *nest; u32 seq = ctx->seq; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); @@ -3654,9 +3655,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; - desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); - - if (desc == NULL) + nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (!nest) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) @@ -3666,7 +3666,15 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nf_tables_fill_set_concat(skb, set)) goto nla_put_failure; - nla_nest_end(skb, desc); + nla_nest_end(skb, nest); + + if (set->expr) { + nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); + if (nf_tables_fill_expr_info(skb, set->expr) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } nlmsg_end(skb, nlh); return 0; @@ -3913,6 +3921,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; + struct nft_expr *expr = NULL; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ 
-4069,13 +4078,21 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); if (!name) { err = -ENOMEM; - goto err2; + goto err_set_name; } err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) - goto err2; + goto err_set_alloc_name; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_alloc_name; + } + } udata = NULL; if (udlen) { @@ -4092,6 +4109,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->dtype = dtype; set->objtype = objtype; set->dlen = desc.dlen; + set->expr = expr; set->flags = flags; set->size = desc.size; set->policy = policy; @@ -4107,21 +4125,24 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = ops->init(set, &desc, nla); if (err < 0) - goto err3; + goto err_set_init; err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err4; + goto err_set_trans; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; -err4: +err_set_trans: ops->destroy(set); -err3: +err_set_init: + if (expr) + nft_expr_destroy(&ctx, expr); +err_set_alloc_name: kfree(set->name); -err2: +err_set_name: kvfree(set); return err; } @@ -4131,6 +4152,9 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) if (WARN_ON(set->use > 0)) return; + if (set->expr) + nft_expr_destroy(ctx, set->expr); + set->ops->destroy(set); kfree(set->name); kvfree(set); @@ -4982,6 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nla[NFTA_SET_ELEM_EXPR]); if (IS_ERR(expr)) return PTR_ERR(expr); + + err = -EOPNOTSUPP; + if (set->expr && set->expr->ops != expr->ops) + goto err_set_elem_expr; + } else if (set->expr) { + expr = kzalloc(set->expr->ops->size, GFP_KERNEL); + if (!expr) + return -ENOMEM; + + err = nft_expr_clone(expr, set->expr); + if (err < 0) + goto err_set_elem_expr; } err = nft_setelem_parse_key(ctx, set, &elem.key.val, From 772f4e82b3ffa1eb7412cd531f718a96a0e5474b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Mar 2020 01:14:58 +0100 Subject: [PATCH 04/28] netfilter: nf_tables: fix double-free on set expression from the error path After copying the expression to the set element extension, release the expression and reset the pointer to avoid a double-free from the error path. Fixes: 409444522976 ("netfilter: nf_tables: add elements with stateful expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f1910cd795fd..29ad33e52dbb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5133,6 +5133,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (expr) { memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); kfree(expr); + expr = NULL; } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); From 475beb9c8de1227ff6ec572f37a05c5c612d94a7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Mar 2020 14:29:45 +0100 Subject: [PATCH 05/28] netfilter: nf_tables: add nft_set_elem_expr_destroy() and use it This patch adds nft_set_elem_expr_destroy() to destroy stateful expressions in set elements. This patch also updates the commit path to call this function to invoke expr->ops->destroy_clone when required. 
This is implicitly fixing up a module reference counter leak and a memory leak in expressions that allocated internal state, e.g. nft_counter. Fixes: 409444522976 ("netfilter: nf_tables: add elements with stateful expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 29ad33e52dbb..c5332a313283 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4882,6 +4882,17 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } +static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) +{ + if (expr->ops->destroy_clone) { + expr->ops->destroy_clone(ctx, expr); + module_put(expr->ops->type->owner); + } else { + nf_tables_expr_destroy(ctx, expr); + } +} + void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) { @@ -4894,16 +4905,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { - struct nft_expr *expr = nft_set_ext_expr(ext); + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); - if (expr->ops->destroy_clone) { - expr->ops->destroy_clone(&ctx, expr); - module_put(expr->ops->type->owner); - } else { - nf_tables_expr_destroy(&ctx, expr); - } - } if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) (*nft_set_ext_obj(ext))->use--; kfree(elem); @@ -4919,7 +4923,8 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, struct nft_set_ext *ext = nft_set_elem_ext(set, elem); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext)); + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); + kfree(elem); } @@ -5182,7 +5187,8 @@ err_element_clash: err_trans: if (obj) obj->use--; - kfree(elem.priv); + + nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&data, desc.type); From dc264f1f7ece2ed11cc9e01039cf4168eb3eb66b Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 19 Mar 2020 12:52:45 +0800 Subject: [PATCH 06/28] netfilter: flowtable: fix NULL pointer dereference in tunnel offload support The tc ct action does not cache the route in the flowtable entry. 
Fixes: 88bf6e4114d5 ("netfilter: flowtable: add tunnel encap/decap action offload support") Fixes: cfab6dbd0ecf ("netfilter: flowtable: add tunnel match offload support") Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index ad549317af30..a68136a8d750 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -92,7 +92,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); - if (other_dst->lwtstate) { + if (other_dst && other_dst->lwtstate) { tun_info = lwt_tun_info(other_dst->lwtstate); nf_flow_rule_lwt_match(match, tun_info); } @@ -483,7 +483,7 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow, struct dst_entry *dst; dst = flow->tuplehash[dir].tuple.dst_cache; - if (dst->lwtstate) { + if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; tun_info = lwt_tun_info(dst->lwtstate); @@ -503,7 +503,7 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, struct dst_entry *dst; dst = flow->tuplehash[!dir].tuple.dst_cache; - if (dst->lwtstate) { + if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; tun_info = lwt_tun_info(dst->lwtstate); From 19f8f717f620576ada10b0ba8f4d55ef1208571e Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 11 Mar 2020 01:09:04 +0000 Subject: [PATCH 07/28] netfilter: ctnetlink: Add missing annotation for ctnetlink_parse_nat_setup() Sparse reports a warning at ctnetlink_parse_nat_setup() warning: context imbalance in ctnetlink_parse_nat_setup() - unexpected unlock The root cause is the missing annotation at ctnetlink_parse_nat_setup() Add the missing __must_hold(RCU) annotation Signed-off-by: Jules Irenge Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6a1c8f1f6171..eb190206cd12 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1533,6 +1533,7 @@ static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) + __must_hold(RCU) { struct nf_nat_hook *nat_hook; int err; From 6b36d4829cbc7c38daee83365cc46e86c33e93fb Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 11 Mar 2020 01:09:05 +0000 Subject: [PATCH 08/28] netfilter: conntrack: Add missing annotations for nf_conntrack_all_lock() and nf_conntrack_all_unlock() Sparse reports warnings at nf_conntrack_all_lock() and nf_conntrack_all_unlock() warning: context imbalance in nf_conntrack_all_lock() - wrong count at exit warning: context imbalance in nf_conntrack_all_unlock() - unexpected unlock Add the missing __acquires(&nf_conntrack_locks_all_lock) Add missing __releases(&nf_conntrack_locks_all_lock) Signed-off-by: Jules Irenge Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a18f8fe728e3..f82d4a802acc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -143,6 +143,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, } static void 
nf_conntrack_all_lock(void) + __acquires(&nf_conntrack_locks_all_lock) { int i; @@ -162,6 +163,7 @@ static void nf_conntrack_all_lock(void) } static void nf_conntrack_all_unlock(void) + __releases(&nf_conntrack_locks_all_lock) { /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() From 73348fed35d023e998cbd303a28400f2c0ec30a3 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sun, 15 Mar 2020 21:25:41 +0800 Subject: [PATCH 09/28] ipvs: optimize tunnel dumps for icmp errors After stripping the GRE/UDP tunnel header for ICMP errors, it's better to show "GRE/UDP" instead of "IPIP" in the debug message. Signed-off-by: Haishuang Yan Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 46 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 512259f579d7..d2ac530a9501 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1661,8 +1661,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; - bool ipip, new_cp = false; + bool tunnel, new_cp = false; union nf_inet_addr *raddr; + char *outer_proto; *related = 1; @@ -1703,8 +1704,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, return NF_ACCEPT; /* The packet looks wrong, ignore */ raddr = (union nf_inet_addr *)&cih->daddr; - /* Special case for errors for IPIP packets */ - ipip = false; + /* Special case for errors for IPIP/UDP/GRE tunnel packets */ + tunnel = false; if (cih->protocol == IPPROTO_IPIP) { struct ip_vs_dest *dest; @@ -1721,7 +1722,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - ipip = true; + tunnel = true; + outer_proto = "IPIP"; } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ /* Error for our tunnel must arrive at LOCAL_IN */ @@ -1729,16 +1731,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, __u8 iproto; int ulen; - /* Non-first fragment has no UDP header */ + /* Non-first fragment has no UDP/GRE header */ if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; offset2 = offset + cih->ihl * 4; - if (cih->protocol == IPPROTO_UDP) + if (cih->protocol == IPPROTO_UDP) { ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); - else + outer_proto = "UDP"; + } else { ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); + outer_proto = "GRE"; + } if (ulen > 0) { /* Skip IP and UDP/GRE tunnel headers */ offset = offset2 + ulen; @@ -1747,7 +1752,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, &_ciph); if (cih && cih->version == 4 && cih->ihl >= 5 && iproto == IPPROTO_IPIP) - ipip = true; + tunnel = true; else return NF_ACCEPT; } @@ -1767,11 +1772,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, "Checking incoming ICMP for"); offset2 = offset; - ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); offset = ciph.len; /* The embedded headers contain source and dest in reverse order. 
- * For IPIP this is error for request, not for reply. + * For IPIP/UDP/GRE tunnel this is error for request, not for reply. */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET, skb, &ciph); @@ -1779,7 +1784,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (!cp) { int v; - if (ipip || !sysctl_schedule_icmp(ipvs)) + if (tunnel || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) @@ -1797,7 +1802,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, goto out; } - if (ipip) { + if (tunnel) { __be32 info = ic->un.gateway; __u8 type = ic->type; __u8 code = ic->code; @@ -1809,17 +1814,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, u32 mtu = ntohs(ic->un.frag.mtu); __be16 frag_off = cih->frag_off; - /* Strip outer IP and ICMP, go to IPIP header */ + /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) - goto ignore_ipip; + goto ignore_tunnel; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); - IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", - &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", + outer_proto, &ip_hdr(skb)->saddr, + &ip_hdr(skb)->daddr, mtu); ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) - goto ignore_ipip; + goto ignore_tunnel; /* Prefer the resulting PMTU */ if (dest) { struct ip_vs_dest_dst *dest_dst; @@ -1832,11 +1838,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, mtu -= sizeof(struct iphdr); info = htonl(mtu); } - /* Strip outer IP, ICMP and IPIP, go to IP header of + /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of * original request. */ if (pskb_pull(skb, offset2) == NULL) - goto ignore_ipip; + goto ignore_tunnel; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1845,7 +1851,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); -ignore_ipip: +ignore_tunnel: consume_skb(skb); verdict = NF_STOLEN; goto out; From 8ac2bd357775b3abf838110833279ea1a3b035e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Mar 2020 12:34:33 +0100 Subject: [PATCH 10/28] netfilter: conntrack: export nf_ct_acct_update() This function allows you to update the conntrack counters. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_acct.h | 2 ++ net/netfilter/nf_conntrack_core.c | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index f7a060c6eb28..df198c51244a 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -65,6 +65,8 @@ static inline void nf_ct_set_acct(struct net *net, bool enable) #endif } +void nf_ct_acct_update(struct nf_conn *ct, u32 dir, unsigned int bytes); + void nf_conntrack_acct_pernet_init(struct net *net); int nf_conntrack_acct_init(void); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f82d4a802acc..7ded6d287f87 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -865,9 +865,7 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -static inline void nf_ct_acct_update(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int len) +void nf_ct_acct_update(struct nf_conn *ct, u32 dir, unsigned int bytes) { struct nf_conn_acct *acct; @@ -875,10 +873,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, if (acct) { struct nf_conn_counter *counter = acct->counter; - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + atomic64_inc(&counter[dir].packets); + atomic64_add(bytes, &counter[dir].bytes); } } +EXPORT_SYMBOL_GPL(nf_ct_acct_update); static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn *loser_ct) @@ -892,7 +891,7 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, /* u32 should be fine since we must have seen one packet. */ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); - nf_ct_acct_update(ct, ctinfo, bytes); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } } @@ -1933,7 +1932,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1941,7 +1940,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb) { - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); return nf_ct_delete(ct, 0, 0); } From cfbd1125fc8778913d0956a757438bbb2c35f031 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Mar 2020 12:23:57 +0100 Subject: [PATCH 11/28] netfilter: nf_tables: add enum nft_flowtable_flags to uapi Expose the NFT_FLOWTABLE_HW_OFFLOAD flag through uapi. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 10 ++++++++++ net/netfilter/nf_tables_api.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f523ea87b6ae..4beb7f13bc50 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -62,7 +62,7 @@ struct nf_flowtable_type { }; enum nf_flowtable_flags { - NF_FLOWTABLE_HW_OFFLOAD = 0x1, + NF_FLOWTABLE_HW_OFFLOAD = 0x1, /* NFT_FLOWTABLE_HW_OFFLOAD */ }; struct nf_flowtable { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4e3a5971d4ee..717ee3aa05d7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1553,6 +1553,16 @@ enum nft_object_attributes { }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) +/** + * enum nft_flowtable_flags - nf_tables flowtable flags + * + * @NFT_FLOWTABLE_HW_OFFLOAD: flowtable hardware offload is enabled + */ +enum nft_flowtable_flags { + NFT_FLOWTABLE_HW_OFFLOAD = 0x1, + NFT_FLOWTABLE_MASK = NFT_FLOWTABLE_HW_OFFLOAD +}; + /** * enum nft_flowtable_attributes - nf_tables flow table netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c5332a313283..ace325218edb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6375,7 +6375,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); - if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) goto err3; } From 53c2b2899af7e6a29c0cf8bfa8a554721398a4b0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 24 Mar 2020 12:50:02 +0100 Subject: [PATCH 12/28] netfilter: flowtable: add counter support Add a new flag to turn on flowtable counters which are stored in the conntrack entry. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/uapi/linux/netfilter/nf_tables.h | 5 ++++- net/netfilter/nf_flow_table_ip.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 4beb7f13bc50..4a2ec6fd9ad2 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -63,6 +63,7 @@ struct nf_flowtable_type { enum nf_flowtable_flags { NF_FLOWTABLE_HW_OFFLOAD = 0x1, /* NFT_FLOWTABLE_HW_OFFLOAD */ + NF_FLOWTABLE_COUNTER = 0x2, /* NFT_FLOWTABLE_COUNTER */ }; struct nf_flowtable { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 717ee3aa05d7..30f2a87270dc 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1557,10 +1557,13 @@ enum nft_object_attributes { * enum nft_flowtable_flags - nf_tables flowtable flags * * @NFT_FLOWTABLE_HW_OFFLOAD: flowtable hardware offload is enabled + * @NFT_FLOWTABLE_COUNTER: enable flow counters */ enum nft_flowtable_flags { NFT_FLOWTABLE_HW_OFFLOAD = 0x1, - NFT_FLOWTABLE_MASK = NFT_FLOWTABLE_HW_OFFLOAD + NFT_FLOWTABLE_COUNTER = 0x2, + NFT_FLOWTABLE_MASK = (NFT_FLOWTABLE_HW_OFFLOAD | + NFT_FLOWTABLE_COUNTER) }; /** diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 5272721080f8..553cc0d5695a 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -12,6 +12,7 @@ #include #include #include +#include /* For layer 4 checksum field offset. */ #include #include @@ -286,6 +287,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, ip_decrease_ttl(iph); skb->tstamp = 0; + if (flow_table->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; @@ -516,6 +520,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, ip6h->hop_limit--; skb->tstamp = 0; + if (flow_table->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; From 133a2fe594dc0eb15a77477a5a05176495190139 Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 24 Mar 2020 07:34:25 +0800 Subject: [PATCH 13/28] netfilter: flowtable: Fix incorrect tc_setup_type type The indirect block setup should use TC_SETUP_FT as the type instead of TC_SETUP_BLOCK. Adjust existing users of the indirect flow block infrastructure. 
Fixes: b5140a36da78 ("netfilter: flowtable: add indr block setup support") Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- include/net/flow_offload.h | 3 ++- net/core/flow_offload.c | 6 +++--- net/netfilter/nf_flow_table_offload.c | 2 +- net/netfilter/nf_tables_offload.c | 2 +- net/sched/cls_api.c | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 1e30b0d44b61..1afb6bd4530d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -520,6 +520,7 @@ void flow_indr_block_cb_unregister(struct net_device *dev, void flow_indr_block_call(struct net_device *dev, struct flow_block_offload *bo, - enum flow_block_command command); + enum flow_block_command command, + enum tc_setup_type type); #endif /* _NET_FLOW_OFFLOAD_H */ diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 7440e6117c81..e951b743bed3 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -511,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); void flow_indr_block_call(struct net_device *dev, struct flow_block_offload *bo, - enum flow_block_command command) + enum flow_block_command command, + enum tc_setup_type type) { struct flow_indr_block_cb *indr_block_cb; struct flow_indr_block_dev *indr_dev; @@ -521,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev, return; list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, - bo); + indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); } EXPORT_SYMBOL_GPL(flow_indr_block_call); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index a68136a8d750..0c6437fab4fe 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -938,7 +938,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - flow_indr_block_call(dev, bo, cmd); + flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); if (list_empty(&bo->cb_list)) return -EOPNOTSUPP; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 2bb28483af22..954bccb7f32a 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -313,7 +313,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - flow_indr_block_call(dev, &bo, cmd); + flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index eefacb3176e3..84f8ee6f2009 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -708,7 +708,7 @@ static void tc_indr_block_call(struct tcf_block *block, }; INIT_LIST_HEAD(&bo.cb_list); - flow_indr_block_call(dev, &bo, command); + flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); tcf_block_setup(block, &bo); } From 0a6a9515fe390976cd762c52d8d4f446d7a14285 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 25 Mar 2020 10:31:42 -0400 Subject: [PATCH 14/28] netfilter: nf_tables: silence a RCU-list warning in nft_table_lookup() It is safe to traverse &net->nft.tables with &net->nft.commit_mutex held using list_for_each_entry_rcu(). Silence the PROVE_RCU_LIST false positive, WARNING: suspicious RCU usage net/netfilter/nf_tables_api.c:523 RCU-list traversed in non-reader section!! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by iptables/1384: #0: ffffffff9745c4a8 (&net->nft.commit_mutex){+.+.}, at: nf_tables_valid_genid+0x25/0x60 [nf_tables] Call Trace: dump_stack+0xa1/0xea lockdep_rcu_suspicious+0x103/0x10d nft_table_lookup.part.0+0x116/0x120 [nf_tables] nf_tables_newtable+0x12c/0x7d0 [nf_tables] nfnetlink_rcv_batch+0x559/0x1190 [nfnetlink] nfnetlink_rcv+0x1da/0x210 [nfnetlink] netlink_unicast+0x306/0x460 netlink_sendmsg+0x44b/0x770 ____sys_sendmsg+0x46b/0x4a0 ___sys_sendmsg+0x138/0x1a0 __sys_sendmsg+0xb6/0x130 __x64_sys_sendmsg+0x48/0x50 do_syscall_64+0x69/0xf4 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Signed-off-by: Qian Cai Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ace325218edb..c1e04ac21392 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -520,7 +520,8 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &net->nft.tables, list, + lockdep_is_held(&net->nft.commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) From 422c032afcf57d5e8109a54912e22ffc53d99068 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Fri, 27 Mar 2020 12:12:29 +0300 Subject: [PATCH 15/28] netfilter: flowtable: Use rw sem as flow block lock Currently flow offload threads are synchronized by the flow block mutex. Use rw lock instead to increase flow insertion (read) concurrency. Signed-off-by: Paul Blakey Reviewed-by: Oz Shlomo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 +- net/netfilter/nf_flow_table_core.c | 11 +++++------ net/netfilter/nf_flow_table_offload.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 4a2ec6fd9ad2..6bf69652f57d 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -74,7 +74,7 @@ struct nf_flowtable { struct delayed_work gc_work; unsigned int flags; struct flow_block flow_block; - struct mutex flow_block_lock; /* Guards flow_block */ + struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9a477bd563b7..9399bb2df295 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -392,7 +392,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, struct flow_block_cb *block_cb; int err = 0; - mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) { err = -EEXIST; @@ -408,7 +408,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, list_add_tail(&block_cb->list, &block->cb_list); unlock: - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); return err; } EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); @@ -419,13 +419,13 @@ void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, struct flow_block *block = &flow_table->flow_block; struct flow_block_cb *block_cb; - 
mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) list_del(&block_cb->list); else WARN_ON(true); - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); @@ -551,7 +551,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); - mutex_init(&flowtable->flow_block_lock); + init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -614,7 +614,6 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); nf_flow_table_offload_flush(flow_table); rhashtable_destroy(&flow_table->rhashtable); - mutex_destroy(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 0c6437fab4fe..b96db831b4ca 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -691,7 +691,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, if (cmd == FLOW_CLS_REPLACE) cls_flow.rule = flow_rule->rule; - mutex_lock(&flowtable->flow_block_lock); + down_read(&flowtable->flow_block_lock); list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); @@ -700,7 +700,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, i++; } - mutex_unlock(&flowtable->flow_block_lock); + up_read(&flowtable->flow_block_lock); if (cmd == FLOW_CLS_STATS) memcpy(stats, &cls_flow.stats, sizeof(*stats)); From 7da182a998d6cf81191f0b0798eae451015a770f Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Fri, 27 Mar 2020 12:12:30 +0300 Subject: [PATCH 16/28] netfilter: flowtable: Use work entry per offload command To allow offload commands to execute in parallel, create workqueue for flow table offload, and use a work entry per offload command. 
Signed-off-by: Paul Blakey Reviewed-by: Oz Shlomo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 46 +++++++++------------------ 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index b96db831b4ca..a9b3b88bd0f1 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -12,9 +12,7 @@ #include #include -static struct work_struct nf_flow_offload_work; -static DEFINE_SPINLOCK(flow_offload_pending_list_lock); -static LIST_HEAD(flow_offload_pending_list); +static struct workqueue_struct *nf_flow_offload_wq; struct flow_offload_work { struct list_head list; @@ -22,6 +20,7 @@ struct flow_offload_work { int priority; struct nf_flowtable *flowtable; struct flow_offload *flow; + struct work_struct work; }; #define NF_FLOW_DISSECTOR(__match, __type, __field) \ @@ -788,15 +787,10 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) static void flow_offload_work_handler(struct work_struct *work) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); + struct flow_offload_work *offload; - spin_lock_bh(&flow_offload_pending_list_lock); - list_replace_init(&flow_offload_pending_list, &offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); - - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - switch (offload->cmd) { + offload = container_of(work, struct flow_offload_work, work); + switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); break; @@ -808,19 +802,14 @@ static void flow_offload_work_handler(struct work_struct *work) break; default: WARN_ON_ONCE(1); - } - list_del(&offload->list); - kfree(offload); } + + kfree(offload); } static void flow_offload_queue_work(struct flow_offload_work *offload) { - spin_lock_bh(&flow_offload_pending_list_lock); - list_add_tail(&offload->list, &flow_offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); - - schedule_work(&nf_flow_offload_work); + queue_work(nf_flow_offload_wq, &offload->work); } static struct flow_offload_work * @@ -837,6 +826,7 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, offload->flow = flow; offload->priority = flowtable->priority; offload->flowtable = flowtable; + INIT_WORK(&offload->work, flow_offload_work_handler); return offload; } @@ -887,7 +877,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) - flush_work(&nf_flow_offload_work); + flush_workqueue(nf_flow_offload_wq); } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, @@ -1052,7 +1042,10 @@ static struct flow_indr_block_entry block_ing_entry = { int nf_flow_table_offload_init(void) { - INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!nf_flow_offload_wq) + return -ENOMEM; flow_indr_add_block_cb(&block_ing_entry); @@ -1061,15 +1054,6 @@ int nf_flow_table_offload_init(void) void nf_flow_table_offload_exit(void) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); - flow_indr_del_block_cb(&block_ing_entry); - - cancel_work_sync(&nf_flow_offload_work); - - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - list_del(&offload->list); - kfree(offload); - } + 
destroy_workqueue(nf_flow_offload_wq); } From dd3cc111f2e3220ddc9c4ab17f13dc97759b5163 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 27 Mar 2020 03:24:46 +0100 Subject: [PATCH 17/28] netfilter: nf_queue: make nf_queue_entry_release_refs static This is a preparation patch, no logical changes. Move free_entry into core and rename it to something more sensible. Will ease followup patches which will complicate the refcount handling. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 10 ++++++++-- net/netfilter/nfnetlink_queue.c | 10 ++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 47088083667b..cdbd98730852 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -35,7 +35,7 @@ void nf_unregister_queue_handler(struct net *net); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); void nf_queue_entry_get_refs(struct nf_queue_entry *entry); -void nf_queue_entry_release_refs(struct nf_queue_entry *entry); +void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index f8f52ff99cfb..4da5776a9904 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -64,7 +64,7 @@ static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb) #endif } -void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -78,7 +78,13 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) nf_queue_entry_release_br_nf_refs(entry->skb); } -EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +void nf_queue_entry_free(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); +} +EXPORT_SYMBOL_GPL(nf_queue_entry_free); static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 76535fd9278c..3243a31f6e82 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -737,12 +737,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif -static void free_entry(struct nf_queue_entry *entry) -{ - nf_queue_entry_release_refs(entry); - kfree(entry); -} - static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) @@ -768,7 +762,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) - free_entry(entry_seg); + nf_queue_entry_free(entry_seg); } return ret; } @@ -827,7 +821,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) if (queued) { if (err) /* some segments are already queued */ - free_entry(entry); + nf_queue_entry_free(entry); kfree_skb(skb); return 0; } From 119e52e664c57d5f7c0174dc2b3a296b1e40591d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 27 Mar 2020 03:24:47 +0100 Subject: [PATCH 18/28] netfilter: nf_queue: place bridge physports into queue_entry struct The refcount is done via entry->skb, which does work fine. 
Major problem: When putting the refcount of the bridge ports, we must always put the references while the skb is still around. However, we will need to put the references after okfn() to avoid a possible 1 -> 0 -> 1 refcount transition, so we cannot use the skb pointer anymore. Place the physports in the queue entry structure instead to allow for refcounting changes in the next patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 5 ++- net/netfilter/nf_queue.c | 53 ++++++++++++++------------------ 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index cdbd98730852..e770bba00066 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -14,7 +14,10 @@ struct nf_queue_entry { struct sk_buff *skb; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ - +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct net_device *physin; + struct net_device *physout; +#endif struct nf_hook_state state; u16 size; /* sizeof(entry) + saved route keys */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 4da5776a9904..96eb72908467 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,24 +46,6 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); -static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - if (nf_bridge) { - struct net_device *physdev; - - physdev = nf_bridge_get_physindev(skb); - if (physdev) - dev_put(physdev); - physdev = nf_bridge_get_physoutdev(skb); - if (physdev) - dev_put(physdev); - } -#endif -} - static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -76,7 +58,12 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) if (state->sk) sock_put(state->sk); - nf_queue_entry_release_br_nf_refs(entry->skb); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->physin) + dev_put(entry->physin); + if (entry->physout) + dev_put(entry->physout); +#endif } void nf_queue_entry_free(struct nf_queue_entry *entry) @@ -86,20 +73,19 @@ void nf_queue_entry_free(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_free); -static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb) +static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + const struct sk_buff *skb = entry->skb; + struct nf_bridge_info *nf_bridge; + nf_bridge = nf_bridge_info_get(skb); if (nf_bridge) { - struct net_device *physdev; - - physdev = nf_bridge_get_physindev(skb); - if (physdev) - dev_hold(physdev); - physdev = nf_bridge_get_physoutdev(skb); - if (physdev) - dev_hold(physdev); + entry->physin = nf_bridge_get_physindev(skb); + entry->physout = nf_bridge_get_physoutdev(skb); + } else { + entry->physin = NULL; + entry->physout = NULL; } #endif } @@ -116,7 +102,12 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk) sock_hold(state->sk); - nf_queue_entry_get_br_nf_refs(entry->skb); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->physin) + dev_hold(entry->physin); + if (entry->physout) + dev_hold(entry->physout); +#endif } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -207,6 +198,8 @@ static int 
__nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, .size = sizeof(*entry) + route_key_size, }; + __nf_queue_entry_init_physdevs(entry); + nf_queue_entry_get_refs(entry); switch (entry->state.pf) { From af370ab36fcd19f04e3408c402608e7e56e6f188 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 27 Mar 2020 03:24:48 +0100 Subject: [PATCH 19/28] netfilter: nf_queue: do not release refcounts until nf_reinject is done nf_queue is problematic when another NF_QUEUE invocation happens from nf_reinject(). 1. nf_queue is invoked, increments state->sk refcount. 2. skb is queued, waiting for verdict. 3. sk is closed/released. 4. verdict comes back, nf_reinject is called. 5. nf_reinject drops the reference -- the refcount can now drop to 0. Instead of the get_ref/release_ref pattern, we need to nest the get_ref calls: get_ref get_ref release_ref release_ref So that when we invoke the next processing stage (another netfilter or the okfn()), we hold at least one reference count on the devices/socket. After the previous patch, it is now safe to put the entry even after okfn() has potentially free'd the skb. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 96eb72908467..aadccdd117f0 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -303,12 +303,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) hooks = nf_hook_entries_head(net, pf, entry->state.hook); - nf_queue_entry_release_refs(entry); - i = entry->hook_index; if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { kfree_skb(skb); - kfree(entry); + nf_queue_entry_free(entry); return; } @@ -347,6 +345,6 @@ next_hook: kfree_skb(skb); } - kfree(entry); + nf_queue_entry_free(entry); } EXPORT_SYMBOL(nf_reinject); From 28f715b9e6dd7cbf07c2aea913fea7c87a56a3b5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 27 Mar 2020 03:24:49 +0100 Subject: [PATCH 20/28] netfilter: nf_queue: prefer nf_queue_entry_free Instead of dropping refs+kfree, use the helper added in the previous patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_queue.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index aadccdd117f0..bbd1209694b8 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -155,18 +155,16 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, unsigned int index, unsigned int queuenum) { - int status = -ENOENT; struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; struct net *net = state->net; unsigned int route_key_size; + int status; /* QUEUE == DROP if no one is waiting, to be safe. 
*/ qh = rcu_dereference(net->nf.queue_handler); - if (!qh) { - status = -ESRCH; - goto err; - } + if (!qh) + return -ESRCH; switch (state->pf) { case AF_INET: @@ -181,14 +179,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, } entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); - if (!entry) { - status = -ENOMEM; - goto err; - } + if (!entry) + return -ENOMEM; if (skb_dst(skb) && !skb_dst_force(skb)) { - status = -ENETDOWN; - goto err; + kfree(entry); + return -ENETDOWN; } *entry = (struct nf_queue_entry) { @@ -212,17 +208,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, } status = qh->outfn(entry, queuenum); - if (status < 0) { - nf_queue_entry_release_refs(entry); - goto err; + nf_queue_entry_free(entry); + return status; } return 0; - -err: - kfree(entry); - return status; } /* Packets leaving via this function must come back through nf_reinject(). */ From 7c6b4121627aeaa79536fbb900feafec740410d3 Mon Sep 17 00:00:00 2001 From: Romain Bellan Date: Fri, 27 Mar 2020 09:26:32 +0100 Subject: [PATCH 21/28] netfilter: ctnetlink: be more strict when NF_CONNTRACK_MARK is not set When CONFIG_NF_CONNTRACK_MARK is not set, any CTA_MARK or CTA_MARK_MASK attribute in a netlink message is not supported. We should return an error when either of them is set, not only when both are set. Fixes: 9306425b70bf ("netfilter: ctnetlink: must check mark attributes vs NULL") Signed-off-by: Romain Bellan Signed-off-by: Florent Fourcot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eb190206cd12..9ddfcd002d3b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -860,7 +860,7 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) struct ctnetlink_filter *filter; #ifndef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) + if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) return ERR_PTR(-EOPNOTSUPP); #endif From 24791b9aa1ab09818617ff384876930e09ada0a3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 27 Mar 2020 17:43:04 +0100 Subject: [PATCH 22/28] netfilter: nft_set_bitmap: initialize set element extension in lookups Otherwise, nft_lookup might dereference an uninitialized pointer to the element extension. Fixes: 665153ff5752 ("netfilter: nf_tables: add bitmap set type") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 1cb2e67e6e03..6829a497b4cc 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -81,6 +81,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, u32 idx, off; nft_bitmap_location(set, key, &idx, &off); + *ext = NULL; return nft_bitmap_active(priv->bitmap, idx, off, genmask); } From 8548bde9890fd0316f49d3c6937573c18261f68f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 27 Mar 2020 17:43:05 +0100 Subject: [PATCH 23/28] netfilter: nft_dynset: validate set expression definition If the global set expression definition does not match the dynset expression, then bail out. 
From 8548bde9890fd0316f49d3c6937573c18261f68f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 27 Mar 2020 17:43:05 +0100
Subject: [PATCH 23/28] netfilter: nft_dynset: validate set expression definition

If the set definition's global stateful expression mismatches the dynset
expression, then bail out.

Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nft_dynset.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index d1b64c8de585..64ca13a1885b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -187,6 +187,11 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 					       tb[NFTA_DYNSET_EXPR]);
 		if (IS_ERR(priv->expr))
 			return PTR_ERR(priv->expr);
+
+		if (set->expr && set->expr->ops != priv->expr->ops) {
+			err = -EOPNOTSUPP;
+			goto err_expr_free;
+		}
 	}
 
 	nft_set_ext_prepare(&priv->tmpl);
@@ -205,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 
 	err = nf_tables_bind_set(ctx, set, &priv->binding);
 	if (err < 0)
-		goto err1;
+		goto err_expr_free;
 
 	if (set->size == 0)
 		set->size = 0xffff;
@@ -213,7 +218,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	priv->set = set;
 	return 0;
 
-err1:
+err_expr_free:
 	if (priv->expr != NULL)
 		nft_expr_destroy(ctx, priv->expr);
 	return err;

From d56aab2625f7bee79686566d3f6ad639d694b8cd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 27 Mar 2020 17:43:06 +0100
Subject: [PATCH 24/28] netfilter: nf_tables: skip set types that do not support expressions

The bitmap set type does not support expressions, so skip it in the
estimation step.

Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_tables.h | 2 ++
 net/netfilter/nf_tables_api.c     | 3 +++
 net/netfilter/nft_set_bitmap.c    | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 642bc3ef81aa..6eb627b3c99b 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -266,6 +266,7 @@ struct nft_set_iter {
  *	@size: number of set elements
  *	@field_len: length of each field in concatenation, bytes
  *	@field_count: number of concatenated fields in element
+ *	@expr: set must support expressions
  */
 struct nft_set_desc {
 	unsigned int		klen;
@@ -273,6 +274,7 @@ struct nft_set_desc {
 	unsigned int		size;
 	u8			field_len[NFT_REG32_COUNT];
 	u8			field_count;
+	bool			expr;
 };
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c1e04ac21392..8a73adfab7ff 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4032,6 +4032,9 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 			return err;
 	}
 
+	if (nla[NFTA_SET_EXPR])
+		desc.expr = true;
+
 	table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
 	if (IS_ERR(table)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 6829a497b4cc..32f0fc8be3a4 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -286,6 +286,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
 	/* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
 	if (desc->klen > 2)
 		return false;
+	else if (desc->expr)
+		return false;
 
 	est->size   = nft_bitmap_total_size(desc->klen);
 	est->lookup = NFT_SET_CLASS_O_1;
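The new desc->expr flag feeds the mechanism that already picks a set backend:
each backend exposes an estimate callback that sees the requested properties
and may decline, and the core then settles on one of the remaining candidates.
A compact standalone C sketch of that selection pattern, with invented backend
names and without the size/performance weighting the kernel also applies:

  /* Capability-based backend selection, userspace illustration only. */
  #include <stdbool.h>
  #include <stdio.h>

  struct set_desc {
          unsigned int klen;      /* key length in bytes */
          bool expr;              /* elements carry a stateful expression */
  };

  struct backend {
          const char *name;
          bool (*estimate)(const struct set_desc *desc);
  };

  /* A bitmap-like backend: small keys only, no per-element expressions. */
  static bool bitmap_estimate(const struct set_desc *desc)
  {
          if (desc->klen > 2)
                  return false;
          if (desc->expr)
                  return false;
          return true;
  }

  /* A hash-like backend: no such restrictions. */
  static bool hash_estimate(const struct set_desc *desc)
  {
          (void)desc;
          return true;
  }

  static const struct backend backends[] = {
          { "bitmap", bitmap_estimate },
          { "hash",   hash_estimate },
  };

  static const struct backend *select_backend(const struct set_desc *desc)
  {
          for (unsigned int i = 0; i < sizeof(backends) / sizeof(backends[0]); i++)
                  if (backends[i].estimate(desc))
                          return &backends[i];
          return NULL;
  }

  int main(void)
  {
          struct set_desc with_expr = { .klen = 2, .expr = true };
          struct set_desc plain     = { .klen = 2, .expr = false };

          printf("plain set     -> %s\n", select_backend(&plain)->name);
          printf("set with expr -> %s\n", select_backend(&with_expr)->name);
          return 0;
  }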
From 9312eabab4a68348af5b4482cc7cc6f151ff1c3f Mon Sep 17 00:00:00 2001
From: wenxu
Date: Sat, 28 Mar 2020 08:57:53 +0800
Subject: [PATCH 25/28] netfilter: conntrack: add nf_ct_acct_add()

Add the nf_ct_acct_add() function to update the conntrack counters with
a given number of packets and bytes.

Signed-off-by: wenxu
Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_conntrack_acct.h | 11 ++++++++++-
 net/netfilter/nf_conntrack_core.c         |  7 ++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h
index df198c51244a..7f44a771530e 100644
--- a/include/net/netfilter/nf_conntrack_acct.h
+++ b/include/net/netfilter/nf_conntrack_acct.h
@@ -65,7 +65,16 @@ static inline void nf_ct_set_acct(struct net *net, bool enable)
 #endif
 }
 
-void nf_ct_acct_update(struct nf_conn *ct, u32 dir, unsigned int bytes);
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+		    unsigned int bytes);
+
+static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir,
+				     unsigned int bytes)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	nf_ct_acct_add(ct, dir, 1, bytes);
+#endif
+}
 
 void nf_conntrack_acct_pernet_init(struct net *net);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7ded6d287f87..c4582eb71766 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -865,7 +865,8 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 
-void nf_ct_acct_update(struct nf_conn *ct, u32 dir, unsigned int bytes)
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+		    unsigned int bytes)
 {
 	struct nf_conn_acct *acct;
 
@@ -873,11 +874,11 @@ void nf_ct_acct_update(struct nf_conn *ct, u32 dir, unsigned int bytes)
 	if (acct) {
 		struct nf_conn_counter *counter = acct->counter;
 
-		atomic64_inc(&counter[dir].packets);
+		atomic64_add(packets, &counter[dir].packets);
 		atomic64_add(bytes, &counter[dir].bytes);
 	}
 }
-EXPORT_SYMBOL_GPL(nf_ct_acct_update);
+EXPORT_SYMBOL_GPL(nf_ct_acct_add);
 
 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			     const struct nf_conn *loser_ct)
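The helper above generalizes the old nf_ct_acct_update(): instead of always
accounting exactly one packet, callers can fold in a whole delta of packets
and bytes, which is what batched or hardware-reported statistics need. A
userspace analogue using C11 atomics, with invented names rather than the
kernel's conntrack types:

  /* Per-direction packet/byte accounting, userspace illustration only. */
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  enum { DIR_ORIGINAL, DIR_REPLY, DIR_MAX };

  struct counter {
          atomic_uint_fast64_t packets;
          atomic_uint_fast64_t bytes;
  };

  static struct counter counters[DIR_MAX];

  /* Batch update: add a delta of packets and bytes in one go. */
  static void acct_add(int dir, uint64_t packets, uint64_t bytes)
  {
          atomic_fetch_add(&counters[dir].packets, packets);
          atomic_fetch_add(&counters[dir].bytes, bytes);
  }

  /* The old single-packet helper becomes a thin wrapper. */
  static void acct_update(int dir, uint64_t bytes)
  {
          acct_add(dir, 1, bytes);
  }

  int main(void)
  {
          acct_update(DIR_ORIGINAL, 1500);        /* one packet seen in software */
          acct_add(DIR_REPLY, 1000, 64000);       /* a hardware stats delta */

          printf("orig:  %ju pkts %ju bytes\n",
                 (uintmax_t)atomic_load(&counters[DIR_ORIGINAL].packets),
                 (uintmax_t)atomic_load(&counters[DIR_ORIGINAL].bytes));
          printf("reply: %ju pkts %ju bytes\n",
                 (uintmax_t)atomic_load(&counters[DIR_REPLY].packets),
                 (uintmax_t)atomic_load(&counters[DIR_REPLY].bytes));
          return 0;
  }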
From ef803b3cf96a26e2a601755b237585a23e6bc30c Mon Sep 17 00:00:00 2001
From: wenxu
Date: Sat, 28 Mar 2020 08:57:54 +0800
Subject: [PATCH 26/28] netfilter: flowtable: add counter support in HW offload

Update the conntrack entry's counters with the packet and byte statistics
reported by the HW flowtable offload.

Signed-off-by: wenxu
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_flow_table_offload.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index a9b3b88bd0f1..d9547c3d1b85 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -9,6 +9,7 @@
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 
@@ -783,6 +784,17 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
 	lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
 	offload->flow->timeout = max_t(u64, offload->flow->timeout,
 				       lastused + NF_FLOW_TIMEOUT);
+
+	if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
+		if (stats[0].pkts)
+			nf_ct_acct_add(offload->flow->ct,
+				       FLOW_OFFLOAD_DIR_ORIGINAL,
+				       stats[0].pkts, stats[0].bytes);
+		if (stats[1].pkts)
+			nf_ct_acct_add(offload->flow->ct,
+				       FLOW_OFFLOAD_DIR_REPLY,
+				       stats[1].pkts, stats[1].bytes);
+	}
 }
 
 static void flow_offload_work_handler(struct work_struct *work)

From 2e34328b396a69b73661ba38d47d92b7cf21c2c4 Mon Sep 17 00:00:00 2001
From: Sergey Marinkevich
Date: Sun, 29 Mar 2020 19:19:14 +0700
Subject: [PATCH 27/28] netfilter: nft_exthdr: fix endianness of tcp option cast
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I hit a problem on MIPS with big-endian enabled: every time NF tried to
change the TCP MSS, it returned early because new.v16 was greater than
old.v16. But the real MSS was 1460 and my rule was:

  add rule table chain tcp option maxseg size set 1400

and 1400 is less than 1460, not greater. I later found that the main
cause is the cast from u32 to __be16.

Debugging: take MSS = 1400 (hex: 0x578). Here is the representation of
each byte as it sits in memory, with addresses increasing from left to
right (e.g. [0x0 0x1 0x2 0x3]); LE is a little-endian system, BE a
big-endian one, and the left column is the type:

                           LE               BE
  u32:                [78 05 00 00]    [00 00 05 78]

As the table shows, casting the u32 to u16 picks the value from a
different half of the 4-byte range depending on endianness. nf_tables
stores data of various sizes in its registers; the TCP MSS occupies
only 2 bytes, but the registers are still declared as u32:

  struct nft_regs {
          union {
                  u32                     data[20];
                  struct nft_verdict      verdict;
          };
  };

So an access like regs->data[priv->sreg] is exactly a u32. According to
the table above, the per-byte representation of the TCP MSS stored in a
register is:

                           LE               BE
  (u32)regs->data[]:  [78 05 00 00]    [05 78 00 00]
                       ^^ ^^            ^^ ^^

The register uses only half of the u32, and the other two bytes may hold
unrelated data. But nft_exthdr_tcp_set_eval() casts it as a plain
u32 -> __be16 conversion:

  new.v16 = src

A u32 does not fit in a __be16, so only the two low-order bytes are
kept. For clarity, one more table (<> marks the bytes the cast picks up):

                           LE                 BE
  u32:                [<78 05> 00 00]    [00 00 <05 78>]
  (u32)regs->data[]:  [<78 05> 00 00]    [05 78 <00 00>]

For little-endian nothing changes, but on big-endian we take the wrong
half. In my case those bytes held other data instead of zeros, so the
new MSS came out wrongly greater.

To fix this I reused the solution already applied for port ranges: load
the value with nft_reg_load16(). The patch does not affect little-endian
systems.

Signed-off-by: Sergey Marinkevich
Acked-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nft_exthdr.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a5e8469859e3..07782836fad6 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -228,7 +228,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
 	unsigned int i, optl, tcphdr_len, offset;
 	struct tcphdr *tcph;
 	u8 *opt;
-	u32 src;
 
 	tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
 	if (!tcph)
@@ -237,7 +236,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
 	opt = (u8 *)tcph;
 	for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
 		union {
-			u8 octet;
 			__be16 v16;
 			__be32 v32;
 		} old, new;
@@ -259,13 +257,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
 		if (!tcph)
 			return;
 
-		src = regs->data[priv->sreg];
 		offset = i + priv->offset;
 
 		switch (priv->len) {
 		case 2:
 			old.v16 = get_unaligned((u16 *)(opt + offset));
-			new.v16 = src;
+			new.v16 = (__force __be16)nft_reg_load16(
+				&regs->data[priv->sreg]);
 
 			switch (priv->type) {
 			case TCPOPT_MSS:
@@ -283,7 +281,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
 					  old.v16, new.v16, false);
 			break;
 		case 4:
-			new.v32 = src;
+			new.v32 = regs->data[priv->sreg];
 			old.v32 = get_unaligned((u32 *)(opt + offset));
 
 			if (old.v32 == new.v32)
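The register problem described above can be reproduced in plain userspace C.
The sketch below stores a 16-bit value in the first two bytes of a 32-bit slot,
roughly what the register store does, and then reads it back both ways. All
names are invented, and memcpy() only mimics the 16-bit register load performed
by nft_reg_load16():

  /* Endianness of a 16-bit value kept in a 32-bit slot, illustration only. */
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          uint32_t reg;
          uint16_t mss = 1400;            /* 0x0578, in CPU byte order */
          uint16_t by_cast, by_load16;

          /* Simulate leftover data in the unused half of the register. */
          memset(&reg, 0xee, sizeof(reg));
          /* 16-bit store into the start of the register slot. */
          memcpy(&reg, &mss, sizeof(mss));

          /* Buggy read: truncating the 32-bit value keeps its low 16 bits,
           * which on big-endian are the two *unused* bytes of the slot. */
          by_cast = (uint16_t)reg;

          /* Correct read: a 16-bit load from the start of the slot. */
          memcpy(&by_load16, &reg, sizeof(by_load16));

          printf("stored MSS      : %u\n", mss);
          printf("u32 -> u16 cast : %u\n", by_cast);
          printf("16-bit load     : %u\n", by_load16);
          /* On little-endian both reads print 1400; on big-endian the cast
           * returns 0xeeee (61166) while the 16-bit load stays correct. */
          return 0;
  }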
From e19680f8347ec0e335ae90801fbe42d85d7b385a Mon Sep 17 00:00:00 2001
From: Haishuang Yan
Date: Mon, 30 Mar 2020 11:20:15 +0800
Subject: [PATCH 28/28] ipvs: fix uninitialized variable warning

If outer_proto is not set, GCC warns as follows:

In file included from net/netfilter/ipvs/ip_vs_core.c:52:
net/netfilter/ipvs/ip_vs_core.c: In function 'ip_vs_in_icmp':
include/net/ip_vs.h:233:4: warning: 'outer_proto' may be used uninitialized in this function [-Wmaybe-uninitialized]
  233 |    printk(KERN_DEBUG pr_fmt(msg), ##__VA_ARGS__); \
      |    ^~~~~~
net/netfilter/ipvs/ip_vs_core.c:1666:8: note: 'outer_proto' was declared here
 1666 |  char *outer_proto;
      |        ^~~~~~~~~~~

Fixes: 73348fed35d0 ("ipvs: optimize tunnel dumps for icmp errors")
Signed-off-by: Haishuang Yan
Acked-by: Julian Anastasov
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/ipvs/ip_vs_core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index d2ac530a9501..aa6a603a2425 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1663,7 +1663,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
 	unsigned int offset, offset2, ihl, verdict;
 	bool tunnel, new_cp = false;
 	union nf_inet_addr *raddr;
-	char *outer_proto;
+	char *outer_proto = "IPIP";
 
 	*related = 1;
 
@@ -1723,7 +1723,6 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
 		if (cih == NULL)
 			return NF_ACCEPT; /* The packet looks wrong, ignore */
 		tunnel = true;
-		outer_proto = "IPIP";
 	} else if ((cih->protocol == IPPROTO_UDP ||	/* Can be UDP encap */
 		    cih->protocol == IPPROTO_GRE) &&	/* Can be GRE encap */
 		   /* Error for our tunnel must arrive at LOCAL_IN */