From 1e8e3f449b1e73b73a843257635b9c50f0cc0f0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Apr 2026 23:15:32 +0200 Subject: [PATCH 01/79] netfilter: arp_tables: fix IEEE1394 ARP payload parsing Weiming Shi says: "arp_packet_match() unconditionally parses the ARP payload assuming two hardware addresses are present (source and target). However, IPv4-over-IEEE1394 ARP (RFC 2734) omits the target hardware address field, and arp_hdr_len() already accounts for this by returning a shorter length for ARPHRD_IEEE1394 devices. As a result, on IEEE1394 interfaces arp_packet_match() advances past a nonexistent target hardware address and reads the wrong bytes for both the target device address comparison and the target IP address. This causes arptables rules to match against garbage data, leading to incorrect filtering decisions: packets that should be accepted may be dropped and vice versa. The ARP stack in net/ipv4/arp.c (arp_create and arp_process) already handles this correctly by skipping the target hardware address for ARPHRD_IEEE1394. Apply the same pattern to arp_packet_match()." Mangle the original patch to always return 0 (no match) in case user matches on the target hardware address which is never present in IEEE1394. Note that this returns 0 (no match) for either normal and inverse match because matching in the target hardware address in ARPHRD_IEEE1394 has never been supported by arptables. This is intentional, matching on the target hardware address should never evaluate true for ARPHRD_IEEE1394. Moreover, adjust arpt_mangle to drop the packet too as AI suggests: In arpt_mangle, the logic assumes a standard ARP layout. Because IEEE1394 (FireWire) omits the target hardware address, the linear pointer arithmetic miscalculates the offset for the target IP address. This causes mangling operations to write to the wrong location, leading to packet corruption. To ensure safety, this patch drops packets (NF_DROP) when mangling is requested for these fields on IEEE1394 devices, as the current implementation cannot correctly map the FireWire ARP payload. This omits both mangling target hardware and IP address. Even if IP address mangling should be possible in IEEE1394, this would require to adjust arpt_mangle offset calculation, which has never been supported. Based on patch from Weiming Shi . Fixes: 6752c8db8e0c ("firewire net, ipv4 arp: Extend hardware address and remove driver-level packet inspection.") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 18 +++++++++++++++--- net/ipv4/netfilter/arpt_mangle.c | 8 ++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 1cdd9c28ab2d..97ead883e4a1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -110,13 +110,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr, arpptr += dev->addr_len; memcpy(&src_ipaddr, arpptr, sizeof(u32)); arpptr += sizeof(u32); - tgt_devaddr = arpptr; - arpptr += dev->addr_len; + + if (IS_ENABLED(CONFIG_FIREWIRE_NET) && dev->type == ARPHRD_IEEE1394) { + if (unlikely(memchr_inv(arpinfo->tgt_devaddr.mask, 0, + sizeof(arpinfo->tgt_devaddr.mask)))) + return 0; + + tgt_devaddr = NULL; + } else { + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + } memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, - dev->addr_len)) || + dev->addr_len))) + return 0; + + if (tgt_devaddr && NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len))) diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index a4e07e5e9c11..f65dd339208e 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -40,6 +40,10 @@ target(struct sk_buff *skb, const struct xt_action_param *par) } arpptr += pln; if (mangle->flags & ARPT_MANGLE_TDEV) { + if (unlikely(IS_ENABLED(CONFIG_FIREWIRE_NET) && + skb->dev->type == ARPHRD_IEEE1394)) + return NF_DROP; + if (ARPT_DEV_ADDR_LEN_MAX < hln || (arpptr + hln > skb_tail_pointer(skb))) return NF_DROP; @@ -47,6 +51,10 @@ target(struct sk_buff *skb, const struct xt_action_param *par) } arpptr += hln; if (mangle->flags & ARPT_MANGLE_TIP) { + if (unlikely(IS_ENABLED(CONFIG_FIREWIRE_NET) && + skb->dev->type == ARPHRD_IEEE1394)) + return NF_DROP; + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || (arpptr + pln > skb_tail_pointer(skb))) return NF_DROP; From f3224ee463f8f6f6ced7dcdf6081add4f8128527 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 16 Apr 2026 15:14:51 +0200 Subject: [PATCH 02/79] netfilter: nf_tables: use list_del_rcu for netlink hooks nft_netdev_unregister_hooks and __nft_unregister_flowtable_net_hooks need to use list_del_rcu(), this list can be walked by concurrent dumpers. Add a new helper and use it consistently. Fixes: f9a43007d3f7 ("netfilter: nf_tables: double hook unregistration in netns path") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 44 ++++++++++++++--------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8537b94653d3..07e151245765 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -374,6 +374,12 @@ static void nft_netdev_hook_free_rcu(struct nft_hook *hook) call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); } +static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook) +{ + list_del_rcu(&hook->list); + nft_netdev_hook_free_rcu(hook); +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) @@ -384,10 +390,8 @@ static void nft_netdev_unregister_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) nf_unregister_net_hook(net, ops); - if (release_netdev) { - list_del(&hook->list); - nft_netdev_hook_free_rcu(hook); - } + if (release_netdev) + nft_netdev_hook_unlink_free_rcu(hook); } } @@ -2271,10 +2275,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain) if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, - &basechain->hook_list, list) { - list_del_rcu(&hook->list); - nft_netdev_hook_free_rcu(hook); - } + &basechain->hook_list, list) + nft_netdev_hook_unlink_free_rcu(hook); } module_put(basechain->type->owner); if (rcu_access_pointer(basechain->stats)) { @@ -2974,6 +2976,7 @@ err_hooks: list_for_each_entry(ops, &h->ops_list, list) nf_unregister_net_hook(ctx->net, ops); } + /* hook.list is on stack, no need for list_del_rcu() */ list_del(&h->list); nft_netdev_hook_free_rcu(h); } @@ -8852,10 +8855,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) nft_unregister_flowtable_ops(net, flowtable, ops); - if (release_netdev) { - list_del(&hook->list); - nft_netdev_hook_free_rcu(hook); - } + if (release_netdev) + nft_netdev_hook_unlink_free_rcu(hook); } } @@ -8926,8 +8927,7 @@ err_unregister_net_hooks: nft_unregister_flowtable_ops(net, flowtable, ops); } - list_del_rcu(&hook->list); - nft_netdev_hook_free_rcu(hook); + nft_netdev_hook_unlink_free_rcu(hook); } return err; @@ -8937,10 +8937,8 @@ static void nft_hooks_destroy(struct list_head *hook_list) { struct nft_hook *hook, *next; - list_for_each_entry_safe(hook, next, hook_list, list) { - list_del_rcu(&hook->list); - nft_netdev_hook_free_rcu(hook); - } + list_for_each_entry_safe(hook, next, hook_list, list) + nft_netdev_hook_unlink_free_rcu(hook); } static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, @@ -9028,8 +9026,7 @@ err_flowtable_update_hook: nft_unregister_flowtable_ops(ctx->net, flowtable, ops); } - list_del_rcu(&hook->list); - nft_netdev_hook_free_rcu(hook); + nft_netdev_hook_unlink_free_rcu(hook); } return err; @@ -9535,13 +9532,8 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - struct nft_hook *hook, *next; - flowtable->data.type->free(&flowtable->data); - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - list_del_rcu(&hook->list); - nft_netdev_hook_free_rcu(hook); - } + nft_hooks_destroy(&flowtable->hook_list); kfree(flowtable->name); module_put(flowtable->data.type->owner); kfree(flowtable); From f902877b635551513729bdf9a8d1422c4aab7741 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:02 +0200 Subject: [PATCH 03/79] rculist: add list_splice_rcu() for private lists This patch adds a helper function, list_splice_rcu(), to safely splice a private (non-RCU-protected) list into an RCU-protected list. The function ensures that only the pointer visible to RCU readers (prev->next) is updated using rcu_assign_pointer(), while the rest of the list manipulations are performed with regular assignments, as the source list is private and not visible to concurrent RCU readers. This is useful for moving elements from a private list into a global RCU-protected list, ensuring safe publication for RCU readers. Subsystems with some sort of batching mechanism from userspace can benefit from this new function. The function __list_splice_rcu() has been added for clarity and to follow the same pattern as in the existing list_splice*() interfaces, where there is a check to ensure that the list to splice is not empty. Note that __list_splice_rcu() has no documentation for this reason. Reviewed-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2abba7552605..e3bc44225692 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old, old->prev = LIST_POISON2; } +static inline void __list_splice_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + last->next = next; + first->prev = prev; + next->prev = last; + rcu_assign_pointer(list_next_rcu(prev), first); +} + +/** + * list_splice_rcu - splice a non-RCU list into an RCU-protected list, + * designed for stacks. + * @list: the non RCU-protected list to splice + * @head: the place in the existing RCU-protected list to splice + * + * The list pointed to by @head can be RCU-read traversed concurrently with + * this function. + */ +static inline void list_splice_rcu(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_rcu(list, head, head->next); +} + /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice From a6134e62dba2ea4f760b29d5226907f447c92400 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:14 +0200 Subject: [PATCH 04/79] netfilter: nf_tables: join hook list via splice_list_rcu() in commit phase Publish new hooks in the list into the basechain/flowtable using splice_list_rcu() to ensure netlink dump list traversal via rcu is safe while concurrent ruleset update is going on. Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable") Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 07e151245765..ae10116af923 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10838,8 +10838,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_chain_commit_update(nft_trans_container_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, &nft_trans_chain_hooks(trans)); - list_splice(&nft_trans_chain_hooks(trans), - &nft_trans_basechain(trans)->hook_list); + list_splice_rcu(&nft_trans_chain_hooks(trans), + &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); @@ -10968,8 +10968,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), NFT_MSG_NEWFLOWTABLE); - list_splice(&nft_trans_flowtable_hooks(trans), - &nft_trans_flowtable(trans)->hook_list); + list_splice_rcu(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); } else { nft_clear(net, nft_trans_flowtable(trans)); nf_tables_flowtable_notify(&ctx, From 10f79dbd7719d1da9f5884d13060322d8729f091 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 22:58:23 +0200 Subject: [PATCH 05/79] netfilter: nf_tables: add hook transactions for device deletions Restore the flag that indicates that the hook is going away, ie. NFT_HOOK_REMOVE, but add a new transaction object to track deletion of hooks without altering the basechain/flowtable hook_list during the preparation phase. The existing approach that moves the hook from the basechain/flowtable hook_list to transaction hook_list breaks netlink dump path readers of this RCU-protected list. It should be possible use an array for nft_trans_hook to store the deleted hooks to compact the representation but I am not expecting many hook object, specially now that wildcard support for devices is in place. Note that the nft_trans_chain_hooks() list contains a list of struct nft_trans_hook objects for DELCHAIN and DELFLOWTABLE commands, while this list stores struct nft_hook objects for NEWCHAIN and NEWFLOWTABLE. Note that new commands can be updated to use nft_trans_hook for consistency. This patch also adapts the event notification path to deal with the list of hook transactions. Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Fixes: b6d9014a3335 ("netfilter: nf_tables: delete flowtable hooks via transaction list") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++ net/netfilter/nf_tables_api.c | 264 +++++++++++++++++++++++------- 2 files changed, 217 insertions(+), 60 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2c0173d9309c..cff7b773e972 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1204,12 +1204,15 @@ struct nft_stats { struct u64_stats_sync syncp; }; +#define NFT_HOOK_REMOVE (1 << 0) + struct nft_hook { struct list_head list; struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; + u8 flags; }; struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, @@ -1664,6 +1667,16 @@ struct nft_trans { u8 put_net:1; }; +/** + * struct nft_trans_hook - nf_tables hook update in transaction + * @list: used internally + * @hook: struct nft_hook with the device hook + */ +struct nft_trans_hook { + struct list_head list; + struct nft_hook *hook; +}; + /** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ae10116af923..d20ce5c36d31 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -380,6 +380,32 @@ static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook) nft_netdev_hook_free_rcu(hook); } +static void nft_trans_hook_destroy(struct nft_trans_hook *trans_hook) +{ + list_del(&trans_hook->list); + kfree(trans_hook); +} + +static void nft_netdev_unregister_trans_hook(struct net *net, + const struct nft_table *table, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) @@ -1946,15 +1972,69 @@ static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) return nla_put_string(skb, attr, hook->ifname); } +struct nft_hook_dump_ctx { + struct nft_hook *first; + int n; +}; + +static int nft_dump_basechain_hook_one(struct sk_buff *skb, + struct nft_hook *hook, + struct nft_hook_dump_ctx *dump_ctx) +{ + if (!dump_ctx->first) + dump_ctx->first = hook; + + if (nft_nla_put_hook_dev(skb, hook)) + return -1; + + dump_ctx->n++; + + return 0; +} + +static int nft_dump_basechain_hook_list(struct sk_buff *skb, + const struct net *net, + const struct list_head *hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_hook *hook; + int err; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + err = nft_dump_basechain_hook_one(skb, hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + +static int nft_dump_basechain_trans_hook_list(struct sk_buff *skb, + const struct list_head *trans_hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_trans_hook *trans_hook; + int err; + + list_for_each_entry(trans_hook, trans_hook_list, list) { + err = nft_dump_basechain_hook_one(skb, trans_hook->hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { const struct nf_hook_ops *ops = &basechain->ops; - struct nft_hook *hook, *first = NULL; + struct nft_hook_dump_ctx dump_hook_ctx = {}; struct nlattr *nest, *nest_devs; - int n = 0; nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) @@ -1969,23 +2049,23 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &basechain->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (!first) - first = hook; - - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; - n++; + if (hook_list && + nft_dump_basechain_hook_list(skb, net, hook_list, &dump_hook_ctx)) { + goto nla_put_failure; + } else if (trans_hook_list && + nft_dump_basechain_trans_hook_list(skb, trans_hook_list, + &dump_hook_ctx)) { + goto nla_put_failure; } + nla_nest_end(skb, nest_devs); - if (n == 1 && - !hook_is_prefix(first) && - nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) + if (dump_hook_ctx.n == 1 && + !hook_is_prefix(dump_hook_ctx.first) && + nla_put_string(skb, NFTA_HOOK_DEV, dump_hook_ctx.first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1999,7 +2079,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nlmsghdr *nlh; @@ -2015,7 +2096,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, NFTA_CHAIN_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELCHAIN || event == NFT_MSG_DESTROYCHAIN)) { nlmsg_end(skb, nlh); @@ -2026,7 +2107,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) + if (nft_dump_basechain_hook(skb, net, family, basechain, + hook_list, trans_hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -2062,7 +2144,8 @@ nla_put_failure: } static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; @@ -2082,7 +2165,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, hook_list); + ctx->chain, hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -2128,7 +2211,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, - chain, NULL) < 0) + chain, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -2182,7 +2265,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, - 0, family, table, chain, NULL); + 0, family, table, chain, NULL, NULL); if (err < 0) goto err_fill_chain_info; @@ -2345,8 +2428,12 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, list_for_each_entry(hook, hook_list, list) { if (!strncmp(hook->ifname, this->ifname, - min(hook->ifnamelen, this->ifnamelen))) + min(hook->ifnamelen, this->ifnamelen))) { + if (hook->flags & NFT_HOOK_REMOVE) + continue; + return hook; + } } return NULL; @@ -3105,6 +3192,32 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, policy, flags, extack); } +static int nft_trans_delhook(struct nft_hook *hook, + struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook; + + trans_hook = kmalloc_obj(*trans_hook, GFP_KERNEL); + if (!trans_hook) + return -ENOMEM; + + trans_hook->hook = hook; + list_add_tail(&trans_hook->list, del_list); + hook->flags |= NFT_HOOK_REMOVE; + + return 0; +} + +static void nft_trans_delhook_abort(struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook, *next; + + list_for_each_entry_safe(trans_hook, next, del_list, list) { + trans_hook->hook->flags &= ~NFT_HOOK_REMOVE; + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) @@ -3131,7 +3244,10 @@ static int nft_delchain_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_chain_del_hook; } - list_move(&hook->list, &chain_del_list); + if (nft_trans_delhook(hook, &chain_del_list) < 0) { + err = -ENOMEM; + goto err_chain_del_hook; + } } trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); @@ -3151,7 +3267,7 @@ static int nft_delchain_hook(struct nft_ctx *ctx, return 0; err_chain_del_hook: - list_splice(&chain_del_list, &basechain->hook_list); + nft_trans_delhook_abort(&chain_del_list); nft_chain_release_hook(&chain_hook); return err; @@ -8941,6 +9057,24 @@ static void nft_hooks_destroy(struct list_head *hook_list) nft_netdev_hook_unlink_free_rcu(hook); } +static void nft_flowtable_unregister_trans_hook(struct net *net, + struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); + + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) @@ -9199,7 +9333,10 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_flowtable_del_hook; } - list_move(&hook->list, &flowtable_del_list); + if (nft_trans_delhook(hook, &flowtable_del_list) < 0) { + err = -ENOMEM; + goto err_flowtable_del_hook; + } } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, @@ -9220,7 +9357,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, return 0; err_flowtable_del_hook: - list_splice(&flowtable_del_list, &flowtable->hook_list); + nft_trans_delhook_abort(&flowtable_del_list); nft_flowtable_hook_release(&flowtable_hook); return err; @@ -9285,8 +9422,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, struct nft_flowtable *flowtable, - struct list_head *hook_list) + struct list_head *hook_list, + struct list_head *trans_hook_list) { + struct nft_trans_hook *trans_hook; struct nlattr *nest, *nest_devs; struct nft_hook *hook; struct nlmsghdr *nlh; @@ -9303,7 +9442,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, NFTA_FLOWTABLE_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELFLOWTABLE || event == NFT_MSG_DESTROYFLOWTABLE)) { nlmsg_end(skb, nlh); @@ -9325,13 +9464,20 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; + if (hook_list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + if (nft_nla_put_hook_dev(skb, hook)) + goto nla_put_failure; + } + } else if (trans_hook_list) { + list_for_each_entry(trans_hook, trans_hook_list, list) { + if (nft_nla_put_hook_dev(skb, trans_hook->hook)) + goto nla_put_failure; + } } nla_nest_end(skb, nest_devs); nla_nest_end(skb, nest); @@ -9385,7 +9531,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, - flowtable, NULL) < 0) + flowtable, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -9485,7 +9631,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable, NULL); + flowtable, NULL, NULL); if (err < 0) goto err_fill_flowtable_info; @@ -9498,7 +9644,9 @@ err_fill_flowtable_info: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, - struct list_head *hook_list, int event) + struct list_head *hook_list, + struct list_head *trans_hook_list, + int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; @@ -9518,7 +9666,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, - ctx->family, flowtable, hook_list); + ctx->family, flowtable, + hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -10052,9 +10201,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: - if (nft_trans_chain_update(trans)) - nft_hooks_destroy(&nft_trans_chain_hooks(trans)); - else + if (!nft_trans_chain_update(trans)) nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: @@ -10075,9 +10222,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: - if (nft_trans_flowtable_update(trans)) - nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); - else + if (!nft_trans_flowtable_update(trans)) nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -10837,31 +10982,28 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nft_chain_commit_update(nft_trans_container_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, - &nft_trans_chain_hooks(trans)); + &nft_trans_chain_hooks(trans), NULL); list_splice_rcu(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); nft_clear(net, nft_trans_chain(trans)); - nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL, &nft_trans_chain_hooks(trans)); - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_trans_hook(net, table, + &nft_trans_chain_hooks(trans)); } else { nft_chain_del(nft_trans_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, - NULL); + NULL, NULL); nf_tables_unregister_hook(ctx.net, ctx.table, nft_trans_chain(trans)); } @@ -10967,6 +11109,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), + NULL, NFT_MSG_NEWFLOWTABLE); list_splice_rcu(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); @@ -10975,6 +11118,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, NFT_MSG_NEWFLOWTABLE); } nft_trans_destroy(trans); @@ -10984,16 +11128,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), + NULL, &nft_trans_flowtable_hooks(trans), trans->msg_type); - nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans), - &nft_trans_flowtable_hooks(trans)); + nft_flowtable_unregister_trans_hook(net, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), @@ -11157,8 +11303,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - list_splice(&nft_trans_chain_hooks(trans), - &nft_trans_basechain(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_chain_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_chain(trans)); @@ -11272,8 +11417,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - list_splice(&nft_trans_flowtable_hooks(trans), - &nft_trans_flowtable(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_flowtable_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_flowtable(trans)); From 4b2b4d7d4e203c92db8966b163edfacb1f0e1e29 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Fri, 17 Apr 2026 20:25:06 +0800 Subject: [PATCH 06/79] netfilter: xt_policy: fix strict mode inbound policy matching match_policy_in() walks sec_path entries from the last transform to the first one, but strict policy matching needs to consume info->pol[] in the same forward order as the rule layout. Derive the strict-match policy position from the number of transforms already consumed so that multi-element inbound rules are matched consistently. Fixes: c4b885139203 ("[NETFILTER]: x_tables: replace IPv4/IPv6 policy match by address family independant version") Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index cb6e8279010a..b5fa65558318 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -63,7 +63,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, return 0; for (i = sp->len - 1; i >= 0; i--) { - pos = strict ? i - sp->len + 1 : 0; + pos = strict ? sp->len - i - 1 : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; From fe11e5c40817b84abaa5d83bfb6586d8412bfd07 Mon Sep 17 00:00:00 2001 From: Kai Ma Date: Wed, 22 Apr 2026 22:54:18 +0800 Subject: [PATCH 07/79] netfilter: reject zero shift in nft_bitwise Reject zero shift operands for nft_bitwise left and right shift expressions during initialization. The carry propagation logic computes the carry from the adjacent 32-bit word using BITS_PER_TYPE(u32) - shift. A zero shift operand turns this into a 32-bit shift, which is undefined behaviour. Reject zero shift operands in the control plane, alongside the existing check for values greater than or equal to 32, so malformed rules never reach the packet path. Fixes: 567d746b55bc ("netfilter: bitwise: add support for shifts.") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Kai Ma Signed-off-by: Ren Wei Reviewed-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_bitwise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 13808e9cd999..94dccdcfa06b 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -196,7 +196,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, if (err < 0) return err; - if (priv->data.data[0] >= BITS_PER_TYPE(u32)) { + if (!priv->data.data[0] || + priv->data.data[0] >= BITS_PER_TYPE(u32)) { nft_data_release(&priv->data, desc.type); return -EINVAL; } From 8cf6809cddcbe301aedfc6b51bcd4944d45795f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 Apr 2026 02:19:11 +0200 Subject: [PATCH 08/79] netfilter: nf_conntrack_sip: don't use simple_strtoul MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace unsafe port parsing in epaddr_len(), ct_sip_parse_header_uri(), and ct_sip_parse_request() with a new sip_parse_port() helper that validates each digit against the buffer limit, eliminating the use of simple_strtoul() which assumes NUL-terminated strings. The previous code dereferenced pointers without bounds checks after sip_parse_addr() and relied on simple_strtoul() on non-NUL-terminated skb data. A port that reaches the buffer limit without a trailing character is also rejected as malformed. Also get rid of all simple_strtoul() usage in conntrack, prefer a stricter version instead. There are intentional changes: - Bail out if number is > UINT_MAX and indicate a failure, same for too long sequences. While we do accept 05535 as port 5535, we will not accept e.g. 'sip:10.0.0.1:005060'. While its syntactically valid under RFC 3261, we should restrict this to not waste cycles when presented with malformed packets with 64k '0' characters. - Force base 10 in ct_sip_parse_numerical_param(). This is used to fetch 'expire=' and 'rports='; both are expected to use base-10. - In nf_nat_sip.c, only accept the parsed value if its within the 1k-64k range. - epaddr_len now returns 0 if the port is invalid, as it already does for invalid ip addresses. This is intentional. nf_conntrack_sip performs lots of guesswork to find the right parts of the message to parse. Being stricter could break existing setups. Connection tracking helpers are designed to allow traffic to pass, not to block it. Based on an earlier patch from Jenny Guanni Qu . Fixes: 05e3ced297fe ("[NETFILTER]: nf_conntrack_sip: introduce SIP-URI parsing helper") Reported-by: Klaudia Kloc Reported-by: Dawid Moczadło Reported-by: Jenny Guanni Qu . Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 152 ++++++++++++++++++++++++------- net/netfilter/nf_nat_sip.c | 1 + 2 files changed, 119 insertions(+), 34 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 182cfb119448..1eb55907d470 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -181,6 +181,57 @@ static int sip_parse_addr(const struct nf_conn *ct, const char *cp, return 1; } +/* Parse optional port number after IP address. + * Returns false on malformed input, true otherwise. + * If port is non-NULL, stores parsed port in network byte order. + * If no port is present, sets *port to default SIP port. + */ +static bool sip_parse_port(const char *dptr, const char **endp, + const char *limit, __be16 *port) +{ + unsigned int p = 0; + int len = 0; + + if (dptr >= limit) + return false; + + if (*dptr != ':') { + if (port) + *port = htons(SIP_PORT); + if (endp) + *endp = dptr; + return true; + } + + dptr++; /* skip ':' */ + + while (dptr < limit && isdigit(*dptr)) { + p = p * 10 + (*dptr - '0'); + dptr++; + len++; + if (len > 5) /* max "65535" */ + return false; + } + + if (len == 0) + return false; + + /* reached limit while parsing port */ + if (dptr >= limit) + return false; + + if (p < 1024 || p > 65535) + return false; + + if (port) + *port = htons(p); + + if (endp) + *endp = dptr; + + return true; +} + /* skip ip address. returns its length. */ static int epaddr_len(const struct nf_conn *ct, const char *dptr, const char *limit, int *shift) @@ -193,11 +244,8 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, return 0; } - /* Port number */ - if (*dptr == ':') { - dptr++; - dptr += digits_len(ct, dptr, limit, shift); - } + if (!sip_parse_port(dptr, &dptr, limit, NULL)) + return 0; return dptr - aux; } @@ -228,6 +276,51 @@ static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr, return epaddr_len(ct, dptr, limit, shift); } +/* simple_strtoul stops after first non-number character. + * But as we're not dealing with c-strings, we can't rely on + * hitting \r,\n,\0 etc. before moving past end of buffer. + * + * This is a variant of simple_strtoul, but doesn't require + * a c-string. + * + * If value exceeds UINT_MAX, 0 is returned. + */ +static unsigned int sip_strtouint(const char *cp, unsigned int len, char **endp) +{ + const unsigned int max = sizeof("4294967295"); + unsigned int olen = len; + const char *s = cp; + u64 result = 0; + + if (len > max) + len = max; + + while (olen > 0 && isdigit(*s)) { + unsigned int value; + + if (len == 0) + goto err; + + value = *s - '0'; + result = result * 10 + value; + + if (result > UINT_MAX) + goto err; + s++; + len--; + olen--; + } + + if (endp) + *endp = (char *)s; + + return result; +err: + if (endp) + *endp = (char *)cp; + return 0; +} + /* Parse a SIP request line of the form: * * Request-Line = Method SP Request-URI SP SIP-Version CRLF @@ -241,7 +334,6 @@ int ct_sip_parse_request(const struct nf_conn *ct, { const char *start = dptr, *limit = dptr + datalen, *end; unsigned int mlen; - unsigned int p; int shift = 0; /* Skip method and following whitespace */ @@ -267,14 +359,8 @@ int ct_sip_parse_request(const struct nf_conn *ct, if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; - if (end < limit && *end == ':') { - end++; - p = simple_strtoul(end, (char **)&end, 10); - if (p < 1024 || p > 65535) - return -1; - *port = htons(p); - } else - *port = htons(SIP_PORT); + if (!sip_parse_port(end, &end, limit, port)) + return -1; if (end == dptr) return 0; @@ -509,7 +595,6 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, union nf_inet_addr *addr, __be16 *port) { const char *c, *limit = dptr + datalen; - unsigned int p; int ret; ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen, @@ -520,14 +605,8 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; - if (*c == ':') { - c++; - p = simple_strtoul(c, (char **)&c, 10); - if (p < 1024 || p > 65535) - return -1; - *port = htons(p); - } else - *port = htons(SIP_PORT); + if (!sip_parse_port(c, &c, limit, port)) + return -1; if (dataoff) *dataoff = c - dptr; @@ -609,7 +688,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - *val = simple_strtoul(start, &end, 0); + *val = sip_strtouint(start, limit - start, (char **)&end); if (start == end) return -1; if (matchoff && matchlen) { @@ -1064,6 +1143,8 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, mediaoff = sdpoff; for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) { + char *end; + if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen, SDP_HDR_MEDIA, SDP_HDR_UNSPEC, &mediaoff, &medialen) <= 0) @@ -1079,8 +1160,8 @@ static int process_sdp(struct sk_buff *skb, unsigned int protoff, mediaoff += t->len; medialen -= t->len; - port = simple_strtoul(*dptr + mediaoff, NULL, 10); - if (port == 0) + port = sip_strtouint(*dptr + mediaoff, *datalen - mediaoff, (char **)&end); + if (port == 0 || *dptr + mediaoff == end) continue; if (port < 1024 || port > 65535) { nf_ct_helper_log(skb, ct, "wrong port %u", port); @@ -1254,7 +1335,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, */ if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, &matchoff, &matchlen) > 0) - expires = simple_strtoul(*dptr + matchoff, NULL, 10); + expires = sip_strtouint(*dptr + matchoff, *datalen - matchoff, NULL); ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, SIP_HDR_CONTACT, NULL, @@ -1358,7 +1439,7 @@ static int process_register_response(struct sk_buff *skb, unsigned int protoff, if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, &matchoff, &matchlen) > 0) - expires = simple_strtoul(*dptr + matchoff, NULL, 10); + expires = sip_strtouint(*dptr + matchoff, *datalen - matchoff, NULL); while (1) { unsigned int c_expires = expires; @@ -1418,10 +1499,12 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen, matchend; unsigned int code, cseq, i; + char *end; if (*datalen < strlen("SIP/2.0 200")) return NF_ACCEPT; - code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); + code = sip_strtouint(*dptr + strlen("SIP/2.0 "), + *datalen - strlen("SIP/2.0 "), NULL); if (!code) { nf_ct_helper_log(skb, ct, "cannot get code"); return NF_DROP; @@ -1432,8 +1515,8 @@ static int process_sip_response(struct sk_buff *skb, unsigned int protoff, nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; } - cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq && *(*dptr + matchoff) != '0') { + cseq = sip_strtouint(*dptr + matchoff, *datalen - matchoff, (char **)&end); + if (*dptr + matchoff == end) { nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; } @@ -1482,6 +1565,7 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { const struct sip_handler *handler; + char *end; handler = &sip_handlers[i]; if (handler->request == NULL) @@ -1498,8 +1582,8 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, nf_ct_helper_log(skb, ct, "cannot parse cseq"); return NF_DROP; } - cseq = simple_strtoul(*dptr + matchoff, NULL, 10); - if (!cseq && *(*dptr + matchoff) != '0') { + cseq = sip_strtouint(*dptr + matchoff, *datalen - matchoff, (char **)&end); + if (*dptr + matchoff == end) { nf_ct_helper_log(skb, ct, "cannot get cseq"); return NF_DROP; } @@ -1575,7 +1659,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, &matchoff, &matchlen) <= 0) break; - clen = simple_strtoul(dptr + matchoff, (char **)&end, 10); + clen = sip_strtouint(dptr + matchoff, datalen - matchoff, (char **)&end); if (dptr + matchoff == end) break; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index c845b6d1a2bd..9fbfc6bff0c2 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -246,6 +246,7 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff, if (ct_sip_parse_numerical_param(ct, *dptr, matchend, *datalen, "rport=", &poff, &plen, &n) > 0 && + n >= 1024 && n <= 65535 && htons(n) == ct->tuplehash[dir].tuple.dst.u.udp.port && htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; From 3618442d54f366eeee8f6c83a47861ca22918dfe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Apr 2026 15:08:57 -0700 Subject: [PATCH 09/79] MAINTAINERS: add pcnet_cs to PCMCIA Per discussion under the Link make sure Dominik can help with the patches to drivers/net/ethernet/8390/pcnet_cs.c cc: linux@dominikbrodowski.net Link: https://lore.kernel.org/aeomUh5JqFvkLTH7@scops.dominikbrodowski.net Acked-by: Dominik Brodowski Link: https://patch.msgid.link/20260423220857.3490118-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2fb1c75afd16..21288a3a7d93 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20774,6 +20774,7 @@ M: Dominik Brodowski S: Odd Fixes T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/linux.git F: Documentation/pcmcia/ +F: drivers/net/ethernet/8390/pcnet_cs.c F: drivers/pcmcia/ F: include/pcmcia/ F: tools/pcmcia/ From 35eaa6d8d6c2ee65e96f507add856e0eacf24591 Mon Sep 17 00:00:00 2001 From: "Nikola Z. Ivanov" Date: Sun, 26 Apr 2026 23:14:34 +0300 Subject: [PATCH 10/79] netdevsim: zero initialize struct iphdr in dummy sk_buff Syzbot reports a KMSAN uninit-value originating from nsim_dev_trap_skb_build, with the allocation also being performed in the same function. Fix this by calling skb_put_zero instead of skb_put to guarantee zero initialization of the whole IP header. Closes: https://syzkaller.appspot.com/bug?extid=23d7fcd204e3837866ff Fixes: da58f90f11f5 ("netdevsim: Add devlink-trap support") Signed-off-by: Nikola Z. Ivanov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260426201434.742030-1-zlatistiv@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 1e06e781c835..f00fc2f9ebde 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -829,7 +829,7 @@ static struct sk_buff *nsim_dev_trap_skb_build(void) skb->protocol = htons(ETH_P_IP); skb_set_network_header(skb, skb->len); - iph = skb_put(skb, sizeof(struct iphdr)); + iph = skb_put_zero(skb, sizeof(struct iphdr)); iph->protocol = IPPROTO_UDP; iph->saddr = in_aton("192.0.2.1"); iph->daddr = in_aton("198.51.100.1"); From 732b463449fd0ef90acd13cda68eab1c91adb00c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:39 -0700 Subject: [PATCH 11/79] net/sched: netem: fix probability gaps in 4-state loss model The 4-state Markov chain in loss_4state() has gaps at the boundaries between transition probability ranges. The comparisons use: if (rnd < a4) else if (a4 < rnd && rnd < a1 + a4) When rnd equals a boundary value exactly, neither branch matches and no state transition occurs. The redundant lower-bound check (a4 < rnd) is already implied by being in the else branch. Remove the unnecessary lower-bound comparisons so the ranges are contiguous and every random value produces a transition, matching the GI (General and Intuitive) loss model specification. This bug goes back to original implementation of this model. Fixes: 661b79725fea ("netem: revised correlated loss generator") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-2-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 20df1c08b1e9..8ee72cac1faf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -227,10 +227,10 @@ static bool loss_4state(struct netem_sched_data *q) if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; - } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { + } else if (rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; - } else if (clg->a1 + clg->a4 < rnd) { + } else { clg->state = TX_IN_GAP_PERIOD; } @@ -247,9 +247,9 @@ static bool loss_4state(struct netem_sched_data *q) case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; - else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { + else if (rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; - } else if (clg->a2 + clg->a3 < rnd) { + } else { clg->state = LOST_IN_BURST_PERIOD; return true; } From 4185701fcce6b426b6c3630b25330dddd9c47b0d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:40 -0700 Subject: [PATCH 12/79] net/sched: netem: fix queue limit check to include reordered packets The queue limit check in netem_enqueue() uses q->t_len which only counts packets in the internal tfifo. Packets placed in sch->q by the reorder path (__qdisc_enqueue_head) are not counted, allowing the total queue occupancy to exceed sch->limit under reordering. Include sch->q.qlen in the limit check. Fixes: f8d4bc455047 ("net/sched: netem: account for backlog updates from child qdisc") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-3-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 8ee72cac1faf..d400a730eadd 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -524,7 +524,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1 << get_random_u32_below(8); } - if (unlikely(q->t_len >= sch->limit)) { + if (unlikely(sch->q.qlen >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); From 986afaf809940577224a99c3a08d97a15eb37e93 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:41 -0700 Subject: [PATCH 13/79] net/sched: netem: only reseed PRNG when seed is explicitly provided netem_change() unconditionally reseeds the PRNG on every tc change command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed is generated, destroying reproducibility for users who set a deterministic seed on a previous change. Move the initial random seed generation to netem_init() and only reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly provided by the user. Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-4-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index d400a730eadd..556f9747f0e7 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1112,11 +1112,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); - if (tb[TCA_NETEM_PRNG_SEED]) + if (tb[TCA_NETEM_PRNG_SEED]) { q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); - else - q->prng.seed = get_random_u64(); - prandom_seed_state(&q->prng.prng_state, q->prng.seed); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + } unlock: sch_tree_unlock(sch); @@ -1139,6 +1138,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; q->loss_model = CLG_RANDOM; + q->prng.seed = get_random_u64(); + prandom_seed_state(&q->prng.prng_state, q->prng.seed); + ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); From 01801c359a74737b9b1aa28568b60374d857241a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:42 -0700 Subject: [PATCH 14/79] net/sched: netem: validate slot configuration Reject slot configurations that have no defensible meaning: - negative min_delay or max_delay - min_delay greater than max_delay - negative dist_delay or dist_jitter - negative max_packets or max_bytes Negative or out-of-order delays underflow in get_slot_next(), producing garbage intervals. Negative limits trip the per-slot accounting (packets_left/bytes_left <= 0) on the first packet of every slot, defeating the rate-limiting half of the slot feature. Note that dist_jitter has been silently coerced to its absolute value by get_slot() since the feature was introduced; rejecting negatives here converts that silent coercion into -EINVAL. The abs() can be removed in a follow-up. Fixes: 836af83b54e3 ("netem: support delivering packets in delayed time slots") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-5-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 556f9747f0e7..640b51be807a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -827,6 +827,29 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) return 0; } +static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack) +{ + const struct tc_netem_slot *c = nla_data(attr); + + if (c->min_delay < 0 || c->max_delay < 0) { + NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot delay"); + return -EINVAL; + } + if (c->min_delay > c->max_delay) { + NL_SET_ERR_MSG_ATTR(extack, attr, "slot min delay greater than max delay"); + return -EINVAL; + } + if (c->dist_delay < 0 || c->dist_jitter < 0) { + NL_SET_ERR_MSG_ATTR(extack, attr, "negative dist delay"); + return -EINVAL; + } + if (c->max_packets < 0 || c->max_bytes < 0) { + NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot limit"); + return -EINVAL; + } + return 0; +} + static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_slot *c = nla_data(attr); @@ -1040,6 +1063,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, goto table_free; } + if (tb[TCA_NETEM_SLOT]) { + ret = validate_slot(tb[TCA_NETEM_SLOT], extack); + if (ret) + goto table_free; + } + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; From 51e94e1e2fef351c74d69eb53666df808d26af95 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:43 -0700 Subject: [PATCH 15/79] net/sched: netem: fix slot delay calculation overflow get_slot_next() computes a random delay between min_delay and max_delay using: get_random_u32() * (max_delay - min_delay) >> 32 This overflows signed 64-bit arithmetic when the delay range exceeds approximately 2.1 seconds (2^31 nanoseconds), producing a negative result that effectively disables slot-based pacing. This is a realistic configuration for WAN emulation (e.g., slot 1s 5s). Use mul_u64_u32_shr() which handles the widening multiply without overflow. Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-6-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 640b51be807a..475c14b3dbdb 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -659,9 +659,8 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) if (!q->slot_dist) next_delay = q->slot_config.min_delay + - (get_random_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay, + get_random_u32(), 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), From 90be9fedb218ee95a1cf59050d1306fbfb0e8b87 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 17 Apr 2026 20:19:44 -0700 Subject: [PATCH 16/79] net/sched: netem: check for negative latency and jitter Reject requests with negative latency or jitter. A negative value added to current timestamp (u64) wraps to an enormous time_to_send, disabling dequeue. The original UAPI used u32 for these values; the conversion to 64-bit time values via TCA_NETEM_LATENCY64 and TCA_NETEM_JITTER64 allowed signed values to reach the kernel without validation. Jitter is already silently clamped by an abs() in netem_change(); that abs() can be removed in a follow-up once this rejection is in place. Fixes: 99803171ef04 ("netem: add uapi to express delay and jitter in nanoseconds") Signed-off-by: Stephen Hemminger Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260418032027.900913-7-stephen@networkplumber.org Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 475c14b3dbdb..bc18e1976b6e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -826,6 +826,16 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) return 0; } +static int validate_time(const struct nlattr *attr, const char *name, + struct netlink_ext_ack *extack) +{ + if (nla_get_s64(attr) < 0) { + NL_SET_ERR_MSG_ATTR_FMT(extack, attr, "negative %s", name); + return -EINVAL; + } + return 0; +} + static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack) { const struct tc_netem_slot *c = nla_data(attr); @@ -1068,6 +1078,18 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, goto table_free; } + if (tb[TCA_NETEM_LATENCY64]) { + ret = validate_time(tb[TCA_NETEM_LATENCY64], "latency", extack); + if (ret) + goto table_free; + } + + if (tb[TCA_NETEM_JITTER64]) { + ret = validate_time(tb[TCA_NETEM_JITTER64], "jitter", extack); + if (ret) + goto table_free; + } + sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; From 2d9f5a118205da2683ffcec78b9347f1f01a820e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 21 Apr 2026 08:35:11 +0200 Subject: [PATCH 17/79] net: airoha: fix BQL imbalance in TX path Fix a possible BQL imbalance in airoha_dev_xmit(), where inflight packets are accounted only for the AIROHA_NUM_TX_RING netdev TX queues. The queue index is computed as: qid = skb_get_queue_mapping(skb) % ARRAY_SIZE(qdma->q_tx) txq = netdev_get_tx_queue(dev, qid); However, airoha_qdma_tx_napi_poll() accounts completions across all netdev TX queues (num_tx_queues), leading to inconsistent BQL accounting. Also reset all netdev TX queues in the ndo_stop callback. Fixes: 1d304174106c ("net: airoha: Implement BQL support") Fixes: c9f947769b77 ("net: airoha: Reset BQL stopping the netdevice") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260421-airoha-fix-bql-v1-1-f135afe4275b@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 2bb0a3ff9810..daae15aa078c 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -929,10 +929,9 @@ static int airoha_qdma_tx_napi_poll(struct napi_struct *napi, int budget) q->queued--; if (skb) { - u16 queue = skb_get_queue_mapping(skb); struct netdev_queue *txq; - txq = netdev_get_tx_queue(skb->dev, queue); + txq = skb_get_tx_queue(skb->dev, skb); netdev_tx_completed_queue(txq, 1, skb->len); dev_kfree_skb_any(skb); } @@ -1744,7 +1743,7 @@ static int airoha_dev_stop(struct net_device *dev) if (err) return err; - for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) + for (i = 0; i < dev->num_tx_queues; i++) netdev_tx_reset_subqueue(dev, i); airoha_set_gdm_port_fwd_cfg(qdma->eth, REG_GDM_FWD_CFG(port->id), @@ -2039,7 +2038,7 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, spin_lock_bh(&q->lock); - txq = netdev_get_tx_queue(dev, qid); + txq = skb_get_tx_queue(dev, skb); nr_frags = 1 + skb_shinfo(skb)->nr_frags; if (q->queued + nr_frags >= q->ndesc) { From 3854de7b38be742cf7558476956d12414cb274f2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 21 Apr 2026 08:43:07 +0200 Subject: [PATCH 18/79] net: airoha: stop net_device TX queue before updating CPU index Currently, airoha_eth driver updates the CPU index register prior of verifying whether the number of free descriptors has fallen below the threshold. Move net_device TX queue length check before updating the TX CPU index in order to update TX CPU index even if there are more packets to be transmitted but the net_device TX queue is going to be stopped accounting the inflight packets. Fixes: 1d304174106c ("net: airoha: Implement BQL support") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260421-airoha-xmit-stop-condition-v1-1-e670d6a48467@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index daae15aa078c..27c85cd95750 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2094,17 +2094,16 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); netdev_tx_sent_queue(txq, skb->len); + if (q->ndesc - q->queued < q->free_thr) { + netif_tx_stop_queue(txq); + q->txq_stopped = true; + } if (netif_xmit_stopped(txq) || !netdev_xmit_more()) airoha_qdma_rmw(qdma, REG_TX_CPU_IDX(qid), TX_RING_CPU_IDX_MASK, FIELD_PREP(TX_RING_CPU_IDX_MASK, index)); - if (q->ndesc - q->queued < q->free_thr) { - netif_tx_stop_queue(txq); - q->txq_stopped = true; - } - spin_unlock_bh(&q->lock); return NETDEV_TX_OK; From e070aac63b42bf81f4dc565f9f841ff47e6c992f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 21 Apr 2026 10:53:33 +0200 Subject: [PATCH 19/79] net: airoha: Do not wake all netdev TX queues in airoha_qdma_wake_netdev_txqs() Do not wake every netdev TX queue across all ports sharing the QDMA running netif_tx_wake_all_queues routine in airoha_qdma_wake_netdev_txqs() but only the ones that are mapped the specific QDMA stopped hw TX queue. This patch can potentially avoid waking already stopped netdev TX queues that are mapped to a different QDMA hw TX queue. Introduce airoha_qdma_get_txq utility routine. Fixes: b94769eb2f30 ("net: airoha: Fix possible TX queue stall in airoha_qdma_tx_napi_poll()") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260421-airoha-wake_netdev_txqs-optmization-v1-1-e0be95115d53@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 19 +++++++++++++++---- drivers/net/ethernet/airoha/airoha_eth.h | 5 +++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 27c85cd95750..905bcbf90752 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -847,13 +847,24 @@ static void airoha_qdma_wake_netdev_txqs(struct airoha_queue *q) { struct airoha_qdma *qdma = q->qdma; struct airoha_eth *eth = qdma->eth; - int i; + int i, qid = q - &qdma->q_tx[0]; for (i = 0; i < ARRAY_SIZE(eth->ports); i++) { struct airoha_gdm_port *port = eth->ports[i]; + int j; - if (port && port->qdma == qdma) - netif_tx_wake_all_queues(port->dev); + if (!port) + continue; + + if (port->qdma != qdma) + continue; + + for (j = 0; j < port->dev->num_tx_queues; j++) { + if (airoha_qdma_get_txq(qdma, j) != qid) + continue; + + netif_wake_subqueue(port->dev, j); + } } q->txq_stopped = false; } @@ -2001,7 +2012,7 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, u16 index; u8 fport; - qid = skb_get_queue_mapping(skb) % ARRAY_SIZE(qdma->q_tx); + qid = airoha_qdma_get_txq(qdma, skb_get_queue_mapping(skb)); tag = airoha_get_dsa_tag(skb, dev); msg0 = FIELD_PREP(QDMA_ETH_TXMSG_CHAN_MASK, diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index e389d2fe3b86..4fad3acc3ccf 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -631,6 +631,11 @@ u32 airoha_rmw(void __iomem *base, u32 offset, u32 mask, u32 val); #define airoha_qdma_clear(qdma, offset, val) \ airoha_rmw((qdma)->regs, (offset), (val), 0) +static inline u16 airoha_qdma_get_txq(struct airoha_qdma *qdma, u16 qid) +{ + return qid % ARRAY_SIZE(qdma->q_tx); +} + static inline bool airoha_is_lan_gdm_port(struct airoha_gdm_port *port) { /* GDM1 port on EN7581 SoC is connected to the lan dsa switch. From bde34e84edc8b5571fbde7e941e175a4293ee1eb Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 24 Apr 2026 11:00:28 +0200 Subject: [PATCH 20/79] net: airoha: Do not read uninitialized fragment address in airoha_dev_xmit() The transmit loop in airoha_dev_xmit() reads fragment address and length during its final iteration, when the loop index equals skb_shinfo(skb)->nr_frags, at which point the fragment data is uninitialized. While these values are never consumed, the read itself is unsafe and may trigger a page fault. Fix this by avoiding the fragment read on the last iteration. Additionally, move the skb pointer from the first to the last used packet descriptor, so that airoha_qdma_tx_napi_poll() defers freeing the skb until the final descriptor is processed. Fixes: 23020f0493270 ("net: airoha: Introduce ethernet support for EN7581 SoC") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260424-airoha-xmit-fix-read-frag-v1-1-fdc0a83c79e8@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 905bcbf90752..5effb4a4ae84 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2007,8 +2007,8 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, struct netdev_queue *txq; struct airoha_queue *q; LIST_HEAD(tx_list); + int i = 0, qid; void *data; - int i, qid; u16 index; u8 fport; @@ -2067,7 +2067,7 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, list); index = e - q->entry; - for (i = 0; i < nr_frags; i++) { + while (true) { struct airoha_qdma_desc *desc = &q->desc[index]; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; dma_addr_t addr; @@ -2079,7 +2079,7 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, goto error_unmap; list_move_tail(&e->list, &tx_list); - e->skb = i ? NULL : skb; + e->skb = i == nr_frags - 1 ? skb : NULL; e->dma_addr = addr; e->dma_len = len; @@ -2098,6 +2098,9 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, WRITE_ONCE(desc->msg1, cpu_to_le32(msg1)); WRITE_ONCE(desc->msg2, cpu_to_le32(0xffff)); + if (++i == nr_frags) + break; + data = skb_frag_address(frag); len = skb_frag_size(frag); } From d3aeb889dcbd78e95f500d383799a23d949796e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2026 06:28:39 +0000 Subject: [PATCH 21/79] net/sched: sch_choke: annotate data-races in choke_dump_stats() choke_dump_stats() only runs with RTNL held. It reads fields that can be changed in qdisc fast path. Add READ_ONCE()/WRITE_ONCE() annotations. Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260423062839.2524324-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_choke.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 94df8e741a97..2875bcdb18a4 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -229,7 +229,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Draw a packet at random from queue and compare flow */ if (choke_match_random(q, skb, &idx)) { - q->stats.matched++; + WRITE_ONCE(q->stats.matched, q->stats.matched + 1); choke_drop_by_idx(sch, idx, to_free); goto congestion_drop; } @@ -241,11 +241,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_qstats_overlimit(sch); if (use_harddrop(q) || !use_ecn(q) || !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; + WRITE_ONCE(q->stats.forced_drop, + q->stats.forced_drop + 1); goto congestion_drop; } - q->stats.forced_mark++; + WRITE_ONCE(q->stats.forced_mark, + q->stats.forced_mark + 1); } else if (++q->vars.qcount) { if (red_mark_probability(p, &q->vars, q->vars.qavg)) { q->vars.qcount = 0; @@ -253,11 +255,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_qstats_overlimit(sch); if (!use_ecn(q) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; + WRITE_ONCE(q->stats.prob_drop, + q->stats.prob_drop + 1); goto congestion_drop; } - q->stats.prob_mark++; + WRITE_ONCE(q->stats.prob_mark, + q->stats.prob_mark + 1); } } else q->vars.qR = red_random(p); @@ -272,7 +276,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } - q->stats.pdrop++; + WRITE_ONCE(q->stats.pdrop, q->stats.pdrop + 1); return qdisc_drop(skb, sch, to_free); congestion_drop: @@ -461,10 +465,12 @@ static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct choke_sched_data *q = qdisc_priv(sch); struct tc_choke_xstats st = { - .early = q->stats.prob_drop + q->stats.forced_drop, - .marked = q->stats.prob_mark + q->stats.forced_mark, - .pdrop = q->stats.pdrop, - .matched = q->stats.matched, + .early = READ_ONCE(q->stats.prob_drop) + + READ_ONCE(q->stats.forced_drop), + .marked = READ_ONCE(q->stats.prob_mark) + + READ_ONCE(q->stats.forced_mark), + .pdrop = READ_ONCE(q->stats.pdrop), + .matched = READ_ONCE(q->stats.matched), }; return gnet_stats_copy_app(d, &st, sizeof(st)); From 59b145771c7982cfe9020d4e9e22da92d6b5ae31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Apr 2026 06:35:27 +0000 Subject: [PATCH 22/79] net/sched: sch_fq_pie: annotate data-races in fq_pie_dump_stats() fq_codel_dump_stats() acquires the qdisc spinlock a bit too late. Move this acquisition before we fill tc_fq_pie_xstats with live data. Alternative would be to add READ_ONCE() and WRITE_ONCE() annotations, but the spinlock is needed anyway to scan q->new_flows and q->old_flows. Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260423063527.2568262-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq_pie.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 154c70f489f2..7becbf5362b3 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -509,18 +509,19 @@ nla_put_failure: static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_pie_sched_data *q = qdisc_priv(sch); - struct tc_fq_pie_xstats st = { - .packets_in = q->stats.packets_in, - .overlimit = q->stats.overlimit, - .overmemory = q->overmemory, - .dropped = q->stats.dropped, - .ecn_mark = q->stats.ecn_mark, - .new_flow_count = q->new_flow_count, - .memory_usage = q->memory_usage, - }; + struct tc_fq_pie_xstats st = { 0 }; struct list_head *pos; sch_tree_lock(sch); + + st.packets_in = q->stats.packets_in; + st.overlimit = q->stats.overlimit; + st.overmemory = q->overmemory; + st.dropped = q->stats.dropped; + st.ecn_mark = q->stats.ecn_mark; + st.new_flow_count = q->new_flow_count; + st.memory_usage = q->memory_usage; + list_for_each(pos, &q->new_flows) st.new_flows_len++; From 2674d603a9e6970463b2b9ebcf8e31e90beae169 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 23 Apr 2026 09:36:07 +0300 Subject: [PATCH 23/79] vrf: Fix a potential NPD when removing a port from a VRF RCU readers that identified a net device as a VRF port using netif_is_l3_slave() assume that a subsequent call to netdev_master_upper_dev_get_rcu() will return a VRF device. They then continue to dereference its l3mdev operations. This assumption is not always correct and can result in a NPD [1]. There is no RCU synchronization when removing a port from a VRF, so it is possible for an RCU reader to see a new master device (e.g., a bridge) that does not have l3mdev operations. Fix by adding RCU synchronization after clearing the IFF_L3MDEV_SLAVE flag. Skip this synchronization when a net device is removed from a VRF as part of its deletion and when the VRF device itself is deleted. In the latter case an RCU grace period will pass by the time RTNL is released. [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] RIP: 0010:l3mdev_fib_table_rcu (net/l3mdev/l3mdev.c:181) [...] Call Trace: l3mdev_fib_table_by_index (net/l3mdev/l3mdev.c:201 net/l3mdev/l3mdev.c:189) __inet_bind (net/ipv4/af_inet.c:499 (discriminator 3)) inet_bind_sk (net/ipv4/af_inet.c:469) __sys_bind (./include/linux/file.h:62 (discriminator 1) ./include/linux/file.h:83 (discriminator 1) net/socket.c:1951 (discriminator 1)) __x64_sys_bind (net/socket.c:1969 (discriminator 1) net/socket.c:1967 (discriminator 1) net/socket.c:1967 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Fixes: fdeea7be88b1 ("net: vrf: Set slave's private flag before linking") Reported-by: Haoze Xie Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Yuan Tan Closes: https://lore.kernel.org/netdev/20260419145332.3988923-1-n05ec@lzu.edu.cn/ Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20260423063607.1208202-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 2cf2dbd1c12f..46209917ae4d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1034,6 +1034,7 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; + synchronize_net(); return ret; } @@ -1053,10 +1054,16 @@ static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, } /* inverse of do_vrf_add_slave */ -static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) +static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev, + bool needs_sync) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; + /* Make sure that concurrent RCU readers that identified the device + * as a VRF port see a VRF master or no master at all. + */ + if (needs_sync) + synchronize_net(); cycle_netdev(port_dev, NULL); @@ -1065,7 +1072,7 @@ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { - return do_vrf_del_slave(dev, port_dev); + return do_vrf_del_slave(dev, port_dev, true); } static void vrf_dev_uninit(struct net_device *dev) @@ -1619,7 +1626,7 @@ static void vrf_dellink(struct net_device *dev, struct list_head *head) struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) - vrf_del_slave(dev, port_dev); + do_vrf_del_slave(dev, port_dev, false); vrf_map_unregister_dev(dev); @@ -1751,7 +1758,7 @@ static int vrf_device_event(struct notifier_block *unused, goto out; vrf_dev = netdev_master_upper_dev_get(dev); - vrf_del_slave(vrf_dev, dev); + do_vrf_del_slave(vrf_dev, dev, false); } out: return NOTIFY_DONE; From 9e6bf146b55999a095bb14f73a843942456d1adc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 21 Apr 2026 15:16:33 +0200 Subject: [PATCH 24/79] ipv6: rpl: reserve mac_len headroom when recompressed SRH grows ipv6_rpl_srh_rcv() decompresses an RFC 6554 Source Routing Header, swaps the next segment into ipv6_hdr->daddr, recompresses, then pulls the old header and pushes the new one plus the IPv6 header back. The recompressed header can be larger than the received one when the swap reduces the common-prefix length the segments share with daddr (CmprI=0, CmprE>0, seg[0][0] != daddr[0] gives the maximum +8 bytes). pskb_expand_head() was gated on segments_left == 0, so on earlier segments the push consumed unchecked headroom. Once skb_push() leaves fewer than skb->mac_len bytes in front of data, skb_mac_header_rebuild()'s call to: skb_set_mac_header(skb, -skb->mac_len); will store (data - head) - mac_len into the u16 mac_header field, which wraps to ~65530, and the following memmove() writes mac_len bytes ~64KiB past skb->head. A single AF_INET6/SOCK_RAW/IPV6_HDRINCL packet over lo with a two segment type-3 SRH (CmprI=0, CmprE=15) reaches headroom 8 after one pass; KASAN reports a 14-byte OOB write in ipv6_rthdr_rcv. Fix this by expanding the head whenever the remaining room is less than the push size plus mac_len, and request that much extra so the rebuilt MAC header fits afterwards. Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Cc: stable Reported-by: Anthropic Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026042133-gout-unvented-1bd9@gregkh Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 95558fd6f447..03cbce842c1a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -491,6 +491,7 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; + unsigned int chdr_len; unsigned char *buf; int accept_rpl_seg; int i, err; @@ -592,8 +593,10 @@ looped_back: skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); - if (unlikely(!hdr->segments_left)) { - if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, + chdr_len = sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3); + if (unlikely(!hdr->segments_left || + skb_headroom(skb) < chdr_len + skb->mac_len)) { + if (pskb_expand_head(skb, chdr_len + skb->mac_len, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); @@ -603,7 +606,7 @@ looped_back: oldhdr = ipv6_hdr(skb); } - skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); + skb_push(skb, chdr_len); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); From 23f0e34c64acba15cad4d23e50f41f533da195fa Mon Sep 17 00:00:00 2001 From: Zhan Jun Date: Thu, 23 Apr 2026 08:49:12 +0800 Subject: [PATCH 25/79] net: usb: rtl8150: fix use-after-free in rtl8150_start_xmit() syzbot reported a KASAN slab-use-after-free read in rtl8150_start_xmit() when accessing skb->len for tx statistics after usb_submit_urb() has been called: BUG: KASAN: slab-use-after-free in rtl8150_start_xmit+0x71f/0x760 drivers/net/usb/rtl8150.c:712 Read of size 4 at addr ffff88810eb7a930 by task kworker/0:4/5226 The URB completion handler write_bulk_callback() frees the skb via dev_kfree_skb_irq(dev->tx_skb). The URB may complete on another CPU in softirq context before usb_submit_urb() returns in the submitter, so by the time the submitter reads skb->len the skb has already been queued to the per-CPU completion_queue and freed by net_tx_action(): CPU A (xmit) CPU B (USB completion softirq) ------------ ------------------------------ dev->tx_skb = skb; usb_submit_urb() --+ |-------> write_bulk_callback() | dev_kfree_skb_irq(dev->tx_skb) | net_tx_action() | napi_skb_cache_put() <-- free netdev->stats.tx_bytes | += skb->len; <-- UAF read Fix it by caching skb->len before submitting the URB and using the cached value when updating the tx_bytes counter. The pre-existing tx_bytes semantics are preserved: the counter tracks the original frame length (skb->len), not the ETH_ZLEN/USB-alignment padded "count" value that is handed to the device. Changing that would be a user-visible accounting change and is out of scope for this UAF fix. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+3f46c095ac0ca048cb71@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e69ee7.050a0220.24bfd3.002b.GAE@google.com/ Closes: https://syzkaller.appspot.com/bug?extid=3f46c095ac0ca048cb71 Reviewed-by: Andrew Lunn Signed-off-by: Zhan Jun Link: https://patch.msgid.link/809895186B866C10+20260423004913.136655-1-zhangdandan@uniontech.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/rtl8150.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 4cda0643afb6..1bbfdeab4d62 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -683,6 +683,7 @@ static netdev_tx_t rtl8150_start_xmit(struct sk_buff *skb, struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); + unsigned int skb_len; int count, res; /* pad the frame and ensure terminating USB packet, datasheet 9.2.3 */ @@ -694,6 +695,8 @@ static netdev_tx_t rtl8150_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } + skb_len = skb->len; + netif_stop_queue(netdev); dev->tx_skb = skb; usb_fill_bulk_urb(dev->tx_urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), @@ -709,7 +712,7 @@ static netdev_tx_t rtl8150_start_xmit(struct sk_buff *skb, } } else { netdev->stats.tx_packets++; - netdev->stats.tx_bytes += skb->len; + netdev->stats.tx_bytes += skb_len; netif_trans_update(netdev); } From adbe2cdf75461891e50dbe11896ac78e9af1f874 Mon Sep 17 00:00:00 2001 From: Morduan Zang Date: Fri, 24 Apr 2026 09:55:17 +0800 Subject: [PATCH 26/79] net: usb: rtl8150: free skb on usb_submit_urb() failure in xmit When rtl8150_start_xmit() fails to submit the tx URB, the URB is never handed to the USB core and write_bulk_callback() will not run. The driver returns NETDEV_TX_OK, which tells the networking stack that the skb has been consumed, but nothing actually frees the skb on this error path: dev->tx_skb = skb; ... if ((res = usb_submit_urb(dev->tx_urb, GFP_ATOMIC))) { ... /* no kfree_skb here */ } return NETDEV_TX_OK; This leaks the skb on every submit failure and also leaves dev->tx_skb pointing at memory that the driver itself may later free, which is fragile. Free the skb with dev_kfree_skb_any() in the error path and clear dev->tx_skb so no stale pointer is left behind. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Andrew Lunn Signed-off-by: Morduan Zang Link: https://patch.msgid.link/E7D3E1C013C5A859+20260424015517.9574-1-zhangdandan@uniontech.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/rtl8150.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 1bbfdeab4d62..c880c95c41a5 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -710,6 +710,13 @@ static netdev_tx_t rtl8150_start_xmit(struct sk_buff *skb, netdev->stats.tx_errors++; netif_start_queue(netdev); } + /* + * The URB was not submitted, so write_bulk_callback() will + * never run to free dev->tx_skb. Drop the skb here and + * clear tx_skb to avoid leaving a stale pointer. + */ + dev->tx_skb = NULL; + dev_kfree_skb_any(skb); } else { netdev->stats.tx_packets++; netdev->stats.tx_bytes += skb_len; From a9bc28aa4e64320668131349436a650bf42591a5 Mon Sep 17 00:00:00 2001 From: Paul Geurts Date: Wed, 22 Apr 2026 12:09:30 +0200 Subject: [PATCH 27/79] NFC: trf7970a: Ignore antenna noise when checking for RF field The main channel Received Signal Strength Indicator (RSSI) measurement is used to determine whether an RF field is present or not. RSSI != 0 is interpreted as an RF Field is present. This does not take RF noise and measurement inaccuracy into account, and results in false positives in the field. Define a noise level and make sure the RF field is only interpreted as present when the RSSI is above the noise level. Fixes: 851ee3cbf850 ("NFC: trf7970a: Don't turn on RF if there is already an RF field") Signed-off-by: Paul Geurts Reviewed-by: Krzysztof Kozlowski Reviewed-by: Mark Greer Link: https://patch.msgid.link/20260422100930.581237-1-paul.geurts@prodrive-technologies.com Signed-off-by: Jakub Kicinski --- drivers/nfc/trf7970a.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/trf7970a.c b/drivers/nfc/trf7970a.c index d17c701c7888..08c27bb438b5 100644 --- a/drivers/nfc/trf7970a.c +++ b/drivers/nfc/trf7970a.c @@ -317,6 +317,7 @@ #define TRF7970A_RSSI_OSC_STATUS_RSSI_MASK (BIT(2) | BIT(1) | BIT(0)) #define TRF7970A_RSSI_OSC_STATUS_RSSI_X_MASK (BIT(5) | BIT(4) | BIT(3)) #define TRF7970A_RSSI_OSC_STATUS_RSSI_OSC_OK BIT(6) +#define TRF7970A_RSSI_OSC_STATUS_RSSI_NOISE_LEVEL 1 #define TRF7970A_SPECIAL_FCN_REG1_COL_7_6 BIT(0) #define TRF7970A_SPECIAL_FCN_REG1_14_ANTICOLL BIT(1) @@ -1300,7 +1301,7 @@ static int trf7970a_is_rf_field(struct trf7970a *trf, bool *is_rf_field) if (ret) return ret; - if (rssi & TRF7970A_RSSI_OSC_STATUS_RSSI_MASK) + if ((rssi & TRF7970A_RSSI_OSC_STATUS_RSSI_MASK) > TRF7970A_RSSI_OSC_STATUS_RSSI_NOISE_LEVEL) *is_rf_field = true; else *is_rf_field = false; From 3d07ca5c0fae311226f737963984bd94bb159a87 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 23 Apr 2026 00:19:58 +0800 Subject: [PATCH 28/79] net/sched: taprio: fix NULL pointer dereference in class dump When a TAPRIO child qdisc is deleted via RTM_DELQDISC, taprio_graft() is called with new == NULL and stores NULL into q->qdiscs[cl - 1]. Subsequent RTM_GETTCLASS dump operations walk all classes via taprio_walk() and call taprio_dump_class(), which calls taprio_leaf() returning the NULL pointer, then dereferences it to read child->handle, causing a kernel NULL pointer dereference. The bug is reachable with namespace-scoped CAP_NET_ADMIN on any kernel with CONFIG_NET_SCH_TAPRIO enabled. On systems with unprivileged user namespaces enabled, an unprivileged local user can trigger a kernel panic by creating a taprio qdisc inside a new network namespace, grafting an explicit child qdisc, deleting it, and requesting a class dump. The RTM_GETTCLASS dump itself requires no capability. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000007: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f] RIP: 0010:taprio_dump_class (net/sched/sch_taprio.c:2478) Call Trace: tc_fill_tclass (net/sched/sch_api.c:1966) qdisc_class_dump (net/sched/sch_api.c:2326) taprio_walk (net/sched/sch_taprio.c:2514) tc_dump_tclass_qdisc (net/sched/sch_api.c:2352) tc_dump_tclass_root (net/sched/sch_api.c:2370) tc_dump_tclass (net/sched/sch_api.c:2431) rtnl_dumpit (net/core/rtnetlink.c:6864) netlink_dump (net/netlink/af_netlink.c:2325) rtnetlink_rcv_msg (net/core/rtnetlink.c:6959) netlink_rcv_skb (net/netlink/af_netlink.c:2550) Fix this by substituting &noop_qdisc when new is NULL in taprio_graft(), a common pattern used by other qdiscs (e.g., multiq_graft()) to ensure the q->qdiscs[] slots are never NULL. This makes control-plane dump paths safe without requiring individual NULL checks. Since the data-plane paths (taprio_enqueue and taprio_dequeue_from_txq) previously had explicit NULL guards that would drop/skip the packet cleanly, update those checks to test for &noop_qdisc instead. Without this, packets would reach taprio_enqueue_one() which increments the root qdisc's qlen and backlog before calling the child's enqueue; noop_qdisc drops the packet but those counters are never rolled back, permanently inflating the root qdisc's statistics. After this change *old can be a valid qdisc, NULL, or &noop_qdisc. Only call qdisc_put(*old) in the first case to avoid decreasing noop_qdisc's refcount, which was never increased. Fixes: 665338b2a7a0 ("net/sched: taprio: dump class stats for the actual q->qdiscs[]") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Acked-by: Jamal Hadi Salim Tested-by: Weiming Shi Link: https://patch.msgid.link/20260422161958.2517539-3-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a47a09d76400..45245157e00a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -634,7 +634,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; - if (unlikely(!child)) + if (unlikely(child == &noop_qdisc)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { @@ -717,7 +717,7 @@ static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, int len; u8 tc; - if (unlikely(!child)) + if (unlikely(child == &noop_qdisc)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) @@ -2184,6 +2184,9 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, if (!dev_queue) return -EINVAL; + if (!new) + new = &noop_qdisc; + if (dev->flags & IFF_UP) dev_deactivate(dev, false); @@ -2197,14 +2200,14 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl, *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); - if (new) + if (new != &noop_qdisc) qdisc_refcount_inc(new); - if (*old) + if (*old && *old != &noop_qdisc) qdisc_put(*old); } q->qdiscs[cl - 1] = new; - if (new) + if (new != &noop_qdisc) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) From a469feed399da791f890b3448622121e97a07f3b Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 23 Apr 2026 00:19:59 +0800 Subject: [PATCH 29/79] selftests/tc-testing: add taprio test for class dump after child delete Add a regression test for the NULL pointer dereference fixed in the previous commit. Before the fix, taprio_graft() stored NULL into q->qdiscs[cl - 1] when an explicitly grafted child qdisc was deleted via RTM_DELQDISC; the next RTM_GETTCLASS dump then crashed the kernel in taprio_dump_class() while reading child->handle. The test installs a taprio root qdisc on a multi-queue netdevsim device, grafts a pfifo child onto class 8001:1, deletes that child, and then performs a class dump. On a fixed kernel the dump succeeds and all eight taprio classes are listed; on an unpatched kernel the class dump crashes, which surfaces as a test failure. Signed-off-by: Weiming Shi Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260422161958.2517539-4-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/qdiscs/taprio.json | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 557fb074acf0..cd19d05925e4 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -302,5 +302,31 @@ "$TC qdisc del dev $ETH root", "echo \"1\" > /sys/bus/netdevsim/del_device" ] + }, + { + "id": "c7e1", + "name": "Class dump after graft and delete of explicit child qdisc", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > /sys/bus/netdevsim/new_device", + "$TC qdisc replace dev $ETH handle 8001: parent root taprio num_tc 8 map 0 1 2 3 4 5 6 7 queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 base-time 0 sched-entry S ff 20000000 clockid CLOCK_TAI", + "$TC qdisc add dev $ETH parent 8001:1 handle 8002: pfifo", + "$TC qdisc del dev $ETH parent 8001:1 handle 8002:" + ], + "cmdUnderTest": "$TC class show dev $ETH", + "expExitCode": "0", + "verifyCmd": "$TC class show dev $ETH", + "matchPattern": "class taprio 8001:[0-9]+ root", + "matchCount": "8", + "teardown": [ + "$TC qdisc del dev $ETH root", + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] } ] From 5b0c911bcdbd982f7748d11c0b39ec5808eae2de Mon Sep 17 00:00:00 2001 From: Morduan Zang Date: Thu, 23 Apr 2026 09:05:57 +0800 Subject: [PATCH 30/79] net: phonet: do not BUG_ON() in pn_socket_autobind() on failed bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot reported a kernel BUG triggered from pn_socket_sendmsg() via pn_socket_autobind(): kernel BUG at net/phonet/socket.c:213! RIP: 0010:pn_socket_autobind net/phonet/socket.c:213 [inline] RIP: 0010:pn_socket_sendmsg+0x240/0x250 net/phonet/socket.c:421 Call Trace: sock_sendmsg_nosec+0x112/0x150 net/socket.c:797 __sock_sendmsg net/socket.c:812 [inline] __sys_sendto+0x402/0x590 net/socket.c:2280 ... pn_socket_autobind() calls pn_socket_bind() with port 0 and, on -EINVAL, assumes the socket was already bound and asserts that the port is non-zero: err = pn_socket_bind(sock, ..., sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ However pn_socket_bind() also returns -EINVAL when sk->sk_state is not TCP_CLOSE, even when the socket has never been bound and pn_port() is still 0. In that case the BUG_ON() fires and panics the kernel from a user-triggerable path. Treat the "bind returned -EINVAL but pn_port() is still 0" case as a regular error and propagate -EINVAL to the caller instead of crashing. Existing callers already translate a non-zero return from pn_socket_autobind() into -ENOBUFS/-EAGAIN, so returning -EINVAL here only changes behaviour from panic to a normal errno. Fixes: ba113a94b750 ("Phonet: common socket glue") Reported-by: syzbot+706f5eb79044e686c794@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=706f5eb79044e686c794 Suggested-by: Remi Denis-Courmont Signed-off-by: Morduan Zang Signed-off-by: zhanjun Acked-by: Rémi Denis-Courmont Link: https://patch.msgid.link/87A8960A2045AF3C+20260423010557.138124-1-zhangdandan@uniontech.com Signed-off-by: Jakub Kicinski --- net/phonet/socket.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/phonet/socket.c b/net/phonet/socket.c index c4af26357144..631a99cdbd00 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -208,9 +208,15 @@ static int pn_socket_autobind(struct socket *sock) sa.spn_family = AF_PHONET; err = pn_socket_bind(sock, (struct sockaddr_unsized *)&sa, sizeof(struct sockaddr_pn)); - if (err != -EINVAL) + /* + * pn_socket_bind() also returns -EINVAL when sk_state != TCP_CLOSE + * without a prior bind, so -EINVAL alone is not sufficient to infer + * that the socket was already bound. Only treat it as "already + * bound" when the port is non-zero; otherwise propagate the error + * instead of crashing the kernel. + */ + if (err != -EINVAL || unlikely(!pn_port(pn_sk(sock->sk)->sobject))) return err; - BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ } From b3b6babf47517fde6b6de2493dea28e8831b9347 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Apr 2026 05:34:54 +0000 Subject: [PATCH 31/79] ipmr: Free mr_table after RCU grace period. With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() does not check if net->ipv4.mrt is NULL. Since default_device_exit_batch() is called after ->exit_rtnl(), a device could receive IGMP packets and access net->ipv4.mrt during/after ipmr_rules_exit_rtnl(). If ipmr_rules_exit_rtnl() had already cleared it and freed the memory, the access would trigger null-ptr-deref or use-after-free. Let's fix it by using RCU helper and free mrt after RCU grace period. In addition, check_net(net) is added to mroute_clean_tables() and ipmr_cache_unresolved() to synchronise via mfc_unres_lock. This prevents ipmr_cache_unresolved() from putting skb into c->_c.mfc_un.unres.unresolved after mroute_clean_tables() purges it. For the same reason, timer_shutdown_sync() is moved after mroute_clean_tables(). Since rhltable_destroy() holds mutex internally, rcu_work is used, and it is placed as the first member because rcu_head must be placed within <4K offset. mr_table is alraedy 3864 bytes without rcu_work. Note that IP6MR is not yet converted to ->exit_rtnl(), so this change is not needed for now but will be. Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260423053456.4097409-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 3 + net/ipv4/ipmr.c | 108 +++++++++++++++++++----------------- net/ipv4/ipmr_base.c | 16 ++++++ 3 files changed, 77 insertions(+), 50 deletions(-) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..5d75cc5b057e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -226,6 +226,7 @@ struct mr_table_ops { /** * struct mr_table - a multicast routing table + * @work: used for table destruction * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations @@ -243,6 +244,7 @@ struct mr_table_ops { * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { + struct rcu_work work; struct list_head list; possible_net_t net; struct mr_table_ops ops; @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask); +void mr_table_free(struct mr_table *mrt); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8a08d09b4c30..2058ca860294 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id) return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) -{ - struct mr_table *mrt; - - rcu_read_lock(); - mrt = __ipmr_get_table(net, id); - rcu_read_unlock(); - return mrt; -} - static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { @@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct mr_table *mrt, *next; list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { - list_del(&mrt->list); + list_del_rcu(&mrt->list); ipmr_free_table(mrt, dev_kill_list); } } @@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL(ipmr_rule_default); #else -#define ipmr_for_each_table(mrt, net) \ - for (mrt = net->ipv4.mrt; mrt; mrt = NULL) - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) - return net->ipv4.mrt; + return rcu_dereference(net->ipv4.mrt); return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) +static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { - return net->ipv4.mrt; + return rcu_dereference_check(net->ipv4.mrt, + lockdep_rtnl_is_held() || + !rcu_access_pointer(net->ipv4.mrt)); } -#define __ipmr_get_table ipmr_get_table +#define ipmr_for_each_table(mrt, net) \ + for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - *mrt = net->ipv4.mrt; + *mrt = rcu_dereference(net->ipv4.mrt); + if (!*mrt) + return -EAGAIN; return 0; } @@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net) mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); - net->ipv4.mrt = mrt; + + rcu_assign_pointer(net->ipv4.mrt, mrt); return 0; } @@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { - ipmr_free_table(net->ipv4.mrt, dev_kill_list); + struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1); - net->ipv4.mrt = NULL; + RCU_INIT_POINTER(net->ipv4.mrt, NULL); + ipmr_free_table(mrt, dev_kill_list); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule) EXPORT_SYMBOL(ipmr_rule_default); #endif +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ipmr_get_table(net, id); + rcu_read_unlock(); + + return mrt; +} + static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { @@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis WARN_ON_ONCE(!mr_can_free_table(net)); - timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, &ipmr_dev_kill_list); - rhltable_destroy(&mrt->mfc_hash); - kfree(mrt); + timer_shutdown_sync(&mrt->ipmr_expire_timer); + mr_table_free(mrt); WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); list_splice(&ipmr_dev_kill_list, dev_kill_list); @@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt, static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { + struct net *net = read_pnet(&mrt->net); const struct iphdr *iph = ip_hdr(skb); - struct mfc_cache *c; + struct mfc_cache *c = NULL; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); + + if (!check_net(net)) { + err = -EINVAL; + goto err; + } + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { @@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { - spin_unlock_bh(&mfc_unres_lock); - - kfree_skb(skb); - return -ENOBUFS; + err = -ENOBUFS; + goto err; } /* Fill in the new cache entry */ @@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); - - if (err < 0) { - /* If the report failed throw the cache entry - out - Brad Parker - */ - spin_unlock_bh(&mfc_unres_lock); - - ipmr_cache_free(c); - kfree_skb(skb); - return err; - } + if (err < 0) + goto err; atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); @@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { - kfree_skb(skb); + c = NULL; err = -ENOBUFS; - } else { - if (dev) { - skb->dev = dev; - skb->skb_iif = dev->ifindex; - } - skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); - err = 0; + goto err; } + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } + + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); + spin_unlock_bh(&mfc_unres_lock); + return 0; + +err: + spin_unlock_bh(&mfc_unres_lock); + if (c) + ipmr_cache_free(c); + kfree_skb(skb); return err; } @@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, } if (flags & MRT_FLUSH_MFC) { - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 37a3c144276c..3930d612c3de 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v, v->link = dev->ifindex; } +static void __mr_free_table(struct work_struct *work) +{ + struct mr_table *mrt = container_of(to_rcu_work(work), + struct mr_table, work); + + rhltable_destroy(&mrt->mfc_hash); + kfree(mrt); +} + +void mr_table_free(struct mr_table *mrt) +{ + queue_rcu_work(system_unbound_wq, &mrt->work); +} + struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, @@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id, kfree(mrt); return ERR_PTR(err); } + + INIT_RCU_WORK(&mrt->work, __mr_free_table); INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); From 4438113be604ee67a7bf4f81da6e1cca41332ce4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Apr 2026 16:58:38 +0200 Subject: [PATCH 32/79] neigh: let neigh_xmit take skb ownership neigh_xmit always releases the skb, except when no neighbour table is found. But even the first added user of neigh_xmit (mpls) relied on neigh_xmit to release the skb (or queue it for tx). sashiko reported: If neigh_xmit() is called with an uninitialized neighbor table (for example, NEIGH_ND_TABLE when IPv6 is disabled), it returns -EAFNOSUPPORT and bypasses its internal out_kfree_skb error path. Because the return value of neigh_xmit() is ignored here, does this leak the SKB? Assume full ownership and remove the last code path that doesn't xmit or free skb. Fixes: 4fd3d7d9e868 ("neigh: Add helper function neigh_xmit") Signed-off-by: Florian Westphal Reviewed-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260424145843.74055-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9e12524b67fa..5d9216016507 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3210,8 +3210,10 @@ int neigh_xmit(int index, struct net_device *dev, rcu_read_lock(); tbl = rcu_dereference(neigh_tables[index]); - if (!tbl) - goto out_unlock; + if (!tbl) { + rcu_read_unlock(); + goto out_kfree_skb; + } if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); @@ -3227,7 +3229,6 @@ int neigh_xmit(int index, struct net_device *dev, goto out_kfree_skb; } err = READ_ONCE(neigh->output)(neigh, skb); -out_unlock: rcu_read_unlock(); } else if (index == NEIGH_LINK_TABLE) { @@ -3237,11 +3238,10 @@ out_unlock: goto out_kfree_skb; err = dev_queue_xmit(skb); } -out: return err; out_kfree_skb: kfree_skb(skb); - goto out; + return err; } EXPORT_SYMBOL(neigh_xmit); From cc427d24ac6442ffdeafd157a63c7c5b73ed4de4 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 24 Apr 2026 09:29:17 -0700 Subject: [PATCH 33/79] ibmveth: Disable GSO for packets with small MSS Some physical adapters on Power systems do not support segmentation offload when the MSS is less than 224 bytes. Attempting to send such packets causes the adapter to freeze, stopping all traffic until manually reset. Implement ndo_features_check to disable GSO for packets with small MSS values. The network stack will perform software segmentation instead. The 224-byte minimum matches ibmvnic commit ("ibmvnic: Enforce stronger sanity checks on GSO packets") which uses the same physical adapters in SEA configurations. The issue occurs specifically when the hardware attempts to perform segmentation (gso_segs > 1) with a small MSS. Single-segment GSO packets (gso_segs == 1) do not trigger the problematic LSO code path and are transmitted normally without segmentation. Add an ndo_features_check callback to disable GSO when MSS < 224 bytes. Also call vlan_features_check() to ensure proper handling of VLAN packets, particularly QinQ (802.1ad) configurations where the hardware parser may not support certain offload features. Validated using iptables to force small MSS values. Without the fix, the adapter freezes. With the fix, packets are segmented in software and transmission succeeds. Comprehensive regression testing completedd (MSS tests, performance, stability). Fixes: 8641dd85799f ("ibmveth: Add support for TSO") Cc: stable@vger.kernel.org Reviewed-by: Brian King Tested-by: Shaik Abdulla Tested-by: Naveed Ahmed Signed-off-by: Mingming Cao Link: https://patch.msgid.link/20260424162917.65725-1-mmc@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmveth.c | 22 ++++++++++++++++++++++ drivers/net/ethernet/ibm/ibmveth.h | 1 + 2 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 58cc3147afe2..73e051d26b9d 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1756,6 +1756,27 @@ static int ibmveth_set_mac_addr(struct net_device *dev, void *p) return 0; } +static netdev_features_t ibmveth_features_check(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + /* Some physical adapters do not support segmentation offload with + * MSS < 224. Disable GSO for such packets to avoid adapter freeze. + * Note: Single-segment packets (gso_segs == 1) don't need this check + * as they bypass the LSO path and are transmitted without segmentation. + */ + if (skb_is_gso(skb)) { + if (skb_shinfo(skb)->gso_size < IBMVETH_MIN_LSO_MSS) { + netdev_warn_once(dev, + "MSS %u too small for LSO, disabling GSO\n", + skb_shinfo(skb)->gso_size); + features &= ~NETIF_F_GSO_MASK; + } + } + + return vlan_features_check(skb, features); +} + static const struct net_device_ops ibmveth_netdev_ops = { .ndo_open = ibmveth_open, .ndo_stop = ibmveth_close, @@ -1767,6 +1788,7 @@ static const struct net_device_ops ibmveth_netdev_ops = { .ndo_set_features = ibmveth_set_features, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = ibmveth_set_mac_addr, + .ndo_features_check = ibmveth_features_check, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = ibmveth_poll_controller, #endif diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h index 068f99df133e..d87713668ed3 100644 --- a/drivers/net/ethernet/ibm/ibmveth.h +++ b/drivers/net/ethernet/ibm/ibmveth.h @@ -37,6 +37,7 @@ #define IBMVETH_ILLAN_IPV4_TCP_CSUM 0x0000000000000002UL #define IBMVETH_ILLAN_ACTIVE_TRUNK 0x0000000000000001UL +#define IBMVETH_MIN_LSO_MSS 224 /* Minimum MSS for LSO */ /* hcall macros */ #define h_register_logical_lan(ua, buflst, rxq, fltlst, mac) \ plpar_hcall_norets(H_REGISTER_LOGICAL_LAN, ua, buflst, rxq, fltlst, mac) From 2b9f6f7065d4cfb65ba19126e0b35ac4544c3f3a Mon Sep 17 00:00:00 2001 From: Altan Hacigumus Date: Thu, 23 Apr 2026 18:46:38 -0700 Subject: [PATCH 34/79] tcp: make probe0 timer handle expired user timeout tcp_clamp_probe0_to_user_timeout() computes remaining time in jiffies using subtraction with an unsigned lvalue. If elapsed probing time exceeds the configured TCP_USER_TIMEOUT, the underflow yields a large value. This ends up re-arming the probe timer for a full backoff interval instead of expiring immediately, delaying connection teardown beyond the configured timeout. Fix this by preventing underflow so user-set timeout expiration is handled correctly without extending the probe timer. Fixes: 344db93ae3ee ("tcp: make TCP_USER_TIMEOUT accurate for zero window probes") Link: https://lore.kernel.org/r/20260414013634.43997-1-ahacigu.linux@gmail.com Signed-off-by: Altan Hacigumus Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260424014639.54110-1-ahacigu.linux@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_timer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8d791a954cd6..322db13333c7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -50,7 +50,8 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 remaining, user_timeout; + u32 user_timeout; + s32 remaining; s32 elapsed; user_timeout = READ_ONCE(icsk->icsk_user_timeout); @@ -61,7 +62,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) if (unlikely(elapsed < 0)) elapsed = 0; remaining = msecs_to_jiffies(user_timeout) - elapsed; - remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); + remaining = max_t(int, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); } From 3bc179bc7146c26c9dff75d2943d10528274e301 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 24 Apr 2026 08:31:16 -0700 Subject: [PATCH 35/79] netpoll: fix IPv6 local-address corruption netpoll_setup() decides whether to auto-populate the local source address by testing np->local_ip.ip, which only inspects the first 4 bytes of the union inet_addr storage. For an IPv6 netpoll whose caller-supplied local address has a zero high-32 bits (::1, ::, IPv4-mapped ::ffff:a.b.c.d, etc.), this misdetects the address as unset (which they are not, but the first 4 bytes are empty), calls netpoll_take_ipv6() and overwrites it with whatever matching link-local/global address the device happens to expose first. Introduce a helper netpoll_local_ip_unset() that picks the correct family-aware test (ipv6_addr_any() for IPv6, !.ip for IPv4) and use it from netpoll_setup(). Reproducer is something like: echo "::2" > local_ip echo 1 > enabled cat local_ip # before this fix: 2001:db8::1 (caller-supplied ::2 was clobbered) # after this fix: ::2 Fixes: b7394d2429c1 ("netpoll: prepare for ipv6") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260424-netpoll_fix-v1-1-3a55348c625f@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index cd74beffd209..4381e0fc25bf 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -704,6 +704,23 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) return 0; } +/* + * Test whether the caller left np->local_ip unset, so that + * netpoll_setup() should auto-populate it from the egress device. + * + * np->local_ip is a union of __be32 (IPv4) and struct in6_addr (IPv6), + * so an IPv6 address whose first 4 bytes are zero (e.g. ::1, ::2, + * IPv4-mapped ::ffff:a.b.c.d) must not be tested via the IPv4 arm — + * doing so would misclassify a caller-supplied address as unset and + * silently overwrite it with whatever address the device exposes. + */ +static bool netpoll_local_ip_unset(const struct netpoll *np) +{ + if (np->ipv6) + return ipv6_addr_any(&np->local_ip.in6); + return !np->local_ip.ip; +} + int netpoll_setup(struct netpoll *np) { struct net *net = current->nsproxy->net_ns; @@ -747,7 +764,7 @@ int netpoll_setup(struct netpoll *np) rtnl_lock(); } - if (!np->local_ip.ip) { + if (netpoll_local_ip_unset(np)) { if (!np->ipv6) { err = netpoll_take_ipv4(np, ndev); if (err) From f9c52a6ba9780bd27e0bf4c044fd91c13c778b6e Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Tue, 21 Apr 2026 11:47:35 +0200 Subject: [PATCH 36/79] net: ipv6: fix NOREF dst use in seg6 and rpl lwtunnels seg6_input_core() and rpl_input() call ip6_route_input() which sets a NOREF dst on the skb, then pass it to dst_cache_set_ip6() invoking dst_hold() unconditionally. On PREEMPT_RT, ksoftirqd is preemptible and a higher-priority task can release the underlying pcpu_rt between the lookup and the caching through a concurrent FIB lookup on a shared nexthop. Simplified race sequence: ksoftirqd/X higher-prio task (same CPU X) ----------- -------------------------------- seg6_input_core(,skb)/rpl_input(skb) dst_cache_get() -> miss ip6_route_input(skb) -> ip6_pol_route(,skb,flags) [RT6_LOOKUP_F_DST_NOREF in flags] -> FIB lookup resolves fib6_nh [nhid=N route] -> rt6_make_pcpu_route() [creates pcpu_rt, refcount=1] pcpu_rt->sernum = fib6_sernum [fib6_sernum=W] -> cmpxchg(fib6_nh.rt6i_pcpu, NULL, pcpu_rt) [slot was empty, store succeeds] -> skb_dst_set_noref(skb, dst) [dst is pcpu_rt, refcount still 1] rt_genid_bump_ipv6() -> bumps fib6_sernum [fib6_sernum from W to Z] ip6_route_output() -> ip6_pol_route() -> FIB lookup resolves fib6_nh [nhid=N] -> rt6_get_pcpu_route() pcpu_rt->sernum != fib6_sernum [W <> Z, stale] -> prev = xchg(rt6i_pcpu, NULL) -> dst_release(prev) [prev is pcpu_rt, refcount 1->0, dead] dst = skb_dst(skb) [dst is the dead pcpu_rt] dst_cache_set_ip6(dst) -> dst_hold() on dead dst -> WARN / use-after-free For the race to occur, ksoftirqd must be preemptible (PREEMPT_RT without PREEMPT_RT_NEEDS_BH_LOCK) and a concurrent task must be able to release the pcpu_rt. Shared nexthop objects provide such a path, as two routes pointing to the same nhid share the same fib6_nh and its rt6i_pcpu entry. Fix seg6_input_core() and rpl_input() by calling skb_dst_force() after ip6_route_input() to force the NOREF dst into a refcounted one before caching. The output path is not affected as ip6_route_output() already returns a refcounted dst. Fixes: af4a2209b134 ("ipv6: sr: use dst_cache in seg6_input") Fixes: a7a29f9c361f ("net: ipv6: add rpl sr tunnel") Cc: stable@vger.kernel.org Signed-off-by: Andrea Mayer Reviewed-by: Simon Horman Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260421094735.20997-1-andrea.mayer@uniroma2.it Signed-off-by: Paolo Abeni --- net/ipv6/rpl_iptunnel.c | 9 +++++++++ net/ipv6/seg6_iptunnel.c | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index c7942cf65567..4e10adcd70e8 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -287,7 +287,16 @@ static int rpl_input(struct sk_buff *skb) if (!dst) { ip6_route_input(skb); + + /* ip6_route_input() sets a NOREF dst; force a refcount on it + * before caching or further use. + */ + skb_dst_force(skb); dst = skb_dst(skb); + if (unlikely(!dst)) { + err = -ENETUNREACH; + goto drop; + } /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 9b64343ebad6..4c45c0a77d75 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -515,7 +515,16 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (!dst) { ip6_route_input(skb); + + /* ip6_route_input() sets a NOREF dst; force a refcount on it + * before caching or further use. + */ + skb_dst_force(skb); dst = skb_dst(skb); + if (unlikely(!dst)) { + err = -ENETUNREACH; + goto drop; + } /* cache only if we don't create a dst reference loop */ if (!dst->error && lwtst != dst->lwtstate) { From 0bb05e6adfa99a2ea1fee1125cc0953409f83ed8 Mon Sep 17 00:00:00 2001 From: Sam Edwards Date: Tue, 21 Apr 2026 21:45:03 -0700 Subject: [PATCH 37/79] net: stmmac: Prevent NULL deref when RX memory exhausted The CPU receives frames from the MAC through conventional DMA: the CPU allocates buffers for the MAC, then the MAC fills them and returns ownership to the CPU. For each hardware RX queue, the CPU and MAC coordinate through a shared ring array of DMA descriptors: one descriptor per DMA buffer. Each descriptor includes the buffer's physical address and a status flag ("OWN") indicating which side owns the buffer: OWN=0 for CPU, OWN=1 for MAC. The CPU is only allowed to set the flag and the MAC is only allowed to clear it, and both must move through the ring in sequence: thus the ring is used for both "submissions" and "completions." In the stmmac driver, stmmac_rx() bookmarks its position in the ring with the `cur_rx` index. The main receive loop in that function checks for rx_descs[cur_rx].own=0, gives the corresponding buffer to the network stack (NULLing the pointer), and increments `cur_rx` modulo the ring size. After the loop exits, stmmac_rx_refill(), which bookmarks its position with `dirty_rx`, allocates fresh buffers and rearms the descriptors (setting OWN=1). If it fails any allocation, it simply stops early (leaving OWN=0) and will retry where it left off when next called. This means descriptors have a three-stage lifecycle (terms my own): - `empty` (OWN=1, buffer valid) - `full` (OWN=0, buffer valid and populated) - `dirty` (OWN=0, buffer NULL) But because stmmac_rx() only checks OWN, it confuses `full`/`dirty`. In the past (see 'Fixes:'), there was a bug where the loop could cycle `cur_rx` all the way back to the first descriptor it dirtied, resulting in a NULL dereference when mistaken for `full`. The aforementioned commit resolved that *specific* failure by capping the loop's iteration limit at `dma_rx_size - 1`, but this is only a partial fix: if the previous stmmac_rx_refill() didn't complete, then there are leftover `dirty` descriptors that the loop might encounter without needing to cycle fully around. The current code therefore panics (see 'Closes:') when stmmac_rx_refill() is memory-starved long enough for `cur_rx` to catch up to `dirty_rx`. Fix this by explicitly checking, before advancing `cur_rx`, if the next entry is dirty; exit the loop if so. This prevents processing of the final, used descriptor until stmmac_rx_refill() succeeds, but fully prevents the `cur_rx == dirty_rx` ambiguity as the previous bugfix intended: so remove the clamp as well. Since stmmac_rx_zc() is a copy-paste-and-tweak of stmmac_rx() and the code structure is identical, any fix to stmmac_rx() will also need a corresponding fix for stmmac_rx_zc(). Therefore, apply the same check there. In stmmac_rx() (not stmmac_rx_zc()), a related bug remains: after the MAC sets OWN=0 on the final descriptor, it will be unable to send any further DMA-complete IRQs until it's given more `empty` descriptors. Currently, the driver simply *hopes* that the next stmmac_rx_refill() succeeds, risking an indefinite stall of the receive process if not. But this is not a regression, so it can be addressed in a future change. Fixes: b6cb4541853c7 ("net: stmmac: avoid rx queue overrun") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221010 Cc: stable@vger.kernel.org Suggested-by: Russell King Signed-off-by: Sam Edwards Link: https://patch.msgid.link/20260422044503.5349-1-CFSworks@gmail.com Signed-off-by: Paolo Abeni --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ca68248dbc78..3591755ea30b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5549,9 +5549,12 @@ read_again: break; /* Prefetch the next RX descriptor */ - rx_q->cur_rx = STMMAC_NEXT_ENTRY(rx_q->cur_rx, - priv->dma_conf.dma_rx_size); - next_entry = rx_q->cur_rx; + next_entry = STMMAC_NEXT_ENTRY(rx_q->cur_rx, + priv->dma_conf.dma_rx_size); + if (unlikely(next_entry == rx_q->dirty_rx)) + break; + + rx_q->cur_rx = next_entry; np = stmmac_get_rx_desc(priv, rx_q, next_entry); @@ -5686,7 +5689,6 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) dma_dir = page_pool_get_dma_dir(rx_q->page_pool); bufsz = DIV_ROUND_UP(priv->dma_conf.dma_buf_sz, PAGE_SIZE) * PAGE_SIZE; - limit = min(priv->dma_conf.dma_rx_size - 1, (unsigned int)limit); if (netif_msg_rx_status(priv)) { void *rx_head = stmmac_get_rx_desc(priv, rx_q, 0); @@ -5733,9 +5735,12 @@ read_again: if (unlikely(status & dma_own)) break; - rx_q->cur_rx = STMMAC_NEXT_ENTRY(rx_q->cur_rx, - priv->dma_conf.dma_rx_size); - next_entry = rx_q->cur_rx; + next_entry = STMMAC_NEXT_ENTRY(rx_q->cur_rx, + priv->dma_conf.dma_rx_size); + if (unlikely(next_entry == rx_q->dirty_rx)) + break; + + rx_q->cur_rx = next_entry; np = stmmac_get_rx_desc(priv, rx_q, next_entry); From 4ca07b9239bd0478ae586632a2ed72be37ed8407 Mon Sep 17 00:00:00 2001 From: "William A. Kennington III" Date: Thu, 23 Apr 2026 00:46:52 -0700 Subject: [PATCH 38/79] net: mctp i2c: check length before marking flow active Currently, mctp_i2c_get_tx_flow_state() is called before the packet length sanity check. This function marks a new flow as active in the MCTP core. If the sanity check fails, mctp_i2c_xmit() returns early without calling mctp_i2c_lock_nest(). This results in a mismatched locking state: the flow is active, but the I2C bus lock was never acquired for it. When the flow is later released, mctp_i2c_release_flow() will see the active state and queue an unlock marker. The TX thread will then decrement midev->i2c_lock_count from 0, causing it to underflow to -1. This underflow permanently breaks the driver's locking logic, allowing future transmissions to occur without holding the I2C bus lock, leading to bus collisions and potential hardware hangs. Move the mctp_i2c_get_tx_flow_state() call to after the length sanity check to ensure we only transition the flow state if we are actually going to proceed with the transmission and locking. Fixes: f5b8abf9fc3d ("mctp i2c: MCTP I2C binding driver") Signed-off-by: William A. Kennington III Acked-by: Jeremy Kerr Link: https://patch.msgid.link/20260423074741.201460-1-william@wkennington.com Signed-off-by: Paolo Abeni --- drivers/net/mctp/mctp-i2c.c | 4 ++-- net/sched/cls_flower.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index 15fe4d1163c1..ee2913758e54 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -496,8 +496,6 @@ static void mctp_i2c_xmit(struct mctp_i2c_dev *midev, struct sk_buff *skb) u8 *pecp; int rc; - fs = mctp_i2c_get_tx_flow_state(midev, skb); - hdr = (void *)skb_mac_header(skb); /* Sanity check that packet contents matches skb length, * and can't exceed MCTP_I2C_BUFSZ @@ -509,6 +507,8 @@ static void mctp_i2c_xmit(struct mctp_i2c_dev *midev, struct sk_buff *skb) return; } + fs = mctp_i2c_get_tx_flow_state(midev, skb); + if (skb_tailroom(skb) >= 1) { /* Linear case with space, we can just append the PEC */ skb_put(skb, 1); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 88f8a32fab2b..b9672ea05747 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -556,6 +556,7 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); + struct fl_flow_mask *mask; *last = false; @@ -572,11 +573,12 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, list_del_rcu(&f->list); spin_unlock(&tp->lock); - *last = fl_mask_put(head, f->mask); + mask = f->mask; if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f, rtnl_held, extack); tcf_unbind_filter(tp, &f->res); __fl_put(f); + *last = fl_mask_put(head, mask); return 0; } From 46f74a3f7d57d9cc0110b09cbc8163fa0a01afa2 Mon Sep 17 00:00:00 2001 From: Heiko Schocher Date: Sat, 25 Apr 2026 05:13:39 +0200 Subject: [PATCH 39/79] net: phy: dp83869: fix setting CLK_O_SEL field. Table 7-121 in datasheet says we have to set register 0xc6 to value 0x10 before CLK_O_SEL can be modified. No more infos about this field found in datasheet. With this fix, setting of CLK_O_SEL field in IO_MUX_CFG register worked through dts property "ti,clk-output-sel" on a DP83869HMRGZR. Signed-off-by: Heiko Schocher Reviewed-by: Simon Horman Fixes: 01db923e8377 ("net: phy: dp83869: Add TI dp83869 phy") Link: https://patch.msgid.link/20260425031339.3318-1-hs@nabladev.com Signed-off-by: Paolo Abeni --- drivers/net/phy/dp83869.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index 1f381d7b13ff..96a7d255f50f 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -31,6 +31,7 @@ #define DP83869_RGMIICTL 0x0032 #define DP83869_STRAP_STS1 0x006e #define DP83869_RGMIIDCTL 0x0086 +#define DP83869_ANA_PLL_PROG_PI 0x00c6 #define DP83869_RXFCFG 0x0134 #define DP83869_RXFPMD1 0x0136 #define DP83869_RXFPMD2 0x0137 @@ -826,12 +827,22 @@ static int dp83869_config_init(struct phy_device *phydev) dp83869_config_port_mirroring(phydev); /* Clock output selection if muxing property is set */ - if (dp83869->clk_output_sel != DP83869_CLK_O_SEL_REF_CLK) + if (dp83869->clk_output_sel != DP83869_CLK_O_SEL_REF_CLK) { + /* + * Table 7-121 in datasheet says we have to set register 0xc6 + * to value 0x10 before CLK_O_SEL can be modified. + */ + ret = phy_write_mmd(phydev, DP83869_DEVADDR, + DP83869_ANA_PLL_PROG_PI, 0x10); + if (ret) + return ret; + ret = phy_modify_mmd(phydev, DP83869_DEVADDR, DP83869_IO_MUX_CFG, DP83869_IO_MUX_CFG_CLK_O_SEL_MASK, dp83869->clk_output_sel << DP83869_IO_MUX_CFG_CLK_O_SEL_SHIFT); + } if (phy_interface_is_rgmii(phydev)) { ret = phy_write_mmd(phydev, DP83869_DEVADDR, DP83869_RGMIIDCTL, From b89769f936a8fa9e66de72ddc1b71a9745a488e6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2026 12:06:06 -0700 Subject: [PATCH 40/79] net: psp: check for device unregister when creating assoc psp_assoc_device_get_locked() obtains a psp_dev reference via psp_dev_get_for_sock() (which uses psp_dev_tryget() under RCU); it then acquires psd->lock and drops the reference. Before the lock is taken, psp_dev_unregister() can run to completion: take psd->lock, clear out state, unlock, drop the registration reference. The expectation is that the lock prevents device unregistration, but much like with netdevs special care has to be taken when "upgrading" a reference to a locked device. Add the missing check if device is still alive. psp_dev_is_registered() exists already but had no callers, which makes me wonder if I either forgot to add this or lost the check during refactoring... Reported-by: Yiming Qian Fixes: 6b46ca260e22 ("net: psp: add socket security association code") Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260427190606.366101-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/psp/psp_nl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 6afd7707ec12..0cc744a6e1c9 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -305,8 +305,13 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops, psd = psp_dev_get_for_sock(socket->sk); if (psd) { - err = psp_dev_check_access(psd, genl_info_net(info)); - if (err) { + /* Extra care needed here, psp_dev_get_for_sock() only gives + * us access to struct psp_dev's memory, which is quite weak. + */ + mutex_lock(&psd->lock); + if (!psp_dev_is_registered(psd) || + psp_dev_check_access(psd, genl_info_net(info))) { + mutex_unlock(&psd->lock); psp_dev_put(psd); psd = NULL; } @@ -319,7 +324,6 @@ int psp_assoc_device_get_locked(const struct genl_split_ops *ops, id = info->attrs[PSP_A_ASSOC_DEV_ID]; if (psd) { - mutex_lock(&psd->lock); if (id && psd->id != nla_get_u32(id)) { mutex_unlock(&psd->lock); NL_SET_ERR_MSG_ATTR(info->extack, id, From b718342a7fbaa2dff5fefc31988c07af8c6cbc21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2026 12:58:56 -0700 Subject: [PATCH 41/79] net: psp: require admin permission for dev-set and key-rotate The dev-set and key-rotate netlink operations modify shared device state (PSP version configuration and cryptographic key material, respectively) but do not require CAP_NET_ADMIN. The only access control is psp_dev_check_access() which merely verifies netns membership. Fixes: 00c94ca2b99e ("psp: base PSP device support") Reviewed-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260427195856.401223-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/psp.yaml | 2 ++ net/psp/psp-nl-gen.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 100c36cda8e5..bfcd6e4ecb85 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -188,6 +188,7 @@ operations: name: dev-set doc: Set the configuration of a PSP device. attribute-set: dev + flags: [admin-perm] do: request: attributes: @@ -207,6 +208,7 @@ operations: name: key-rotate doc: Rotate the device key. attribute-set: dev + flags: [admin-perm] do: request: attributes: diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 22a48d0fa378..953309952cef 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -76,7 +76,7 @@ static const struct genl_split_ops psp_nl_ops[] = { .post_doit = psp_device_unlock, .policy = psp_dev_set_nl_policy, .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, - .flags = GENL_CMD_CAP_DO, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { .cmd = PSP_CMD_KEY_ROTATE, @@ -85,7 +85,7 @@ static const struct genl_split_ops psp_nl_ops[] = { .post_doit = psp_device_unlock, .policy = psp_key_rotate_nl_policy, .maxattr = PSP_A_DEV_ID, - .flags = GENL_CMD_CAP_DO, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { .cmd = PSP_CMD_RX_ASSOC, From 576a5d2bad4814c881a829576b1261b9b8159d2b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Apr 2026 10:46:40 -0400 Subject: [PATCH 42/79] netfilter: skip recording stale or retransmitted INIT An INIT whose init_tag matches the peer's vtag does not provide new state information. It indicates either: - a stale INIT (after INIT-ACK has already been seen on the same side), or - a retransmitted INIT (after INIT has already been recorded on the same side). In both cases, the INIT must not update ct->proto.sctp.init[] state, since it does not advance the handshake tracking and may otherwise corrupt INIT/INIT-ACK validation logic. Allow INIT processing only when the conntrack entry is newly created (SCTP_CONNTRACK_NONE), or when the init_tag differs from the stored peer vtag. Note it skips the check for the ct with old_state SCTP_CONNTRACK_NONE in nf_conntrack_sctp_packet(), as it is just created in sctp_new() where it set ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag. Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Signed-off-by: Xin Long Reviewed-by: Marcelo Ricardo Leitner Acked-by: Florian Westphal Link: https://patch.msgid.link/ee56c3e416452b2a40589a2a85245ac2ad5e9f4b.1777214801.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/netfilter/nf_conntrack_proto_sctp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 645d2c43ebf7..7e10fa65cbdd 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -466,9 +466,13 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!ih) goto out_unlock; - if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) - ct->proto.sctp.init[!dir] = 0; - ct->proto.sctp.init[dir] = 1; + /* Do not record INIT matching peer vtag (stale or retransmitted INIT). */ + if (old_state == SCTP_CONNTRACK_NONE || + ct->proto.sctp.vtag[!dir] != ih->init_tag) { + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + } pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; From 8a92cb475ca90d84db769e4d4383e631ace0d6e5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Apr 2026 10:46:41 -0400 Subject: [PATCH 43/79] sctp: discard stale INIT after handshake completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After an association reaches ESTABLISHED, the peer’s init_tag is already known from the handshake. Any subsequent INIT with the same init_tag is not a valid restart, but a delayed or duplicate INIT. Drop such INIT chunks in sctp_sf_do_unexpected_init() instead of processing them as new association attempts. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://patch.msgid.link/5788c76c1ee122a3ed00189e88dcf9df1fba226c.1777214801.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/sm_statefuns.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7b823d759141..8e89a870780c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1556,6 +1556,12 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( /* Tag the variable length parameters. */ chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(struct sctp_inithdr)); + if (asoc->state >= SCTP_STATE_ESTABLISHED) { + /* Discard INIT matching peer vtag after handshake completion (stale INIT). */ + if (ntohl(chunk->subh.init_hdr->init_tag) == asoc->peer.i.init_tag) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + } + /* Verify the INIT chunk before processing it. */ err_chunk = NULL; if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type, From aa6c6d9ee064aabfede4402fd1283424e649ca19 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Sun, 26 Apr 2026 09:53:51 -0700 Subject: [PATCH 44/79] bareudp: fix NULL pointer dereference in bareudp_fill_metadata_dst() bareudp_fill_metadata_dst() passes bareudp->sock to udp_tunnel6_dst_lookup() in the IPv6 path without a NULL check. The socket is only created in bareudp_open() and NULLed in bareudp_stop(), so calling this function while the device is down triggers a NULL dereference via sock->sk. BUG: kernel NULL pointer dereference, address: 0000000000000018 RIP: 0010:udp_tunnel6_dst_lookup (net/ipv6/ip6_udp_tunnel.c:160) Call Trace: bareudp_fill_metadata_dst (drivers/net/bareudp.c:532) do_execute_actions (net/openvswitch/actions.c:901) ovs_execute_actions (net/openvswitch/actions.c:1589) ovs_packet_cmd_execute (net/openvswitch/datapath.c:700) genl_family_rcv_msg_doit (net/netlink/genetlink.c:1114) genl_rcv_msg (net/netlink/genetlink.c:1209) netlink_rcv_skb (net/netlink/af_netlink.c:2550) Add a NULL check returning -ESHUTDOWN, consistent with the xmit paths in the same driver. Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260426165350.1663137-2-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 0df3208783ad..da5866ba0699 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -529,6 +529,9 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, struct in6_addr saddr; struct socket *sock = rcu_dereference(bareudp->sock); + if (!sock) + return -ESHUTDOWN; + dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, From 44967ac3785ebef6442377708925181d4a0eb1c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Apr 2026 08:36:02 +0000 Subject: [PATCH 45/79] net/sched: sch_cake: annotate data-races in cake_dump_stats() (I) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_stats() runs without qdisc spinlock being held. In this first patch, I add READ_ONCE()/WRITE_ONCE() annotations for the following fields: - way_hits - way_misses - way_collisions - sparse_flow_count - decaying_flow_count Other annotations are added in following patches, to ease code review. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: "Toke Høiland-Jørgensen" Link: https://patch.msgid.link/20260427083606.459355-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 02e1fa4577ae..bcc601fc486b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -813,7 +813,7 @@ skip_hash: i++, k = (k + 1) % CAKE_SET_WAYS) { if (q->tags[outer_hash + k] == flow_hash) { if (i) - q->way_hits++; + WRITE_ONCE(q->way_hits, q->way_hits + 1); if (!q->flows[outer_hash + k].set) { /* need to increment host refcnts */ @@ -831,7 +831,7 @@ skip_hash: for (i = 0; i < CAKE_SET_WAYS; i++, k = (k + 1) % CAKE_SET_WAYS) { if (!q->flows[outer_hash + k].set) { - q->way_misses++; + WRITE_ONCE(q->way_misses, q->way_misses + 1); allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); goto found; @@ -841,7 +841,7 @@ skip_hash: /* With no empty queues, default to the original * queue, accept the collision, update the host tags. */ - q->way_collisions++; + WRITE_ONCE(q->way_collisions, q->way_collisions + 1); allocate_src = cake_dsrc(flow_mode); allocate_dst = cake_ddst(flow_mode); @@ -1917,11 +1917,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { - b->decaying_flow_count--; + WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1); list_move_tail(&flow->flowchain, &b->new_flows); } flow->set = CAKE_SET_SPARSE; - b->sparse_flow_count++; + WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1); flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { @@ -1929,7 +1929,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, * in the bulk rotation. */ flow->set = CAKE_SET_BULK; - b->sparse_flow_count--; + WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2149,7 +2149,7 @@ retry: */ if (flow->set == CAKE_SET_SPARSE) { if (flow->head) { - b->sparse_flow_count--; + WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); b->bulk_flow_count++; cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2192,27 +2192,27 @@ retry: cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); - b->decaying_flow_count++; + WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1); } else if (flow->set == CAKE_SET_SPARSE || flow->set == CAKE_SET_SPARSE_WAIT) { - b->sparse_flow_count--; - b->decaying_flow_count++; + WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); + WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1); } flow->set = CAKE_SET_DECAYING; } else { /* remove empty queue from the flowchain */ list_del_init(&flow->flowchain); if (flow->set == CAKE_SET_SPARSE || - flow->set == CAKE_SET_SPARSE_WAIT) - b->sparse_flow_count--; - else if (flow->set == CAKE_SET_BULK) { + flow->set == CAKE_SET_SPARSE_WAIT) { + WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); + } else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); - } else - b->decaying_flow_count--; - + } else { + WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1); + } flow->set = CAKE_SET_NONE; } goto begin; @@ -3050,12 +3050,12 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_TSTAT_U32(BASE_DELAY_US, ktime_to_us(ns_to_ktime(b->base_delay))); - PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); - PUT_TSTAT_U32(WAY_MISSES, b->way_misses); - PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); + PUT_TSTAT_U32(WAY_INDIRECT_HITS, READ_ONCE(b->way_hits)); + PUT_TSTAT_U32(WAY_MISSES, READ_ONCE(b->way_misses)); + PUT_TSTAT_U32(WAY_COLLISIONS, READ_ONCE(b->way_collisions)); - PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + - b->decaying_flow_count); + PUT_TSTAT_U32(SPARSE_FLOWS, READ_ONCE(b->sparse_flow_count) + + READ_ONCE(b->decaying_flow_count)); PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); From 91a96427b93b9ba27413077b7e825d2fefbfa134 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Apr 2026 08:36:03 +0000 Subject: [PATCH 46/79] net/sched: sch_cake: annotate data-races in cake_dump_stats() (II) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_stats() runs without qdisc spinlock being held. In this second patch, I add READ_ONCE()/WRITE_ONCE() annotations for the following fields: - bulk_flow_count - unresponsive_flow_count - max_skblen - flow_quantum Other annotations are added in following patches, to ease code review. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: "Toke Høiland-Jørgensen" Link: https://patch.msgid.link/20260427083606.459355-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index bcc601fc486b..d7465ee4c550 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1590,7 +1590,8 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) } if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) - b->unresponsive_flow_count++; + WRITE_ONCE(b->unresponsive_flow_count, + b->unresponsive_flow_count + 1); len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; @@ -1795,7 +1796,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (unlikely(len > b->max_skblen)) - b->max_skblen = len; + WRITE_ONCE(b->max_skblen, len); if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; @@ -1930,7 +1931,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ flow->set = CAKE_SET_BULK; WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); - b->bulk_flow_count++; + WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1); cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2150,7 +2151,7 @@ retry: if (flow->set == CAKE_SET_SPARSE) { if (flow->head) { WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); - b->bulk_flow_count++; + WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1); cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2177,7 +2178,8 @@ retry: if (!skb) { /* this queue was actually empty */ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) - b->unresponsive_flow_count--; + WRITE_ONCE(b->unresponsive_flow_count, + b->unresponsive_flow_count - 1); if (flow->cvars.p_drop || flow->cvars.count || ktime_before(now, flow->cvars.drop_next)) { @@ -2187,7 +2189,7 @@ retry: list_move_tail(&flow->flowchain, &b->decaying_flows); if (flow->set == CAKE_SET_BULK) { - b->bulk_flow_count--; + WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1); cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2206,7 +2208,7 @@ retry: flow->set == CAKE_SET_SPARSE_WAIT) { WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1); } else if (flow->set == CAKE_SET_BULK) { - b->bulk_flow_count--; + WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1); cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); @@ -2329,9 +2331,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, u8 rate_shft = 0; u64 rate_ns = 0; - b->flow_quantum = 1514; if (rate) { - b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); + WRITE_ONCE(b->flow_quantum, + max(min(rate >> 12, 1514ULL), 300ULL)); rate_shft = 34; rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); @@ -2339,8 +2341,10 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, rate_ns >>= 1; rate_shft--; } - } /* else unlimited, ie. zero delay */ - + } else { + /* else unlimited, ie. zero delay */ + WRITE_ONCE(b->flow_quantum, 1514); + } b->tin_rate_bps = rate; b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; @@ -3056,11 +3060,11 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_TSTAT_U32(SPARSE_FLOWS, READ_ONCE(b->sparse_flow_count) + READ_ONCE(b->decaying_flow_count)); - PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); - PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); - PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); + PUT_TSTAT_U32(BULK_FLOWS, READ_ONCE(b->bulk_flow_count)); + PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, READ_ONCE(b->unresponsive_flow_count)); + PUT_TSTAT_U32(MAX_SKBLEN, READ_ONCE(b->max_skblen)); - PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); + PUT_TSTAT_U32(FLOW_QUANTUM, READ_ONCE(b->flow_quantum)); nla_nest_end(d->skb, ts); } From 276a98a434964088fccd4745db5b34d6e831e358 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Apr 2026 08:36:04 +0000 Subject: [PATCH 47/79] net/sched: sch_cake: annotate data-races in cake_dump_stats() (III) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_stats() runs without qdisc spinlock being held. In this third patch, I add READ_ONCE()/WRITE_ONCE() annotations for the following fields: - packets - tin_dropped - tin_ecn_mark - ack_drops - peak_delay - avge_delay - base_delay Other annotations are added in following patches, to ease code review. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: "Toke Høiland-Jørgensen" Link: https://patch.msgid.link/20260427083606.459355-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index d7465ee4c550..c5aae31565e9 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1600,7 +1600,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) sch->qstats.backlog -= len; flow->dropped++; - b->tin_dropped++; + WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); @@ -1820,7 +1820,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, numsegs++; slen += segs->len; q->buffer_used += segs->truesize; - b->packets++; + WRITE_ONCE(b->packets, b->packets + 1); } /* stats */ @@ -1844,7 +1844,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ack = cake_ack_filter(q, flow); if (ack) { - b->ack_drops++; + WRITE_ONCE(b->ack_drops, b->ack_drops + 1); sch->qstats.drops++; ack_pkt_len = qdisc_pkt_len(ack); b->bytes += ack_pkt_len; @@ -1860,7 +1860,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /* stats */ - b->packets++; + WRITE_ONCE(b->packets, b->packets + 1); b->bytes += len - ack_pkt_len; b->backlogs[idx] += len - ack_pkt_len; b->tin_backlog += len - ack_pkt_len; @@ -2236,7 +2236,7 @@ retry: b->tin_deficit -= len; } flow->dropped++; - b->tin_dropped++; + WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); qdisc_dequeue_drop(sch, skb, reason); @@ -2244,17 +2244,19 @@ retry: goto retry; } - b->tin_ecn_mark += !!flow->cvars.ecn_marked; + WRITE_ONCE(b->tin_ecn_mark, b->tin_ecn_mark + !!flow->cvars.ecn_marked); qdisc_bstats_update(sch, skb); WRITE_ONCE(q->last_active, now); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); - b->avge_delay = cake_ewma(b->avge_delay, delay, 8); - b->peak_delay = cake_ewma(b->peak_delay, delay, - delay > b->peak_delay ? 2 : 8); - b->base_delay = cake_ewma(b->base_delay, delay, - delay < b->base_delay ? 2 : 8); + WRITE_ONCE(b->avge_delay, cake_ewma(b->avge_delay, delay, 8)); + WRITE_ONCE(b->peak_delay, + cake_ewma(b->peak_delay, delay, + delay > b->peak_delay ? 2 : 8)); + WRITE_ONCE(b->base_delay, + cake_ewma(b->base_delay, delay, + delay < b->base_delay ? 2 : 8)); len = cake_advance_shaper(q, b, skb, now, false); flow->deficit -= len; @@ -3042,17 +3044,17 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_TSTAT_U32(INTERVAL_US, ktime_to_us(ns_to_ktime(b->cparams.interval))); - PUT_TSTAT_U32(SENT_PACKETS, b->packets); - PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); - PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); - PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); + PUT_TSTAT_U32(SENT_PACKETS, READ_ONCE(b->packets)); + PUT_TSTAT_U32(DROPPED_PACKETS, READ_ONCE(b->tin_dropped)); + PUT_TSTAT_U32(ECN_MARKED_PACKETS, READ_ONCE(b->tin_ecn_mark)); + PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, READ_ONCE(b->ack_drops)); PUT_TSTAT_U32(PEAK_DELAY_US, - ktime_to_us(ns_to_ktime(b->peak_delay))); + ktime_to_us(ns_to_ktime(READ_ONCE(b->peak_delay)))); PUT_TSTAT_U32(AVG_DELAY_US, - ktime_to_us(ns_to_ktime(b->avge_delay))); + ktime_to_us(ns_to_ktime(READ_ONCE(b->avge_delay)))); PUT_TSTAT_U32(BASE_DELAY_US, - ktime_to_us(ns_to_ktime(b->base_delay))); + ktime_to_us(ns_to_ktime(READ_ONCE(b->base_delay)))); PUT_TSTAT_U32(WAY_INDIRECT_HITS, READ_ONCE(b->way_hits)); PUT_TSTAT_U32(WAY_MISSES, READ_ONCE(b->way_misses)); From 8fab48d87745a6ab1cec594b8d5865d9ae2db879 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Apr 2026 08:36:05 +0000 Subject: [PATCH 48/79] net/sched: sch_cake: annotate data-races in cake_dump_stats() (IV) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_stats() runs without qdisc spinlock being held. In this fourth patch, I add READ_ONCE()/WRITE_ONCE() annotations for the following fields: - avg_peak_bandwidth - buffer_limit - buffer_max_used - avg_netoff - max_netlen - max_adjlen - min_netlen - min_adjlen - active_queues - tin_rate_bps - bytes - tin_backlog Other annotations are added in following patch, to ease code review. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20260427083606.459355-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 90 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index c5aae31565e9..975f5d6d6982 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1379,9 +1379,9 @@ static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) len -= off; if (qd->max_netlen < len) - qd->max_netlen = len; + WRITE_ONCE(qd->max_netlen, len); if (qd->min_netlen > len) - qd->min_netlen = len; + WRITE_ONCE(qd->min_netlen, len); len += q->rate_overhead; @@ -1401,9 +1401,9 @@ static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) } if (qd->max_adjlen < len) - qd->max_adjlen = len; + WRITE_ONCE(qd->max_adjlen, len); if (qd->min_adjlen > len) - qd->min_adjlen = len; + WRITE_ONCE(qd->min_adjlen, len); return len; } @@ -1416,7 +1416,7 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); + WRITE_ONCE(q->avg_netoff, cake_ewma(q->avg_netoff, off << 16, 8)); if (segs == 1) return cake_calc_overhead(q, len, off); @@ -1596,7 +1596,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; b->backlogs[idx] -= len; - b->tin_backlog -= len; + WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); sch->qstats.backlog -= len; flow->dropped++; @@ -1824,11 +1824,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /* stats */ - b->bytes += slen; b->backlogs[idx] += slen; - b->tin_backlog += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; + WRITE_ONCE(b->bytes, b->bytes + slen); + WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen); qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); @@ -1847,7 +1847,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, WRITE_ONCE(b->ack_drops, b->ack_drops + 1); sch->qstats.drops++; ack_pkt_len = qdisc_pkt_len(ack); - b->bytes += ack_pkt_len; + WRITE_ONCE(b->bytes, b->bytes + ack_pkt_len); q->buffer_used += skb->truesize - ack->truesize; if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); @@ -1861,11 +1861,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ WRITE_ONCE(b->packets, b->packets + 1); - b->bytes += len - ack_pkt_len; b->backlogs[idx] += len - ack_pkt_len; - b->tin_backlog += len - ack_pkt_len; sch->qstats.backlog += len - ack_pkt_len; q->avg_window_bytes += len - ack_pkt_len; + WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len); + WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len); } if (q->overflow_timeout) @@ -1895,9 +1895,9 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; b = div64_u64(b, window_interval); - q->avg_peak_bandwidth = - cake_ewma(q->avg_peak_bandwidth, b, - b > q->avg_peak_bandwidth ? 2 : 8); + WRITE_ONCE(q->avg_peak_bandwidth, + cake_ewma(q->avg_peak_bandwidth, b, + b > q->avg_peak_bandwidth ? 2 : 8)); q->avg_window_bytes = 0; q->avg_window_begin = now; @@ -1938,7 +1938,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } if (q->buffer_used > q->buffer_max_used) - q->buffer_max_used = q->buffer_used; + WRITE_ONCE(q->buffer_max_used, q->buffer_used); if (q->buffer_used <= q->buffer_limit) return NET_XMIT_SUCCESS; @@ -1978,7 +1978,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) skb = dequeue_head(flow); len = qdisc_pkt_len(skb); b->backlogs[q->cur_flow] -= len; - b->tin_backlog -= len; + WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; sch->q.qlen--; @@ -2043,7 +2043,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) cake_configure_rates(sch, new_rate, true); q->last_checked_active = now; - q->active_queues = num_active_qs; + WRITE_ONCE(q->active_queues, num_active_qs); } begin: @@ -2347,7 +2347,7 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, /* else unlimited, ie. zero delay */ WRITE_ONCE(b->flow_quantum, 1514); } - b->tin_rate_bps = rate; + WRITE_ONCE(b->tin_rate_bps, rate); b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; @@ -2617,25 +2617,27 @@ static void cake_reconfigure(struct Qdisc *sch) { struct cake_sched_data *qd = qdisc_priv(sch); struct cake_sched_config *q = qd->config; + u32 buffer_limit; cake_configure_rates(sch, qd->config->rate_bps, false); if (q->buffer_config_limit) { - qd->buffer_limit = q->buffer_config_limit; + buffer_limit = q->buffer_config_limit; } else if (q->rate_bps) { u64 t = q->rate_bps * q->interval; do_div(t, USEC_PER_SEC / 4); - qd->buffer_limit = max_t(u32, t, 4U << 20); + buffer_limit = max_t(u32, t, 4U << 20); } else { - qd->buffer_limit = ~0; + buffer_limit = ~0; } sch->flags &= ~TCQ_F_CAN_BYPASS; - qd->buffer_limit = min(qd->buffer_limit, - max(sch->limit * psched_mtu(qdisc_dev(sch)), - q->buffer_config_limit)); + WRITE_ONCE(qd->buffer_limit, + min(buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit))); } static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, @@ -2780,10 +2782,10 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, return ret; if (overhead_changed) { - qd->max_netlen = 0; - qd->max_adjlen = 0; - qd->min_netlen = ~0; - qd->min_adjlen = ~0; + WRITE_ONCE(qd->max_netlen, 0); + WRITE_ONCE(qd->max_adjlen, 0); + WRITE_ONCE(qd->min_netlen, ~0); + WRITE_ONCE(qd->min_adjlen, ~0); } if (qd->tins) { @@ -3001,15 +3003,15 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) goto nla_put_failure; \ } while (0) - PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); - PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); - PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); - PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); - PUT_STAT_U32(MAX_NETLEN, q->max_netlen); - PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); - PUT_STAT_U32(MIN_NETLEN, q->min_netlen); - PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); - PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); + PUT_STAT_U64(CAPACITY_ESTIMATE64, READ_ONCE(q->avg_peak_bandwidth)); + PUT_STAT_U32(MEMORY_LIMIT, READ_ONCE(q->buffer_limit)); + PUT_STAT_U32(MEMORY_USED, READ_ONCE(q->buffer_max_used)); + PUT_STAT_U32(AVG_NETOFF, ((READ_ONCE(q->avg_netoff) + 0x8000) >> 16)); + PUT_STAT_U32(MAX_NETLEN, READ_ONCE(q->max_netlen)); + PUT_STAT_U32(MAX_ADJLEN, READ_ONCE(q->max_adjlen)); + PUT_STAT_U32(MIN_NETLEN, READ_ONCE(q->min_netlen)); + PUT_STAT_U32(MIN_ADJLEN, READ_ONCE(q->min_adjlen)); + PUT_STAT_U32(ACTIVE_QUEUES, READ_ONCE(q->active_queues)); #undef PUT_STAT_U32 #undef PUT_STAT_U64 @@ -3035,9 +3037,9 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) if (!ts) goto nla_put_failure; - PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); - PUT_TSTAT_U64(SENT_BYTES64, b->bytes); - PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); + PUT_TSTAT_U64(THRESHOLD_RATE64, READ_ONCE(b->tin_rate_bps)); + PUT_TSTAT_U64(SENT_BYTES64, READ_ONCE(b->bytes)); + PUT_TSTAT_U32(BACKLOG_BYTES, READ_ONCE(b->tin_backlog)); PUT_TSTAT_U32(TARGET_US, ktime_to_us(ns_to_ktime(b->cparams.target))); @@ -3304,10 +3306,10 @@ static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, struct cake_sched_data *qd = qdisc_priv(chld); if (overhead_changed) { - qd->max_netlen = 0; - qd->max_adjlen = 0; - qd->min_netlen = ~0; - qd->min_adjlen = ~0; + WRITE_ONCE(qd->max_netlen, 0); + WRITE_ONCE(qd->max_adjlen, 0); + WRITE_ONCE(qd->min_netlen, ~0); + WRITE_ONCE(qd->min_adjlen, ~0); } if (qd->tins) { From a6c95b833dc17e84d16a8ac0f40fd0931616a52d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Apr 2026 08:36:06 +0000 Subject: [PATCH 49/79] net/sched: sch_cake: annotate data-races in cake_dump_stats() (V) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_stats() runs without qdisc spinlock being held. In this final patch, I add READ_ONCE()/WRITE_ONCE() annotations for cparams.target and cparams.interval. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: "Toke Høiland-Jørgensen" Link: https://patch.msgid.link/20260427083606.459355-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 975f5d6d6982..13c6d1869a14 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2356,10 +2356,11 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, byte_target_ns = (byte_target * rate_ns) >> rate_shft; - b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); - b->cparams.interval = max(rtt_est_ns + - b->cparams.target - target_ns, - b->cparams.target * 2); + WRITE_ONCE(b->cparams.target, + max((byte_target_ns * 3) / 2, target_ns)); + WRITE_ONCE(b->cparams.interval, + max(rtt_est_ns + b->cparams.target - target_ns, + b->cparams.target * 2)); b->cparams.mtu_time = byte_target_ns; b->cparams.p_inc = 1 << 24; /* 1/256 */ b->cparams.p_dec = 1 << 20; /* 1/4096 */ @@ -3042,9 +3043,9 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_TSTAT_U32(BACKLOG_BYTES, READ_ONCE(b->tin_backlog)); PUT_TSTAT_U32(TARGET_US, - ktime_to_us(ns_to_ktime(b->cparams.target))); + ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.target)))); PUT_TSTAT_U32(INTERVAL_US, - ktime_to_us(ns_to_ktime(b->cparams.interval))); + ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.interval)))); PUT_TSTAT_U32(SENT_PACKETS, READ_ONCE(b->packets)); PUT_TSTAT_U32(DROPPED_PACKETS, READ_ONCE(b->tin_dropped)); From d62c6f2df5c0e1390b9a1f45b1b52689e3f234f0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 27 Apr 2026 07:30:35 -0700 Subject: [PATCH 50/79] netconsole: return count instead of strnlen(buf, count) from store callbacks Several configfs store callbacks in netconsole end with: ret = strnlen(buf, count); This under-reports the number of bytes consumed when the input contains an embedded NUL within count, telling the VFS that fewer bytes were written than userspace actually handed in. A conformant partial-write loop would then retry the trailing bytes against a callback that has already accepted them. Every other configfs driver in the tree returns count directly from its store callbacks once parsing has succeeded, including drivers/nvme/target/configfs.c, drivers/gpio/gpio-sim.c, drivers/most/configfs.c, drivers/block/null_blk/main.c, drivers/pci/endpoint/pci-ep-cfs.c, and the rest of the configfs users. netconsole was the outlier (along with drivers/infiniband/core/cma_configfs.c, which has the same latent issue). Align netconsole with the rest of the configfs ecosystem: return count once the parser/validator has accepted the input. The numeric and boolean parsers (kstrtobool, kstrtou16, mac_pton, netpoll_parse_ip_addr) have already validated the meaningful prefix; any trailing bytes are padding and should simply be reported as consumed. Fixes: 0bcc1816188e ("[NET] netconsole: Support dynamic reconfiguration using configfs") Reviewed-by: Simon Horman Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260427-netconsole_ai_fixes-v2-1-59965f29d9cc@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/netconsole.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 205384dab89a..76d7fbf9e188 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -752,7 +752,7 @@ static ssize_t enabled_store(struct config_item *item, unregister_netcons_consoles(); } - ret = strnlen(buf, count); + ret = count; /* Deferred cleanup */ netconsole_process_cleanups(); out_unlock: @@ -781,7 +781,7 @@ static ssize_t release_store(struct config_item *item, const char *buf, nt->release = release; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -807,7 +807,7 @@ static ssize_t extended_store(struct config_item *item, const char *buf, goto out_unlock; nt->extended = extended; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -830,7 +830,7 @@ static ssize_t dev_name_store(struct config_item *item, const char *buf, trim_newline(nt->np.dev_name, IFNAMSIZ); dynamic_netconsole_mutex_unlock(); - return strnlen(buf, count); + return count; } static ssize_t local_port_store(struct config_item *item, const char *buf, @@ -849,7 +849,7 @@ static ssize_t local_port_store(struct config_item *item, const char *buf, ret = kstrtou16(buf, 10, &nt->np.local_port); if (ret < 0) goto out_unlock; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -871,7 +871,7 @@ static ssize_t remote_port_store(struct config_item *item, ret = kstrtou16(buf, 10, &nt->np.remote_port); if (ret < 0) goto out_unlock; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -896,7 +896,7 @@ static ssize_t local_ip_store(struct config_item *item, const char *buf, goto out_unlock; nt->np.ipv6 = !!ipv6; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -921,7 +921,7 @@ static ssize_t remote_ip_store(struct config_item *item, const char *buf, goto out_unlock; nt->np.ipv6 = !!ipv6; - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -957,7 +957,7 @@ static ssize_t remote_mac_store(struct config_item *item, const char *buf, goto out_unlock; memcpy(nt->np.remote_mac, remote_mac, ETH_ALEN); - ret = strnlen(buf, count); + ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); return ret; @@ -1133,7 +1133,7 @@ static ssize_t sysdata_msgid_enabled_store(struct config_item *item, disable_sysdata_feature(nt, SYSDATA_MSGID); unlock_ok: - ret = strnlen(buf, count); + ret = count; dynamic_netconsole_mutex_unlock(); mutex_unlock(&netconsole_subsys.su_mutex); return ret; @@ -1162,7 +1162,7 @@ static ssize_t sysdata_release_enabled_store(struct config_item *item, disable_sysdata_feature(nt, SYSDATA_RELEASE); unlock_ok: - ret = strnlen(buf, count); + ret = count; dynamic_netconsole_mutex_unlock(); mutex_unlock(&netconsole_subsys.su_mutex); return ret; @@ -1191,7 +1191,7 @@ static ssize_t sysdata_taskname_enabled_store(struct config_item *item, disable_sysdata_feature(nt, SYSDATA_TASKNAME); unlock_ok: - ret = strnlen(buf, count); + ret = count; dynamic_netconsole_mutex_unlock(); mutex_unlock(&netconsole_subsys.su_mutex); return ret; @@ -1225,7 +1225,7 @@ static ssize_t sysdata_cpu_nr_enabled_store(struct config_item *item, disable_sysdata_feature(nt, SYSDATA_CPU_NR); unlock_ok: - ret = strnlen(buf, count); + ret = count; dynamic_netconsole_mutex_unlock(); mutex_unlock(&netconsole_subsys.su_mutex); return ret; From e6dd94252b0fa7b4fcc00577c6898432c5d97a08 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 27 Apr 2026 07:30:36 -0700 Subject: [PATCH 51/79] netconsole: avoid clobbering userdatum value on truncated write userdatum_value_store() bounds count by MAX_EXTRADATA_VALUE_LEN (200) and then copies straight into udm->value, which is itself 200 bytes: if (count > MAX_EXTRADATA_VALUE_LEN) return -EMSGSIZE; ... ret = strscpy(udm->value, buf, sizeof(udm->value)); if (ret < 0) goto out_unlock; If userspace writes exactly MAX_EXTRADATA_VALUE_LEN bytes with no NUL within them, strscpy() copies 199 bytes plus a NUL into udm->value and returns -E2BIG. The function jumps to out_unlock and reports the error to userspace, but udm->value has already been overwritten with the truncated string and update_userdata() is skipped, so the corruption is not yet visible on the wire. The next successful write to any userdatum entry under the same target calls update_userdata(), which packs udm->value into the active netconsole payload. From that point on, every netconsole message carries the silently truncated value, and userspace has no indication that a previous, error-returning write left state behind. Tighten the entry check from "count > MAX_EXTRADATA_VALUE_LEN" to "count >= MAX_EXTRADATA_VALUE_LEN". With count strictly less than sizeof(udm->value), strscpy() can no longer return -E2BIG here, so the corrupting truncation path is removed entirely. Fixes: 8a6d5fec6c7f ("net: netconsole: add a userdata config_group member to netconsole_target") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260427-netconsole_ai_fixes-v2-2-59965f29d9cc@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/netconsole.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 76d7fbf9e188..595e09bd1ccf 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -1076,15 +1076,13 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, struct userdata *ud; ssize_t ret; - if (count > MAX_EXTRADATA_VALUE_LEN) + if (count >= MAX_EXTRADATA_VALUE_LEN) return -EMSGSIZE; mutex_lock(&netconsole_subsys.su_mutex); dynamic_netconsole_mutex_lock(); - - ret = strscpy(udm->value, buf, sizeof(udm->value)); - if (ret < 0) - goto out_unlock; + /* count is bounded above, so strscpy() cannot truncate here */ + strscpy(udm->value, buf, sizeof(udm->value)); trim_newline(udm->value, sizeof(udm->value)); ud = to_userdata(item->ci_parent); From 92ceb7bff62c2606f664c204750eca0b85d44112 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 27 Apr 2026 07:30:37 -0700 Subject: [PATCH 52/79] netconsole: propagate device name truncation in dev_name_store() dev_name_store() calls strscpy(nt->np.dev_name, buf, IFNAMSIZ) without checking the return value. If userspace writes an interface name longer than IFNAMSIZ - 1, strscpy() silently truncates and returns -E2BIG, but the function ignores it and reports a fully successful write back to userspace. If a real interface happens to match the truncated name, netconsole will bind to the wrong device on the next enable, sending kernel logs and panic output to an unintended network segment with no indication to userspace that anything was rewritten. Reject writes whose length cannot fit in nt->np.dev_name up front: if (count >= IFNAMSIZ) return -ENAMETOOLONG; This is not a big deal of a problem, but, it is still the correct approach. Fixes: 0bcc1816188e57 ("[NET] netconsole: Support dynamic reconfiguration using configfs") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260427-netconsole_ai_fixes-v2-3-59965f29d9cc@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/netconsole.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 595e09bd1ccf..b3b36e3ddd03 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -817,6 +817,13 @@ static ssize_t dev_name_store(struct config_item *item, const char *buf, size_t count) { struct netconsole_target *nt = to_target(item); + size_t len = count; + + /* Account for a trailing newline appended by tools like echo */ + if (len && buf[len - 1] == '\n') + len--; + if (len >= IFNAMSIZ) + return -ENAMETOOLONG; dynamic_netconsole_mutex_lock(); if (nt->state == STATE_ENABLED) { From 869cd6490fafe09c89a15d01610e8a03932d79f0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 27 Apr 2026 07:30:38 -0700 Subject: [PATCH 53/79] netconsole: restore userdatum value on update_userdata() failure userdatum_value_store() updates udm->value first and only then calls update_userdata() to rebuild the on-the-wire payload. If update_userdata() fails (e.g. -ENOMEM from kmalloc), the function returns the error to userspace, but udm->value already holds the new string while the live nt->userdata buffer still reflects the old one. The next successful write to any sibling userdatum on the same target will call update_userdata() again, which walks every entry and packs the now-stale udm->value into the payload. The failed write is thus silently activated later, with no indication to userspace that the value it tried to set was rejected. Snapshot the previous value before overwriting udm->value and restore it if update_userdata() fails so the visible state and the active payload stay consistent. Fixes: eb83801af2dc ("netconsole: Dynamic allocation of userdata buffer") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260427-netconsole_ai_fixes-v2-4-59965f29d9cc@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/netconsole.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index b3b36e3ddd03..57dd6821a8aa 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -1079,6 +1079,7 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, size_t count) { struct userdatum *udm = to_userdatum(item); + char old_value[MAX_EXTRADATA_VALUE_LEN]; struct netconsole_target *nt; struct userdata *ud; ssize_t ret; @@ -1088,6 +1089,8 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, mutex_lock(&netconsole_subsys.su_mutex); dynamic_netconsole_mutex_lock(); + /* Snapshot for rollback if update_userdata() fails below */ + strscpy(old_value, udm->value, sizeof(old_value)); /* count is bounded above, so strscpy() cannot truncate here */ strscpy(udm->value, buf, sizeof(udm->value)); trim_newline(udm->value, sizeof(udm->value)); @@ -1095,8 +1098,11 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, ud = to_userdata(item->ci_parent); nt = userdata_to_target(ud); ret = update_userdata(nt); - if (ret < 0) + if (ret < 0) { + /* Restore the previous value so it matches the live payload */ + strscpy(udm->value, old_value, sizeof(udm->value)); goto out_unlock; + } ret = count; out_unlock: dynamic_netconsole_mutex_unlock(); From 5f95c21fc23a7ef22b4d27d1ed9bb55557ffb926 Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Mon, 27 Apr 2026 21:54:33 +0200 Subject: [PATCH 54/79] mptcp: sockopt: set timestamp flags on subflow socket, not msk Both mptcp_setsockopt_sol_socket_tstamp() and mptcp_setsockopt_sol_socket_timestamping() iterate over subflows, acquire the subflow socket lock, but then erroneously pass the MPTCP msk socket to sock_set_timestamp() / sock_set_timestamping() instead of the subflow ssk. As a result, the timestamp flags are set on the wrong socket and have no effect on the actual subflows. Pass ssk instead of sk to both helpers. Fixes: 9061f24bf82e ("mptcp: sockopt: propagate timestamp request to subflows") Cc: stable@vger.kernel.org Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-1-7432b7f279fa@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index de90a2897d2d..79db15903e7a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -161,7 +161,7 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); - sock_set_timestamp(sk, optname, !!val); + sock_set_timestamp(ssk, optname, !!val); unlock_sock_fast(ssk, slow); } @@ -237,7 +237,7 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); - sock_set_timestamping(sk, optname, timestamping); + sock_set_timestamping(ssk, optname, timestamping); unlock_sock_fast(ssk, slow); } From b5c52908d52c6c8eb8933264aa6087a0600fd892 Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Mon, 27 Apr 2026 21:54:34 +0200 Subject: [PATCH 55/79] mptcp: fix scheduling with atomic in timestamp sockopt Using lock_sock_fast() (atomic context) around sock_set_timestamp() and sock_set_timestamping() is unsafe, as both helpers can sleep. Replace lock_sock_fast() with sleepable lock_sock()/release_sock() to avoid scheduling while atomic panic. Fixes: 9061f24bf82e ("mptcp: sockopt: propagate timestamp request to subflows") Cc: stable@vger.kernel.org Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260420093343.16443-1-gang.yan@linux.dev Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-2-7432b7f279fa@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 79db15903e7a..0efe40be2fde 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -159,10 +159,10 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + lock_sock(ssk); sock_set_timestamp(ssk, optname, !!val); - unlock_sock_fast(ssk, slow); + release_sock(ssk); } release_sock(sk); @@ -235,10 +235,10 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk, mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + lock_sock(ssk); sock_set_timestamping(ssk, optname, timestamping); - unlock_sock_fast(ssk, slow); + release_sock(ssk); } release_sock(sk); From f14d6e9c3678a067f304abba561e0c5446c7e845 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 27 Apr 2026 21:54:35 +0200 Subject: [PATCH 56/79] mptcp: fastclose msk when linger time is 0 The SO_LINGER socket option has been supported for a while with MPTCP sockets [1], but it didn't cause the equivalent of a TCP reset as expected when enabled and its time was set to 0. This was causing some behavioural differences with TCP where some connections were not promptly stopped as expected. To fix that, an extra condition is checked at close() time before sending an MP_FASTCLOSE, the MPTCP equivalent of a TCP reset. Note that backporting up to [1] will be difficult as more changes are needed to be able to send MP_FASTCLOSE. It seems better to stop at [2], which was supposed to already imitate TCP. Validated with MPTCP packetdrill tests [3]. Fixes: 268b12387460 ("mptcp: setsockopt: support SO_LINGER") [1] Fixes: d21f83485518 ("mptcp: use fastclose on more edge scenarios") [2] Cc: stable@vger.kernel.org Reported-by: Lance Tuller Closes: https://github.com/lance0/xfr/pull/67 Link: https://github.com/multipath-tcp/packetdrill/pull/196 [3] Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-3-7432b7f279fa@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 718e910ff23f..4546a8b09884 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3302,7 +3302,8 @@ bool __mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_data_avail(msk) || timeout < 0) { + if (mptcp_data_avail(msk) || timeout < 0 || + (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ From 1774d3cf3cf17baaf30c095606cda496268283b3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 27 Apr 2026 21:54:36 +0200 Subject: [PATCH 57/79] mptcp: pm: kernel: reset fullmesh counter after flush This variable counts how many MPTCP endpoints have a 'fullmesh' flag set. After having flushed all MPTCP endpoints, it is then needed to reset this counter. Without this reset, this counter exposed to the userspace is wrong, but also non-fullmesh endpoints added after the flush will not be taken into account to create subflows in reaction to ADD_ADDRs. Fixes: f88191c7f361 ("mptcp: pm: in-kernel: record fullmesh endp nb") Cc: stable@vger.kernel.org Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260422-mptcp-inc-limits-v6-0-903181771530%40kernel.org?part=15 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260427-net-mptcp-misc-fixes-7-1-rc2-v1-4-7432b7f279fa@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 0ebf43be9939..c9f1e5af3cd3 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -1278,6 +1278,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); WRITE_ONCE(pernet->endp_laminar_max, 0); + WRITE_ONCE(pernet->endp_fullmesh_max, 0); pernet->endpoints = 0; } From 735a309b4bfb9e1e26636ff4a3e8a146f53c54f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2026 19:53:20 -0700 Subject: [PATCH 58/79] net: add net_iov_init() and use it to initialize ->page_type Commit db359fccf212 ("mm: introduce a new page type for page pool in page type") added a page_type field to struct net_iov at the same offset as struct page::page_type, so that page_pool_set_pp_info() can call __SetPageNetpp() uniformly on both pages and net_iovs. The page-type API requires the field to hold the UINT_MAX "no type" sentinel before a type can be set; for real struct page that invariant is established by the page allocator on free. struct net_iov is not allocated through the page allocator, so the field is left as zero (io_uring zcrx, which uses __GFP_ZERO) or as slab garbage (devmem, which uses kvmalloc_objs() without zeroing). When the page pool then calls page_pool_set_pp_info() on a freshly-bound niov, __SetPageNetpp()'s VM_BUG_ON_PAGE(page->page_type != UINT_MAX) fires and the kernel BUGs. Triggered in selftests by io_uring zcrx setup through the fbnic queue restart path: kernel BUG at ./include/linux/page-flags.h:1062! RIP: 0010:page_pool_set_pp_info (./include/linux/page-flags.h:1062 net/core/page_pool.c:716) Call Trace: net_mp_niov_set_page_pool (net/core/page_pool.c:1360) io_pp_zc_alloc_netmems (io_uring/zcrx.c:1089 io_uring/zcrx.c:1110) fbnic_fill_bdq (./include/net/page_pool/helpers.h:160 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:906) __fbnic_nv_restart (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2470 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2874) fbnic_queue_start (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2903) netdev_rx_queue_reconfig (net/core/netdev_rx_queue.c:137) __netif_mp_open_rxq (net/core/netdev_rx_queue.c:234) io_register_zcrx (io_uring/zcrx.c:818 io_uring/zcrx.c:903) __io_uring_register (io_uring/register.c:931) __do_sys_io_uring_register (io_uring/register.c:1029) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) The same path is reachable through devmem dmabuf binding via netdev_nl_bind_rx_doit() -> net_devmem_bind_dmabuf_to_queue(). Add a net_iov_init() helper that stamps ->owner, ->type and the ->page_type sentinel, and use it from both the devmem and io_uring zcrx niov init loops. Fixes: db359fccf212 ("mm: introduce a new page type for page pool in page type") Acked-by: Vlastimil Babka (SUSE) Acked-by: Byungchul Park Reviewed-by: Jens Axboe Acked-by: Pavel Begunkov Link: https://patch.msgid.link/20260428025320.853452-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 15 +++++++++++++++ io_uring/zcrx.c | 3 +-- net/core/devmem.c | 3 +-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/net/netmem.h b/include/net/netmem.h index 507b74c9f52d..78fe51e5756b 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -127,6 +127,21 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) return niov - net_iov_owner(niov)->niovs; } +/* Initialize a niov: stamp the owning area, the memory provider type, + * and the page_type "no type" sentinel expected by the page-type API + * (see PAGE_TYPE_OPS in ) so that + * page_pool_set_pp_info() can later call __SetPageNetpp() on a niov + * cast to struct page. + */ +static inline void net_iov_init(struct net_iov *niov, + struct net_iov_area *owner, + enum net_iov_type type) +{ + niov->owner = owner; + niov->type = type; + niov->page_type = UINT_MAX; +} + /* netmem */ /** diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 7b93c87b8371..19837e0b5e91 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -495,10 +495,9 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, for (i = 0; i < nr_iovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; - niov->owner = &area->nia; + net_iov_init(niov, &area->nia, NET_IOV_IOURING); area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); - niov->type = NET_IOV_IOURING; } if (ifq->dev) { diff --git a/net/core/devmem.c b/net/core/devmem.c index cde4c89bc146..468344739db2 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -297,8 +297,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; - niov->type = NET_IOV_DMABUF; - niov->owner = &owner->area; + net_iov_init(niov, &owner->area, NET_IOV_DMABUF); page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE) From c3388f8c1cbb5aae3731749b586499ed126b4156 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Tue, 28 Apr 2026 16:24:38 +0200 Subject: [PATCH 59/79] MAINTAINERS: Add myself as NFC subsystem maintainer Add myself and update the mailing list. Signed-off-by: David Heidelberg Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 21288a3a7d93..176390ef4275 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18829,8 +18829,10 @@ F: include/uapi/linux/nexthop.h F: net/ipv4/nexthop.c NFC SUBSYSTEM -L: netdev@vger.kernel.org -S: Orphan +M: David Heidelberg +L: oe-linux-nfc@lists.linux.dev +S: Maintained +T: git https://codeberg.org/linux-nfc/linux.git F: Documentation/devicetree/bindings/net/nfc/ F: drivers/nfc/ F: include/net/nfc/ From 72e9647e2b20c31b4f2febf981566e3c5cdef90e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Apr 2026 13:33:57 -0700 Subject: [PATCH 60/79] selftests: drv-net: clarify linters and frameworks in README Minor clarifications in the README: - call out what linters we expect to be clean - make it clear that by "frameworks" we mean code under lib/ not just factoring code out in the same file Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/README.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index c8588436c224..c6bed9a985bc 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -211,8 +211,8 @@ Avoid libraries and frameworks Test files should be relatively self contained. The libraries should only include very core or non-trivial code. -It may be tempting to "factor out" the common code, but fight that urge. -Library code increases the barrier of entry, and complexity in general. +It may be tempting to "factor out" the common code to lib/py/, but fight that +urge. Library code increases the barrier of entry, and complexity in general. Avoid mixing test code and boilerplate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -290,6 +290,12 @@ or:: def test(cfg, mode, protocol): pass +Linters +~~~~~~~ + +We expect clean ``ruff check`` and ``pylint --disable=R``. +The code should be clean, avoid disabling pylint warnings explicitly! + Running tests CI-style ====================== From e73cafaf4acea5445df2e5ee021a335d717c1697 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Apr 2026 13:39:24 -0700 Subject: [PATCH 61/79] MAINTAINERS: update the IPv4/IPv6 entry and add Ido Schimmel The IPv4/IPv6 and routing code is not very well separated from the TCP/UDP code. Scope it down properly by providing a more accurate file list, instead of net/ipv4/ and net/ipv6/ Now that the entry is more accurately representing layer 3 and routing merge in the nexthop entry into it. Add Ido Schimmel as a co-maintainer, Ido's git history speaks for itself. Reviewed-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260428203924.1229169-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 63 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 176390ef4275..27a073f53cea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18672,19 +18672,59 @@ F: net/xfrm/ F: tools/testing/selftests/net/ipsec.c NETWORKING [IPv4/IPv6] -M: "David S. Miller" M: David Ahern +M: Ido Schimmel L: netdev@vger.kernel.org S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git -F: arch/x86/net/* -F: include/linux/ip.h -F: include/linux/ipv6* +F: Documentation/netlink/specs/rt-addr.yaml +F: Documentation/netlink/specs/rt-neigh.yaml +F: Documentation/netlink/specs/rt-route.yaml +F: Documentation/netlink/specs/rt-rule.yaml +F: include/linux/inetdevice.h +F: include/linux/mroute* +F: include/net/addrconf.h +F: include/net/arp.h F: include/net/fib* +F: include/net/if_inet6.h +F: include/net/inetpeer.h F: include/net/ip* +F: include/net/lwtunnel.h +F: include/net/ndisc.h +F: include/net/netns/nexthop.h +F: include/net/nexthop.h F: include/net/route.h -F: net/ipv4/ -F: net/ipv6/ +F: include/uapi/linux/fib_rules.h +F: include/uapi/linux/in_route.h +F: include/uapi/linux/mroute* +F: include/uapi/linux/nexthop.h +F: net/core/fib* +F: net/core/lwtunnel.c +F: net/ipv4/arp.c +F: net/ipv4/devinet.c +F: net/ipv4/fib* +F: net/ipv4/icmp.c +F: net/ipv4/igmp.c +F: net/ipv4/inet_fragment.c +F: net/ipv4/inetpeer.c +F: net/ipv4/ip* +F: net/ipv4/metrics.c +F: net/ipv4/netlink.c +F: net/ipv4/nexthop.c +F: net/ipv4/route.c +F: net/ipv6/addr* +F: net/ipv6/anycast.c +F: net/ipv6/exthdrs.c +F: net/ipv6/exthdrs_core.c +F: net/ipv6/fib* +F: net/ipv6/icmp.c +F: net/ipv6/ip* +F: net/ipv6/mcast* +F: net/ipv6/ndisc.c +F: net/ipv6/output_core.c +F: net/ipv6/reassembly.c +F: net/ipv6/route.c +F: tools/testing/selftests/net/fib* +F: tools/testing/selftests/net/forwarding/ NETWORKING [LABELED] (NetLabel, Labeled IPsec, SECMARK) M: Paul Moore @@ -18819,15 +18859,6 @@ F: Documentation/networking/net_failover.rst F: drivers/net/net_failover.c F: include/net/net_failover.h -NEXTHOP -M: David Ahern -L: netdev@vger.kernel.org -S: Maintained -F: include/net/netns/nexthop.h -F: include/net/nexthop.h -F: include/uapi/linux/nexthop.h -F: net/ipv4/nexthop.c - NFC SUBSYSTEM M: David Heidelberg L: oe-linux-nfc@lists.linux.dev From b31681206e3f527970a7c7ed807fbf6a028fc25b Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Tue, 28 Apr 2026 08:53:39 -0400 Subject: [PATCH 62/79] hv_sock: fix ARM64 support VMBUS ring buffers must be page aligned. Therefore, the current value of 24K presents a challenge on ARM64 kernels (with 64K pages). So, use VMBUS_RING_SIZE() to ensure they are always aligned and large enough to hold all of the relevant data. Cc: stable@vger.kernel.org Fixes: 77ffe33363c0 ("hv_sock: use HV_HYP_PAGE_SIZE for Hyper-V communication") Tested-by: Dexuan Cui Reviewed-by: Dexuan Cui Signed-off-by: Hamza Mahfooz Acked-by: Stefano Garzarella Link: https://patch.msgid.link/20260428125339.13963-1-hamzamahfooz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/hyperv_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index f862988c1e86..7a8963595bf9 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -375,10 +375,10 @@ static void hvs_open_connection(struct vmbus_channel *chan) } else { sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); - sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE); + sndbuf = VMBUS_RING_SIZE(sndbuf); rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); - rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE); + rcvbuf = VMBUS_RING_SIZE(rcvbuf); } chan->max_pkt_size = HVS_MAX_PKT_SIZE; From 4ca01292ea2f2363660610a65ba0285d7c3309ed Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 28 Apr 2026 08:53:16 +0200 Subject: [PATCH 63/79] net: airoha: Do not return err in ndo_stop() callback Always complete the airoha_dev_stop() routine regardless of the airoha_set_vip_for_gdm_port() return value, since errors from ndo_stop() are ignored by the networking stack and the interface is always considered down after the call. Fixes: 23020f049327 ("net: airoha: Introduce ethernet support for EN7581 SoC") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260428-airoha-ndo-stop-not-err-v1-1-674506d29a91@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 5effb4a4ae84..f8b3d53bccad 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -1747,13 +1747,10 @@ static int airoha_dev_stop(struct net_device *dev) { struct airoha_gdm_port *port = netdev_priv(dev); struct airoha_qdma *qdma = port->qdma; - int i, err; + int i; netif_tx_disable(dev); - err = airoha_set_vip_for_gdm_port(port, false); - if (err) - return err; - + airoha_set_vip_for_gdm_port(port, false); for (i = 0; i < dev->num_tx_queues; i++) netdev_tx_reset_subqueue(dev, i); From c4f050ce06c56cfb5993268af4a5cb66ed1cd04e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2026 12:32:07 +0000 Subject: [PATCH 64/79] bonding: 3ad: implement proper RCU rules for port->aggregator syzbot found a data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler [1] which hints at lack of proper RCU implementation. Add __rcu qualifier to port->aggregator, and add proper RCU API. [1] BUG: KCSAN: data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler write to 0xffff88813cf5c4b0 of 8 bytes by task 36 on cpu 0: ad_port_selection_logic drivers/net/bonding/bond_3ad.c:1659 [inline] bond_3ad_state_machine_handler+0x9d5/0x2d60 drivers/net/bonding/bond_3ad.c:2569 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0x4f0/0x9c0 kernel/workqueue.c:3385 worker_thread+0x58a/0x780 kernel/workqueue.c:3466 kthread+0x22a/0x280 kernel/kthread.c:436 ret_from_fork+0x146/0x330 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 read to 0xffff88813cf5c4b0 of 8 bytes by task 22063 on cpu 1: __bond_3ad_get_active_agg_info drivers/net/bonding/bond_3ad.c:2858 [inline] bond_3ad_get_active_agg_info+0x8c/0x230 drivers/net/bonding/bond_3ad.c:2881 bond_fill_info+0xe0f/0x10f0 drivers/net/bonding/bond_netlink.c:853 rtnl_link_info_fill net/core/rtnetlink.c:906 [inline] rtnl_link_fill+0x1d7/0x4e0 net/core/rtnetlink.c:927 rtnl_fill_ifinfo+0xf8e/0x1380 net/core/rtnetlink.c:2168 rtmsg_ifinfo_build_skb+0x11c/0x1b0 net/core/rtnetlink.c:4453 rtmsg_ifinfo_event net/core/rtnetlink.c:4486 [inline] rtmsg_ifinfo+0x6d/0x110 net/core/rtnetlink.c:4495 __dev_notify_flags+0x76/0x390 net/core/dev.c:9790 netif_change_flags+0xac/0xd0 net/core/dev.c:9823 do_setlink+0x905/0x2950 net/core/rtnetlink.c:3180 rtnl_group_changelink net/core/rtnetlink.c:3813 [inline] __rtnl_newlink net/core/rtnetlink.c:3981 [inline] rtnl_newlink+0xf55/0x1400 net/core/rtnetlink.c:4109 rtnetlink_rcv_msg+0x64b/0x720 net/core/rtnetlink.c:6995 netlink_rcv_skb+0x123/0x220 net/netlink/af_netlink.c:2550 rtnetlink_rcv+0x1c/0x30 net/core/rtnetlink.c:7022 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x5a8/0x680 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x5c8/0x6f0 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:787 [inline] __sock_sendmsg net/socket.c:802 [inline] ____sys_sendmsg+0x563/0x5b0 net/socket.c:2698 ___sys_sendmsg+0x195/0x1e0 net/socket.c:2752 __sys_sendmsg net/socket.c:2784 [inline] __do_sys_sendmsg net/socket.c:2789 [inline] __se_sys_sendmsg net/socket.c:2787 [inline] __x64_sys_sendmsg+0xd4/0x160 net/socket.c:2787 x64_sys_call+0x194c/0x3020 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x12c/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x0000000000000000 -> 0xffff88813cf5c400 Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 22063 Comm: syz.0.31122 Tainted: G W syzkaller #0 PREEMPT(full) Tainted: [W]=WARN Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Fixes: 47e91f56008b ("bonding: use RCU protection for 3ad xmit path") Reported-by: syzbot+9bb2ff2a4ab9e17307e1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f0a82f.050a0220.3aadc4.0000.GAE@google.com/ Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Andrew Lunn Link: https://patch.msgid.link/20260428123207.3809211-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_3ad.c | 109 ++++++++++++++----------- drivers/net/bonding/bond_main.c | 8 +- drivers/net/bonding/bond_netlink.c | 16 ++-- drivers/net/bonding/bond_procfs.c | 3 +- drivers/net/bonding/bond_sysfs_slave.c | 17 ++-- include/net/bond_3ad.h | 2 +- 6 files changed, 89 insertions(+), 66 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index af7f74cfdc08..f0aa7d2f2171 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1029,6 +1029,7 @@ static void ad_cond_set_peer_notif(struct port *port) static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); + struct aggregator *aggregator; mux_states_t last_state; /* keep current State Machine state to compare later if it was @@ -1036,6 +1037,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ last_state = port->sm_mux_state; + aggregator = rcu_dereference(port->aggregator); if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { @@ -1055,7 +1057,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } @@ -1070,7 +1072,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * update ready variable, we check READY_N and update * READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state @@ -1086,7 +1088,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { - if (port->aggregator->is_active) { + if (aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) @@ -1102,9 +1104,9 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; - } else if (port->aggregator->is_active) { + } else if (aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } @@ -1115,7 +1117,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator->is_active && + if (aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1134,7 +1136,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ struct slave *slave = port->slave; - if (port->aggregator->is_active && + if (aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; @@ -1154,8 +1156,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator && - port->aggregator->is_active && + if (aggregator && + aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1187,7 +1189,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: - if (port->aggregator->is_active) + if (aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else @@ -1561,9 +1563,9 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ - if (port->aggregator) { + temp_aggregator = rcu_dereference(port->aggregator); + if (temp_aggregator) { /* detach the port from its former aggregator */ - temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { @@ -1586,7 +1588,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) /* clear the port's relations to this * aggregator */ - port->aggregator = NULL; + RCU_INIT_POINTER(port->aggregator, NULL); port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; @@ -1609,7 +1611,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, - port->aggregator->aggregator_identifier); + temp_aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ @@ -1633,15 +1635,15 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) ) ) { /* attach to the founded aggregator */ - port->aggregator = aggregator; + rcu_assign_pointer(port->aggregator, aggregator); port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; - port->aggregator->num_of_ports++; + aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; @@ -1656,39 +1658,40 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) if (!found) { if (free_aggregator) { /* assign port a new aggregator */ - port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + free_aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ - port->aggregator->is_individual = false; + free_aggregator->is_individual = false; else - port->aggregator->is_individual = true; + free_aggregator->is_individual = true; - port->aggregator->actor_admin_aggregator_key = + free_aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; - port->aggregator->actor_oper_aggregator_key = + free_aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; - port->aggregator->partner_system = + free_aggregator->partner_system = port->partner_oper.system; - port->aggregator->partner_system_priority = + free_aggregator->partner_system_priority = port->partner_oper.system_priority; - port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; - port->aggregator->receive_state = 1; - port->aggregator->transmit_state = 1; - port->aggregator->lag_ports = port; - port->aggregator->num_of_ports++; + free_aggregator->partner_oper_aggregator_key = port->partner_oper.key; + free_aggregator->receive_state = 1; + free_aggregator->transmit_state = 1; + free_aggregator->lag_ports = port; + free_aggregator->num_of_ports++; + + rcu_assign_pointer(port->aggregator, free_aggregator); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + free_aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", @@ -1700,13 +1703,12 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ - __set_agg_ports_ready(port->aggregator, - __agg_ports_are_ready(port->aggregator)); + aggregator = rcu_dereference(port->aggregator); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); - aggregator = __get_first_agg(port); - ad_agg_selection_logic(aggregator, update_slave_arr); + ad_agg_selection_logic(__get_first_agg(port), update_slave_arr); - if (!port->aggregator->is_active) + if (!aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } @@ -2075,13 +2077,15 @@ static void ad_initialize_port(struct port *port, const struct bond_params *bond */ static void ad_enable_collecting(struct port *port) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_collecting_port(port); } } @@ -2093,11 +2097,13 @@ static void ad_enable_collecting(struct port *port) */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2114,11 +2120,13 @@ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; @@ -2135,11 +2143,13 @@ static void ad_enable_collecting_distributing(struct port *port, static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2379,7 +2389,7 @@ void bond_3ad_unbind_slave(struct slave *slave) */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { - temp_port->aggregator = new_aggregator; + rcu_assign_pointer(temp_port->aggregator, new_aggregator); temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } @@ -2848,15 +2858,16 @@ out: int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { - struct aggregator *aggregator = NULL; + struct aggregator *aggregator = NULL, *tmp; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); - if (port->aggregator && port->aggregator->is_active) { - aggregator = port->aggregator; + tmp = rcu_dereference(port->aggregator); + if (tmp && tmp->is_active) { + aggregator = tmp; break; } } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c7baa5c4bf40..af82a3df2c5d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1433,7 +1433,7 @@ static void bond_poll_controller(struct net_device *bond_dev) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = - SLAVE_AD_INFO(slave)->port.aggregator; + rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg && agg->aggregator_identifier != ad_info.aggregator_id) @@ -5179,15 +5179,16 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } + rcu_read_lock(); bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { - struct aggregator *agg; + const struct aggregator *agg; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (!agg || agg->aggregator_identifier != agg_id) continue; } @@ -5199,6 +5200,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) usable_slaves->arr[usable_slaves->count++] = slave; } + rcu_read_unlock(); bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index ea1a80e658ae..c7d3e0602c83 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -66,27 +66,29 @@ static int bond_fill_slave_info(struct sk_buff *skb, const struct port *ad_port; ad_port = &SLAVE_AD_INFO(slave)->port; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg) { if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, agg->aggregator_identifier)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, ad_port->actor_oper_port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE, ad_port->sm_churn_actor_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE, ad_port->sm_churn_partner_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; } + rcu_read_unlock(); if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, SLAVE_AD_INFO(slave)->port_priority)) @@ -95,6 +97,8 @@ static int bond_fill_slave_info(struct sk_buff *skb, return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: return -EMSGSIZE; } diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index e34f80305191..3714aab1a3d9 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -188,6 +188,7 @@ static void bond_info_show_master(struct seq_file *seq) } } +/* Note: runs under rcu_read_lock() */ static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { @@ -214,7 +215,7 @@ static void bond_info_show_slave(struct seq_file *seq, if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; - const struct aggregator *agg = port->aggregator; + const struct aggregator *agg = rcu_dereference(port->aggregator); if (agg) { seq_printf(seq, "Aggregator ID: %d\n", diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index 36d0e8440b5b..fc6fe7181789 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -62,10 +62,15 @@ static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { - agg = SLAVE_AD_INFO(slave)->port.aggregator; - if (agg) - return sysfs_emit(buf, "%d\n", - agg->aggregator_identifier); + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); + if (agg) { + ssize_t res = sysfs_emit(buf, "%d\n", + agg->aggregator_identifier); + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); } return sysfs_emit(buf, "N/A\n"); @@ -78,7 +83,7 @@ static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } @@ -93,7 +98,7 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c92d4a976246..05572c19e14b 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -243,7 +243,7 @@ typedef struct port { churn_state_t sm_churn_actor_state; churn_state_t sm_churn_partner_state; struct slave *slave; /* pointer to the bond slave that this port belongs to */ - struct aggregator *aggregator; /* pointer to an aggregator that this port related to */ + struct aggregator __rcu *aggregator; /* pointer to an aggregator that this port related to */ struct port *next_port_in_aggregator; /* Next port on the linked list of the parent aggregator */ u32 transaction_id; /* continuous number for identification of Marker PDU's; */ struct lacpdu lacpdu; /* the lacpdu that will be sent for this port */ From 5ef343614db766acdc01c56d66e780a1b43c6ac6 Mon Sep 17 00:00:00 2001 From: Hasan Basbunar Date: Tue, 28 Apr 2026 19:07:39 +0200 Subject: [PATCH 65/79] page_pool: fix memory-provider leak in page_pool_create_percpu() error path When page_pool_create_percpu() fails on page_pool_list(), it falls through to its err_uninit: label, which calls page_pool_uninit(). At that point page_pool_init() has already taken two references when the user requested PP_FLAG_ALLOW_UNREADABLE_NETMEM: pool->mp_ops->init(pool) static_branch_inc(&page_pool_mem_providers); Neither is undone by page_pool_uninit(); both are only undone by __page_pool_destroy() (success-side teardown). The error path therefore leaks the per-provider reference taken by mp_ops->init (io_zcrx_ifq->refs in the io_uring zcrx provider, the dmabuf binding refcount in the devmem provider) plus one increment of the page_pool_mem_providers static branch on every failure of xa_alloc_cyclic() inside page_pool_list(). The leaked io_zcrx_ifq->refs in turn pins everything io_zcrx_ifq_free() would release on cleanup: ifq->user (uid), ifq->mm_account (mmdrop), ifq->dev (device refcount), ifq->netdev_tracker (netdev refcount), and the rbuf region. The leaked static branch increment forces all subsequent page_pool_alloc_netmems() and page_pool_return_page() callers to take the slow mp_ops branch for the lifetime of the kernel. Reachable via the io_uring zcrx path: io_uring_register(IORING_REGISTER_ZCRX_IFQ) /* CAP_NET_ADMIN */ -> __io_uring_register -> io_register_zcrx -> zcrx_register_netdev -> netif_mp_open_rxq -> driver ndo_queue_mem_alloc -> page_pool_create_percpu -> page_pool_init succeeds (mp_ops->init runs, branch++) -> page_pool_list fails (xa_alloc_cyclic -ENOMEM) -> goto err_uninit <-- leak The same shape applies to the devmem dmabuf provider via mp_dmabuf_devmem_init()/mp_dmabuf_devmem_destroy(). Restore the cleanup symmetry by moving the mp_ops->destroy() and static_branch_dec() calls out of __page_pool_destroy() and into page_pool_uninit(), so page_pool_uninit() is again the strict inverse of page_pool_init(). page_pool_uninit() has only two callers (the err_uninit: path and __page_pool_destroy()), so this preserves the single-call invariant on the success path while fixing the err path. The error path of page_pool_init() itself still skips the mp_ops cleanup correctly: mp_ops->init is the last action that takes a reference before page_pool_init() returns 0, so when it returns an error neither the refcount nor the static branch has been touched. Triggering the bug requires xa_alloc_cyclic() to fail with -ENOMEM, which under normal GFP_KERNEL retry behaviour is rare. It is deterministic under CONFIG_FAULT_INJECTION with fail_page_alloc / xa fault injection, or under sustained memory pressure. The leak is silent: there is no warning, and the released kernel build continues running with a permanently-incremented static branch. Fixes: 0f9214046893 ("memory-provider: dmabuf devmem memory provider") Signed-off-by: Hasan Basbunar Link: https://patch.msgid.link/20260428170739.34881-1-basbunarhasan@gmail.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 877bbf7a1938..6e576dec80db 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -327,6 +327,11 @@ static void page_pool_uninit(struct page_pool *pool) if (!pool->system) free_percpu(pool->recycle_stats); #endif + + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } } /** @@ -1146,11 +1151,6 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_ops) { - pool->mp_ops->destroy(pool); - static_branch_dec(&page_pool_mem_providers); - } - kfree(pool); } From 70d62b669f1f9080a25278fc90b64309f4ae8959 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:13 -0700 Subject: [PATCH 66/79] iavf: rename IAVF_VLAN_IS_NEW to IAVF_VLAN_ADDING Rename the IAVF_VLAN_IS_NEW state to IAVF_VLAN_ADDING to better describe what the state represents: an ADD request has been sent to the PF and is waiting for a response. This is a pure rename with no behavioral change, preparing for a cleanup of the VLAN filter state machine. Signed-off-by: Petr Oros Reviewed-by: Aleksandr Loktionov Tested-by: Rafal Romanowski Reviewed-by: Simon Horman Reviewed-by: Przemek Kitszel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-1-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/iavf/iavf.h | 2 +- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index e9fb0a0919e3..47a862ca5e2c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -158,7 +158,7 @@ struct iavf_vlan { enum iavf_vlan_state_t { IAVF_VLAN_INVALID, IAVF_VLAN_ADD, /* filter needs to be added */ - IAVF_VLAN_IS_NEW, /* filter is new, wait for PF answer */ + IAVF_VLAN_ADDING, /* ADD sent to PF, waiting for response */ IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ IAVF_VLAN_DISABLE, /* filter needs to be deleted by PF, then marked INACTIVE */ IAVF_VLAN_INACTIVE, /* filter is inactive, we are in IFF_DOWN */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index a52c100dcbc5..6b06ae872a0c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -746,7 +746,7 @@ static void iavf_vlan_add_reject(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_IS_NEW) { + if (f->state == IAVF_VLAN_ADDING) { list_del(&f->list); kfree(f); adapter->num_vlan_filters--; @@ -812,7 +812,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) if (f->state == IAVF_VLAN_ADD) { vvfl->vlan_id[i] = f->vlan.vid; i++; - f->state = IAVF_VLAN_IS_NEW; + f->state = IAVF_VLAN_ADDING; if (i == count) break; } @@ -874,7 +874,7 @@ void iavf_add_vlans(struct iavf_adapter *adapter) vlan->tpid = f->vlan.tpid; i++; - f->state = IAVF_VLAN_IS_NEW; + f->state = IAVF_VLAN_ADDING; } } @@ -2910,7 +2910,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_IS_NEW) + if (f->state == IAVF_VLAN_ADDING) f->state = IAVF_VLAN_ACTIVE; } spin_unlock_bh(&adapter->mac_vlan_list_lock); From f2ce65b9b917474a1a6ce68d357e15fac2aca0f2 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:14 -0700 Subject: [PATCH 67/79] iavf: stop removing VLAN filters from PF on interface down When a VF goes down, the driver currently sends DEL_VLAN to the PF for every VLAN filter (ACTIVE -> DISABLE -> send DEL -> INACTIVE), then re-adds them all on UP (INACTIVE -> ADD -> send ADD -> ADDING -> ACTIVE). This round-trip is unnecessary because: 1. The PF disables the VF's queues via VIRTCHNL_OP_DISABLE_QUEUES, which already prevents all RX/TX traffic regardless of VLAN filter state. 2. The VLAN filters remaining in PF HW while the VF is down is harmless - packets matching those filters have nowhere to go with queues disabled. 3. The DEL+ADD cycle during down/up creates race windows where the VLAN filter list is incomplete. With spoofcheck enabled, the PF enables TX VLAN filtering on the first non-zero VLAN add, blocking traffic for any VLANs not yet re-added. Remove the entire DISABLE/INACTIVE state machinery: - Remove IAVF_VLAN_DISABLE and IAVF_VLAN_INACTIVE enum values - Remove iavf_restore_filters() and its call from iavf_open() - Remove VLAN filter handling from iavf_clear_mac_vlan_filters(), rename it to iavf_clear_mac_filters() - Remove DEL_VLAN_FILTER scheduling from iavf_down() - Remove all DISABLE/INACTIVE handling from iavf_del_vlans() VLAN filters now stay ACTIVE across down/up cycles. Only explicit user removal (ndo_vlan_rx_kill_vid) or PF/VF reset triggers VLAN filter deletion/re-addition. Fixes: ed1f5b58ea01 ("i40evf: remove VLAN filters on close") Signed-off-by: Petr Oros Reviewed-by: Aleksandr Loktionov Tested-by: Rafal Romanowski Reviewed-by: Simon Horman Reviewed-by: Przemek Kitszel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-2-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/iavf/iavf.h | 6 +-- drivers/net/ethernet/intel/iavf/iavf_main.c | 39 ++----------------- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 33 +++------------- 3 files changed, 12 insertions(+), 66 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 47a862ca5e2c..5765715914d6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -159,10 +159,8 @@ enum iavf_vlan_state_t { IAVF_VLAN_INVALID, IAVF_VLAN_ADD, /* filter needs to be added */ IAVF_VLAN_ADDING, /* ADD sent to PF, waiting for response */ - IAVF_VLAN_ACTIVE, /* filter is accepted by PF */ - IAVF_VLAN_DISABLE, /* filter needs to be deleted by PF, then marked INACTIVE */ - IAVF_VLAN_INACTIVE, /* filter is inactive, we are in IFF_DOWN */ - IAVF_VLAN_REMOVE, /* filter needs to be removed from list */ + IAVF_VLAN_ACTIVE, /* PF confirmed, filter is in HW */ + IAVF_VLAN_REMOVE, /* filter queued for DEL from PF */ }; struct iavf_vlan_filter { diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 3c1465cf0515..ca29038c0016 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -801,27 +801,6 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) spin_unlock_bh(&adapter->mac_vlan_list_lock); } -/** - * iavf_restore_filters - * @adapter: board private structure - * - * Restore existing non MAC filters when VF netdev comes back up - **/ -static void iavf_restore_filters(struct iavf_adapter *adapter) -{ - struct iavf_vlan_filter *f; - - /* re-add all VLAN filters */ - spin_lock_bh(&adapter->mac_vlan_list_lock); - - list_for_each_entry(f, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_INACTIVE) - f->state = IAVF_VLAN_ADD; - } - - spin_unlock_bh(&adapter->mac_vlan_list_lock); - adapter->aq_required |= IAVF_FLAG_AQ_ADD_VLAN_FILTER; -} /** * iavf_get_num_vlans_added - get number of VLANs added @@ -1246,13 +1225,12 @@ static void iavf_up_complete(struct iavf_adapter *adapter) } /** - * iavf_clear_mac_vlan_filters - Remove mac and vlan filters not sent to PF - * yet and mark other to be removed. + * iavf_clear_mac_filters - Remove MAC filters not sent to PF yet and mark + * others to be removed. * @adapter: board private structure **/ -static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) +static void iavf_clear_mac_filters(struct iavf_adapter *adapter) { - struct iavf_vlan_filter *vlf, *vlftmp; struct iavf_mac_filter *f, *ftmp; spin_lock_bh(&adapter->mac_vlan_list_lock); @@ -1271,11 +1249,6 @@ static void iavf_clear_mac_vlan_filters(struct iavf_adapter *adapter) } } - /* disable all VLAN filters */ - list_for_each_entry_safe(vlf, vlftmp, &adapter->vlan_filter_list, - list) - vlf->state = IAVF_VLAN_DISABLE; - spin_unlock_bh(&adapter->mac_vlan_list_lock); } @@ -1371,7 +1344,7 @@ void iavf_down(struct iavf_adapter *adapter) iavf_napi_disable_all(adapter); iavf_irq_disable(adapter); - iavf_clear_mac_vlan_filters(adapter); + iavf_clear_mac_filters(adapter); iavf_clear_cloud_filters(adapter); iavf_clear_fdir_filters(adapter); iavf_clear_adv_rss_conf(adapter); @@ -1388,8 +1361,6 @@ void iavf_down(struct iavf_adapter *adapter) */ if (!list_empty(&adapter->mac_filter_list)) adapter->aq_required |= IAVF_FLAG_AQ_DEL_MAC_FILTER; - if (!list_empty(&adapter->vlan_filter_list)) - adapter->aq_required |= IAVF_FLAG_AQ_DEL_VLAN_FILTER; if (!list_empty(&adapter->cloud_filter_list)) adapter->aq_required |= IAVF_FLAG_AQ_DEL_CLOUD_FILTER; if (!list_empty(&adapter->fdir_list_head)) @@ -4494,8 +4465,6 @@ static int iavf_open(struct net_device *netdev) iavf_add_filter(adapter, adapter->hw.mac.addr); spin_unlock_bh(&adapter->mac_vlan_list_lock); - /* Restore filters that were removed with IFF_DOWN */ - iavf_restore_filters(adapter); iavf_restore_fdir_filters(adapter); iavf_configure(adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 6b06ae872a0c..4f197d908124 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -911,22 +911,12 @@ void iavf_del_vlans(struct iavf_adapter *adapter) spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - /* since VLAN capabilities are not allowed, we dont want to send - * a VLAN delete request because it will most likely fail and - * create unnecessary errors/noise, so just free the VLAN - * filters marked for removal to enable bailing out before - * sending a virtchnl message - */ if (f->state == IAVF_VLAN_REMOVE && !VLAN_FILTERING_ALLOWED(adapter)) { list_del(&f->list); kfree(f); adapter->num_vlan_filters--; - } else if (f->state == IAVF_VLAN_DISABLE && - !VLAN_FILTERING_ALLOWED(adapter)) { - f->state = IAVF_VLAN_INACTIVE; - } else if (f->state == IAVF_VLAN_REMOVE || - f->state == IAVF_VLAN_DISABLE) { + } else if (f->state == IAVF_VLAN_REMOVE) { count++; } } @@ -959,13 +949,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_DISABLE) { - vvfl->vlan_id[i] = f->vlan.vid; - f->state = IAVF_VLAN_INACTIVE; - i++; - if (i == count) - break; - } else if (f->state == IAVF_VLAN_REMOVE) { + if (f->state == IAVF_VLAN_REMOVE) { vvfl->vlan_id[i] = f->vlan.vid; list_del(&f->list); kfree(f); @@ -1007,8 +991,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { - if (f->state == IAVF_VLAN_DISABLE || - f->state == IAVF_VLAN_REMOVE) { + if (f->state == IAVF_VLAN_REMOVE) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; struct virtchnl_vlan *vlan; @@ -1022,13 +1005,9 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vlan->tci = f->vlan.vid; vlan->tpid = f->vlan.tpid; - if (f->state == IAVF_VLAN_DISABLE) { - f->state = IAVF_VLAN_INACTIVE; - } else { - list_del(&f->list); - kfree(f); - adapter->num_vlan_filters--; - } + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; i++; if (i == count) break; From bbcbe4ed70dea948849549af7edf44bd42bbd695 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:15 -0700 Subject: [PATCH 68/79] iavf: wait for PF confirmation before removing VLAN filters The VLAN filter DELETE path was asymmetric with the ADD path: ADD waits for PF confirmation (ADD -> ADDING -> ACTIVE), but DELETE immediately frees the filter struct after sending the DEL message without waiting for the PF response. This is problematic because: - If the PF rejects the DEL, the filter remains in HW but the driver has already freed the tracking structure, losing sync. - Race conditions between DEL pending and other operations (add, reset) cannot be properly resolved if the filter struct is already gone. Add IAVF_VLAN_REMOVING state to make the DELETE path symmetric: REMOVE -> REMOVING (send DEL) -> PF confirms -> kfree -> PF rejects -> ACTIVE In iavf_del_vlans(), transition filters from REMOVE to REMOVING instead of immediately freeing them. The new DEL completion handler in iavf_virtchnl_completion() frees filters on success or reverts them to ACTIVE on error. Update iavf_add_vlan() to handle the REMOVING state: if a DEL is pending and the user re-adds the same VLAN, queue it for ADD so it gets re-programmed after the PF processes the DEL. The !VLAN_FILTERING_ALLOWED early-exit path still frees filters directly since no PF message is sent in that case. Also update iavf_del_vlan() to skip filters already in REMOVING state: DEL has been sent to PF and the completion handler will free the filter when PF confirms. Without this guard, the sequence DEL(pending) -> user-del -> second DEL could cause the PF to return an error for the second DEL (filter already gone), causing the completion handler to incorrectly revert a deleted filter back to ACTIVE. Fixes: 968996c070ef ("iavf: Fix VLAN_V2 addition/rejection") Signed-off-by: Petr Oros Reviewed-by: Aleksandr Loktionov Tested-by: Rafal Romanowski Reviewed-by: Przemek Kitszel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-3-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++++--- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 37 +++++++++++++------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 5765715914d6..050f8241ef5e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -161,6 +161,7 @@ enum iavf_vlan_state_t { IAVF_VLAN_ADDING, /* ADD sent to PF, waiting for response */ IAVF_VLAN_ACTIVE, /* PF confirmed, filter is in HW */ IAVF_VLAN_REMOVE, /* filter queued for DEL from PF */ + IAVF_VLAN_REMOVING, /* DEL sent to PF, waiting for response */ }; struct iavf_vlan_filter { diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index ca29038c0016..d2914c511e1e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -757,10 +757,10 @@ iavf_vlan_filter *iavf_add_vlan(struct iavf_adapter *adapter, adapter->num_vlan_filters++; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_ADD_VLAN_FILTER); } else if (f->state == IAVF_VLAN_REMOVE) { - /* Re-add the filter since we cannot tell whether the - * pending delete has already been processed by the PF. - * A duplicate add is harmless. - */ + /* DEL not yet sent to PF, cancel it */ + f->state = IAVF_VLAN_ACTIVE; + } else if (f->state == IAVF_VLAN_REMOVING) { + /* DEL already sent to PF, re-add after completion */ f->state = IAVF_VLAN_ADD; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_ADD_VLAN_FILTER); @@ -791,11 +791,14 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, struct iavf_vlan vlan) list_del(&f->list); kfree(f); adapter->num_vlan_filters--; - } else { + } else if (f->state != IAVF_VLAN_REMOVING) { f->state = IAVF_VLAN_REMOVE; iavf_schedule_aq_request(adapter, IAVF_FLAG_AQ_DEL_VLAN_FILTER); } + /* If REMOVING, DEL is already sent to PF; completion + * handler will free the filter when PF confirms. + */ } spin_unlock_bh(&adapter->mac_vlan_list_lock); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 4f197d908124..93ca79c3e3b5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -948,12 +948,10 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl->vsi_id = adapter->vsi_res->vsi_id; vvfl->num_elements = count; - list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { + list_for_each_entry(f, &adapter->vlan_filter_list, list) { if (f->state == IAVF_VLAN_REMOVE) { vvfl->vlan_id[i] = f->vlan.vid; - list_del(&f->list); - kfree(f); - adapter->num_vlan_filters--; + f->state = IAVF_VLAN_REMOVING; i++; if (i == count) break; @@ -990,7 +988,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vvfl_v2->vport_id = adapter->vsi_res->vsi_id; vvfl_v2->num_elements = count; - list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, list) { + list_for_each_entry(f, &adapter->vlan_filter_list, list) { if (f->state == IAVF_VLAN_REMOVE) { struct virtchnl_vlan_supported_caps *filtering_support = &adapter->vlan_v2_caps.filtering.filtering_support; @@ -1005,9 +1003,7 @@ void iavf_del_vlans(struct iavf_adapter *adapter) vlan->tci = f->vlan.vid; vlan->tpid = f->vlan.tpid; - list_del(&f->list); - kfree(f); - adapter->num_vlan_filters--; + f->state = IAVF_VLAN_REMOVING; i++; if (i == count) break; @@ -2370,10 +2366,6 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, ether_addr_copy(adapter->hw.mac.addr, netdev->dev_addr); wake_up(&adapter->vc_waitqueue); break; - case VIRTCHNL_OP_DEL_VLAN: - dev_err(&adapter->pdev->dev, "Failed to delete VLAN filter, error %s\n", - iavf_stat_str(&adapter->hw, v_retval)); - break; case VIRTCHNL_OP_DEL_ETH_ADDR: dev_err(&adapter->pdev->dev, "Failed to delete MAC filter, error %s\n", iavf_stat_str(&adapter->hw, v_retval)); @@ -2895,6 +2887,27 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->mac_vlan_list_lock); } break; + case VIRTCHNL_OP_DEL_VLAN: + case VIRTCHNL_OP_DEL_VLAN_V2: { + struct iavf_vlan_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->vlan_filter_list, + list) { + if (f->state == IAVF_VLAN_REMOVING) { + if (v_retval) { + /* PF rejected DEL, keep filter */ + f->state = IAVF_VLAN_ACTIVE; + } else { + list_del(&f->list); + kfree(f); + adapter->num_vlan_filters--; + } + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); + } + break; case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: /* PF enabled vlan strip on this VF. * Update netdev->features if needed to be in sync with ethtool. From 34d33313b52eeac3a97ad2e3176d523ec70d9283 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:16 -0700 Subject: [PATCH 69/79] iavf: add VIRTCHNL_OP_ADD_VLAN to success completion handler The V1 ADD_VLAN opcode had no success handler; filters sent via V1 stayed in ADDING state permanently. Add a fallthrough case so V1 filters also transition ADDING -> ACTIVE on PF confirmation. Critically, add an `if (v_retval) break` guard: the error switch in iavf_virtchnl_completion() does NOT return after handling errors, it falls through to the success switch. Without this guard, a PF-rejected ADD would incorrectly mark ADDING filters as ACTIVE, creating a driver/HW mismatch where the driver believes the filter is installed but the PF never accepted it. For V2, this is harmless: iavf_vlan_add_reject() in the error block already kfree'd all ADDING filters, so the success handler finds nothing to transition. Fixes: 968996c070ef ("iavf: Fix VLAN_V2 addition/rejection") Signed-off-by: Petr Oros Reviewed-by: Aleksandr Loktionov Tested-by: Rafal Romanowski Reviewed-by: Przemek Kitszel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-4-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 93ca79c3e3b5..4f2defd2331b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -2876,9 +2876,13 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, spin_unlock_bh(&adapter->adv_rss_lock); } break; + case VIRTCHNL_OP_ADD_VLAN: case VIRTCHNL_OP_ADD_VLAN_V2: { struct iavf_vlan_filter *f; + if (v_retval) + break; + spin_lock_bh(&adapter->mac_vlan_list_lock); list_for_each_entry(f, &adapter->vlan_filter_list, list) { if (f->state == IAVF_VLAN_ADDING) From 54ef02487914c24170c7e1c061e45212dc55365e Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:17 -0700 Subject: [PATCH 70/79] ice: fix NULL pointer dereference in ice_reset_all_vfs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ice_reset_all_vfs() ignores the return value of ice_vf_rebuild_vsi(). When the VSI rebuild fails (e.g. during NVM firmware update via nvmupdate64e), ice_vsi_rebuild() tears down the VSI on its error path, leaving txq_map and rxq_map as NULL. The subsequent unconditional call to ice_vf_post_vsi_rebuild() leads to a NULL pointer dereference in ice_ena_vf_q_mappings() when it accesses vsi->txq_map[0]. The single-VF reset path in ice_reset_vf() already handles this correctly by checking the return value of ice_vf_reconfig_vsi() and skipping ice_vf_post_vsi_rebuild() on failure. Apply the same pattern to ice_reset_all_vfs(): check the return value of ice_vf_rebuild_vsi() and skip ice_vf_post_vsi_rebuild() and ice_eswitch_attach_vf() on failure. The VF is left safely disabled (ICE_VF_STATE_INIT not set, VFGEN_RSTAT not set to VFACTIVE) and can be recovered via a VFLR triggered by a PCI reset of the VF (sysfs reset or driver rebind). Note that this patch does not prevent the VF VSI rebuild from failing during NVM update — the underlying cause is firmware being in a transitional state while the EMP reset is processed, which can cause Admin Queue commands (ice_add_vsi, ice_cfg_vsi_lan) to fail. This patch only prevents the subsequent NULL pointer dereference that crashes the kernel when the rebuild does fail. crash> bt PID: 50795 TASK: ff34c9ee708dc680 CPU: 1 COMMAND: "kworker/u512:5" #0 [ff72159bcfe5bb50] machine_kexec at ffffffffaa8850ee #1 [ff72159bcfe5bba8] __crash_kexec at ffffffffaaa15fba #2 [ff72159bcfe5bc68] crash_kexec at ffffffffaaa16540 #3 [ff72159bcfe5bc70] oops_end at ffffffffaa837eda #4 [ff72159bcfe5bc90] page_fault_oops at ffffffffaa893997 #5 [ff72159bcfe5bce8] exc_page_fault at ffffffffab528595 #6 [ff72159bcfe5bd10] asm_exc_page_fault at ffffffffab600bb2 [exception RIP: ice_ena_vf_q_mappings+0x79] RIP: ffffffffc0a85b29 RSP: ff72159bcfe5bdc8 RFLAGS: 00010206 RAX: 00000000000f0000 RBX: ff34c9efc9c00000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000010 RDI: ff34c9efc9c00000 RBP: ff34c9efc27d4828 R8: 0000000000000093 R9: 0000000000000040 R10: ff34c9efc27d4828 R11: 0000000000000040 R12: 0000000000100000 R13: 0000000000000010 R14: R15: ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ff72159bcfe5bdf8] ice_sriov_post_vsi_rebuild at ffffffffc0a85e2e [ice] #8 [ff72159bcfe5be08] ice_reset_all_vfs at ffffffffc0a920b4 [ice] #9 [ff72159bcfe5be48] ice_service_task at ffffffffc0a31519 [ice] #10 [ff72159bcfe5be88] process_one_work at ffffffffaa93dca4 #11 [ff72159bcfe5bec8] worker_thread at ffffffffaa93e9de #12 [ff72159bcfe5bf18] kthread at ffffffffaa946663 #13 [ff72159bcfe5bf50] ret_from_fork at ffffffffaa8086b9 The panic occurs attempting to dereference the NULL pointer in RDX at ice_sriov.c:294, which loads vsi->txq_map (offset 0x4b8 in ice_vsi). The faulting VSI is an allocated slab object but not fully initialized after a failed ice_vsi_rebuild(): crash> struct ice_vsi 0xff34c9efc27d4828 netdev = 0x0, rx_rings = 0x0, tx_rings = 0x0, q_vectors = 0x0, txq_map = 0x0, rxq_map = 0x0, alloc_txq = 0x10, num_txq = 0x10, alloc_rxq = 0x10, num_rxq = 0x10, The nvmupdate64e process was performing NVM firmware update: crash> bt 0xff34c9edd1a30000 PID: 49858 TASK: ff34c9edd1a30000 CPU: 1 COMMAND: "nvmupdate64e" #0 [ff72159bcd617618] __schedule at ffffffffab5333f8 #4 [ff72159bcd617750] ice_sq_send_cmd at ffffffffc0a35347 [ice] #5 [ff72159bcd6177a8] ice_sq_send_cmd_retry at ffffffffc0a35b47 [ice] #6 [ff72159bcd617810] ice_aq_send_cmd at ffffffffc0a38018 [ice] #7 [ff72159bcd617848] ice_aq_read_nvm at ffffffffc0a40254 [ice] #8 [ff72159bcd6178b8] ice_read_flat_nvm at ffffffffc0a4034c [ice] #9 [ff72159bcd617918] ice_devlink_nvm_snapshot at ffffffffc0a6ffa5 [ice] dmesg: ice 0000:13:00.0: firmware recommends not updating fw.mgmt, as it may result in a downgrade. continuing anyways ice 0000:13:00.1: ice_init_nvm failed -5 ice 0000:13:00.1: Rebuild failed, unload and reload driver Fixes: 12bb018c538c ("ice: Refactor VF reset") Signed-off-by: Petr Oros Tested-by: Rafal Romanowski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-5-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_vf_lib.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index 772f6b07340d..b1f46707dcc0 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -804,7 +804,12 @@ void ice_reset_all_vfs(struct ice_pf *pf) ice_vf_ctrl_invalidate_vsi(vf); ice_vf_pre_vsi_rebuild(vf); - ice_vf_rebuild_vsi(vf); + if (ice_vf_rebuild_vsi(vf)) { + dev_err(dev, "VF %u VSI rebuild failed, leaving VF disabled\n", + vf->vf_id); + mutex_unlock(&vf->cfg_lock); + continue; + } ice_vf_post_vsi_rebuild(vf); ice_eswitch_attach_vf(pf, vf); From 70ad216411e030f67b1743774e245601194aee6a Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:18 -0700 Subject: [PATCH 71/79] ice: fix infinite recursion in ice_cfg_tx_topo via ice_init_dev_hw On certain E810 configurations where firmware supports Tx scheduler topology switching (tx_sched_topo_comp_mode_en), ice_cfg_tx_topo() may need to apply a new 5-layer or 9-layer topology from the DDP package. If the AQ command to set the topology fails (e.g. due to invalid DDP data or firmware limitations), the global configuration lock must still be cleared via a CORER reset. Commit 86aae43f21cf ("ice: don't leave device non-functional if Tx scheduler config fails") correctly fixed this by refactoring ice_cfg_tx_topo() to always trigger CORER after acquiring the global lock and re-initialize hardware via ice_init_hw() afterwards. However, commit 8a37f9e2ff40 ("ice: move ice_deinit_dev() to the end of deinit paths") later moved ice_init_dev_hw() into ice_init_hw(), breaking the reinit path introduced by 86aae43f21cf. This creates an infinite recursive call chain: ice_init_hw() ice_init_dev_hw() ice_cfg_tx_topo() # topology change needed ice_deinit_hw() ice_init_hw() # reinit after CORER ice_init_dev_hw() # recurse ice_cfg_tx_topo() ... # stack overflow Fix by moving ice_init_dev_hw() back out of ice_init_hw() and calling it explicitly from ice_probe() and ice_devlink_reinit_up(). The third caller, ice_cfg_tx_topo(), intentionally does not need ice_init_dev_hw() during its reinit, it only needs the core HW reinitialization. This breaks the recursion cleanly without adding flags or guards. The deinit ordering changes from commit 8a37f9e2ff40 ("ice: move ice_deinit_dev() to the end of deinit paths") which fixed slow rmmod are preserved, only the init-side placement of ice_init_dev_hw() is reverted. Fixes: 8a37f9e2ff40 ("ice: move ice_deinit_dev() to the end of deinit paths") Signed-off-by: Petr Oros Reviewed-by: Paul Menzel Reviewed-by: Jacob Keller Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Tested-by: Alexander Nowlin Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-6-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/devlink/devlink.c | 2 ++ drivers/net/ethernet/intel/ice/ice_common.c | 2 -- drivers/net/ethernet/intel/ice/ice_main.c | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index 6144cee8034d..641d6e289d5c 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1245,6 +1245,8 @@ static int ice_devlink_reinit_up(struct ice_pf *pf) return err; } + ice_init_dev_hw(pf); + /* load MSI-X values */ ice_set_min_max_msix(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index ce11fea122d0..b617a6bff891 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1126,8 +1126,6 @@ int ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_fltr_mgmt_struct; - ice_init_dev_hw(hw->back); - mutex_init(&hw->tnl_lock); ice_init_chk_recipe_reuse_support(hw); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 5f92377d4dfc..1d1947a7fe11 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5245,6 +5245,8 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) return err; } + ice_init_dev_hw(pf); + adapter = ice_adapter_get(pdev); if (IS_ERR(adapter)) { err = PTR_ERR(adapter); From 56a643aed0f0af5c29ebb4593d4917b78344dd48 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:19 -0700 Subject: [PATCH 72/79] ice: fix missing SMA pin initialization in DPLL subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DPLL SMA/U.FL pin redesign introduced ice_dpll_sw_pin_frequency_get() which gates frequency reporting on the pin's active flag. This flag is determined by ice_dpll_sw_pins_update() from the PCA9575 GPIO expander state. Before the redesign, SMA pins were exposed as direct HW input/output pins and ice_dpll_frequency_get() returned the CGU frequency unconditionally — the PCA9575 state was never consulted. The PCA9575 powers on with all outputs high, setting ICE_SMA1_DIR_EN, ICE_SMA1_TX_EN, ICE_SMA2_DIR_EN and ICE_SMA2_TX_EN. Nothing in the driver writes the register during initialization, so ice_dpll_sw_pins_update() sees all pins as inactive and ice_dpll_sw_pin_frequency_get() permanently returns 0 Hz for every SW pin. Fix this by writing a default SMA configuration in ice_dpll_init_info_sw_pins(): clear all SMA bits, then set SMA1 and SMA2 as active inputs (DIR_EN=0) with U.FL1 output and U.FL2 input disabled. Each SMA/U.FL pair shares a physical signal path so only one pin per pair can be active at a time. U.FL pins still report frequency 0 after this fix: U.FL1 (output-only) is disabled by ICE_SMA1_TX_EN which keeps the TX output buffer off, and U.FL2 (input-only) is disabled by ICE_SMA2_UFL2_RX_DIS. They can be activated by changing the corresponding SMA pin direction via dpll netlink. Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control") Signed-off-by: Petr Oros Reviewed-by: Ivan Vecera Reviewed-by: Arkadiusz Kubalewski Tested-by: Alexander Nowlin Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-7-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_dpll.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 62f75701d652..498ec2c045f3 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -4014,6 +4014,7 @@ static int ice_dpll_init_info_sw_pins(struct ice_pf *pf) struct ice_dpll_pin *pin; u32 phase_adj_max, caps; int i, ret; + u8 data; if (pf->hw.device_id == ICE_DEV_ID_E810C_QSFP) input_idx_offset = ICE_E810_RCLK_PINS_NUM; @@ -4073,6 +4074,22 @@ static int ice_dpll_init_info_sw_pins(struct ice_pf *pf) } ice_dpll_phase_range_set(&pin->prop.phase_range, phase_adj_max); } + + /* Initialize the SMA control register to a known-good default state. + * Without this write the PCA9575 GPIO expander retains its power-on + * default (all outputs high) which makes all SW pins appear inactive. + * Set SMA1 and SMA2 as active inputs, disable U.FL1 output and + * U.FL2 input. + */ + ret = ice_read_sma_ctrl(&pf->hw, &data); + if (ret) + return ret; + data &= ~ICE_ALL_SMA_MASK; + data |= ICE_SMA1_TX_EN | ICE_SMA2_TX_EN | ICE_SMA2_UFL2_RX_DIS; + ret = ice_write_sma_ctrl(&pf->hw, data); + if (ret) + return ret; + ret = ice_dpll_pin_state_update(pf, pin, ICE_DPLL_PIN_TYPE_SOFTWARE, NULL); if (ret) From 6f9d8393c9f50fbc68b9c9e99f78ca5a7b43ff44 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:20 -0700 Subject: [PATCH 73/79] ice: fix SMA and U.FL pin state changes affecting paired pin SMA and U.FL pins share physical signal paths in pairs (SMA1/U.FL1 and SMA2/U.FL2) controlled by the PCA9575 GPIO expander. Each pair can only have one active pin at a time: SMA1 output and U.FL1 output share the same CGU output, SMA2 input and U.FL2 input share the same CGU input. The PCA9575 register bits determine which connector in each pair owns the signal path. The driver does not account for this pairing in two places: ice_dpll_ufl_pin_state_set() modifies PCA9575 bits and disables the backing CGU pin without checking whether the U.FL pin is currently active. Disconnecting an already inactive U.FL pin flips bits that the paired SMA pin relies on, breaking its connection. ice_dpll_sma_direction_set() does not propagate direction changes to the paired U.FL pin. For SMA2/U.FL2 the ICE_SMA2_UFL2_RX_DIS bit is never managed, so U.FL2 stays disconnected after SMA2 switches to output. For both pairs the backing CGU pin of the U.FL side is never enabled when a direction change activates it, so userspace sees the pin as disconnected even though the routing is correct. Fix by guarding the U.FL disconnect path against inactive pins and by updating the paired U.FL pin fully on SMA direction changes: manage ICE_SMA2_UFL2_RX_DIS for the SMA2/U.FL2 pair and enable the backing CGU pin whenever the peer becomes active. Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control") Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-8-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_dpll.c | 50 ++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 498ec2c045f3..3f8cd5b8298b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1171,6 +1171,8 @@ static int ice_dpll_sma_direction_set(struct ice_dpll_pin *p, enum dpll_pin_direction direction, struct netlink_ext_ack *extack) { + struct ice_dplls *d = &p->pf->dplls; + struct ice_dpll_pin *peer; u8 data; int ret; @@ -1189,8 +1191,9 @@ static int ice_dpll_sma_direction_set(struct ice_dpll_pin *p, case ICE_DPLL_PIN_SW_2_IDX: if (direction == DPLL_PIN_DIRECTION_INPUT) { data &= ~ICE_SMA2_DIR_EN; + data |= ICE_SMA2_UFL2_RX_DIS; } else { - data &= ~ICE_SMA2_TX_EN; + data &= ~(ICE_SMA2_TX_EN | ICE_SMA2_UFL2_RX_DIS); data |= ICE_SMA2_DIR_EN; } break; @@ -1202,6 +1205,34 @@ static int ice_dpll_sma_direction_set(struct ice_dpll_pin *p, ret = ice_dpll_pin_state_update(p->pf, p, ICE_DPLL_PIN_TYPE_SOFTWARE, extack); + if (ret) + return ret; + + /* When a direction change activates the paired U.FL pin, enable + * its backing CGU pin so the pin reports as connected. Without + * this the U.FL routing is correct but the CGU pin stays disabled + * and userspace sees the pin as disconnected. Do not disable the + * backing pin when U.FL becomes inactive because the SMA pin may + * still be using it. + */ + peer = &d->ufl[p->idx]; + if (peer->active) { + struct ice_dpll_pin *target; + enum ice_dpll_pin_type type; + + if (peer->output) { + target = peer->output; + type = ICE_DPLL_PIN_TYPE_OUTPUT; + } else { + target = peer->input; + type = ICE_DPLL_PIN_TYPE_INPUT; + } + ret = ice_dpll_pin_enable(&p->pf->hw, target, + d->eec.dpll_idx, type, extack); + if (!ret) + ret = ice_dpll_pin_state_update(p->pf, target, + type, extack); + } return ret; } @@ -1253,6 +1284,14 @@ ice_dpll_ufl_pin_state_set(const struct dpll_pin *pin, void *pin_priv, data &= ~ICE_SMA1_MASK; enable = true; } else if (state == DPLL_PIN_STATE_DISCONNECTED) { + /* Skip if U.FL1 is not active, setting TX_EN + * while DIR_EN is set would also deactivate + * the paired SMA1 output. + */ + if (data & (ICE_SMA1_DIR_EN | ICE_SMA1_TX_EN)) { + ret = 0; + goto unlock; + } data |= ICE_SMA1_TX_EN; enable = false; } else { @@ -1267,6 +1306,15 @@ ice_dpll_ufl_pin_state_set(const struct dpll_pin *pin, void *pin_priv, data &= ~ICE_SMA2_UFL2_RX_DIS; enable = true; } else if (state == DPLL_PIN_STATE_DISCONNECTED) { + /* Skip if U.FL2 is not active, setting + * UFL2_RX_DIS could also disable the paired + * SMA2 input. + */ + if (!(data & ICE_SMA2_DIR_EN) || + (data & ICE_SMA2_UFL2_RX_DIS)) { + ret = 0; + goto unlock; + } data |= ICE_SMA2_UFL2_RX_DIS; enable = false; } else { From 620055cb1036a6125fd912e7a14b47a6572b809b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 27 Apr 2026 22:22:21 -0700 Subject: [PATCH 74/79] dpll: export __dpll_pin_change_ntf() for use under dpll_lock Export __dpll_pin_change_ntf() so that drivers can send pin change notifications from within pin callbacks, which are already called under dpll_lock. Using dpll_pin_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Acked-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-9-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/dpll/dpll_netlink.c | 10 ++++++++++ drivers/dpll/dpll_netlink.h | 2 -- include/linux/dpll.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index af7ce62ec55c..0ff1658c2dc1 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -900,11 +900,21 @@ int dpll_pin_delete_ntf(struct dpll_pin *pin) return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin); } +/** + * __dpll_pin_change_ntf - notify that the pin has been changed + * @pin: registered pin pointer + * + * Context: caller must hold dpll_lock. Suitable for use inside pin + * callbacks which are already invoked under dpll_lock. + * Return: 0 if succeeds, error code otherwise. + */ int __dpll_pin_change_ntf(struct dpll_pin *pin) { + lockdep_assert_held(&dpll_lock); dpll_pin_notify(pin, DPLL_PIN_CHANGED); return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin); } +EXPORT_SYMBOL_GPL(__dpll_pin_change_ntf); /** * dpll_pin_change_ntf - notify that the pin has been changed diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h index dd28b56d27c5..a9cfd55f57fc 100644 --- a/drivers/dpll/dpll_netlink.h +++ b/drivers/dpll/dpll_netlink.h @@ -11,5 +11,3 @@ int dpll_device_delete_ntf(struct dpll_device *dpll); int dpll_pin_create_ntf(struct dpll_pin *pin); int dpll_pin_delete_ntf(struct dpll_pin *pin); - -int __dpll_pin_change_ntf(struct dpll_pin *pin); diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b7277a8b484d..f8037f1ab20b 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -286,6 +286,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, int dpll_device_change_ntf(struct dpll_device *dpll); +int __dpll_pin_change_ntf(struct dpll_pin *pin); int dpll_pin_change_ntf(struct dpll_pin *pin); int register_dpll_notifier(struct notifier_block *nb); From 1a41b58fd4dc80dca16c717e6e77c88b9d4e83a7 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:22 -0700 Subject: [PATCH 75/79] ice: fix missing dpll notifications for SW pins The SMA/U.FL pin redesign (commit 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control")) introduced software-controlled pins that wrap backing CGU input/output pins, but never updated the notification and data paths to propagate pin events to these SW wrappers. The periodic work sends dpll_pin_change_ntf() only for direct CGU input pins. SW pins that wrap these inputs never receive change or phase offset notifications, so userspace consumers such as synce4l monitoring SMA pins via dpll netlink never learn about state transitions or phase offset updates. Similarly, ice_dpll_phase_offset_get() reads the SW pin's own phase_offset field which is never updated; the PPS monitor writes to the backing CGU input's field instead. Fix by introducing ice_dpll_pin_ntf(), a wrapper around dpll_pin_change_ntf() that also notifies any registered SMA/U.FL pin whose backing CGU input matches. Replace all direct dpll_pin_change_ntf() calls in the periodic notification paths with this wrapper. Fix ice_dpll_phase_offset_get() to return the backing CGU input's phase_offset for input-direction SW pins. Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control") Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Aleksandr Loktionov Reviewed-by: Ivan Vecera Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-10-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_dpll.c | 47 +++++++++++++++++------ 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 3f8cd5b8298b..721a3f4d6a28 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1963,7 +1963,10 @@ ice_dpll_phase_offset_get(const struct dpll_pin *pin, void *pin_priv, d->active_input == p->input->pin)) *phase_offset = d->phase_offset * ICE_DPLL_PHASE_OFFSET_FACTOR; else if (d->phase_offset_monitor_period) - *phase_offset = p->phase_offset * ICE_DPLL_PHASE_OFFSET_FACTOR; + *phase_offset = (p->input && + p->direction == DPLL_PIN_DIRECTION_INPUT ? + p->input->phase_offset : + p->phase_offset) * ICE_DPLL_PHASE_OFFSET_FACTOR; else *phase_offset = 0; mutex_unlock(&pf->dplls.lock); @@ -2657,6 +2660,27 @@ static u64 ice_generate_clock_id(struct ice_pf *pf) return pci_get_dsn(pf->pdev); } +/** + * ice_dpll_pin_ntf - notify pin change including any SW pin wrappers + * @dplls: pointer to dplls struct + * @pin: the dpll_pin that changed + * + * Send a change notification for @pin and for any registered SMA/U.FL pin + * whose backing CGU input matches @pin. + */ +static void ice_dpll_pin_ntf(struct ice_dplls *dplls, struct dpll_pin *pin) +{ + dpll_pin_change_ntf(pin); + for (int i = 0; i < ICE_DPLL_PIN_SW_NUM; i++) { + if (dplls->sma[i].pin && dplls->sma[i].input && + dplls->sma[i].input->pin == pin) + dpll_pin_change_ntf(dplls->sma[i].pin); + if (dplls->ufl[i].pin && dplls->ufl[i].input && + dplls->ufl[i].input->pin == pin) + dpll_pin_change_ntf(dplls->ufl[i].pin); + } +} + /** * ice_dpll_notify_changes - notify dpll subsystem about changes * @d: pointer do dpll @@ -2665,6 +2689,7 @@ static u64 ice_generate_clock_id(struct ice_pf *pf) */ static void ice_dpll_notify_changes(struct ice_dpll *d) { + struct ice_dplls *dplls = &d->pf->dplls; bool pin_notified = false; if (d->prev_dpll_state != d->dpll_state) { @@ -2673,17 +2698,17 @@ static void ice_dpll_notify_changes(struct ice_dpll *d) } if (d->prev_input != d->active_input) { if (d->prev_input) - dpll_pin_change_ntf(d->prev_input); + ice_dpll_pin_ntf(dplls, d->prev_input); d->prev_input = d->active_input; if (d->active_input) { - dpll_pin_change_ntf(d->active_input); + ice_dpll_pin_ntf(dplls, d->active_input); pin_notified = true; } } if (d->prev_phase_offset != d->phase_offset) { d->prev_phase_offset = d->phase_offset; if (!pin_notified && d->active_input) - dpll_pin_change_ntf(d->active_input); + ice_dpll_pin_ntf(dplls, d->active_input); } } @@ -2712,6 +2737,7 @@ static bool ice_dpll_is_pps_phase_monitor(struct ice_pf *pf) /** * ice_dpll_pins_notify_mask - notify dpll subsystem about bulk pin changes + * @dplls: pointer to dplls struct * @pins: array of ice_dpll_pin pointers registered within dpll subsystem * @pin_num: number of pins * @phase_offset_ntf_mask: bitmask of pin indexes to notify @@ -2721,15 +2747,14 @@ static bool ice_dpll_is_pps_phase_monitor(struct ice_pf *pf) * * Context: Must be called while pf->dplls.lock is released. */ -static void ice_dpll_pins_notify_mask(struct ice_dpll_pin *pins, +static void ice_dpll_pins_notify_mask(struct ice_dplls *dplls, + struct ice_dpll_pin *pins, u8 pin_num, u32 phase_offset_ntf_mask) { - int i = 0; - - for (i = 0; i < pin_num; i++) - if (phase_offset_ntf_mask & (1 << i)) - dpll_pin_change_ntf(pins[i].pin); + for (int i = 0; i < pin_num; i++) + if (phase_offset_ntf_mask & BIT(i)) + ice_dpll_pin_ntf(dplls, pins[i].pin); } /** @@ -2905,7 +2930,7 @@ static void ice_dpll_periodic_work(struct kthread_work *work) ice_dpll_notify_changes(de); ice_dpll_notify_changes(dp); if (phase_offset_ntf) - ice_dpll_pins_notify_mask(d->inputs, d->num_inputs, + ice_dpll_pins_notify_mask(d, d->inputs, d->num_inputs, phase_offset_ntf); resched: From 9e5dead140af10e8b5f975b8f04e46197d48d274 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 27 Apr 2026 22:22:23 -0700 Subject: [PATCH 76/79] ice: add dpll peer notification for paired SMA and U.FL pins SMA and U.FL pins share physical signal paths in pairs (SMA1/U.FL1 and SMA2/U.FL2). When one pin's state changes via a PCA9575 GPIO write, the paired pin's state also changes, but no notification is sent for the peer pin. Userspace consumers monitoring the peer via dpll netlink subscribe never learn about the update. Add ice_dpll_sw_pin_notify_peer() which sends a change notification for the paired SW pin. Call it from ice_dpll_pin_sma_direction_set(), ice_dpll_sma_pin_state_set(), and ice_dpll_ufl_pin_state_set() after pf->dplls.lock is released. Use __dpll_pin_change_ntf() because dpll_lock is still held by the dpll netlink layer (dpll_pin_pre_doit). Fixes: 2dd5d03c77e2 ("ice: redesign dpll sma/u.fl pins control") Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-11-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_dpll.c | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 721a3f4d6a28..27b460926bac 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1154,6 +1154,32 @@ ice_dpll_input_state_get(const struct dpll_pin *pin, void *pin_priv, extack, ICE_DPLL_PIN_TYPE_INPUT); } +/** + * ice_dpll_sw_pin_notify_peer - notify the paired SW pin after a state change + * @d: pointer to dplls struct + * @changed: the SW pin that was explicitly changed (already notified by dpll core) + * + * SMA and U.FL pins share physical signal paths in pairs (SMA1/U.FL1 and + * SMA2/U.FL2). When one pin's routing changes via the PCA9575 GPIO + * expander, the paired pin's state may also change. Send a change + * notification for the peer pin so userspace consumers monitoring the + * peer via dpll netlink learn about the update. + * + * Context: Called from dpll_pin_ops callbacks after pf->dplls.lock is + * released. Uses __dpll_pin_change_ntf() because dpll_lock is + * still held by the dpll netlink layer. + */ +static void ice_dpll_sw_pin_notify_peer(struct ice_dplls *d, + struct ice_dpll_pin *changed) +{ + struct ice_dpll_pin *peer; + + peer = (changed >= d->sma && changed < d->sma + ICE_DPLL_PIN_SW_NUM) ? + &d->ufl[changed->idx] : &d->sma[changed->idx]; + if (peer->pin) + __dpll_pin_change_ntf(peer->pin); +} + /** * ice_dpll_sma_direction_set - set direction of SMA pin * @p: pointer to a pin @@ -1344,6 +1370,8 @@ ice_dpll_ufl_pin_state_set(const struct dpll_pin *pin, void *pin_priv, unlock: mutex_unlock(&pf->dplls.lock); + if (!ret) + ice_dpll_sw_pin_notify_peer(&pf->dplls, p); return ret; } @@ -1462,6 +1490,8 @@ ice_dpll_sma_pin_state_set(const struct dpll_pin *pin, void *pin_priv, unlock: mutex_unlock(&pf->dplls.lock); + if (!ret) + ice_dpll_sw_pin_notify_peer(&pf->dplls, sma); return ret; } @@ -1657,6 +1687,8 @@ ice_dpll_pin_sma_direction_set(const struct dpll_pin *pin, void *pin_priv, mutex_lock(&pf->dplls.lock); ret = ice_dpll_sma_direction_set(p, direction, extack); mutex_unlock(&pf->dplls.lock); + if (!ret) + ice_dpll_sw_pin_notify_peer(&pf->dplls, p); return ret; } From 58689498ca3384851145a754dbb1d8ed1cf9fb54 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Apr 2026 16:15:59 -0700 Subject: [PATCH 77/79] net: tls: fix strparser anchor skb leak on offload RX setup failure When tls_set_device_offload_rx() fails at tls_dev_add(), the error path calls tls_sw_free_resources_rx() to clean up the SW context that was initialized by tls_set_sw_offload(). This function calls tls_sw_release_resources_rx() (which stops the strparser via tls_strp_stop()) and tls_sw_free_ctx_rx() (which kfrees the context), but never frees the anchor skb that was allocated by alloc_skb(0) in tls_strp_init(). Note that tls_sw_free_resources_rx() is exclusively used for this "failed to start offload" code path, there's no other caller. The leak did not exist before commit 84c61fe1a75b ("tls: rx: do not use the standard strparser"), because the standard strparser doesn't try to pre-allocate an skb. The normal close path in tls_sk_proto_close() handles cleanup by calling tls_sw_strparser_done() (which calls tls_strp_done()) after dropping the socket lock, because tls_strp_done() does cancel_work_sync() and the strparser work handler takes the socket lock. Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Signed-off-by: Jakub Kicinski Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260428231559.1358502-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 6 ++++++ net/tls/tls_sw.c | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/net/tls/tls.h b/net/tls/tls.h index e8f81a006520..12f44cb649c9 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -188,6 +188,7 @@ int tls_strp_dev_init(void); void tls_strp_dev_exit(void); void tls_strp_done(struct tls_strparser *strp); +void __tls_strp_done(struct tls_strparser *strp); void tls_strp_stop(struct tls_strparser *strp); int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 98e12f0ff57e..c72e88317627 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -624,6 +624,12 @@ void tls_strp_done(struct tls_strparser *strp) WARN_ON(!strp->stopped); cancel_work_sync(&strp->work); + __tls_strp_done(strp); +} + +/* For setup error paths where the strparser was initialized but never armed. */ +void __tls_strp_done(struct tls_strparser *strp) +{ tls_strp_anchor_free(strp); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 94d2ae0daa8c..798243eabb1f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2624,8 +2624,12 @@ void tls_sw_free_ctx_rx(struct tls_context *tls_ctx) void tls_sw_free_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx; + + ctx = tls_sw_ctx_rx(tls_ctx); tls_sw_release_resources_rx(sk); + __tls_strp_done(&ctx->strp); tls_sw_free_ctx_rx(tls_ctx); } From 051ffb001b8a232cfa6e72f38bb5f51c4270a60b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 Apr 2026 09:48:17 +0300 Subject: [PATCH 78/79] sfc: fix error code in efx_devlink_info_running_versions() Return -EIO if efx_mcdi_rpc() doesn't return enough space. Fixes: 14743ddd2495 ("sfc: add devlink info support for ef100") Signed-off-by: Dan Carpenter Reviewed-by: Edward Cree Link: https://patch.msgid.link/afGpsbLRHL4_H0KS@stanley.mountain Signed-off-by: Paolo Abeni --- drivers/net/ethernet/sfc/efx_devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sfc/efx_devlink.c b/drivers/net/ethernet/sfc/efx_devlink.c index d842c60dfc10..e5c6f81af48b 100644 --- a/drivers/net/ethernet/sfc/efx_devlink.c +++ b/drivers/net/ethernet/sfc/efx_devlink.c @@ -531,7 +531,7 @@ static int efx_devlink_info_running_versions(struct efx_nic *efx, if (rc || outlength < MC_CMD_GET_VERSION_OUT_LEN) { netif_err(efx, drv, efx->net_dev, "mcdi MC_CMD_GET_VERSION failed\n"); - return rc; + return rc ?: -EIO; } /* Handle previous output */ From 1e01abec856593e02cd69fd95b784c10dd46880c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 29 Apr 2026 09:39:11 +0200 Subject: [PATCH 79/79] net/sched: cls_flower: revert unintended changes While applying the blamed commit 4ca07b9239bd ("net: mctp i2c: check length before marking flow active"), I unintentionally included unrelated and unacceptable changes. Revert them. Fixes: 4ca07b9239bd ("net: mctp i2c: check length before marking flow active") Reported-by: Jeremy Kerr Closes: https://lore.kernel.org/netdev/bd8704fe0bd53e278add5cde4873256656623e2e.camel@codeconstruct.com.au/ Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/043026a53ff84da88b17648c4b0d17f0331749cb.1777447863.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- net/sched/cls_flower.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b9672ea05747..88f8a32fab2b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -556,7 +556,6 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); - struct fl_flow_mask *mask; *last = false; @@ -573,12 +572,11 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, list_del_rcu(&f->list); spin_unlock(&tp->lock); - mask = f->mask; + *last = fl_mask_put(head, f->mask); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f, rtnl_held, extack); tcf_unbind_filter(tp, &f->res); __fl_put(f); - *last = fl_mask_put(head, mask); return 0; }