net: introduce ndo_set_rx_mode_async and netdev_rx_mode_work

Add ndo_set_rx_mode_async callback that drivers can implement instead
of the legacy ndo_set_rx_mode. The legacy callback runs under the
netif_addr_lock spinlock with BHs disabled, preventing drivers from
sleeping. The async variant runs from a work queue with rtnl_lock and
netdev_lock_ops held, in fully sleepable context.

When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules
netdev_rx_mode_work instead of calling the driver inline. The work
function takes two snapshots of each address list (uc/mc) under
the addr_lock, then drops the lock and calls the driver with the
work copies. After the driver returns, it reconciles the snapshots
back to the real lists under the lock.

Add netif_rx_mode_sync() to opportunistically execute the pending
workqueue update inline, so that rx mode changes are committed
before returning to userspace:
  - dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK)
  - dev_set_promiscuity
  - dev_set_allmulti
  - dev_ifsioc SIOCADDMULTI / SIOCDELMULTI
  - do_setlink (RTM_SETLINK)

Note that some deep hierarchies still skip the inline sync of lower-device updates made via:
  - dev_uc_sync
  - dev_mc_sync

If we do end up hitting user-visible issues, we can add more calls to
netif_rx_mode_sync in specific places. But hopefully we should not:
the actual user-visible lists are still synced, it's just that the HW
state might be lagging.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20260416185712.2155425-3-sdf@fomichev.me
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
master
Stanislav Fomichev 2026-04-16 11:56:59 -07:00 committed by Paolo Abeni
parent db9e726525
commit 3554b4345d
8 changed files with 249 additions and 43 deletions

View File

@ -289,6 +289,15 @@ ndo_tx_timeout:
ndo_set_rx_mode:
Synchronization: netif_addr_lock spinlock.
Context: BHs disabled
Notes: Deprecated in favor of ndo_set_rx_mode_async which runs
in process context.
ndo_set_rx_mode_async:
Synchronization: rtnl_lock() semaphore. In addition, netdev instance
lock if the driver implements queue management or shaper API.
Context: process (from a work queue)
Notes: Async version of ndo_set_rx_mode which runs in process
context. Receives snapshots of the unicast and multicast address lists.
ndo_setup_tc:
``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks

View File

@ -1119,6 +1119,16 @@ struct netdev_net_notifier {
* This function is called when the device changes address list filtering.
* If driver handles unicast address filtering, it should set
* IFF_UNICAST_FLT in its priv_flags.
* Cannot sleep, called with netif_addr_lock_bh held.
* Deprecated in favor of ndo_set_rx_mode_async.
*
* void (*ndo_set_rx_mode_async)(struct net_device *dev,
* struct netdev_hw_addr_list *uc,
* struct netdev_hw_addr_list *mc);
* Async version of ndo_set_rx_mode which runs in process context
* with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
* are snapshots of the address lists - iterate with
* netdev_hw_addr_list_for_each(ha, uc).
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
@ -1439,6 +1449,10 @@ struct net_device_ops {
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
void (*ndo_set_rx_mode_async)(
struct net_device *dev,
struct netdev_hw_addr_list *uc,
struct netdev_hw_addr_list *mc);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
@ -1903,6 +1917,8 @@ enum netdev_reg_state {
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
* @rx_mode_node: List entry for rx_mode work processing
* @rx_mode_tracker: Refcount tracker for rx_mode work
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
@ -2294,6 +2310,8 @@ struct net_device {
unsigned int promiscuity;
unsigned int allmulti;
bool uc_promisc;
struct list_head rx_mode_node;
netdevice_tracker rx_mode_tracker;
#ifdef CONFIG_LOCKDEP
unsigned char nested_level;
#endif

View File

@ -9593,7 +9593,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
ops->ndo_change_rx_flags(dev, flags);
}
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
unsigned int promiscuity, flags;
@ -9697,46 +9697,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
* filtering it is put in promiscuous mode while unicast addresses
* are present.
*
* Does nothing when the device is not IFF_UP or is not present.
* dev_set_rx_mode() calls this with netif_addr_lock_bh() held.
*/
void __dev_set_rx_mode(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
/* dev_open will call this function so the list will stay sane. */
if (!(dev->flags&IFF_UP))
return;
if (!netif_device_present(dev))
return;
/* No unicast filtering: fall back to promiscuous mode while the unicast
* list is non-empty. dev->uc_promisc records whether this fallback is
* currently applied, so the promiscuity count is raised and dropped
* exactly once per transition.
*/
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
__dev_set_promiscuity(dev, 1, false);
dev->uc_promisc = true;
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
__dev_set_promiscuity(dev, -1, false);
dev->uc_promisc = false;
}
}
/* The driver callback is optional. */
if (ops->ndo_set_rx_mode)
ops->ndo_set_rx_mode(dev);
}
/* Locked wrapper: run __dev_set_rx_mode() with the device's address-list
* lock (netif_addr_lock_bh) held for the duration of the call.
*/
void dev_set_rx_mode(struct net_device *dev)
{
netif_addr_lock_bh(dev);
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
}
/**
* netif_get_flags() - get flags reported to userspace
@ -12127,6 +12087,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
mutex_init(&dev->lock);
INIT_LIST_HEAD(&dev->rx_mode_node);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);

View File

@ -165,6 +165,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
bool netif_rx_mode_clean(struct net_device *dev);
void netif_rx_mode_sync(struct net_device *dev);
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges, u32 portid,

View File

@ -11,10 +11,18 @@
#include <linux/rtnetlink.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <kunit/visibility.h>
#include "dev.h"
/* Work function that drains rx_mode_list; defined below. */
static void netdev_rx_mode_work(struct work_struct *work);
/* Devices with a pending async rx-mode update, linked via dev->rx_mode_node. */
static LIST_HEAD(rx_mode_list);
/* Protects rx_mode_list and every device's rx_mode_node membership. */
static DEFINE_SPINLOCK(rx_mode_lock);
/* Single shared work item; one run processes all devices on rx_mode_list. */
static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
/*
* General list handling functions
*/
@ -1156,3 +1164,204 @@ void dev_mc_init(struct net_device *dev)
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);
/*
 * Copy dev's unicast and multicast address lists into caller-provided lists.
 *
 * Each of dev->uc and dev->mc is snapshotted twice: once into the *_snap
 * list and once into the *_ref list. The copies are taken in the order
 * uc_snap, uc_ref, mc_snap, mc_ref and stop at the first failure.
 *
 * Returns 0 on success, otherwise the error from __hw_addr_list_snapshot().
 * On failure uc_snap, uc_ref and mc_snap are flushed. mc_ref is the last
 * list to be filled and is not flushed here.
 * NOTE(review): presumably __hw_addr_list_snapshot() releases its own
 * partially built destination when it fails - confirm.
 *
 * netif_rx_mode_run() calls this with netif_addr_lock_bh() held.
 */
static int netif_addr_lists_snapshot(struct net_device *dev,
				     struct netdev_hw_addr_list *uc_snap,
				     struct netdev_hw_addr_list *mc_snap,
				     struct netdev_hw_addr_list *uc_ref,
				     struct netdev_hw_addr_list *mc_ref)
{
	int ret;

	ret = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len);
	if (ret)
		goto err_flush;

	ret = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len);
	if (ret)
		goto err_flush;

	ret = __hw_addr_list_snapshot(mc_snap, &dev->mc, dev->addr_len);
	if (ret)
		goto err_flush;

	ret = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len);
	if (ret)
		goto err_flush;

	return 0;

err_flush:
	/* Drop whatever the earlier, successful copies allocated. */
	__hw_addr_flush(uc_snap);
	__hw_addr_flush(uc_ref);
	__hw_addr_flush(mc_snap);
	return ret;
}
/*
 * Fold the snapshots back into the device's real address lists.
 *
 * The unicast pair (uc_snap, uc_ref) is reconciled into dev->uc first,
 * then the multicast pair (mc_snap, mc_ref) into dev->mc. The *_snap
 * lists are the copies that were handed to the driver; the *_ref lists
 * are the second copies taken at snapshot time and never shown to it.
 *
 * netif_rx_mode_run() calls this with netif_addr_lock_bh() held.
 */
static void netif_addr_lists_reconcile(struct net_device *dev,
				       struct netdev_hw_addr_list *uc_snap,
				       struct netdev_hw_addr_list *mc_snap,
				       struct netdev_hw_addr_list *uc_ref,
				       struct netdev_hw_addr_list *mc_ref)
{
	/* Unicast list. */
	__hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len);

	/* Multicast list. */
	__hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len);
}
static void netif_rx_mode_run(struct net_device *dev)
{
struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
const struct net_device_ops *ops = dev->netdev_ops;
int err;
might_sleep();
netdev_ops_assert_locked(dev);
__hw_addr_init(&uc_snap);
__hw_addr_init(&mc_snap);
__hw_addr_init(&uc_ref);
__hw_addr_init(&mc_ref);
if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
return;
netif_addr_lock_bh(dev);
err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
&uc_ref, &mc_ref);
if (err) {
netdev_WARN(dev, "failed to sync uc/mc addresses\n");
netif_addr_unlock_bh(dev);
return;
}
netif_addr_unlock_bh(dev);
ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
netif_addr_lock_bh(dev);
netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
&uc_ref, &mc_ref);
netif_addr_unlock_bh(dev);
}
/*
 * Work function for rx_mode_work: process every device on rx_mode_list.
 *
 * Holds rtnl_lock for the whole run. Devices are popped one at a time
 * under rx_mode_lock; each one is then handled with netdev_lock_ops()
 * held and rx_mode_lock released. @work is unused.
 */
static void netdev_rx_mode_work(struct work_struct *work)
{
	struct net_device *dev;

	rtnl_lock();

	for (;;) {
		spin_lock_bh(&rx_mode_lock);
		if (list_empty(&rx_mode_list))
			break;

		dev = list_first_entry(&rx_mode_list, struct net_device,
				       rx_mode_node);
		list_del_init(&dev->rx_mode_node);
		/* The netdev tracker must be freed while still under
		 * the spinlock.
		 */
		netdev_tracker_free(dev, &dev->rx_mode_tracker);
		spin_unlock_bh(&rx_mode_lock);

		netdev_lock_ops(dev);
		netif_rx_mode_run(dev);
		netdev_unlock_ops(dev);

		/* netdev_tracker_free() already ran above, hence the plain
		 * __dev_put(). It must come after netdev_unlock_ops() to
		 * prevent netdev_run_todo() from freeing the device while
		 * still in use.
		 */
		__dev_put(dev);
	}
	/* The loop is only ever left with rx_mode_lock held. */
	spin_unlock_bh(&rx_mode_lock);

	rtnl_unlock();
}
/*
 * Mark dev as needing an async rx-mode update and kick the worker.
 *
 * A device that is not yet on rx_mode_list is appended and a tracked
 * reference is taken on it (GFP_ATOMIC, as this runs under a spinlock).
 * A device that is already queued is left as is. rx_mode_work is
 * scheduled in both cases.
 */
static void netif_rx_mode_queue(struct net_device *dev)
{
	spin_lock_bh(&rx_mode_lock);

	/* Already pending: the existing entry and reference cover it. */
	if (!list_empty(&dev->rx_mode_node))
		goto out_unlock;

	list_add_tail(&dev->rx_mode_node, &rx_mode_list);
	netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);

out_unlock:
	spin_unlock_bh(&rx_mode_lock);
	schedule_work(&rx_mode_work);
}
/**
 * __dev_set_rx_mode() - upload unicast and multicast address lists to device
 *			 and configure RX filtering.
 * @dev: device
 *
 * Does nothing unless the device is up (IFF_UP) and present.
 *
 * When the device doesn't support unicast filtering (IFF_UNICAST_FLT not
 * set) it is put in promiscuous mode while unicast addresses are present.
 * This fallback is applied regardless of which rx-mode callback the driver
 * implements.
 *
 * Drivers implementing ndo_set_rx_mode_async() then have the update queued
 * for the rx_mode work item and this function returns without calling the
 * driver. Otherwise the legacy ndo_set_rx_mode(), if any, is called inline.
 *
 * Context: dev_set_rx_mode() calls this with netif_addr_lock_bh() held.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags & IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	/* Must run before the async hand-off below: returning early for
	 * async drivers would otherwise leave a device without unicast
	 * filtering out of promiscuous mode (or stuck in it).
	 */
	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	/* Async drivers are programmed later, from process context. */
	if (ops->ndo_set_rx_mode_async) {
		netif_rx_mode_queue(dev);
		return;
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}
/*
 * Locked variant of __dev_set_rx_mode(): takes the device's address-list
 * lock (netif_addr_lock_bh) around the call and releases it afterwards.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
/*
 * Remove dev from the pending rx-mode list, if it is on it.
 *
 * Returns true when the device was queued and has been dequeued, false
 * when there was nothing pending. On true, the netdev tracker taken at
 * queue time has been freed but the reference count itself has not been
 * dropped: the caller still has to call __dev_put().
 */
bool netif_rx_mode_clean(struct net_device *dev)
{
	bool was_pending;

	spin_lock_bh(&rx_mode_lock);
	was_pending = !list_empty(&dev->rx_mode_node);
	if (was_pending) {
		list_del_init(&dev->rx_mode_node);
		/* The netdev tracker has to be released while still
		 * under the spinlock.
		 */
		netdev_tracker_free(dev, &dev->rx_mode_tracker);
	}
	spin_unlock_bh(&rx_mode_lock);

	return was_pending;
}
/**
 * netif_rx_mode_sync() - sync rx mode inline
 * @dev: network device
 *
 * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
 * executed from a workqueue. This allows the callback to sleep, but means
 * the hardware update is deferred and may not yet have taken effect by the
 * time the initiating syscall returns. netif_rx_mode_sync() steals the
 * pending workqueue update, if there is one, and executes it inline, so the
 * operation still looks atomic to userspace.
 *
 * Does nothing when no update is pending for @dev.
 */
void netif_rx_mode_sync(struct net_device *dev)
{
	if (!netif_rx_mode_clean(dev))
		return;

	netif_rx_mode_run(dev);

	/* netif_rx_mode_clean() already called netdev_tracker_free(), so
	 * only the bare reference is left to drop with __dev_put().
	 */
	__dev_put(dev);
}

View File

@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
netdev_lock_ops(dev);
ret = netif_change_flags(dev, flags, extack);
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_promiscuity(dev, inc);
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;
@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
netdev_lock_ops(dev);
ret = netif_set_allmulti(dev, inc, true);
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return ret;

View File

@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return err;
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;
case SIOCDELMULTI:
if (!ops->ndo_set_rx_mode ||
if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
netdev_lock_ops(dev);
err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;

View File

@ -3431,6 +3431,7 @@ errout:
dev->name);
}
netif_rx_mode_sync(dev);
netdev_unlock_ops(dev);
return err;