net: introduce ndo_set_rx_mode_async and netdev_rx_mode_work
Add ndo_set_rx_mode_async callback that drivers can implement instead of the legacy ndo_set_rx_mode. The legacy callback runs under the netif_addr_lock spinlock with BHs disabled, preventing drivers from sleeping. The async variant runs from a work queue with rtnl_lock and netdev_lock_ops held, in fully sleepable context.

When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules netdev_rx_mode_work instead of calling the driver inline. The work function takes two snapshots of each address list (uc/mc) under the addr_lock, then drops the lock and calls the driver with the work copies. After the driver returns, it reconciles the snapshots back to the real lists under the lock.

Add netif_rx_mode_sync() to opportunistically execute the pending workqueue update inline, so that rx mode changes are committed before returning to userspace:
- dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK)
- dev_set_promiscuity
- dev_set_allmulti
- dev_ifsioc SIOCADDMULTI / SIOCDELMULTI
- do_setlink (RTM_SETLINK)

Note that some deep hierarchies still skip the lower updates via:
- dev_uc_sync
- dev_mc_sync

If we do end up hitting user-visible issues, we can add more calls to netif_rx_mode_sync in specific places. But hopefully we will not need to: the actual user-visible lists are still synced; it is just the HW state that might be lagging.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20260416185712.2155425-3-sdf@fomichev.me
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
master
parent
db9e726525
commit
3554b4345d
|
|
@ -289,6 +289,15 @@ ndo_tx_timeout:
|
|||
ndo_set_rx_mode:
|
||||
Synchronization: netif_addr_lock spinlock.
|
||||
Context: BHs disabled
|
||||
Notes: Deprecated in favor of ndo_set_rx_mode_async which runs
|
||||
in process context.
|
||||
|
||||
ndo_set_rx_mode_async:
|
||||
Synchronization: rtnl_lock() semaphore. In addition, netdev instance
|
||||
lock if the driver implements queue management or shaper API.
|
||||
Context: process (from a work queue)
|
||||
Notes: Async version of ndo_set_rx_mode which runs in process
|
||||
context. Receives snapshots of the unicast and multicast address lists.
|
||||
|
||||
ndo_setup_tc:
|
||||
``TC_SETUP_BLOCK`` and ``TC_SETUP_FT`` are running under NFT locks
|
||||
|
|
|
|||
|
|
@ -1119,6 +1119,16 @@ struct netdev_net_notifier {
|
|||
* This function is called when the device changes address list filtering.
|
||||
* If driver handles unicast address filtering, it should set
|
||||
* IFF_UNICAST_FLT in its priv_flags.
|
||||
* Cannot sleep, called with netif_addr_lock_bh held.
|
||||
* Deprecated in favor of ndo_set_rx_mode_async.
|
||||
*
|
||||
* void (*ndo_set_rx_mode_async)(struct net_device *dev,
|
||||
* struct netdev_hw_addr_list *uc,
|
||||
* struct netdev_hw_addr_list *mc);
|
||||
* Async version of ndo_set_rx_mode which runs in process context
|
||||
* with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters
|
||||
* are snapshots of the address lists - iterate with
|
||||
* netdev_hw_addr_list_for_each(ha, uc).
|
||||
*
|
||||
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
|
||||
* This function is called when the Media Access Control address
|
||||
|
|
@ -1439,6 +1449,10 @@ struct net_device_ops {
|
|||
void (*ndo_change_rx_flags)(struct net_device *dev,
|
||||
int flags);
|
||||
void (*ndo_set_rx_mode)(struct net_device *dev);
|
||||
void (*ndo_set_rx_mode_async)(
|
||||
struct net_device *dev,
|
||||
struct netdev_hw_addr_list *uc,
|
||||
struct netdev_hw_addr_list *mc);
|
||||
int (*ndo_set_mac_address)(struct net_device *dev,
|
||||
void *addr);
|
||||
int (*ndo_validate_addr)(struct net_device *dev);
|
||||
|
|
@ -1903,6 +1917,8 @@ enum netdev_reg_state {
|
|||
* has been enabled due to the need to listen to
|
||||
* additional unicast addresses in a device that
|
||||
* does not implement ndo_set_rx_mode()
|
||||
* @rx_mode_node: List entry for rx_mode work processing
|
||||
* @rx_mode_tracker: Refcount tracker for rx_mode work
|
||||
* @uc: unicast mac addresses
|
||||
* @mc: multicast mac addresses
|
||||
* @dev_addrs: list of device hw addresses
|
||||
|
|
@ -2294,6 +2310,8 @@ struct net_device {
|
|||
unsigned int promiscuity;
|
||||
unsigned int allmulti;
|
||||
bool uc_promisc;
|
||||
struct list_head rx_mode_node;
|
||||
netdevice_tracker rx_mode_tracker;
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
unsigned char nested_level;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -9593,7 +9593,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
|
|||
ops->ndo_change_rx_flags(dev, flags);
|
||||
}
|
||||
|
||||
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
|
||||
int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
|
||||
{
|
||||
unsigned int old_flags = dev->flags;
|
||||
unsigned int promiscuity, flags;
|
||||
|
|
@ -9697,46 +9697,6 @@ int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Upload unicast and multicast address lists to device and
|
||||
* configure RX filtering. When the device doesn't support unicast
|
||||
* filtering it is put in promiscuous mode while unicast addresses
|
||||
* are present.
|
||||
*/
|
||||
void __dev_set_rx_mode(struct net_device *dev)
|
||||
{
|
||||
const struct net_device_ops *ops = dev->netdev_ops;
|
||||
|
||||
/* dev_open will call this function so the list will stay sane. */
|
||||
if (!(dev->flags&IFF_UP))
|
||||
return;
|
||||
|
||||
if (!netif_device_present(dev))
|
||||
return;
|
||||
|
||||
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
|
||||
/* Unicast addresses changes may only happen under the rtnl,
|
||||
* therefore calling __dev_set_promiscuity here is safe.
|
||||
*/
|
||||
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
|
||||
__dev_set_promiscuity(dev, 1, false);
|
||||
dev->uc_promisc = true;
|
||||
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
|
||||
__dev_set_promiscuity(dev, -1, false);
|
||||
dev->uc_promisc = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ops->ndo_set_rx_mode)
|
||||
ops->ndo_set_rx_mode(dev);
|
||||
}
|
||||
|
||||
/*
 * Locked entry point for __dev_set_rx_mode(): takes the device address
 * lock (BH-disabling variant) around the update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
|
||||
|
||||
/**
|
||||
* netif_get_flags() - get flags reported to userspace
|
||||
|
|
@ -12127,6 +12087,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
|
|||
#endif
|
||||
|
||||
mutex_init(&dev->lock);
|
||||
INIT_LIST_HEAD(&dev->rx_mode_node);
|
||||
|
||||
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
|
||||
setup(dev);
|
||||
|
|
|
|||
|
|
@ -165,6 +165,9 @@ int netif_change_carrier(struct net_device *dev, bool new_carrier);
|
|||
int dev_change_carrier(struct net_device *dev, bool new_carrier);
|
||||
|
||||
void __dev_set_rx_mode(struct net_device *dev);
|
||||
int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify);
|
||||
bool netif_rx_mode_clean(struct net_device *dev);
|
||||
void netif_rx_mode_sync(struct net_device *dev);
|
||||
|
||||
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
|
||||
unsigned int gchanges, u32 portid,
|
||||
|
|
|
|||
|
|
@ -11,10 +11,18 @@
|
|||
#include <linux/rtnetlink.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <kunit/visibility.h>
|
||||
|
||||
#include "dev.h"
|
||||
|
||||
/* Work handler that drains rx_mode_list; defined further down in this file. */
static void netdev_rx_mode_work(struct work_struct *work);

/* Devices with a pending rx-mode update, linked through dev->rx_mode_node. */
static LIST_HEAD(rx_mode_list);
/* Guards rx_mode_list and every device's rx_mode_node linkage. */
static DEFINE_SPINLOCK(rx_mode_lock);
/* Single work item shared by all devices queued on rx_mode_list. */
static DECLARE_WORK(rx_mode_work, netdev_rx_mode_work);
|
||||
|
||||
/*
|
||||
* General list handling functions
|
||||
*/
|
||||
|
|
@ -1156,3 +1164,204 @@ void dev_mc_init(struct net_device *dev)
|
|||
__hw_addr_init(&dev->mc);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_init);
|
||||
|
||||
static int netif_addr_lists_snapshot(struct net_device *dev,
|
||||
struct netdev_hw_addr_list *uc_snap,
|
||||
struct netdev_hw_addr_list *mc_snap,
|
||||
struct netdev_hw_addr_list *uc_ref,
|
||||
struct netdev_hw_addr_list *mc_ref)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = __hw_addr_list_snapshot(uc_snap, &dev->uc, dev->addr_len);
|
||||
if (!err)
|
||||
err = __hw_addr_list_snapshot(uc_ref, &dev->uc, dev->addr_len);
|
||||
if (!err)
|
||||
err = __hw_addr_list_snapshot(mc_snap, &dev->mc,
|
||||
dev->addr_len);
|
||||
if (!err)
|
||||
err = __hw_addr_list_snapshot(mc_ref, &dev->mc, dev->addr_len);
|
||||
|
||||
if (err) {
|
||||
__hw_addr_flush(uc_snap);
|
||||
__hw_addr_flush(uc_ref);
|
||||
__hw_addr_flush(mc_snap);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void netif_addr_lists_reconcile(struct net_device *dev,
|
||||
struct netdev_hw_addr_list *uc_snap,
|
||||
struct netdev_hw_addr_list *mc_snap,
|
||||
struct netdev_hw_addr_list *uc_ref,
|
||||
struct netdev_hw_addr_list *mc_ref)
|
||||
{
|
||||
__hw_addr_list_reconcile(&dev->uc, uc_snap, uc_ref, dev->addr_len);
|
||||
__hw_addr_list_reconcile(&dev->mc, mc_snap, mc_ref, dev->addr_len);
|
||||
}
|
||||
|
||||
static void netif_rx_mode_run(struct net_device *dev)
|
||||
{
|
||||
struct netdev_hw_addr_list uc_snap, mc_snap, uc_ref, mc_ref;
|
||||
const struct net_device_ops *ops = dev->netdev_ops;
|
||||
int err;
|
||||
|
||||
might_sleep();
|
||||
netdev_ops_assert_locked(dev);
|
||||
|
||||
__hw_addr_init(&uc_snap);
|
||||
__hw_addr_init(&mc_snap);
|
||||
__hw_addr_init(&uc_ref);
|
||||
__hw_addr_init(&mc_ref);
|
||||
|
||||
if (!(dev->flags & IFF_UP) || !netif_device_present(dev))
|
||||
return;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
err = netif_addr_lists_snapshot(dev, &uc_snap, &mc_snap,
|
||||
&uc_ref, &mc_ref);
|
||||
if (err) {
|
||||
netdev_WARN(dev, "failed to sync uc/mc addresses\n");
|
||||
netif_addr_unlock_bh(dev);
|
||||
return;
|
||||
}
|
||||
netif_addr_unlock_bh(dev);
|
||||
|
||||
ops->ndo_set_rx_mode_async(dev, &uc_snap, &mc_snap);
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
netif_addr_lists_reconcile(dev, &uc_snap, &mc_snap,
|
||||
&uc_ref, &mc_ref);
|
||||
netif_addr_unlock_bh(dev);
|
||||
}
|
||||
|
||||
static void netdev_rx_mode_work(struct work_struct *work)
|
||||
{
|
||||
struct net_device *dev;
|
||||
|
||||
rtnl_lock();
|
||||
|
||||
while (true) {
|
||||
spin_lock_bh(&rx_mode_lock);
|
||||
if (list_empty(&rx_mode_list)) {
|
||||
spin_unlock_bh(&rx_mode_lock);
|
||||
break;
|
||||
}
|
||||
dev = list_first_entry(&rx_mode_list, struct net_device,
|
||||
rx_mode_node);
|
||||
list_del_init(&dev->rx_mode_node);
|
||||
/* We must free netdev tracker under
|
||||
* the spinlock protection.
|
||||
*/
|
||||
netdev_tracker_free(dev, &dev->rx_mode_tracker);
|
||||
spin_unlock_bh(&rx_mode_lock);
|
||||
|
||||
netdev_lock_ops(dev);
|
||||
netif_rx_mode_run(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
/* Use __dev_put() because netdev_tracker_free() was already
|
||||
* called above. Must be after netdev_unlock_ops() to prevent
|
||||
* netdev_run_todo() from freeing the device while still in use.
|
||||
*/
|
||||
__dev_put(dev);
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
}
|
||||
|
||||
static void netif_rx_mode_queue(struct net_device *dev)
|
||||
{
|
||||
spin_lock_bh(&rx_mode_lock);
|
||||
if (list_empty(&dev->rx_mode_node)) {
|
||||
list_add_tail(&dev->rx_mode_node, &rx_mode_list);
|
||||
netdev_hold(dev, &dev->rx_mode_tracker, GFP_ATOMIC);
|
||||
}
|
||||
spin_unlock_bh(&rx_mode_lock);
|
||||
schedule_work(&rx_mode_work);
|
||||
}
|
||||
|
||||
/**
|
||||
* __dev_set_rx_mode() - upload unicast and multicast address lists to device
|
||||
* and configure RX filtering.
|
||||
* @dev: device
|
||||
*
|
||||
* When the device doesn't support unicast filtering it is put in promiscuous
|
||||
* mode while unicast addresses are present.
|
||||
*/
|
||||
void __dev_set_rx_mode(struct net_device *dev)
|
||||
{
|
||||
const struct net_device_ops *ops = dev->netdev_ops;
|
||||
|
||||
/* dev_open will call this function so the list will stay sane. */
|
||||
if (!(dev->flags & IFF_UP))
|
||||
return;
|
||||
|
||||
if (!netif_device_present(dev))
|
||||
return;
|
||||
|
||||
if (ops->ndo_set_rx_mode_async) {
|
||||
netif_rx_mode_queue(dev);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
|
||||
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
|
||||
__dev_set_promiscuity(dev, 1, false);
|
||||
dev->uc_promisc = true;
|
||||
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
|
||||
__dev_set_promiscuity(dev, -1, false);
|
||||
dev->uc_promisc = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ops->ndo_set_rx_mode)
|
||||
ops->ndo_set_rx_mode(dev);
|
||||
}
|
||||
|
||||
/*
 * Locked entry point for __dev_set_rx_mode(): takes the device address
 * lock (BH-disabling variant) around the update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
|
||||
|
||||
bool netif_rx_mode_clean(struct net_device *dev)
|
||||
{
|
||||
bool clean = false;
|
||||
|
||||
spin_lock_bh(&rx_mode_lock);
|
||||
if (!list_empty(&dev->rx_mode_node)) {
|
||||
list_del_init(&dev->rx_mode_node);
|
||||
clean = true;
|
||||
/* We must release netdev tracker under
|
||||
* the spinlock protection.
|
||||
*/
|
||||
netdev_tracker_free(dev, &dev->rx_mode_tracker);
|
||||
}
|
||||
spin_unlock_bh(&rx_mode_lock);
|
||||
|
||||
return clean;
|
||||
}
|
||||
|
||||
/**
 * netif_rx_mode_sync() - sync rx mode inline
 * @dev: network device
 *
 * Drivers implementing ndo_set_rx_mode_async() have their rx mode callback
 * executed from a workqueue. That lets the callback sleep, but it also means
 * the hardware update is deferred and may not have happened by the time the
 * initiating syscall returns. netif_rx_mode_sync() takes over a pending
 * workqueue update for @dev, if there is one, and executes it inline, so the
 * operation still appears atomic to userspace.
 *
 * The visible callers invoke this with the netdev ops lock held, which
 * netif_rx_mode_run() asserts.
 */
void netif_rx_mode_sync(struct net_device *dev)
{
	/* Nothing queued for this device: nothing to do. */
	if (!netif_rx_mode_clean(dev))
		return;

	netif_rx_mode_run(dev);

	/* netif_rx_mode_clean() already called netdev_tracker_free(), so
	 * the reference is dropped with __dev_put().
	 */
	__dev_put(dev);
}
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
|
|||
|
||||
netdev_lock_ops(dev);
|
||||
ret = netif_change_flags(dev, flags, extack);
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
|
||||
return ret;
|
||||
|
|
@ -285,6 +286,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
|
|||
|
||||
netdev_lock_ops(dev);
|
||||
ret = netif_set_promiscuity(dev, inc);
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
|
||||
return ret;
|
||||
|
|
@ -311,6 +313,7 @@ int dev_set_allmulti(struct net_device *dev, int inc)
|
|||
|
||||
netdev_lock_ops(dev);
|
||||
ret = netif_set_allmulti(dev, inc, true);
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
|
||||
return ret;
|
||||
|
|
|
|||
|
|
@ -586,24 +586,26 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
|
|||
return err;
|
||||
|
||||
case SIOCADDMULTI:
|
||||
if (!ops->ndo_set_rx_mode ||
|
||||
if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
|
||||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
|
||||
return -EINVAL;
|
||||
if (!netif_device_present(dev))
|
||||
return -ENODEV;
|
||||
netdev_lock_ops(dev);
|
||||
err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
return err;
|
||||
|
||||
case SIOCDELMULTI:
|
||||
if (!ops->ndo_set_rx_mode ||
|
||||
if ((!ops->ndo_set_rx_mode && !ops->ndo_set_rx_mode_async) ||
|
||||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
|
||||
return -EINVAL;
|
||||
if (!netif_device_present(dev))
|
||||
return -ENODEV;
|
||||
netdev_lock_ops(dev);
|
||||
err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
return err;
|
||||
|
||||
|
|
|
|||
|
|
@ -3431,6 +3431,7 @@ errout:
|
|||
dev->name);
|
||||
}
|
||||
|
||||
netif_rx_mode_sync(dev);
|
||||
netdev_unlock_ops(dev);
|
||||
|
||||
return err;
|
||||
|
|
|
|||
Loading…
Reference in New Issue