s390/pci: Fix cyclic dead-lock in zpci_zdev_put() and zpci_scan_devices()

When triggering PCI device recovery by writing into the SysFS attribute
`recover` of a Physical Function with existing child SR-IOV Virtual
Functions, lockdep is reporting a possible deadlock between three
threads:

         Thread (A)             Thread (B)             Thread (C)
             |                      |                      |
      recover_store()      zpci_scan_devices()    zpci_scan_devices()
lock(pci_rescan_remove_lock)        |                      |
             |                      |                      |
             |                      |            zpci_bus_scan_busses()
             |                      |             lock(zbus_list_lock)
             |              zpci_add_device()              |
             |          lock(zpci_add_remove_lock)         |
             |                      |                      ┴
             |                      |             zpci_bus_scan_bus()
             |                      |         lock(pci_rescan_remove_lock)
             ┴                      |
      zpci_zdev_put()               |
 lock(zpci_add_remove_lock)         |
                                    ┴
                              zpci_bus_get()
                           lock(zbus_list_lock)

In zpci_bus_scan_busses() the `zbus_list_lock` is taken for the whole
duration of the function, which also includes taking
`pci_rescan_remove_lock`, among other things. But `zbus_list_lock` only
really needs to protect the modification of the global registration
`zbus_list`, it can be dropped while the functions within the list
iteration run; this way we break the cycle above.

Break up zpci_bus_scan_busses() into an "iterator" zpci_bus_get_next()
that iterates over `zbus_list` element by element, and acquires and
releases `zbus_list_lock` as necessary, but never keep holding it.
References to `zpci_bus` objects are also acquired and released.

The reference counting on `zpci_bus` objects is also changed so that all
put() and get() operations are done under the protection of
`zbus_list_lock`, and if the operation results in a modification of
`zpci_bus_list`, this modification is done in the same critical section
(apart the very first initialization). This way objects are never seen
on the list that are about to be released and/or half-initialized.

Fixes: 14c87ba812 ("s390/pci: separate zbus registration from scanning")
Suggested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Benjamin Block <bblock@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Gerd Bayer <gbayer@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
master
Benjamin Block 2025-12-05 16:47:17 +01:00 committed by Heiko Carstens
parent b1aa01d312
commit 4cb92fa763
4 changed files with 91 additions and 29 deletions

View File

@ -748,6 +748,7 @@ ForEachMacros:
- 'ynl_attr_for_each_nested'
- 'ynl_attr_for_each_payload'
- 'zorro_for_each_dev'
- 'zpci_bus_for_each'
IncludeBlocks: Preserve
IncludeCategories:

View File

@ -1148,6 +1148,7 @@ static void zpci_add_devices(struct list_head *scan_list)
int zpci_scan_devices(void)
{
struct zpci_bus *zbus;
LIST_HEAD(scan_list);
int rc;
@ -1156,7 +1157,10 @@ int zpci_scan_devices(void)
return rc;
zpci_add_devices(&scan_list);
zpci_bus_scan_busses();
zpci_bus_for_each(zbus) {
zpci_bus_scan_bus(zbus);
cond_resched();
}
return 0;
}

View File

@ -153,23 +153,6 @@ int zpci_bus_scan_bus(struct zpci_bus *zbus)
return ret;
}
/* zpci_bus_scan_busses - Scan all registered busses
*
* Scan all available zbusses
*
*/
void zpci_bus_scan_busses(void)
{
struct zpci_bus *zbus = NULL;
mutex_lock(&zbus_list_lock);
list_for_each_entry(zbus, &zbus_list, bus_next) {
zpci_bus_scan_bus(zbus);
cond_resched();
}
mutex_unlock(&zbus_list_lock);
}
static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
{
return !s390_pci_no_rid && zdev->rid_available &&
@ -222,10 +205,29 @@ out_free_domain:
return -ENOMEM;
}
static void zpci_bus_release(struct kref *kref)
/**
* zpci_bus_release - Un-initialize resources associated with the zbus and
* free memory
* @kref: refcount * that is part of struct zpci_bus
*
* MUST be called with `zbus_list_lock` held, but the lock is released during
* run of the function.
*/
static inline void zpci_bus_release(struct kref *kref)
__releases(&zbus_list_lock)
{
struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref);
lockdep_assert_held(&zbus_list_lock);
list_del(&zbus->bus_next);
mutex_unlock(&zbus_list_lock);
/*
* At this point no-one should see this object, or be able to get a new
* reference to it.
*/
if (zbus->bus) {
pci_lock_rescan_remove();
pci_stop_root_bus(zbus->bus);
@ -237,16 +239,19 @@ static void zpci_bus_release(struct kref *kref)
pci_unlock_rescan_remove();
}
mutex_lock(&zbus_list_lock);
list_del(&zbus->bus_next);
mutex_unlock(&zbus_list_lock);
zpci_remove_parent_msi_domain(zbus);
kfree(zbus);
}
static void zpci_bus_put(struct zpci_bus *zbus)
static inline void __zpci_bus_get(struct zpci_bus *zbus)
{
kref_put(&zbus->kref, zpci_bus_release);
lockdep_assert_held(&zbus_list_lock);
kref_get(&zbus->kref);
}
static inline void zpci_bus_put(struct zpci_bus *zbus)
{
kref_put_mutex(&zbus->kref, zpci_bus_release, &zbus_list_lock);
}
static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
@ -258,7 +263,7 @@ static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
if (!zbus->multifunction)
continue;
if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) {
kref_get(&zbus->kref);
__zpci_bus_get(zbus);
goto out_unlock;
}
}
@ -268,6 +273,44 @@ out_unlock:
return zbus;
}
/**
* zpci_bus_get_next - get the next zbus object from given position in the list
* @pos: current position/cursor in the global zbus list
*
* Acquires and releases references as the cursor iterates (might also free/
* release the cursor). Is tolerant of concurrent operations on the list.
*
* To begin the iteration, set *@pos to %NULL before calling the function.
*
* *@pos is set to %NULL in cases where either the list is empty, or *@pos is
* the last element in the list.
*
* Context: Process context. May sleep.
*/
void zpci_bus_get_next(struct zpci_bus **pos)
{
struct zpci_bus *curp = *pos, *next = NULL;
mutex_lock(&zbus_list_lock);
if (curp)
next = list_next_entry(curp, bus_next);
else
next = list_first_entry(&zbus_list, typeof(*curp), bus_next);
if (list_entry_is_head(next, &zbus_list, bus_next))
next = NULL;
if (next)
__zpci_bus_get(next);
*pos = next;
mutex_unlock(&zbus_list_lock);
/* zpci_bus_put() might drop refcount to 0 and locks zbus_list_lock */
if (curp)
zpci_bus_put(curp);
}
static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
{
struct zpci_bus *zbus;
@ -279,9 +322,6 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
zbus->topo = topo;
zbus->topo_is_tid = topo_is_tid;
INIT_LIST_HEAD(&zbus->bus_next);
mutex_lock(&zbus_list_lock);
list_add_tail(&zbus->bus_next, &zbus_list);
mutex_unlock(&zbus_list_lock);
kref_init(&zbus->kref);
INIT_LIST_HEAD(&zbus->resources);
@ -291,6 +331,10 @@ static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
zbus->bus_resource.flags = IORESOURCE_BUS;
pci_add_resource(&zbus->resources, &zbus->bus_resource);
mutex_lock(&zbus_list_lock);
list_add_tail(&zbus->bus_next, &zbus_list);
mutex_unlock(&zbus_list_lock);
return zbus;
}

View File

@ -15,7 +15,20 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops);
void zpci_bus_device_unregister(struct zpci_dev *zdev);
int zpci_bus_scan_bus(struct zpci_bus *zbus);
void zpci_bus_scan_busses(void);
void zpci_bus_get_next(struct zpci_bus **pos);
/**
* zpci_bus_for_each - iterate over all the registered zbus objects
* @pos: a struct zpci_bus * as cursor
*
* Acquires and releases references as the cursor iterates over the registered
* objects. Is tolerant against concurrent removals of objects.
*
* Context: Process context. May sleep.
*/
#define zpci_bus_for_each(pos) \
for ((pos) = NULL, zpci_bus_get_next(&(pos)); (pos) != NULL; \
zpci_bus_get_next(&(pos)))
int zpci_bus_scan_device(struct zpci_dev *zdev);
void zpci_bus_remove_device(struct zpci_dev *zdev, bool set_error);