diff options
author | Lukas Wunner <lukas@wunner.de> | 2023-01-20 10:19:02 +0100 |
---|---|---|
committer | Bjorn Helgaas <bhelgaas@google.com> | 2023-02-15 15:01:01 -0600 |
commit | 74ff8864cc842be994853095dba6db48e716400a (patch) | |
tree | ff56f04498915fd6b296bf175f3d5f01ff2cc0e1 /drivers/pci | |
parent | 82b34b0800af8c9fc9988c290cdc813e0ca0df31 (diff) |
PCI: hotplug: Allow marking devices as disconnected during bind/unbind
On surprise removal, pciehp_unconfigure_device() and acpiphp's
trim_stale_devices() call pci_dev_set_disconnected() to mark removed
devices as permanently offline. Thereby, the PCI core and drivers know
to skip device accesses.
However pci_dev_set_disconnected() takes the device_lock and thus waits for
a concurrent driver bind or unbind to complete. As a result, the driver's
->probe and ->remove hooks have no chance to learn that the device is gone.
That doesn't make any sense, so drop the device_lock and instead use atomic
xchg() and cmpxchg() operations to update the device state.
As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which occurs
on surprise removal with AER concurrently performing a bus reset.
AER bus reset:
INFO: task irq/26-aerdrv:95 blocked for more than 120 seconds.
Tainted: G W 6.2.0-rc3-custom-norework-jan11+
schedule
rwsem_down_write_slowpath
down_write_nested
pciehp_reset_slot # acquires reset_lock
pci_reset_hotplug_slot
pci_slot_reset # acquires device_lock
pci_bus_error_reset
aer_root_reset
pcie_do_recovery
aer_process_err_devices
aer_isr
pciehp surprise removal:
INFO: task irq/26-pciehp:96 blocked for more than 120 seconds.
Tainted: G W 6.2.0-rc3-custom-norework-jan11+
schedule_preempt_disabled
__mutex_lock
mutex_lock_nested
pci_dev_set_disconnected # acquires device_lock
pci_walk_bus
pciehp_unconfigure_device
pciehp_disable_slot
pciehp_handle_presence_or_link_change
pciehp_ist # acquires reset_lock
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215590
Fixes: a6bd101b8f84 ("PCI: Unify device inaccessible")
Link: https://lore.kernel.org/r/3dc88ea82bdc0e37d9000e413d5ebce481cbd629.1674205689.git.lukas@wunner.de
Reported-by: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org # v4.20+
Cc: Keith Busch <kbusch@kernel.org>
Diffstat (limited to 'drivers/pci')
-rw-r--r-- | drivers/pci/pci.h | 43 |
1 files changed, 13 insertions, 30 deletions
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9ed3b5550043..5d5a44aaafe8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -310,53 +310,36 @@ struct pci_sriov { * @dev: PCI device to set new error_state * @new: the state we want dev to be in * - * Must be called with device_lock held. + * If the device is experiencing perm_failure, it has to remain in that state. + * Any other transition is allowed. * * Returns true if state has been changed to the requested state. */ static inline bool pci_dev_set_io_state(struct pci_dev *dev, pci_channel_state_t new) { - bool changed = false; + pci_channel_state_t old; - device_lock_assert(&dev->dev); switch (new) { case pci_channel_io_perm_failure: - switch (dev->error_state) { - case pci_channel_io_frozen: - case pci_channel_io_normal: - case pci_channel_io_perm_failure: - changed = true; - break; - } - break; + xchg(&dev->error_state, pci_channel_io_perm_failure); + return true; case pci_channel_io_frozen: - switch (dev->error_state) { - case pci_channel_io_frozen: - case pci_channel_io_normal: - changed = true; - break; - } - break; + old = cmpxchg(&dev->error_state, pci_channel_io_normal, + pci_channel_io_frozen); + return old != pci_channel_io_perm_failure; case pci_channel_io_normal: - switch (dev->error_state) { - case pci_channel_io_frozen: - case pci_channel_io_normal: - changed = true; - break; - } - break; + old = cmpxchg(&dev->error_state, pci_channel_io_frozen, + pci_channel_io_normal); + return old != pci_channel_io_perm_failure; + default: + return false; } - if (changed) - dev->error_state = new; - return changed; } static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { - device_lock(&dev->dev); pci_dev_set_io_state(dev, pci_channel_io_perm_failure); - device_unlock(&dev->dev); return 0; } |