2
0
mirror of https://github.com/edk2-porting/linux-next.git synced 2024-12-23 04:34:11 +08:00
linux-next/drivers/pci/probe.c

2480 lines
64 KiB
C
Raw Normal View History

/*
* probe.c - PCI detection and setup code
*/
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/of_device.h>
#include <linux/of_pci.h>
#include <linux/pci_hotplug.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/pci-aspm.h>
#include <linux/aer.h>
#include <linux/acpi.h>
#include <linux/irqdomain.h>
#include <linux/pm_runtime.h>
#include "pci.h"
#define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */
#define CARDBUS_RESERVE_BUSNR 3
static struct resource busn_resource = {
.name = "PCI busn",
.start = 0,
.end = 255,
.flags = IORESOURCE_BUS,
};
/* Ugh. Need to stop exporting this to modules. */
LIST_HEAD(pci_root_buses);
EXPORT_SYMBOL(pci_root_buses);
static LIST_HEAD(pci_domain_busn_res_list);
struct pci_domain_busn_res {
struct list_head list;
struct resource res;
int domain_nr;
};
static struct resource *get_pci_domain_busn_res(int domain_nr)
{
struct pci_domain_busn_res *r;
list_for_each_entry(r, &pci_domain_busn_res_list, list)
if (r->domain_nr == domain_nr)
return &r->res;
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
return NULL;
r->domain_nr = domain_nr;
r->res.start = 0;
r->res.end = 0xff;
r->res.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED;
list_add_tail(&r->list, &pci_domain_busn_res_list);
return &r->res;
}
static int find_anything(struct device *dev, void *data)
{
return 1;
}
/*
* Some device drivers need know if pci is initiated.
* Basically, we think pci is not initiated when there
* is no device to be found on the pci_bus_type.
*/
int no_pci_devices(void)
{
struct device *dev;
int no_devices;
dev = bus_find_device(&pci_bus_type, NULL, NULL, find_anything);
no_devices = (dev == NULL);
put_device(dev);
return no_devices;
}
EXPORT_SYMBOL(no_pci_devices);
/*
* PCI Bus Class
*/
static void release_pcibus_dev(struct device *dev)
{
struct pci_bus *pci_bus = to_pci_bus(dev);
put_device(pci_bus->bridge);
pci_bus_remove_resources(pci_bus);
pci_release_bus_of_node(pci_bus);
kfree(pci_bus);
}
static struct class pcibus_class = {
.name = "pci_bus",
.dev_release = &release_pcibus_dev,
.dev_groups = pcibus_groups,
};
static int __init pcibus_class_init(void)
{
return class_register(&pcibus_class);
}
postcore_initcall(pcibus_class_init);
static u64 pci_size(u64 base, u64 maxbase, u64 mask)
{
u64 size = mask & maxbase; /* Find the significant bits */
if (!size)
return 0;
/* Get the lowest of them to find the decode size, and
from that the extent. */
size = (size & ~(size-1)) - 1;
/* base == maxbase can be valid only if the BAR has
already been programmed with all 1s. */
if (base == maxbase && ((base | size) & mask) != mask)
return 0;
return size;
}
static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
{
u32 mem_type;
unsigned long flags;
if ((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
flags = bar & ~PCI_BASE_ADDRESS_IO_MASK;
flags |= IORESOURCE_IO;
return flags;
}
flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK;
flags |= IORESOURCE_MEM;
if (flags & PCI_BASE_ADDRESS_MEM_PREFETCH)
flags |= IORESOURCE_PREFETCH;
mem_type = bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK;
switch (mem_type) {
case PCI_BASE_ADDRESS_MEM_TYPE_32:
break;
case PCI_BASE_ADDRESS_MEM_TYPE_1M:
/* 1M mem BAR treated as 32-bit BAR */
break;
case PCI_BASE_ADDRESS_MEM_TYPE_64:
flags |= IORESOURCE_MEM_64;
break;
default:
/* mem unknown type treated as 32-bit BAR */
break;
}
return flags;
}
#define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
/**
* pci_read_base - read a PCI BAR
* @dev: the PCI device
* @type: type of the BAR
* @res: resource buffer to be filled in
* @pos: BAR position in the config space
*
* Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
*/
int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
struct resource *res, unsigned int pos)
{
u32 l, sz, mask;
u64 l64, sz64, mask64;
u16 orig_cmd;
struct pci_bus_region region, inverted_region;
mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
/* No printks while decoding is disabled! */
if (!dev->mmio_always_on) {
pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
pci_write_config_word(dev, PCI_COMMAND,
orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
}
}
res->name = pci_name(dev);
pci_read_config_dword(dev, pos, &l);
pci_write_config_dword(dev, pos, l | mask);
pci_read_config_dword(dev, pos, &sz);
pci_write_config_dword(dev, pos, l);
/*
* All bits set in sz means the device isn't working properly.
* If the BAR isn't implemented, all bits must be 0. If it's a
* memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit
* 1 must be clear.
*/
if (sz == 0xffffffff)
sz = 0;
/*
* I don't know how l can have all bits set. Copied from old code.
* Maybe it fixes a bug on some ancient platform.
*/
if (l == 0xffffffff)
l = 0;
if (type == pci_bar_unknown) {
res->flags = decode_bar(dev, l);
res->flags |= IORESOURCE_SIZEALIGN;
if (res->flags & IORESOURCE_IO) {
l64 = l & PCI_BASE_ADDRESS_IO_MASK;
sz64 = sz & PCI_BASE_ADDRESS_IO_MASK;
mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT;
} else {
l64 = l & PCI_BASE_ADDRESS_MEM_MASK;
sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK;
mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK;
}
} else {
res->flags |= (l & IORESOURCE_ROM_ENABLE);
l64 = l & PCI_ROM_ADDRESS_MASK;
sz64 = sz & PCI_ROM_ADDRESS_MASK;
mask64 = (u32)PCI_ROM_ADDRESS_MASK;
}
if (res->flags & IORESOURCE_MEM_64) {
pci_read_config_dword(dev, pos + 4, &l);
pci_write_config_dword(dev, pos + 4, ~0);
pci_read_config_dword(dev, pos + 4, &sz);
pci_write_config_dword(dev, pos + 4, l);
l64 |= ((u64)l << 32);
sz64 |= ((u64)sz << 32);
mask64 |= ((u64)~0 << 32);
}
if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
if (!sz64)
goto fail;
sz64 = pci_size(l64, sz64, mask64);
if (!sz64) {
dev_info(&dev->dev, FW_BUG "reg 0x%x: invalid BAR (can't size)\n",
pos);
goto fail;
}
if (res->flags & IORESOURCE_MEM_64) {
PCI: Add pci_bus_addr_t David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") fails to boot on sparc/T5-8: pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000) The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA addresses, i.e., bus addresses returned via the DMA API (dma_map_single(), etc.), while the PCI core assumed dma_addr_t could hold *any* bus address, including raw BAR values. On sparc64, all DMA addresses fit in 32 bits, so dma_addr_t is a 32-bit type. However, BAR values can be 64 bits wide, so they don't fit in a dma_addr_t. d63e2e1f3df9 added new checking that tripped over this mismatch. Add pci_bus_addr_t, which is wide enough to hold any PCI bus address, including both raw BAR values and DMA addresses. This will be 64 bits on 64-bit platforms and on platforms with a 64-bit dma_addr_t. Then dma_addr_t only needs to be wide enough to hold addresses from the DMA API. [bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at least as wide as dma_addr_t, documentation] Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB") Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231 Reported-by: David Ahern <david.ahern@oracle.com> Tested-by: David Ahern <david.ahern@oracle.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: David S. Miller <davem@davemloft.net> CC: stable@vger.kernel.org # v3.19+
2015-05-28 08:23:51 +08:00
if ((sizeof(pci_bus_addr_t) < 8 || sizeof(resource_size_t) < 8)
&& sz64 > 0x100000000ULL) {
res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED;
res->start = 0;
res->end = 0;
dev_err(&dev->dev, "reg 0x%x: can't handle BAR larger than 4GB (size %#010llx)\n",
pos, (unsigned long long)sz64);
goto out;
}
PCI: Add pci_bus_addr_t David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") fails to boot on sparc/T5-8: pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000) The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA addresses, i.e., bus addresses returned via the DMA API (dma_map_single(), etc.), while the PCI core assumed dma_addr_t could hold *any* bus address, including raw BAR values. On sparc64, all DMA addresses fit in 32 bits, so dma_addr_t is a 32-bit type. However, BAR values can be 64 bits wide, so they don't fit in a dma_addr_t. d63e2e1f3df9 added new checking that tripped over this mismatch. Add pci_bus_addr_t, which is wide enough to hold any PCI bus address, including both raw BAR values and DMA addresses. This will be 64 bits on 64-bit platforms and on platforms with a 64-bit dma_addr_t. Then dma_addr_t only needs to be wide enough to hold addresses from the DMA API. [bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at least as wide as dma_addr_t, documentation] Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB") Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231 Reported-by: David Ahern <david.ahern@oracle.com> Tested-by: David Ahern <david.ahern@oracle.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: David S. Miller <davem@davemloft.net> CC: stable@vger.kernel.org # v3.19+
2015-05-28 08:23:51 +08:00
if ((sizeof(pci_bus_addr_t) < 8) && l) {
/* Above 32-bit boundary; try to reallocate */
res->flags |= IORESOURCE_UNSET;
res->start = 0;
res->end = sz64;
dev_info(&dev->dev, "reg 0x%x: can't handle BAR above 4GB (bus address %#010llx)\n",
pos, (unsigned long long)l64);
goto out;
}
}
region.start = l64;
region.end = l64 + sz64;
pcibios_bus_to_resource(dev->bus, res, &region);
pcibios_resource_to_bus(dev->bus, &inverted_region, res);
/*
* If "A" is a BAR value (a bus address), "bus_to_resource(A)" is
* the corresponding resource address (the physical address used by
* the CPU. Converting that resource address back to a bus address
* should yield the original BAR value:
*
* resource_to_bus(bus_to_resource(A)) == A
*
* If it doesn't, CPU accesses to "bus_to_resource(A)" will not
* be claimed by the device.
*/
if (inverted_region.start != region.start) {
res->flags |= IORESOURCE_UNSET;
res->start = 0;
res->end = region.end - region.start;
dev_info(&dev->dev, "reg 0x%x: initial BAR value %#010llx invalid\n",
pos, (unsigned long long)region.start);
}
goto out;
fail:
res->flags = 0;
out:
if (res->flags)
dev_printk(KERN_DEBUG, &dev->dev, "reg 0x%x: %pR\n", pos, res);
return (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
}
static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
{
unsigned int pos, reg;
if (dev->non_compliant_bars)
return;
for (pos = 0; pos < howmany; pos++) {
struct resource *res = &dev->resource[pos];
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
}
if (rom) {
struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
dev->rom_base_reg = rom;
res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
__pci_read_base(dev, pci_bar_mem32, res, rom);
}
}
static void pci_read_bridge_io(struct pci_bus *child)
{
struct pci_dev *dev = child->self;
u8 io_base_lo, io_limit_lo;
unsigned long io_mask, io_granularity, base, limit;
struct pci_bus_region region;
struct resource *res;
io_mask = PCI_IO_RANGE_MASK;
io_granularity = 0x1000;
if (dev->io_window_1k) {
/* Support 1K I/O space granularity */
io_mask = PCI_IO_1K_RANGE_MASK;
io_granularity = 0x400;
}
res = child->resource[0];
pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo);
pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo);
base = (io_base_lo & io_mask) << 8;
limit = (io_limit_lo & io_mask) << 8;
if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) {
u16 io_base_hi, io_limit_hi;
pci_read_config_word(dev, PCI_IO_BASE_UPPER16, &io_base_hi);
pci_read_config_word(dev, PCI_IO_LIMIT_UPPER16, &io_limit_hi);
base |= ((unsigned long) io_base_hi << 16);
limit |= ((unsigned long) io_limit_hi << 16);
}
if (base <= limit) {
res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO;
region.start = base;
region.end = limit + io_granularity - 1;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
}
}
static void pci_read_bridge_mmio(struct pci_bus *child)
{
struct pci_dev *dev = child->self;
u16 mem_base_lo, mem_limit_lo;
unsigned long base, limit;
struct pci_bus_region region;
struct resource *res;
res = child->resource[1];
pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo);
pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo);
base = ((unsigned long) mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16;
limit = ((unsigned long) mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16;
if (base <= limit) {
res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM;
region.start = base;
region.end = limit + 0xfffff;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
}
}
static void pci_read_bridge_mmio_pref(struct pci_bus *child)
{
struct pci_dev *dev = child->self;
u16 mem_base_lo, mem_limit_lo;
PCI: Support 64-bit bridge windows if we have 64-bit dma_addr_t Aaron reported that a 32-bit x86 kernel with Physical Address Extension (PAE) support complains about bridge prefetchable memory windows above 4GB: pci_bus 0000:00: root bus resource [mem 0x380000000000-0x383fffffffff] ... pci 0000:03:00.0: reg 0x10: [mem 0x383fffc00000-0x383fffdfffff 64bit pref] pci 0000:03:00.0: reg 0x20: [mem 0x383fffe04000-0x383fffe07fff 64bit pref] pci 0000:03:00.1: reg 0x10: [mem 0x383fffa00000-0x383fffbfffff 64bit pref] pci 0000:03:00.1: reg 0x20: [mem 0x383fffe00000-0x383fffe03fff 64bit pref] pci 0000:00:02.2: PCI bridge to [bus 03-04] pci 0000:00:02.2: bridge window [io 0x1000-0x1fff] pci 0000:00:02.2: bridge window [mem 0x91900000-0x91cfffff] pci 0000:00:02.2: can't handle 64-bit address space for bridge In this kernel, unsigned long is 32 bits and dma_addr_t is 64 bits. Previously we used "unsigned long" to hold the bridge window address. But this is a bus address, so we should use dma_addr_t instead. Use dma_addr_t to hold the bridge window base and limit. The question of whether the CPU can actually *address* the window is separate and depends on what the physical address space of the CPU is and whether the host bridge does any address translation. [bhelgaas: fix "shift count > width of type", changelog, stable tag] Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Link: https://bugzilla.kernel.org/show_bug.cgi?id=88131 Reported-by: Aaron Ma <mapengyu@gmail.com> Tested-by: Aaron Ma <mapengyu@gmail.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.14+
2014-11-20 05:30:32 +08:00
u64 base64, limit64;
PCI: Add pci_bus_addr_t David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") fails to boot on sparc/T5-8: pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000) The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA addresses, i.e., bus addresses returned via the DMA API (dma_map_single(), etc.), while the PCI core assumed dma_addr_t could hold *any* bus address, including raw BAR values. On sparc64, all DMA addresses fit in 32 bits, so dma_addr_t is a 32-bit type. However, BAR values can be 64 bits wide, so they don't fit in a dma_addr_t. d63e2e1f3df9 added new checking that tripped over this mismatch. Add pci_bus_addr_t, which is wide enough to hold any PCI bus address, including both raw BAR values and DMA addresses. This will be 64 bits on 64-bit platforms and on platforms with a 64-bit dma_addr_t. Then dma_addr_t only needs to be wide enough to hold addresses from the DMA API. [bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at least as wide as dma_addr_t, documentation] Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB") Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231 Reported-by: David Ahern <david.ahern@oracle.com> Tested-by: David Ahern <david.ahern@oracle.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: David S. Miller <davem@davemloft.net> CC: stable@vger.kernel.org # v3.19+
2015-05-28 08:23:51 +08:00
pci_bus_addr_t base, limit;
struct pci_bus_region region;
struct resource *res;
res = child->resource[2];
pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo);
pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo);
PCI: Support 64-bit bridge windows if we have 64-bit dma_addr_t Aaron reported that a 32-bit x86 kernel with Physical Address Extension (PAE) support complains about bridge prefetchable memory windows above 4GB: pci_bus 0000:00: root bus resource [mem 0x380000000000-0x383fffffffff] ... pci 0000:03:00.0: reg 0x10: [mem 0x383fffc00000-0x383fffdfffff 64bit pref] pci 0000:03:00.0: reg 0x20: [mem 0x383fffe04000-0x383fffe07fff 64bit pref] pci 0000:03:00.1: reg 0x10: [mem 0x383fffa00000-0x383fffbfffff 64bit pref] pci 0000:03:00.1: reg 0x20: [mem 0x383fffe00000-0x383fffe03fff 64bit pref] pci 0000:00:02.2: PCI bridge to [bus 03-04] pci 0000:00:02.2: bridge window [io 0x1000-0x1fff] pci 0000:00:02.2: bridge window [mem 0x91900000-0x91cfffff] pci 0000:00:02.2: can't handle 64-bit address space for bridge In this kernel, unsigned long is 32 bits and dma_addr_t is 64 bits. Previously we used "unsigned long" to hold the bridge window address. But this is a bus address, so we should use dma_addr_t instead. Use dma_addr_t to hold the bridge window base and limit. The question of whether the CPU can actually *address* the window is separate and depends on what the physical address space of the CPU is and whether the host bridge does any address translation. [bhelgaas: fix "shift count > width of type", changelog, stable tag] Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Link: https://bugzilla.kernel.org/show_bug.cgi?id=88131 Reported-by: Aaron Ma <mapengyu@gmail.com> Tested-by: Aaron Ma <mapengyu@gmail.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.14+
2014-11-20 05:30:32 +08:00
base64 = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16;
limit64 = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16;
if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) {
u32 mem_base_hi, mem_limit_hi;
pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi);
pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi);
/*
* Some bridges set the base > limit by default, and some
* (broken) BIOSes do not initialize them. If we find
* this, just assume they are not being used.
*/
if (mem_base_hi <= mem_limit_hi) {
PCI: Support 64-bit bridge windows if we have 64-bit dma_addr_t Aaron reported that a 32-bit x86 kernel with Physical Address Extension (PAE) support complains about bridge prefetchable memory windows above 4GB: pci_bus 0000:00: root bus resource [mem 0x380000000000-0x383fffffffff] ... pci 0000:03:00.0: reg 0x10: [mem 0x383fffc00000-0x383fffdfffff 64bit pref] pci 0000:03:00.0: reg 0x20: [mem 0x383fffe04000-0x383fffe07fff 64bit pref] pci 0000:03:00.1: reg 0x10: [mem 0x383fffa00000-0x383fffbfffff 64bit pref] pci 0000:03:00.1: reg 0x20: [mem 0x383fffe00000-0x383fffe03fff 64bit pref] pci 0000:00:02.2: PCI bridge to [bus 03-04] pci 0000:00:02.2: bridge window [io 0x1000-0x1fff] pci 0000:00:02.2: bridge window [mem 0x91900000-0x91cfffff] pci 0000:00:02.2: can't handle 64-bit address space for bridge In this kernel, unsigned long is 32 bits and dma_addr_t is 64 bits. Previously we used "unsigned long" to hold the bridge window address. But this is a bus address, so we should use dma_addr_t instead. Use dma_addr_t to hold the bridge window base and limit. The question of whether the CPU can actually *address* the window is separate and depends on what the physical address space of the CPU is and whether the host bridge does any address translation. [bhelgaas: fix "shift count > width of type", changelog, stable tag] Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Link: https://bugzilla.kernel.org/show_bug.cgi?id=88131 Reported-by: Aaron Ma <mapengyu@gmail.com> Tested-by: Aaron Ma <mapengyu@gmail.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.14+
2014-11-20 05:30:32 +08:00
base64 |= (u64) mem_base_hi << 32;
limit64 |= (u64) mem_limit_hi << 32;
}
}
PCI: Support 64-bit bridge windows if we have 64-bit dma_addr_t Aaron reported that a 32-bit x86 kernel with Physical Address Extension (PAE) support complains about bridge prefetchable memory windows above 4GB: pci_bus 0000:00: root bus resource [mem 0x380000000000-0x383fffffffff] ... pci 0000:03:00.0: reg 0x10: [mem 0x383fffc00000-0x383fffdfffff 64bit pref] pci 0000:03:00.0: reg 0x20: [mem 0x383fffe04000-0x383fffe07fff 64bit pref] pci 0000:03:00.1: reg 0x10: [mem 0x383fffa00000-0x383fffbfffff 64bit pref] pci 0000:03:00.1: reg 0x20: [mem 0x383fffe00000-0x383fffe03fff 64bit pref] pci 0000:00:02.2: PCI bridge to [bus 03-04] pci 0000:00:02.2: bridge window [io 0x1000-0x1fff] pci 0000:00:02.2: bridge window [mem 0x91900000-0x91cfffff] pci 0000:00:02.2: can't handle 64-bit address space for bridge In this kernel, unsigned long is 32 bits and dma_addr_t is 64 bits. Previously we used "unsigned long" to hold the bridge window address. But this is a bus address, so we should use dma_addr_t instead. Use dma_addr_t to hold the bridge window base and limit. The question of whether the CPU can actually *address* the window is separate and depends on what the physical address space of the CPU is and whether the host bridge does any address translation. [bhelgaas: fix "shift count > width of type", changelog, stable tag] Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Link: https://bugzilla.kernel.org/show_bug.cgi?id=88131 Reported-by: Aaron Ma <mapengyu@gmail.com> Tested-by: Aaron Ma <mapengyu@gmail.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.14+
2014-11-20 05:30:32 +08:00
PCI: Add pci_bus_addr_t David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") fails to boot on sparc/T5-8: pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000) The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA addresses, i.e., bus addresses returned via the DMA API (dma_map_single(), etc.), while the PCI core assumed dma_addr_t could hold *any* bus address, including raw BAR values. On sparc64, all DMA addresses fit in 32 bits, so dma_addr_t is a 32-bit type. However, BAR values can be 64 bits wide, so they don't fit in a dma_addr_t. d63e2e1f3df9 added new checking that tripped over this mismatch. Add pci_bus_addr_t, which is wide enough to hold any PCI bus address, including both raw BAR values and DMA addresses. This will be 64 bits on 64-bit platforms and on platforms with a 64-bit dma_addr_t. Then dma_addr_t only needs to be wide enough to hold addresses from the DMA API. [bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at least as wide as dma_addr_t, documentation] Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows") Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB") Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231 Reported-by: David Ahern <david.ahern@oracle.com> Tested-by: David Ahern <david.ahern@oracle.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: David S. Miller <davem@davemloft.net> CC: stable@vger.kernel.org # v3.19+
2015-05-28 08:23:51 +08:00
base = (pci_bus_addr_t) base64;
limit = (pci_bus_addr_t) limit64;
PCI: Support 64-bit bridge windows if we have 64-bit dma_addr_t Aaron reported that a 32-bit x86 kernel with Physical Address Extension (PAE) support complains about bridge prefetchable memory windows above 4GB: pci_bus 0000:00: root bus resource [mem 0x380000000000-0x383fffffffff] ... pci 0000:03:00.0: reg 0x10: [mem 0x383fffc00000-0x383fffdfffff 64bit pref] pci 0000:03:00.0: reg 0x20: [mem 0x383fffe04000-0x383fffe07fff 64bit pref] pci 0000:03:00.1: reg 0x10: [mem 0x383fffa00000-0x383fffbfffff 64bit pref] pci 0000:03:00.1: reg 0x20: [mem 0x383fffe00000-0x383fffe03fff 64bit pref] pci 0000:00:02.2: PCI bridge to [bus 03-04] pci 0000:00:02.2: bridge window [io 0x1000-0x1fff] pci 0000:00:02.2: bridge window [mem 0x91900000-0x91cfffff] pci 0000:00:02.2: can't handle 64-bit address space for bridge In this kernel, unsigned long is 32 bits and dma_addr_t is 64 bits. Previously we used "unsigned long" to hold the bridge window address. But this is a bus address, so we should use dma_addr_t instead. Use dma_addr_t to hold the bridge window base and limit. The question of whether the CPU can actually *address* the window is separate and depends on what the physical address space of the CPU is and whether the host bridge does any address translation. [bhelgaas: fix "shift count > width of type", changelog, stable tag] Fixes: d56dbf5bab8c ("PCI: Allocate 64-bit BARs above 4G when possible") Link: https://bugzilla.kernel.org/show_bug.cgi?id=88131 Reported-by: Aaron Ma <mapengyu@gmail.com> Tested-by: Aaron Ma <mapengyu@gmail.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.14+
2014-11-20 05:30:32 +08:00
if (base != base64) {
dev_err(&dev->dev, "can't handle bridge window above 4GB (bus address %#010llx)\n",
(unsigned long long) base64);
return;
}
if (base <= limit) {
res->flags = (mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) |
IORESOURCE_MEM | IORESOURCE_PREFETCH;
if (res->flags & PCI_PREF_RANGE_TYPE_64)
res->flags |= IORESOURCE_MEM_64;
region.start = base;
region.end = limit + 0xfffff;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
}
}
void pci_read_bridge_bases(struct pci_bus *child)
{
struct pci_dev *dev = child->self;
struct resource *res;
int i;
if (pci_is_root_bus(child)) /* It's a host bus, nothing to read */
return;
dev_info(&dev->dev, "PCI bridge to %pR%s\n",
&child->busn_res,
dev->transparent ? " (subtractive decode)" : "");
pci_bus_remove_resources(child);
for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
child->resource[i] = &dev->resource[PCI_BRIDGE_RESOURCES+i];
pci_read_bridge_io(child);
pci_read_bridge_mmio(child);
pci_read_bridge_mmio_pref(child);
if (dev->transparent) {
pci_bus_for_each_resource(child->parent, res, i) {
if (res && res->flags) {
pci_bus_add_resource(child, res,
PCI_SUBTRACTIVE_DECODE);
dev_printk(KERN_DEBUG, &dev->dev,
" bridge window %pR (subtractive decode)\n",
res);
}
}
}
}
static struct pci_bus *pci_alloc_bus(struct pci_bus *parent)
{
struct pci_bus *b;
b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
return NULL;
INIT_LIST_HEAD(&b->node);
INIT_LIST_HEAD(&b->children);
INIT_LIST_HEAD(&b->devices);
INIT_LIST_HEAD(&b->slots);
INIT_LIST_HEAD(&b->resources);
b->max_bus_speed = PCI_SPEED_UNKNOWN;
b->cur_bus_speed = PCI_SPEED_UNKNOWN;
#ifdef CONFIG_PCI_DOMAINS_GENERIC
if (parent)
b->domain_nr = parent->domain_nr;
#endif
return b;
}
static void pci_release_host_bridge_dev(struct device *dev)
{
struct pci_host_bridge *bridge = to_pci_host_bridge(dev);
if (bridge->release_fn)
bridge->release_fn(bridge);
pci_free_resource_list(&bridge->windows);
kfree(bridge);
}
static struct pci_host_bridge *pci_alloc_host_bridge(struct pci_bus *b)
{
struct pci_host_bridge *bridge;
bridge = kzalloc(sizeof(*bridge), GFP_KERNEL);
if (!bridge)
return NULL;
INIT_LIST_HEAD(&bridge->windows);
bridge->bus = b;
return bridge;
}
static const unsigned char pcix_bus_speed[] = {
PCI_SPEED_UNKNOWN, /* 0 */
PCI_SPEED_66MHz_PCIX, /* 1 */
PCI_SPEED_100MHz_PCIX, /* 2 */
PCI_SPEED_133MHz_PCIX, /* 3 */
PCI_SPEED_UNKNOWN, /* 4 */
PCI_SPEED_66MHz_PCIX_ECC, /* 5 */
PCI_SPEED_100MHz_PCIX_ECC, /* 6 */
PCI_SPEED_133MHz_PCIX_ECC, /* 7 */
PCI_SPEED_UNKNOWN, /* 8 */
PCI_SPEED_66MHz_PCIX_266, /* 9 */
PCI_SPEED_100MHz_PCIX_266, /* A */
PCI_SPEED_133MHz_PCIX_266, /* B */
PCI_SPEED_UNKNOWN, /* C */
PCI_SPEED_66MHz_PCIX_533, /* D */
PCI_SPEED_100MHz_PCIX_533, /* E */
PCI_SPEED_133MHz_PCIX_533 /* F */
};
const unsigned char pcie_link_speed[] = {
PCI_SPEED_UNKNOWN, /* 0 */
PCIE_SPEED_2_5GT, /* 1 */
PCIE_SPEED_5_0GT, /* 2 */
PCIE_SPEED_8_0GT, /* 3 */
PCI_SPEED_UNKNOWN, /* 4 */
PCI_SPEED_UNKNOWN, /* 5 */
PCI_SPEED_UNKNOWN, /* 6 */
PCI_SPEED_UNKNOWN, /* 7 */
PCI_SPEED_UNKNOWN, /* 8 */
PCI_SPEED_UNKNOWN, /* 9 */
PCI_SPEED_UNKNOWN, /* A */
PCI_SPEED_UNKNOWN, /* B */
PCI_SPEED_UNKNOWN, /* C */
PCI_SPEED_UNKNOWN, /* D */
PCI_SPEED_UNKNOWN, /* E */
PCI_SPEED_UNKNOWN /* F */
};
void pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
{
bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
}
EXPORT_SYMBOL_GPL(pcie_update_link_speed);
static unsigned char agp_speeds[] = {
AGP_UNKNOWN,
AGP_1X,
AGP_2X,
AGP_4X,
AGP_8X
};
static enum pci_bus_speed agp_speed(int agp3, int agpstat)
{
int index = 0;
if (agpstat & 4)
index = 3;
else if (agpstat & 2)
index = 2;
else if (agpstat & 1)
index = 1;
else
goto out;
if (agp3) {
index += 2;
if (index == 5)
index = 0;
}
out:
return agp_speeds[index];
}
static void pci_set_bus_speed(struct pci_bus *bus)
{
struct pci_dev *bridge = bus->self;
int pos;
pos = pci_find_capability(bridge, PCI_CAP_ID_AGP);
if (!pos)
pos = pci_find_capability(bridge, PCI_CAP_ID_AGP3);
if (pos) {
u32 agpstat, agpcmd;
pci_read_config_dword(bridge, pos + PCI_AGP_STATUS, &agpstat);
bus->max_bus_speed = agp_speed(agpstat & 8, agpstat & 7);
pci_read_config_dword(bridge, pos + PCI_AGP_COMMAND, &agpcmd);
bus->cur_bus_speed = agp_speed(agpstat & 8, agpcmd & 7);
}
pos = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
if (pos) {
u16 status;
enum pci_bus_speed max;
pci_read_config_word(bridge, pos + PCI_X_BRIDGE_SSTATUS,
&status);
if (status & PCI_X_SSTATUS_533MHZ) {
max = PCI_SPEED_133MHz_PCIX_533;
} else if (status & PCI_X_SSTATUS_266MHZ) {
max = PCI_SPEED_133MHz_PCIX_266;
} else if (status & PCI_X_SSTATUS_133MHZ) {
if ((status & PCI_X_SSTATUS_VERS) == PCI_X_SSTATUS_V2)
max = PCI_SPEED_133MHz_PCIX_ECC;
else
max = PCI_SPEED_133MHz_PCIX;
} else {
max = PCI_SPEED_66MHz_PCIX;
}
bus->max_bus_speed = max;
bus->cur_bus_speed = pcix_bus_speed[
(status & PCI_X_SSTATUS_FREQ) >> 6];
return;
}
if (pci_is_pcie(bridge)) {
u32 linkcap;
u16 linksta;
pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &linkcap);
bus->max_bus_speed = pcie_link_speed[linkcap & PCI_EXP_LNKCAP_SLS];
pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
pcie_update_link_speed(bus, linksta);
}
}
static struct irq_domain *pci_host_bridge_msi_domain(struct pci_bus *bus)
{
struct irq_domain *d;
/*
* Any firmware interface that can resolve the msi_domain
* should be called from here.
*/
d = pci_host_bridge_of_msi_domain(bus);
if (!d)
d = pci_host_bridge_acpi_msi_domain(bus);
#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
/*
* If no IRQ domain was found via the OF tree, try looking it up
* directly through the fwnode_handle.
*/
if (!d) {
struct fwnode_handle *fwnode = pci_root_bus_fwnode(bus);
if (fwnode)
d = irq_find_matching_fwnode(fwnode,
DOMAIN_BUS_PCI_MSI);
}
#endif
return d;
}
static void pci_set_bus_msi_domain(struct pci_bus *bus)
{
struct irq_domain *d;
struct pci_bus *b;
/*
* The bus can be a root bus, a subordinate bus, or a virtual bus
* created by an SR-IOV device. Walk up to the first bridge device
* found or derive the domain from the host bridge.
*/
for (b = bus, d = NULL; !d && !pci_is_root_bus(b); b = b->parent) {
if (b->self)
d = dev_get_msi_domain(&b->self->dev);
}
if (!d)
d = pci_host_bridge_msi_domain(b);
dev_set_msi_domain(&bus->dev, d);
}
static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
struct pci_dev *bridge, int busnr)
{
struct pci_bus *child;
int i;
int ret;
/*
* Allocate a new bus, and inherit stuff from the parent..
*/
child = pci_alloc_bus(parent);
if (!child)
return NULL;
child->parent = parent;
child->ops = parent->ops;
child->msi = parent->msi;
child->sysdata = parent->sysdata;
child->bus_flags = parent->bus_flags;
/* initialize some portions of the bus device, but don't register it
* now as the parent is not properly set up yet.
*/
child->dev.class = &pcibus_class;
dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr);
/*
* Set up the primary, secondary and subordinate
* bus numbers.
*/
child->number = child->busn_res.start = busnr;
child->primary = parent->busn_res.start;
child->busn_res.end = 0xff;
if (!bridge) {
child->dev.parent = parent->bridge;
goto add_dev;
}
child->self = bridge;
child->bridge = get_device(&bridge->dev);
child->dev.parent = child->bridge;
pci_set_bus_of_node(child);
pci_set_bus_speed(child);
/* Set up default resource pointers and names.. */
for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i];
child->resource[i]->name = child->name;
}
bridge->subordinate = child;
add_dev:
pci_set_bus_msi_domain(child);
ret = device_register(&child->dev);
WARN_ON(ret < 0);
pcibios_add_bus(child);
if (child->ops->add_bus) {
ret = child->ops->add_bus(child);
if (WARN_ON(ret < 0))
dev_err(&child->dev, "failed to add bus: %d\n", ret);
}
/* Create legacy_io and legacy_mem files for this bus */
pci_create_legacy_files(child);
return child;
}
struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev,
int busnr)
{
struct pci_bus *child;
child = pci_alloc_child_bus(parent, dev, busnr);
if (child) {
down_write(&pci_bus_sem);
list_add_tail(&child->node, &parent->children);
up_write(&pci_bus_sem);
}
return child;
}
EXPORT_SYMBOL(pci_add_new_bus);
PCI: Enable CRS Software Visibility for root port if it is supported Per PCIe r3.0, sec 2.3.2, an endpoint may respond to a Configuration Request with a Completion with Configuration Request Retry Status (CRS). This terminates the Configuration Request. When the CRS Software Visibility feature is disabled (as it is by default), a Root Complex must handle a CRS Completion by re-issuing the Configuration Request. This is invisible to software. From the CPU's point of view, an endpoint that always responds with CRS causes a hang because the Root Complex never supplies data to complete the CPU read. When CRS Software Visibility is enabled, a Root Complex that receives a CRS Completion for a read of the Vendor ID must return data of 0x0001. The Vendor ID of 0x0001 indicates to software that the endpoint is not ready. We now have more devices that require CRS Software Visibility. For example, a PLX 8713 NT bridge may respond with CRS until it has been configured via I2C, and the I2C configuration is completely independent of PCI enumeration. Enable CRS Software Visibility if it is supported. This allows a system with such a device to work (though the PCI core times out waiting for it to become ready, and we have to rescan the bus after it is ready). This essentially reverts ad7edfe04908 ("[PCI] Do not enable CRS Software Visibility by default"). The failures that led to ad7edfe04908 should be addressed by 89665a6a7140 ("PCI: Check only the Vendor ID to identify Configuration Request Retry"). [bhelgaas: changelog] Link: http://lkml.kernel.org/r/20071029061532.5d10dfc6@snowcone Link: http://lkml.kernel.org/r/alpine.LFD.0.9999.0712271023090.21557@woody.linux-foundation.org Signed-off-by: Rajat Jain <rajatxjain@gmail.com> Signed-off-by: Rajat Jain <rajatjain@juniper.net> Signed-off-by: Guenter Roeck <groeck@juniper.net> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2014-09-03 07:26:00 +08:00
static void pci_enable_crs(struct pci_dev *pdev)
{
u16 root_cap = 0;
/* Enable CRS Software Visibility if supported */
pcie_capability_read_word(pdev, PCI_EXP_RTCAP, &root_cap);
if (root_cap & PCI_EXP_RTCAP_CRSVIS)
pcie_capability_set_word(pdev, PCI_EXP_RTCTL,
PCI_EXP_RTCTL_CRSSVE);
}
/*
* If it's a bridge, configure it and scan the bus behind it.
* For CardBus bridges, we don't scan behind as the devices will
* be handled by the bridge driver itself.
*
* We need to process bridges in two passes -- first we scan those
* already configured by the BIOS and after we are done with all of
* them, we proceed to assigning numbers to the remaining buses in
* order to avoid overlaps between old and new bus numbers.
*/
int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
{
struct pci_bus *child;
int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS);
u32 buses, i, j = 0;
u16 bctl;
u8 primary, secondary, subordinate;
int broken = 0;
/*
* Make sure the bridge is powered on to be able to access config
* space of devices below it.
*/
pm_runtime_get_sync(&dev->dev);
pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
primary = buses & 0xFF;
secondary = (buses >> 8) & 0xFF;
subordinate = (buses >> 16) & 0xFF;
dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n",
secondary, subordinate, pass);
if (!primary && (primary != bus->number) && secondary && subordinate) {
dev_warn(&dev->dev, "Primary bus is hard wired to 0\n");
primary = bus->number;
}
/* Check if setup is sensible at all */
if (!pass &&
(primary != bus->number || secondary <= bus->number ||
Revert "PCI: Make sure bus number resources stay within their parents bounds" This reverts commit 1820ffdccb9b ("PCI: Make sure bus number resources stay within their parents bounds") because it breaks some systems with LSI Logic FC949ES Fibre Channel Adapters, apparently by exposing a defect in those adapters. Dirk tested a Tyan VX50 (B4985) with this device that worked like this prior to 1820ffdccb9b: bus: [bus 00-7f] on node 0 link 1 ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-07]) pci 0000:00:0e.0: PCI bridge to [bus 0a] pci_bus 0000:0a: busn_res: can not insert [bus 0a] under [bus 00-07] (conflicts with (null) [bus 00-07]) pci 0000:0a:00.0: [1000:0646] type 00 class 0x0c0400 (FC adapter) Note that the root bridge [bus 00-07] aperture is wrong; this is a BIOS defect in the PCI0 _CRS method. But prior to 1820ffdccb9b, we didn't enforce that aperture, and the FC adapter worked fine at 0a:00.0. After 1820ffdccb9b, we notice that 00:0e.0's aperture is not contained in the root bridge's aperture, so we reconfigure it so it *is* contained: pci 0000:00:0e.0: bridge configuration invalid ([bus 0a-0a]), reconfiguring pci 0000:00:0e.0: PCI bridge to [bus 06-07] This effectively moves the FC device from 0a:00.0 to 07:00.0, which should be legal. But when we enumerate bus 06, the FC device doesn't respond, so we don't find anything. This is probably a defect in the FC device. Possible fixes (due to Yinghai): 1) Add a quirk to fix the _CRS information based on what amd_bus.c read from the hardware 2) Reset the FC device after we change its bus number 3) Revert 1820ffdccb9b Fix 1 would be relatively easy, but it does sweep the LSI FC issue under the rug. We might want to reconfigure bus numbers in the future for some other reason, e.g., hotplug, and then we could trip over this again. For that reason, I like fix 2, but we don't know whether it actually works, and we don't have a patch for it yet. This revert is fix 3, which also sweeps the LSI FC issue under the rug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=84281 Reported-by: Dirk Gouders <dirk@gouders.net> Tested-by: Dirk Gouders <dirk@gouders.net> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.15+ CC: Yinghai Lu <yinghai@kernel.org>
2014-09-20 01:08:40 +08:00
secondary > subordinate)) {
dev_info(&dev->dev, "bridge configuration invalid ([bus %02x-%02x]), reconfiguring\n",
secondary, subordinate);
broken = 1;
}
/* Disable MasterAbortMode during probing to avoid reporting
of bus errors (in some architectures) */
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
PCI: Enable CRS Software Visibility for root port if it is supported Per PCIe r3.0, sec 2.3.2, an endpoint may respond to a Configuration Request with a Completion with Configuration Request Retry Status (CRS). This terminates the Configuration Request. When the CRS Software Visibility feature is disabled (as it is by default), a Root Complex must handle a CRS Completion by re-issuing the Configuration Request. This is invisible to software. From the CPU's point of view, an endpoint that always responds with CRS causes a hang because the Root Complex never supplies data to complete the CPU read. When CRS Software Visibility is enabled, a Root Complex that receives a CRS Completion for a read of the Vendor ID must return data of 0x0001. The Vendor ID of 0x0001 indicates to software that the endpoint is not ready. We now have more devices that require CRS Software Visibility. For example, a PLX 8713 NT bridge may respond with CRS until it has been configured via I2C, and the I2C configuration is completely independent of PCI enumeration. Enable CRS Software Visibility if it is supported. This allows a system with such a device to work (though the PCI core times out waiting for it to become ready, and we have to rescan the bus after it is ready). This essentially reverts ad7edfe04908 ("[PCI] Do not enable CRS Software Visibility by default"). The failures that led to ad7edfe04908 should be addressed by 89665a6a7140 ("PCI: Check only the Vendor ID to identify Configuration Request Retry"). [bhelgaas: changelog] Link: http://lkml.kernel.org/r/20071029061532.5d10dfc6@snowcone Link: http://lkml.kernel.org/r/alpine.LFD.0.9999.0712271023090.21557@woody.linux-foundation.org Signed-off-by: Rajat Jain <rajatxjain@gmail.com> Signed-off-by: Rajat Jain <rajatjain@juniper.net> Signed-off-by: Guenter Roeck <groeck@juniper.net> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2014-09-03 07:26:00 +08:00
pci_enable_crs(dev);
if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
!is_cardbus && !broken) {
unsigned int cmax;
/*
* Bus already configured by firmware, process it in the first
* pass and just note the configuration.
*/
if (pass)
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. > On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote: > > > > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first > > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and > > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off: > > > > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass) > > { > > ... > > /* Disable MasterAbortMode during probing to avoid reporting > > of bus errors (in some architectures) */ > > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, > > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); > > ... > > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) { > > unsigned int cmax, busnr; > > /* > > * Bus already configured by firmware, process it in the first > > * pass and just note the configuration. > > */ > > if (pass) > > return max; > > ... > > } > > > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); > > ... > > > > This doesn't seem intentional. Agreed, looks like an accident. The patch [1] originally came from Kip Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I recall it was supposed to fix an issue with with PCI aborts being signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when probing behind pci_scan_bridge. It is undeseriable to disable PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the behaviour wasn't considered a bug in need of a workaround, so this was put in probe.c. I don't have an affected system at hand, so can't really test but I propose something like the below patch. [1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195 [PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
goto out;
/*
* The bus might already exist for two reasons: Either we are
* rescanning the bus or the bus is reachable through more than
* one bridge. The second case can happen with the i450NX
* chipset.
*/
child = pci_find_bus(pci_domain_nr(bus), secondary);
if (!child) {
child = pci_add_new_bus(bus, dev, secondary);
if (!child)
goto out;
child->primary = primary;
pci_bus_insert_busn_res(child, secondary, subordinate);
child->bridge_ctl = bctl;
}
cmax = pci_scan_child_bus(child);
if (cmax > subordinate)
dev_warn(&dev->dev, "bridge has subordinate %02x but max busn %02x\n",
subordinate, cmax);
/* subordinate should equal child->busn_res.end */
if (subordinate > max)
max = subordinate;
} else {
/*
* We need to assign a number to this bus which we always
* do in the second pass.
*/
if (!pass) {
if (pcibios_assign_all_busses() || broken || is_cardbus)
/* Temporarily disable forwarding of the
configuration cycles on all bridges in
this bus segment to avoid possible
conflicts in the second pass between two
bridges programmed with overlapping
bus ranges. */
pci_write_config_dword(dev, PCI_PRIMARY_BUS,
buses & ~0xffffff);
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. > On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote: > > > > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first > > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and > > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off: > > > > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass) > > { > > ... > > /* Disable MasterAbortMode during probing to avoid reporting > > of bus errors (in some architectures) */ > > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, > > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); > > ... > > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) { > > unsigned int cmax, busnr; > > /* > > * Bus already configured by firmware, process it in the first > > * pass and just note the configuration. > > */ > > if (pass) > > return max; > > ... > > } > > > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); > > ... > > > > This doesn't seem intentional. Agreed, looks like an accident. The patch [1] originally came from Kip Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I recall it was supposed to fix an issue with with PCI aborts being signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when probing behind pci_scan_bridge. It is undeseriable to disable PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the behaviour wasn't considered a bug in need of a workaround, so this was put in probe.c. I don't have an affected system at hand, so can't really test but I propose something like the below patch. [1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195 [PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
goto out;
}
/* Clear errors */
pci_write_config_word(dev, PCI_STATUS, 0xffff);
Revert "PCI: Don't scan random busses in pci_scan_bridge()" This reverts commit fc1b253141b3 ("PCI: Don't scan random busses in pci_scan_bridge()") because it breaks CardBus on some machines. David tested a Dell Latitude D505 that worked like this prior to fc1b253141b3: pci 0000:00:1e.0: PCI bridge to [bus 01] pci 0000:01:01.0: CardBus bridge to [bus 02-05] Note that the 01:01.0 CardBus bridge has a bus number aperture of [bus 02-05], but those buses are all outside the 00:1e.0 PCI bridge bus number aperture, so accesses to buses 02-05 never reach CardBus. This is later patched up by yenta_fixup_parent_bridge(), which changes the subordinate bus number of the 00:1e.0 PCI bridge: pci_bus 0000:01: Raising subordinate bus# of parent bus (#01) from #01 to #05 With fc1b253141b3, pci_scan_bridge() fails immediately when it notices that we can't allocate a valid secondary bus number for the CardBus bridge, and CardBus doesn't work at all: pci 0000:01:01.0: can't allocate child bus 01 from [bus 01] I'd prefer to fix this by integrating the yenta_fixup_parent_bridge() logic into pci_scan_bridge() so we fix the bus number apertures up front. But I don't think we can do that before v3.17, so I'm going to revert this to avoid the problem while we're working on the long-term fix. Link: https://bugzilla.kernel.org/show_bug.cgi?id=83441 Link: http://lkml.kernel.org/r/1409303414-5196-1-git-send-email-david.henningsson@canonical.com Reported-by: David Henningsson <david.henningsson@canonical.com> Tested-by: David Henningsson <david.henningsson@canonical.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.15+
2014-09-20 00:56:06 +08:00
/* Prevent assigning a bus number that already exists.
* This can happen when a bridge is hot-plugged, so in
* this case we only re-scan this bus. */
child = pci_find_bus(pci_domain_nr(bus), max+1);
if (!child) {
child = pci_add_new_bus(bus, dev, max+1);
if (!child)
goto out;
Revert "PCI: Make sure bus number resources stay within their parents bounds" This reverts commit 1820ffdccb9b ("PCI: Make sure bus number resources stay within their parents bounds") because it breaks some systems with LSI Logic FC949ES Fibre Channel Adapters, apparently by exposing a defect in those adapters. Dirk tested a Tyan VX50 (B4985) with this device that worked like this prior to 1820ffdccb9b: bus: [bus 00-7f] on node 0 link 1 ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-07]) pci 0000:00:0e.0: PCI bridge to [bus 0a] pci_bus 0000:0a: busn_res: can not insert [bus 0a] under [bus 00-07] (conflicts with (null) [bus 00-07]) pci 0000:0a:00.0: [1000:0646] type 00 class 0x0c0400 (FC adapter) Note that the root bridge [bus 00-07] aperture is wrong; this is a BIOS defect in the PCI0 _CRS method. But prior to 1820ffdccb9b, we didn't enforce that aperture, and the FC adapter worked fine at 0a:00.0. After 1820ffdccb9b, we notice that 00:0e.0's aperture is not contained in the root bridge's aperture, so we reconfigure it so it *is* contained: pci 0000:00:0e.0: bridge configuration invalid ([bus 0a-0a]), reconfiguring pci 0000:00:0e.0: PCI bridge to [bus 06-07] This effectively moves the FC device from 0a:00.0 to 07:00.0, which should be legal. But when we enumerate bus 06, the FC device doesn't respond, so we don't find anything. This is probably a defect in the FC device. Possible fixes (due to Yinghai): 1) Add a quirk to fix the _CRS information based on what amd_bus.c read from the hardware 2) Reset the FC device after we change its bus number 3) Revert 1820ffdccb9b Fix 1 would be relatively easy, but it does sweep the LSI FC issue under the rug. We might want to reconfigure bus numbers in the future for some other reason, e.g., hotplug, and then we could trip over this again. For that reason, I like fix 2, but we don't know whether it actually works, and we don't have a patch for it yet. This revert is fix 3, which also sweeps the LSI FC issue under the rug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=84281 Reported-by: Dirk Gouders <dirk@gouders.net> Tested-by: Dirk Gouders <dirk@gouders.net> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> CC: stable@vger.kernel.org # v3.15+ CC: Yinghai Lu <yinghai@kernel.org>
2014-09-20 01:08:40 +08:00
pci_bus_insert_busn_res(child, max+1, 0xff);
}
max++;
buses = (buses & 0xff000000)
| ((unsigned int)(child->primary) << 0)
| ((unsigned int)(child->busn_res.start) << 8)
| ((unsigned int)(child->busn_res.end) << 16);
/*
* yenta.c forces a secondary latency timer of 176.
* Copy that behaviour here.
*/
if (is_cardbus) {
buses &= ~0xff000000;
buses |= CARDBUS_LATENCY_TIMER << 24;
}
/*
* We need to blast all three values with a single write.
*/
pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses);
if (!is_cardbus) {
child->bridge_ctl = bctl;
max = pci_scan_child_bus(child);
} else {
/*
* For CardBus bridges, we leave 4 bus numbers
* as cards with a PCI-to-PCI bridge can be
* inserted later.
*/
for (i = 0; i < CARDBUS_RESERVE_BUSNR; i++) {
struct pci_bus *parent = bus;
if (pci_find_bus(pci_domain_nr(bus),
max+i+1))
break;
while (parent->parent) {
if ((!pcibios_assign_all_busses()) &&
(parent->busn_res.end > max) &&
(parent->busn_res.end <= max+i)) {
j = 1;
}
parent = parent->parent;
}
if (j) {
/*
* Often, there are two cardbus bridges
* -- try to leave one valid bus number
* for each one.
*/
i /= 2;
break;
}
}
max += i;
}
/*
* Set the subordinate bus number to its real value.
*/
pci_bus_update_busn_res_end(child, max);
pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max);
}
sprintf(child->name,
(is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"),
pci_domain_nr(bus), child->number);
PCI: lets kill the 'PCI hidden behind bridge' message Adrian Bunk wrote: > Alois Nešpor wrote >> PCI: Bus #0b (-#0e) is hidden behind transparent bridge #0a (-#0b) (try 'pci=assign-busses') >> Please report the result to linux-kernel to fix this permanently" >> >> dmesg: >> "Yenta: Raising subordinate bus# of parent bus (#0a) from #0b to #0e" >> without pci=assign-busses and nothing with pci=assign-busses. > > Bernhard? Ok, lets kill the message. As Alois Nešpor also saw, that's fixed up by Yenta, so PCI does not have to warn about it. PCI could still warn about it if is_cardbus is 0 in that instance of pci_scan_bridge(), but so far I have not seen a report where this would have been the case so I think we can spare the kernel of that check (removes ~300 lines of asm) unless debugging is done. History: The whole check was added in the days before we had the fixup for this in Yenta and pci=assign-busses was the only way to get CardBus cards detected on many (not all) of the machines which give this warning. In theory, there could be cases when this warning would be triggered and it's not cardbus, then the warning should still apply, but I think this should only be the case when working on a completely broken PCI setup, but one may have already enabled the debug code in drivers/pci and the patched check would then trigger. I do not sign this off yet because it's completely untested so far, but everyone is free to test it (with the #ifdef DEBUG replaced by #if 1 and pr_debug( changed to printk(. We may also dump the whole check (remove everything within the #ifdef from the source) if that's perferred. On Alois Nešpor's machine this would then (only when debugging) this message: "PCI: Bus #0b (-#0e) is partially hidden behind transparent bridge #0a (-#0b)" "partially" should be in the message on his machine because #0b of #0b-#0e is reachable behind #0a-#0b, but not #0c-#0e. But that differentiation is now moot anyway because the fixup in Yenta takes care of it as far as I could see so far, which means that unless somebody is debugging a totally broken PCI setup, this message is not needed anymore, not even for debugging PCI. Ok, here the patch with the following changes: * Refined to say that the bus is only partially hidden when the parent bus numbers are not totally way off (outside of) the child bus range * remove the reference to pci=assign-busses and the plea to report it We could add a pure source code-only comment to keep a reference to pci=assign-busses the in case when this is triggered by someone who is debugging the cause of this message and looking the way to solve it. From: Bernhard Kaindl <bk@suse.de> Cc: stable <stable@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-07-31 02:35:13 +08:00
/* Has only triggered on CardBus, fixup is in yenta_socket */
while (bus->parent) {
if ((child->busn_res.end > bus->busn_res.end) ||
(child->number > bus->busn_res.end) ||
(child->number < bus->number) ||
(child->busn_res.end < bus->number)) {
dev_info(&child->dev, "%pR %s hidden behind%s bridge %s %pR\n",
&child->busn_res,
(bus->number > child->busn_res.end &&
bus->busn_res.end < child->number) ?
"wholly" : "partially",
bus->self->transparent ? " transparent" : "",
dev_name(&bus->dev),
&bus->busn_res);
}
bus = bus->parent;
}
[PATCH] PCI: Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. > On Mon, Feb 13, 2006 at 05:13:21PM -0800, David S. Miller wrote: > > > > In drivers/pci/probe.c:pci_scan_bridge(), if this is not the first > > pass (pass != 0) we don't restore the PCI_BRIDGE_CONTROL_REGISTER and > > thus leave PCI_BRIDGE_CTL_MASTER_ABORT off: > > > > int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass) > > { > > ... > > /* Disable MasterAbortMode during probing to avoid reporting > > of bus errors (in some architectures) */ > > pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, > > bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); > > ... > > if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus) { > > unsigned int cmax, busnr; > > /* > > * Bus already configured by firmware, process it in the first > > * pass and just note the configuration. > > */ > > if (pass) > > return max; > > ... > > } > > > > pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); > > ... > > > > This doesn't seem intentional. Agreed, looks like an accident. The patch [1] originally came from Kip Walker (Broadcom back then) between 2.6.0-test3 and 2.6.0-test4. As I recall it was supposed to fix an issue with with PCI aborts being signalled by the PCI bridge of the Broadcom BCM1250 family of SOCs when probing behind pci_scan_bridge. It is undeseriable to disable PCI_BRIDGE_CTL_MASTER_ABORT in pci_{read,write)_config_* and the behaviour wasn't considered a bug in need of a workaround, so this was put in probe.c. I don't have an affected system at hand, so can't really test but I propose something like the below patch. [1] http://www.linux-mips.org/git?p=linux.git;a=commit;h=599457e0cb702a31a3247ea6a5d9c6c99c4cf195 [PCI] Avoid leaving MASTER_ABORT disabled permanently when returning from pci_scan_bridge. Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-02-15 00:23:57 +08:00
out:
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
pm_runtime_put(&dev->dev);
return max;
}
EXPORT_SYMBOL(pci_scan_bridge);
/*
* Read interrupt line and base address registers.
* The architecture-dependent code can tweak these, of course.
*/
static void pci_read_irq(struct pci_dev *dev)
{
unsigned char irq;
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq);
dev->pin = irq;
if (irq)
pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
dev->irq = irq;
}
void set_pcie_port_type(struct pci_dev *pdev)
{
int pos;
u16 reg16;
PCI: Add dev->has_secondary_link to track downstream PCIe links A PCIe Port is an interface to a Link. A Root Port is a PCI-PCI bridge in a Root Complex and has a Link on its secondary (downstream) side. For other Ports, the Link may be on either the upstream (closer to the Root Complex) or downstream side of the Port. The usual topology has a Root Port connected to an Upstream Port. We previously assumed this was the only possible topology, and that a Downstream Port's Link was always on its downstream side, like this: +---------------------+ +------+ | Downstream | | Root | | Upstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ But systems do exist (see URL below) where the Root Port is connected to a Downstream Port. In this case, a Downstream Port's Link may be on either the upstream or downstream side: +---------------------+ +------+ | Upstream | | Root | | Downstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ We can't use the Port type to determine which side the Link is on, so add a bit in struct pci_dev to keep track. A Root Port's Link is always on the Port's secondary side. A component (Endpoint or Port) on the other end of the Link obviously has the Link on its upstream side. If that component is a Port, it is part of a Switch or a Bridge. A Bridge has a PCI or PCI-X bus on its secondary side, not a Link. The internal bus of a Switch connects the Port to another Port whose Link is on the downstream side. [bhelgaas: changelog, comment, cache "type", use if/else] Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Suggested-by: Bjorn Helgaas <bhelgaas@google.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 15:05:02 +08:00
int type;
struct pci_dev *parent;
pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
if (!pos)
return;
pdev->pcie_cap = pos;
pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
pdev->pcie_flags_reg = reg16;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
PCI: Add dev->has_secondary_link to track downstream PCIe links A PCIe Port is an interface to a Link. A Root Port is a PCI-PCI bridge in a Root Complex and has a Link on its secondary (downstream) side. For other Ports, the Link may be on either the upstream (closer to the Root Complex) or downstream side of the Port. The usual topology has a Root Port connected to an Upstream Port. We previously assumed this was the only possible topology, and that a Downstream Port's Link was always on its downstream side, like this: +---------------------+ +------+ | Downstream | | Root | | Upstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ But systems do exist (see URL below) where the Root Port is connected to a Downstream Port. In this case, a Downstream Port's Link may be on either the upstream or downstream side: +---------------------+ +------+ | Upstream | | Root | | Downstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ We can't use the Port type to determine which side the Link is on, so add a bit in struct pci_dev to keep track. A Root Port's Link is always on the Port's secondary side. A component (Endpoint or Port) on the other end of the Link obviously has the Link on its upstream side. If that component is a Port, it is part of a Switch or a Bridge. A Bridge has a PCI or PCI-X bus on its secondary side, not a Link. The internal bus of a Switch connects the Port to another Port whose Link is on the downstream side. [bhelgaas: changelog, comment, cache "type", use if/else] Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Suggested-by: Bjorn Helgaas <bhelgaas@google.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 15:05:02 +08:00
/*
* A Root Port is always the upstream end of a Link. No PCIe
* component has two Links. Two Links are connected by a Switch
* that has a Port on each Link and internal logic to connect the
* two Ports.
*/
type = pci_pcie_type(pdev);
if (type == PCI_EXP_TYPE_ROOT_PORT)
pdev->has_secondary_link = 1;
else if (type == PCI_EXP_TYPE_UPSTREAM ||
type == PCI_EXP_TYPE_DOWNSTREAM) {
parent = pci_upstream_bridge(pdev);
PCI: Tolerate hierarchies with no Root Port We should not assume any particular hardware topology. Commit d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track downstream PCIe links") relied on the assumption that every PCIe hierarchy is rooted at a Root Port. But we can't rely on any assumption about what hardware we will find; we just have to deal with the world as it is. On some platforms, PCIe devices (endpoints, switch upstream ports, etc.) appear directly on the root bus, and there is no Root Port in the PCI bus hierarchy. For example, Meelis observed these top-level devices on a Sparc V245: 0000:02:00.0 PCI bridge to [bus 03-0d] Switch Upstream Port 0001:02:00.0 PCI bridge to [bus 03] PCIe to PCI/PCI-X Bridge These devices *look* like they have links going upstream, but there really are no upstream devices. In set_pcie_port_type(), we used the parent device to figure out which side of a switch port has a link, so if the parent device did not exist, we dereferenced a NULL parent pointer. Check whether the parent device exists before dereferencing it. Meelis observed this oops on Sparc V245 and T2000. Ben Herrenschmidt says this is also possible on IBM PowerVM guests on PowerPC. [bhelgaas: changelog, comment] Link: http://lkml.kernel.org/r/alpine.LRH.2.20.1508122118210.18637@math.ut.ee Reported-by: Meelis Roos <mroos@linux.ee> Tested-by: Meelis Roos <mroos@linux.ee> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: David S. Miller <davem@davemloft.net>
2015-08-17 18:47:58 +08:00
/*
* Usually there's an upstream device (Root Port or Switch
* Downstream Port), but we can't assume one exists.
*/
if (parent && !parent->has_secondary_link)
PCI: Add dev->has_secondary_link to track downstream PCIe links A PCIe Port is an interface to a Link. A Root Port is a PCI-PCI bridge in a Root Complex and has a Link on its secondary (downstream) side. For other Ports, the Link may be on either the upstream (closer to the Root Complex) or downstream side of the Port. The usual topology has a Root Port connected to an Upstream Port. We previously assumed this was the only possible topology, and that a Downstream Port's Link was always on its downstream side, like this: +---------------------+ +------+ | Downstream | | Root | | Upstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ But systems do exist (see URL below) where the Root Port is connected to a Downstream Port. In this case, a Downstream Port's Link may be on either the upstream or downstream side: +---------------------+ +------+ | Upstream | | Root | | Downstream Port +--Link-- | Port +--Link--+ Port | +------+ | Downstream | | Port +--Link-- +---------------------+ We can't use the Port type to determine which side the Link is on, so add a bit in struct pci_dev to keep track. A Root Port's Link is always on the Port's secondary side. A component (Endpoint or Port) on the other end of the Link obviously has the Link on its upstream side. If that component is a Port, it is part of a Switch or a Bridge. A Bridge has a PCI or PCI-X bus on its secondary side, not a Link. The internal bus of a Switch connects the Port to another Port whose Link is on the downstream side. [bhelgaas: changelog, comment, cache "type", use if/else] Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361 Suggested-by: Bjorn Helgaas <bhelgaas@google.com> Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2015-05-21 15:05:02 +08:00
pdev->has_secondary_link = 1;
}
}
void set_pcie_hotplug_bridge(struct pci_dev *pdev)
{
u32 reg32;
pcie_capability_read_dword(pdev, PCI_EXP_SLTCAP, &reg32);
if (reg32 & PCI_EXP_SLTCAP_HPC)
pdev->is_hotplug_bridge = 1;
}
/**
* pci_ext_cfg_is_aliased - is ext config space just an alias of std config?
* @dev: PCI device
*
* PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says that
* when forwarding a type1 configuration request the bridge must check that
* the extended register address field is zero. The bridge is not permitted
* to forward the transactions and must handle it as an Unsupported Request.
* Some bridges do not follow this rule and simply drop the extended register
* bits, resulting in the standard config space being aliased, every 256
* bytes across the entire configuration space. Test for this condition by
* comparing the first dword of each potential alias to the vendor/device ID.
* Known offenders:
* ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03)
* AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40)
*/
static bool pci_ext_cfg_is_aliased(struct pci_dev *dev)
{
#ifdef CONFIG_PCI_QUIRKS
int pos;
u32 header, tmp;
pci_read_config_dword(dev, PCI_VENDOR_ID, &header);
for (pos = PCI_CFG_SPACE_SIZE;
pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE) {
if (pci_read_config_dword(dev, pos, &tmp) != PCIBIOS_SUCCESSFUL
|| header != tmp)
return false;
}
return true;
#else
return false;
#endif
}
/**
* pci_cfg_space_size - get the configuration space size of the PCI device.
* @dev: PCI device
*
* Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices
* have 4096 bytes. Even if the device is capable, that doesn't mean we can
* access it. Maybe we don't have a way to generate extended config space
* accesses, or the device is behind a reverse Express bridge. So we try
* reading the dword at 0x100 which must either be 0 or a valid extended
* capability header.
*/
static int pci_cfg_space_size_ext(struct pci_dev *dev)
{
u32 status;
int pos = PCI_CFG_SPACE_SIZE;
if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL)
return PCI_CFG_SPACE_SIZE;
if (status == 0xffffffff || pci_ext_cfg_is_aliased(dev))
return PCI_CFG_SPACE_SIZE;
return PCI_CFG_SPACE_EXP_SIZE;
}
int pci_cfg_space_size(struct pci_dev *dev)
{
int pos;
u32 status;
u16 class;
class = dev->class >> 8;
if (class == PCI_CLASS_BRIDGE_HOST)
return pci_cfg_space_size_ext(dev);
if (pci_is_pcie(dev))
return pci_cfg_space_size_ext(dev);
pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!pos)
return PCI_CFG_SPACE_SIZE;
pci_read_config_dword(dev, pos + PCI_X_STATUS, &status);
if (status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ))
return pci_cfg_space_size_ext(dev);
return PCI_CFG_SPACE_SIZE;
}
#define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED)
static void pci_msi_setup_pci_dev(struct pci_dev *dev)
{
/*
* Disable the MSI hardware to avoid screaming interrupts
* during boot. This is the power on reset default so
* usually this should be a noop.
*/
dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
if (dev->msi_cap)
pci_msi_set_enable(dev, 0);
dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
if (dev->msix_cap)
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
}
/**
* pci_setup_device - fill in class and map information of a device
* @dev: the device structure to fill
*
* Initialize the device structure with information about the device's
* vendor,class,memory and IO-space addresses,IRQ lines etc.
* Called at initialisation of the PCI subsystem and by CardBus services.
* Returns 0 on success and negative if unknown type of device (not normal,
* bridge or CardBus).
*/
int pci_setup_device(struct pci_dev *dev)
{
u32 class;
u16 cmd;
u8 hdr_type;
int pos = 0;
struct pci_bus_region region;
struct resource *res;
if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
return -EIO;
dev->sysdata = dev->bus->sysdata;
dev->dev.parent = dev->bus->bridge;
dev->dev.bus = &pci_bus_type;
dev->hdr_type = hdr_type & 0x7f;
dev->multifunction = !!(hdr_type & 0x80);
dev->error_state = pci_channel_io_normal;
set_pcie_port_type(dev);
pci_dev_assign_slot(dev);
/* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
set this higher, assuming the system even supports it. */
dev->dma_mask = 0xffffffff;
dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus),
dev->bus->number, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
dev->revision = class & 0xff;
dev->class = class >> 8; /* upper 3 bytes */
dev_printk(KERN_DEBUG, &dev->dev, "[%04x:%04x] type %02x class %#08x\n",
dev->vendor, dev->device, dev->hdr_type, dev->class);
/* need to have dev->class ready */
dev->cfg_size = pci_cfg_space_size(dev);
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;
/* Early fixups, before probing the BARs */
pci_fixup_device(pci_fixup_early, dev);
/* device class may be changed after fixup */
class = dev->class >> 8;
if (dev->non_compliant_bars) {
pci_read_config_word(dev, PCI_COMMAND, &cmd);
if (cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
dev_info(&dev->dev, "device has non-compliant BARs; disabling IO/MEM decoding\n");
cmd &= ~PCI_COMMAND_IO;
cmd &= ~PCI_COMMAND_MEMORY;
pci_write_config_word(dev, PCI_COMMAND, cmd);
}
}
switch (dev->hdr_type) { /* header type */
case PCI_HEADER_TYPE_NORMAL: /* standard header */
if (class == PCI_CLASS_BRIDGE_PCI)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
/*
* Do the ugly legacy mode stuff here rather than broken chip
* quirk code. Legacy mode ATA controllers have fixed
* addresses. These are not always echoed in BAR0-3, and
* BAR0-3 in a few cases contain junk!
*/
if (class == PCI_CLASS_STORAGE_IDE) {
u8 progif;
pci_read_config_byte(dev, PCI_CLASS_PROG, &progif);
if ((progif & 1) == 0) {
region.start = 0x1F0;
region.end = 0x1F7;
res = &dev->resource[0];
res->flags = LEGACY_IO_RESOURCE;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_info(&dev->dev, "legacy IDE quirk: reg 0x10: %pR\n",
res);
region.start = 0x3F6;
region.end = 0x3F6;
res = &dev->resource[1];
res->flags = LEGACY_IO_RESOURCE;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_info(&dev->dev, "legacy IDE quirk: reg 0x14: %pR\n",
res);
}
if ((progif & 4) == 0) {
region.start = 0x170;
region.end = 0x177;
res = &dev->resource[2];
res->flags = LEGACY_IO_RESOURCE;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_info(&dev->dev, "legacy IDE quirk: reg 0x18: %pR\n",
res);
region.start = 0x376;
region.end = 0x376;
res = &dev->resource[3];
res->flags = LEGACY_IO_RESOURCE;
pcibios_bus_to_resource(dev->bus, res, &region);
dev_info(&dev->dev, "legacy IDE quirk: reg 0x1c: %pR\n",
res);
}
}
break;
case PCI_HEADER_TYPE_BRIDGE: /* bridge header */
if (class != PCI_CLASS_BRIDGE_PCI)
goto bad;
/* The PCI-to-PCI bridge spec requires that subtractive
decoding (i.e. transparent) bridge must have programming
interface code of 0x01. */
pci_read_irq(dev);
dev->transparent = ((dev->class & 0xff) == 1);
pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
set_pcie_hotplug_bridge(dev);
pos = pci_find_capability(dev, PCI_CAP_ID_SSVID);
if (pos) {
pci_read_config_word(dev, pos + PCI_SSVID_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, pos + PCI_SSVID_DEVICE_ID, &dev->subsystem_device);
}
break;
case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */
if (class != PCI_CLASS_BRIDGE_CARDBUS)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 1, 0);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device);
break;
default: /* unknown header */
dev_err(&dev->dev, "unknown header type %02x, ignoring device\n",
dev->hdr_type);
return -EIO;
bad:
dev_err(&dev->dev, "ignoring class %#08x (doesn't match header type %02x)\n",
dev->class, dev->hdr_type);
dev->class = PCI_CLASS_NOT_DEFINED << 8;
}
/* We found a fine healthy device, go go go... */
return 0;
}
static void pci_configure_mps(struct pci_dev *dev)
{
struct pci_dev *bridge = pci_upstream_bridge(dev);
int mps, p_mps, rc;
if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge))
return;
mps = pcie_get_mps(dev);
p_mps = pcie_get_mps(bridge);
if (mps == p_mps)
return;
if (pcie_bus_config == PCIE_BUS_TUNE_OFF) {
dev_warn(&dev->dev, "Max Payload Size %d, but upstream %s set to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n",
mps, pci_name(bridge), p_mps);
return;
}
/*
* Fancier MPS configuration is done later by
* pcie_bus_configure_settings()
*/
if (pcie_bus_config != PCIE_BUS_DEFAULT)
return;
rc = pcie_set_mps(dev, p_mps);
if (rc) {
dev_warn(&dev->dev, "can't set Max Payload Size to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n",
p_mps);
return;
}
dev_info(&dev->dev, "Max Payload Size set to %d (was %d, max %d)\n",
p_mps, mps, 128 << dev->pcie_mpss);
}
static struct hpp_type0 pci_default_type0 = {
.revision = 1,
.cache_line_size = 8,
.latency_timer = 0x40,
.enable_serr = 0,
.enable_perr = 0,
};
static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp)
{
u16 pci_cmd, pci_bctl;
if (!hpp)
hpp = &pci_default_type0;
if (hpp->revision > 1) {
dev_warn(&dev->dev,
"PCI settings rev %d not supported; using defaults\n",
hpp->revision);
hpp = &pci_default_type0;
}
pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size);
pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer);
pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
if (hpp->enable_serr)
pci_cmd |= PCI_COMMAND_SERR;
if (hpp->enable_perr)
pci_cmd |= PCI_COMMAND_PARITY;
pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
/* Program bridge control value */
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER,
hpp->latency_timer);
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl);
if (hpp->enable_serr)
pci_bctl |= PCI_BRIDGE_CTL_SERR;
if (hpp->enable_perr)
pci_bctl |= PCI_BRIDGE_CTL_PARITY;
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl);
}
}
static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp)
{
if (hpp)
dev_warn(&dev->dev, "PCI-X settings not supported\n");
}
PCI: Set Read Completion Boundary to 128 iff Root Port supports it (_HPX) Per PCIe spec r3.0, sec 2.3.1.1, the Read Completion Boundary (RCB) determines the naturally aligned address boundaries on which a Read Request may be serviced with multiple Completions: - For a Root Complex, RCB is 64 bytes or 128 bytes This value is reported in the Link Control Register Note: Bridges and Endpoints may implement a corresponding command bit which may be set by system software to indicate the RCB value for the Root Complex, allowing the Bridge/Endpoint to optimize its behavior when the Root Complex’s RCB is 128 bytes. - For all other system elements, RCB is 128 bytes Per sec 7.8.7, if a Root Port only supports a 64-byte RCB, the RCB of all downstream devices must be clear, indicating an RCB of 64 bytes. If the Root Port supports a 128-byte RCB, we may optionally set the RCB of downstream devices so they know they can generate larger Completions. Some BIOSes supply an _HPX that tells us to set RCB, even though the Root Port doesn't have RCB set, which may lead to Malformed TLP errors if the Endpoint generates completions larger than the Root Port can handle. The IBM x3850 X6 with BIOS version -[A8E120CUS-1.30]- 08/22/2016 supplies such an _HPX and a Mellanox MT27500 ConnectX-3 device fails to initialize: mlx4_core 0000:41:00.0: command 0xfff timed out (go bit not cleared) mlx4_core 0000:41:00.0: device is going to be reset mlx4_core 0000:41:00.0: Failed to obtain HW semaphore, aborting mlx4_core 0000:41:00.0: Fail to reset HCA ------------[ cut here ]------------ kernel BUG at drivers/net/ethernet/mellanox/mlx4/catas.c:193! After 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") and 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link"), we apply _HPX settings to *all* devices, not just those hot-added after boot. Before 7a1562d4f2d0, we didn't touch the Mellanox RCB, and the device worked. After 7a1562d4f2d0, we set its RCB to 128, and it failed. Set the RCB to 128 iff the Root Port supports a 128-byte RCB. Otherwise, set RCB to 64 bytes. This effectively ignores what _HPX tells us about RCB. Note that this change only affects _HPX handling. If we have no _HPX, this does nothing with RCB. [bhelgaas: changelog, clear RCB if not set for Root Port] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Fixes: 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link") Link: https://bugzilla.kernel.org/show_bug.cgi?id=187781 Tested-by: Frank Danapfel <fdanapfe@redhat.com> Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Myron Stowe <myron.stowe@redhat.com> CC: stable@vger.kernel.org # v3.18+
2016-11-24 00:56:28 +08:00
static bool pcie_root_rcb_set(struct pci_dev *dev)
{
struct pci_dev *rp = pcie_find_root_port(dev);
u16 lnkctl;
if (!rp)
return false;
pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl);
if (lnkctl & PCI_EXP_LNKCTL_RCB)
return true;
return false;
}
static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
{
int pos;
u32 reg32;
if (!hpp)
return;
if (hpp->revision > 1) {
dev_warn(&dev->dev, "PCIe settings rev %d not supported\n",
hpp->revision);
return;
}
/*
* Don't allow _HPX to change MPS or MRRS settings. We manage
* those to make sure they're consistent with the rest of the
* platform.
*/
hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD |
PCI_EXP_DEVCTL_READRQ;
hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD |
PCI_EXP_DEVCTL_READRQ);
/* Initialize Device Control Register */
pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or);
/* Initialize Link Control Register */
PCI: Set Read Completion Boundary to 128 iff Root Port supports it (_HPX) Per PCIe spec r3.0, sec 2.3.1.1, the Read Completion Boundary (RCB) determines the naturally aligned address boundaries on which a Read Request may be serviced with multiple Completions: - For a Root Complex, RCB is 64 bytes or 128 bytes This value is reported in the Link Control Register Note: Bridges and Endpoints may implement a corresponding command bit which may be set by system software to indicate the RCB value for the Root Complex, allowing the Bridge/Endpoint to optimize its behavior when the Root Complex’s RCB is 128 bytes. - For all other system elements, RCB is 128 bytes Per sec 7.8.7, if a Root Port only supports a 64-byte RCB, the RCB of all downstream devices must be clear, indicating an RCB of 64 bytes. If the Root Port supports a 128-byte RCB, we may optionally set the RCB of downstream devices so they know they can generate larger Completions. Some BIOSes supply an _HPX that tells us to set RCB, even though the Root Port doesn't have RCB set, which may lead to Malformed TLP errors if the Endpoint generates completions larger than the Root Port can handle. The IBM x3850 X6 with BIOS version -[A8E120CUS-1.30]- 08/22/2016 supplies such an _HPX and a Mellanox MT27500 ConnectX-3 device fails to initialize: mlx4_core 0000:41:00.0: command 0xfff timed out (go bit not cleared) mlx4_core 0000:41:00.0: device is going to be reset mlx4_core 0000:41:00.0: Failed to obtain HW semaphore, aborting mlx4_core 0000:41:00.0: Fail to reset HCA ------------[ cut here ]------------ kernel BUG at drivers/net/ethernet/mellanox/mlx4/catas.c:193! After 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") and 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link"), we apply _HPX settings to *all* devices, not just those hot-added after boot. Before 7a1562d4f2d0, we didn't touch the Mellanox RCB, and the device worked. After 7a1562d4f2d0, we set its RCB to 128, and it failed. Set the RCB to 128 iff the Root Port supports a 128-byte RCB. Otherwise, set RCB to 64 bytes. This effectively ignores what _HPX tells us about RCB. Note that this change only affects _HPX handling. If we have no _HPX, this does nothing with RCB. [bhelgaas: changelog, clear RCB if not set for Root Port] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Fixes: 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link") Link: https://bugzilla.kernel.org/show_bug.cgi?id=187781 Tested-by: Frank Danapfel <fdanapfe@redhat.com> Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Myron Stowe <myron.stowe@redhat.com> CC: stable@vger.kernel.org # v3.18+
2016-11-24 00:56:28 +08:00
if (pcie_cap_has_lnkctl(dev)) {
/*
* If the Root Port supports Read Completion Boundary of
* 128, set RCB to 128. Otherwise, clear it.
*/
hpp->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB;
hpp->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB;
if (pcie_root_rcb_set(dev))
hpp->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB;
pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL,
~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or);
PCI: Set Read Completion Boundary to 128 iff Root Port supports it (_HPX) Per PCIe spec r3.0, sec 2.3.1.1, the Read Completion Boundary (RCB) determines the naturally aligned address boundaries on which a Read Request may be serviced with multiple Completions: - For a Root Complex, RCB is 64 bytes or 128 bytes This value is reported in the Link Control Register Note: Bridges and Endpoints may implement a corresponding command bit which may be set by system software to indicate the RCB value for the Root Complex, allowing the Bridge/Endpoint to optimize its behavior when the Root Complex’s RCB is 128 bytes. - For all other system elements, RCB is 128 bytes Per sec 7.8.7, if a Root Port only supports a 64-byte RCB, the RCB of all downstream devices must be clear, indicating an RCB of 64 bytes. If the Root Port supports a 128-byte RCB, we may optionally set the RCB of downstream devices so they know they can generate larger Completions. Some BIOSes supply an _HPX that tells us to set RCB, even though the Root Port doesn't have RCB set, which may lead to Malformed TLP errors if the Endpoint generates completions larger than the Root Port can handle. The IBM x3850 X6 with BIOS version -[A8E120CUS-1.30]- 08/22/2016 supplies such an _HPX and a Mellanox MT27500 ConnectX-3 device fails to initialize: mlx4_core 0000:41:00.0: command 0xfff timed out (go bit not cleared) mlx4_core 0000:41:00.0: device is going to be reset mlx4_core 0000:41:00.0: Failed to obtain HW semaphore, aborting mlx4_core 0000:41:00.0: Fail to reset HCA ------------[ cut here ]------------ kernel BUG at drivers/net/ethernet/mellanox/mlx4/catas.c:193! After 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") and 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link"), we apply _HPX settings to *all* devices, not just those hot-added after boot. Before 7a1562d4f2d0, we didn't touch the Mellanox RCB, and the device worked. After 7a1562d4f2d0, we set its RCB to 128, and it failed. Set the RCB to 128 iff the Root Port supports a 128-byte RCB. Otherwise, set RCB to 64 bytes. This effectively ignores what _HPX tells us about RCB. Note that this change only affects _HPX handling. If we have no _HPX, this does nothing with RCB. [bhelgaas: changelog, clear RCB if not set for Root Port] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Fixes: 7a1562d4f2d0 ("PCI: Apply _HPX Link Control settings to all devices with a link") Link: https://bugzilla.kernel.org/show_bug.cgi?id=187781 Tested-by: Frank Danapfel <fdanapfe@redhat.com> Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Myron Stowe <myron.stowe@redhat.com> CC: stable@vger.kernel.org # v3.18+
2016-11-24 00:56:28 +08:00
}
/* Find Advanced Error Reporting Enhanced Capability */
pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
if (!pos)
return;
/* Initialize Uncorrectable Error Mask Register */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or;
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
/* Initialize Uncorrectable Error Severity Register */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or;
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
/* Initialize Correctable Error Mask Register */
pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or;
pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
/* Initialize Advanced Error Capabilities and Control Register */
pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or;
pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
/*
* FIXME: The following two registers are not supported yet.
*
* o Secondary Uncorrectable Error Severity Register
* o Secondary Uncorrectable Error Mask Register
*/
}
static void pci_configure_device(struct pci_dev *dev)
{
struct hotplug_params hpp;
int ret;
pci_configure_mps(dev);
memset(&hpp, 0, sizeof(hpp));
ret = pci_get_hp_params(dev, &hpp);
if (ret)
return;
program_hpp_type2(dev, hpp.t2);
program_hpp_type1(dev, hpp.t1);
program_hpp_type0(dev, hpp.t0);
}
static void pci_release_capabilities(struct pci_dev *dev)
{
pci_vpd_release(dev);
pci_iov_release(dev);
pci_free_cap_save_buffers(dev);
}
/**
* pci_release_dev - free a pci device structure when all users of it are finished.
* @dev: device that's been disconnected
*
* Will be called only by the device core when all users of this pci device are
* done.
*/
static void pci_release_dev(struct device *dev)
{
struct pci_dev *pci_dev;
pci_dev = to_pci_dev(dev);
pci_release_capabilities(pci_dev);
pci_release_of_node(pci_dev);
pcibios_release_device(pci_dev);
pci_bus_put(pci_dev->bus);
PCI: Introduce new device binding path using pci_dev.driver_override The driver_override field allows us to specify the driver for a device rather than relying on the driver to provide a positive match of the device. This shortcuts the existing process of looking up the vendor and device ID, adding them to the driver new_id, binding the device, then removing the ID, but it also provides a couple advantages. First, the above existing process allows the driver to bind to any device matching the new_id for the window where it's enabled. This is often not desired, such as the case of trying to bind a single device to a meta driver like pci-stub or vfio-pci. Using driver_override we can do this deterministically using: echo pci-stub > /sys/bus/pci/devices/0000:03:00.0/driver_override echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind echo 0000:03:00.0 > /sys/bus/pci/drivers_probe Previously we could not invoke drivers_probe after adding a device to new_id for a driver as we get non-deterministic behavior whether the driver we intend or the standard driver will claim the device. Now it becomes a deterministic process, only the driver matching driver_override will probe the device. To return the device to the standard driver, we simply clear the driver_override and reprobe the device: echo > /sys/bus/pci/devices/0000:03:00.0/driver_override echo 0000:03:00.0 > /sys/bus/pci/devices/0000:03:00.0/driver/unbind echo 0000:03:00.0 > /sys/bus/pci/drivers_probe Another advantage to this approach is that we can specify a driver override to force a specific binding or prevent any binding. For instance when an IOMMU group is exposed to userspace through VFIO we require that all devices within that group are owned by VFIO. However, devices can be hot-added into an IOMMU group, in which case we want to prevent the device from binding to any driver (override driver = "none") or perhaps have it automatically bind to vfio-pci. With driver_override it's a simple matter for this field to be set internally when the device is first discovered to prevent driver matches. Signed-off-by: Alex Williamson <alex.williamson@redhat.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Reviewed-by: Alexander Graf <agraf@suse.de> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-05-20 22:53:21 +08:00
kfree(pci_dev->driver_override);
PCI: Add support for multiple DMA aliases Solve IOMMU support issues with PCIe non-transparent bridges that use Requester ID look-up tables (RID-LUT), e.g., the PEX8733. The NTB connects devices in two independent PCI domains. Devices separated by the NTB are not able to discover each other. A PCI packet being forwared from one domain to another has to have its RID modified so it appears on correct bus and completions are forwarded back to the original domain through the NTB. The RID is translated using a preprogrammed table (LUT) and the PCI packet propagates upstream away from the NTB. If the destination system has IOMMU enabled, the packet will be discarded because the new RID is unknown to the IOMMU. Adding a DMA alias for the new RID allows IOMMU to properly recognize the packet. Each device behind the NTB has a unique RID assigned in the RID-LUT. The current DMA alias implementation supports only a single alias, so it's not possible to support mutiple devices behind the NTB when IOMMU is enabled. Enable all possible aliases on a given bus (256) that are stored in a bitset. Alias devfn is directly translated to a bit number. The bitset is not allocated for devices that have no need for DMA aliases. More details can be found in the following article: http://www.plxtech.com/files/pdf/technical/expresslane/RTC_Enabling%20MulitHostSystemDesigns.pdf Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@intel.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Reviewed-by: Alex Williamson <alex.williamson@redhat.com> Acked-by: David Woodhouse <David.Woodhouse@intel.com> Acked-by: Joerg Roedel <jroedel@suse.de>
2016-03-03 22:38:02 +08:00
kfree(pci_dev->dma_alias_mask);
kfree(pci_dev);
}
struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
{
struct pci_dev *dev;
dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL);
if (!dev)
return NULL;
INIT_LIST_HEAD(&dev->bus_list);
dev->dev.type = &pci_dev_type;
dev->bus = pci_bus_get(bus);
return dev;
}
EXPORT_SYMBOL(pci_alloc_dev);
bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l,
int crs_timeout)
{
int delay = 1;
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l))
return false;
/* some broken boards return 0 or ~0 if a slot is empty: */
if (*l == 0xffffffff || *l == 0x00000000 ||
*l == 0x0000ffff || *l == 0xffff0000)
return false;
/*
* Configuration Request Retry Status. Some root ports return the
* actual device ID instead of the synthetic ID (0xFFFF) required
* by the PCIe spec. Ignore the device ID and only check for
* (vendor id == 1).
*/
while ((*l & 0xffff) == 0x0001) {
if (!crs_timeout)
return false;
msleep(delay);
delay *= 2;
if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l))
return false;
/* Card hasn't responded in 60 seconds? Must be stuck. */
if (delay > crs_timeout) {
printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not responding\n",
pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
PCI_FUNC(devfn));
return false;
}
}
return true;
}
EXPORT_SYMBOL(pci_bus_read_dev_vendor_id);
/*
* Read the config data for a PCI device, sanity-check it
* and fill in the dev structure...
*/
static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn)
{
struct pci_dev *dev;
u32 l;
if (!pci_bus_read_dev_vendor_id(bus, devfn, &l, 60*1000))
return NULL;
dev = pci_alloc_dev(bus);
if (!dev)
return NULL;
dev->devfn = devfn;
dev->vendor = l & 0xffff;
dev->device = (l >> 16) & 0xffff;
pci_set_of_node(dev);
if (pci_setup_device(dev)) {
pci_bus_put(dev->bus);
kfree(dev);
return NULL;
}
return dev;
}
static void pci_init_capabilities(struct pci_dev *dev)
{
/* Enhanced Allocation */
pci_ea_init(dev);
/* Setup MSI caps & disable MSI/MSI-X interrupts */
pci_msi_setup_pci_dev(dev);
/* Buffers for saving PCIe and PCI-X capabilities */
pci_allocate_cap_save_buffers(dev);
/* Power Management */
pci_pm_init(dev);
/* Vital Product Data */
pci_vpd_init(dev);
/* Alternative Routing-ID Forwarding */
pci_configure_ari(dev);
/* Single Root I/O Virtualization */
pci_iov_init(dev);
2015-07-18 04:05:46 +08:00
/* Address Translation Services */
pci_ats_init(dev);
/* Enable ACS P2P upstream forwarding */
pci_enable_acs(dev);
/* Precision Time Measurement */
pci_ptm_init(dev);
/* Advanced Error Reporting */
pci_aer_init(dev);
}
/*
* This is the equivalent of pci_host_bridge_msi_domain that acts on
* devices. Firmware interfaces that can select the MSI domain on a
* per-device basis should be called from here.
*/
static struct irq_domain *pci_dev_msi_domain(struct pci_dev *dev)
{
struct irq_domain *d;
/*
* If a domain has been set through the pcibios_add_device
* callback, then this is the one (platform code knows best).
*/
d = dev_get_msi_domain(&dev->dev);
if (d)
return d;
/*
* Let's see if we have a firmware interface able to provide
* the domain.
*/
d = pci_msi_get_device_domain(dev);
if (d)
return d;
return NULL;
}
static void pci_set_msi_domain(struct pci_dev *dev)
{
struct irq_domain *d;
/*
* If the platform or firmware interfaces cannot supply a
* device-specific MSI domain, then inherit the default domain
* from the host bridge itself.
*/
d = pci_dev_msi_domain(dev);
if (!d)
d = dev_get_msi_domain(&dev->bus->dev);
dev_set_msi_domain(&dev->dev, d);
}
/**
* pci_dma_configure - Setup DMA configuration
* @dev: ptr to pci_dev struct of the PCI device
*
* Function to update PCI devices's DMA configuration using the same
* info from the OF node or ACPI node of host bridge's parent (if any).
*/
static void pci_dma_configure(struct pci_dev *dev)
{
struct device *bridge = pci_get_host_bridge_device(dev);
if (IS_ENABLED(CONFIG_OF) &&
bridge->parent && bridge->parent->of_node) {
of_dma_configure(&dev->dev, bridge->parent->of_node);
} else if (has_acpi_companion(bridge)) {
struct acpi_device *adev = to_acpi_device_node(bridge->fwnode);
enum dev_dma_attr attr = acpi_get_dma_attr(adev);
if (attr == DEV_DMA_NOT_SUPPORTED)
dev_warn(&dev->dev, "DMA not supported.\n");
else
arch_setup_dma_ops(&dev->dev, 0, 0, NULL,
attr == DEV_DMA_COHERENT);
}
pci_put_host_bridge_device(bridge);
}
pci: do not mark exported functions as __devinit Functions marked __devinit will be removed after kernel init. But being exported they are potentially called by a module much later. So the safer choice seems to be to keep the function even in the non CONFIG_HOTPLUG case. This silence the follwoing section mismatch warnings: WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_device from __ksymtab_gpl between '__ksymtab_pci_bus_add_device' (at offset 0x20) and '__ksymtab_pci_walk_bus' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_create_bus from __ksymtab_gpl between '__ksymtab_pci_create_bus' (at offset 0x40) and '__ksymtab_pci_stop_bus_device' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_max_busnr from __ksymtab_gpl between '__ksymtab_pci_bus_max_busnr' (at offset 0xc0) and '__ksymtab_pci_assign_resource_fixed' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_claim_resource from __ksymtab_gpl between '__ksymtab_pci_claim_resource' (at offset 0xe0) and '__ksymtab_pcie_port_bus_type' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_devices from __ksymtab between '__ksymtab_pci_bus_add_devices' (at offset 0x70) and '__ksymtab_pci_bus_alloc_resource' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_scan_bus_parented from __ksymtab between '__ksymtab_pci_scan_bus_parented' (at offset 0x90) and '__ksymtab_pci_root_buses' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_assign_resources from __ksymtab between '__ksymtab_pci_bus_assign_resources' (at offset 0x4d0) and '__ksymtab_pci_bus_size_bridges' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_size_bridges from __ksymtab between '__ksymtab_pci_bus_size_bridges' (at offset 0x4e0) and '__ksymtab_pci_setup_cardbus' Signed-off-by: Sam Ravnborg <sam@ravnborg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-03-27 13:53:30 +08:00
void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
{
int ret;
pci_configure_device(dev);
device_initialize(&dev->dev);
dev->dev.release = pci_release_dev;
set_dev_node(&dev->dev, pcibus_to_node(bus));
dev->dev.dma_mask = &dev->dma_mask;
dev->dev.dma_parms = &dev->dma_parms;
dev->dev.coherent_dma_mask = 0xffffffffull;
pci_dma_configure(dev);
pci_set_dma_max_seg_size(dev, 65536);
pci_set_dma_seg_boundary(dev, 0xffffffff);
/* Fix up broken headers */
pci_fixup_device(pci_fixup_header, dev);
/* moved out from quirk header fixup code */
pci_reassigndev_resource_alignment(dev);
/* Clear the state_saved flag. */
dev->state_saved = false;
/* Initialize various capabilities */
pci_init_capabilities(dev);
PCI ACPI: Rework PCI handling of wake-up * Introduce function acpi_pm_device_sleep_wake() for enabling and disabling the system wake-up capability of devices that are power manageable by ACPI. * Introduce function acpi_bus_can_wakeup() allowing other (dependent) subsystems to check if ACPI is able to enable the system wake-up capability of given device. * Introduce callback .sleep_wake() in struct pci_platform_pm_ops and for the ACPI PCI 'driver' make it use acpi_pm_device_sleep_wake(). * Introduce callback .can_wakeup() in struct pci_platform_pm_ops and for the ACPI 'driver' make it use acpi_bus_can_wakeup(). * Move the PME# handlig code out of pci_enable_wake() and split it into two functions, pci_pme_capable() and pci_pme_active(), allowing the caller to check if given device is capable of generating PME# from given power state and to enable/disable the device's PME# functionality, respectively. * Modify pci_enable_wake() to use the new ACPI callbacks and the new PME#-related functions. * Drop the generic .platform_enable_wakeup() callback that is not used any more. * Introduce device_set_wakeup_capable() that will set the power.can_wakeup flag of given device. * Rework PCI device PM initialization so that, if given device is capable of generating wake-up events, either natively through the PME# mechanism, or with the help of the platform, its power.can_wakeup flag is set and its power.should_wakeup flag is unset as appropriate. * Make ACPI set the power.can_wakeup flag for devices found to be wake-up capable by it. * Make the ACPI wake-up code enable/disable GPEs for devices that have the wakeup.flags.prepared flag set (which means that their wake-up power has been enabled). Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2008-07-07 09:34:48 +08:00
/*
* Add the device to our list of discovered devices
* and the bus list for fixup functions, etc.
*/
down_write(&pci_bus_sem);
list_add_tail(&dev->bus_list, &bus->devices);
up_write(&pci_bus_sem);
ret = pcibios_add_device(dev);
WARN_ON(ret < 0);
/* Setup MSI irq domain */
pci_set_msi_domain(dev);
/* Notifier could use PCI capabilities */
dev->match_driver = false;
ret = device_add(&dev->dev);
WARN_ON(ret < 0);
}
struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
{
struct pci_dev *dev;
dev = pci_get_slot(bus, devfn);
if (dev) {
pci_dev_put(dev);
return dev;
}
dev = pci_scan_device(bus, devfn);
if (!dev)
return NULL;
pci_device_add(dev, bus);
return dev;
}
EXPORT_SYMBOL(pci_scan_single_device);
static unsigned next_fn(struct pci_bus *bus, struct pci_dev *dev, unsigned fn)
{
int pos;
u16 cap = 0;
unsigned next_fn;
if (pci_ari_enabled(bus)) {
if (!dev)
return 0;
pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI);
if (!pos)
return 0;
pci_read_config_word(dev, pos + PCI_ARI_CAP, &cap);
next_fn = PCI_ARI_CAP_NFN(cap);
if (next_fn <= fn)
return 0; /* protect against malformed list */
return next_fn;
}
/* dev may be NULL for non-contiguous multifunction devices */
if (!dev || dev->multifunction)
return (fn + 1) % 8;
return 0;
}
static int only_one_child(struct pci_bus *bus)
{
struct pci_dev *parent = bus->self;
if (!parent || !pci_is_pcie(parent))
return 0;
if (pci_pcie_type(parent) == PCI_EXP_TYPE_ROOT_PORT)
return 1;
/*
* PCIe downstream ports are bridges that normally lead to only a
* device 0, but if PCI_SCAN_ALL_PCIE_DEVS is set, scan all
* possible devices, not just device 0. See PCIe spec r3.0,
* sec 7.3.1.
*/
if (parent->has_secondary_link &&
!pci_has_flag(PCI_SCAN_ALL_PCIE_DEVS))
return 1;
return 0;
}
/**
* pci_scan_slot - scan a PCI slot on a bus for devices.
* @bus: PCI bus to scan
* @devfn: slot number to scan (must have zero function.)
*
* Scan a PCI slot on the specified PCI bus for devices, adding
* discovered devices to the @bus->devices list. New devices
* will not have is_added set.
*
* Returns the number of new devices found.
*/
pci: do not mark exported functions as __devinit Functions marked __devinit will be removed after kernel init. But being exported they are potentially called by a module much later. So the safer choice seems to be to keep the function even in the non CONFIG_HOTPLUG case. This silence the follwoing section mismatch warnings: WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_device from __ksymtab_gpl between '__ksymtab_pci_bus_add_device' (at offset 0x20) and '__ksymtab_pci_walk_bus' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_create_bus from __ksymtab_gpl between '__ksymtab_pci_create_bus' (at offset 0x40) and '__ksymtab_pci_stop_bus_device' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_max_busnr from __ksymtab_gpl between '__ksymtab_pci_bus_max_busnr' (at offset 0xc0) and '__ksymtab_pci_assign_resource_fixed' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_claim_resource from __ksymtab_gpl between '__ksymtab_pci_claim_resource' (at offset 0xe0) and '__ksymtab_pcie_port_bus_type' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_devices from __ksymtab between '__ksymtab_pci_bus_add_devices' (at offset 0x70) and '__ksymtab_pci_bus_alloc_resource' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_scan_bus_parented from __ksymtab between '__ksymtab_pci_scan_bus_parented' (at offset 0x90) and '__ksymtab_pci_root_buses' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_assign_resources from __ksymtab between '__ksymtab_pci_bus_assign_resources' (at offset 0x4d0) and '__ksymtab_pci_bus_size_bridges' WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_size_bridges from __ksymtab between '__ksymtab_pci_bus_size_bridges' (at offset 0x4e0) and '__ksymtab_pci_setup_cardbus' Signed-off-by: Sam Ravnborg <sam@ravnborg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2007-03-27 13:53:30 +08:00
int pci_scan_slot(struct pci_bus *bus, int devfn)
{
unsigned fn, nr = 0;
struct pci_dev *dev;
if (only_one_child(bus) && (devfn > 0))
return 0; /* Already scanned the entire slot */
dev = pci_scan_single_device(bus, devfn);
if (!dev)
return 0;
if (!dev->is_added)
nr++;
for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
dev = pci_scan_single_device(bus, devfn + fn);
if (dev) {
if (!dev->is_added)
nr++;
dev->multifunction = 1;
}
}
/* only one slot has pcie device */
if (bus->self && nr)
pcie_aspm_init_link_state(bus->self);
return nr;
}
EXPORT_SYMBOL(pci_scan_slot);
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
static int pcie_find_smpss(struct pci_dev *dev, void *data)
{
u8 *smpss = data;
if (!pci_is_pcie(dev))
return 0;
PCI: Don't restrict MPS for slots below Root Ports When booting with "pci=pcie_bus_safe", we previously limited the fabric MPS to 128 when we found: (1) A hotplug-capable Downstream Port ("dev->is_hotplug_bridge && pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT"), or (2) A hotplug-capable Root Port with a slot that was either empty or contained a multi-function device ("dev->is_hotplug_bridge && !list_is_singular(&dev->bus->devices)") Part (1) is valid, but part (2) is not. After a hot-add in the slot below a Root Port, we can reconfigure all MPS values in the fabric below the Root Port because the new device is the only thing below the Root Port and there are no active drivers. Therefore, there's no reason to limit the MPS for Root Ports, no matter what's in the slot. Test info: -+-[0000:40]-+-07.0-[0000:46]--+-00.0 Intel 82576 NIC \-00.1 Intel 82576 NIC 0000:40:07.0 Root Port bridge to [bus 46] (MPS supported=256) 0000:46:00.0 Endpoint (MPS supported=512) 0000:46:00.1 Endpoint (MPS supported=512) # echo 0 > /sys/bus/pci/slots/7/power # echo 1 > /sys/bus/pci/slots/7/power pcieport 0000:40:07.0: PCI-E Max Payload Size set to 256/ 256 (was 256) pci 0000:46:00.0: PCI-E Max Payload Size set to 256/ 512 (was 128) pci 0000:46:00.1: PCI-E Max Payload Size set to 256/ 512 (was 128) Before this change, we set MPS to 128 for the Root Port and both NICs because the slot contained a multi-function device and dev->is_hotplug_bridge && !list_is_singular(&dev->bus->devices) was true. After this change, we set it to 256. [bhelgaas: changelog, comments, split out upstream bridge check] Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Cc: Jon Mason <jdmason@kudzu.us>
2013-08-22 11:24:47 +08:00
/*
* We don't have a way to change MPS settings on devices that have
* drivers attached. A hot-added device might support only the minimum
* MPS setting (MPS=128). Therefore, if the fabric contains a bridge
* where devices may be hot-added, we limit the fabric MPS to 128 so
* hot-added devices will work correctly.
*
* However, if we hot-add a device to a slot directly below a Root
* Port, it's impossible for there to be other existing devices below
* the port. We don't limit the MPS in this case because we can
* reconfigure MPS on both the Root Port and the hot-added device,
* and there are no other devices involved.
*
* Note that this PCIE_BUS_SAFE path assumes no peer-to-peer DMA.
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
*/
PCI: Don't restrict MPS for slots below Root Ports When booting with "pci=pcie_bus_safe", we previously limited the fabric MPS to 128 when we found: (1) A hotplug-capable Downstream Port ("dev->is_hotplug_bridge && pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT"), or (2) A hotplug-capable Root Port with a slot that was either empty or contained a multi-function device ("dev->is_hotplug_bridge && !list_is_singular(&dev->bus->devices)") Part (1) is valid, but part (2) is not. After a hot-add in the slot below a Root Port, we can reconfigure all MPS values in the fabric below the Root Port because the new device is the only thing below the Root Port and there are no active drivers. Therefore, there's no reason to limit the MPS for Root Ports, no matter what's in the slot. Test info: -+-[0000:40]-+-07.0-[0000:46]--+-00.0 Intel 82576 NIC \-00.1 Intel 82576 NIC 0000:40:07.0 Root Port bridge to [bus 46] (MPS supported=256) 0000:46:00.0 Endpoint (MPS supported=512) 0000:46:00.1 Endpoint (MPS supported=512) # echo 0 > /sys/bus/pci/slots/7/power # echo 1 > /sys/bus/pci/slots/7/power pcieport 0000:40:07.0: PCI-E Max Payload Size set to 256/ 256 (was 256) pci 0000:46:00.0: PCI-E Max Payload Size set to 256/ 512 (was 128) pci 0000:46:00.1: PCI-E Max Payload Size set to 256/ 512 (was 128) Before this change, we set MPS to 128 for the Root Port and both NICs because the slot contained a multi-function device and dev->is_hotplug_bridge && !list_is_singular(&dev->bus->devices) was true. After this change, we set it to 256. [bhelgaas: changelog, comments, split out upstream bridge check] Signed-off-by: Yijing Wang <wangyijing@huawei.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Cc: Jon Mason <jdmason@kudzu.us>
2013-08-22 11:24:47 +08:00
if (dev->is_hotplug_bridge &&
pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT)
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
*smpss = 0;
if (*smpss > dev->pcie_mpss)
*smpss = dev->pcie_mpss;
return 0;
}
static void pcie_write_mps(struct pci_dev *dev, int mps)
{
int rc;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
mps = 128 << dev->pcie_mpss;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT &&
dev->bus->self)
/* For "Performance", the assumption is made that
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
* downstream communication will never be larger than
* the MRRS. So, the MPS only needs to be configured
* for the upstream communication. This being the case,
* walk from the top down and set the MPS of the child
* to that of the parent bus.
*
* Configure the device MPS with the smaller of the
* device MPSS or the bridge MPS (which is assumed to be
* properly configured at this point to the largest
* allowable MPS based on its parent bus).
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
*/
mps = min(mps, pcie_get_mps(dev->bus->self));
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
}
rc = pcie_set_mps(dev, mps);
if (rc)
dev_err(&dev->dev, "Failed attempting to set the MPS\n");
}
static void pcie_write_mrrs(struct pci_dev *dev)
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
{
int rc, mrrs;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
/* In the "safe" case, do not configure the MRRS. There appear to be
* issues with setting MRRS to 0 on a number of devices.
*/
if (pcie_bus_config != PCIE_BUS_PERFORMANCE)
return;
/* For Max performance, the MRRS must be set to the largest supported
* value. However, it cannot be configured larger than the MPS the
* device or the bus can support. This should already be properly
* configured by a prior call to pcie_write_mps.
*/
mrrs = pcie_get_mps(dev);
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
/* MRRS is a R/W register. Invalid values can be written, but a
* subsequent read will verify if the value is acceptable or not.
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
* If the MRRS value provided is not acceptable (e.g., too large),
* shrink the value until it is acceptable to the HW.
*/
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
rc = pcie_set_readrq(dev, mrrs);
if (!rc)
break;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
dev_warn(&dev->dev, "Failed attempting to set the MRRS\n");
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
mrrs /= 2;
}
if (mrrs < 128)
dev_err(&dev->dev, "MRRS was unable to be configured with a safe value. If problems are experienced, try running with pci=pcie_bus_safe\n");
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
}
static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
{
int mps, orig_mps;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
if (!pci_is_pcie(dev))
return 0;
if (pcie_bus_config == PCIE_BUS_TUNE_OFF ||
pcie_bus_config == PCIE_BUS_DEFAULT)
return 0;
mps = 128 << *(u8 *)data;
orig_mps = pcie_get_mps(dev);
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
pcie_write_mps(dev, mps);
pcie_write_mrrs(dev);
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
dev_info(&dev->dev, "Max Payload Size set to %4d/%4d (was %4d), Max Read Rq %4d\n",
pcie_get_mps(dev), 128 << dev->pcie_mpss,
orig_mps, pcie_get_readrq(dev));
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
return 0;
}
/* pcie_bus_configure_settings requires that pci_walk_bus work in a top-down,
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
* parents then children fashion. If this changes, then this code will not
* work as designed.
*/
void pcie_bus_configure_settings(struct pci_bus *bus)
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
{
u8 smpss = 0;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
if (!bus->self)
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
return;
if (!pci_is_pcie(bus->self))
return;
/* FIXME - Peer to peer DMA is possible, though the endpoint would need
* to be aware of the MPS of the destination. To work around this,
* simply force the MPS of the entire system to the smallest possible.
*/
if (pcie_bus_config == PCIE_BUS_PEER2PEER)
smpss = 0;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
if (pcie_bus_config == PCIE_BUS_SAFE) {
smpss = bus->self->pcie_mpss;
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
pcie_find_smpss(bus->self, &smpss);
pci_walk_bus(bus, pcie_find_smpss, &smpss);
}
pcie_bus_configure_set(bus->self, &smpss);
pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
}
EXPORT_SYMBOL_GPL(pcie_bus_configure_settings);
PCI: Set PCI-E Max Payload Size on fabric On a given PCI-E fabric, each device, bridge, and root port can have a different PCI-E maximum payload size. There is a sizable performance boost for having the largest possible maximum payload size on each PCI-E device. However, if improperly configured, fatal bus errors can occur. Thus, it is important to ensure that PCI-E payloads sends by a device are never larger than the MPS setting of all devices on the way to the destination. This can be achieved two ways: - A conservative approach is to use the smallest common denominator of the entire tree below a root complex for every device on that fabric. This means for example that having a 128 bytes MPS USB controller on one leg of a switch will dramatically reduce performances of a video card or 10GE adapter on another leg of that same switch. It also means that any hierarchy supporting hotplug slots (including expresscard or thunderbolt I suppose, dbl check that) will have to be entirely clamped to 128 bytes since we cannot predict what will be plugged into those slots, and we cannot change the MPS on a "live" system. - A more optimal way is possible, if it falls within a couple of constraints: * The top-level host bridge will never generate packets larger than the smallest TLP (or if it can be controlled independently from its MPS at least) * The device will never generate packets larger than MPS (which can be configured via MRRS) * No support of direct PCI-E <-> PCI-E transfers between devices without some additional code to specifically deal with that case Then we can use an approach that basically ignores downstream requests and focuses exclusively on upstream requests. In that case, all we need to care about is that a device MPS is no larger than its parent MPS, which allows us to keep all switches/bridges to the max MPS supported by their parent and eventually the PHB. In this case, your USB controller would no longer "starve" your 10GE Ethernet and your hotplug slots won't affect your global MPS. Additionally, the hotplugged devices themselves can be configured to a larger MPS up to the value configured in the hotplug bridge. To choose between the two available options, two PCI kernel boot args have been added to the PCI calls. "pcie_bus_safe" will provide the former behavior, while "pcie_bus_perf" will perform the latter behavior. By default, the latter behavior is used. NOTE: due to the location of the enablement, each arch will need to add calls to this function. This patch only enables x86. This patch includes a number of changes recommended by Benjamin Herrenschmidt. Tested-by: Jordan_Hargrave@dell.com Signed-off-by: Jon Mason <mason@myri.com> Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
2011-07-21 04:20:54 +08:00
unsigned int pci_scan_child_bus(struct pci_bus *bus)
{
unsigned int devfn, pass, max = bus->busn_res.start;
struct pci_dev *dev;
dev_dbg(&bus->dev, "scanning bus\n");
/* Go find them, Rover! */
for (devfn = 0; devfn < 0x100; devfn += 8)
pci_scan_slot(bus, devfn);
/* Reserve buses for SR-IOV capability. */
max += pci_iov_bus_range(bus);
/*
* After performing arch-dependent fixup of the bus, look behind
* all PCI-to-PCI bridges on this bus.
*/
if (!bus->is_added) {
dev_dbg(&bus->dev, "fixups for bus\n");
pcibios_fixup_bus(bus);
bus->is_added = 1;
}
for (pass = 0; pass < 2; pass++)
list_for_each_entry(dev, &bus->devices, bus_list) {
if (pci_is_bridge(dev))
max = pci_scan_bridge(bus, dev, max, pass);
}
/*
* Make sure a hotplug bridge has at least the minimum requested
* number of buses.
*/
if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) {
if (max - bus->busn_res.start < pci_hotplug_bus_size - 1)
max = bus->busn_res.start + pci_hotplug_bus_size - 1;
}
/*
* We've scanned the bus and so we know all about what's on
* the other side of any bridges that may be on this bus plus
* any devices.
*
* Return how far we've got finding sub-buses.
*/
dev_dbg(&bus->dev, "bus scan returning with max=%02x\n", max);
return max;
}
EXPORT_SYMBOL_GPL(pci_scan_child_bus);
ACPI / PCI: Set root bridge ACPI handle in advance The ACPI handles of PCI root bridges need to be known to acpi_bind_one(), so that it can create the appropriate "firmware_node" and "physical_node" files for them, but currently the way it gets to know those handles is not exactly straightforward (to put it lightly). This is how it works, roughly: 1. acpi_bus_scan() finds the handle of a PCI root bridge, creates a struct acpi_device object for it and passes that object to acpi_pci_root_add(). 2. acpi_pci_root_add() creates a struct acpi_pci_root object, populates its "device" field with its argument's address (device->handle is the ACPI handle found in step 1). 3. The struct acpi_pci_root object created in step 2 is passed to pci_acpi_scan_root() and used to get resources that are passed to pci_create_root_bus(). 4. pci_create_root_bus() creates a struct pci_host_bridge object and passes its "dev" member to device_register(). 5. platform_notify(), which for systems with ACPI is set to acpi_platform_notify(), is called. So far, so good. Now it starts to be "interesting". 6. acpi_find_bridge_device() is used to find the ACPI handle of the given device (which is the PCI root bridge) and executes acpi_pci_find_root_bridge(), among other things, for the given device object. 7. acpi_pci_find_root_bridge() uses the name (sic!) of the given device object to extract the segment and bus numbers of the PCI root bridge and passes them to acpi_get_pci_rootbridge_handle(). 8. acpi_get_pci_rootbridge_handle() browses the list of ACPI PCI root bridges and finds the one that matches the given segment and bus numbers. Its handle is then used to initialize the ACPI handle of the PCI root bridge's device object by acpi_bind_one(). However, this is *exactly* the ACPI handle we started with in step 1. Needless to say, this is quite embarassing, but it may be avoided thanks to commit f3fd0c8 (ACPI: Allow ACPI handles of devices to be initialized in advance), which makes it possible to initialize the ACPI handle of a device before passing it to device_register(). Accordingly, add a new __weak routine, pcibios_root_bridge_prepare(), defaulting to an empty implementation that can be replaced by the interested architecutres (x86 and ia64 at the moment) with functions that will set the root bridge's ACPI handle before its dev member is passed to device_register(). Make both x86 and ia64 provide such implementations of pcibios_root_bridge_prepare() and remove acpi_pci_find_root_bridge() and acpi_get_pci_rootbridge_handle() that aren't necessary any more. Included is a fix for breakage on systems with non-ACPI PCI host bridges from Bjorn Helgaas. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2013-01-10 05:33:37 +08:00
/**
* pcibios_root_bridge_prepare - Platform-specific host bridge setup.
* @bridge: Host bridge to set up.
*
* Default empty implementation. Replace with an architecture-specific setup
* routine, if necessary.
*/
int __weak pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
{
return 0;
}
void __weak pcibios_add_bus(struct pci_bus *bus)
{
}
void __weak pcibios_remove_bus(struct pci_bus *bus)
{
}
struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata, struct list_head *resources)
{
int error;
struct pci_host_bridge *bridge;
struct pci_bus *b, *b2;
struct resource_entry *window, *n;
struct resource *res;
resource_size_t offset;
char bus_addr[64];
char *fmt;
b = pci_alloc_bus(NULL);
if (!b)
return NULL;
b->sysdata = sysdata;
b->ops = ops;
b->number = b->busn_res.start = bus;
#ifdef CONFIG_PCI_DOMAINS_GENERIC
b->domain_nr = pci_bus_find_domain_nr(b, parent);
#endif
b2 = pci_find_bus(pci_domain_nr(b), bus);
if (b2) {
/* If we already got to this bus through a different bridge, ignore it */
dev_dbg(&b2->dev, "bus already known\n");
goto err_out;
}
bridge = pci_alloc_host_bridge(b);
if (!bridge)
goto err_out;
bridge->dev.parent = parent;
bridge->dev.release = pci_release_host_bridge_dev;
dev_set_name(&bridge->dev, "pci%04x:%02x", pci_domain_nr(b), bus);
ACPI / PCI: Set root bridge ACPI handle in advance The ACPI handles of PCI root bridges need to be known to acpi_bind_one(), so that it can create the appropriate "firmware_node" and "physical_node" files for them, but currently the way it gets to know those handles is not exactly straightforward (to put it lightly). This is how it works, roughly: 1. acpi_bus_scan() finds the handle of a PCI root bridge, creates a struct acpi_device object for it and passes that object to acpi_pci_root_add(). 2. acpi_pci_root_add() creates a struct acpi_pci_root object, populates its "device" field with its argument's address (device->handle is the ACPI handle found in step 1). 3. The struct acpi_pci_root object created in step 2 is passed to pci_acpi_scan_root() and used to get resources that are passed to pci_create_root_bus(). 4. pci_create_root_bus() creates a struct pci_host_bridge object and passes its "dev" member to device_register(). 5. platform_notify(), which for systems with ACPI is set to acpi_platform_notify(), is called. So far, so good. Now it starts to be "interesting". 6. acpi_find_bridge_device() is used to find the ACPI handle of the given device (which is the PCI root bridge) and executes acpi_pci_find_root_bridge(), among other things, for the given device object. 7. acpi_pci_find_root_bridge() uses the name (sic!) of the given device object to extract the segment and bus numbers of the PCI root bridge and passes them to acpi_get_pci_rootbridge_handle(). 8. acpi_get_pci_rootbridge_handle() browses the list of ACPI PCI root bridges and finds the one that matches the given segment and bus numbers. Its handle is then used to initialize the ACPI handle of the PCI root bridge's device object by acpi_bind_one(). However, this is *exactly* the ACPI handle we started with in step 1. Needless to say, this is quite embarassing, but it may be avoided thanks to commit f3fd0c8 (ACPI: Allow ACPI handles of devices to be initialized in advance), which makes it possible to initialize the ACPI handle of a device before passing it to device_register(). Accordingly, add a new __weak routine, pcibios_root_bridge_prepare(), defaulting to an empty implementation that can be replaced by the interested architecutres (x86 and ia64 at the moment) with functions that will set the root bridge's ACPI handle before its dev member is passed to device_register(). Make both x86 and ia64 provide such implementations of pcibios_root_bridge_prepare() and remove acpi_pci_find_root_bridge() and acpi_get_pci_rootbridge_handle() that aren't necessary any more. Included is a fix for breakage on systems with non-ACPI PCI host bridges from Bjorn Helgaas. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2013-01-10 05:33:37 +08:00
error = pcibios_root_bridge_prepare(bridge);
if (error) {
kfree(bridge);
goto err_out;
}
ACPI / PCI: Set root bridge ACPI handle in advance The ACPI handles of PCI root bridges need to be known to acpi_bind_one(), so that it can create the appropriate "firmware_node" and "physical_node" files for them, but currently the way it gets to know those handles is not exactly straightforward (to put it lightly). This is how it works, roughly: 1. acpi_bus_scan() finds the handle of a PCI root bridge, creates a struct acpi_device object for it and passes that object to acpi_pci_root_add(). 2. acpi_pci_root_add() creates a struct acpi_pci_root object, populates its "device" field with its argument's address (device->handle is the ACPI handle found in step 1). 3. The struct acpi_pci_root object created in step 2 is passed to pci_acpi_scan_root() and used to get resources that are passed to pci_create_root_bus(). 4. pci_create_root_bus() creates a struct pci_host_bridge object and passes its "dev" member to device_register(). 5. platform_notify(), which for systems with ACPI is set to acpi_platform_notify(), is called. So far, so good. Now it starts to be "interesting". 6. acpi_find_bridge_device() is used to find the ACPI handle of the given device (which is the PCI root bridge) and executes acpi_pci_find_root_bridge(), among other things, for the given device object. 7. acpi_pci_find_root_bridge() uses the name (sic!) of the given device object to extract the segment and bus numbers of the PCI root bridge and passes them to acpi_get_pci_rootbridge_handle(). 8. acpi_get_pci_rootbridge_handle() browses the list of ACPI PCI root bridges and finds the one that matches the given segment and bus numbers. Its handle is then used to initialize the ACPI handle of the PCI root bridge's device object by acpi_bind_one(). However, this is *exactly* the ACPI handle we started with in step 1. Needless to say, this is quite embarassing, but it may be avoided thanks to commit f3fd0c8 (ACPI: Allow ACPI handles of devices to be initialized in advance), which makes it possible to initialize the ACPI handle of a device before passing it to device_register(). Accordingly, add a new __weak routine, pcibios_root_bridge_prepare(), defaulting to an empty implementation that can be replaced by the interested architecutres (x86 and ia64 at the moment) with functions that will set the root bridge's ACPI handle before its dev member is passed to device_register(). Make both x86 and ia64 provide such implementations of pcibios_root_bridge_prepare() and remove acpi_pci_find_root_bridge() and acpi_get_pci_rootbridge_handle() that aren't necessary any more. Included is a fix for breakage on systems with non-ACPI PCI host bridges from Bjorn Helgaas. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
2013-01-10 05:33:37 +08:00
error = device_register(&bridge->dev);
if (error) {
put_device(&bridge->dev);
goto err_out;
}
b->bridge = get_device(&bridge->dev);
device_enable_async_suspend(b->bridge);
pci_set_bus_of_node(b);
pci_set_bus_msi_domain(b);
if (!parent)
set_dev_node(b->bridge, pcibus_to_node(b));
b->dev.class = &pcibus_class;
b->dev.parent = b->bridge;
dev_set_name(&b->dev, "%04x:%02x", pci_domain_nr(b), bus);
error = device_register(&b->dev);
if (error)
goto class_dev_reg_err;
pcibios_add_bus(b);
/* Create legacy_io and legacy_mem files for this bus */
pci_create_legacy_files(b);
if (parent)
dev_info(parent, "PCI host bridge to bus %s\n", dev_name(&b->dev));
else
printk(KERN_INFO "PCI host bridge to bus %s\n", dev_name(&b->dev));
/* Add initial resources to the bus */
resource_list_for_each_entry_safe(window, n, resources) {
list_move_tail(&window->node, &bridge->windows);
res = window->res;
offset = window->offset;
if (res->flags & IORESOURCE_BUS)
pci_bus_insert_busn_res(b, bus, res->end);
else
pci_bus_add_resource(b, res, 0);
if (offset) {
if (resource_type(res) == IORESOURCE_IO)
fmt = " (bus address [%#06llx-%#06llx])";
else
fmt = " (bus address [%#010llx-%#010llx])";
snprintf(bus_addr, sizeof(bus_addr), fmt,
(unsigned long long) (res->start - offset),
(unsigned long long) (res->end - offset));
} else
bus_addr[0] = '\0';
dev_info(&b->dev, "root bus resource %pR%s\n", res, bus_addr);
}
down_write(&pci_bus_sem);
list_add_tail(&b->node, &pci_root_buses);
up_write(&pci_bus_sem);
return b;
class_dev_reg_err:
put_device(&bridge->dev);
device_unregister(&bridge->dev);
err_out:
kfree(b);
return NULL;
}
EXPORT_SYMBOL_GPL(pci_create_root_bus);
int pci_bus_insert_busn_res(struct pci_bus *b, int bus, int bus_max)
{
struct resource *res = &b->busn_res;
struct resource *parent_res, *conflict;
res->start = bus;
res->end = bus_max;
res->flags = IORESOURCE_BUS;
if (!pci_is_root_bus(b))
parent_res = &b->parent->busn_res;
else {
parent_res = get_pci_domain_busn_res(pci_domain_nr(b));
res->flags |= IORESOURCE_PCI_FIXED;
}
conflict = request_resource_conflict(parent_res, res);
if (conflict)
dev_printk(KERN_DEBUG, &b->dev,
"busn_res: can not insert %pR under %s%pR (conflicts with %s %pR)\n",
res, pci_is_root_bus(b) ? "domain " : "",
parent_res, conflict->name, conflict);
return conflict == NULL;
}
int pci_bus_update_busn_res_end(struct pci_bus *b, int bus_max)
{
struct resource *res = &b->busn_res;
struct resource old_res = *res;
resource_size_t size;
int ret;
if (res->start > bus_max)
return -EINVAL;
size = bus_max - res->start + 1;
ret = adjust_resource(res, res->start, size);
dev_printk(KERN_DEBUG, &b->dev,
"busn_res: %pR end %s updated to %02x\n",
&old_res, ret ? "can not be" : "is", bus_max);
if (!ret && !res->parent)
pci_bus_insert_busn_res(b, res->start, res->end);
return ret;
}
void pci_bus_release_busn_res(struct pci_bus *b)
{
struct resource *res = &b->busn_res;
int ret;
if (!res->flags || !res->parent)
return;
ret = release_resource(res);
dev_printk(KERN_DEBUG, &b->dev,
"busn_res: %pR %s released\n",
res, ret ? "can not be" : "is");
}
struct pci_bus *pci_scan_root_bus_msi(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata,
struct list_head *resources, struct msi_controller *msi)
{
struct resource_entry *window;
bool found = false;
struct pci_bus *b;
int max;
resource_list_for_each_entry(window, resources)
if (window->res->flags & IORESOURCE_BUS) {
found = true;
break;
}
b = pci_create_root_bus(parent, bus, ops, sysdata, resources);
if (!b)
return NULL;
b->msi = msi;
if (!found) {
dev_info(&b->dev,
"No busn resource found for root bus, will use [bus %02x-ff]\n",
bus);
pci_bus_insert_busn_res(b, bus, 255);
}
max = pci_scan_child_bus(b);
if (!found)
pci_bus_update_busn_res_end(b, max);
return b;
}
struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata, struct list_head *resources)
{
return pci_scan_root_bus_msi(parent, bus, ops, sysdata, resources,
NULL);
}
EXPORT_SYMBOL(pci_scan_root_bus);
struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops,
void *sysdata)
{
LIST_HEAD(resources);
struct pci_bus *b;
pci_add_resource(&resources, &ioport_resource);
pci_add_resource(&resources, &iomem_resource);
pci_add_resource(&resources, &busn_resource);
b = pci_create_root_bus(NULL, bus, ops, sysdata, &resources);
if (b) {
pci_scan_child_bus(b);
} else {
pci_free_resource_list(&resources);
}
return b;
}
EXPORT_SYMBOL(pci_scan_bus);
/**
* pci_rescan_bus_bridge_resize - scan a PCI bus for devices.
* @bridge: PCI bridge for the bus to scan
*
* Scan a PCI bus and child buses for new devices, add them,
* and enable them, resizing bridge mmio/io resource if necessary
* and possible. The caller must ensure the child devices are already
* removed for resizing to occur.
*
* Returns the max number of subordinate bus discovered.
*/
unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge)
{
unsigned int max;
struct pci_bus *bus = bridge->subordinate;
max = pci_scan_child_bus(bus);
pci_assign_unassigned_bridge_resources(bridge);
pci_bus_add_devices(bus);
return max;
}
/**
* pci_rescan_bus - scan a PCI bus for devices.
* @bus: PCI bus to scan
*
* Scan a PCI bus and child buses for new devices, adds them,
* and enables them.
*
* Returns the max number of subordinate bus discovered.
*/
unsigned int pci_rescan_bus(struct pci_bus *bus)
{
unsigned int max;
max = pci_scan_child_bus(bus);
pci_assign_unassigned_bus_resources(bus);
pci_bus_add_devices(bus);
return max;
}
EXPORT_SYMBOL_GPL(pci_rescan_bus);
/*
* pci_rescan_bus(), pci_rescan_bus_bridge_resize() and PCI device removal
* routines should always be executed under this mutex.
*/
static DEFINE_MUTEX(pci_rescan_remove_lock);
void pci_lock_rescan_remove(void)
{
mutex_lock(&pci_rescan_remove_lock);
}
EXPORT_SYMBOL_GPL(pci_lock_rescan_remove);
void pci_unlock_rescan_remove(void)
{
mutex_unlock(&pci_rescan_remove_lock);
}
EXPORT_SYMBOL_GPL(pci_unlock_rescan_remove);
static int __init pci_sort_bf_cmp(const struct device *d_a,
const struct device *d_b)
PCI: optionally sort device lists breadth-first Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. -[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily break the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch <Matt_Domsch@dell.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
{
const struct pci_dev *a = to_pci_dev(d_a);
const struct pci_dev *b = to_pci_dev(d_b);
PCI: optionally sort device lists breadth-first Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. -[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily break the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch <Matt_Domsch@dell.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
if (pci_domain_nr(a->bus) < pci_domain_nr(b->bus)) return -1;
else if (pci_domain_nr(a->bus) > pci_domain_nr(b->bus)) return 1;
if (a->bus->number < b->bus->number) return -1;
else if (a->bus->number > b->bus->number) return 1;
if (a->devfn < b->devfn) return -1;
else if (a->devfn > b->devfn) return 1;
return 0;
}
void __init pci_sort_breadthfirst(void)
PCI: optionally sort device lists breadth-first Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. -[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily break the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch <Matt_Domsch@dell.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
{
bus_sort_breadthfirst(&pci_bus_type, &pci_sort_bf_cmp);
PCI: optionally sort device lists breadth-first Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. -[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily break the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch <Matt_Domsch@dell.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-30 04:23:23 +08:00
}