/*
 * Source view: linux/drivers/pci/setup-bus.c
 * 926 lines, 24 KiB, C
 */

/*
* drivers/pci/setup-bus.c
*
* Extruded from code written by
* Dave Rusling (david.rusling@reo.mts.dec.com)
* David Mosberger (davidm@cs.arizona.edu)
* David Miller (davem@redhat.com)
*
* Support routines for initializing a PCI subsystem.
*/
/*
* Nov 2000, Ivan Kokshaysky <ink@jurassic.park.msu.ru>
* PCI-PCI bridges cleanup, sorted resource allocation.
* Feb 2002, Ivan Kokshaysky <ink@jurassic.park.msu.ru>
* Converted to allocation in 3 passes, which gives
* tighter packing. Prefetchable range support.
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/cache.h>
#include <linux/slab.h>
/*
 * Change annotation (2009-08-29 04:00:06 +08:00):
 *
 * PCI SR-IOV: correct broken resource alignment calculations
 *
 * An SR-IOV capable device includes an SR-IOV PCIe capability which
 * describes the Virtual Function (VF) BAR requirements. A typical SR-IOV
 * device can support multiple VFs whose BARs must be in a contiguous
 * region, effectively an array of VF BARs. The BAR reports the size
 * requirement for a single VF. We calculate the full range needed by
 * simply multiplying the VF BAR size with the number of possible VFs and
 * create a resource spanning the full range.
 *
 * This all seems sane enough except it artificially inflates the
 * alignment requirement for the VF BAR. The VF BAR need only be aligned
 * to the size of a single BAR, not the contiguous range of VF BARs. This
 * can cause us to fail to allocate resources for the BAR despite the fact
 * that we actually have enough space.
 *
 * This patch adds a thin PCI specific layer over the generic
 * resource_alignment() function which is aware of the special nature of
 * VF BARs and does sorting and allocation based on the smaller alignment
 * requirement.
 *
 * I recognize that while resource_alignment is generic, it's basically a
 * PCI helper. An alternative to this patch is to add PCI VF BAR specific
 * information to struct resource. I opted for the extra layer rather than
 * adding such PCI specific information to struct resource. This does have
 * the slight downside that we don't cache the BAR size and re-read for
 * each alignment query (happens a small handful of times during boot for
 * each VF BAR).
 *
 * Signed-off-by: Chris Wright <chrisw@sous-sol.org>
 * Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
 * Cc: Linus Torvalds <torvalds@linux-foundation.org>
 * Cc: Matthew Wilcox <matthew@wil.cx>
 * Cc: Yu Zhao <yu.zhao@intel.com>
 * Cc: stable@kernel.org
 * Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
 */
#include "pci.h"
/*
 * Record of a device resource whose assignment failed, kept on a singly
 * linked list together with a snapshot of the resource's range and flags
 * taken at the time of the failure. The list head used in this file is a
 * dummy node: only its ->next member is consulted.
 */
struct resource_list_x {
struct resource_list_x *next;	/* next record, NULL terminates the list */
struct resource *res;	/* the resource that could not be assigned */
struct pci_dev *dev;	/* device the resource belongs to */
resource_size_t start;	/* copy of res->start when recorded */
resource_size_t end;	/* copy of res->end when recorded */
unsigned long flags;	/* copy of res->flags when recorded */
};
/*
 * Record a failed resource assignment.
 *
 * Allocates a new entry holding @dev, @res and a snapshot of @res's
 * start/end/flags, and links it directly after @head (i.e. at the front
 * of the list). If the allocation fails a warning is printed and the
 * failure is simply not recorded.
 */
static void add_to_failed_list(struct resource_list_x *head,
				 struct pci_dev *dev, struct resource *res)
{
	struct resource_list_x *entry;

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry) {
		pr_warning("add_to_failed_list: kmalloc() failed!\n");
		return;
	}

	/* Snapshot the resource so it can be restored for a later retry. */
	entry->res = res;
	entry->dev = dev;
	entry->start = res->start;
	entry->end = res->end;
	entry->flags = res->flags;

	/* Push onto the front of the list. */
	entry->next = head->next;
	head->next = entry;
}
/*
 * Free every entry hanging off @head and leave the list empty.
 * @head itself is not freed.
 */
static void free_failed_list(struct resource_list_x *head)
{
	struct resource_list_x *cur = head->next;

	while (cur) {
		struct resource_list_x *victim = cur;

		/* Advance before freeing so we never touch freed memory. */
		cur = cur->next;
		kfree(victim);
	}

	head->next = NULL;
}
static void __dev_sort_resources(struct pci_dev *dev,
struct resource_list *head)
{
u16 class = dev->class >> 8;
/* Don't touch classless devices or host bridges or ioapics. */
if (class == PCI_CLASS_NOT_DEFINED || class == PCI_CLASS_BRIDGE_HOST)
return;
/* Don't touch ioapic devices already enabled by firmware */
if (class == PCI_CLASS_SYSTEM_PIC) {
u16 command;
pci_read_config_word(dev, PCI_COMMAND, &command);
if (command & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY))
return;
}
pdev_sort_resources(dev, head);
}
static void __assign_resources_sorted(struct resource_list *head,
struct resource_list_x *fail_head)
{
struct resource *res;
struct resource_list *list, *tmp;
int idx;
for (list = head->next; list;) {
res = list->res;
idx = res - &list->dev->resource[0];
if (pci_assign_resource(list->dev, idx)) {
if (fail_head && !pci_is_root_bus(list->dev->bus)) {
/*
* if the failed res is for ROM BAR, and it will
* be enabled later, don't add it to the list
*/
if (!((idx == PCI_ROM_RESOURCE) &&
(!(res->flags & IORESOURCE_ROM_ENABLE))))
add_to_failed_list(fail_head, list->dev, res);
}
res->start = 0;
res->end = 0;
res->flags = 0;
}
tmp = list;
list = list->next;
kfree(tmp);
}
}
static void pdev_assign_resources_sorted(struct pci_dev *dev,
struct resource_list_x *fail_head)
{
struct resource_list head;
head.next = NULL;
__dev_sort_resources(dev, &head);
__assign_resources_sorted(&head, fail_head);
}
static void pbus_assign_resources_sorted(const struct pci_bus *bus,
struct resource_list_x *fail_head)
{
struct pci_dev *dev;
struct resource_list head;
head.next = NULL;
list_for_each_entry(dev, &bus->devices, bus_list)
__dev_sort_resources(dev, &head);
__assign_resources_sorted(&head, fail_head);
}
/*
 * Program the four windows of a CardBus bridge from bus->resource[0..3].
 *
 * Each resource is translated to a bus region first; the base/limit
 * register pair is only written when the resource carries the expected
 * type flag (IO for windows 0 and 1, MEM for windows 2 and 3).
 */
void pci_setup_cardbus(struct pci_bus *bus)
{
struct pci_dev *bridge = bus->self;
struct resource *res;
struct pci_bus_region region;
dev_info(&bridge->dev, "CardBus bridge to [bus %02x-%02x]\n",
bus->secondary, bus->subordinate);
/* I/O window 0 */
res = bus->resource[0];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_IO) {
/*
* The IO resource is allocated a range twice as large as it
* would normally need. This allows us to set both IO regs.
*/
dev_info(&bridge->dev, " bridge window %pR\n", res);
pci_write_config_dword(bridge, PCI_CB_IO_BASE_0,
region.start);
pci_write_config_dword(bridge, PCI_CB_IO_LIMIT_0,
region.end);
}
/* I/O window 1 */
res = bus->resource[1];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_IO) {
dev_info(&bridge->dev, " bridge window %pR\n", res);
pci_write_config_dword(bridge, PCI_CB_IO_BASE_1,
region.start);
pci_write_config_dword(bridge, PCI_CB_IO_LIMIT_1,
region.end);
}
/* Memory window 0 */
res = bus->resource[2];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_MEM) {
dev_info(&bridge->dev, " bridge window %pR\n", res);
pci_write_config_dword(bridge, PCI_CB_MEMORY_BASE_0,
region.start);
pci_write_config_dword(bridge, PCI_CB_MEMORY_LIMIT_0,
region.end);
}
/* Memory window 1 */
res = bus->resource[3];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_MEM) {
dev_info(&bridge->dev, " bridge window %pR\n", res);
pci_write_config_dword(bridge, PCI_CB_MEMORY_BASE_1,
region.start);
pci_write_config_dword(bridge, PCI_CB_MEMORY_LIMIT_1,
region.end);
}
}
EXPORT_SYMBOL(pci_setup_cardbus);
/* Initialize bridges with base/limit values we have collected.
PCI-to-PCI Bridge Architecture Specification rev. 1.1 (1998)
requires that if there are no I/O ports or memory behind the
bridge, the corresponding range must be turned off by writing a base
value greater than the limit to the bridge's base/limit registers.
Note: care must be taken when updating I/O base/limit registers
of bridges which support 32-bit I/O. This update requires two
config space writes, so it's quite possible that an I/O window of
the bridge will have some undesirable address (e.g. 0) after the
first write. Ditto 64-bit prefetchable MMIO. */
/*
 * Program the I/O window of the bridge leading to @bus from
 * bus->resource[0], or turn the window off when that resource is not
 * flagged IORESOURCE_IO.
 */
static void pci_setup_bridge_io(struct pci_bus *bus)
{
struct pci_dev *bridge = bus->self;
struct resource *res;
struct pci_bus_region region;
u32 l, io_upper16;
/* Set up the top and bottom of the PCI I/O segment for this bus. */
res = bus->resource[0];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_IO) {
/* Keep the upper half of the dword read at PCI_IO_BASE untouched. */
pci_read_config_dword(bridge, PCI_IO_BASE, &l);
l &= 0xffff0000;
l |= (region.start >> 8) & 0x00f0;
l |= region.end & 0xf000;
/* Set up upper 16 bits of I/O base/limit. */
io_upper16 = (region.end & 0xffff0000) | (region.start >> 16);
dev_info(&bridge->dev, " bridge window %pR\n", res);
} else {
/* Clear upper 16 bits of I/O base/limit. */
io_upper16 = 0;
/* Base (0xf0) greater than limit (0x00) turns the window off. */
l = 0x00f0;
dev_info(&bridge->dev, " bridge window [io disabled]\n");
}
/* Temporarily disable the I/O range before updating PCI_IO_BASE. */
pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff);
/* Update lower 16 bits of I/O base/limit. */
pci_write_config_dword(bridge, PCI_IO_BASE, l);
/* Update upper 16 bits of I/O base/limit. */
pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, io_upper16);
}
/*
 * Program the non-prefetchable memory window of the bridge leading to
 * @bus from bus->resource[1]. When that resource is not flagged
 * IORESOURCE_MEM, a base greater than the limit is written instead so
 * the window is turned off.
 */
static void pci_setup_bridge_mmio(struct pci_bus *bus)
{
	struct pci_dev *bridge = bus->self;
	struct resource *window = bus->resource[1];
	struct pci_bus_region region;
	u32 base_limit;

	/* Set up the top and bottom of the PCI Memory segment for this bus. */
	pcibios_resource_to_bus(bridge, &region, window);

	if (!(window->flags & IORESOURCE_MEM)) {
		base_limit = 0x0000fff0;
		dev_info(&bridge->dev, " bridge window [mem disabled]\n");
	} else {
		/* Base goes in the low half, limit in the high half. */
		base_limit = (region.start >> 16) & 0xfff0;
		base_limit |= region.end & 0xfff00000;
		dev_info(&bridge->dev, " bridge window %pR\n", window);
	}

	pci_write_config_dword(bridge, PCI_MEMORY_BASE, base_limit);
}
/*
 * Program the prefetchable memory window of the bridge leading to @bus
 * from bus->resource[2], or turn the window off when that resource is not
 * flagged IORESOURCE_PREFETCH. The upper 32 bits of base and limit are
 * only non-zero when the resource is flagged IORESOURCE_MEM_64.
 */
static void pci_setup_bridge_mmio_pref(struct pci_bus *bus)
{
struct pci_dev *bridge = bus->self;
struct resource *res;
struct pci_bus_region region;
u32 l, bu, lu;
/* Clear out the upper 32 bits of PREF limit.
If PCI_PREF_BASE_UPPER32 was non-zero, this temporarily
disables PREF range, which is ok. */
pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, 0);
/* Set up PREF base/limit. */
bu = lu = 0;
res = bus->resource[2];
pcibios_resource_to_bus(bridge, &region, res);
if (res->flags & IORESOURCE_PREFETCH) {
l = (region.start >> 16) & 0xfff0;
l |= region.end & 0xfff00000;
if (res->flags & IORESOURCE_MEM_64) {
bu = upper_32_bits(region.start);
lu = upper_32_bits(region.end);
}
dev_info(&bridge->dev, " bridge window %pR\n", res);
} else {
/* Base (0xfff0) greater than limit (0x0000) turns the window off. */
l = 0x0000fff0;
dev_info(&bridge->dev, " bridge window [mem pref disabled]\n");
}
pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l);
/* Set the upper 32 bits of PREF base & limit. */
pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, bu);
pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, lu);
}
/*
 * Program the windows of the bridge leading to @bus that are selected by
 * @type (any combination of IORESOURCE_IO, IORESOURCE_MEM and
 * IORESOURCE_PREFETCH), then write bus->bridge_ctl to the bridge control
 * register.
 */
static void __pci_setup_bridge(struct pci_bus *bus, unsigned long type)
{
	struct pci_dev *bridge = bus->self;

	dev_info(&bridge->dev, "PCI bridge to [bus %02x-%02x]\n",
		 bus->secondary, bus->subordinate);

	if (type & IORESOURCE_IO) {
		pci_setup_bridge_io(bus);
	}
	if (type & IORESOURCE_MEM) {
		pci_setup_bridge_mmio(bus);
	}
	if (type & IORESOURCE_PREFETCH) {
		pci_setup_bridge_mmio_pref(bus);
	}

	pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
}
/* Program all three windows (I/O, memory, prefetchable) of the bridge. */
static void pci_setup_bridge(struct pci_bus *bus)
{
	__pci_setup_bridge(bus, IORESOURCE_IO | IORESOURCE_MEM |
				IORESOURCE_PREFETCH);
}
/* Check whether the bridge supports optional I/O and
prefetchable memory ranges. If not, the respective
base/limit registers must be read-only and read as 0. */
/*
 * Change annotation (2007-03-27 13:53:30 +08:00):
 *
 * pci: do not mark exported functions as __devinit
 *
 * Functions marked __devinit will be removed after kernel init. But being
 * exported they are potentially called by a module much later. So the
 * safer choice seems to be to keep the function even in the non
 * CONFIG_HOTPLUG case.
 *
 * This silences the following section mismatch warnings:
 *
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_device from __ksymtab_gpl between '__ksymtab_pci_bus_add_device' (at offset 0x20) and '__ksymtab_pci_walk_bus'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_create_bus from __ksymtab_gpl between '__ksymtab_pci_create_bus' (at offset 0x40) and '__ksymtab_pci_stop_bus_device'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_max_busnr from __ksymtab_gpl between '__ksymtab_pci_bus_max_busnr' (at offset 0xc0) and '__ksymtab_pci_assign_resource_fixed'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_claim_resource from __ksymtab_gpl between '__ksymtab_pci_claim_resource' (at offset 0xe0) and '__ksymtab_pcie_port_bus_type'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_devices from __ksymtab between '__ksymtab_pci_bus_add_devices' (at offset 0x70) and '__ksymtab_pci_bus_alloc_resource'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_scan_bus_parented from __ksymtab between '__ksymtab_pci_scan_bus_parented' (at offset 0x90) and '__ksymtab_pci_root_buses'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_assign_resources from __ksymtab between '__ksymtab_pci_bus_assign_resources' (at offset 0x4d0) and '__ksymtab_pci_bus_size_bridges'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_size_bridges from __ksymtab between '__ksymtab_pci_bus_size_bridges' (at offset 0x4e0) and '__ksymtab_pci_setup_cardbus'
 *
 * Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
 * Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 * Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
 */
/*
 * Probe which windows the bridge leading to @bus implements and set the
 * type flags of its bridge resources accordingly:
 * b_res[0] = I/O, b_res[1] = memory, b_res[2] = prefetchable memory.
 * A register that reads as zero is test-written and read back to find
 * out whether it is really implemented.
 */
static void pci_bridge_check_ranges(struct pci_bus *bus)
{
u16 io;
u32 pmem;
struct pci_dev *bridge = bus->self;
struct resource *b_res;
b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
/* The non-prefetchable memory window is flagged unconditionally. */
b_res[1].flags |= IORESOURCE_MEM;
pci_read_config_word(bridge, PCI_IO_BASE, &io);
if (!io) {
/* Reads as 0: write a test pattern, read it back, then restore 0. */
pci_write_config_word(bridge, PCI_IO_BASE, 0xf0f0);
pci_read_config_word(bridge, PCI_IO_BASE, &io);
pci_write_config_word(bridge, PCI_IO_BASE, 0x0);
}
if (io)
b_res[0].flags |= IORESOURCE_IO;
/* DECchip 21050 pass 2 errata: the bridge may miss an address
disconnect boundary by one PCI data phase.
Workaround: do not use prefetching on this device. */
if (bridge->vendor == PCI_VENDOR_ID_DEC && bridge->device == 0x0001)
return;
pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
if (!pmem) {
/* Same write/read-back/restore probe for the prefetchable window. */
pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE,
0xfff0fff0);
pci_read_config_dword(bridge, PCI_PREF_MEMORY_BASE, &pmem);
pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, 0x0);
}
if (pmem) {
b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH;
if ((pmem & PCI_PREF_RANGE_TYPE_MASK) ==
PCI_PREF_RANGE_TYPE_64) {
b_res[2].flags |= IORESOURCE_MEM_64;
/*
 * NOTE(review): this ORs a config-register field value into the
 * resource flags word; confirm this is intended.
 */
b_res[2].flags |= PCI_PREF_RANGE_TYPE_64;
}
}
/* double check if bridge does support 64 bit pref */
if (b_res[2].flags & IORESOURCE_MEM_64) {
u32 mem_base_hi, tmp;
/* Save the register, write all ones, read back, then restore. */
pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32,
&mem_base_hi);
pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
0xffffffff);
pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32, &tmp);
/* Nothing stuck: drop the 64-bit flag again. */
if (!tmp)
b_res[2].flags &= ~IORESOURCE_MEM_64;
pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32,
mem_base_hi);
}
}
/* Helper function for sizing routines: find first available
bus resource of a given type. Note: we intentionally skip
the bus resources which have already been assigned (that is,
have non-NULL parent resource). */
/*
 * Change annotation (2007-03-27 13:53:30 +08:00):
 *
 * pci: do not mark exported functions as __devinit
 *
 * Functions marked __devinit will be removed after kernel init. But being
 * exported they are potentially called by a module much later. So the
 * safer choice seems to be to keep the function even in the non
 * CONFIG_HOTPLUG case.
 *
 * This silences the following section mismatch warnings:
 *
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_device from __ksymtab_gpl between '__ksymtab_pci_bus_add_device' (at offset 0x20) and '__ksymtab_pci_walk_bus'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_create_bus from __ksymtab_gpl between '__ksymtab_pci_create_bus' (at offset 0x40) and '__ksymtab_pci_stop_bus_device'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_max_busnr from __ksymtab_gpl between '__ksymtab_pci_bus_max_busnr' (at offset 0xc0) and '__ksymtab_pci_assign_resource_fixed'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_claim_resource from __ksymtab_gpl between '__ksymtab_pci_claim_resource' (at offset 0xe0) and '__ksymtab_pcie_port_bus_type'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_add_devices from __ksymtab between '__ksymtab_pci_bus_add_devices' (at offset 0x70) and '__ksymtab_pci_bus_alloc_resource'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_scan_bus_parented from __ksymtab between '__ksymtab_pci_scan_bus_parented' (at offset 0x90) and '__ksymtab_pci_root_buses'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_assign_resources from __ksymtab between '__ksymtab_pci_bus_assign_resources' (at offset 0x4d0) and '__ksymtab_pci_bus_size_bridges'
 * WARNING: drivers/built-in.o - Section mismatch: reference to .init.text:pci_bus_size_bridges from __ksymtab between '__ksymtab_pci_bus_size_bridges' (at offset 0x4e0) and '__ksymtab_pci_setup_cardbus'
 *
 * Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
 * Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 * Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
 */
/*
 * Return the first resource of @bus whose IO/MEM/PREFETCH flag bits equal
 * @type exactly and which has no parent yet, or NULL if there is none.
 * The global ioport_resource and iomem_resource are never returned.
 */
static struct resource *find_free_bus_resource(struct pci_bus *bus, unsigned long type)
{
	const unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
					IORESOURCE_PREFETCH;
	struct resource *candidate;
	int idx;

	pci_bus_for_each_resource(bus, candidate, idx) {
		if (!candidate)
			continue;
		if (candidate == &ioport_resource ||
		    candidate == &iomem_resource)
			continue;
		if (candidate->parent)
			continue;	/* already assigned */
		if ((candidate->flags & type_mask) == type)
			return candidate;
	}

	return NULL;
}
/* Sizing the IO windows of the PCI-PCI bridge is trivial,
   since these windows have 4K granularity and the IO ranges
   of non-bridge PCI devices are limited to 256 bytes.
   We must be careful with the ISA aliasing though. */
/*
 * pbus_size_io - compute the size of the I/O window needed behind @bus
 * @bus:      bus whose child devices are to be accounted for
 * @min_size: lower bound for the accumulated size of the small
 *            (< 0x400 byte) I/O resources
 *
 * Sums the still unassigned I/O resources of all devices on @bus and
 * stores the result in the bus's free I/O bridge resource: ->start holds
 * the required alignment (4K) and ->end is set so that the resource spans
 * the computed size. Does nothing if the bus has no free I/O resource.
 * If the computed size is zero the window is disabled by clearing its
 * flags.
 */
static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size)
{
	struct pci_dev *dev;
	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
	unsigned long size = 0, size1 = 0, old_size;

	if (!b_res)
		return;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		int i;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			struct resource *r = &dev->resource[i];
			unsigned long r_size;

			/* Skip assigned resources and anything that is not I/O. */
			if (r->parent || !(r->flags & IORESOURCE_IO))
				continue;
			r_size = resource_size(r);

			if (r_size < 0x400)
				/* Might be re-aligned for ISA */
				size += r_size;
			else
				size1 += r_size;
		}
	}
	if (size < min_size)
		size = min_size;

	/* A resource with start == end has size 1; treat that as empty. */
	old_size = resource_size(b_res);
	if (old_size == 1)
		old_size = 0;

	/* To be fixed in 2.5: we should have sort of HAVE_ISA
	   flag in the struct pci_bus. */
#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
	size = (size & 0xff) + ((size & ~0xffUL) << 2);
#endif
	size = ALIGN(size + size1, 4096);

	/* Never shrink below the previous window size. */
	if (size < old_size)
		size = old_size;
	if (!size) {
		if (b_res->start || b_res->end)
			dev_info(&bus->self->dev, "disabling bridge window "
				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
				 bus->secondary, bus->subordinate);
		b_res->flags = 0;
		return;
	}

	/* Alignment of the IO window is always 4K */
	b_res->start = 4096;
	b_res->end = b_res->start + size - 1;
	/*
	 * IORESOURCE_STARTALIGN: during probing the start field of a bus
	 * resource carries its alignment rather than an address.
	 */
	b_res->flags |= IORESOURCE_STARTALIGN;
}
/* Calculate the size of the bus and minimal alignment which
   guarantees that all child resources fit in this size. */
/*
 * pbus_size_mem - size a memory window of the bridge leading to @bus
 * @bus:      bus whose child devices are to be accounted for
 * @mask:     flag bits of a child resource that are compared against @type
 * @type:     flag value selecting both the child resources and the free
 *            bridge resource to be sized
 * @min_size: lower bound for the window size
 *
 * Returns 0 if the bus has no free bridge resource of @type, 1 otherwise.
 * On success ->start of the bridge resource holds the computed alignment
 * and ->end is set so that the resource spans the computed size; a
 * zero-sized window is disabled by clearing its flags. IORESOURCE_MEM_64
 * is kept on the bridge resource only if it was set before and every
 * accounted child resource has it too.
 */
static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
			 unsigned long type, resource_size_t min_size)
{
	struct pci_dev *dev;
	resource_size_t min_align, align, size, old_size;
	resource_size_t aligns[12];	/* Alignments from 1Mb to 2Gb */
	int order, max_order;
	struct resource *b_res = find_free_bus_resource(bus, type);
	unsigned int mem64_mask = 0;

	if (!b_res)
		return 0;

	memset(aligns, 0, sizeof(aligns));
	max_order = 0;
	size = 0;

	mem64_mask = b_res->flags & IORESOURCE_MEM_64;
	b_res->flags &= ~IORESOURCE_MEM_64;

	list_for_each_entry(dev, &bus->devices, bus_list) {
		int i;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			struct resource *r = &dev->resource[i];
			resource_size_t r_size;

			if (r->parent || (r->flags & mask) != type)
				continue;
			r_size = resource_size(r);
			/*
			 * For bridges size != alignment.
			 * pci_resource_alignment() is used rather than the
			 * generic resource_alignment() because an SR-IOV VF
			 * BAR only needs to be aligned to the size of a single
			 * VF BAR, not to the whole contiguous VF BAR range.
			 */
			align = pci_resource_alignment(dev, r);
			/* aligns[] is indexed by log2(alignment) - 20. */
			order = __ffs(align) - 20;
			if (order > 11) {
				dev_warn(&dev->dev, "disabling BAR %d: %pR "
					 "(bad alignment %#llx)\n", i, r,
					 (unsigned long long) align);
				r->flags = 0;
				continue;
			}
			size += r_size;
			/* Alignments below 1Mb share the first slot. */
			if (order < 0)
				order = 0;
			/* Exclude ranges with size > align from
			   calculation of the alignment. */
			if (r_size == align)
				aligns[order] += align;
			if (order > max_order)
				max_order = order;
			mem64_mask &= r->flags & IORESOURCE_MEM_64;
		}
	}
	if (size < min_size)
		size = min_size;

	/* A resource with start == end has size 1; treat that as empty. */
	old_size = resource_size(b_res);
	if (old_size == 1)
		old_size = 0;
	if (size < old_size)
		size = old_size;

	align = 0;
	min_align = 0;
	for (order = 0; order <= max_order; order++) {
		resource_size_t align1 = 1;

		align1 <<= (order + 20);

		if (!align)
			min_align = align1;
		else if (ALIGN(align + min_align, min_align) < align1)
			min_align = align1 >> 1;
		align += aligns[order];
	}
	size = ALIGN(size, min_align);
	if (!size) {
		if (b_res->start || b_res->end)
			dev_info(&bus->self->dev, "disabling bridge window "
				 "%pR to [bus %02x-%02x] (unused)\n", b_res,
				 bus->secondary, bus->subordinate);
		b_res->flags = 0;
		return 1;
	}
	b_res->start = min_align;
	b_res->end = size + min_align - 1;
	/*
	 * IORESOURCE_STARTALIGN: during probing the start field of a bus
	 * resource carries its alignment rather than an address.
	 */
	b_res->flags |= IORESOURCE_STARTALIGN;
	b_res->flags |= mem64_mask;
	return 1;
}
/*
 * Set up fixed-size window requests for the CardBus bridge leading to
 * @bus: two I/O windows of pci_cardbus_io_size each, plus either a
 * prefetchable and a non-prefetchable memory window of
 * pci_cardbus_mem_size each, or one non-prefetchable window of twice that
 * size. All windows are marked IORESOURCE_SIZEALIGN.
 */
static void pci_bus_size_cardbus(struct pci_bus *bus)
{
struct pci_dev *bridge = bus->self;
struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
u16 ctrl;
/*
* Reserve some resources for CardBus. We reserve
* a fixed amount of bus space for CardBus bridges.
*/
b_res[0].start = 0;
b_res[0].end = pci_cardbus_io_size - 1;
b_res[0].flags |= IORESOURCE_IO | IORESOURCE_SIZEALIGN;
b_res[1].start = 0;
b_res[1].end = pci_cardbus_io_size - 1;
b_res[1].flags |= IORESOURCE_IO | IORESOURCE_SIZEALIGN;
/*
* Check whether prefetchable memory is supported
* by this bridge.
*/
pci_read_config_word(bridge, PCI_CB_BRIDGE_CONTROL, &ctrl);
if (!(ctrl & PCI_CB_BRIDGE_CTL_PREFETCH_MEM0)) {
/* Try to set the bit and read it back to see whether it sticks. */
ctrl |= PCI_CB_BRIDGE_CTL_PREFETCH_MEM0;
pci_write_config_word(bridge, PCI_CB_BRIDGE_CONTROL, ctrl);
pci_read_config_word(bridge, PCI_CB_BRIDGE_CONTROL, &ctrl);
}
/*
* If we have prefetchable memory support, allocate
* two regions. Otherwise, allocate one region of
* twice the size.
*/
if (ctrl & PCI_CB_BRIDGE_CTL_PREFETCH_MEM0) {
b_res[2].start = 0;
b_res[2].end = pci_cardbus_mem_size - 1;
b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_SIZEALIGN;
b_res[3].start = 0;
b_res[3].end = pci_cardbus_mem_size - 1;
b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_SIZEALIGN;
} else {
b_res[3].start = 0;
b_res[3].end = pci_cardbus_mem_size * 2 - 1;
b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_SIZEALIGN;
}
}
/*
 * Compute the window sizes needed by the bridge leading to @bus.
 *
 * Subordinate buses are sized first (recursively, or with
 * pci_bus_size_cardbus() for CardBus bridges), then the I/O and memory
 * windows of @bus itself are sized. Nothing is sized for a root bus
 * (bus->self == NULL) or for a bus behind a CardBus bridge.
 */
void __ref pci_bus_size_bridges(struct pci_bus *bus)
{
struct pci_dev *dev;
unsigned long mask, prefmask;
resource_size_t min_mem_size = 0, min_io_size = 0;
list_for_each_entry(dev, &bus->devices, bus_list) {
struct pci_bus *b = dev->subordinate;
if (!b)
continue;
switch (dev->class >> 8) {
case PCI_CLASS_BRIDGE_CARDBUS:
pci_bus_size_cardbus(b);
break;
case PCI_CLASS_BRIDGE_PCI:
default:
pci_bus_size_bridges(b);
break;
}
}
/* The root bus? */
if (!bus->self)
return;
switch (bus->self->class >> 8) {
case PCI_CLASS_BRIDGE_CARDBUS:
/* don't size cardbuses yet. */
break;
case PCI_CLASS_BRIDGE_PCI:
pci_bridge_check_ranges(bus);
/* Hotplug bridges get a minimum window size reserved. */
if (bus->self->is_hotplug_bridge) {
min_io_size = pci_hotplug_io_size;
min_mem_size = pci_hotplug_mem_size;
}
/* fall through */
default:
pbus_size_io(bus, min_io_size);
/* If the bridge supports prefetchable range, size it
separately. If it doesn't, or its prefetchable window
has already been allocated by arch code, try
non-prefetchable range for both types of PCI memory
resources. */
mask = IORESOURCE_MEM;
prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
if (pbus_size_mem(bus, prefmask, prefmask, min_mem_size))
mask = prefmask; /* Success, size non-prefetch only. */
else
/* One window must serve both kinds: double the minimum. */
min_mem_size += min_mem_size;
pbus_size_mem(bus, mask, IORESOURCE_MEM, min_mem_size);
break;
}
}
EXPORT_SYMBOL(pci_bus_size_bridges);
/*
 * Assign the resources of all devices on @bus, then descend into every
 * subordinate bus and program the bridge that leads to it. Failed
 * assignments are recorded on @fail_head when it is not NULL.
 */
static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
					 struct resource_list_x *fail_head)
{
	struct pci_dev *pdev;

	pbus_assign_resources_sorted(bus, fail_head);

	list_for_each_entry(pdev, &bus->devices, bus_list) {
		struct pci_bus *child = pdev->subordinate;
		unsigned int class;

		if (!child)
			continue;

		/* Children first, then the bridge windows that contain them. */
		__pci_bus_assign_resources(child, fail_head);

		class = pdev->class >> 8;
		if (class == PCI_CLASS_BRIDGE_PCI) {
			/* An already enabled bridge is left as it is. */
			if (!pci_is_enabled(pdev))
				pci_setup_bridge(child);
		} else if (class == PCI_CLASS_BRIDGE_CARDBUS) {
			pci_setup_cardbus(child);
		} else {
			dev_info(&pdev->dev, "not setting up bridge for bus "
				 "%04x:%02x\n", pci_domain_nr(child),
				 child->number);
		}
	}
}
/*
 * Exported entry point: assign resources on @bus and all buses below it.
 * No failed-resource list is kept (fail_head is NULL).
 */
void __ref pci_bus_assign_resources(const struct pci_bus *bus)
{
__pci_bus_assign_resources(bus, NULL);
}
EXPORT_SYMBOL(pci_bus_assign_resources);
/*
 * Assign the resources of @bridge itself, then those of everything on
 * its subordinate bus, and finally program the bridge windows. Failed
 * assignments are recorded on @fail_head when it is not NULL.
 */
static void __ref __pci_bridge_assign_resources(const struct pci_dev *bridge,
					 struct resource_list_x *fail_head)
{
	struct pci_bus *child;
	unsigned int class = bridge->class >> 8;

	pdev_assign_resources_sorted((struct pci_dev *)bridge, fail_head);

	child = bridge->subordinate;
	if (!child)
		return;

	__pci_bus_assign_resources(child, fail_head);

	if (class == PCI_CLASS_BRIDGE_PCI) {
		pci_setup_bridge(child);
	} else if (class == PCI_CLASS_BRIDGE_CARDBUS) {
		pci_setup_cardbus(child);
	} else {
		dev_info(&bridge->dev, "not setting up bridge for bus "
			 "%04x:%02x\n", pci_domain_nr(child), child->number);
	}
}
/*
 * Release the assigned bridge window(s) of the bridge leading to @bus
 * whose IO/MEM/PREFETCH flag bits equal @type, together with all child
 * resources below them. A released window keeps its size but is rebased
 * to start at 0 and has its flags cleared. If anything was released the
 * bridge registers are reprogrammed.
 */
static void pci_bridge_release_resources(struct pci_bus *bus,
unsigned long type)
{
int idx;
bool changed = false;
struct pci_dev *dev;
struct resource *r;
unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
IORESOURCE_PREFETCH;
dev = bus->self;
for (idx = PCI_BRIDGE_RESOURCES; idx <= PCI_BRIDGE_RESOURCE_END;
idx++) {
r = &dev->resource[idx];
if ((r->flags & type_mask) != type)
continue;
/* Not assigned: nothing to release. */
if (!r->parent)
continue;
/*
* if there are children under that, we should release them
* all
*/
release_child_resources(r);
if (!release_resource(r)) {
dev_printk(KERN_DEBUG, &dev->dev,
"resource %d %pR released\n", idx, r);
/* keep the old size */
r->end = resource_size(r) - 1;
r->start = 0;
r->flags = 0;
changed = true;
}
}
if (changed) {
/* avoid touching the one without PREF */
if (type & IORESOURCE_PREFETCH)
type = IORESOURCE_PREFETCH;
__pci_setup_bridge(bus, type);
}
}
/* Selects how far pci_bus_release_bridge_resources() goes. */
enum release_type {
leaf_only,	/* release only if the bus has no subordinate buses */
whole_subtree,	/* recurse into PCI-PCI bridges below and release all */
};
/*
 * Try to release the bridge resources of @type belonging to the bridge
 * that leads to @bus, so that a bigger window can be allocated later.
 *
 * With @rel_type == whole_subtree the PCI-PCI bridges below @bus are
 * processed first (recursively) and @bus's own bridge is always released.
 * With leaf_only the bridge is released only when no device on @bus has
 * a subordinate bus. Root buses and buses behind anything other than a
 * PCI-PCI bridge are never released.
 */
static void __ref pci_bus_release_bridge_resources(struct pci_bus *bus,
						   unsigned long type,
						   enum release_type rel_type)
{
	struct pci_dev *pdev;
	bool has_child_bus = false;

	list_for_each_entry(pdev, &bus->devices, bus_list) {
		struct pci_bus *child = pdev->subordinate;

		if (!child)
			continue;

		has_child_bus = true;

		if (rel_type == whole_subtree &&
		    (pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI)
			pci_bus_release_bridge_resources(child, type,
							 whole_subtree);
	}

	if (pci_is_root_bus(bus))
		return;

	if ((bus->self->class >> 8) != PCI_CLASS_BRIDGE_PCI)
		return;

	if (rel_type == whole_subtree || !has_child_bus)
		pci_bridge_release_resources(bus, type);
}
/*
 * Print (at KERN_DEBUG level) every resource of @bus that is present and
 * has a non-zero end and non-zero flags.
 */
static void pci_bus_dump_res(struct pci_bus *bus)
{
	struct resource *r;
	int idx;

	pci_bus_for_each_resource(bus, r, idx) {
		if (r && r->end && r->flags)
			dev_printk(KERN_DEBUG, &bus->dev,
				   "resource %d %pR\n", idx, r);
	}
}
/* Dump the resources of @bus and, recursively, of every bus below it. */
static void pci_bus_dump_resources(struct pci_bus *bus)
{
	struct pci_dev *pdev;

	pci_bus_dump_res(bus);

	list_for_each_entry(pdev, &bus->devices, bus_list) {
		struct pci_bus *child = pdev->subordinate;

		if (child)
			pci_bus_dump_resources(child);
	}
}
/*
 * Boot-time pass over all root buses: size every bridge window, assign
 * resources and enable the bridges, then dump the result.
 */
void __init pci_assign_unassigned_resources(void)
{
	struct pci_bus *root;

	/*
	 * Depth first, calculate sizes and alignments of all
	 * subordinate buses.
	 */
	list_for_each_entry(root, &pci_root_buses, node)
		pci_bus_size_bridges(root);

	/* Depth last, allocate resources and update the hardware. */
	list_for_each_entry(root, &pci_root_buses, node) {
		pci_bus_assign_resources(root);
		pci_enable_bridges(root);
	}

	/* dump the resource on buses */
	list_for_each_entry(root, &pci_root_buses, node)
		pci_bus_dump_resources(root);
}
void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge)
{
struct pci_bus *parent = bridge->subordinate;
int tried_times = 0;
struct resource_list_x head, *list;
int retval;
unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
IORESOURCE_PREFETCH;
head.next = NULL;
again:
pci_bus_size_bridges(parent);
__pci_bridge_assign_resources(bridge, &head);
tried_times++;
if (!head.next)
goto enable_all;
if (tried_times >= 2) {
/* still fail, don't need to try more */
free_failed_list(&head);
goto enable_all;
}
printk(KERN_DEBUG "PCI: No. %d try to assign unassigned res\n",
tried_times + 1);
/*
* Try to release leaf bridge's resources that doesn't fit resource of
* child device under that bridge
*/
for (list = head.next; list;) {
struct pci_bus *bus = list->dev->bus;
unsigned long flags = list->flags;
pci_bus_release_bridge_resources(bus, flags & type_mask,
whole_subtree);
list = list->next;
}
/* restore size and flags */
for (list = head.next; list;) {
struct resource *res = list->res;
res->start = list->start;
res->end = list->end;
res->flags = list->flags;
if (list->dev->subordinate)
res->flags = 0;
list = list->next;
}
free_failed_list(&head);
goto again;
enable_all:
retval = pci_reenable_device(bridge);
pci_set_master(bridge);
pci_enable_bridges(parent);
}
EXPORT_SYMBOL_GPL(pci_assign_unassigned_bridge_resources);