From d782f33df706f1b8a4496b41fd7d339c6e23aa59 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 8 Jun 2006 17:59:31 +0100 Subject: [PATCH 01/45] [ARM] Fix Neponset IRQ handling While testing the genirq code on ARM, a condition was found whereby the Neponset IRQ handler was being re-entered, causing the system to deadlock. Under the ARM IRQ code, this would not have been a visible problem because the "simple" IRQ handling had no re-entrancy protection. Resolve this by acknowledging the parent interrupt after we mask it when we are going to handle one of our "special" level-based sources (from ethernet or USAR chip.) Signed-off-by: Russell King --- arch/arm/mach-sa1100/neponset.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c index 9e02bc3712a0..af6d2775cf82 100644 --- a/arch/arm/mach-sa1100/neponset.c +++ b/arch/arm/mach-sa1100/neponset.c @@ -59,6 +59,14 @@ neponset_irq_handler(unsigned int irq, struct irqdesc *desc, struct pt_regs *reg if (irr & (IRR_ETHERNET | IRR_USAR)) { desc->chip->mask(irq); + /* + * Ack the interrupt now to prevent re-entering + * this neponset handler. Again, this is safe + * since we'll check the IRR register prior to + * leaving. + */ + desc->chip->ack(irq); + if (irr & IRR_ETHERNET) { d = irq_desc + IRQ_NEPONSET_SMC9196; desc_handle_irq(IRQ_NEPONSET_SMC9196, d, regs); From 0c27c5d5b93339df4def7ced77ea5be26df4d84b Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Thu, 8 Jun 2006 22:44:07 +0100 Subject: [PATCH 02/45] [ARM] 3547/1: PXA-OHCI: Allow platforms to specify a power budget Patch from Richard Purdie Add a power budget variable to the PXA OHCI platform data and add a default value for the spitz platform(s) which prevents known failures with certain USB devices. 
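For boards other than spitz the same hook applies: the platform data gains an optional power_budget field, and ohci-pxa27x only propagates it into hcd->power_budget when it is non-zero, so existing boards are unaffected. A condensed sketch of how another machine file might use it (the board name, include path and the 500 figure are illustrative assumptions, not taken from the tree):

        #include <asm/arch/ohci.h>      /* assumed include path for the pxaohci header patched below */

        static struct pxaohci_platform_data myboard_ohci_platform_data = {
                .port_mode      = PMM_PERPORT_MODE,
                .power_budget   = 500,  /* assumed to be in mA, the same units as the spitz value of 150 */
        };

        static void __init myboard_init_machine(void)
        {
                /* hand the OHCI platform data to the bus code, as spitz.c does */
                pxa_set_ohci_info(&myboard_ohci_platform_data);
        }
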
Signed-off-by: Richard Purdie Signed-off-by: Russell King --- arch/arm/mach-pxa/spitz.c | 1 + drivers/usb/host/ohci-pxa27x.c | 3 +++ include/asm-arm/arch-pxa/ohci.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 19b372df544a..44bcb8097c7a 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -371,6 +371,7 @@ static int spitz_ohci_init(struct device *dev) static struct pxaohci_platform_data spitz_ohci_platform_data = { .port_mode = PMM_NPS_MODE, .init = spitz_ohci_init, + .power_budget = 150, }; diff --git a/drivers/usb/host/ohci-pxa27x.c b/drivers/usb/host/ohci-pxa27x.c index acde8868da21..fafe7c1265b3 100644 --- a/drivers/usb/host/ohci-pxa27x.c +++ b/drivers/usb/host/ohci-pxa27x.c @@ -185,6 +185,9 @@ int usb_hcd_pxa27x_probe (const struct hc_driver *driver, struct platform_device /* Select Power Management Mode */ pxa27x_ohci_select_pmm(inf->port_mode); + if (inf->power_budget) + hcd->power_budget = inf->power_budget; + ohci_hcd_init(hcd_to_ohci(hcd)); retval = usb_add_hcd(hcd, pdev->resource[1].start, SA_INTERRUPT); diff --git a/include/asm-arm/arch-pxa/ohci.h b/include/asm-arm/arch-pxa/ohci.h index 7da89569061e..e848a47128cd 100644 --- a/include/asm-arm/arch-pxa/ohci.h +++ b/include/asm-arm/arch-pxa/ohci.h @@ -11,6 +11,8 @@ struct pxaohci_platform_data { #define PMM_NPS_MODE 1 #define PMM_GLOBAL_MODE 2 #define PMM_PERPORT_MODE 3 + + int power_budget; }; extern void pxa_set_ohci_info(struct pxaohci_platform_data *info); From e2f04e18941dbd3826901540a0be03f1728f8822 Mon Sep 17 00:00:00 2001 From: Matt Reimer Date: Thu, 8 Jun 2006 22:46:48 +0100 Subject: [PATCH 03/45] [ARM] 3546/1: PATCH: subtle lost interrupts bug on i.MX Patch from Matt Reimer There is a subtle bug in the GPIO interrupt status register handling in arch/arm/mach-imx/irq.c:imx_gpio_ack_irq(). The documentation states that a 1 should be written to the relevant bit to acknowledge a GPIO interrupt, but that is not what the code does. The problem is that the |= writes back 1s for all the *other* interrupts represented in the register, so interrupts could get lost. For example, if interrupts are pending for GPIO B10 and B12, ISR_B would have the value 0x00001400. Then when the interrupt code handles GPIO B10, it eventually calls imx_gpio_ack_irq(IRQ_GPIOB(10)), which effectively does this: ISR_B |= 1 << 10; with the result that (0x00001400 | 0x00000400) is written, clearing the interrupt status bits for *both* GPIO B10 and B12. The fix is to write 1s only for the interrupts we want to clear. The same problem seems to be occurring in the DMA code; this patch does not address those issues. 
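To make the arithmetic above concrete: on a write-one-to-clear status register, a read-modify-write acknowledges every interrupt that happens to be pending, not just the one being handled. A small standalone illustration of the B10/B12 example, with an ordinary variable standing in for the ISR register:

        #include <stdio.h>

        int main(void)
        {
                unsigned int isr = (1u << 10) | (1u << 12);     /* GPIO B10 and B12 pending: 0x00001400 */

                /* Old code: ISR |= 1 << 10. The value written back is 0x00001400,
                 * so the write-one-to-clear register loses the pending B12 bit too. */
                unsigned int buggy_write = isr | (1u << 10);

                /* Fixed code: ISR = 1 << 10. Only the bit being acknowledged is written. */
                unsigned int fixed_write = 1u << 10;

                printf("buggy write %#010x, fixed write %#010x\n", buggy_write, fixed_write);
                return 0;
        }
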
Acked-by: Sascha Hauer Signed-off-by: Matt Reimer Signed-off-by: Russell King --- arch/arm/mach-imx/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-imx/irq.c b/arch/arm/mach-imx/irq.c index eeb8a6d4a399..a5de5f1da9f2 100644 --- a/arch/arm/mach-imx/irq.c +++ b/arch/arm/mach-imx/irq.c @@ -127,7 +127,7 @@ static void imx_gpio_ack_irq(unsigned int irq) { DEBUG_IRQ("%s: irq %d\n", __FUNCTION__, irq); - ISR(IRQ_TO_REG(irq)) |= 1 << ((irq - IRQ_GPIOA(0)) % 32); + ISR(IRQ_TO_REG(irq)) = 1 << ((irq - IRQ_GPIOA(0)) % 32); } static void From d44647b0a6e48d18a1402dfa9052d85c4fe98341 Mon Sep 17 00:00:00 2001 From: Andy Currid Date: Thu, 8 Jun 2006 00:43:38 -0700 Subject: [PATCH 04/45] [PATCH] Fix HPET operation on 32-bit NVIDIA platforms From: "Andy Currid" This patch fixes a kernel panic during boot that occurs on NVIDIA platforms that have HPET enabled. When HPET is enabled, the standard timer IRQ is routed to IOAPIC pin 2 and is advertised as such in the ACPI APIC table - but an earlier workaround in the kernel was ignoring this override. The fix is to honor timer IRQ overrides from ACPI when HPET is detected on an NVIDIA platform. Signed-off-by: Andy Currid Cc: "Brown, Len" Cc: "Yu, Luming" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/earlyquirk.c | 23 ++++++++++++++++++++--- arch/i386/kernel/setup.c | 11 +++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 2e3b643a4dc4..1649a175a206 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -5,17 +5,34 @@ #include #include #include +#include + #include #include #include +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + static int __init check_bridge(int vendor, int device) { #ifdef CONFIG_ACPI - /* According to Nvidia all timer overrides are bogus. Just ignore - them all. */ + /* According to Nvidia all timer overrides are bogus unless HPET + is enabled. */ if (vendor == PCI_VENDOR_ID_NVIDIA) { - acpi_skip_timer_override = 1; + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + } } #endif if (vendor == PCI_VENDOR_ID_ATI && timer_over_8254 == 1) { diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 846e1639ef7c..dd6b0e3386ce 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1547,15 +1547,18 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_map_memmap(); -#ifdef CONFIG_X86_IO_APIC - check_acpi_pci(); /* Checks more than just ACPI actually */ -#endif - #ifdef CONFIG_ACPI /* * Parse the ACPI tables for possible boot-time SMP configuration. */ acpi_boot_table_init(); +#endif + +#ifdef CONFIG_X86_IO_APIC + check_acpi_pci(); /* Checks more than just ACPI actually */ +#endif + +#ifdef CONFIG_ACPI acpi_boot_init(); #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) From a2ef3a50f19f64d350bdc0aa15c31ae4b8973f57 Mon Sep 17 00:00:00 2001 From: Andy Currid Date: Thu, 8 Jun 2006 00:43:39 -0700 Subject: [PATCH 05/45] [PATCH] Fix HPET operation on 64-bit NVIDIA platforms From: "Andy Currid" This patch fixes a kernel panic during boot that occurs on NVIDIA platforms that have HPET enabled. 
When HPET is enabled, the standard timer IRQ is routed to IOAPIC pin 2 and is advertised as such in the ACPI APIC table - but an earlier workaround in the kernel was ignoring this override. The fix is to honor timer IRQ overrides from ACPI when HPET is detected on an NVIDIA platform. Signed-off-by: Andy Currid Cc: "Brown, Len" Cc: "Yu, Luming" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 0de3ea938830..9cc7031b7151 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -271,6 +271,18 @@ __setup("enable_8254_timer", setup_enable_8254_timer); #include #include + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + /* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC off. Check for an Nvidia or VIA PCI bridge and turn it off. Use pci direct infrastructure because this runs before the PCI subsystem. @@ -317,11 +329,19 @@ void __init check_ioapic(void) return; case PCI_VENDOR_ID_NVIDIA: #ifdef CONFIG_ACPI - /* All timer overrides on Nvidia - seem to be wrong. Skip them. */ - acpi_skip_timer_override = 1; - printk(KERN_INFO - "Nvidia board detected. Ignoring ACPI timer override.\n"); + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, + nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } #endif /* RED-PEN skip them on mptables too? */ return; From fd0a0ac1c5393b226640a30bae753983068136b3 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Thu, 8 Jun 2006 00:43:40 -0700 Subject: [PATCH 06/45] [PATCH] ep93xx build fix From: Lennert Buytenhek The recent renaming of m48t86's ->readb() and ->writeb() platform driver methods (2d7b20c1884777e66009be1a533641c19c4705f6) to ->readbyte() and ->writebyte() to fix the ia64 build broke the build of the cirrus ep93xx ARM platform. This patch fixes it up. 
Signed-off-by: Lennert Buytenhek Cc: Alessandro Zummo Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ep93xx/ts72xx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c index 9be01b0c3f48..e24566b88a78 100644 --- a/arch/arm/mach-ep93xx/ts72xx.c +++ b/arch/arm/mach-ep93xx/ts72xx.c @@ -111,21 +111,21 @@ static void __init ts72xx_map_io(void) } } -static unsigned char ts72xx_rtc_readb(unsigned long addr) +static unsigned char ts72xx_rtc_readbyte(unsigned long addr) { __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE); return __raw_readb(TS72XX_RTC_DATA_VIRT_BASE); } -static void ts72xx_rtc_writeb(unsigned char value, unsigned long addr) +static void ts72xx_rtc_writebyte(unsigned char value, unsigned long addr) { __raw_writeb(addr, TS72XX_RTC_INDEX_VIRT_BASE); __raw_writeb(value, TS72XX_RTC_DATA_VIRT_BASE); } static struct m48t86_ops ts72xx_rtc_ops = { - .readb = ts72xx_rtc_readb, - .writeb = ts72xx_rtc_writeb, + .readbyte = ts72xx_rtc_readbyte, + .writebyte = ts72xx_rtc_writebyte, }; static struct platform_device ts72xx_rtc_device = { From 45b35a5ced474b9fbbbfcfd5cf346c432d28d9fd Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 8 Jun 2006 00:43:41 -0700 Subject: [PATCH 07/45] [PATCH] Fix mempolicy.h build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Ralf Baechle uses struct mm_struct and relies on a definition or declaration somehow magically being dragged in which may result in a build: [...] CC mm/mempolicy.o In file included from mm/mempolicy.c:69: include/linux/mempolicy.h:150: warning: ‘struct mm_struct’ declared inside parameter list include/linux/mempolicy.h:150: warning: its scope is only this definition or declaration, which is probably not what you want include/linux/mempolicy.h:175: warning: ‘struct mm_struct’ declared inside parameter list mm/mempolicy.c:622: error: conflicting types for ‘do_migrate_pages’ include/linux/mempolicy.h:175: error: previous declaration of ‘do_migrate_pages’ was here mm/mempolicy.c:1661: error: conflicting types for ‘mpol_rebind_mm’ include/linux/mempolicy.h:150: error: previous declaration of ‘mpol_rebind_mm’ was here make[1]: *** [mm/mempolicy.o] Error 1 make: *** [mm] Error 2 [ralf@denk linux-ip35]$ Including is a step into direction of include hell so fixed by adding a forward declaration of struct mm_struct instead. Signed-off-by: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6a7621b2b12b..f5fdca1d67e6 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -36,6 +36,7 @@ #include struct vm_area_struct; +struct mm_struct; #ifdef CONFIG_NUMA From 26e780e8ef1cc3ef581a07aafe2346bb5a07b4f9 Mon Sep 17 00:00:00 2001 From: Malcom Parsons Date: Thu, 8 Jun 2006 00:43:42 -0700 Subject: [PATCH 08/45] [PATCH] fbcon: fix limited scroll in SCROLL_PAN_REDRAW mode From: Malcom Parsons When scrolling up in SCROLL_PAN_REDRAW mode with a large limited scroll region, the bottom few lines have to be redrawn. Without this patch, the wrong text is drawn into these lines, corrupting the display. Observed in 2.6.14 when running an IRC client in the Nintendo DS linux port. I haven't tested if scrolling down has the same problem. 
Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/console/fbcon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 953eb8c171d6..47ba1a79adcd 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -1745,7 +1745,7 @@ static int fbcon_scroll(struct vc_data *vc, int t, int b, int dir, fbcon_redraw_move(vc, p, 0, t, count); ypan_up_redraw(vc, t, count); if (vc->vc_rows - b > 0) - fbcon_redraw_move(vc, p, b - count, + fbcon_redraw_move(vc, p, b, vc->vc_rows - b, b); } else fbcon_redraw_move(vc, p, t + count, b - t - count, t); From bc1c116974a5c3f498112a6f175d3e4a8cd5bdbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Jun 2006 08:49:06 +0200 Subject: [PATCH 09/45] [PATCH] elevator switching race There's a race between shutting down one io scheduler and firing up the next, in which a new io could enter and cause the io scheduler to be invoked with bad or NULL data. To fix this, we need to maintain the queue lock for a bit longer. Unfortunately we cannot do that, since the elevator init requires to be run without the lock held. This isn't easily fixable, without also changing the mempool API. So split the initialization into two parts, and alloc-init operation and an attach operation. Then we can preallocate the io scheduler and related structures, and run the attach inside the lock after we detach the old one. This patch has survived 30 minutes of 1 second io scheduler switching with a very busy io load. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/as-iosched.c | 13 +++++----- block/cfq-iosched.c | 10 +++----- block/deadline-iosched.c | 13 +++++----- block/elevator.c | 55 +++++++++++++++++++++++++--------------- block/noop-iosched.c | 7 +++-- include/linux/elevator.h | 2 +- 6 files changed, 54 insertions(+), 46 deletions(-) diff --git a/block/as-iosched.c b/block/as-iosched.c index e25a5d79ab27..a7caf35ca0c2 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1648,17 +1648,17 @@ static void as_exit_queue(elevator_t *e) * initialize elevator private data (as_data), and alloc a arq for * each request on the free lists */ -static int as_init_queue(request_queue_t *q, elevator_t *e) +static void *as_init_queue(request_queue_t *q, elevator_t *e) { struct as_data *ad; int i; if (!arq_pool) - return -ENOMEM; + return NULL; ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node); if (!ad) - return -ENOMEM; + return NULL; memset(ad, 0, sizeof(*ad)); ad->q = q; /* Identify what queue the data belongs to */ @@ -1667,7 +1667,7 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) GFP_KERNEL, q->node); if (!ad->hash) { kfree(ad); - return -ENOMEM; + return NULL; } ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, @@ -1675,7 +1675,7 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) if (!ad->arq_pool) { kfree(ad->hash); kfree(ad); - return -ENOMEM; + return NULL; } /* anticipatory scheduling helpers */ @@ -1696,14 +1696,13 @@ static int as_init_queue(request_queue_t *q, elevator_t *e) ad->antic_expire = default_antic_expire; ad->batch_expire[REQ_SYNC] = default_read_batch_expire; ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; - e->elevator_data = ad; ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10; if (ad->write_batch_count < 2) ad->write_batch_count = 2; - return 0; + return ad; } /* 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8e9d84825e1c..a46d030e092a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2251,14 +2251,14 @@ static void cfq_exit_queue(elevator_t *e) kfree(cfqd); } -static int cfq_init_queue(request_queue_t *q, elevator_t *e) +static void *cfq_init_queue(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; int i; cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL); if (!cfqd) - return -ENOMEM; + return NULL; memset(cfqd, 0, sizeof(*cfqd)); @@ -2288,8 +2288,6 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) for (i = 0; i < CFQ_QHASH_ENTRIES; i++) INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - e->elevator_data = cfqd; - cfqd->queue = q; cfqd->max_queued = q->nr_requests / 4; @@ -2316,14 +2314,14 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; - return 0; + return cfqd; out_crqpool: kfree(cfqd->cfq_hash); out_cfqhash: kfree(cfqd->crq_hash); out_crqhash: kfree(cfqd); - return -ENOMEM; + return NULL; } static void cfq_slab_kill(void) diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 399fa1e60e1f..3bd0415a9828 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -613,24 +613,24 @@ static void deadline_exit_queue(elevator_t *e) * initialize elevator private data (deadline_data), and alloc a drq for * each request on the free lists */ -static int deadline_init_queue(request_queue_t *q, elevator_t *e) +static void *deadline_init_queue(request_queue_t *q, elevator_t *e) { struct deadline_data *dd; int i; if (!drq_pool) - return -ENOMEM; + return NULL; dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) - return -ENOMEM; + return NULL; memset(dd, 0, sizeof(*dd)); dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES, GFP_KERNEL, q->node); if (!dd->hash) { kfree(dd); - return -ENOMEM; + return NULL; } dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, @@ -638,7 +638,7 @@ static int deadline_init_queue(request_queue_t *q, elevator_t *e) if (!dd->drq_pool) { kfree(dd->hash); kfree(dd); - return -ENOMEM; + return NULL; } for (i = 0; i < DL_HASH_ENTRIES; i++) @@ -653,8 +653,7 @@ static int deadline_init_queue(request_queue_t *q, elevator_t *e) dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; - e->elevator_data = dd; - return 0; + return dd; } static void deadline_put_request(request_queue_t *q, struct request *rq) diff --git a/block/elevator.c b/block/elevator.c index 8768a367fdde..a0afdd317cef 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -121,16 +121,16 @@ static struct elevator_type *elevator_get(const char *name) return e; } -static int elevator_attach(request_queue_t *q, struct elevator_queue *eq) +static void *elevator_init_queue(request_queue_t *q, struct elevator_queue *eq) { - int ret = 0; + return eq->ops->elevator_init_fn(q, eq); +} +static void elevator_attach(request_queue_t *q, struct elevator_queue *eq, + void *data) +{ q->elevator = eq; - - if (eq->ops->elevator_init_fn) - ret = eq->ops->elevator_init_fn(q, eq); - - return ret; + eq->elevator_data = data; } static char chosen_elevator[16]; @@ -181,6 +181,7 @@ int elevator_init(request_queue_t *q, char *name) struct elevator_type *e = NULL; struct elevator_queue *eq; int ret = 0; + void *data; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; @@ -202,10 +203,13 @@ int elevator_init(request_queue_t *q, char *name) if (!eq) return -ENOMEM; - ret 
= elevator_attach(q, eq); - if (ret) + data = elevator_init_queue(q, eq); + if (!data) { kobject_put(&eq->kobj); + return -ENOMEM; + } + elevator_attach(q, eq, data); return ret; } @@ -722,13 +726,16 @@ int elv_register_queue(struct request_queue *q) return error; } +static void __elv_unregister_queue(elevator_t *e) +{ + kobject_uevent(&e->kobj, KOBJ_REMOVE); + kobject_del(&e->kobj); +} + void elv_unregister_queue(struct request_queue *q) { - if (q) { - elevator_t *e = q->elevator; - kobject_uevent(&e->kobj, KOBJ_REMOVE); - kobject_del(&e->kobj); - } + if (q) + __elv_unregister_queue(q->elevator); } int elv_register(struct elevator_type *e) @@ -780,6 +787,7 @@ EXPORT_SYMBOL_GPL(elv_unregister); static int elevator_switch(request_queue_t *q, struct elevator_type *new_e) { elevator_t *old_elevator, *e; + void *data; /* * Allocate new elevator @@ -788,6 +796,12 @@ static int elevator_switch(request_queue_t *q, struct elevator_type *new_e) if (!e) return 0; + data = elevator_init_queue(q, e); + if (!data) { + kobject_put(&e->kobj); + return 0; + } + /* * Turn on BYPASS and drain all requests w/ elevator private data */ @@ -806,19 +820,19 @@ static int elevator_switch(request_queue_t *q, struct elevator_type *new_e) elv_drain_elevator(q); } - spin_unlock_irq(q->queue_lock); - /* - * unregister old elevator data + * Remember old elevator. */ - elv_unregister_queue(q); old_elevator = q->elevator; /* * attach and start new elevator */ - if (elevator_attach(q, e)) - goto fail; + elevator_attach(q, e, data); + + spin_unlock_irq(q->queue_lock); + + __elv_unregister_queue(old_elevator); if (elv_register_queue(q)) goto fail_register; @@ -837,7 +851,6 @@ fail_register: */ elevator_exit(e); e = NULL; -fail: q->elevator = old_elevator; elv_register_queue(q); clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); diff --git a/block/noop-iosched.c b/block/noop-iosched.c index f370e4a7fe6d..56a7c620574f 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -65,16 +65,15 @@ noop_latter_request(request_queue_t *q, struct request *rq) return list_entry(rq->queuelist.next, struct request, queuelist); } -static int noop_init_queue(request_queue_t *q, elevator_t *e) +static void *noop_init_queue(request_queue_t *q, elevator_t *e) { struct noop_data *nd; nd = kmalloc(sizeof(*nd), GFP_KERNEL); if (!nd) - return -ENOMEM; + return NULL; INIT_LIST_HEAD(&nd->queue); - e->elevator_data = nd; - return 0; + return nd; } static void noop_exit_queue(elevator_t *e) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ad133fcfb239..1713ace808bf 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -21,7 +21,7 @@ typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); typedef void (elevator_activate_req_fn) (request_queue_t *, struct request *); typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); -typedef int (elevator_init_fn) (request_queue_t *, elevator_t *); +typedef void *(elevator_init_fn) (request_queue_t *, elevator_t *); typedef void (elevator_exit_fn) (elevator_t *); struct elevator_ops From 71601e2b33dad9acb8d7844f7321f90ed9d1bce8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Jun 2006 10:26:39 +0200 Subject: [PATCH 10/45] [PATCH] debugfs inode leak Looking at the reiser4 crash, I found a leak in debugfs. In debugfs_mknod(), we create the inode before checking if the dentry already has one attached. We don't free it if that is the case. 
These bugs happen quite often, I'm starting to think we should disallow such coding in CodingStyle. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- fs/debugfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 85d166cdcae4..b55b4ea9a676 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -67,12 +67,13 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d static int debugfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = debugfs_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -EPERM; if (dentry->d_inode) return -EEXIST; + inode = debugfs_get_inode(dir->i_sb, mode, dev); if (inode) { d_instantiate(dentry, inode); dget(dentry); From bafe00cc9297ca77b66e5c83e5e65e17c0c997c8 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 8 Jun 2006 01:36:20 -0700 Subject: [PATCH 11/45] [PATCH] s390: fix in-user atomic futex operation. From: Martin Schwidefsky __futex_atomic_op needs to do an atomic operation in the user address space, not the kernel address space. Add the missing sacf 256/sacf 0 to switch to the secondary mode before doing the compare-and-swap. In addition add another fixup for catch specification exceptions if the compare-and-swap address is not aligned. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-s390/futex.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/asm-s390/futex.h b/include/asm-s390/futex.h index 40c25e166a9b..1802775568b9 100644 --- a/include/asm-s390/futex.h +++ b/include/asm-s390/futex.h @@ -11,23 +11,24 @@ #define __futex_atomic_fixup \ ".section __ex_table,\"a\"\n" \ " .align 4\n" \ - " .long 0b,2b,1b,2b\n" \ + " .long 0b,4b,2b,4b,3b,4b\n" \ ".previous" #else /* __s390x__ */ #define __futex_atomic_fixup \ ".section __ex_table,\"a\"\n" \ " .align 8\n" \ - " .quad 0b,2b,1b,2b\n" \ + " .quad 0b,4b,2b,4b,3b,4b\n" \ ".previous" #endif /* __s390x__ */ #define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ - asm volatile(" l %1,0(%6)\n" \ - "0: " insn \ - " cs %1,%2,0(%6)\n" \ - "1: jl 0b\n" \ + asm volatile(" sacf 256\n" \ + "0: l %1,0(%6)\n" \ + "1: " insn \ + "2: cs %1,%2,0(%6)\n" \ + "3: jl 1b\n" \ " lhi %0,0\n" \ - "2:\n" \ + "4: sacf 0\n" \ __futex_atomic_fixup \ : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ "=m" (*uaddr) \ From 7c85d1f9d358b24c5b05c3a2783a78423775a080 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jun 2006 13:02:59 +1000 Subject: [PATCH 12/45] powerpc: Fix machine check problem on 32-bit kernels This fixes a bug found by Dave Jones that means that it is possible for userspace to provoke a machine check on 32-bit kernels. This also fixes a couple of other places where I found similar problems by inspection. 
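The hunks below all follow the same pattern: a pointer (or an extended region around one) obtained from user-supplied data must pass access_ok() before the non-checking accessors (__get_user, __copy_from_user) touch it, otherwise userspace can aim the kernel at an address whose access raises a machine check. A condensed sketch of that pattern, with placeholder names (struct foo and read_user_struct are illustrative, not from the patch):

        /* ptr was itself fetched from user memory, e.g. out of a ucontext */
        static int read_user_struct(struct foo __user *ptr, struct foo *out)
        {
                if (!access_ok(VERIFY_READ, ptr, sizeof(*ptr)))
                        return -EFAULT;         /* reject bogus (e.g. I/O or kernel) addresses up front */
                if (__copy_from_user(out, ptr, sizeof(*out)))
                        return -EFAULT;         /* the copy itself faulted */
                return 0;
        }
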
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/signal_32.c | 11 ++++++++++- arch/powerpc/kernel/signal_64.c | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 01e3c08cb550..8fdeca2d4597 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -803,10 +803,13 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int if (__get_user(cmcp, &ucp->uc_regs)) return -EFAULT; mcp = (struct mcontext __user *)(u64)cmcp; + /* no need to check access_ok(mcp), since mcp < 4GB */ } #else if (__get_user(mcp, &ucp->uc_regs)) return -EFAULT; + if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp))) + return -EFAULT; #endif restore_sigmask(&set); if (restore_user_regs(regs, mcp, sig)) @@ -908,13 +911,14 @@ int sys_debug_setcontext(struct ucontext __user *ctx, { struct sig_dbg_op op; int i; + unsigned char tmp; unsigned long new_msr = regs->msr; #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) unsigned long new_dbcr0 = current->thread.dbcr0; #endif for (i=0; ithread.dbcr0 = new_dbcr0; #endif + if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) + || __get_user(tmp, (u8 __user *) ctx) + || __get_user(tmp, (u8 __user *) (ctx + 1) - 1)) + return -EFAULT; + /* * If we get a fault copying the context into the kernel's * image of the user's registers, we can't just return -EFAULT diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 27f65b95184d..c2db642f4cdd 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -182,6 +182,8 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, err |= __get_user(msr, &sc->gp_regs[PT_MSR]); if (err) return err; + if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ if (v_regs != 0 && (msr & MSR_VEC) != 0) err |= __copy_from_user(current->thread.vr, v_regs, From 33b7497794424181dca87f18e43ecbc07f86bba5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Jun 2006 12:01:32 +1000 Subject: [PATCH 13/45] [PATCH] powerpc: Fix call to ibm,client-architecture-support The code in prom_init.c calling the firmware ibm,client-architecture-support method on pSeries has a bug where it fails to properly pass the instance handle of the firmware object when trying to call a method. Result ranges from the call doing nothing to the firmware crashing. (Found by Segher, thanks !) Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Segher Boessenkool Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/prom_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 41e9ab40cd54..f393a3867430 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -822,6 +822,7 @@ static void __init prom_send_capabilities(void) /* try calling the ibm,client-architecture-support method */ if (call_prom_ret("call-method", 3, 2, &ret, ADDR("ibm,client-architecture-support"), + root, ADDR(ibm_architecture_vec)) == 0) { /* the call exists... */ if (ret) From 133dda1e4f757e036fa838cba6804d0344931c4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 7 Jun 2006 12:04:18 +1000 Subject: [PATCH 14/45] [PATCH] powerpc: Fix cell blade detection The IBM Cell blade firmware might confuse the kernel to think it's a pSeries machine. This fixes it for now. 
With a bit of luck, the firmware will be updated to avoid that in the future but currently that patch is needed. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/prom_init.c | 9 +++++++++ arch/powerpc/platforms/cell/setup.c | 11 +++++------ arch/powerpc/platforms/pseries/setup.c | 8 ++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f393a3867430..f70bd090dacd 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1623,6 +1623,15 @@ static int __init prom_find_machine_type(void) if (strstr(p, RELOC("Power Macintosh")) || strstr(p, RELOC("MacRISC"))) return PLATFORM_POWERMAC; +#ifdef CONFIG_PPC64 + /* We must make sure we don't detect the IBM Cell + * blades as pSeries due to some firmware issues, + * so we do it here. + */ + if (strstr(p, RELOC("IBM,CBEA")) || + strstr(p, RELOC("IBM,CPBW-1.0"))) + return PLATFORM_GENERIC; +#endif /* CONFIG_PPC64 */ i += sl + 1; } } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index 6574b22b3cf3..fd3e5609e3e0 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -125,14 +125,13 @@ static void __init cell_init_early(void) static int __init cell_probe(void) { - /* XXX This is temporary, the Cell maintainer will come up with - * more appropriate detection logic - */ unsigned long root = of_get_flat_dt_root(); - if (!of_flat_dt_is_compatible(root, "IBM,CPBW-1.0")) - return 0; - return 1; + if (of_flat_dt_is_compatible(root, "IBM,CBEA") || + of_flat_dt_is_compatible(root, "IBM,CPBW-1.0")) + return 1; + + return 0; } /* diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 5f79f01c44f2..3ba87835757e 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -389,6 +389,7 @@ static int __init pSeries_probe_hypertas(unsigned long node, static int __init pSeries_probe(void) { + unsigned long root = of_get_flat_dt_root(); char *dtype = of_get_flat_dt_prop(of_get_flat_dt_root(), "device_type", NULL); if (dtype == NULL) @@ -396,6 +397,13 @@ static int __init pSeries_probe(void) if (strcmp(dtype, "chrp")) return 0; + /* Cell blades firmware claims to be chrp while it's not. Until this + * is fixed, we need to avoid those here. + */ + if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0") || + of_flat_dt_is_compatible(root, "IBM,CBEA")) + return 0; + DBG("pSeries detected, looking for LPAR capability...\n"); /* Now try to figure out if we are running on LPAR */ From 5224e6cc3ab5ae03895bbb67f4a26ce72e62ce58 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Jun 2006 17:37:41 -0700 Subject: [PATCH 15/45] [SPARC64]: Dump local cpu registers in sun4v_log_error() This makes the debugging information more usable. Signed-off-by: David S. 
Miller --- arch/sparc64/kernel/traps.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 2793a5d82380..563db528e031 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -1797,7 +1797,9 @@ static const char *sun4v_err_type_to_str(u32 type) }; } -static void sun4v_log_error(struct sun4v_error_entry *ent, int cpu, const char *pfx, atomic_t *ocnt) +extern void __show_regs(struct pt_regs * regs); + +static void sun4v_log_error(struct pt_regs *regs, struct sun4v_error_entry *ent, int cpu, const char *pfx, atomic_t *ocnt) { int cnt; @@ -1830,6 +1832,8 @@ static void sun4v_log_error(struct sun4v_error_entry *ent, int cpu, const char * pfx, ent->err_raddr, ent->err_size, ent->err_cpu); + __show_regs(regs); + if ((cnt = atomic_read(ocnt)) != 0) { atomic_set(ocnt, 0); wmb(); @@ -1862,7 +1866,7 @@ void sun4v_resum_error(struct pt_regs *regs, unsigned long offset) put_cpu(); - sun4v_log_error(&local_copy, cpu, + sun4v_log_error(regs, &local_copy, cpu, KERN_ERR "RESUMABLE ERROR", &sun4v_resum_oflow_cnt); } @@ -1910,7 +1914,7 @@ void sun4v_nonresum_error(struct pt_regs *regs, unsigned long offset) } #endif - sun4v_log_error(&local_copy, cpu, + sun4v_log_error(regs, &local_copy, cpu, KERN_EMERG "NON-RESUMABLE ERROR", &sun4v_nonresum_oflow_cnt); @@ -2200,7 +2204,6 @@ static inline struct reg_window *kernel_stack_up(struct reg_window *rw) void die_if_kernel(char *str, struct pt_regs *regs) { static int die_counter; - extern void __show_regs(struct pt_regs * regs); extern void smp_report_regs(void); int count = 0; From f49639e643e69ff233b14966b8d48541d2e17517 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 9 Jun 2006 11:58:36 -0700 Subject: [PATCH 16/45] [TG3]: Handle Sun onboard tg3 chips more correctly. Get rid of all the SUN_570X logic and instead: 1) Make sure MEMARB_ENABLE is set when we probe the SRAM for config information. If that is off we will get timeouts. 2) Always try to sync with the firmware, if there is no firmware running do not treat it as an error and instead just report it the first time we notice this condition. 3) If there is no valid SRAM signature, assume the device is onboard by setting TG3_FLAG_EEPROM_WRITE_PROT. Update driver version and release date. With help from Michael Chan and Fabio Massimo Di Nitto. Signed-off-by: David S. Miller --- drivers/net/tg3.c | 144 ++++++++++++++++------------------------------ drivers/net/tg3.h | 3 +- 2 files changed, 50 insertions(+), 97 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 49ad60b72657..862c226dbbe2 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -69,8 +69,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.58" -#define DRV_MODULE_RELDATE "May 22, 2006" +#define DRV_MODULE_VERSION "3.59" +#define DRV_MODULE_RELDATE "June 8, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -4485,9 +4485,8 @@ static void tg3_disable_nvram_access(struct tg3 *tp) /* tp->lock is held. 
*/ static void tg3_write_sig_pre_reset(struct tg3 *tp, int kind) { - if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) - tg3_write_mem(tp, NIC_SRAM_FIRMWARE_MBOX, - NIC_SRAM_FIRMWARE_MBOX_MAGIC1); + tg3_write_mem(tp, NIC_SRAM_FIRMWARE_MBOX, + NIC_SRAM_FIRMWARE_MBOX_MAGIC1); if (tp->tg3_flags2 & TG3_FLG2_ASF_NEW_HANDSHAKE) { switch (kind) { @@ -4568,13 +4567,12 @@ static int tg3_chip_reset(struct tg3 *tp) void (*write_op)(struct tg3 *, u32, u32); int i; - if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) { - tg3_nvram_lock(tp); - /* No matching tg3_nvram_unlock() after this because - * chip reset below will undo the nvram lock. - */ - tp->nvram_lock_cnt = 0; - } + tg3_nvram_lock(tp); + + /* No matching tg3_nvram_unlock() after this because + * chip reset below will undo the nvram lock. + */ + tp->nvram_lock_cnt = 0; if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 || GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 || @@ -4727,20 +4725,25 @@ static int tg3_chip_reset(struct tg3 *tp) tw32_f(MAC_MODE, 0); udelay(40); - if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) { - /* Wait for firmware initialization to complete. */ - for (i = 0; i < 100000; i++) { - tg3_read_mem(tp, NIC_SRAM_FIRMWARE_MBOX, &val); - if (val == ~NIC_SRAM_FIRMWARE_MBOX_MAGIC1) - break; - udelay(10); - } - if (i >= 100000) { - printk(KERN_ERR PFX "tg3_reset_hw timed out for %s, " - "firmware will not restart magic=%08x\n", - tp->dev->name, val); - return -ENODEV; - } + /* Wait for firmware initialization to complete. */ + for (i = 0; i < 100000; i++) { + tg3_read_mem(tp, NIC_SRAM_FIRMWARE_MBOX, &val); + if (val == ~NIC_SRAM_FIRMWARE_MBOX_MAGIC1) + break; + udelay(10); + } + + /* Chip might not be fitted with firmare. Some Sun onboard + * parts are configured like that. So don't signal the timeout + * of the above loop as an error, but do report the lack of + * running firmware once. + */ + if (i >= 100000 && + !(tp->tg3_flags2 & TG3_FLG2_NO_FWARE_REPORTED)) { + tp->tg3_flags2 |= TG3_FLG2_NO_FWARE_REPORTED; + + printk(KERN_INFO PFX "%s: No firmware running.\n", + tp->dev->name); } if ((tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) && @@ -9075,9 +9078,6 @@ static void __devinit tg3_nvram_init(struct tg3 *tp) { int j; - if (tp->tg3_flags2 & TG3_FLG2_SUN_570X) - return; - tw32_f(GRC_EEPROM_ADDR, (EEPROM_ADDR_FSM_RESET | (EEPROM_DEFAULT_CLOCK_PERIOD << @@ -9210,11 +9210,6 @@ static int tg3_nvram_read(struct tg3 *tp, u32 offset, u32 *val) { int ret; - if (tp->tg3_flags2 & TG3_FLG2_SUN_570X) { - printk(KERN_ERR PFX "Attempt to do nvram_read on Sun 570X\n"); - return -EINVAL; - } - if (!(tp->tg3_flags & TG3_FLAG_NVRAM)) return tg3_nvram_read_using_eeprom(tp, offset, val); @@ -9447,11 +9442,6 @@ static int tg3_nvram_write_block(struct tg3 *tp, u32 offset, u32 len, u8 *buf) { int ret; - if (tp->tg3_flags2 & TG3_FLG2_SUN_570X) { - printk(KERN_ERR PFX "Attempt to do nvram_write on Sun 570X\n"); - return -EINVAL; - } - if (tp->tg3_flags & TG3_FLAG_EEPROM_WRITE_PROT) { tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl & ~GRC_LCLCTRL_GPIO_OUTPUT1); @@ -9578,15 +9568,19 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp) pci_write_config_dword(tp->pdev, TG3PCI_MISC_HOST_CTRL, tp->misc_host_ctrl); + /* The memory arbiter has to be enabled in order for SRAM accesses + * to succeed. Normally on powerup the tg3 chip firmware will make + * sure it is enabled, but other entities such as system netboot + * code might disable it. 
+ */ + val = tr32(MEMARB_MODE); + tw32(MEMARB_MODE, val | MEMARB_MODE_ENABLE); + tp->phy_id = PHY_ID_INVALID; tp->led_ctrl = LED_CTRL_MODE_PHY_1; - /* Do not even try poking around in here on Sun parts. */ - if (tp->tg3_flags2 & TG3_FLG2_SUN_570X) { - /* All SUN chips are built-in LOMs. */ - tp->tg3_flags |= TG3_FLAG_EEPROM_WRITE_PROT; - return; - } + /* Assume an onboard device by default. */ + tp->tg3_flags |= TG3_FLAG_EEPROM_WRITE_PROT; tg3_read_mem(tp, NIC_SRAM_DATA_SIG, &val); if (val == NIC_SRAM_DATA_SIG_MAGIC) { @@ -9686,6 +9680,8 @@ static void __devinit tg3_get_eeprom_hw_cfg(struct tg3 *tp) if (nic_cfg & NIC_SRAM_DATA_CFG_EEPROM_WP) tp->tg3_flags |= TG3_FLAG_EEPROM_WRITE_PROT; + else + tp->tg3_flags &= ~TG3_FLAG_EEPROM_WRITE_PROT; if (nic_cfg & NIC_SRAM_DATA_CFG_ASF_ENABLE) { tp->tg3_flags |= TG3_FLAG_ENABLE_ASF; @@ -9834,16 +9830,8 @@ static void __devinit tg3_read_partno(struct tg3 *tp) int i; u32 magic; - if (tp->tg3_flags2 & TG3_FLG2_SUN_570X) { - /* Sun decided not to put the necessary bits in the - * NVRAM of their onboard tg3 parts :( - */ - strcpy(tp->board_part_number, "Sun 570X"); - return; - } - if (tg3_nvram_read_swab(tp, 0x0, &magic)) - return; + goto out_not_found; if (magic == TG3_EEPROM_MAGIC) { for (i = 0; i < 256; i += 4) { @@ -9874,6 +9862,9 @@ static void __devinit tg3_read_partno(struct tg3 *tp) break; msleep(1); } + if (!(tmp16 & 0x8000)) + goto out_not_found; + pci_read_config_dword(tp->pdev, vpd_cap + PCI_VPD_DATA, &tmp); tmp = cpu_to_le32(tmp); @@ -9965,37 +9956,6 @@ static void __devinit tg3_read_fw_ver(struct tg3 *tp) } } -#ifdef CONFIG_SPARC64 -static int __devinit tg3_is_sun_570X(struct tg3 *tp) -{ - struct pci_dev *pdev = tp->pdev; - struct pcidev_cookie *pcp = pdev->sysdata; - - if (pcp != NULL) { - int node = pcp->prom_node; - u32 venid; - int err; - - err = prom_getproperty(node, "subsystem-vendor-id", - (char *) &venid, sizeof(venid)); - if (err == 0 || err == -1) - return 0; - if (venid == PCI_VENDOR_ID_SUN) - return 1; - - /* TG3 chips onboard the SunBlade-2500 don't have the - * subsystem-vendor-id set to PCI_VENDOR_ID_SUN but they - * are distinguishable from non-Sun variants by being - * named "network" by the firmware. Non-Sun cards will - * show up as being named "ethernet". - */ - if (!strcmp(pcp->prom_name, "network")) - return 1; - } - return 0; -} -#endif - static int __devinit tg3_get_invariants(struct tg3 *tp) { static struct pci_device_id write_reorder_chipsets[] = { @@ -10012,11 +9972,6 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) u16 pci_cmd; int err; -#ifdef CONFIG_SPARC64 - if (tg3_is_sun_570X(tp)) - tp->tg3_flags2 |= TG3_FLG2_SUN_570X; -#endif - /* Force memory write invalidate off. If we leave it on, * then on 5700_BX chips we have to enable a workaround. * The workaround is to set the TG3PCI_DMA_RW_CTRL boundary @@ -10312,8 +10267,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (tp->write32 == tg3_write_indirect_reg32 || ((tp->tg3_flags & TG3_FLAG_PCIX_MODE) && (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5700 || - GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5701)) || - (tp->tg3_flags2 & TG3_FLG2_SUN_570X)) + GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5701))) tp->tg3_flags |= TG3_FLAG_SRAM_USE_CONFIG; /* Get eeprom hw config before calling tg3_set_power_state(). 
@@ -10594,8 +10548,7 @@ static int __devinit tg3_get_device_address(struct tg3 *tp) #endif mac_offset = 0x7c; - if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704 && - !(tp->tg3_flags & TG3_FLG2_SUN_570X)) || + if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704) || (tp->tg3_flags2 & TG3_FLG2_5780_CLASS)) { if (tr32(TG3PCI_DUAL_MAC_CTRL) & DUAL_MAC_CTRL_ID) mac_offset = 0xcc; @@ -10622,8 +10575,7 @@ static int __devinit tg3_get_device_address(struct tg3 *tp) } if (!addr_ok) { /* Next, try NVRAM. */ - if (!(tp->tg3_flags & TG3_FLG2_SUN_570X) && - !tg3_nvram_read(tp, mac_offset + 0, &hi) && + if (!tg3_nvram_read(tp, mac_offset + 0, &hi) && !tg3_nvram_read(tp, mac_offset + 4, &lo)) { dev->dev_addr[0] = ((hi >> 16) & 0xff); dev->dev_addr[1] = ((hi >> 24) & 0xff); diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 0e29b885d449..ff0faab94bd5 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2184,7 +2184,7 @@ struct tg3 { #define TG3_FLAG_INIT_COMPLETE 0x80000000 u32 tg3_flags2; #define TG3_FLG2_RESTART_TIMER 0x00000001 -#define TG3_FLG2_SUN_570X 0x00000002 +/* 0x00000002 available */ #define TG3_FLG2_NO_ETH_WIRE_SPEED 0x00000004 #define TG3_FLG2_IS_5788 0x00000008 #define TG3_FLG2_MAX_RXPEND_64 0x00000010 @@ -2216,6 +2216,7 @@ struct tg3 { #define TG3_FLG2_HW_TSO (TG3_FLG2_HW_TSO_1 | TG3_FLG2_HW_TSO_2) #define TG3_FLG2_1SHOT_MSI 0x10000000 #define TG3_FLG2_PHY_JITTER_BUG 0x20000000 +#define TG3_FLG2_NO_FWARE_REPORTED 0x40000000 u32 split_mode_max_reqs; #define SPLIT_MODE_5704_MAX_REQ 3 From c29ca9d1812f2abacaefa7daa31e085600128938 Mon Sep 17 00:00:00 2001 From: "Tom \"spot\" Callaway" Date: Fri, 9 Jun 2006 17:01:48 -0700 Subject: [PATCH 17/45] [FUSION]: Fix mptspi.c build with CONFIG_PM not set. Signed-off-by: Tom "spot" Callaway Signed-off-by: David S. Miller --- drivers/message/fusion/mptspi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/message/fusion/mptspi.c b/drivers/message/fusion/mptspi.c index f2a4d382ea19..3201de053943 100644 --- a/drivers/message/fusion/mptspi.c +++ b/drivers/message/fusion/mptspi.c @@ -831,6 +831,7 @@ mptspi_ioc_reset(MPT_ADAPTER *ioc, int reset_phase) return rc; } +#ifdef CONFIG_PM /* * spi module resume handler */ @@ -846,6 +847,7 @@ mptspi_resume(struct pci_dev *pdev) return rc; } +#endif /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ From 46b304934de417a2238d659ef6459a74cb3f5e6b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 10 Jun 2006 01:06:25 -0700 Subject: [PATCH 18/45] [SPARC64]: Avoid JBUS errors on some Niagara systems. Doing PCI config space accesses to non-present PCI slots can result in fatal JBUS errors if the PCI config access hypervisor call is performed on cpus other than the boot cpu. PCI config space accesses to present PCI slots works just fine. Recursively traverse the OBP device tree under the PCI controller node and record all present device IDs into a small hash table. Avoid the hypervisor call for any PCI config space access attempt for a device not recorded in the hash table. Signed-off-by: David S. 
Miller --- arch/sparc64/kernel/pci_sun4v.c | 124 ++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 6 deletions(-) diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c index 2b7a1f316a93..0c0895202970 100644 --- a/arch/sparc64/kernel/pci_sun4v.c +++ b/arch/sparc64/kernel/pci_sun4v.c @@ -599,18 +599,128 @@ struct pci_iommu_ops pci_sun4v_iommu_ops = { /* SUN4V PCI configuration space accessors. */ -static inline int pci_sun4v_out_of_range(struct pci_pbm_info *pbm, unsigned int bus, unsigned int device, unsigned int func) +struct pdev_entry { + struct pdev_entry *next; + u32 devhandle; + unsigned int bus; + unsigned int device; + unsigned int func; +}; + +#define PDEV_HTAB_SIZE 16 +#define PDEV_HTAB_MASK (PDEV_HTAB_SIZE - 1) +static struct pdev_entry *pdev_htab[PDEV_HTAB_SIZE]; + +static inline unsigned int pdev_hashfn(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func) { - if (bus == pbm->pci_first_busno) { - if (device == 0 && func == 0) - return 0; - return 1; + unsigned int val; + + val = (devhandle ^ (devhandle >> 4)); + val ^= bus; + val ^= device; + val ^= func; + + return val & PDEV_HTAB_MASK; +} + +static int pdev_htab_add(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func) +{ + struct pdev_entry *p = kmalloc(sizeof(*p), GFP_KERNEL); + struct pdev_entry **slot; + + if (!p) + return -ENOMEM; + + slot = &pdev_htab[pdev_hashfn(devhandle, bus, device, func)]; + p->next = *slot; + *slot = p; + + p->devhandle = devhandle; + p->bus = bus; + p->device = device; + p->func = func; + + return 0; +} + +/* Recursively descend into the OBP device tree, rooted at toplevel_node, + * looking for a PCI device matching bus and devfn. + */ +static int obp_find(struct linux_prom_pci_registers *pregs, int toplevel_node, unsigned int bus, unsigned int devfn) +{ + toplevel_node = prom_getchild(toplevel_node); + + while (toplevel_node != 0) { + int ret = obp_find(pregs, toplevel_node, bus, devfn); + + if (ret != 0) + return ret; + + ret = prom_getproperty(toplevel_node, "reg", (char *) pregs, + sizeof(*pregs) * PROMREG_MAX); + if (ret == 0 || ret == -1) + goto next_sibling; + + if (((pregs[0].phys_hi >> 16) & 0xff) == bus && + ((pregs[0].phys_hi >> 8) & 0xff) == devfn) + break; + + next_sibling: + toplevel_node = prom_getsibling(toplevel_node); } + return toplevel_node; +} + +static int pdev_htab_populate(struct pci_pbm_info *pbm) +{ + struct linux_prom_pci_registers pr[PROMREG_MAX]; + u32 devhandle = pbm->devhandle; + unsigned int bus; + + for (bus = pbm->pci_first_busno; bus <= pbm->pci_last_busno; bus++) { + unsigned int devfn; + + for (devfn = 0; devfn < 256; devfn++) { + unsigned int device = PCI_SLOT(devfn); + unsigned int func = PCI_FUNC(devfn); + + if (obp_find(pr, pbm->prom_node, bus, devfn)) { + int err = pdev_htab_add(devhandle, bus, + device, func); + if (err) + return err; + } + } + } + + return 0; +} + +static struct pdev_entry *pdev_find(u32 devhandle, unsigned int bus, unsigned int device, unsigned int func) +{ + struct pdev_entry *p; + + p = pdev_htab[pdev_hashfn(devhandle, bus, device, func)]; + while (p) { + if (p->devhandle == devhandle && + p->bus == bus && + p->device == device && + p->func == func) + break; + + p = p->next; + } + + return p; +} + +static inline int pci_sun4v_out_of_range(struct pci_pbm_info *pbm, unsigned int bus, unsigned int device, unsigned int func) +{ if (bus < pbm->pci_first_busno || bus > pbm->pci_last_busno) return 1; - return 0; + return pdev_find(pbm->devhandle, bus, device, 
func) == NULL; } static int pci_sun4v_read_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn, @@ -1063,6 +1173,8 @@ static void pci_sun4v_pbm_init(struct pci_controller_info *p, int prom_node, u32 pci_sun4v_get_bus_range(pbm); pci_sun4v_iommu_init(pbm); + + pdev_htab_populate(pbm); } void sun4v_pci_init(int node, char *model_name) From 56f1319e877a969b814b3805c77ea9c31d849f54 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 10 Jun 2006 12:42:12 +0100 Subject: [PATCH 19/45] [ARM] Fix Integrator and Versatile interrupt initialisation Both Integrator and Versatile were using set_irq_handler() and enable_irq(), and working around the initialisation of the chained interrupt, instead of the more correct set_irq_chained_handler() function. Fix Integrator and Versatile to use the right function, and remove these work-arounds. Signed-off-by: Russell King --- arch/arm/mach-integrator/integrator_cp.c | 5 +---- arch/arm/mach-versatile/core.c | 5 ++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm/mach-integrator/integrator_cp.c b/arch/arm/mach-integrator/integrator_cp.c index a0724f2b24ce..9f55f5ae1044 100644 --- a/arch/arm/mach-integrator/integrator_cp.c +++ b/arch/arm/mach-integrator/integrator_cp.c @@ -232,8 +232,6 @@ static void __init intcp_init_irq(void) for (i = IRQ_PIC_START; i <= IRQ_PIC_END; i++) { if (i == 11) i = 22; - if (i == IRQ_CP_CPPLDINT) - i++; if (i == 29) break; set_irq_chip(i, &pic_chip); @@ -259,8 +257,7 @@ static void __init intcp_init_irq(void) set_irq_flags(i, IRQF_VALID | IRQF_PROBE); } - set_irq_handler(IRQ_CP_CPPLDINT, sic_handle_irq); - pic_unmask_irq(IRQ_CP_CPPLDINT); + set_irq_chained_handler(IRQ_CP_CPPLDINT, sic_handle_irq); } /* diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c index 799697d32dec..cebd48a3dae4 100644 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@ -112,10 +112,9 @@ void __init versatile_init_irq(void) { unsigned int i; - vic_init(VA_VIC_BASE, IRQ_VIC_START, ~(1 << 31)); + vic_init(VA_VIC_BASE, IRQ_VIC_START, ~0); - set_irq_handler(IRQ_VICSOURCE31, sic_handle_irq); - enable_irq(IRQ_VICSOURCE31); + set_irq_chained_handler(IRQ_VICSOURCE31, sic_handle_irq); /* Do second interrupt controller */ writel(~0, VA_SIC_BASE + SIC_IRQ_ENABLE_CLEAR); From 670bd95e0413c43f878b73a4a3919d1f452a4157 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 10 Jun 2006 09:54:12 -0700 Subject: [PATCH 20/45] [PATCH] Further alterations for memory barrier document From: David Howells Apply some alterations to the memory barrier document that I worked out with Paul McKenney of IBM, plus some of the alterations suggested by Alan Stern. The following changes were made: (*) One of the examples given for what can happen with overlapping memory barriers was wrong. (*) The description of general memory barriers said that a general barrier is a combination of a read barrier and a write barrier. This isn't entirely true: it implies both, but is more than a combination of both. (*) The first example in the "SMP Barrier Pairing" section was wrong: the loads around the read barrier need to touch the memory locations in the opposite order to the stores around the write barrier. (*) Added a note to make explicit that the loads should be in reverse order to the stores. (*) Adjusted the diagrams in the "Examples Of Memory Barrier Sequences" section to make them clearer. Added a couple of diagrams to make it more clear as to how it could go wrong without the barrier. 
(*) Added a section on memory speculation. (*) Dropped any references to memory allocation routines doing memory barriers. They may do sometimes, but it can't be relied on. This may be worthy of further documentation later. (*) Made the fact that a LOCK followed by an UNLOCK should not be considered a full memory barrier more explicit and gave an example. Signed-off-by: David Howells Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/memory-barriers.txt | 344 +++++++++++++++++++++++------- 1 file changed, 268 insertions(+), 76 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index c61d8b876fdb..4710845dbac4 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -19,6 +19,7 @@ Contents: - Control dependencies. - SMP barrier pairing. - Examples of memory barrier sequences. + - Read memory barriers vs load speculation. (*) Explicit kernel barriers. @@ -248,7 +249,7 @@ And there are a number of things that _must_ or _must_not_ be assumed: we may get either of: STORE *A = X; Y = LOAD *A; - STORE *A = Y; + STORE *A = Y = X; ========================= @@ -344,9 +345,12 @@ Memory barriers come in four basic varieties: (4) General memory barriers. - A general memory barrier is a combination of both a read memory barrier - and a write memory barrier. It is a partial ordering over both loads and - stores. + A general memory barrier gives a guarantee that all the LOAD and STORE + operations specified before the barrier will appear to happen before all + the LOAD and STORE operations specified after the barrier with respect to + the other components of the system. + + A general memory barrier is a partial ordering over both loads and stores. General memory barriers imply both read and write memory barriers, and so can substitute for either. @@ -546,9 +550,9 @@ write barrier, though, again, a general barrier is viable: =============== =============== a = 1; - b = 2; x = a; + b = 2; x = b; - y = b; + y = a; Or: @@ -563,6 +567,18 @@ Or: Basically, the read barrier always has to be there, even though it can be of the "weaker" type. +[!] 
Note that the stores before the write barrier would normally be expected to +match the loads after the read barrier or data dependency barrier, and vice +versa: + + CPU 1 CPU 2 + =============== =============== + a = 1; }---- --->{ v = c + b = 2; } \ / { w = d + \ + c = 3; } / \ { x = a; + d = 4; }---- --->{ y = b; + EXAMPLES OF MEMORY BARRIER SEQUENCES ------------------------------------ @@ -600,8 +616,8 @@ STORE B, STORE C } all occuring before the unordered set of { STORE D, STORE E | | +------+ +-------+ : : | - | Sequence in which stores committed to memory system - | by CPU 1 + | Sequence in which stores are committed to the + | memory system by CPU 1 V @@ -683,14 +699,12 @@ then the following will occur: | : : | | | : : | CPU 2 | | +-------+ | | - \ | X->9 |------>| | - \ +-------+ | | - ----->| B->2 | | | - +-------+ | | - Makes sure all effects ---> ddddddddddddddddd | | - prior to the store of C +-------+ | | - are perceptible to | B->2 |------>| | - successive loads +-------+ | | + | | X->9 |------>| | + | +-------+ | | + Makes sure all effects ---> \ ddddddddddddddddd | | + prior to the store of C \ +-------+ | | + are perceptible to ----->| B->2 |------>| | + subsequent loads +-------+ | | : : +-------+ @@ -699,73 +713,239 @@ following sequence of events: CPU 1 CPU 2 ======================= ======================= + { A = 0, B = 9 } STORE A=1 - STORE B=2 - STORE C=3 - STORE D=4 - STORE E=5 - LOAD A + STORE B=2 LOAD B - LOAD C - LOAD D - LOAD E + LOAD A Without intervention, CPU 2 may then choose to perceive the events on CPU 1 in some effectively random order, despite the write barrier issued by CPU 1: - +-------+ : : - | | +------+ - | |------>| C=3 | } - | | : +------+ } - | | : | A=1 | } - | | : +------+ } - | CPU 1 | : | B=2 | }--- - | | +------+ } \ - | | wwwwwwwwwwwww} \ - | | +------+ } \ : : +-------+ - | | : | E=5 | } \ +-------+ | | - | | : +------+ } \ { | C->3 |------>| | - | |------>| D=4 | } \ { +-------+ : | | - | | +------+ \ { | E->5 | : | | - +-------+ : : \ { +-------+ : | | - Transfer -->{ | A->1 | : | CPU 2 | - from CPU 1 { +-------+ : | | - to CPU 2 { | D->4 | : | | - { +-------+ : | | - { | B->2 |------>| | - +-------+ | | - : : +-------+ + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | | A->0 |------>| | + | +-------+ | | + | : : +-------+ + \ : : + \ +-------+ + ---->| A->1 | + +-------+ + : : -If, however, a read barrier were to be placed between the load of C and the -load of D on CPU 2, then the partial ordering imposed by CPU 1 will be -perceived correctly by CPU 2. 
+If, however, a read barrier were to be placed between the load of E and the +load of A on CPU 2: - +-------+ : : - | | +------+ - | |------>| C=3 | } - | | : +------+ } - | | : | A=1 | }--- - | | : +------+ } \ - | CPU 1 | : | B=2 | } \ - | | +------+ \ - | | wwwwwwwwwwwwwwww \ - | | +------+ \ : : +-------+ - | | : | E=5 | } \ +-------+ | | - | | : +------+ }--- \ { | C->3 |------>| | - | |------>| D=4 | } \ \ { +-------+ : | | - | | +------+ \ -->{ | B->2 | : | | - +-------+ : : \ { +-------+ : | | - \ { | A->1 | : | CPU 2 | - \ +-------+ | | - At this point the read ----> \ rrrrrrrrrrrrrrrrr | | - barrier causes all effects \ +-------+ | | - prior to the storage of C \ { | E->5 | : | | - to be perceptible to CPU 2 -->{ +-------+ : | | - { | D->4 |------>| | - +-------+ | | - : : +-------+ + CPU 1 CPU 2 + ======================= ======================= + { A = 0, B = 9 } + STORE A=1 + + STORE B=2 + LOAD B + + LOAD A + +then the partial ordering imposed by CPU 1 will be perceived correctly by CPU +2: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + | : : | | + At this point the read ----> \ rrrrrrrrrrrrrrrrr | | + barrier causes all effects \ +-------+ | | + prior to the storage of B ---->| A->1 |------>| | + to be perceptible to CPU 2 +-------+ | | + : : +-------+ + + +To illustrate this more completely, consider what could happen if the code +contained a load of A either side of the read barrier: + + CPU 1 CPU 2 + ======================= ======================= + { A = 0, B = 9 } + STORE A=1 + + STORE B=2 + LOAD B + LOAD A [first load of A] + + LOAD A [second load of A] + +Even though the two loads of A both occur after the load of B, they may both +come up with different values: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + | : : | | + | +-------+ | | + | | A->0 |------>| 1st | + | +-------+ | | + At this point the read ----> \ rrrrrrrrrrrrrrrrr | | + barrier causes all effects \ +-------+ | | + prior to the storage of B ---->| A->1 |------>| 2nd | + to be perceptible to CPU 2 +-------+ | | + : : +-------+ + + +But it may be that the update to A from CPU 1 becomes perceptible to CPU 2 +before the read barrier completes anyway: + + +-------+ : : : : + | | +------+ +-------+ + | |------>| A=1 |------ --->| A->0 | + | | +------+ \ +-------+ + | CPU 1 | wwwwwwwwwwwwwwww \ --->| B->9 | + | | +------+ | +-------+ + | |------>| B=2 |--- | : : + | | +------+ \ | : : +-------+ + +-------+ : : \ | +-------+ | | + ---------->| B->2 |------>| | + | +-------+ | CPU 2 | + | : : | | + \ : : | | + \ +-------+ | | + ---->| A->1 |------>| 1st | + +-------+ | | + rrrrrrrrrrrrrrrrr | | + +-------+ | | + | A->1 |------>| 2nd | + +-------+ | | + : : +-------+ + + +The guarantee is that the second load will always come up with A == 1 if the +load of B came up with B == 2. No such guarantee exists for the first load of +A; that may come up with either A == 0 or A == 1. 
+ + +READ MEMORY BARRIERS VS LOAD SPECULATION +---------------------------------------- + +Many CPUs speculate with loads: that is they see that they will need to load an +item from memory, and they find a time where they're not using the bus for any +other loads, and so do the load in advance - even though they haven't actually +got to that point in the instruction execution flow yet. This permits the +actual load instruction to potentially complete immediately because the CPU +already has the value to hand. + +It may turn out that the CPU didn't actually need the value - perhaps because a +branch circumvented the load - in which case it can discard the value or just +cache it for later use. + +Consider: + + CPU 1 CPU 2 + ======================= ======================= + LOAD B + DIVIDE } Divide instructions generally + DIVIDE } take a long time to perform + LOAD A + +Which might appear as this: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + The CPU being busy doing a ---> --->| A->0 |~~~~ | | + division speculates on the +-------+ ~ | | + LOAD of A : : ~ | | + : :DIVIDE | | + : : ~ | | + Once the divisions are complete --> : : ~-->| | + the CPU can then perform the : : | | + LOAD with immediate effect : : +-------+ + + +Placing a read barrier or a data dependency barrier just before the second +load: + + CPU 1 CPU 2 + ======================= ======================= + LOAD B + DIVIDE + DIVIDE + + LOAD A + +will force any value speculatively obtained to be reconsidered to an extent +dependent on the type of barrier used. If there was no change made to the +speculated memory location, then the speculated value will just be used: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + The CPU being busy doing a ---> --->| A->0 |~~~~ | | + division speculates on the +-------+ ~ | | + LOAD of A : : ~ | | + : :DIVIDE | | + : : ~ | | + : : ~ | | + rrrrrrrrrrrrrrrr~ | | + : : ~ | | + : : ~-->| | + : : | | + : : +-------+ + + +but if there was an update or an invalidation from another CPU pending, then +the speculation will be cancelled and the value reloaded: + + : : +-------+ + +-------+ | | + --->| B->2 |------>| | + +-------+ | CPU 2 | + : :DIVIDE | | + +-------+ | | + The CPU being busy doing a ---> --->| A->0 |~~~~ | | + division speculates on the +-------+ ~ | | + LOAD of A : : ~ | | + : :DIVIDE | | + : : ~ | | + : : ~ | | + rrrrrrrrrrrrrrrrr | | + +-------+ | | + The speculation is discarded ---> --->| A->1 |------>| | + and an updated value is +-------+ | | + retrieved : : +-------+ ======================== @@ -901,7 +1081,7 @@ IMPLICIT KERNEL MEMORY BARRIERS =============================== Some of the other functions in the linux kernel imply memory barriers, amongst -which are locking, scheduling and memory allocation functions. +which are locking and scheduling functions. This specification is a _minimum_ guarantee; any particular architecture may provide more substantial guarantees, but these may not be relied upon outside @@ -966,6 +1146,20 @@ equivalent to a full barrier, but a LOCK followed by an UNLOCK is not. barriers is that the effects instructions outside of a critical section may seep into the inside of the critical section. 
+A LOCK followed by an UNLOCK may not be assumed to be full memory barrier +because it is possible for an access preceding the LOCK to happen after the +LOCK, and an access following the UNLOCK to happen before the UNLOCK, and the +two accesses can themselves then cross: + + *A = a; + LOCK + UNLOCK + *B = b; + +may occur as: + + LOCK, STORE *B, STORE *A, UNLOCK + Locks and semaphores may not provide any guarantee of ordering on UP compiled systems, and so cannot be counted on in such a situation to actually achieve anything at all - especially with respect to I/O accesses - unless combined @@ -1016,8 +1210,6 @@ Other functions that imply barriers: (*) schedule() and similar imply full memory barriers. - (*) Memory allocation and release functions imply full memory barriers. - ================================= INTER-CPU LOCKING BARRIER EFFECTS From a913f50706b21c7933f53cec678bb9a1c2383499 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 10 Jun 2006 09:54:13 -0700 Subject: [PATCH 21/45] [PATCH] powernow-k8 crash workaround From: Andrew Morton Work around the oops reported in http://bugzilla.kernel.org/show_bug.cgi?id=6478. Thanks to Ralf Hildebrandt for testing and reporting. Acked-by: Dave Jones Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/processor_perflib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index abbdb37a7f5f..f36db22ce1ae 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -577,6 +577,8 @@ acpi_processor_register_performance(struct acpi_processor_performance return_VALUE(-EBUSY); } + WARN_ON(!performance); + pr->performance = performance; if (acpi_processor_get_performance_info(pr)) { @@ -609,7 +611,8 @@ acpi_processor_unregister_performance(struct acpi_processor_performance return_VOID; } - kfree(pr->performance->states); + if (pr->performance) + kfree(pr->performance->states); pr->performance = NULL; acpi_cpufreq_remove_file(pr); From 57a62fed871eb2a95f296fe6c5c250ce21b81a79 Mon Sep 17 00:00:00 2001 From: Markus Lidel Date: Sat, 10 Jun 2006 09:54:14 -0700 Subject: [PATCH 22/45] [PATCH] I2O: Bugfixes to get I2O working again From: Markus Lidel - Fixed locking of struct i2o_exec_wait in Executive-OSM - Removed LCT Notify in i2o_exec_probe() which caused freeing memory and accessing freed memory during first enumeration of I2O devices - Added missing locking in i2o_exec_lct_notify() - Removed put_device() of the I2O controller in i2o_iop_remove(), which caused the controller structure to get freed too early - Fixed size of mempool in i2o_iop_alloc() - Fixed access to freed memory in i2o_msg_get() See http://bugzilla.kernel.org/show_bug.cgi?id=6561 Signed-off-by: Markus Lidel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/message/i2o/exec-osm.c | 72 +++++++++++++++++----------------- drivers/message/i2o/iop.c | 4 +- include/linux/i2o.h | 5 ++- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c index 5ea133c59afb..7bd4d85d0b42 100644 --- a/drivers/message/i2o/exec-osm.c +++ b/drivers/message/i2o/exec-osm.c @@ -55,6 +55,7 @@ struct i2o_exec_wait { u32 m; /* message id */ struct i2o_message *msg; /* pointer to the reply message */ struct list_head list; /* node in global wait list */ + spinlock_t lock; /* lock before modifying */ }; /* Work struct needed to handle LCT NOTIFY replies */ @@ -87,6 +88,7 @@
static struct i2o_exec_wait *i2o_exec_wait_alloc(void) return NULL; INIT_LIST_HEAD(&wait->list); + spin_lock_init(&wait->lock); return wait; }; @@ -125,6 +127,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg, DECLARE_WAIT_QUEUE_HEAD(wq); struct i2o_exec_wait *wait; static u32 tcntxt = 0x80000000; + long flags; int rc = 0; wait = i2o_exec_wait_alloc(); @@ -146,33 +149,28 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg, wait->tcntxt = tcntxt++; msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt); + wait->wq = &wq; + /* + * we add elements to the head, because if a entry in the list will + * never be removed, we have to iterate over it every time + */ + list_add(&wait->list, &i2o_exec_wait_list); + /* * Post the message to the controller. At some point later it will * return. If we time out before it returns then complete will be zero. */ i2o_msg_post(c, msg); - if (!wait->complete) { - wait->wq = &wq; - /* - * we add elements add the head, because if a entry in the list - * will never be removed, we have to iterate over it every time - */ - list_add(&wait->list, &i2o_exec_wait_list); + wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ); - wait_event_interruptible_timeout(wq, wait->complete, - timeout * HZ); + spin_lock_irqsave(&wait->lock, flags); - wait->wq = NULL; - } + wait->wq = NULL; - barrier(); - - if (wait->complete) { + if (wait->complete) rc = le32_to_cpu(wait->msg->body[0]) >> 24; - i2o_flush_reply(c, wait->m); - i2o_exec_wait_free(wait); - } else { + else { /* * We cannot remove it now. This is important. When it does * terminate (which it must do if the controller has not @@ -186,6 +184,13 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg, rc = -ETIMEDOUT; } + spin_unlock_irqrestore(&wait->lock, flags); + + if (rc != -ETIMEDOUT) { + i2o_flush_reply(c, wait->m); + i2o_exec_wait_free(wait); + } + return rc; }; @@ -213,7 +218,6 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m, { struct i2o_exec_wait *wait, *tmp; unsigned long flags; - static spinlock_t lock = SPIN_LOCK_UNLOCKED; int rc = 1; /* @@ -223,23 +227,24 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m, * already expired. Not much we can do about that except log it for * debug purposes, increase timeout, and recompile. 
*/ - spin_lock_irqsave(&lock, flags); list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) { if (wait->tcntxt == context) { - list_del(&wait->list); + spin_lock_irqsave(&wait->lock, flags); - spin_unlock_irqrestore(&lock, flags); + list_del(&wait->list); wait->m = m; wait->msg = msg; wait->complete = 1; - barrier(); - - if (wait->wq) { - wake_up_interruptible(wait->wq); + if (wait->wq) rc = 0; - } else { + else + rc = -1; + + spin_unlock_irqrestore(&wait->lock, flags); + + if (rc) { struct device *dev; dev = &c->pdev->dev; @@ -248,15 +253,13 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m, c->name); i2o_dma_free(dev, &wait->dma); i2o_exec_wait_free(wait); - rc = -1; - } + } else + wake_up_interruptible(wait->wq); return rc; } } - spin_unlock_irqrestore(&lock, flags); - osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name, context); @@ -322,14 +325,9 @@ static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL); static int i2o_exec_probe(struct device *dev) { struct i2o_device *i2o_dev = to_i2o_device(dev); - struct i2o_controller *c = i2o_dev->iop; i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff); - c->exec = i2o_dev; - - i2o_exec_lct_notify(c, c->lct->change_ind + 1); - device_create_file(dev, &dev_attr_vendor_id); device_create_file(dev, &dev_attr_product_id); @@ -523,6 +521,8 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind) struct device *dev; struct i2o_message *msg; + down(&c->lct_lock); + dev = &c->pdev->dev; if (i2o_dma_realloc @@ -545,6 +545,8 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind) i2o_msg_post(c, msg); + up(&c->lct_lock); + return 0; }; diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c index 492167446936..febbdd4e0605 100644 --- a/drivers/message/i2o/iop.c +++ b/drivers/message/i2o/iop.c @@ -804,8 +804,6 @@ void i2o_iop_remove(struct i2o_controller *c) /* Ask the IOP to switch to RESET state */ i2o_iop_reset(c); - - put_device(&c->device); } /** @@ -1059,7 +1057,7 @@ struct i2o_controller *i2o_iop_alloc(void) snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name); if (i2o_pool_alloc - (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4, + (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32), I2O_MSG_INPOOL_MIN)) { kfree(c); return ERR_PTR(-ENOMEM); diff --git a/include/linux/i2o.h b/include/linux/i2o.h index dd7d627bf66f..c115e9e840b4 100644 --- a/include/linux/i2o.h +++ b/include/linux/i2o.h @@ -1114,8 +1114,11 @@ static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c) mmsg->mfa = readl(c->in_port); if (unlikely(mmsg->mfa >= c->in_queue.len)) { + u32 mfa = mmsg->mfa; + mempool_free(mmsg, c->in_msg.mempool); - if(mmsg->mfa == I2O_QUEUE_EMPTY) + + if (mfa == I2O_QUEUE_EMPTY) return ERR_PTR(-EBUSY); return ERR_PTR(-EFAULT); } From 938473b24636d77dc5e9c3f41090d071b6cf4389 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Sat, 10 Jun 2006 09:54:16 -0700 Subject: [PATCH 23/45] [PATCH] powerpc: console_initcall ordering issues From: Milton Miller The add_preferred_console call in rtas_console.c was not causing the console to be selected. It turns out that the add_preferred_console was being called after the hvc_console driver was registered. It only works when it is called before the console driver is registered. Reorder hvc_console.o after the hvc_console drivers to allow the selection during console_initcall processing. 
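To illustrate the ordering dependency, here is a minimal sketch (the example_* names are invented for illustration, not the actual hvc code): console_initcall() callbacks run in link order, so the object whose initcall picks the preferred console has to be linked before the object whose initcall registers the console driver.

    static int __init example_backend_init(void)    /* hypothetical */
    {
            /* Only honoured if this runs before register_console() below. */
            add_preferred_console("hvc", 0, NULL);
            return 0;
    }
    console_initcall(example_backend_init);

    static struct console example_console = {       /* hypothetical; write/setup hooks omitted */
            .name  = "hvc",
            .index = -1,
    };

    static int __init example_core_init(void)       /* hypothetical */
    {
            register_console(&example_console);
            return 0;
    }
    console_initcall(example_core_init);

With hvc_console.o linked first, the registration ran before the add_preferred_console() call and the selection was lost; linking hvc_console.o after the drivers reverses that.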
Signed-off-by: Milton Miller Signed-off-by: Anton Blanchard Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/Makefile b/drivers/char/Makefile index f5b01c6d498e..fb919bfb2824 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -41,9 +41,9 @@ obj-$(CONFIG_N_HDLC) += n_hdlc.o obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o obj-$(CONFIG_SX) += sx.o generic_serial.o obj-$(CONFIG_RIO) += rio/ generic_serial.o -obj-$(CONFIG_HVC_DRIVER) += hvc_console.o obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o +obj-$(CONFIG_HVC_DRIVER) += hvc_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MMTIMER) += mmtimer.o From 9145bcf63575a8b78590a5beaf604001e9c8d2ef Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 10 Jun 2006 22:02:17 -0700 Subject: [PATCH 24/45] [SPARC64]: Set appropriate max_cache_size. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 4e8cd79156e0..f03d52d0b88d 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1287,6 +1287,40 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +static void __init smp_tune_scheduling(void) +{ + int instance, node; + unsigned int def, smallest = ~0U; + + def = ((tlb_type == hypervisor) ? + (3 * 1024 * 1024) : + (4 * 1024 * 1024)); + + instance = 0; + while (!cpu_find_by_instance(instance, &node, NULL)) { + unsigned int val; + + val = prom_getintdefault(node, "ecache-size", def); + if (val < smallest) + smallest = val; + + instance++; + } + + /* Any value less than 256K is nonsense. */ + if (smallest < (256U * 1024U)) + smallest = 256 * 1024; + + max_cache_size = smallest; + + if (smallest < 1U * 1024U * 1024U) + printk(KERN_INFO "Using max_cache_size of %uKB\n", + smallest / 1024U); + else + printk(KERN_INFO "Using max_cache_size of %uMB\n", + smallest / 1024U / 1024U); +} + /* Constrain the number of cpus to max_cpus. */ void __init smp_prepare_cpus(unsigned int max_cpus) { @@ -1322,6 +1356,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } smp_store_cpu_info(boot_cpu_id); + smp_tune_scheduling(); } /* Set this up early so that things like the scheduler can init From 650fb8382287f7990d5127a82a54295139224606 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sat, 10 Jun 2006 22:03:43 -0700 Subject: [PATCH 25/45] [SPARC]: Migration cost tune up in sparc smp. This patch sets the max_cache_size value required to tune up scheduler in SMP systems. Otherwise, the calculated migration_cost is too high and task scheduling may lock up. Signed-off-by: Krzysztof Helt Signed-off-by: David S. Miller --- arch/sparc/kernel/smp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/sparc/kernel/smp.c b/arch/sparc/kernel/smp.c index a93f5da6855d..40b42c88e6a7 100644 --- a/arch/sparc/kernel/smp.c +++ b/arch/sparc/kernel/smp.c @@ -69,6 +69,17 @@ void __init smp_store_cpu_info(int id) "clock-frequency", 0); cpu_data(id).prom_node = cpu_node; cpu_data(id).mid = cpu_get_hwmid(cpu_node); + + /* this is required to tune the scheduler correctly */ + /* is it possible to have CPUs with different cache sizes? 
*/ + if (id == boot_cpu_id) { + int cache_line,cache_nlines; + cache_line = 0x20; + cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line); + cache_nlines = 0x8000; + cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines); + max_cache_size = cache_line * cache_nlines; + } if (cpu_data(id).mid < 0) panic("No MID found for CPU%d at node 0x%08d", id, cpu_node); } From 0ce030395b92270567423d57d9d432eb77df32f2 Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Sat, 13 May 2006 08:30:52 -0700 Subject: [PATCH 26/45] [PATCH] PCI: fix pciehp compile issue when CONFIG_ACPI is not enabled Fix build error when CONFIG_ACPI not defined Signed-off-by: Kristen Carlson Accardi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/pci-acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 4877e35ae202..936ef82ed76a 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -50,7 +50,7 @@ extern acpi_status pci_osc_control_set(acpi_handle handle, u32 flags); extern acpi_status pci_osc_support_set(u32 flags); #else -#if !defined(acpi_status) +#if !defined(AE_ERROR) typedef u32 acpi_status; #define AE_ERROR (acpi_status) (0x0001) #endif From 8d92bc2270d67a43b1d7e94a8cb6f81f1435fe9a Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 18 Apr 2006 14:49:56 +0200 Subject: [PATCH 27/45] [PATCH] PCI: Error handling on PCI device resume We currently don't handle errors properly when resuming a PCI device: * In pci_default_resume() we capture the error code returned by pci_enable_device() but don't pass it up to the caller. Introduced by commit 95a629657dbe28e44a312c47815b3dc3f1ce0970 * In pci_resume_device(), the errors possibly returned by the driver's .resume method or by the generic pci_default_resume() function are ignored. This patch fixes both issues. Signed-off-by: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-driver.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 1456759936c5..10e1a905c144 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -285,9 +285,9 @@ static int pci_device_suspend(struct device * dev, pm_message_t state) * Default resume method for devices that have no driver provided resume, * or not even a driver at all. 
*/ -static void pci_default_resume(struct pci_dev *pci_dev) +static int pci_default_resume(struct pci_dev *pci_dev) { - int retval; + int retval = 0; /* restore the PCI config space */ pci_restore_state(pci_dev); @@ -297,18 +297,21 @@ static void pci_default_resume(struct pci_dev *pci_dev) /* if the device was busmaster before the suspend, make it busmaster again */ if (pci_dev->is_busmaster) pci_set_master(pci_dev); + + return retval; } static int pci_device_resume(struct device * dev) { + int error; struct pci_dev * pci_dev = to_pci_dev(dev); struct pci_driver * drv = pci_dev->driver; if (drv && drv->resume) - drv->resume(pci_dev); + error = drv->resume(pci_dev); else - pci_default_resume(pci_dev); - return 0; + error = pci_default_resume(pci_dev); + return error; } static void pci_device_shutdown(struct device *dev) From 04d9c1a1100b6bdeffa7e1bfc30080bdac28e183 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 18 Apr 2006 21:06:51 -0700 Subject: [PATCH 28/45] [PATCH] PCI: Improve PCI config space writeback At least one laptop blew up on resume from suspend with a black screen due to a lack of this patch. By only writing back config space that is different, we minimise the possibility of accidents like this. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 2329f941a0dc..d2520451f36b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -461,9 +461,19 @@ int pci_restore_state(struct pci_dev *dev) { int i; + int val; - for (i = 0; i < 16; i++) - pci_write_config_dword(dev,i * 4, dev->saved_config_space[i]); + for (i = 0; i < 16; i++) { + pci_read_config_dword(dev, i * 4, &val); + if (val != dev->saved_config_space[i]) { + printk(KERN_DEBUG "PM: Writing back config space on " + "device %s at offset %x (was %x, writing %x)\n", + pci_name(dev), i, + val, (int)dev->saved_config_space[i]); + pci_write_config_dword(dev,i * 4, + dev->saved_config_space[i]); + } + } pci_restore_msi_state(dev); pci_restore_msix_state(dev); return 0; From 8b8c8d280ab2d18fe6e42d671f60d4ffed451cdc Mon Sep 17 00:00:00 2001 From: "Yu, Luming" Date: Tue, 25 Apr 2006 00:00:34 -0700 Subject: [PATCH 29/45] [PATCH] PCI: reverse pci config space restore order According to the Intel ICH spec, there are several rules requiring that the Base Address registers be programmed before IOSE (in the PCICMD register) is enabled. For example, on ICH7: 12.1.3 SATA: the base address register for the bus master register should be programmed before this bit is set. 11.1.3: PCICMD (USB): The base address register for USB should be programmed before this bit is set. .... To make sure the kernel code follows this rule, and to prevent unnecessary confusion, I propose this patch.
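For reference, the layout that makes the ordering matter, as a simplified sketch of the resulting loop (the compare-and-log logic added by the previous patch is omitted here): the command register sits in config dword 1 at offset 0x04, while the BARs occupy dwords 4-9 at offsets 0x10-0x24, so walking the saved dwords from index 15 down to 0 writes every BAR back before PCICMD is restored.

    /*
     * dword 0    -> 0x00       vendor/device ID
     * dword 1    -> 0x04       PCI_COMMAND / PCI_STATUS  (restored last)
     * dwords 4-9 -> 0x10-0x24  BAR0..BAR5                (restored first)
     */
    for (i = 15; i >= 0; i--)
            pci_write_config_dword(dev, i * 4, dev->saved_config_space[i]);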
Signed-off-by: Luming Yu Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d2520451f36b..12286275b1c8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -463,7 +463,11 @@ pci_restore_state(struct pci_dev *dev) int i; int val; - for (i = 0; i < 16; i++) { + /* + * The Base Address register should be programmed before the command + * register(s) + */ + for (i = 15; i >= 0; i--) { pci_read_config_dword(dev, i * 4, &val); if (val != dev->saved_config_space[i]) { printk(KERN_DEBUG "PM: Writing back config space on " From c0bbbc73d58f1b774cd987b5687a478a027f137c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 11 Jun 2006 15:22:26 -0700 Subject: [PATCH 30/45] [PATCH] typo in vmscan.c From: Christoph Lameter Looks like a comma was left from the conversion from a struct to an assignment. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4649a63a8cb6..440a733fe2e9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1061,7 +1061,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, loop_again: total_scanned = 0; nr_reclaimed = 0; - sc.may_writepage = !laptop_mode, + sc.may_writepage = !laptop_mode; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); From 2f9719b61e1fcf7422a016ac4f2420a0cc6ba320 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Wed, 7 Jun 2006 12:53:29 -0400 Subject: [PATCH 31/45] [PATCH] sata_mv: grab host lock inside eng_timeout Bug fix: mv_eng_timeout() calls mv_err_intr() without first grabbing the host lock, which can lead to all sorts of interesting scenarios. This whole error-handling portion of sata_mv is nasty (and will get fixed for the new EH stuff), but for now this patch will help keep it on life-support. Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik --- drivers/scsi/sata_mv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c index 9b8bca1ac1f0..f16f92a6ec0f 100644 --- a/drivers/scsi/sata_mv.c +++ b/drivers/scsi/sata_mv.c @@ -2035,6 +2035,7 @@ static void mv_phy_reset(struct ata_port *ap) static void mv_eng_timeout(struct ata_port *ap) { struct ata_queued_cmd *qc; + unsigned long flags; printk(KERN_ERR "ata%u: Entering mv_eng_timeout\n",ap->id); DPRINTK("All regs @ start of eng_timeout\n"); @@ -2046,8 +2047,10 @@ static void mv_eng_timeout(struct ata_port *ap) ap->host_set->mmio_base, ap, qc, qc->scsicmd, &qc->scsicmd->cmnd); + spin_lock_irqsave(&ap->host_set->lock, flags); mv_err_intr(ap, 0); mv_stop_and_reset(ap); + spin_unlock_irqrestore(&ap->host_set->lock, flags); WARN_ON(!(qc->flags & ATA_QCFLAG_ACTIVE)); if (qc->flags & ATA_QCFLAG_ACTIVE) { From 289a1e995e74734b5ec76ca8a5490058f4fecc24 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Jun 2006 12:16:26 +1000 Subject: [PATCH 32/45] [PATCH] Fix for the PPTP hangs that have been reported People have been reporting that PPP connections over ptys, such as used with PPTP, will hang randomly when transferring large amounts of data, for instance in http://bugzilla.kernel.org/show_bug.cgi?id=6530. I have managed to reproduce the problem, and the patch below fixes the actual cause. The problem is not in fact in ppp_async.c but in n_tty.c. 
What happens is that when pptp reads from the pty, we call read_chan() in drivers/char/n_tty.c on the master side of the pty. That copies all the characters out of its buffer to userspace and then calls check_unthrottle(), which calls the pty unthrottle routine, which calls tty_wakeup on the slave side, which calls ppp_asynctty_wakeup, which calls tasklet_schedule. So far so good. Since we are in process context, the tasklet runs immediately and calls ppp_async_process(), which calls ppp_async_push, which calls the tty->driver->write function to send some more output. However, tty->driver->write() returns zero, because the master tty->receive_room is still zero. We haven't returned from check_unthrottle() yet, and read_chan() only updates tty->receive_room _after_ calling check_unthrottle. That means that the driver->write call in ppp_async_process() returns 0. That would be fine if we were going to get a subsequent wakeup call, but we aren't (we just had it, and the buffer is now empty). The solution is for n_tty.c to update tty->receive_room _before_ calling the driver unthrottle routine. The patch below does this. With this patch I was able to transfer a 900MB file over a PPTP connection (taking about 25 minutes), whereas without the patch the connection would always stall in under a minute. Signed-off-by: Paul Mackerras Signed-off-by: Linus Torvalds --- drivers/char/n_tty.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index ede365d05387..b9371d5bf790 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -1384,8 +1384,10 @@ do_it_again: * longer than TTY_THRESHOLD_UNTHROTTLE in canonical mode, * we won't get any more characters. */ - if (n_tty_chars_in_buffer(tty) <= TTY_THRESHOLD_UNTHROTTLE) + if (n_tty_chars_in_buffer(tty) <= TTY_THRESHOLD_UNTHROTTLE) { + n_tty_set_room(tty); check_unthrottle(tty); + } if (b - buf >= minimum) break; From ccefb5f3f60cd116d9a8ce2fa9e82e67206e49e5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 11 Jun 2006 21:00:46 -0700 Subject: [PATCH 33/45] [SPARC64]: Do not double-export sys_close() when CONFIG_SOLARIS_EMUL_MODULE It is already exported by fs/open.c Noticed by Ben Collins. Signed-off-by: David S. Miller --- arch/sparc64/kernel/sparc64_ksyms.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 62d8a99271ea..38e569f786dd 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -297,7 +297,6 @@ EXPORT_SYMBOL(svr4_getcontext); EXPORT_SYMBOL(svr4_setcontext); EXPORT_SYMBOL(compat_sys_ioctl); EXPORT_SYMBOL(sparc32_open); -EXPORT_SYMBOL(sys_close); #endif /* Special internal versions of library functions. */ From afec35e3fee900b3016519d0b5512064e4625b2c Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Sun, 11 Jun 2006 20:58:33 -0700 Subject: [PATCH 34/45] [DCCP] Ackvec: fix soft lockup in ackvec handling code A soft lockup existed in the handling of ack vector records. Specifically, when a tail of the list of ack vector records was removed, it was possible to end up iterating infinitely on an element of the tail. Signed-off-by: Andrea Bittau Signed-off-by: Ian McDonald Signed-off-by: David S. 
Miller --- net/dccp/ackvec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index b5981e5f6b00..8c211c58893b 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -452,6 +452,7 @@ found: (unsigned long long) avr->dccpavr_ack_ackno); dccp_ackvec_throw_record(av, avr); + break; } /* * If it wasn't received, continue scanning... we might From 79320d7e14900c549c3520791a297328f53ff71e Mon Sep 17 00:00:00 2001 From: Aki M Nyrhinen Date: Sun, 11 Jun 2006 21:18:56 -0700 Subject: [PATCH 35/45] [TCP]: continued: reno sacked_out count fix From: Aki M Nyrhinen IMHO the current fix to the problem (in_flight underflow in reno) is incorrect. It treats the symptoms but ignores the problem. The problem is timing out packets other than the head packet when we don't have sack. I try to explain (sorry if explaining the obvious). With sack, scanning the retransmit queue for timed out packets is fine because we know which packets in our retransmit queue have been acked by the receiver. Without sack, we know only how many packets in our retransmit queue the receiver has acknowledged, but have no idea which packets. Think of a "typical" slow-start overshoot case, where for example every third packet in a window gets lost because a router buffer gets full. With sack, we check for timeouts only on every third packet (as the rest have been sacked). The packet counting works out and, if there is no reordering, we'll retransmit exactly the packets that were lost. Without sack, however, we check for timeout on every packet and end up retransmitting consecutive packets in the retransmit queue. In our slow-start example, 2/3 of those retransmissions are unnecessary. These unnecessary retransmissions eat the congestion window and eventually prevent fast recovery from continuing, if enough packets were lost. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a538bc1683d..b5521a9d3dc1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1649,7 +1649,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1662,8 +1662,6 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, tcp_skb_pcount(skb) + 1); /* clear xmit_retrans hint */ if (tp->retransmit_skb_hint && From d374c1c1281d6188a0d0676172b1c0e3de35c6e7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 12 Jun 2006 12:53:27 -0700 Subject: [PATCH 36/45] [sky2] Fix sky2 network driver suspend/resume This fixes two independent problems: it would not save the PCI state on suspend (and thus try to resume a nonexistent state on resume), and while shut off, if an interrupt happened on the same shared irq, the irq handler would react very badly to the interrupt status being an invalid all-ones state.
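The !~status test the fix relies on is the usual shared-interrupt defence against a device that is powered down or gone: MMIO reads from such a device return 0xffffffff. A generic sketch of the pattern in an interrupt handler of this era (the example_* names and register offset are placeholders, not sky2 code):

    static irqreturn_t example_intr(int irq, void *dev_id, struct pt_regs *regs)
    {
            struct example_hw *hw = dev_id;
            u32 status = readl(hw->regs + EXAMPLE_ISR);  /* placeholder offset */

            /* All ones means the chip is absent or powered off; the
             * interrupt must belong to the other user of the shared line. */
            if (!~status)
                    return IRQ_NONE;

            /* ... normal interrupt handling ... */
            return IRQ_HANDLED;
    }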
Acked-by: Jeff Garzik Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 959109609d85..6b87c7a5c906 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -2183,6 +2183,9 @@ static int sky2_poll(struct net_device *dev0, int *budget) int work_done = 0; u32 status = sky2_read32(hw, B0_Y2_SP_EISR); + if (!~status) + return 0; + if (status & Y2_IS_HW_ERR) sky2_hw_intr(hw); @@ -3438,6 +3441,7 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) } } + pci_save_state(pdev); return sky2_set_power_state(hw, pci_choose_state(pdev, state)); } From 42d1d52e695d87475846e9a09964cae1209eeecb Mon Sep 17 00:00:00 2001 From: Weidong Date: Mon, 12 Jun 2006 13:09:59 -0700 Subject: [PATCH 37/45] [IPV4]: Increment ipInHdrErrors when TTL expires. Signed-off-by: Weidong Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0923add122b4..9f0bb529ab70 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -116,6 +116,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); From cfd95a9cf58cd9e92d4c23b5ee20b07a3d121477 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 12 Jun 2006 21:50:25 +0100 Subject: [PATCH 38/45] [PATCH] tmpfs: time granularity fix for [acm]time going backwards I noticed a strange behavior in a tmpfs file system the other day, while building packages - occasionally, and seemingly at random, make decided to rebuild a target. However, only on tmpfs. A file would be created, and if checked, it had a sub-second timestamp. However, after an utimes related call where sub-seconds should be set, they were zeroed instead. In the case that a file was created, and utimes(...,NULL) was used on it in the same second, the timestamp on the file moved backwards. After some digging, I found that this was being caused by tmpfs not having a time granularity set, thus inheriting the default 1 second granularity. Hugh adds: yes, we missed tmpfs when the s_time_gran mods went into 2.6.11. Unfortunately, the granularity of CURRENT_TIME, often used in filesystems, does not match the default granularity set by alloc_super. A few more such discrepancies have been found, but this is the most important to fix now. Signed-off-by: Robin H. Johnson Acked-by: Andi Kleen Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index 4c5e68e4e9ae..73f7a9dfcd37 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2102,6 +2102,7 @@ static int shmem_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; + sb->s_time_gran = 1; inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) From 86bc843a268058df558844b6bf64531617fbc698 Mon Sep 17 00:00:00 2001 From: Sergey Vlasov Date: Mon, 12 Jun 2006 21:53:23 +0100 Subject: [PATCH 39/45] [PATCH] tmpfs: Decrement i_nlink correctly in shmem_rmdir() shmem_rmdir() must undo the increment of i_nlink done in shmem_get_inode() for directories, otherwise at least IN_DELETE_SELF inotify event generation is broken. 
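For context, the usual directory link-count convention that the missing decrement violates, shown as a schematic sketch rather than the shmem code itself: a new directory starts at i_nlink == 2 (its name in the parent plus its own "."), and the parent gains one link for the child's "..", so rmdir has to drop both of those extra counts or the inode never reaches zero and IN_DELETE_SELF is never generated.

    /* mkdir (schematic): */
    inode->i_nlink = 2;     /* one for the name in the parent, one for "." */
    dir->i_nlink++;         /* parent gains a link for the child's ".." */

    /* rmdir (schematic) -- the plain unlink path only drops the name link,
     * so rmdir itself must drop the "." and ".." links as well: */
    inode->i_nlink--;       /* the decrement shmem_rmdir() was missing */
    dir->i_nlink--;
    /* ...then the normal unlink path drops the last link: inode->i_nlink--; */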
Signed-off-by: Sergey Vlasov Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index 73f7a9dfcd37..1e43c8a865ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1780,6 +1780,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; + dentry->d_inode->i_nlink--; dir->i_nlink--; return shmem_unlink(dir, dentry); } From 5e625b0844435e0333670d9da633304169896740 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 Jun 2006 15:13:40 -0700 Subject: [PATCH 40/45] [PATCH] alpha: generic hweight build fix From: Randy Dunlap According to include/asm-alpha/bitops.h, only ALPHA_EV67 has hardware hweight support, so ALPHA_EV6 needs to use GENERIC_HWEIGHT. Signed-off-by: Randy Dunlap Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Ernst Herzberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 8290b69da202..213c7850d5fb 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -453,7 +453,7 @@ config ALPHA_IRONGATE config GENERIC_HWEIGHT bool - default y if !ALPHA_EV6 && !ALPHA_EV67 + default y if !ALPHA_EV67 config ALPHA_AVANTI bool From 2ccc99b7b71976d15822ae7c41cd2ccda66d5076 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Jun 2006 17:17:27 +0900 Subject: [PATCH 41/45] [PATCH] sky2: set_power_state should be void The set power state function is cleaner if it doesn't return anything. The only caller that could fail is in suspend() and it can check the argument there. Signed-off-by: Stephen Hemminger Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 6b87c7a5c906..765c8f063512 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -187,12 +187,11 @@ static u16 gm_phy_read(struct sky2_hw *hw, unsigned port, u16 reg) return v; } -static int sky2_set_power_state(struct sky2_hw *hw, pci_power_t state) +static void sky2_set_power_state(struct sky2_hw *hw, pci_power_t state) { u16 power_control; u32 reg1; int vaux; - int ret = 0; pr_debug("sky2_set_power_state %d\n", state); sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_ON); @@ -275,12 +274,10 @@ static int sky2_set_power_state(struct sky2_hw *hw, pci_power_t state) break; default: printk(KERN_ERR PFX "Unknown power state %d\n", state); - ret = -1; } sky2_pci_write16(hw, hw->pm_cap + PCI_PM_CTRL, power_control); sky2_write8(hw, B2_TST_CTRL1, TST_CFG_WRITE_OFF); - return ret; } static void sky2_phy_reset(struct sky2_hw *hw, unsigned port) @@ -3428,6 +3425,10 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) { struct sky2_hw *hw = pci_get_drvdata(pdev); int i; + pci_power_t pstate = pci_choose_state(pdev, state); + + if (!(pstate == PCI_D3hot || pstate == PCI_D3cold)) + return -EINVAL; for (i = 0; i < 2; i++) { struct net_device *dev = hw->dev[i]; @@ -3442,7 +3443,8 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) } pci_save_state(pdev); - return sky2_set_power_state(hw, pci_choose_state(pdev, state)); + sky2_set_power_state(hw, pstate); + return 0; } static int sky2_resume(struct pci_dev *pdev) @@ -3452,9 +3454,7 @@ static int sky2_resume(struct pci_dev *pdev) pci_restore_state(pdev); pci_enable_wake(pdev, PCI_D0, 0); - err = sky2_set_power_state(hw, PCI_D0); - if (err) - goto 
out; + sky2_set_power_state(hw, PCI_D0); err = sky2_reset(hw); if (err) From f05267e7dee58741a4feb20d0351706ec64bb0b5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Jun 2006 17:17:28 +0900 Subject: [PATCH 42/45] [PATCH] sky2: don't hard code number of ports It is cleaner to not loop over both ports if only one exists. Signed-off-by: Stephen Hemminger Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 765c8f063512..6ad676d2cbc1 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -3430,7 +3430,7 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) if (!(pstate == PCI_D3hot || pstate == PCI_D3cold)) return -EINVAL; - for (i = 0; i < 2; i++) { + for (i = 0; i < hw->ports; i++) { struct net_device *dev = hw->dev[i]; if (dev) { @@ -3460,7 +3460,7 @@ static int sky2_resume(struct pci_dev *pdev) if (err) goto out; - for (i = 0; i < 2; i++) { + for (i = 0; i < hw->ports; i++) { struct net_device *dev = hw->dev[i]; if (dev && netif_running(dev)) { netif_device_attach(dev); From 26ec43f132d1cf282124a020b2bb5310496c9132 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Jun 2006 17:17:29 +0900 Subject: [PATCH 43/45] [PATCH] sky2: fix hotplug detect during poll If the poll routine detects no hardware available, it needs to dequeue itself from the network poll list. Signed-off-by: Stephen Hemminger Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 6ad676d2cbc1..b680e64ad2ca 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -2181,7 +2181,7 @@ static int sky2_poll(struct net_device *dev0, int *budget) u32 status = sky2_read32(hw, B0_Y2_SP_EISR); if (!~status) - return 0; + goto out; if (status & Y2_IS_HW_ERR) sky2_hw_intr(hw); @@ -2219,7 +2219,7 @@ static int sky2_poll(struct net_device *dev0, int *budget) if (sky2_more_work(hw)) return 1; - +out: netif_rx_complete(dev0); sky2_read32(hw, B0_Y2_SP_LISR); From 8ab8fca2071cec559e4b77212cccffd150ce5ce7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Jun 2006 17:17:30 +0900 Subject: [PATCH 44/45] [PATCH] sky2: save/restore base hardware irq during suspend/resume The hardware should be fully shut off during suspend, and the base irq mask restored during resume. Signed-off-by: Stephen Hemminger Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index b680e64ad2ca..df899dcbd5b0 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -3442,6 +3442,7 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) } } + sky2_write32(hw, B0_IMSK, 0); pci_save_state(pdev); sky2_set_power_state(hw, pstate); return 0; @@ -3460,6 +3461,8 @@ static int sky2_resume(struct pci_dev *pdev) if (err) goto out; + sky2_write32(hw, B0_IMSK, Y2_IS_BASE); + for (i = 0; i < hw->ports; i++) { struct net_device *dev = hw->dev[i]; if (dev && netif_running(dev)) { From eb35cf60e462491249166182e3e755d3d5d91a28 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Jun 2006 17:17:31 +0900 Subject: [PATCH 45/45] [PATCH] sky2: stop/start hardware idle timer on suspend/resume The resume bug was caused not by an early interrupt but because the idle timeout was not being stopped on suspend. Also disable hardware IRQs on suspend.
Will need to revisit this with hotplug? Signed-off-by: Stephen Hemminger Signed-off-by: Linus Torvalds --- drivers/net/sky2.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index df899dcbd5b0..97fe95666f3b 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -2161,6 +2161,13 @@ static void sky2_descriptor_error(struct sky2_hw *hw, unsigned port, /* If idle then force a fake soft NAPI poll once a second * to work around cases where sharing an edge triggered interrupt. */ +static inline void sky2_idle_start(struct sky2_hw *hw) +{ + if (idle_timeout > 0) + mod_timer(&hw->idle_timer, + jiffies + msecs_to_jiffies(idle_timeout)); +} + static void sky2_idle(unsigned long arg) { struct sky2_hw *hw = (struct sky2_hw *) arg; @@ -3350,9 +3357,7 @@ static int __devinit sky2_probe(struct pci_dev *pdev, sky2_write32(hw, B0_IMSK, Y2_IS_BASE); setup_timer(&hw->idle_timer, sky2_idle, (unsigned long) hw); - if (idle_timeout > 0) - mod_timer(&hw->idle_timer, - jiffies + msecs_to_jiffies(idle_timeout)); + sky2_idle_start(hw); pci_set_drvdata(pdev, hw); @@ -3430,6 +3435,8 @@ static int sky2_suspend(struct pci_dev *pdev, pm_message_t state) if (!(pstate == PCI_D3hot || pstate == PCI_D3cold)) return -EINVAL; + del_timer_sync(&hw->idle_timer); + for (i = 0; i < hw->ports; i++) { struct net_device *dev = hw->dev[i]; @@ -3472,10 +3479,12 @@ static int sky2_resume(struct pci_dev *pdev) printk(KERN_ERR PFX "%s: could not up: %d\n", dev->name, err); dev_close(dev); - break; + goto out; } } } + + sky2_idle_start(hw); out: return err; }
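More generally, the shape of this last fix is the usual rule for driver power management: quiesce every self-rearming timer (and other deferred work) in .suspend before the hardware is powered down, and re-arm it in .resume after the chip is reinitialised, with the interrupt mask cleared in between. A stripped-down sketch of that pattern (the foo_* names and register macros are placeholders, not sky2 code):

    static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
    {
            struct foo_hw *hw = pci_get_drvdata(pdev);

            del_timer_sync(&hw->idle_timer);        /* timer must not fire while suspended */
            writel(0, hw->regs + FOO_IRQ_MASK);     /* mask chip interrupts */
            pci_save_state(pdev);
            return 0;
    }

    static int foo_resume(struct pci_dev *pdev)
    {
            struct foo_hw *hw = pci_get_drvdata(pdev);

            pci_restore_state(pdev);
            writel(FOO_IRQ_DEFAULT, hw->regs + FOO_IRQ_MASK);       /* unmask */
            mod_timer(&hw->idle_timer, jiffies + msecs_to_jiffies(1000));
            return 0;
    }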