diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 705ad8e66703..7bd4501f0cf9 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -746,6 +746,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system + cpu_init_udelay=N + [X86] Delay for N microsec between assert and de-assert + of APIC INIT to start processors. This delay occurs + on every CPU online, such as boot, and resume from suspend. + Default: 10000 + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: ,,,[,] diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt index 9132b86176a3..33884d156125 100644 --- a/Documentation/x86/entry_64.txt +++ b/Documentation/x86/entry_64.txt @@ -18,10 +18,10 @@ Some of these entries are: - system_call: syscall instruction from 64-bit code. - - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall + - entry_INT80_compat: int 0x80 from 32-bit or 64-bit code; compat syscall either way. - - ia32_syscall, ia32_sysenter: syscall and sysenter from 32-bit + - entry_INT80_compat, ia32_sysenter: syscall and sysenter from 32-bit code - interrupt: An array of entries. Every IDT vector that doesn't diff --git a/Documentation/x86/mtrr.txt b/Documentation/x86/mtrr.txt index cc071dc333c2..860bc3adc223 100644 --- a/Documentation/x86/mtrr.txt +++ b/Documentation/x86/mtrr.txt @@ -1,7 +1,19 @@ MTRR (Memory Type Range Register) control -3 Jun 1999 -Richard Gooch - + +Richard Gooch - 3 Jun 1999 +Luis R. Rodriguez - April 9, 2015 + +=============================================================================== +Phasing out MTRR use + +MTRR use is replaced on modern x86 hardware with PAT. Over time the only type +of effective MTRR that is expected to be supported will be for write-combining. 
+As MTRR use is phased out device drivers should use arch_phys_wc_add() to make +MTRR effective on non-PAT systems while a no-op on PAT enabled systems. + +For details refer to Documentation/x86/pat.txt. + +=============================================================================== On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index cf08c9fff3cd..54944c71b819 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -12,7 +12,7 @@ virtual addresses. PAT allows for different types of memory attributes. The most commonly used ones that will be supported at this time are Write-back, Uncached, -Write-combined and Uncached Minus. +Write-combined, Write-through and Uncached Minus. PAT APIs @@ -34,16 +34,23 @@ ioremap | -- | UC- | UC- | | | | | ioremap_cache | -- | WB | WB | | | | | +ioremap_uc | -- | UC | UC | + | | | | ioremap_nocache | -- | UC- | UC- | | | | | ioremap_wc | -- | -- | WC | | | | | +ioremap_wt | -- | -- | WT | + | | | | set_memory_uc | UC- | -- | -- | set_memory_wb | | | | | | | | set_memory_wc | WC | -- | -- | set_memory_wb | | | | | | | | +set_memory_wt | WT | -- | -- | + set_memory_wb | | | | + | | | | pci sysfs resource | -- | -- | UC- | | | | | pci sysfs resource_wc | -- | -- | WC | @@ -102,7 +109,38 @@ wants to export a RAM region, it has to do set_memory_uc() or set_memory_wc() as step 0 above and also track the usage of those pages and use set_memory_wb() before the page is freed to free pool. +MTRR effects on PAT / non-PAT systems +------------------------------------- +The following table provides the effects of using write-combining MTRRs when +using ioremap*() calls on x86 for both non-PAT and PAT systems. Ideally +mtrr_add() usage will be phased out in favor of arch_phys_wc_add() which will +be a no-op on PAT enabled systems. 
The region over which an arch_phys_wc_add() +is made should already have been ioremapped with WC attributes or PAT entries; +this can be done by using ioremap_wc() / set_memory_wc(). Devices which +combine areas of IO memory desired to remain uncacheable with areas where +write-combining is desirable should consider use of ioremap_uc() followed by +set_memory_wc() to white-list effective write-combined areas. Such use is +nevertheless discouraged as the effective memory type is considered +implementation defined, yet this strategy can be used as last resort on devices +with size-constrained regions where MTRR write-combining would +otherwise not be effective. + +---------------------------------------------------------------------- +MTRR Non-PAT PAT Linux ioremap value Effective memory type +---------------------------------------------------------------------- + Non-PAT | PAT + PAT + |PCD + ||PWT + ||| +WC 000 WB _PAGE_CACHE_MODE_WB WC | WC +WC 001 WC _PAGE_CACHE_MODE_WC WC* | WC +WC 010 UC- _PAGE_CACHE_MODE_UC_MINUS WC* | UC +WC 011 UC _PAGE_CACHE_MODE_UC UC | UC +---------------------------------------------------------------------- + +(*) denotes implementation defined and is discouraged Notes: @@ -115,8 +153,8 @@ can be more restrictive, in case of any existing aliasing for that address. For example: If there is an existing uncached mapping, a new ioremap_wc can return uncached mapping in place of write-combine requested. -set_memory_[uc|wc] and set_memory_wb should be used in pairs, where driver will -first make a region uc or wc and switch it back to wb after use. +set_memory_[uc|wc|wt] and set_memory_wb should be used in pairs, where driver +will first make a region uc, wc or wt and switch it back to wb after use. Over time writes to /proc/mtrr will be deprecated in favor of using PAT based interfaces. Users writing to /proc/mtrr are suggested to use above interfaces. @@ -124,7 +162,7 @@ interfaces. 
Users writing to /proc/mtrr are suggested to use above interfaces. Drivers should use ioremap_[uc|wc] to access PCI BARs with [uc|wc] access types. -Drivers should use set_memory_[uc|wc] to set access type for RAM ranges. +Drivers should use set_memory_[uc|wc|wt] to set access type for RAM ranges. PAT debugging diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 5223479291a2..68ed3114c363 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -31,6 +31,9 @@ Machine check (e.g. BIOS or hardware monitoring applications), conflicting with OS's error handling, and you cannot deactivate the agent, then this option will be a help. + mce=no_lmce + Do not opt-in to Local MCE delivery. Use legacy method + to broadcast MCEs. mce=bootlog Enable logging of machine checks left over from booting. Disabled by default on AMD because some BIOS leave bogus ones. diff --git a/MAINTAINERS b/MAINTAINERS index 4303a642a2e5..a655435705aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10894,7 +10894,7 @@ M: Andy Lutomirski L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso S: Maintained -F: arch/x86/vdso/ +F: arch/x86/entry/vdso/ XC2028/3028 TUNER DRIVER M: Mauro Carvalho Chehab diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h index cabd518cb253..7cc4ced5dbf4 100644 --- a/arch/arc/include/asm/io.h +++ b/arch/arc/include/asm/io.h @@ -20,6 +20,7 @@ extern void iounmap(const void __iomem *addr); #define ioremap_nocache(phy, sz) ioremap(phy, sz) #define ioremap_wc(phy, sz) ioremap(phy, sz) +#define ioremap_wt(phy, sz) ioremap(phy, sz) /* Change struct page to physical address */ #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index db58deb00aa7..1b7677d1e5e1 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -336,6 
+336,7 @@ extern void _memset_io(volatile void __iomem *, int, size_t); #define ioremap_nocache(cookie,size) __arm_ioremap((cookie), (size), MT_DEVICE) #define ioremap_cache(cookie,size) __arm_ioremap((cookie), (size), MT_DEVICE_CACHED) #define ioremap_wc(cookie,size) __arm_ioremap((cookie), (size), MT_DEVICE_WC) +#define ioremap_wt(cookie,size) __arm_ioremap((cookie), (size), MT_DEVICE) #define iounmap __arm_iounmap /* diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 540f7c0aea82..7116d3973058 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -170,6 +170,7 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) +#define ioremap_wt(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define iounmap __iounmap /* diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h index 4f5ec2bb7172..e998ff5d8e1a 100644 --- a/arch/avr32/include/asm/io.h +++ b/arch/avr32/include/asm/io.h @@ -296,6 +296,7 @@ extern void __iounmap(void __iomem *addr); __iounmap(addr) #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache #define cached(addr) P1SEGADDR(addr) #define uncached(addr) P2SEGADDR(addr) diff --git a/arch/frv/include/asm/io.h b/arch/frv/include/asm/io.h index 0b78bc89e840..a31b63ec4930 100644 --- a/arch/frv/include/asm/io.h +++ b/arch/frv/include/asm/io.h @@ -17,6 +17,8 @@ #ifdef __KERNEL__ +#define ARCH_HAS_IOREMAP_WT + #include #include #include @@ -265,7 +267,7 @@ static inline void __iomem *ioremap_nocache(unsigned long physaddr, unsigned lon return __ioremap(physaddr, size, IOMAP_NOCACHE_SER); } -static inline void __iomem *ioremap_writethrough(unsigned long physaddr, unsigned long size) +static 
inline void __iomem *ioremap_wt(unsigned long physaddr, unsigned long size) { return __ioremap(physaddr, size, IOMAP_WRITETHROUGH); } diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h index e3b3556e2e1b..a8687b1d8906 100644 --- a/arch/ia64/include/asm/irq_remapping.h +++ b/arch/ia64/include/asm/irq_remapping.h @@ -1,6 +1,4 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H #define irq_remapping_enabled 0 -#define dmar_alloc_hwirq create_irq -#define dmar_free_hwirq destroy_irq #endif diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 9dd7464f8c17..d70bf15c690a 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = { .irq_retrigger = ia64_msi_retrigger_irq, }; -static int +static void msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { struct irq_cfg *cfg = irq_cfg + irq; @@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); - return 0; } -int arch_setup_dmar_msi(unsigned int irq) +int dmar_alloc_hwirq(int id, int node, void *arg) { - int ret; + int irq; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq = create_irq(); + if (irq > 0) { + irq_set_handler_data(irq, arg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, + handle_edge_irq, "edge"); + msi_compose_msg(NULL, irq, &msg); + dmar_msi_write(irq, &msg); + } + + return irq; +} + +void dmar_free_hwirq(int irq) +{ + irq_set_handler_data(irq, NULL); + destroy_irq(irq); } #endif /* CONFIG_INTEL_IOMMU */ diff --git a/arch/m32r/include/asm/io.h b/arch/m32r/include/asm/io.h index 9cc00dbd59ce..0c3f25ee3381 100644 --- 
a/arch/m32r/include/asm/io.h +++ b/arch/m32r/include/asm/io.h @@ -68,6 +68,7 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); #define ioremap_nocache(off,size) ioremap(off,size) #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache /* * IO bus memory addresses are also 1:1 with the physical address diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index 8955b40a5dc4..618c85d3c786 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -20,6 +20,8 @@ #ifdef __KERNEL__ +#define ARCH_HAS_IOREMAP_WT + #include #include #include @@ -465,7 +467,7 @@ static inline void __iomem *ioremap_nocache(unsigned long physaddr, unsigned lon { return __ioremap(physaddr, size, IOMAP_NOCACHE_SER); } -static inline void __iomem *ioremap_writethrough(unsigned long physaddr, +static inline void __iomem *ioremap_wt(unsigned long physaddr, unsigned long size) { return __ioremap(physaddr, size, IOMAP_WRITETHROUGH); diff --git a/arch/m68k/include/asm/io_no.h b/arch/m68k/include/asm/io_no.h index a93c8cde4d38..ad7bd40e6742 100644 --- a/arch/m68k/include/asm/io_no.h +++ b/arch/m68k/include/asm/io_no.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#define ARCH_HAS_IOREMAP_WT + #include #include @@ -153,7 +155,7 @@ static inline void *ioremap_nocache(unsigned long physaddr, unsigned long size) { return __ioremap(physaddr, size, IOMAP_NOCACHE_SER); } -static inline void *ioremap_writethrough(unsigned long physaddr, unsigned long size) +static inline void *ioremap_wt(unsigned long physaddr, unsigned long size) { return __ioremap(physaddr, size, IOMAP_WRITETHROUGH); } diff --git a/arch/metag/include/asm/io.h b/arch/metag/include/asm/io.h index d5779b0ec573..9890f21eadbe 100644 --- a/arch/metag/include/asm/io.h +++ b/arch/metag/include/asm/io.h @@ -160,6 +160,9 @@ extern void __iounmap(void __iomem *addr); #define ioremap_wc(offset, size) \ __ioremap((offset), 
(size), _PAGE_WR_COMBINE) +#define ioremap_wt(offset, size) \ + __ioremap((offset), (size), 0) + #define iounmap(addr) \ __iounmap(addr) diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index 940f5fc1d1da..39b6315db82e 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -39,10 +39,10 @@ extern resource_size_t isa_mem_base; extern void iounmap(void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); -#define ioremap_writethrough(addr, size) ioremap((addr), (size)) #define ioremap_nocache(addr, size) ioremap((addr), (size)) #define ioremap_fullcache(addr, size) ioremap((addr), (size)) #define ioremap_wc(addr, size) ioremap((addr), (size)) +#define ioremap_wt(addr, size) ioremap((addr), (size)) #endif /* CONFIG_MMU */ diff --git a/arch/mn10300/include/asm/io.h b/arch/mn10300/include/asm/io.h index cc4a2ba9e228..07c5b4a3903b 100644 --- a/arch/mn10300/include/asm/io.h +++ b/arch/mn10300/include/asm/io.h @@ -282,6 +282,7 @@ static inline void __iomem *ioremap_nocache(unsigned long offset, unsigned long } #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache static inline void iounmap(void __iomem *addr) { diff --git a/arch/nios2/include/asm/io.h b/arch/nios2/include/asm/io.h index 6e24d7cceb0c..c5a62da22cd2 100644 --- a/arch/nios2/include/asm/io.h +++ b/arch/nios2/include/asm/io.h @@ -46,6 +46,7 @@ static inline void iounmap(void __iomem *addr) } #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache /* Pages to physical address... 
*/ #define page_to_phys(page) virt_to_phys(page_to_virt(page)) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 30fd5c84680e..cb5fdf3a78fc 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -29,6 +29,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define ioremap_nocache(addr, size) ioremap(addr, size) #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache static inline void __iomem *ioremap(unsigned long offset, unsigned long size) { diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h index 407ac14295f4..57f26c398dc9 100644 --- a/arch/sparc/include/asm/io_32.h +++ b/arch/sparc/include/asm/io_32.h @@ -129,6 +129,7 @@ static inline void sbus_memcpy_toio(volatile void __iomem *dst, void __iomem *ioremap(unsigned long offset, unsigned long size); #define ioremap_nocache(X,Y) ioremap((X),(Y)) #define ioremap_wc(X,Y) ioremap((X),(Y)) +#define ioremap_wt(X,Y) ioremap((X),(Y)) void iounmap(volatile void __iomem *addr); /* Create a virtual mapping cookie for an IO port range */ diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index 50d4840d9aeb..c32fa3f752c8 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -402,6 +402,7 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size) #define ioremap_nocache(X,Y) ioremap((X),(Y)) #define ioremap_wc(X,Y) ioremap((X),(Y)) +#define ioremap_wt(X,Y) ioremap((X),(Y)) static inline void iounmap(volatile void __iomem *addr) { diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h index 6ef4ecab1df2..dc61de15c1f9 100644 --- a/arch/tile/include/asm/io.h +++ b/arch/tile/include/asm/io.h @@ -54,7 +54,7 @@ extern void iounmap(volatile void __iomem *addr); #define ioremap_nocache(physaddr, size) ioremap(physaddr, size) #define ioremap_wc(physaddr, size) ioremap(physaddr, size) -#define ioremap_writethrough(physaddr, size) ioremap(physaddr, 
size) +#define ioremap_wt(physaddr, size) ioremap(physaddr, size) #define ioremap_fullcache(physaddr, size) ioremap(physaddr, size) #define mmiowb() diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 3942f74c92d7..1538562cc720 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,3 +1,6 @@ + +obj-y += entry/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support @@ -11,7 +14,7 @@ obj-y += kernel/ obj-y += mm/ obj-y += crypto/ -obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4e986e809861..7e39f9b22705 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,141 +9,141 @@ config 64BIT config X86_32 def_bool y depends on !64BIT - select CLKSRC_I8253 - select HAVE_UID16 config X86_64 def_bool y depends on 64BIT - select X86_DEV_DMA_OPS - select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVEPATCH ### Arch settings config X86 def_bool y - select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI + select ANON_INODES + select ARCH_CLOCKSOURCE_DATA + select ARCH_DISCARD_MEMBLOCK + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SG_CHAIN + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_AOUT if X86_32 - select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 - select ARCH_SUPPORTS_INT128 if X86_64 - select HAVE_IDE - select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM - select HAVE_PERF_EVENTS - select HAVE_IOREMAP_PROT - select HAVE_KPROBES - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK - select ARCH_WANT_OPTIONAL_GPIOLIB + select 
ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if X86_64 + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_64 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANT_OPTIONAL_GPIOLIB + select BUILDTIME_EXTABLE_SORT + select CLKEVT_I8253 + select CLKSRC_I8253 if X86_32 + select CLOCKSOURCE_VALIDATE_LAST_CYCLE + select CLOCKSOURCE_WATCHDOG + select CLONE_BACKWARDS if X86_32 + select COMPAT_OLD_SIGACTION if IA32_EMULATION + select DCACHE_WORD_ACCESS + select GENERIC_CLOCKEVENTS + select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_MIN_ADJUST + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE + select GENERIC_EARLY_IOREMAP + select GENERIC_FIND_FIRST_BIT + select GENERIC_IOMAP + select GENERIC_IRQ_PROBE + select GENERIC_IRQ_SHOW + select GENERIC_PENDING_IRQ if SMP + select GENERIC_SMP_IDLE_THREAD + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER + select GENERIC_TIME_VSYSCALL + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_AOUT if X86_32 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY if X86_64 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if X86_64 + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_C_RECORDMCOUNT + select HAVE_DEBUG_KMEMLEAK + select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS - select 
HAVE_KRETPROBES - select GENERIC_EARLY_IOREMAP - select HAVE_OPTPROBES - select HAVE_KPROBES_ON_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FENTRY if X86_64 - select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_SYSCALL_TRACEPOINTS - select SYSCTL_EXCEPTION_TRACE - select HAVE_KVM - select HAVE_ARCH_KGDB - select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select USER_STACKTRACE_SUPPORT - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_DMA_API_DEBUG - select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ - select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZ4 + select HAVE_FENTRY if X86_64 + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_FP_TEST + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER + select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT + select HAVE_IDE + select HAVE_IOREMAP_PROT + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 + select HAVE_KERNEL_LZMA + select HAVE_KERNEL_LZO + select HAVE_KERNEL_XZ + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES + select HAVE_KVM + select HAVE_LIVEPATCH if X86_64 + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS - select PERF_EVENTS + select HAVE_OPROFILE + select HAVE_OPTPROBES + select HAVE_PCSPKR_PLATFORM + select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_DEBUG_KMEMLEAK - select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_CMPXCHG_LOCAL - select HAVE_CMPXCHG_DOUBLE - select HAVE_ARCH_KMEMCHECK - select HAVE_ARCH_KASAN if X86_64 
&& SPARSEMEM_VMEMMAP + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UID16 if X86_32 + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER - select ARCH_HAS_ELF_RANDOMIZE - select HAVE_ARCH_JUMP_LABEL - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select SPARSE_IRQ - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_PENDING_IRQ if SMP - select GENERIC_IRQ_SHOW - select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select HAVE_BPF_JIT if X86_64 - select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) - select ARCH_HAS_SG_CHAIN - select CLKEVT_I8253 - select ARCH_HAVE_NMI_SAFE_CMPXCHG - select GENERIC_IOMAP - select DCACHE_WORD_ACCESS - select GENERIC_SMP_IDLE_THREAD - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 - select HAVE_ARCH_SECCOMP_FILTER - select BUILDTIME_EXTABLE_SORT - select GENERIC_CMOS_UPDATE - select HAVE_ARCH_SOFT_DIRTY if X86_64 - select CLOCKSOURCE_WATCHDOG - select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA - select CLOCKSOURCE_VALIDATE_LAST_CYCLE - select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL - select GENERIC_STRNCPY_FROM_USER - select GENERIC_STRNLEN_USER - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_IRQ_TIME_ACCOUNTING - select VIRT_TO_BUS - select MODULES_USE_ELF_REL if X86_32 - select MODULES_USE_ELF_RELA if X86_64 - select CLONE_BACKWARDS if X86_32 - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_USE_QUEUED_RWLOCKS - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION - select OLD_SIGACTION if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION + select MODULES_USE_ELF_RELA if X86_64 + select MODULES_USE_ELF_REL if X86_32 + select OLD_SIGACTION if X86_32 + select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION + select PERF_EVENTS select RTC_LIB - select HAVE_DEBUG_STACKOVERFLOW - select 
HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 - select HAVE_CC_STACKPROTECTOR - select GENERIC_CPU_AUTOPROBE - select HAVE_ARCH_AUDITSYSCALL - select ARCH_SUPPORTS_ATOMIC_RMW - select HAVE_ACPI_APEI if ACPI - select HAVE_ACPI_APEI_NMI if ACPI - select ACPI_LEGACY_TABLES_LOOKUP if ACPI - select X86_FEATURE_NAMES if PROC_FS + select SPARSE_IRQ select SRCU + select SYSCTL_EXCEPTION_TRACE + select USER_STACKTRACE_SUPPORT + select VIRT_TO_BUS + select X86_DEV_DMA_OPS if X86_64 + select X86_FEATURE_NAMES if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -261,10 +261,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_HT - def_bool y - depends on SMP - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -342,7 +338,7 @@ config X86_FEATURE_NAMES config X86_X2APIC bool "Support x2apic" - depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP + depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) ---help--- This enables x2apic support on CPUs that have this feature. @@ -442,6 +438,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on X86_X2APIC + depends on PCI ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -467,7 +464,6 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE - select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -852,11 +848,12 @@ config NR_CPUS default "1" if !SMP default "8192" if MAXSMP default "32" if SMP && X86_BIGSMP - default "8" if SMP + default "8" if SMP && X86_32 + default "64" if SMP ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum - supported value is 4096, otherwise the maximum value is 512. The + supported value is 8192, otherwise the maximum value is 512. 
The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds @@ -864,7 +861,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on X86_HT + depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -874,7 +871,7 @@ config SCHED_SMT config SCHED_MC def_bool y prompt "Multi-core scheduler support" - depends on X86_HT + depends on SMP ---help--- Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly @@ -915,12 +912,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI - select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN_HIERARCHY + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y depends on X86_LOCAL_APIC || X86_UP_IOAPIC - select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2fd3ebbb4e33..a15893d17c55 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -344,4 +344,15 @@ config X86_DEBUG_FPU If unsure, say N. +config PUNIT_ATOM_DEBUG + tristate "ATOM Punit debug driver" + select DEBUG_FS + select IOSF_MBI + ---help--- + This is a debug driver, which gets the power states + of all Punit North Complex devices. The power states of + each device is exposed as part of the debugfs interface. 
+ The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2fda005bb334..118e6debc483 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -77,6 +77,12 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Align jump targets to 1 byte, not the default 16 bytes: + KBUILD_CFLAGS += -falign-jumps=1 + + # Pack loops tightly as well: + KBUILD_CFLAGS += -falign-loops=1 + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) @@ -84,6 +90,9 @@ else # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # Use -mskip-rax-setup if supported. + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) @@ -140,12 +149,6 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp -# do binutils support CFI? -cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) -# is .cfi_signal_frame supported too? -cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) -cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) - # does binutils support specific instructions? 
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -153,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) @@ -178,7 +181,7 @@ archscripts: scripts_basic # Syscall table generation archheaders: - $(Q)$(MAKE) $(build)=arch/x86/syscalls all + $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all archprepare: ifeq ($(CONFIG_KEXEC_FILE),y) @@ -241,7 +244,7 @@ install: PHONY += vdso_install vdso_install: - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ archclean: $(Q)rm -rf $(objtree)/arch/i386 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000000..7a144971db79 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + +obj-y += vdso/ +obj-y += vsyscall/ + +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o + diff --git a/arch/x86/include/asm/calling.h b/arch/x86/entry/calling.h similarity index 80% rename from arch/x86/include/asm/calling.h rename to arch/x86/entry/calling.h index 1c8b50edb2db..f4e6308c4200 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/entry/calling.h @@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with */ -#include - #ifdef CONFIG_X86_64 /* @@ -91,28 +89,27 @@ For 32-bit we have the 
following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip + addq $-(15*8+\addskip), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 .if \r11 - movq_cfi r11, 6*8+\offset + movq %r11, 6*8+\offset(%rsp) .endif .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) .endif .if \rax - movq_cfi rax, 10*8+\offset + movq %rax, 10*8+\offset(%rsp) .endif .if \rcx - movq_cfi rcx, 11*8+\offset + movq %rcx, 11*8+\offset(%rsp) .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) .endm .macro SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset + movq %rbp, 4*8+\offset(%rsp) .endm .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx .endm .macro 
ZERO_EXTRA_REGS @@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 6*8, r11 + movq 6*8(%rsp), %r11 .endif .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 .endif .if \rstor_rax - movq_cfi_restore 10*8, rax + movq 10*8(%rsp), %rax .endif .if \rstor_rcx - movq_cfi_restore 11*8, rcx + movq 11*8(%rsp), %rcx .endif .if \rstor_rdx - movq_cfi_restore 12*8, rdx + movq 12*8(%rsp), %rdx .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 @@ -204,8 +201,7 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) + subq $-(15*8+\addskip), %rsp .endm .macro icebp @@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx .endm .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S new file mode 100644 index 000000000000..21dc60a60b5f --- /dev/null +++ b/arch/x86/entry/entry_32.S @@ -0,0 +1,1248 @@ +/* + * Copyright (C) 1991,1992 Linus Torvalds + * + * entry_32.S contains the system-call and 
low-level fault and trap handling routines. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysenter_audit syscall_trace_entry +# define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. 
+ */ + +#ifdef CONFIG_PREEMPT +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +# define preempt_stop(clobbers) +# define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro 
RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + jmp syscall_exit +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp), %eax + call *PT_EBX(%esp) + movl $0, PT_EAX(%esp) + jmp syscall_exit +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? 
+ jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + +/* + * SYSENTER_RETURN points to after the SYSENTER instruction + * in the vsyscall page. See vsyscall-sysentry.S, which defines + * the symbol. + */ + + # SYSENTER call handler stub +ENTRY(entry_SYSENTER_32) + movl TSS_sysenter_sp0(%esp), %esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl $__USER_DS + pushl %ebp + pushfl + orl $X86_EFLAGS_IF, (%esp) + pushl $__USER_CS + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. + */ + pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) + + pushl %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3, %ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp), %ebp + ASM_CLAC + movl %ebp, PT_EBP(%esp) + _ASM_EXTABLE(1b, syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(, %eax, 4) +sysenter_after_call: + movl %eax, PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jnz sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp, %ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) + jnz syscall_trace_entry + /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ + movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ + /* movl PT_ECX(%esp), %ecx already set, a1: 3rd arg to audit */ + pushl PT_ESI(%esp) /* a3: 5th arg */ + pushl PT_EDX+4(%esp) /* a2: 4th arg */ + call __audit_syscall_entry + popl %ecx /* get that remapped edx off the stack */ + popl %ecx /* get that remapped esi off the stack */ + movl PT_EAX(%esp), %eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax, %edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? 
*/ + setbe %al /* 1 if so, 0 if not */ + movzbl %al, %eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jnz syscall_exit_work + movl PT_EAX(%esp), %eax /* reload syscall return value */ + jmp sysenter_exit +#endif + +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b, 2b) + PTGS_TO_GS_EX +ENDPROC(entry_SYSENTER_32) + + # system call handler stub +ENTRY(entry_INT80_32) + ASM_CLAC + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(, %eax, 4) +syscall_after_call: + movl %eax, PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jnz syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. 
+ */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return, iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
+ */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(entry_INT80_32) + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jnz work_notifysig_v86 # returning to kernel-space or + # vm86-space +1: +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace + +#ifdef CONFIG_VM86 + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + jmp 1b +#endif +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS, PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + +syscall_fault: + ASM_CLAC + GET_THREAD_INFO(%ebp) + movl $-EFAULT, PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS, %eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS, %eax + jmp sysenter_after_call +END(sysenter_badsys) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back from ESPFIX stack to the normal zero-based stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and switches to the + * normal stack and adjusts ESP with the matching offset. 
+ */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp, %eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +# define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. 
interrupts come from */ +#include + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + 
+ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. 
+ */ +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, &regs->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. 
We move the return ip into the + * ip location, and move flags into the return ip location. + */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? 
*/ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can 
happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $entry_SYSENTER_32, (%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # error code 0 + movl %esp, %eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $entry_SYSENTER_32, (%esp) + je nmi_stack_fixup + pushl %eax + movl %esp, %eax + /* + * Do not access memory above the end of our stack page, + * it might not exist. 
+ */ + andl $(THREAD_SIZE-1), %eax + cmpl $(THREAD_SIZE-20), %eax + popl %eax + jae nmi_stack_correct + cmpl $entry_SYSENTER_32, 12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS, 16(%esp) + jne nmi_stack_correct + cmpl $debug, (%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn, (%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/entry/entry_64.S similarity index 58% rename from arch/x86/kernel/entry_64.S rename to arch/x86/entry/entry_64.S index 02c2eff7478d..3bb2c4302df1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -4,34 +4,25 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek - */ - -/* + * * entry.S contains the system-call and fault 
low-level handling routines. * * Some of this is documented in Documentation/x86/entry_64.txt * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. */ - #include #include #include #include -#include -#include +#include "calling.h" #include #include #include @@ -49,13 +40,12 @@ /* Avoid __ASSEMBLER__'ifying just for this. */ #include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 +.code64 +.section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -64,11 +54,10 @@ ENTRY(native_usergs_sysret64) ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ - .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? 
*/ + jnc 1f TRACE_IRQS_ON 1: #endif @@ -88,89 +77,34 @@ ENDPROC(native_usergs_sysret64) #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) .macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_OFF - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_ON - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON_DEBUG 1: .endm #else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif /* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. 
- */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack @@ -186,7 +120,7 @@ ENDPROC(native_usergs_sysret64) * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) * * Only called from user space. * @@ -195,13 +129,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -213,14 +141,14 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. 
*/ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. * We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,36 +157,34 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys 
+entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* * Syscall return path ending with SYSRET (fast path). @@ -279,19 +205,15 @@ system_call_fastpath: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - CFI_REMEMBER_STATE + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx - movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ - movq RSP(%rsp),%rsp + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + movq RSP(%rsp), %rsp /* - * 64bit SYSRET restores rip from rcx, + * 64-bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. 
@@ -307,25 +229,23 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE - /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax, %rdx + call syscall_trace_enter_phase2 /* * Reload registers from stack in case ptrace changed them. 
@@ -335,15 +255,15 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx /* fixup for C */ + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -355,31 +275,33 @@ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi + movl $_TIF_ALLWORK_MASK, %edi /* edi: mask to check */ GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz int_careful + andl $~TS_COMPAT, TI_status(%rcx) jmp syscall_return - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. */ - /* edx: work, edi: workmask */ + /* + * Either reschedule or signal or syscall exit tracking needed. + * First do a reschedule test. 
+ * edx: work, edi: workmask + */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + bt $TIF_NEED_RESCHED, %edx + jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: @@ -387,27 +309,27 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq_cfi %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq_cfi %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest + testl $_TIF_WORK_SYSCALL_EXIT, %edx + jz int_signal + pushq %rdi + leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ + call syscall_trace_leave + popq %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi + jmp int_restore_rest int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi + testl $_TIF_DO_NOTIFY_MASK, %edx + jz 1f + movq %rsp, %rdi /* &ptregs -> arg1 */ + xorl %esi, %esi /* oldset -> arg2 */ + call do_notify_resume +1: movl $_TIF_WORK_MASK, %edi int_restore_rest: RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp int_with_check + jmp int_with_check syscall_return: /* The IRETQ could re-enable interrupts: */ @@ -418,34 +340,37 @@ syscall_return: * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + cmpq %rcx, %r11 /* RCX == RIP */ + jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. 
This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP. * - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. */ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed + + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed /* * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, @@ -454,47 +379,41 @@ syscall_return: * with register state that satisfies the opportunistic SYSRET * conditions. For example, single-stepping this user code: * - * movq $stuck_here,%rcx + * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. 
*/ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 - movq RSP(%rsp),%rsp + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 - jmp sys_\func - CFI_ENDPROC + jmp sys_\func END(stub_\func) .endm @@ -503,8 +422,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -514,11 +431,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. 
@@ -526,47 +441,25 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) .align 8 GLOBAL(stub_x32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 +GLOBAL(stub32_execve) call compat_sys_execve jmp return_from_execve - CFI_ENDPROC +END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) -#endif - -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 GLOBAL(stub32_execveat) - CFI_STARTPROC call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub32_execveat) +END(stub_x32_execveat) #endif /* @@ -574,8 +467,6 @@ END(stub32_execveat) * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. @@ -584,24 +475,19 @@ ENTRY(stub_rt_sigreturn) * we SAVE_EXTRA_REGS here. 
*/ SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn + call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call - CFI_ENDPROC + movq %rax, RAX(%rsp) + jmp int_ret_from_sys_call END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub - CFI_ENDPROC + call sys32_x32_rt_sigreturn + jmp return_from_stub END(stub_x32_rt_sigreturn) #endif @@ -611,36 +497,36 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME - LOCK ; btr $TIF_FORK,TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq /* reset kernel eflags */ - call schedule_tail # rdi: 'prev' task parameter + call schedule_tail /* rdi: 'prev' task parameter */ RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) /* from kernel_thread? */ /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. 
*/ jnz int_ret_from_sys_call - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) + /* + * We came from kernel_thread + * nb: we depend on RESTORE_EXTRA_REGS above + */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - CFI_ENDPROC + jmp int_ret_from_sys_call END(ret_from_fork) /* @@ -649,16 +535,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -684,10 +567,10 @@ END(irq_entries_start) /* this goes to 0(%rsp) for unwinder, not for saving the value: */ SAVE_EXTRA_REGS_RBP -RBP - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ + leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ - testl $3, CS-RBP(%rsp) - je 1f + testb $3, CS-RBP(%rsp) + jz 1f SWAPGS 1: /* @@ -697,24 +580,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi - pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rsi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func .endm /* @@ -723,42 
+596,36 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ + popq %rsi /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP + leaq -RBP(%rsi), %rsp - testl $3,CS(%rsp) - je retint_kernel + testb $3, CS(%rsp) + jz retint_kernel /* Interrupt came from user space */ - +retint_user: GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ + + /* %rcx: thread info. Interrupts are off. */ retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK, %edi retint_check: LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - CFI_REMEMBER_STATE - jnz retint_careful + movl TI_flags(%rcx), %edx + andl %edi, %edx + jnz retint_careful -retint_swapgs: /* return to user-space */ +retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ @@ -773,9 +640,9 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ + bt $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) +0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq jmp 0b @@ -793,8 +660,6 @@ retint_kernel: restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 - -irq_return: INTERRUPT_RETURN ENTRY(native_iret) @@ -803,8 +668,8 @@ ENTRY(native_iret) * 64-bit mode SS:RSP on the exception stack is always valid. 
*/ #ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif .global native_irq_return_iret @@ -819,62 +684,60 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq_cfi %rdi - orq PER_CPU_VAR(espfix_stack),%rax + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (1*8)(%rdi) + movq (3*8)(%rsp), %rax /* CS */ + movq %rax, (2*8)(%rdi) + movq (4*8)(%rsp), %rax /* RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (6*8)(%rsp), %rax /* SS */ + movq %rax, (5*8)(%rdi) + movq (5*8)(%rsp), %rax /* RSP */ + movq %rax, (4*8)(%rdi) + andl $0xffff0000, %eax + popq %rdi + orq PER_CPU_VAR(espfix_stack), %rax SWAPGS - movq %rax,%rsp - popq_cfi %rax - jmp native_irq_return_iret + movq %rax, %rsp + popq %rax + jmp native_irq_return_iret #endif /* edi: workmask, edx: work */ retint_careful: - CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + bt $TIF_NEED_RESCHED, %edx + jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi + pushq %rdi SCHEDULE_USER - popq_cfi %rdi + popq %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - jmp retint_check + jmp retint_check retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs + testl $_TIF_DO_NOTIFY_MASK, %edx + jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi 
# &pt_regs - call do_notify_resume + movq $-1, ORIG_RAX(%rsp) + xorl %esi, %esi /* oldset */ + movq %rsp, %rdi /* &pt_regs */ + call do_notify_resume RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule + jmp retint_with_reschedule - CFI_ENDPROC END(common_interrupt) /* @@ -882,13 +745,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym - jmp ret_from_intr - CFI_ENDPROC + jmp ret_from_intr END(\sym) .endm @@ -910,53 +771,45 @@ trace_apicinterrupt \num \sym .endm #ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt 
smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt #endif #ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -971,100 +824,87 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. 
*/ + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + jnz 1f .endif - call paranoid_entry + call paranoid_entry .else - call error_entry + call error_entry .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif .endif - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - call \do_sym + call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit + jmp paranoid_exit .else - jmp error_exit + jmp error_exit .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). 
*/ 1: - call error_entry + call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ + movq %rsp, %rdi /* pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif - call \do_sym + call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1079,65 +919,58 @@ idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry 
double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - /* Reload gs selector with exception handling */ - /* edi: new selector */ + /* + * Reload gs selector with exception handling + * edi: new selector + */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: - movl %edi,%gs -2: mfence /* workaround */ + movl %edi, %gs +2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" + _ASM_EXTABLE(gs_change, bad_gs) + .section .fixup, "ax" /* running with kernelgs */ bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b + SWAPGS /* switch back to user gs */ + xorl %eax, %eax + movl %eax, %gs + jmp 2b .previous /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 - mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq + pushq %rbp + mov %rsp, %rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr), %rsp + push %rbp /* frame pointer backlink */ + call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #ifdef CONFIG_XEN @@ -1156,29 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ - movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - CFI_DEF_CFA_REGISTER rsp - decl PER_CPU_VAR(irq_count) + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +11: incl PER_CPU_VAR(irq_count) + movq %rsp, %rbp + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rbp /* frame pointer backlink */ + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_do_hypervisor_callback) /* @@ -1195,51 +1023,35 @@ END(xen_do_hypervisor_callback) * 
with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 - movw %ds,%cx - cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE - jne 1f - movw %es,%cx - cmpw %cx,0x18(%rsp) - jne 1f - movw %fs,%cx - cmpw %cx,0x20(%rsp) - jne 1f - movw %gs,%cx - cmpw %cx,0x28(%rsp) - jne 1f + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx - jmp general_protection - CFI_RESTORE_STATE + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1252,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 + #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 + +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 + #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif + #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1275,19 +1091,17 @@ 
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx + movl $1, %ebx + movl $MSR_GS_BASE, %ecx rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ + testl %edx, %edx + js 1f /* negative -> in kernel */ SWAPGS - xorl %ebx,%ebx + xorl %ebx, %ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1299,17 +1113,17 @@ END(paranoid_entry) * in syscall entry), so checking for preemption here would * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore + jmp paranoid_exit_restore paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: @@ -1317,24 +1131,24 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* * Save all registers in pt_regs, and switch gs if needed. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx - testl $3,CS+8(%rsp) - je error_kernelspace -error_swapgs: + xorl %ebx, %ebx + testb $3, CS+8(%rsp) + jz error_kernelspace + + /* We entered from user mode */ SWAPGS -error_sti: + +error_entry_done: TRACE_IRQS_OFF ret @@ -1345,56 +1159,66 @@ error_sti: * for these here too. 
*/ error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti + incl %ebx + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je error_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je bstep_iret + cmpq $gs_change, RIP+8(%rsp) + jne error_entry_done + + /* + * hack: gs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * gs_change's error handler with kernel gsbase. + */ + SWAPGS + jmp error_entry_done bstep_iret: /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) + movq %rcx, RIP+8(%rsp) /* fall through */ error_bad_iret: + /* + * We came from an IRET to user mode, so we have user gsbase. + * Switch to kernel gsbase: + */ SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti - CFI_ENDPROC + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET and clear EBX so that + * error_exit knows that we will be returning to user mode. 
+ */ + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + decl %ebx + jmp error_entry_done END(error_entry) -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +/* + * On entry, EBX is a "return to kernel mode" flag: + * 1: already in kernel mode, don't need SWAPGS + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ ENTRY(error_exit) - DEFAULT_FRAME - movl %ebx,%eax + movl %ebx, %eax RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs - CFI_ENDPROC + testl %eax, %eax + jnz retint_kernel + jmp retint_user END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1429,22 +1253,21 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx /* * If %cs was not the kernel segment, then the NMI triggered in user * space, which means it is definitely not nested. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi /* * Check the special variable on the stack to see if NMIs are * executing. */ - cmpl $1, -8(%rsp) - je nested_nmi + cmpl $1, -8(%rsp) + je nested_nmi /* * Now test if the previous stack was an NMI stack. @@ -1458,51 +1281,46 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If the stack pointer is above the NMI stack, this is a normal NMI */ ja first_nmi + subq $EXCEPTION_STKSZ, %rdx cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - CFI_REMEMBER_STATE - nested_nmi: /* * Do nothing if we interrupted the fixup in repeat_nmi.
* It's about to repeat the NMI handler, so we are fine * with ignoring this one. */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 - leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ - addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 + addq $(6*8), %rsp nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* No need to check faults here */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1540,23 +1358,18 @@ first_nmi: * is also used by nested NMIs and can not be trusted on exit. */ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ - movq (%rsp), %rdx - CFI_RESTORE rdx + movq (%rsp), %rdx /* Set the NMI executing variable on the stack. */ - pushq_cfi $1 + pushq $1 - /* - * Leave room for the "copied" frame - */ - subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 + /* Leave room for the "copied" frame */ + subq $(5*8), %rsp /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1575,16 +1388,14 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). 
*/ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ - addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 + addq $(10*8), %rsp .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr - subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 + subq $(5*8), %rsp end_repeat_nmi: /* @@ -1592,7 +1403,7 @@ end_repeat_nmi: * NMI if the first NMI took an exception and reset our iret stack * so that we repeat another NMI. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1602,8 +1413,7 @@ end_repeat_nmi: * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call paranoid_entry - DEFAULT_FRAME 0 + call paranoid_entry /* * Save off the CR2 register. If we take a page fault in the NMI then @@ -1614,22 +1424,21 @@ end_repeat_nmi: * origin fault. Save it off and restore it if it changes. * Use the r12 callee-saved register. */ - movq %cr2, %r12 + movq %cr2, %r12 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 1: - - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore + testl %ebx, %ebx /* swapgs needed? 
*/ + jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: @@ -1639,15 +1448,11 @@ nmi_restore: REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ - movq $0, 5*8(%rsp) - jmp irq_return - CFI_ENDPROC + movq $0, 5*8(%rsp) + INTERRUPT_RETURN END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax + mov $-ENOSYS, %eax sysret - CFI_ENDPROC END(ignore_sysret) - diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000000..bb187a6a877c --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,556 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(entry_SYSENTER_compat) + /* + * Interrupts are off on entry. 
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32-bit zero extended + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. 
+ */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and an NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled.
+ */ + movl RSP(%rsp), %esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + /* + * At this point, registers hold syscall args in the 32-bit syscall ABI: + * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. + * + * We want to pass them to __audit_syscall_entry(), which is a 64-bit + * C function with 5 parameters, so shuffle them to match what + * the function expects: RDI,RSI,RDX,RCX,R8. + */ + movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ + xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ + /* arg3 (RDX) <= 2nd syscall arg (ECX) */ + movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ + movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ + call __audit_syscall_entry + + /* + * We are going to jump back to the syscall dispatch code. + * Prepare syscall args as required by the 64-bit C ABI. 
+ * Registers clobbered by __audit_syscall_entry() are + * loaded from pt_regs on stack: + */ + movl ORIG_RAX(%rsp), %eax /* syscall number */ + movl %ebx, %edi /* arg1 */ + movl RCX(%rsp), %esi /* arg2 */ + movl RDX(%rsp), %edx /* arg3 */ + movl RSI(%rsp), %ecx /* arg4 */ + movl RDI(%rsp), %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if success, 0 if error */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack.
(see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(entry_SYSENTER_compat) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(entry_SYSCALL_compat) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. 
+ */ + SWAPGS_UNSAFE_STACK + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp, %ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + movl RCX(%rsp), %ebp + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 + TRACE_IRQS_ON + movl RSP(%rsp), %esp + /* + * 64-bit->32-bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. 
+ * (Note: 32-bit->32-bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + auditsys_entry_common + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp cstar_do_call +END(entry_SYSCALL_compat) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT, RAX(%rsp) +ia32_ret_from_sys_call: + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_ret_from_sys_call + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). 
+ * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(entry_INT80_compat) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ + cld + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + +ia32_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. 
+ * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(entry_INT80_compat) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip), %rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip), %rax + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/entry/syscall_32.c similarity index 79% rename from arch/x86/kernel/syscall_32.c rename to arch/x86/entry/syscall_32.c index 3777189c4a19..8ea34f94e973 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,7 +10,7 @@ #else #define SYM(sym, compat) sym #define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max +#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * 
when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/entry/syscall_64.c similarity index 100% rename from arch/x86/kernel/syscall_64.c rename to arch/x86/entry/syscall_64.c diff --git a/arch/x86/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile similarity index 95% rename from arch/x86/syscalls/Makefile rename to arch/x86/entry/syscalls/Makefile index a55abb9f6c5e..57aa59fd140c 100644 --- a/arch/x86/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -1,5 +1,5 @@ -out := $(obj)/../include/generated/asm -uapi := $(obj)/../include/generated/uapi/asm +out := $(obj)/../../include/generated/asm +uapi := $(obj)/../../include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl similarity index 100% rename from arch/x86/syscalls/syscall_32.tbl rename to arch/x86/entry/syscalls/syscall_32.tbl diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl similarity index 100% rename from arch/x86/syscalls/syscall_64.tbl rename to arch/x86/entry/syscalls/syscall_64.tbl diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh similarity index 100% rename from arch/x86/syscalls/syscallhdr.sh rename to arch/x86/entry/syscalls/syscallhdr.sh diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh similarity index 100% rename from arch/x86/syscalls/syscalltbl.sh rename to arch/x86/entry/syscalls/syscalltbl.sh diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/entry/thunk_32.S similarity index 83% rename from arch/x86/lib/thunk_32.S rename to arch/x86/entry/thunk_32.S index e407941d0488..e5a17114a8c4 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -6,16 +6,14 @@ */ 
#include #include - #include /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - CFI_STARTPROC - pushl_cfi_reg eax - pushl_cfi_reg ecx - pushl_cfi_reg edx + pushl %eax + pushl %ecx + pushl %edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -23,11 +21,10 @@ .endif call \func - popl_cfi_reg edx - popl_cfi_reg ecx - popl_cfi_reg eax + popl %edx + popl %ecx + popl %eax ret - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/entry/thunk_64.S similarity index 71% rename from arch/x86/lib/thunk_64.S rename to arch/x86/entry/thunk_64.S index 2198902329b5..efb2b932b748 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -6,35 +6,32 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include -#include -#include +#include "calling.h" #include /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name \name: - CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - pushq_cfi_reg rdi - pushq_cfi_reg rsi - pushq_cfi_reg rdx - pushq_cfi_reg rcx - pushq_cfi_reg rax - pushq_cfi_reg r8 - pushq_cfi_reg r9 - pushq_cfi_reg r10 - pushq_cfi_reg r11 + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 .if \put_ret_addr_in_rdi /* 9*8(%rsp) is return addr on stack */ - movq_cfi_restore 9*8, rdi + movq 9*8(%rsp), %rdi .endif call \func jmp restore - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm @@ -55,19 +52,16 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) - CFI_STARTPROC - CFI_ADJUST_CFA_OFFSET 9*8 restore: - popq_cfi_reg r11 - popq_cfi_reg r10 - popq_cfi_reg r9 - popq_cfi_reg r8 - popq_cfi_reg rax - popq_cfi_reg rcx - popq_cfi_reg rdx - popq_cfi_reg rsi - popq_cfi_reg rdi + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx 
+ popq %rsi + popq %rdi ret - CFI_ENDPROC _ASM_NOKPROBE(restore) #endif diff --git a/arch/x86/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore similarity index 100% rename from arch/x86/vdso/.gitignore rename to arch/x86/entry/vdso/.gitignore diff --git a/arch/x86/vdso/Makefile b/arch/x86/entry/vdso/Makefile similarity index 100% rename from arch/x86/vdso/Makefile rename to arch/x86/entry/vdso/Makefile diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh similarity index 100% rename from arch/x86/vdso/checkundef.sh rename to arch/x86/entry/vdso/checkundef.sh diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c similarity index 100% rename from arch/x86/vdso/vclock_gettime.c rename to arch/x86/entry/vdso/vclock_gettime.c diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S similarity index 100% rename from arch/x86/vdso/vdso-layout.lds.S rename to arch/x86/entry/vdso/vdso-layout.lds.S diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S similarity index 100% rename from arch/x86/vdso/vdso-note.S rename to arch/x86/entry/vdso/vdso-note.S diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S similarity index 100% rename from arch/x86/vdso/vdso.lds.S rename to arch/x86/entry/vdso/vdso.lds.S diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c similarity index 100% rename from arch/x86/vdso/vdso2c.c rename to arch/x86/entry/vdso/vdso2c.c diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h similarity index 100% rename from arch/x86/vdso/vdso2c.h rename to arch/x86/entry/vdso/vdso2c.h diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c similarity index 100% rename from arch/x86/vdso/vdso32-setup.c rename to arch/x86/entry/vdso/vdso32-setup.c diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore similarity index 100% rename from arch/x86/vdso/vdso32/.gitignore rename 
to arch/x86/entry/vdso/vdso32/.gitignore diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S similarity index 100% rename from arch/x86/vdso/vdso32/int80.S rename to arch/x86/entry/vdso/vdso32/int80.S diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S similarity index 100% rename from arch/x86/vdso/vdso32/note.S rename to arch/x86/entry/vdso/vdso32/note.S diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S similarity index 100% rename from arch/x86/vdso/vdso32/sigreturn.S rename to arch/x86/entry/vdso/vdso32/sigreturn.S diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S similarity index 100% rename from arch/x86/vdso/vdso32/syscall.S rename to arch/x86/entry/vdso/vdso32/syscall.S diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S similarity index 100% rename from arch/x86/vdso/vdso32/sysenter.S rename to arch/x86/entry/vdso/vdso32/sysenter.S diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c similarity index 100% rename from arch/x86/vdso/vdso32/vclock_gettime.c rename to arch/x86/entry/vdso/vdso32/vclock_gettime.c diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c similarity index 100% rename from arch/x86/vdso/vdso32/vdso-fakesections.c rename to arch/x86/entry/vdso/vdso32/vdso-fakesections.c diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S similarity index 100% rename from arch/x86/vdso/vdso32/vdso32.lds.S rename to arch/x86/entry/vdso/vdso32/vdso32.lds.S diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S similarity index 100% rename from arch/x86/vdso/vdsox32.lds.S rename to arch/x86/entry/vdso/vdsox32.lds.S diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c similarity index 100% rename from arch/x86/vdso/vgetcpu.c rename to 
arch/x86/entry/vdso/vgetcpu.c diff --git a/arch/x86/vdso/vma.c b/arch/x86/entry/vdso/vma.c similarity index 100% rename from arch/x86/vdso/vma.c rename to arch/x86/entry/vdso/vma.c diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c similarity index 100% rename from arch/x86/kernel/vsyscall_64.c rename to arch/x86/entry/vsyscall/vsyscall_64.c diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S similarity index 100% rename from arch/x86/kernel/vsyscall_emu_64.S rename to arch/x86/entry/vsyscall/vsyscall_emu_64.S diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c similarity index 100% rename from arch/x86/kernel/vsyscall_gtod.c rename to arch/x86/entry/vsyscall/vsyscall_gtod.c diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h similarity index 89% rename from arch/x86/kernel/vsyscall_trace.h rename to arch/x86/entry/vsyscall/vsyscall_trace.h index a8b2edec54fe..9dd7359a38a8 100644 --- a/arch/x86/kernel/vsyscall_trace.h +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -24,6 +24,6 @@ TRACE_EVENT(emulate_vsyscall, #endif #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index bb635c641869..cd4339bae066 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,7 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. 
# -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S deleted file mode 100644 index 72bf2680f819..000000000000 --- a/arch/x86/ia32/ia32entry.S +++ /dev/null @@ -1,611 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. 
- */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) - -ENTRY(native_irq_enable_sysexit) - swapgs - sti - sysexit -ENDPROC(native_irq_enable_sysexit) -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. 
- */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - CFI_REGISTER rip,r10 - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %rbp /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushfq_cfi /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because rbp has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. 
- */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz sysenter_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys -sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -sysenter_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ - CFI_REGISTER rip,rcx - RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ - /*CFI_RESTORE rflags*/ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. 
- */ - movl RSP(%rsp),%esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - - CFI_RESTORE_STATE - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl RAX(%rsp),%eax /* reload syscall number */ - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - CLEAR_RREGS - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq_cfi - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ - jmp sysenter_do_call - CFI_ENDPROC -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). 
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp,%r8d - CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(kernel_stack),%rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %r8 /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rbp /* pt_regs->cx */ - movl %ebp,%ecx - pushq_cfi_reg rax /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%r8),%r9d - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, 
%rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz cstar_tracesys - cmpq $IA32_NR_syscalls-1,%rax - ja ia32_badsys -cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -cstar_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - CFI_REGISTER rip,rcx - movl EFLAGS(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 - TRACE_IRQS_ON - movl RSP(%rsp),%esp - CFI_RESTORE rsp - /* - * 64bit->32bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. 
- */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - CFI_RESTORE_STATE - movl %r9d,R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d,%ebp - SAVE_EXTRA_REGS - CLEAR_RREGS r9 - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret - CFI_ENDPROC - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,5*8 - /*CFI_REL_OFFSET ss,4*8 */ - CFI_REL_OFFSET rsp,3*8 - /*CFI_REL_OFFSET rflags,2*8 */ - /*CFI_REL_OFFSET cs,1*8 */ - CFI_REL_OFFSET rip,0*8 - - /* - * Interrupts are off on entry. 
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys -ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX(%rsp) -ia32_ret_from_sys_call: - CLEAR_RREGS - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ - jmp ia32_do_call -END(ia32_syscall) - -ia32_badsys: - movq $0,ORIG_RAX(%rsp) - movq $-ENOSYS,%rax - jmp ia32_sysret - - CFI_ENDPROC - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common - .endm - - CFI_STARTPROC32 - - PTREGSCALL stub32_rt_sigreturn, 
sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax - mov %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SIZEOF_PTREGS - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP -/* CFI_REL_OFFSET cs,CS*/ -/* CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP -/* CFI_REL_OFFSET ss,SS*/ - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret - CFI_ENDPROC -END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index bdf02eeee765..e7636bac7372 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,6 +18,12 @@ .endm #endif +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ .macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . @@ -27,6 +33,12 @@ .byte \pad_len .endm +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ .macro ALTERNATIVE oldinstr, newinstr, feature 140: \oldinstr @@ -55,6 +67,12 @@ */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @newinstr2. 
+ */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 976b86a325e5..c8393634ca0c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -644,6 +644,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7730c1c5c83a..189679aba703 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -63,6 +63,31 @@ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection + +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) + .endm + #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5e5cd123fdfb..e9168955c42f 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -22,7 +22,7 @@ * * Atomically reads the value of @v. */ -static inline int atomic_read(const atomic_t *v) +static __always_inline int atomic_read(const atomic_t *v) { return ACCESS_ONCE((v)->counter); } @@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. 
*/ -static inline void atomic_set(atomic_t *v, int i) +static __always_inline void atomic_set(atomic_t *v, int i) { v->counter = i; } @@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i) * * Atomically adds @i to @v. */ -static inline void atomic_add(int i, atomic_t *v) +static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v) * * Atomically subtracts @i from @v. */ -static inline void atomic_sub(int i, atomic_t *v) +static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -static inline void atomic_inc(atomic_t *v) +static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); @@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ -static inline void atomic_dec(atomic_t *v) +static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); @@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic_dec_and_test(atomic_t *v) +static __always_inline int atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. 
*/ -static inline int atomic_inc_and_test(atomic_t *v) +static __always_inline int atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline int atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } @@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } @@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v) * * Atomically subtracts @i from @v and returns @v - @i */ -static inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); } @@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } @@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. 
*/ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -213,7 +213,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) * Atomically adds 1 to @v * Returns the new value of @u */ -static inline short int atomic_inc_short(short int *v) +static __always_inline short int atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index f8d273e18516..b965f9e03f2a 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. 
*/ -static inline void atomic64_dec(atomic64_t *v) +static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 47c8e32f621a..b6f7457d12e4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -8,7 +8,7 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteBack + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent @@ -35,9 +35,11 @@ int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); @@ -48,10 +50,12 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); int 
set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h deleted file mode 100644 index de1cdaf4d743..000000000000 --- a/arch/x86/include/asm/dwarf2.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _ASM_X86_DWARF2_H -#define _ASM_X86_DWARF2_H - -#ifndef __ASSEMBLY__ -#warning "asm/dwarf2.h should be only included in pure assembly files" -#endif - -/* - * Macros for dwarf2 CFI unwind table entries. - * See "as.info" for details on these pseudo ops. Unfortunately - * they are only supported in very new binutils, so define them - * away for older version. - */ - -#ifdef CONFIG_AS_CFI - -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined -#define CFI_ESCAPE .cfi_escape - -#ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame -#else -#define CFI_SIGNAL_FRAME -#endif - -#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) - /* - * Emit CFI data in .debug_frame sections, not .eh_frame sections. - * The latter we currently just discard since we don't do DWARF - * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. 
- */ - .cfi_sections .debug_frame -#endif - -#else - -/* - * Due to the structure of pre-exisiting code, don't use assembler line - * comment character # to ignore the arguments. Instead, use a dummy macro. - */ -.macro cfi_ignore a=0, b=0, c=0, d=0 -.endm - -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore -#define CFI_DEF_CFA_REGISTER cfi_ignore -#define CFI_DEF_CFA_OFFSET cfi_ignore -#define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_ESCAPE cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore - -#endif - -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - 
CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - -#endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index dc5fa661465f..df002992d8fd 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* @@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 3b629f47eb65..793179cf8e21 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include -#include /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. 
This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 0f5fb6b6567e..7178043b0e1d 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; @@ -33,6 +34,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 36f7125945e3..5fa9fb0f8809 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); - -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int 
hpet_assign_irq(struct irq_domain *domain, + struct hpet_dev *dev, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..6615032e19c8 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void invalidate_interrupt0(void); -extern asmlinkage void invalidate_interrupt1(void); -extern asmlinkage void invalidate_interrupt2(void); -extern asmlinkage void invalidate_interrupt3(void); -extern asmlinkage void invalidate_interrupt4(void); -extern asmlinkage void invalidate_interrupt5(void); -extern asmlinkage void invalidate_interrupt6(void); -extern asmlinkage void invalidate_interrupt7(void); -extern asmlinkage void invalidate_interrupt8(void); -extern asmlinkage void invalidate_interrupt9(void); -extern asmlinkage void invalidate_interrupt10(void); -extern asmlinkage void invalidate_interrupt11(void); -extern asmlinkage void invalidate_interrupt12(void); -extern asmlinkage void invalidate_interrupt13(void); -extern asmlinkage void invalidate_interrupt14(void); -extern asmlinkage void invalidate_interrupt15(void); -extern asmlinkage void invalidate_interrupt16(void); -extern asmlinkage void invalidate_interrupt17(void); -extern asmlinkage void invalidate_interrupt18(void); -extern asmlinkage void invalidate_interrupt19(void); -extern asmlinkage void invalidate_interrupt20(void); -extern asmlinkage void 
invalidate_interrupt21(void); -extern asmlinkage void invalidate_interrupt22(void); -extern asmlinkage void invalidate_interrupt23(void); -extern asmlinkage void invalidate_interrupt24(void); -extern asmlinkage void invalidate_interrupt25(void); -extern asmlinkage void invalidate_interrupt26(void); -extern asmlinkage void invalidate_interrupt27(void); -extern asmlinkage void invalidate_interrupt28(void); -extern asmlinkage void invalidate_interrupt29(void); -extern asmlinkage void invalidate_interrupt30(void); -extern asmlinkage void invalidate_interrupt31(void); - extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ -#ifdef CONFIG_IRQ_REMAP -/* Intel specific interrupt remapping information */ -struct irq_2_iommu { - struct intel_iommu *iommu; - u16 irte_index; - u16 sub_handle; - u8 irte_mask; -}; - -/* AMD specific interrupt remapping information */ -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; -#endif /* CONFIG_IRQ_REMAP */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct pci_dev; +struct msi_desc; -struct irq_cfg { 
- cpumask_var_t domain; - cpumask_var_t old_domain; - u8 vector; - u8 move_in_progress : 1; -#ifdef CONFIG_IRQ_REMAP - u8 remapped : 1; +enum irq_alloc_type { + X86_IRQ_ALLOC_TYPE_IOAPIC = 1, + X86_IRQ_ALLOC_TYPE_HPET, + X86_IRQ_ALLOC_TYPE_MSI, + X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_UV, +}; + +struct irq_alloc_info { + enum irq_alloc_type type; + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ union { - struct irq_2_iommu irq_2_iommu; - struct irq_2_irte irq_2_irte; - }; -#endif - union { -#ifdef CONFIG_X86_IO_APIC + int unused; +#ifdef CONFIG_HPET_TIMER struct { - struct list_head irq_2_pin; + int hpet_id; + int hpet_index; + void *hpet_data; + }; +#endif +#ifdef CONFIG_PCI_MSI + struct { + struct pci_dev *msi_dev; + irq_hw_number_t msi_hwirq; + }; +#endif +#ifdef CONFIG_X86_IO_APIC + struct { + int ioapic_id; + int ioapic_pin; + int ioapic_node; + u32 ioapic_trigger : 1; + u32 ioapic_polarity : 1; + u32 ioapic_valid : 1; + struct IO_APIC_route_entry *ioapic_entry; + }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; +#endif +#ifdef CONFIG_X86_UV + struct { + int uv_limit; + int uv_blade; + unsigned long uv_offset; + char *uv_name; }; #endif }; }; +struct irq_cfg { + unsigned int dest_apicid; + u8 vector; +}; + extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -150,10 +151,7 @@ static inline void 
send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} @@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); @@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern __visible void smp_reschedule_interrupt(struct pt_regs *); extern __visible void smp_call_function_interrupt(struct pt_regs *); extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern char irq_entries_start[]; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 34a5b93704d3..83ec9b1d77cc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,11 +35,13 @@ */ #define ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WT #include #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -177,6 +179,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). 
*/ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); @@ -197,8 +200,6 @@ extern void set_iounmap_nonlazy(void); #include -#include - /* * Convert a virtual cached pointer to an uncached pointer */ @@ -320,6 +321,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -338,6 +340,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2f91685fe1cd..6cbf2cfb3f8a 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +struct irq_alloc_info; +struct ioapic_domain_cfg; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 @@ -110,9 +123,6 @@ extern int 
nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -120,9 +130,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; @@ -132,6 +139,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -147,13 +156,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); 
extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); + +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int 
value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - -extern void io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) @@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL -#define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index a80cbb88ea91..8008d06581c7 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,10 @@ extern void fixup_irqs(void); extern void irq_force_complete_move(int); #endif +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); +#endif + extern void (*x86_platform_ipi_callback)(void); extern void 
native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 6224d316c405..046c7fb1ca43 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,84 +22,72 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H +#include +#include #include -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct pci_dev; -struct irq_cfg; +struct irq_alloc_info; + +enum irq_remap_cap { + IRQ_POSTING_CAP = 0, +}; #ifdef CONFIG_IRQ_REMAP +extern bool irq_remapping_cap(enum irq_remap_cap cap); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); -extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); -void irq_remap_modify_chip_defaults(struct irq_chip *chip); +extern struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); +extern struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info); + +/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. 
*/ +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); + +/* Get parent irqdomain for interrupt remapping irqdomain */ +static inline struct irq_domain *arch_get_ir_parent_domain(void) +{ + return x86_vector_domain; +} + +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; #else /* CONFIG_IRQ_REMAP */ +static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} -static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { + return NULL; } -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) { - return false; + return NULL; } + #endif /* CONFIG_IRQ_REMAP */ - -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq - #endif /* __X86_IRQ_REMAPPING_H */ 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 666c89ec4bd7..4c2d2eb2060a 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -47,31 +47,12 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff @@ -102,21 +83,23 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 -#endif - +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ @@ -155,18 +138,22 @@ static inline int 
invalid_vm86_irq(int irq) * static arrays. */ -#define NR_IRQS_LEGACY 16 +#define NR_IRQS_LEGACY 16 -#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) -#ifdef CONFIG_X86_IO_APIC -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#else /* !CONFIG_X86_IO_APIC: */ -# define NR_IRQS NR_IRQS_LEGACY +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY #endif #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000000..d26075b52885 --- /dev/null +++ b/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,63 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + 
unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif + +#endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 1f5a86d518db..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -17,11 +17,16 @@ #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ + +/* MCG_EXT_CTL register defines */ +#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ /* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ @@ -104,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -117,8 +123,19 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - 
__reserved_0 : 63; + /* + * overflow recovery cpuid bit indicates that overflow + * conditions are not fatal + */ + __u64 overflow_recov : 1, + + /* + * SUCCOR stands for S/W UnCorrectable error COntainment + * and Recovery. It indicates support for data poisoning + * in HW and deferred error interrupts. + */ + succor : 1, + __reserved_0 : 62; }; extern struct mce_vendor_flags mce_flags; @@ -168,12 +185,16 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); +void lmce_clear(void); +void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} +static inline void lmce_clear(void) {} +static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -223,6 +244,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h new file mode 100644 index 000000000000..93724cc62177 --- /dev/null +++ b/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/asm/msr-index.h similarity index 99% rename from arch/x86/include/uapi/asm/msr-index.h rename to arch/x86/include/asm/msr-index.h index 3c6bb342a48f..9ebc3d009373 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -56,6 +56,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 
0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 @@ -380,6 +381,7 @@ #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_LMCE (1<<20) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index de36f22eb0b9..e6a707eb5081 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -1,13 +1,14 @@ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H -#include +#include "msr-index.h" #ifndef __ASSEMBLY__ #include #include #include +#include struct msr { union { @@ -205,8 +206,13 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ - (u32)((val) >> 32)) +/* + * 64-bit version of wrmsr_safe(): + */ +static inline int wrmsrl_safe(u32 msr, u64 val) +{ + return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); +} #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index f768f6298419..b94f6f64e23d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. 
*/ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -48,14 +48,13 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) @@ -127,4 +122,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8766c7c395c2..a6b8f9fadb06 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -160,13 +160,14 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. 
(Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. */ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 91bc4ba95f91..ca6c228d5e62 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -4,14 +4,9 @@ #include #include -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(void); +void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 4e370a5d8117..d8c80ff32e8c 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. 
*/ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..2562e303405b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back + * - request is write-through, return cannot be write-back + * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && - new_pcm == _PAGE_CACHE_MODE_WB)) { + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 78f0c8cbe316..13f310bfc09a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -367,6 +367,9 @@ extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); +#define pgprot_writethrough pgprot_writethrough +extern pgprot_t pgprot_writethrough(pgprot_t prot); + /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING diff 
--git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a90f8972dad5..a4a77286cb1d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -5,12 +5,14 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); -void ia32_sysenter_target(void); +void entry_SYSCALL_64(void); +void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aeb4666e0c0a..2270e41b32fd 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. 
+ * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). + */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b4bdec3e9523..225ee545e1a0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -177,8 +177,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); @@ -197,9 +195,13 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; /* diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 5a77593fdace..0fb46482dfde 100644 --- a/arch/x86/include/asm/topology.h +++ 
b/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 4cab890007a7..38a09a13a9bc 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -100,6 +100,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +/* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 4e49d7dff78e..c5380bea2a36 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif extern enum ctx_state ist_enter(struct pt_regs *regs); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7c8ad3451988..f5dcb5204dcd 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; + case 8: + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + return ret; } } return __copy_to_user_ll(to, from, n); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f58a9c7a3c86..48d34d28f5a6 100644 --- a/arch/x86/include/asm/x86_init.h 
+++ b/arch/x86/include/asm/x86_init.h @@ -171,38 +171,17 @@ struct x86_platform_ops { }; struct pci_dev; -struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; - struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index 155e51048fa4..c41f4fe25483 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -1,8 +1,6 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#include - #ifndef __ASSEMBLY__ #include diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index d0acb658c8f4..7528dcf59691 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -103,7 +103,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct 
mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +113,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. + */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index febaf180621b..0f15af41bd80 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index dbe76a14c3c9..e49ee24da85e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int 
polarity) -{ - int irq, node; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0); - if (irq > 0) - mp_unmap_irq(irq); -} - -static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; @@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 
0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } @@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index ae693b51ed8e..8c35df468104 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7fe097235376..c42827eb86cf 100644 --- a/arch/x86/kernel/alternative.c +++ 
b/arch/x86/kernel/alternative.c @@ -231,6 +231,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops = p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6a7c23ff21d3..ede92c3364d3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index 816f36e979ad..ae50d3454d78 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,78 +16,112 @@ #include #include #include +#include #include #include #include +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; 
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; + + if (nr_irqs > 1 || !info) + return -EINVAL; + + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. */ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; +} + +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct irq_cfg *cfg; struct ht_irq_msg msg; - unsigned dest; - int err; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + struct irq_cfg *cfg = irqd_cfg(irq_data); + msg.address_hi = 
HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - - dev_dbg(&dev->dev, "irq %d for HT\n", irq); - - return 0; + write_ht_irq_msg(irq_data->irq, &msg); +} + +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; + + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} + +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; + +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f4dc2462a1ac..845dc0df2002 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. 
Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth preserving: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required rewriting the index register for a read-modify-write + * operation as the chip lost the index information which was + * set up for the read already. We cache the data now, so that + * workaround has been removed. */ #include @@ -31,13 +41,13 @@ #include #include #include -#include #include #include #include /* time_after() */ #include #include +#include #include #include #include @@ -63,27 +73,31 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; -struct mp_pin_info { +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + +struct mp_chip_data { + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; int trigger; int polarity; - int node; - int set; u32 count; + bool isa_irq; +}; + +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; }; static struct ioapic { @@ -101,7 +115,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return &ioapics[ioapic_idx].gsi_config; } @@ -129,11 +142,16 @@ static inline int 
mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. Kernel boot option "pirq=" @@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); -} - -static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) @@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
- */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } @@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. 
We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. 
*/ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); + add_pin_to_irq_node(data, node, newapic, newpin); } -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); -} - -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - 
io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) +static void __unmask_ioapic(struct mp_chip_data *data) { - mask_ioapic(irqd_cfg(data)); + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void __unmask_ioapic(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. 
*/ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - struct irq_pin_list *entry; unsigned long flags; + struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + for_each_irq_pin(entry, data->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. 
*/ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -706,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. */ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -823,54 +787,56 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. */ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. 
bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; } - return polarity; } +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; @@ -879,153 +845,227 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. 
bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = 0; - break; - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; } - return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) { + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; + info->ioapic_valid = 1; +} + +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); +#endif + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int 
trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if (src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always active low level + * triggered. + */ + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; + } + } +} + +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + +static void mp_register_handler(unsigned int irq, unsigned long trigger) +{ + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attirbutes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. 
+ */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, data->trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, + struct irq_alloc_info *info) +{ + bool legacy = false; int irq = -1; - int ioapic = (int)(long)domain->host_data; int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; } - return irq > 0 ? irq : -1; + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). 
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list assoicated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } + } + + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). 
- * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. - */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - info->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; - } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? 
irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1165,7 +1198,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - 
irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? "fasteoi" : "edge"); -} - -int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
- */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. 
- */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } -} - -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) -{ - int i; - - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); - } -} - void ioapic_zap_locks(void) { raw_spin_lock_init(&ioapic_lock); } +static void io_apic_print_entries(unsigned int apic, unsigned int 
nr_entries) +{ + int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + + printk(KERN_DEBUG "IOAPIC %d:\n", apic); + for (i = 0; i <= nr_entries; i++) { + entry = ioapic_read_entry(apic, i); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", + entry.dest, entry.delivery_mode); + } +} + static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) + continue; + data = irq_get_chip_data(irq); + if (!data) + continue; + if (list_empty(&data->irq_2_pin)) continue; - cfg = irq_cfg(irq); - if (!cfg) - continue; - if (list_empty(&cfg->irq_2_pin)) - continue; printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1564,15 +1441,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1582,7 +1456,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void) * This is not complete - we should 
be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. 
*/ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; + int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - - eoi_ioapic_irq(irq, cfg); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(data, cfg, masked); + ioapic_irqd_unmask(irq_data, masked); +} + +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. 
+ */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, data); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; } static struct irq_chip ioapic_chip __read_mostly = { @@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); entry1.delivery_mode 
= dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + struct irq_alloc_info info; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2174,7 +2061,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2215,23 +2101,21 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. 
+ /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2251,8 +2135,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2329,36 +2213,35 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); 
- if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } @@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) @@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; @@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + irq_set_affinity(irq, mask); } } @@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base) } 
for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; -} - -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - int ioapic = (int)(long)domain->host_data; - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; - - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); - - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. - */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; - - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. 
- */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); - - return io_apic_setup_irq_pin(virq, info->node, &attr); } -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) { - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; - int pin = (int)data->hwirq; - - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. 
+ */ + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; } -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 1 : 0; + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; } - mutex_unlock(&ioapic_mutex); - return ret; + INIT_LIST_HEAD(&data->irq_2_pin); + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = (domain->parent == x86_vector_domain) ? 
+ &ioapic_chip : &ioapic_ir_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); + + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); + + return 0; } -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + struct irq_data *irq_data; + struct mp_chip_data *data; - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); - - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&data->irq_2_pin)); + kfree(irq_data->chip_data); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); } + +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + 
raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; +} + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d6ba2d660dc5..1a9d735e09c6 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. + * Jiang Liu + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,22 +16,23 @@ #include #include #include +#include #include #include #include #include #include -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static struct irq_domain *msi_default_domain; + +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = irqd_cfg(data); msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? 
MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -50,237 +53,305 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); - - return 0; -} - -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - __get_cached_msi_msg(data->msi_desc, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - __pci_write_msi_msg(data->msi_desc, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, * which implement the MSI or MSI-X Capability Structure. 
*/ -static struct irq_chip msi_chip = { +static struct irq_chip pci_msi_controller = { .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) -{ - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; - - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; - - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); - - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. - */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); - - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); - - return 0; -} - int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + struct irq_domain *domain; + struct irq_alloc_info info; - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - node = dev_to_node(&dev->dev); + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) - return -ENOSPC; - - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_free_hwirq(irq); - return 
ret; - } - - } - return 0; + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); } void native_teardown_msi_irq(unsigned int irq) { - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } -#ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; - struct msi_msg msg; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; + return arg->msi_hwirq; } -static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } + + return 0; +} + +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} + +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; + +static struct 
msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +void arch_init_msi_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); +} + +#ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; + +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) +{ + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); +} +#endif + +#ifdef CONFIG_DMAR_TABLE +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} + +static struct irq_chip dmar_msi_controller = { + .name = "DMAR-MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static irq_hw_number_t dmar_msi_get_hwirq(struct 
msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->dmar_id; +} + +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); return 0; } + +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; + +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, +}; + +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); + + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); + mutex_unlock(&dmar_lock); + + return dmar_domain; +} + +int dmar_alloc_hwirq(int id, int node, void *arg) +{ + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; + + if (!domain) + return -1; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; + + return irq_domain_alloc_irqs(domain, 1, node, &info); +} + +void dmar_free_hwirq(int irq) +{ + irq_domain_free_irqs(irq, 1); +} #endif /* * MSI message composition */ #ifdef CONFIG_HPET_TIMER - -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) +static inline int hpet_dev_id(struct irq_domain *domain) { - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; + struct msi_domain_info *info = msi_get_domain_info(domain); - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; 
- - hpet_msi_read(data->handler_data, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - hpet_msi_write(data->handler_data, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; + return (int)(long)info->data; } -static struct irq_chip hpet_msi_type = { - .name = "HPET_MSI", +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(data->handler_data, msg); +} + +static struct irq_chip hpet_msi_controller = { + .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) { - struct irq_chip *chip = &hpet_msi_type; - struct msi_msg msg; - int ret; + return arg->hpet_index; +} - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned 
int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, +}; + +struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct irq_domain *parent; + struct irq_alloc_info info; + struct msi_domain_info *domain_info; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + return msi_create_irq_domain(NULL, domain_info, parent); +} + +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} #endif diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6cedd7914581..28eba2d38b15 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,15 +13,28 @@ #include #include #include -#include #include +#include #include #include #include #include #include +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask; +static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -34,71 +49,59 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg *irq_cfg(unsigned int irq) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { - return irq_get_chip_data(irq); + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + + return irq_data->chip_data; } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { - return irq_data->chip_data; + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? 
&data->cfg : NULL; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +static void free_apic_chip_data(struct apic_chip_data *data) { - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } - - cfg = alloc_irq_cfg(at, node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; } -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) -{ - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); -} - -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! 
The local APIC isn't very good at handling @@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; - cpumask_var_t tmp_mask; - if (cfg->move_in_progress) + if (d->move_in_progress) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask, mask); + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - if (cpumask_subset(tmp_mask, cfg->domain)) { + if (cpumask_subset(vector_cpumask, d->domain)) { err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) + if (cpumask_equal(vector_cpumask, d->domain)) break; /* * New cpumask using the vector is a proper subset of * the current in use mask. So cleanup the vector * allocation for the members that are not used anymore. 
*/ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); + cpumask_andnot(d->old_domain, d->domain, + vector_cpumask); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); + cpumask_and(d->domain, d->domain, vector_cpumask); break; } @@ -157,16 +157,18 @@ next: } if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + cpumask_or(d->old_domain, d->old_domain, + vector_cpumask); + cpumask_andnot(vector_cpumask, mask, d->old_domain); + cpu = cpumask_first_and(vector_cpumask, + cpu_online_mask); continue; } if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) { if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) goto next; @@ -174,55 +176,73 @@ next: /* Found one! 
*/ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); + if (d->cfg.vector) { + cpumask_copy(d->old_domain, d->domain); + d->move_in_progress = + cpumask_intersects(d->old_domain, cpu_online_mask); } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); + d->cfg.vector = vector; + cpumask_copy(d->domain, vector_cpumask); err = 0; break; } - free_cpumask_var(tmp_mask); + + if (!err) { + /* cache destination APIC IDs into cfg->dest_apicid */ + err = apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid); + } return err; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct apic_chip_data *data, + const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + +static void clear_irq_vector(int irq, struct apic_chip_data *data) { int cpu, vector; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + vector = data->cfg.vector; + 
for_each_cpu_and(cpu, data->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { + if (likely(!data->move_in_progress)) { raw_spin_unlock_irqrestore(&vector_lock, flags); return; } - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg) break; } } - cfg->move_in_progress = 0; + data->move_in_progress = 0; raw_spin_unlock_irqrestore(&vector_lock, flags); } +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + clear_irq_vector(virq + i, irq_data->chip_data); + free_apic_chip_data(irq_data->chip_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_data[virq + i] = NULL; +#endif + irq_domain_reset_irq_data(irq_data); + } + } +} + +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct apic_chip_data *data; + struct irq_data *irq_data; + int i, err; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & 
X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; + else +#endif + data = alloc_apic_chip_data(irq_data->node); + if (!data) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = data; + irq_data->hwirq = virq + i; + err = assign_irq_vector_policy(virq, irq_data->node, data, + info); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void) return nr_legacy_irqs(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct apic_chip_data *data; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * ISA_IRQ_VECTOR(i) for all cpu's. 
+ */ + for (i = 0; i < nr_legacy_irqs(); i++) { + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); + + data->cfg.vector = ISA_IRQ_VECTOR(i); + cpumask_setall(data->domain); + irq_set_chip_data(i, data); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + + arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); + + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } @@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ int irq, vector; - struct irq_cfg *cfg; + struct apic_chip_data *data; /* * vector_lock will make sure that we don't run into irq vector @@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!data) continue; - if (!cpumask_test_cpu(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; + vector = data->cfg.vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu) if (irq <= VECTOR_UNDEFINED) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(irq_get_irq_data(irq)); + if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } raw_spin_unlock(&vector_lock); @@ -314,20 +454,20 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq; 
__setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; + struct apic_chip_data *data = irq_data->chip_data; + int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) return -EPERM; - if (!cpumask_intersects(mask, cpu_online_mask)) + if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + err = assign_irq_vector(irq, data, dest); if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) + struct irq_data *top = irq_get_irq_data(irq); + + if (assign_irq_vector(irq, data, top->affinity)) pr_err("Failed to recover vector for irq %d\n", irq); return err; } - cpumask_copy(data->affinity, mask); - - return 0; + return IRQ_SET_MASK_OK; } +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_retrigger = 
apic_retrigger_irq, +}; + #ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) +static void __send_cleanup_vector(struct apic_chip_data *data) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + for_each_cpu_and(i, data->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } - cfg->move_in_progress = 0; + data->move_in_progress = 0; +} + +void send_cleanup_vector(struct irq_cfg *cfg) +{ + struct apic_chip_data *data; + + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { int irq; unsigned int irr; struct irq_desc *desc; - struct irq_cfg *cfg; + struct apic_chip_data *data; irq = __this_cpu_read(vector_irq[vector]); @@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) if (!desc) continue; - cfg = irq_cfg(irq); - if (!cfg) + data = apic_chip_data(&desc->irq_data); + if (!data) continue; raw_spin_lock(&desc->lock); @@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) * Check if the irq migration is in progress. If so, we * haven't received the cleanup request yet for this irq. 
*/ - if (cfg->move_in_progress) + if (data->move_in_progress) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -450,20 +594,21 @@ unlock: raw_spin_unlock(&desc->lock); } - irq_exit(); + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -475,46 +620,11 @@ void irq_force_complete_move(int irq) { struct irq_cfg *cfg = irq_cfg(irq); - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); + if (cfg) + __irq_complete_move(cfg, cfg->vector); } #endif -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! 
- */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(irq, node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(irq, cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); -} - static void __init print_APIC_field(int base) { int i; diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9194..3ffd925655e0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index b27f6ec90caa..8e3d22a1af94 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -68,7 +68,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index dcaab87da629..d8f42f902a0f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -66,7 +66,7 @@ int 
main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 56cae1964a81..dd3a4baffe50 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -295,7 +295,7 @@ static int nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u32 cores_per_cu = 1; @@ -348,7 +348,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); @@ -433,7 +433,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? 
*/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b28e5262a0a5..9fc5e3d9d9c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -491,7 +491,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -827,7 +827,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; @@ -1009,7 +1009,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1138,10 +1138,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1189,10 +1185,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. 
@@ -1201,7 +1197,7 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c336..be4febc58b94 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 95cf78d44ab4..df919ff103c3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1053,6 +1053,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; prev_state = ist_enter(regs); @@ -1080,11 +1081,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of 
the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. + * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1161,8 +1171,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. */ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. 
Otherwise @@ -1643,10 +1663,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); break; + } + default: break; } @@ -1982,6 +2008,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2005,6 +2032,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) @@ -2014,11 +2043,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37cae8..e99b15077e94 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,19 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. 
See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include #include @@ -32,6 +26,7 @@ #include #include #include +#include #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +42,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void 
deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -262,6 +304,73 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + + if (threshold_err) + m.misc = misc; + + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + + mce_log(&m); + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + 
exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } } /* @@ -273,12 +382,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b4a41cf030ed..844f56c5616d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. 
+ */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. + */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -405,8 +435,22 @@ static void intel_init_cmci(void) cmci_recheck(); } +void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 939155ffdece..aad4bd84b475 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff22e..70d7c93f4550 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry 
below 1M? Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c6ba..3b533cf37c74 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Arguments: + * repeat - Set to 1 when 
[start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. + */ +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return 0xFF; + *uniform = 1; - if (!mtrr_state.enabled) - return 0xFF; - - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. 
- * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. - * Note: This way we handle multiple overlaps as well. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * + * Note: This way we handle overlaps with multiple + * entries and the default type properly. */ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * 0xFF - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, 
i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } + + /* + * Look up the variable ranges. Look for multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + type = MTRR_TYPE_WRBACK; + +out: + *uniform = is_uniform; return type; } @@ -347,7 +408,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? 
+ "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -360,7 +423,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -382,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -426,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1948..e7ed0d8ebacb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned 
long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The caller should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. */ @@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. */ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! 
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -745,10 +760,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -774,6 +795,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -782,6 +806,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -810,7 +836,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -818,7 +844,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31a27..951884dcc433 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c76d3e37c6e1..e068d6683dba 100644 --- a/arch/x86/kernel/crash.c +++ 
b/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6367a780cc8c..5ee771859b6f 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include +#include #include #include #include @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) + return -EINVAL; + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - line = intspec[0]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) - return -EINVAL; - - it = &of_ioapic_type[intspec[1]]; - - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = 
mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index fe9f0b79a18b..5cb9a4d6f623 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. + * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S deleted file mode 100644 index 1c309763e321..000000000000 --- a/arch/x86/kernel/entry_32.S +++ /dev/null @@ -1,1401 +0,0 @@ -/* - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. 
- * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? 
- jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl_cfi $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ -.endm - -.macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ - .if \pop <> 0 - add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - 
CFI_REL_OFFSET ebx, 0 - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - -ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi - jmp syscall_exit - CFI_ENDPROC -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # 
Reset kernel eflags - popfl_cfi - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit - CFI_ENDPROC -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN - RING0_PTREGS_FRAME -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz restore_all - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - CFI_ENDPROC - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. 
*/ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi - orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 - - pushl_cfi %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. 
- */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - - CFI_ENDPROC -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway - ASM_CLAC - pushl_cfi %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. 
- movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. 
- */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi $__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 - jmp restore_nocheck -#endif - CFI_ENDPROC -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? 
- jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - CFI_ENDPROC - - RING0_INT_FRAME # can't unwind into user space anyway -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - CFI_ENDPROC - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. 
This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. - */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax - lss (%esp), %esp /* switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - RING0_INT_FRAME - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -4 - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - CFI_ENDPROC - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - RING0_INT_FRAME; \ - ASM_CLAC; \ - pushl_cfi $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ - CFI_ENDPROC; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) 
-#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. interrupts come from */ -#include - -ENTRY(coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error - jmp error_code - CFI_ENDPROC -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl_cfi $do_simd_coprocessor_error -#endif - jmp error_code - CFI_ENDPROC -END(simd_coprocessor_error) - -ENTRY(device_not_available) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available - jmp error_code - CFI_ENDPROC -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow - jmp error_code - CFI_ENDPROC -END(overflow) - -ENTRY(bounds) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds - jmp error_code - CFI_ENDPROC -END(bounds) - -ENTRY(invalid_op) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op - jmp error_code - CFI_ENDPROC -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun - jmp error_code - CFI_ENDPROC -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_invalid_TSS - jmp error_code - CFI_ENDPROC -END(invalid_TSS) - -ENTRY(segment_not_present) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_segment_not_present - jmp error_code - 
CFI_ENDPROC -END(segment_not_present) - -ENTRY(stack_segment) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_stack_segment - jmp error_code - CFI_ENDPROC -END(stack_segment) - -ENTRY(alignment_check) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_alignment_check - jmp error_code - CFI_ENDPROC -END(alignment_check) - -ENTRY(divide_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error - jmp error_code - CFI_ENDPROC -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi machine_check_vector - jmp error_code - CFI_ENDPROC -END(machine_check) -#endif - -ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug - jmp error_code - CFI_ENDPROC -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - RING0_INT_FRAME - addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 - jmp sysenter_past_esp - CFI_ENDPROC - -ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. */ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr - CFI_ENDPROC -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. 
Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. -ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl_cfi %eax - lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 - jz 5f - jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - CFI_ENDPROC - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx 
- popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. - */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! 
CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $trace_do_page_fault - jmp error_code - CFI_ENDPROC -END(trace_page_fault) -#endif - -ENTRY(page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get 
the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -END(page_fault) - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 -.endm - -ENTRY(debug) - RING0_INT_FRAME - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. 
- */ -ENTRY(nmi) - RING0_INT_FRAME - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl_cfi %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl_cfi %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. 
- * - * create the pointer to lss back - */ - pushl_cfi %ss - pushl_cfi %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl_cfi 16(%esp) - .endr - pushl_cfi %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return -#endif - CFI_ENDPROC -END(nmi) - -ENTRY(int3) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -END(int3) - -ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection - jmp error_code - CFI_ENDPROC -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_async_page_fault - jmp error_code - CFI_ENDPROC -END(async_page_fault) -#endif - diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7e429c99c728..0e2d96ffd158 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -557,7 +557,7 @@ early_idt_handler_common: cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -610,7 +610,7 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler_common) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index df7e78057ae0..e5c27f729a38 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -346,7 +346,7 @@ early_idt_handler_common: cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -411,7 +411,7 @@ early_idt_handler_common: popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # 
drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler_common) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3acbff4716b0..10757d0a3fcf 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_enable_legacy_int(); } else { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); disable_irq(hdev->irq); irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); @@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta, static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { @@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta, return hpet_next_event(delta, evt, hdev->num); } -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); - return -EINVAL; - } - return 0; -} - -static int hpet_assign_irq(struct hpet_dev *dev) -{ - unsigned int irq = irq_alloc_hwirq(-1); - - if (!irq) - return -EINVAL; - - irq_set_handler_data(irq, dev); - - if (hpet_setup_msi_irq(irq)) - return -EINVAL; - - dev->irq = irq; - return 0; -} - static irqreturn_t hpet_interrupt_handler(int irq, void *data) { struct hpet_dev *dev = (struct hpet_dev *)data; @@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if 
(hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -604,12 +580,14 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; + sprintf(hdev->name, "hpet%d", i); hdev->num = i; - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq <= 0) continue; + hdev->irq = irq; hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -709,10 +687,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index e7cc5370cd2f..16cb827a5b27 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi) 
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e5952c225532..88b366487b0e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) @@ -135,6 +147,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt 
wakeup event\n"); #endif return 0; } @@ -192,8 +216,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - irq_enter(); - exit_idle(); + entering_irq(); irq = __this_cpu_read(vector_irq[vector]); @@ -209,7 +232,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -237,6 +260,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -244,16 +279,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); + exiting_irq(); + set_irq_regs(old_regs); +} - irq_exit(); +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
+ */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); set_irq_regs(old_regs); } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index f9fd86a7fcc7..cd74f5978ab9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 394e643d7830..bc4604e500a3 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 15d741ddfeeb..dc5fa6a1e8d6 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -10,12 +10,6 @@ #include #include -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff 
--git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..a3a5e158ed69 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i; x86_init.irqs.intr_init(); } @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -144,6 +148,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 415480d3ea84..11546b462fa6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 2d2a237f2c73..30ca7607cbbb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ 
#include #include #include -#include +#include #include #include #include @@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c614dd492f5f..58bcfb67c01f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1fa86782186..8aa05583bc42 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -55,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index deff651835b4..c09c99ccf3e3 100644 --- a/arch/x86/kernel/process_32.c +++ 
b/arch/x86/kernel/process_32.c @@ -303,13 +303,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c50e013b57d2..843f92e4c711 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -410,9 +410,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Reload esp0 and ss1. This changes current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cba828892790..265a6fdea8b7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1224,8 +1224,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index be8e1bde07aa..15aaa69bbb5e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void 
smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. */ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6d4bfea25874..8add66b22f33 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -514,6 +514,40 @@ void __inquire_remote_apic(int apicid) } } +/* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. 
+ * + * Cmdline "cpu_init_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UDELAY_10MS_DEFAULT; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UDELAY_10MS_DEFAULT) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + init_udelay = 0; +} + /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this @@ -556,7 +590,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -584,7 +618,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); @@ -652,6 +686,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP.
*/ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); @@ -793,8 +828,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* @@ -1177,6 +1210,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 36cb15b7b367..f5791927aa64 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -73,8 +73,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include - -asmlinkage int system_call(void); +#include #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -769,18 +768,6 @@ dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. 
*/ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ } dotraplinkage void @@ -906,13 +893,13 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 234b0722de53..3cee10abf01d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ @@ -141,13 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, - .set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 27f8eea0d6eb..f2dc08c003eb 100644 --- a/arch/x86/lguest/boot.c +++ 
b/arch/x86/lguest/boot.c @@ -90,7 +90,7 @@ struct lguest_data lguest_data = { .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = SYSCALL_VECTOR, + .syscall_vec = IA32_SYSCALL_VECTOR, }; /*G:037 @@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { /* Some systems map "vectors" to interrupts weirdly. Not us! */ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); - if (i != SYSCALL_VECTOR) + if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 1530afb07c85..f2587888d987 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,7 +17,6 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o -lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o @@ -40,6 +39,6 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 00933d5e992f..9b0ca8fe80fc 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -11,26 +11,23 @@ #include #include -#include /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl_cfi + pushfl cli .endm .macro UNLOCK reg - popfl_cfi + popfl .endm #define BEGIN(op) \ .macro endp; \ - CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ .purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ - CFI_STARTPROC; \ LOCK v; #define ENDP endp diff --git a/arch/x86/lib/atomic64_cx8_32.S 
b/arch/x86/lib/atomic64_cx8_32.S index 082a85167a5b..db3ae85440ff 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -11,7 +11,6 @@ #include #include -#include .macro read64 reg movl %ebx, %eax @@ -22,16 +21,11 @@ .endm ENTRY(atomic64_read_cx8) - CFI_STARTPROC - read64 %ecx ret - CFI_ENDPROC ENDPROC(atomic64_read_cx8) ENTRY(atomic64_set_cx8) - CFI_STARTPROC - 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) - CFI_STARTPROC - 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx - pushl_cfi_reg esi - pushl_cfi_reg edi + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi movl %eax, %esi movl %edx, %edi @@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %edi + popl %esi + popl %ebx + popl %ebp ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -93,8 +81,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -119,8 +105,7 @@ incdec_return inc add adc incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) - 
CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx + pushl %ebp + pushl %ebx /* these just push these two parameters on the stack */ - pushl_cfi_reg edi - pushl_cfi_reg ecx + pushl %edi + pushl %ecx movl %eax, %ebp movl %edx, %edi @@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8) movl $1, %eax 3: addl $8, %esp - CFI_ADJUST_CFA_OFFSET -8 - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %ebx + popl %ebp ret 4: cmpl %edx, 4(%esp) jne 2b xorl %eax, %eax jmp 3b - CFI_ENDPROC ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 9bc944a91274..c1e623209853 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -26,7 +26,6 @@ */ #include -#include #include #include @@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * alignment for the unrolled loop. 
*/ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -129,10 +127,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #else @@ -140,9 +137,8 @@ ENDPROC(csum_partial) /* Version for PentiumII/PPro */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -249,10 +245,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #endif @@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define FP 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %edi + pushl %esi + pushl %ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -401,12 +394,11 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi_reg ebx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi %ecx # equivalent to addl $4,%esp + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #else @@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg edi - pushl_cfi_reg esi + pushl %ebx + pushl %edi + pushl %esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -489,11 +480,10 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - 
popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebx + popl %esi + popl %edi + popl %ebx ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #undef ROUND diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index e67e579c93bd..a2fe51b00cce 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,4 @@ #include -#include #include #include @@ -15,7 +14,6 @@ * %rdi - page */ ENTRY(clear_page) - CFI_STARTPROC ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ "jmp clear_page_c_e", X86_FEATURE_ERMS @@ -24,11 +22,9 @@ ENTRY(clear_page) xorl %eax,%eax rep stosq ret - CFI_ENDPROC ENDPROC(clear_page) ENTRY(clear_page_orig) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx @@ -48,14 +44,11 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret - CFI_ENDPROC ENDPROC(clear_page_orig) ENTRY(clear_page_c_e) - CFI_STARTPROC movl $4096,%ecx xorl %eax,%eax rep stosb ret - CFI_ENDPROC ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 40a172541ee2..9b330242e740 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -6,7 +6,6 @@ * */ #include -#include #include .text @@ -21,7 +20,6 @@ * %al : Operation successful */ ENTRY(this_cpu_cmpxchg16b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -32,7 +30,7 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). 
# - pushfq_cfi + pushfq cli cmpq PER_CPU_VAR((%rsi)), %rax @@ -43,17 +41,13 @@ CFI_STARTPROC movq %rbx, PER_CPU_VAR((%rsi)) movq %rcx, PER_CPU_VAR(8(%rsi)) - CFI_REMEMBER_STATE - popfq_cfi + popfq mov $1, %al ret - CFI_RESTORE_STATE .Lnot_same: - popfq_cfi + popfq xor %al,%al ret -CFI_ENDPROC - ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index b4807fce5177..ad5349778490 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,6 @@ */ #include -#include .text @@ -20,14 +19,13 @@ * %ecx : high 32 bits of new value */ ENTRY(cmpxchg8b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg8b (%esi)' on UP except we don't # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # - pushfl_cfi + pushfl cli cmpl (%esi), %eax @@ -38,18 +36,15 @@ CFI_STARTPROC movl %ebx, (%esi) movl %ecx, 4(%esi) - CFI_REMEMBER_STATE - popfl_cfi + popfl ret - CFI_RESTORE_STATE .Lnot_same: movl (%esi), %eax .Lhalf_same: movl 4(%esi), %edx - popfl_cfi + popfl ret -CFI_ENDPROC ENDPROC(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 8239dbcbf984..009f98216b7e 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,6 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include #include #include @@ -13,22 +12,16 @@ */ ALIGN ENTRY(copy_page) - CFI_STARTPROC ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret - CFI_ENDPROC ENDPROC(copy_page) ENTRY(copy_page_regs) - CFI_STARTPROC subq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx, (%rsp) - CFI_REL_OFFSET rbx, 0 movq %r12, 1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 movl $(4096/64)-5, %ecx .p2align 4 @@ -87,11 +80,7 @@ ENTRY(copy_page_regs) jnz .Loop2 movq (%rsp), %rbx - CFI_RESTORE rbx movq 1*8(%rsp), %r12 - CFI_RESTORE r12 addq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET -2*8 ret - CFI_ENDPROC 
ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fa997dfaef24..982ce34f4a9b 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -16,33 +15,8 @@ #include #include - .macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) - .endm - /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -54,12 +28,10 @@ ENTRY(_copy_to_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -71,14 +43,12 @@ ENTRY(_copy_from_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_from_user) .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) bad_from_user: - CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -86,7 +56,6 @@ bad_from_user: bad_to_user: movl %edx,%eax ret - CFI_ENDPROC ENDPROC(bad_from_user) .previous @@ -104,7 +73,6 @@ ENDPROC(bad_from_user) * eax uncopied bytes or 0 if successful. 
*/ ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -186,7 +154,6 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -208,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_string) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -233,7 +199,6 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE(1b,11b) _ASM_EXTABLE(3b,12b) - CFI_ENDPROC ENDPROC(copy_user_generic_string) /* @@ -249,7 +214,6 @@ ENDPROC(copy_user_generic_string) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_enhanced_fast_string) - CFI_STARTPROC ASM_STAC movl %edx,%ecx 1: rep @@ -264,5 +228,94 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE(1b,12b) - CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. 
+ */ +ENTRY(__copy_user_nocache) + ASM_STAC + cmpl $8,%edx + jb 20f /* less then 8 bytes, go to byte copy loop */ + ALIGN_DESTINATION + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz 17f +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + decl %ecx + jnz 1b +17: movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz 20f +18: movq (%rsi),%r8 +19: movnti %r8,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decl %ecx + jnz 18b +20: andl %edx,%edx + jz 23f + movl %edx,%ecx +21: movb (%rsi),%al +22: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 21b +23: xorl %eax,%eax + ASM_CLAC + sfence + ret + + .section .fixup,"ax" +30: shll $6,%ecx + addl %ecx,%edx + jmp 60f +40: lea (%rdx,%rcx,8),%rdx + jmp 60f +50: movl %ecx,%edx +60: sfence + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) +ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S deleted file mode 100644 index 6a4f43c2d9e6..000000000000 --- a/arch/x86/lib/copy_user_nocache_64.S +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2008 Vitaly Mayatskikh - * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include -#include - -#define FIX_ALIGNMENT 1 - -#include -#include -#include -#include -#include - - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) -#endif - .endm - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - ASM_STAC - cmpl $8,%edx - jb 20f /* less then 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz 17f -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b -17: movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz 20f -18: movq (%rsi),%r8 -19: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -20: andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xorl %eax,%eax - ASM_CLAC - sfence - ret - - .section .fixup,"ax" -30: shll $6,%ecx - addl %ecx,%edx - jmp 60f -40: lea (%rdx,%rcx,8),%rdx - jmp 60f -50: movl %ecx,%edx -60: sfence - jmp 
copy_user_handle_tail - .previous - - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) - CFI_ENDPROC -ENDPROC(__copy_user_nocache) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 9734182966f3..7e48807b2fa1 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -6,7 +6,6 @@ * for more details. No warranty for anything given at all. */ #include -#include #include #include @@ -47,23 +46,16 @@ ENTRY(csum_partial_copy_generic) - CFI_STARTPROC cmpl $3*64, %edx jle .Lignore .Lignore: subq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx, 2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 movq %r14, 4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 movq %r13, 5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 movq %rbp, 6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ - CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp), %rbx - CFI_RESTORE rbx movq 3*8(%rsp), %r12 - CFI_RESTORE r12 movq 4*8(%rsp), %r14 - CFI_RESTORE r14 movq 5*8(%rsp), %r13 - CFI_RESTORE r13 movq 6*8(%rsp), %rbp - CFI_RESTORE rbp addq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET -7*8 ret - CFI_RESTORE_STATE /* Exception handlers. 
Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index a4512359656a..46668cda4ffd 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -26,7 +26,6 @@ */ #include -#include #include #include #include @@ -36,7 +35,6 @@ .text ENTRY(__get_user_1) - CFI_STARTPROC GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -45,11 +43,9 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) - CFI_STARTPROC add $1,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -60,11 +56,9 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) - CFI_STARTPROC add $3,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -75,11 +69,9 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_4) ENTRY(__get_user_8) - CFI_STARTPROC #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -104,28 +96,23 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif - CFI_ENDPROC ENDPROC(__get_user_8) bad_get_user: - CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user) #ifdef CONFIG_X86_32 bad_get_user_8: - CFI_STARTPROC xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user_8) #endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 05a95e713da8..33147fef3452 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -16,15 +16,12 @@ */ #include -#include /* * override generic version in lib/iomap_copy.c */ ENTRY(__iowrite32_copy) - CFI_STARTPROC movl %edx,%ecx rep movsd ret - CFI_ENDPROC ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 
b046664f5a1c..16698bba87de 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -2,7 +2,6 @@ #include #include -#include #include /* @@ -53,7 +52,6 @@ ENTRY(memcpy_erms) ENDPROC(memcpy_erms) ENTRY(memcpy_orig) - CFI_STARTPROC movq %rdi, %rax cmpq $0x20, %rdx @@ -178,5 +176,4 @@ ENTRY(memcpy_orig) .Lend: retq - CFI_ENDPROC ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0f8a0d0331b9..ca2afdd6d98e 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,6 @@ * - Copyright 2011 Fenghua Yu */ #include -#include #include #include @@ -27,7 +26,6 @@ ENTRY(memmove) ENTRY(__memmove) - CFI_STARTPROC /* Handle more 32 bytes in loop */ mov %rdi, %rax @@ -207,6 +205,5 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq - CFI_ENDPROC ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 93118fb23976..2661fad05827 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,6 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include #include #include @@ -66,7 +65,6 @@ ENTRY(memset_erms) ENDPROC(memset_erms) ENTRY(memset_orig) - CFI_STARTPROC movq %rdi,%r10 /* expand byte value */ @@ -78,7 +76,6 @@ ENTRY(memset_orig) movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment - CFI_REMEMBER_STATE .Lafter_bad_alignment: movq %rdx,%rcx @@ -128,7 +125,6 @@ ENTRY(memset_orig) movq %r10,%rax ret - CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%rdx jbe .Lhandle_7 @@ -139,5 +135,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: - CFI_ENDPROC ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 3ca5218fbece..c81556409bbb 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -13,9 +12,8 @@ */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushq_cfi_reg rbx - pushq_cfi_reg rbp + pushq %rbx + 
pushq %rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi - CFI_REMEMBER_STATE 1: \op 2: movl %eax, (%r10) movl %r11d, %eax /* Return value */ @@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi_reg rbp - popq_cfi_reg rbx + popq %rbp + popq %rbx ret 3: - CFI_RESTORE_STATE movl $-EIO, %r11d jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm @@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg ebp - pushl_cfi_reg esi - pushl_cfi_reg edi - pushl_cfi $0 /* Return value */ - pushl_cfi %eax + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + pushl $0 /* Return value */ + pushl %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax - CFI_REMEMBER_STATE 1: \op -2: pushl_cfi %eax +2: pushl %eax movl 4(%esp), %eax - popl_cfi (%eax) + popl (%eax) addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - popl_cfi %eax - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebp - popl_cfi_reg ebx + popl %eax + popl %edi + popl %esi + popl %ebp + popl %ebx ret 3: - CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index fc6ba17a7eec..e0817a12d323 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -11,7 +11,6 @@ * return value. */ #include -#include #include #include #include @@ -30,11 +29,9 @@ * as they get called from within inline assembly. 
*/ -#define ENTER CFI_STARTPROC ; \ - GET_THREAD_INFO(%_ASM_BX) +#define ENTER GET_THREAD_INFO(%_ASM_BX) #define EXIT ASM_CLAC ; \ - ret ; \ - CFI_ENDPROC + ret .text ENTRY(__put_user_1) @@ -87,7 +84,6 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC movl $-EFAULT,%eax EXIT END(bad_put_user) diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 2322abe4da3b..40027db99140 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -15,7 +15,6 @@ #include #include -#include #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -34,10 +33,10 @@ */ #define save_common_regs \ - pushl_cfi_reg ecx + pushl %ecx #define restore_common_regs \ - popl_cfi_reg ecx + popl %ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,50 +63,45 @@ */ #define save_common_regs \ - pushq_cfi_reg rdi; \ - pushq_cfi_reg rsi; \ - pushq_cfi_reg rcx; \ - pushq_cfi_reg r8; \ - pushq_cfi_reg r9; \ - pushq_cfi_reg r10; \ - pushq_cfi_reg r11 + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 #define restore_common_regs \ - popq_cfi_reg r11; \ - popq_cfi_reg r10; \ - popq_cfi_reg r9; \ - popq_cfi_reg r8; \ - popq_cfi_reg rcx; \ - popq_cfi_reg rsi; \ - popq_cfi_reg rdi + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi #endif /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - CFI_ENDPROC 
ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - CFI_STARTPROC /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - CFI_ENDPROC ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1d553186c434..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -40,7 +40,7 @@ */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , - [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, @@ -50,11 +50,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 2b7ece0e103a..9c0ff045fdd4 100644 --- a/arch/x86/mm/iomap_32.c +++ 
b/arch/x86/mm/iomap_32.c @@ -78,13 +78,13 @@ void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* - * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. - * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the - * MTRR is UC or WC. UC_MINUS gets the real intention, of the - * user, which is "WC if the MTRR is WC, UC if you can't do that." + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == - (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 27ff21216dfa..cc5ccc415cc0 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; @@ -172,6 +175,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, prot = __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; case _PAGE_CACHE_MODE_WB: break; } @@ -234,10 +241,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * pat_enabled() ? 
_PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use - * UC MINUS. + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). */ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; @@ -246,6 +254,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) } EXPORT_SYMBOL(ioremap_nocache); +/** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. 
+ */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + /** * ioremap_wc - map memory into CPU space write combined * @phys_addr: bus address of the memory @@ -258,14 +299,28 @@ EXPORT_SYMBOL(ioremap_nocache); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); - else - return ioremap_nocache(phys_addr, size); } EXPORT_SYMBOL(ioremap_wc); +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. 
+ */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, @@ -331,7 +386,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int arch_ioremap_pud_supported(void) +int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 return cpu_has_gbpages; @@ -340,7 +395,7 @@ int arch_ioremap_pud_supported(void) #endif } -int arch_ioremap_pmd_supported(void) +int __init arch_ioremap_pmd_supported(void) { return cpu_has_pse; } diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 6629f397b467..8ff686aa7e8c 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 89af288ec674..727158cb3b3c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - void *vend = vaddr + size - 1; + unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; mb(); - for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflushopt(vaddr); - /* - * Flush any possible final partial cacheline: - */ - clflushopt(vend); + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += boot_cpu_data.x86_clflush_size) + clflushopt(p); mb(); } @@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) phys_addr_t phys_addr; unsigned long offset; enum pg_level level; - unsigned long psize; unsigned long pmask; 
pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - psize = page_level_size(level); pmask = page_level_mask(level); offset = virt_addr & ~pmask; phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; @@ -1468,6 +1466,9 @@ int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), @@ -1502,12 +1503,10 @@ EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, enum page_cache_mode new_type) { + enum page_cache_mode set_type; int i, j; int ret; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ for (i = 0; i < addrinarray; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); @@ -1515,9 +1514,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, goto out_free; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
+ _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = change_page_attr_set(addr, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), - 1); + cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, @@ -1549,6 +1551,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray) } EXPORT_SYMBOL(set_memory_array_wc); +int set_memory_array_wt(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_memory_array_wt); + int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1571,27 +1579,42 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled) - return set_memory_uc(addr, numpages); - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) - goto out_err; + return ret; ret = _set_memory_wc(addr, numpages); if (ret) - goto out_free; + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - return 0; - -out_free: - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); -out_err: return ret; } EXPORT_SYMBOL(set_memory_wc); +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int set_memory_wt(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WT, NULL); + if (ret) + return ret; + + ret = _set_memory_wt(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL_GPL(set_memory_wt); + int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1682,6 +1705,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, { unsigned long start; unsigned long end; + enum page_cache_mode set_type; 
int i; int free_idx; int ret; @@ -1695,8 +1719,12 @@ static int _set_pages_array(struct page **pages, int addrinarray, goto err_out; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = cpa_set_pages_array(pages, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, cachemode2pgprot( @@ -1730,6 +1758,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray) } EXPORT_SYMBOL(set_pages_array_wc); +int set_pages_array_wt(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_pages_array_wt); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 35af6771a95a..188e3e07eeeb 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -33,13 +33,17 @@ #include "pat_internal.h" #include "mm_internal.h" -#ifdef CONFIG_X86_PAT -int __read_mostly pat_enabled = 1; +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static bool boot_cpu_done; + +static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) { - pat_enabled = 0; - printk(KERN_INFO "%s\n", reason); + __pat_enabled = 0; + pr_info("x86/PAT: %s\n", reason); } static int __init nopat(char *str) @@ -48,13 +52,12 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); -#else -static inline void pat_disable(const char *reason) -{ - (void)reason; -} -#endif +bool pat_enabled(void) +{ + return !!__pat_enabled; +} +EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; @@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *str) } __setup("debugpat", pat_debug_setup); -static u64 __read_mostly boot_pat_state; - #ifdef CONFIG_X86_PAT 
/* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and - * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_MODE_UC here. + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. */ -#define _PGMT_DEFAULT 0 +#define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) @@ -88,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; - if (pg_flags == _PGMT_DEFAULT) - return -1; + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else - return _PAGE_CACHE_MODE_WB; + return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, @@ -112,11 +117,12 @@ static inline void set_page_memtype(struct page *pg, case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; - case _PAGE_CACHE_MODE_WB: - memtype_flags = _PGMT_WB; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; break; + case _PAGE_CACHE_MODE_WB: default: - memtype_flags = _PGMT_DEFAULT; + memtype_flags = _PGMT_WB; break; } @@ -174,78 +180,154 @@ 
static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(void) +void pat_init_cache_modes(u64 pat) { - int i; enum page_cache_mode cache; char pat_msg[33]; - u64 pat; + int i; - rdmsrl(MSR_IA32_CR_PAT, pat); pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, pat_msg + 4 * i); update_cache_mode_entry(i, cache); } - pr_info("PAT configuration [0-7]: %s\n", pat_msg); + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) -void pat_init(void) +static void pat_bsp_init(u64 pat) { - u64 pat; - bool boot_cpu = !boot_pat_state; - - if (!pat_enabled) - return; + u64 tmp_pat; if (!cpu_has_pat) { - if (!boot_pat_state) { - pat_disable("PAT not supported by CPU."); - return; - } else { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); - } + pat_disable("PAT not supported by CPU."); + return; } - /* Set PWT to Write-Combining. 
All other bits stay the same */ - /* - * PTE encoding used in Linux: - * PAT - * |PCD - * ||PWT - * ||| - * 000 WB _PAGE_CACHE_WB - * 001 WC _PAGE_CACHE_WC - * 010 UC- _PAGE_CACHE_UC_MINUS - * 011 UC _PAGE_CACHE_UC - * PAT bit unused - */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + if (!pat_enabled()) + goto done; - /* Boot CPU check */ - if (!boot_pat_state) { - rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); - if (!boot_pat_state) { - pat_disable("PAT read returns always zero, disabled."); - return; - } + rdmsrl(MSR_IA32_CR_PAT, tmp_pat); + if (!tmp_pat) { + pat_disable("PAT MSR is 0, disabled."); + return; } wrmsrl(MSR_IA32_CR_PAT, pat); - if (boot_cpu) - pat_init_cache_modes(); +done: + pat_init_cache_modes(pat); +} + +static void pat_ap_init(u64 pat) +{ + if (!pat_enabled()) + return; + + if (!cpu_has_pat) { + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. + */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); + } + + wrmsrl(MSR_IA32_CR_PAT, pat); +} + +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). This + * setup is the same as the BIOS default setup when the system + * has PAT but the "nopat" boot option has been specified. This + * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * + * PTE encoding: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. 
+ */ + pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + + } else if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + /* + * PAT support with the lower four entries. Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WC : Reserved + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. 
+ */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + } + + if (!boot_cpu_done) { + pat_bsp_init(pat); + boot_cpu_done = true; + } else { + pat_ap_init(pat); + } } #undef PAT @@ -267,9 +349,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; @@ -324,9 +406,14 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) /* * For RAM pages, we use page flags to mark the pages with appropriate type. - * Here we do two pass: - * - Find the memtype of all the pages in the range, look for any conflicts - * - In case of no conflicts, set the new memtype for pages in the range + * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. 
*/ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, @@ -335,6 +422,12 @@ static int reserve_ram_pages_type(u64 start, u64 end, struct page *page; u64 pfn; + if (req_type == _PAGE_CACHE_MODE_WP) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); @@ -346,8 +439,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, page = pfn_to_page(pfn); type = get_page_memtype(page); - if (type != -1) { - pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + if (type != _PAGE_CACHE_MODE_WB) { + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -373,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end) for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - set_page_memtype(page, -1); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } @@ -384,6 +477,7 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. 
If new_type is non-NULL, function will return @@ -400,14 +494,10 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, BUG_ON(start >= end); /* end is exclusive */ - if (!pat_enabled) { + if (!pat_enabled()) { /* This is identical to page table setting without PAT */ - if (new_type) { - if (req_type == _PAGE_CACHE_MODE_WC) - *new_type = _PAGE_CACHE_MODE_UC_MINUS; - else - *new_type = req_type; - } + if (new_type) + *new_type = req_type; return 0; } @@ -451,9 +541,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", - start, end - 1, - cattr_name(new->type), cattr_name(req_type)); + pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -475,7 +565,7 @@ int free_memtype(u64 start, u64 end) int is_range_ram; struct memtype *entry; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Low ISA region is always mapped WB. No need to track */ @@ -497,8 +587,8 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", - current->comm, current->pid, start, end - 1); + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } @@ -517,7 +607,7 @@ int free_memtype(u64 start, u64 end) * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS - * or _PAGE_CACHE_MODE_UC + * or _PAGE_CACHE_MODE_WT. 
*/ static enum page_cache_mode lookup_memtype(u64 paddr) { @@ -529,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr) if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - page = pfn_to_page(paddr >> PAGE_SHIFT); - rettype = get_page_memtype(page); - /* - * -1 from get_page_memtype() implies RAM page is in its - * default state and not reserved, and hence of type WB - */ - if (rettype == -1) - rettype = _PAGE_CACHE_MODE_WB; - return rettype; + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); } spin_lock(&memtype_lock); @@ -623,13 +706,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 to = from + size; u64 cursor = from; - if (!pat_enabled) + if (!pat_enabled()) return 1; while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -659,7 +742,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. 
*/ - if (!pat_enabled && + if (!pat_enabled() && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || @@ -698,8 +781,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { - printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " - "for [mem %#010Lx-%#010Lx]\n", + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); @@ -729,12 +811,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, * the type requested matches the type of first page in the range. */ if (is_ram) { - if (!pat_enabled) + if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); if (want_pcm != pcm) { - printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, @@ -755,13 +837,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); - printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for [mem %#010Lx-%#010Lx], got %s\n", - current->comm, current->pid, - cattr_name(want_pcm), - (unsigned long long)paddr, - (unsigned long long)(paddr + size - 1), - cattr_name(pcm)); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); return -EINVAL; } /* @@ -844,7 +925,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return ret; } - if (!pat_enabled) + if (!pat_enabled()) return 
0; /* @@ -872,7 +953,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, { enum page_cache_mode pcm; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Set prot based on lookup */ @@ -913,14 +994,18 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled) - return __pgprot(pgprot_val(prot) | + return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); - else - return pgprot_noncached(prot); } EXPORT_SYMBOL_GPL(pgprot_writecombine); +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) static struct memtype *memtype_get_idx(loff_t pos) @@ -996,7 +1081,7 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - if (pat_enabled) { + if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index f6411620305d..a739bfc40690 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -4,7 +4,7 @@ extern int pat_debug_enable; #define dprintk(fmt, arg...) 
\ - do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) struct memtype { u64 start; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 6582adcc8bd9..63931080366a 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -160,9 +160,9 @@ success: return 0; failure: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(found_type), cattr_name(match->type)); + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); return -EBUSY; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..fb0a9dd1d6e4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. 
- */ - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. + */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); @@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index 6440221ced0d..4093216b3791 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -8,7 +8,6 @@ * of the License. 
*/ #include -#include /* * Calling convention : diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 349c0d32cc0b..0a9f2caf358f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, * Caller can followup with UC MINUS request and add a WC mtrr if there * is a free mtrr slot. */ - if (!pat_enabled && write_combine) + if (!pat_enabled() && write_combine) return -EINVAL; - if (pat_enabled && write_combine) + if (pat_enabled() && write_combine) prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled() || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 852aa4c92da0..27062303c881 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + struct irq_alloc_info info; int polarity; if (dev->irq_managed && dev->irq > 0) @@ -217,14 +218,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) polarity = 0; /* active high */ else polarity = 1; /* active low */ + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. 
*/ - if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) - return -EBUSY; - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) + if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0) return -EBUSY; dev->irq_managed = 1; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5e1741..9bd115484745 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void) /* * Code for querying and setting of IRQ routes on various interrupt routers. + * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1. */ -void eisa_set_level_irq(unsigned int irq) +void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); unsigned int port = 0x4d0 + (irq >> 3); unsigned char val; - static u16 eisa_irq_mask; + static u16 elcr_irq_mask; - if (irq >= 16 || (1 << irq) & eisa_irq_mask) + if (irq >= 16 || (1 << irq) & elcr_irq_mask) return; - eisa_irq_mask |= (1 << irq); + elcr_irq_mask |= (1 << irq); printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); val = inb(port); if (!(val & mask)) { @@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - eisa_set_level_irq(irq); + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index a62e0be3a2f1..f1a6c8e86ddd 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,4 +1,5 @@ # Platform specific code goes here +obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile new file mode 
100644 index 000000000000..0a3a40cbc794 --- /dev/null +++ b/arch/x86/platform/atom/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c new file mode 100644 index 000000000000..5ca8ead91579 --- /dev/null +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -0,0 +1,183 @@ +/* + * Intel SOC Punit device state debug driver + * Punit controls power management for North Complex devices (Graphics + * blocks, Image Signal Processing, video processing, display, DSP etc.) + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Side band Interface port */ +#define PUNIT_PORT 0x04 +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 +/* Subsystem config/status Video processor */ +#define VED_SS_PM0 0x32 +/* Subsystem config/status ISP (Image Signal Processor) */ +#define ISP_SS_PM0 0x39 +/* Subsystem config/status Input/output controller */ +#define MIO_SS_PM 0x3B +/* Shift bits for getting status for video, isp and i/o */ +#define SSS_SHIFT 24 +/* Shift bits for getting status for graphics rendering */ +#define RENDER_POS 0 +/* Shift bits for getting status for media control */ +#define MEDIA_POS 2 +/* Shift bits for getting status for Valley View/Baytrail display */ +#define VLV_DISPLAY_POS 6 +/* Subsystem config/status display for Cherry Trail SOC */ +#define CHT_DSP_SSS 0x36 +/* Shift bits for getting status for display */ +#define CHT_DSP_SSS_POS 16 + +struct punit_device { + char *name; + int reg; + int sss_pos; +}; + +static const struct punit_device punit_device_byt[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const struct punit_device punit_device_cht[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static int punit_dev_state_show(struct seq_file *seq_file, void *unused) +{ + u32 punit_pwr_status; + struct punit_device *punit_devp = seq_file->private; + int index; + int status; + + seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); + while (punit_devp->name) { + 
status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, + punit_devp->reg, + &punit_pwr_status); + if (status) { + seq_printf(seq_file, "%9s : Read Failed\n", + punit_devp->name); + } else { + index = (punit_pwr_status >> punit_devp->sss_pos) & 3; + seq_printf(seq_file, "%9s : %s\n", punit_devp->name, + dstates[index]); + } + punit_devp++; + } + + return 0; +} + +static int punit_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, punit_dev_state_show, inode->i_private); +} + +static const struct file_operations punit_dev_state_ops = { + .open = punit_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *punit_dbg_file; + +static int punit_dbgfs_register(struct punit_device *punit_device) +{ + static struct dentry *dev_state; + + punit_dbg_file = debugfs_create_dir("punit_atom", NULL); + if (!punit_dbg_file) + return -ENXIO; + + dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + punit_dbg_file, punit_device, + &punit_dev_state_ops); + if (!dev_state) { + pr_err("punit_dev_state register failed\n"); + debugfs_remove(punit_dbg_file); + return -ENXIO; + } + + return 0; +} + +static void punit_dbgfs_unregister(void) +{ + debugfs_remove_recursive(punit_dbg_file); +} + +#define ICPU(model, drv_data) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ + (kernel_ulong_t)&drv_data } + +static const struct x86_cpu_id intel_punit_cpu_ids[] = { + ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ + ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); + +static int __init punit_atom_debug_init(void) +{ + const struct x86_cpu_id *id; + int ret; + + id = x86_match_cpu(intel_punit_cpu_ids); + if (!id) + return -ENODEV; + + ret = punit_dbgfs_register((struct punit_device *)id->driver_data); + if (ret < 0) + return ret; + + return 0; +} + +static void __exit punit_atom_debug_exit(void) +{ + 
punit_dbgfs_unregister(); +} + +module_init(punit_atom_debug_init); +module_exit(punit_atom_debug_exit); + +MODULE_AUTHOR("Kumar P, Mahesh "); +MODULE_AUTHOR("Srinivas Pandruvada "); +MODULE_DESCRIPTION("Driver for Punit devices states debugging"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 0b283d4d0ad7..de734134bc8d 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -27,6 +27,7 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { int gsi; + struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; if (!pdata) @@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; - if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { + ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); + if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 3005f0c89f2e..01d54ea766c1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void) return 0; } +static void __init intel_mid_setup_bp_timer(void) +{ + apbt_time_init(); + setup_boot_APIC_clock(); +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { case INTEL_MID_TIMER_APBT_ONLY: break; case INTEL_MID_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + /* Use apbt and local apic */ + 
x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; + return; default: if (!boot_cpu_has(X86_FEATURE_ARAT)) break; + /* Lapic only, no apbt */ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; return; } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); + + x86_init.timers.setup_percpu_clockev = apbt_time_init; } static void intel_mid_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index c14ad34776c4..ce992e8cc065 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; @@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; } 
@@ -436,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct devs_id *dev = NULL; int num, i, ret; int polarity; + struct irq_alloc_info info; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -469,9 +468,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) polarity = 1; } - ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); - if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); WARN_ON(ret < 0); } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index 2a8a74f3bd76..6c7111bbd1e9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -25,8 +25,8 @@ #include #include #include -#include +#include #include #include #include @@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, -}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) int i, num; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_STRICT, - .ops = &sfi_ioapic_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; sb = (struct sfi_table_simple *)table; diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 0ce673645432..8570abe68be1 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -13,22 +13,37 @@ #include #include +#include #include #include #include /* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; +struct uv_irq_2_mmr_pnode { unsigned long offset; int pnode; - int irq; }; -static DEFINE_SPINLOCK(uv_irq_lock); -static struct 
rb_root uv_irq_root; +static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cfg->dest_apicid; + + uv_write_global_mmr64(info->pnode, info->offset, mmr_value); +} static void uv_noop(struct irq_data *data) { } @@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data) ack_APIC_irq(); } +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + uv_program_mmr(cfg, data->chip_data); + send_cleanup_vector(cfg); + } + + return ret; +} + static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .irq_mask = uv_noop, @@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = { .irq_set_affinity = uv_set_irq_affinity, }; -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. 
- */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; + struct uv_irq_2_mmr_pnode *chip_data; + struct irq_alloc_info *info = arg; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + int ret; - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV) + return -EINVAL; + + chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, + irq_data->node); + if (!chip_data) return -ENOMEM; - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + if (info->uv_limit == UV_AFFINITY_CPU) + irq_set_status_flags(virq, IRQ_NO_BALANCING); else - link = &(*link)->rb_right; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + + chip_data->pnode = uv_blade_to_pnode(info->uv_blade); + chip_data->offset = info->uv_offset; + irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, + handle_percpu_irq, NULL, info->uv_name); + } else { + kfree(chip_data); } - /* Insert the node into the rbtree. 
*/ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; + return ret; } -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +static void uv_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_clear_status_flags(virq, IRQ_NO_BALANCING); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } /* * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. 
*/ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) +static void uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - unsigned int dest; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } /* * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. 
*/ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +static void uv_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +static const struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, +}; + +static struct irq_domain *uv_get_irq_domain(void) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; + static struct irq_domain *uv_domain; + static DEFINE_MUTEX(uv_lock); - if (apic_set_affinity(data, mask, &dest)) - return -1; + mutex_lock(&uv_lock); + if (uv_domain == NULL) { + uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL); + if (uv_domain) + uv_domain->parent = x86_vector_domain; + } + mutex_unlock(&uv_lock); - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return IRQ_SET_MASK_OK_NOCOPY; + return uv_domain; } /* @@ -238,19 +180,21 @@ 
uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); + struct irq_alloc_info info; + struct irq_domain *domain = uv_get_irq_domain(); - if (!irq) - return -EBUSY; + if (!domain) + return -ENOMEM; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - irq_free_hwirq(irq); + init_irq_alloc_info(&info, cpumask_of(cpu)); + info.type = X86_IRQ_ALLOC_TYPE_UV; + info.uv_limit = limit; + info.uv_blade = mmr_blade; + info.uv_offset = mmr_offset; + info.uv_name = irq_name; - return ret; + return irq_domain_alloc_irqs(domain, 1, + uv_blade_to_memory_nid(mmr_blade), &info); } EXPORT_SYMBOL_GPL(uv_setup_irq); @@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); */ void uv_teardown_irq(unsigned int irq) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3c4469a7a929..e2386cb4e0c3 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -78,9 +78,9 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) -loop: +.Lloop: testq %rdx, %rdx - jz done + jz .Ldone /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi @@ 
-91,8 +91,8 @@ loop: /* progress to the next pbe */ movq pbe_next(%rdx), %rdx - jmp loop -done: + jmp .Lloop +.Ldone: /* jump to the restore_registers address from the image header */ jmpq *%rax /* diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index acb384d24669..a8fecc226946 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -26,7 +26,7 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ ../lib/rwsem.o endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 98088bf5906a..0b95c9b8283f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1181,10 +1181,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_tscp = native_read_tscp, .iret = xen_iret, - .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, +#else + .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, @@ -1467,6 +1468,7 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; + u64 pat; int rc; if (!xen_start_info) @@ -1574,8 +1576,8 @@ asmlinkage __visible void __init xen_start_kernel(void) * Modify the cache mode translation tables to match Xen's PAT * configuration. 
*/ - - pat_init_cache_modes(); + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b47124d4cd67..8b7f18e200aa 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 985fc3ee0973..f22667abf7b9 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include @@ -47,29 +49,13 @@ ENTRY(xen_iret) ENDPATCH(xen_iret) RELOC(xen_iret, 1b+1) -/* - * sysexit is not used for 64-bit processes, so it's only ever used to - * return to 32-bit compat userspace. - */ -ENTRY(xen_sysexit) - pushq $__USER32_DS - pushq %rcx - pushq $X86_EFLAGS_IF - pushq $__USER32_CS - pushq %rdx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysexit) -RELOC(xen_sysexit, 1b+1) - ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS pushq PER_CPU_VAR(rsp_scratch) @@ -88,7 +74,7 @@ ENTRY(xen_sysret32) * still with the kernel gs, so we can easily switch back */ movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER32_DS pushq PER_CPU_VAR(rsp_scratch) @@ -128,7 +114,7 @@ RELOC(xen_sysret32, 1b+1) /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) undo_xen_syscall - jmp system_call_after_swapgs + jmp entry_SYSCALL_64_after_swapgs ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION @@ -136,13 +122,13 @@ ENDPROC(xen_syscall_target) /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) undo_xen_syscall - jmp 
ia32_cstar_target + jmp entry_SYSCALL_compat ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) undo_xen_syscall - jmp ia32_sysenter_target + jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) #else /* !CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e195c683549..c20fe29e65f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); +#ifdef CONFIG_X86_32 __visible void xen_sysexit(void); +#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index fe1600a09438..c39bb6e61911 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -59,6 +59,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset, } #define ioremap_wc ioremap_nocache +#define ioremap_wt ioremap_nocache static inline void __iomem *ioremap(unsigned long offset, unsigned long size) { diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index ed65e9c4b5b0..3670bbab57a3 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "apei-internal.h" diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c index eabf4a8d0085..095dfaadcaa5 100644 --- a/drivers/block/pmem.c +++ b/drivers/block/pmem.c @@ -139,11 +139,11 @@ static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) } /* - * Map the memory as non-cachable, as we can't write back the contents + * Map the memory as write-through, as we can't write back the contents * of the CPU caches in case of a crash. 
*/ err = -ENOMEM; - pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); + pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size); if (!pmem->virt_addr) goto out_release_region; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6414661ac1c4..2ba53f4f6af2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/drivers/dma/mic_x100_dma.c b/drivers/dma/mic_x100_dma.c index 6de2e677be04..74d9db05a5ad 100644 --- a/drivers/dma/mic_x100_dma.c +++ b/drivers/dma/mic_x100_dma.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "mic_x100_dma.h" diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 266dcd6cdf3b..0a957828b3bd 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -36,9 +36,6 @@ #include #include -#ifdef CONFIG_X86 -#include -#endif static int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); @@ -197,16 +194,7 @@ static int drm_getmap(struct drm_device *dev, void *data, map->type = r_list->map->type; map->flags = r_list->map->flags; map->handle = (void *)(unsigned long) r_list->user_token; - -#ifdef CONFIG_X86 - /* - * There appears to be exactly one user of the mtrr index: dritest. - * It's easy enough to keep it working on non-PAT systems. - */ - map->mtrr = phys_wc_to_mtrr_index(r_list->map->mtrr); -#else - map->mtrr = -1; -#endif + map->mtrr = arch_phys_wc_index(r_list->map->mtrr); mutex_unlock(&dev->struct_mutex); diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig index 1d9bb115cbf6..8fe54ff00580 100644 --- a/drivers/infiniband/hw/ipath/Kconfig +++ b/drivers/infiniband/hw/ipath/Kconfig @@ -9,3 +9,6 @@ config INFINIBAND_IPATH as IP-over-InfiniBand as well as with userspace applications (in conjunction with InfiniBand userspace access). 
For QLogic PCIe QLE based cards, use the QIB driver instead. + + If you have this hardware you will need to boot with PAT disabled + on your x86-64 systems, use the nopat kernel parameter. diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index bd0caedafe99..2d7e503d13cb 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -42,6 +42,9 @@ #include #include #include +#ifdef CONFIG_X86_64 +#include +#endif #include "ipath_kernel.h" #include "ipath_verbs.h" @@ -395,6 +398,14 @@ static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) unsigned long long addr; u32 bar0 = 0, bar1 = 0; +#ifdef CONFIG_X86_64 + if (WARN(pat_enabled(), + "ipath needs PAT disabled, boot with nopat kernel parameter\n")) { + ret = -ENODEV; + goto bail; + } +#endif + dd = ipath_alloc_devdata(pdev); if (IS_ERR(dd)) { ret = PTR_ERR(dd); @@ -542,6 +553,7 @@ static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd->ipath_kregbase = __ioremap(addr, len, (_PAGE_NO_CACHE|_PAGE_WRITETHRU)); #else + /* XXX: split this properly to enable on PAT */ dd->ipath_kregbase = ioremap_nocache(addr, len); #endif @@ -587,12 +599,8 @@ static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) ret = ipath_enable_wc(dd); - if (ret) { - ipath_dev_err(dd, "Write combining not enabled " - "(err %d): performance may be poor\n", - -ret); + if (ret) ret = 0; - } ipath_verify_pioperf(dd); diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index e08db7020cd4..f0f947122779 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -463,9 +463,7 @@ struct ipath_devdata { /* offset in HT config space of slave/primary interface block */ u8 ipath_ht_slave_off; /* for write combining settings */ - unsigned long ipath_wc_cookie; - unsigned long ipath_wc_base; - 
unsigned long ipath_wc_len; + int wc_cookie; /* ref count for each pkey */ atomic_t ipath_pkeyrefs[4]; /* shadow copy of struct page *'s for exp tid pages */ diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c index 4ad0b932df1f..7b6e4c843e19 100644 --- a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c +++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -37,7 +37,6 @@ */ #include -#include #include #include "ipath_kernel.h" @@ -122,27 +121,14 @@ int ipath_enable_wc(struct ipath_devdata *dd) } if (!ret) { - int cookie; - ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC " - "(addr %llx, len=0x%llx)\n", - (unsigned long long) pioaddr, - (unsigned long long) piolen); - cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0); - if (cookie < 0) { - { - dev_info(&dd->pcidev->dev, - "mtrr_add() WC for PIO bufs " - "failed (%d)\n", - cookie); - ret = -EINVAL; - } - } else { - ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, " - "cookie is %d\n", cookie); - dd->ipath_wc_cookie = cookie; - dd->ipath_wc_base = (unsigned long) pioaddr; - dd->ipath_wc_len = (unsigned long) piolen; - } + dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen); + if (dd->wc_cookie < 0) { + ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n"); + ret = -ENODEV; + } else if (dd->wc_cookie == 0) + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n"); + else + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n"); } return ret; @@ -154,16 +140,5 @@ int ipath_enable_wc(struct ipath_devdata *dd) */ void ipath_disable_wc(struct ipath_devdata *dd) { - if (dd->ipath_wc_cookie) { - int r; - ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n"); - r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base, - dd->ipath_wc_len); - if (r < 0) - dev_info(&dd->pcidev->dev, - "mtrr_del(%lx, %lx, %lx) failed: %d\n", - dd->ipath_wc_cookie, dd->ipath_wc_base, - dd->ipath_wc_len, r); - dd->ipath_wc_cookie = 0; /* even on failure */ - } + arch_phys_wc_del(dd->wc_cookie); } 
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index e1c7e9e51045..fffea87a014f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -3852,6 +3853,21 @@ union irte { } fields; }; +struct irq_2_irte { + u16 devid; /* Device ID for IRTE table */ + u16 index; /* Index into IRTE table*/ +}; + +struct amd_ir_data { + struct irq_2_irte irq_2_irte; + union irte irte_entry; + union { + struct msi_msg msi_entry; + }; +}; + +static struct irq_chip amd_ir_chip; + #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) #define DTE_IRQ_REMAP_INTCTL (2ULL << 60) #define DTE_IRQ_TABLE_LEN (8ULL << 1) @@ -3945,7 +3961,7 @@ out_unlock: return table; } -static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count) +static int alloc_irq_index(u16 devid, int count) { struct irq_remap_table *table; unsigned long flags; @@ -3967,18 +3983,10 @@ static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count) c = 0; if (c == count) { - struct irq_2_irte *irte_info; - for (; c != 0; --c) table->table[index - c + 1] = IRTE_ALLOCATED; index -= count - 1; - - cfg->remapped = 1; - irte_info = &cfg->irq_2_irte; - irte_info->devid = devid; - irte_info->index = index; - goto out; } } @@ -3991,22 +3999,6 @@ out: return index; } -static int get_irte(u16 devid, int index, union irte *irte) -{ - struct irq_remap_table *table; - unsigned long flags; - - table = get_irq_table(devid, false); - if (!table) - return -ENOMEM; - - spin_lock_irqsave(&table->lock, flags); - irte->val = table->table[index]; - spin_unlock_irqrestore(&table->lock, flags); - - return 0; -} - static int modify_irte(u16 devid, int index, union irte irte) { struct irq_remap_table *table; @@ -4053,229 +4045,70 @@ static void free_irte(u16 devid, int index) iommu_completion_wait(iommu); } -static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, 
- struct io_apic_irq_attr *attr) +static int get_devid(struct irq_alloc_info *info) { - struct irq_remap_table *table; - struct irq_2_irte *irte_info; - struct irq_cfg *cfg; - union irte irte; - int ioapic_id; - int index; - int devid; - int ret; + int devid = -1; - cfg = irq_cfg(irq); - if (!cfg) - return -EINVAL; - - irte_info = &cfg->irq_2_irte; - ioapic_id = mpc_ioapic_id(attr->ioapic); - devid = get_ioapic_devid(ioapic_id); - - if (devid < 0) - return devid; - - table = get_irq_table(devid, true); - if (table == NULL) - return -ENOMEM; - - index = attr->ioapic_pin; - - /* Setup IRQ remapping info */ - cfg->remapped = 1; - irte_info->devid = devid; - irte_info->index = index; - - /* Setup IRTE for IOMMU */ - irte.val = 0; - irte.fields.vector = vector; - irte.fields.int_type = apic->irq_delivery_mode; - irte.fields.destination = destination; - irte.fields.dm = apic->irq_dest_mode; - irte.fields.valid = 1; - - ret = modify_irte(devid, index, irte); - if (ret) - return ret; - - /* Setup IOAPIC entry */ - memset(entry, 0, sizeof(*entry)); - - entry->vector = index; - entry->mask = 0; - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. 
- */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - -static int set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_2_irte *irte_info; - unsigned int dest, irq; - struct irq_cfg *cfg; - union irte irte; - int err; - - if (!config_enabled(CONFIG_SMP)) - return -1; - - cfg = irqd_cfg(data); - irq = data->irq; - irte_info = &cfg->irq_2_irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irte_info->devid, irte_info->index, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq); - return err; + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + devid = get_ioapic_devid(info->ioapic_id); + break; + case X86_IRQ_ALLOC_TYPE_HPET: + devid = get_hpet_devid(info->hpet_id); + break; + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + devid = get_device_id(&info->msi_dev->dev); + break; + default: + BUG_ON(1); + break; } - irte.fields.vector = cfg->vector; - irte.fields.destination = dest; - - modify_irte(irte_info->devid, irte_info->index, irte); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - - return 0; + return devid; } -static int free_irq(int irq) +static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info) { - struct irq_2_irte *irte_info; - struct irq_cfg *cfg; + struct amd_iommu *iommu; + int devid; - cfg = irq_cfg(irq); - if (!cfg) - return -EINVAL; + if (!info) + return NULL; - irte_info = &cfg->irq_2_irte; + devid = get_devid(info); + if (devid >= 0) { + iommu = amd_iommu_rlookup_table[devid]; + if (iommu) + return iommu->ir_domain; + } - free_irte(irte_info->devid, irte_info->index); - - return 0; + return NULL; } -static void compose_msi_msg(struct pci_dev *pdev, - 
unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) { - struct irq_2_irte *irte_info; - struct irq_cfg *cfg; - union irte irte; + struct amd_iommu *iommu; + int devid; - cfg = irq_cfg(irq); - if (!cfg) - return; + if (!info) + return NULL; - irte_info = &cfg->irq_2_irte; + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + devid = get_device_id(&info->msi_dev->dev); + if (devid >= 0) { + iommu = amd_iommu_rlookup_table[devid]; + if (iommu) + return iommu->msi_domain; + } + break; + default: + break; + } - irte.val = 0; - irte.fields.vector = cfg->vector; - irte.fields.int_type = apic->irq_delivery_mode; - irte.fields.destination = dest; - irte.fields.dm = apic->irq_dest_mode; - irte.fields.valid = 1; - - modify_irte(irte_info->devid, irte_info->index, irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO; - msg->data = irte_info->index; -} - -static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) -{ - struct irq_cfg *cfg; - int index; - u16 devid; - - if (!pdev) - return -EINVAL; - - cfg = irq_cfg(irq); - if (!cfg) - return -EINVAL; - - devid = get_device_id(&pdev->dev); - index = alloc_irq_index(cfg, devid, nvec); - - return index < 0 ? 
MAX_IRQS_PER_TABLE : index; -} - -static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int offset) -{ - struct irq_2_irte *irte_info; - struct irq_cfg *cfg; - u16 devid; - - if (!pdev) - return -EINVAL; - - cfg = irq_cfg(irq); - if (!cfg) - return -EINVAL; - - if (index >= MAX_IRQS_PER_TABLE) - return 0; - - devid = get_device_id(&pdev->dev); - irte_info = &cfg->irq_2_irte; - - cfg->remapped = 1; - irte_info->devid = devid; - irte_info->index = index + offset; - - return 0; -} - -static int alloc_hpet_msi(unsigned int irq, unsigned int id) -{ - struct irq_2_irte *irte_info; - struct irq_cfg *cfg; - int index, devid; - - cfg = irq_cfg(irq); - if (!cfg) - return -EINVAL; - - irte_info = &cfg->irq_2_irte; - devid = get_hpet_devid(id); - if (devid < 0) - return devid; - - index = alloc_irq_index(cfg, devid, 1); - if (index < 0) - return index; - - cfg->remapped = 1; - irte_info->devid = devid; - irte_info->index = index; - - return 0; + return NULL; } struct irq_remap_ops amd_iommu_irq_ops = { @@ -4284,12 +4117,244 @@ struct irq_remap_ops amd_iommu_irq_ops = { .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .setup_ioapic_entry = setup_ioapic_entry, - .set_affinity = set_affinity, - .free_irq = free_irq, - .compose_msi_msg = compose_msi_msg, - .msi_alloc_irq = msi_alloc_irq, - .msi_setup_irq = msi_setup_irq, - .alloc_hpet_msi = alloc_hpet_msi, + .get_ir_irq_domain = get_ir_irq_domain, + .get_irq_domain = get_irq_domain, }; + +static void irq_remapping_prepare_irte(struct amd_ir_data *data, + struct irq_cfg *irq_cfg, + struct irq_alloc_info *info, + int devid, int index, int sub_handle) +{ + struct irq_2_irte *irte_info = &data->irq_2_irte; + struct msi_msg *msg = &data->msi_entry; + union irte *irte = &data->irte_entry; + struct IO_APIC_route_entry *entry; + + data->irq_2_irte.devid = devid; + data->irq_2_irte.index = index + sub_handle; + + /* Setup IRTE for IOMMU */ + irte->val = 0; 
+ irte->fields.vector = irq_cfg->vector; + irte->fields.int_type = apic->irq_delivery_mode; + irte->fields.destination = irq_cfg->dest_apicid; + irte->fields.dm = apic->irq_dest_mode; + irte->fields.valid = 1; + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + /* Setup IOAPIC entry */ + entry = info->ioapic_entry; + info->ioapic_entry = NULL; + memset(entry, 0, sizeof(*entry)); + entry->vector = index; + entry->mask = 0; + entry->trigger = info->ioapic_trigger; + entry->polarity = info->ioapic_polarity; + /* Mask level triggered irqs. */ + if (info->ioapic_trigger) + entry->mask = 1; + break; + + case X86_IRQ_ALLOC_TYPE_HPET: + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO; + msg->data = irte_info->index; + break; + + default: + BUG_ON(1); + break; + } +} + +static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct irq_data *irq_data; + struct amd_ir_data *data; + struct irq_cfg *cfg; + int i, ret, devid; + int index = -1; + + if (!info) + return -EINVAL; + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && + info->type != X86_IRQ_ALLOC_TYPE_MSIX) + return -EINVAL; + + /* + * With IRQ remapping enabled, don't need contiguous CPU vectors + * to support multiple MSI interrupts. 
+ */ + if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + devid = get_devid(info); + if (devid < 0) + return -EINVAL; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_parent; + + if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { + if (get_irq_table(devid, true)) + index = info->ioapic_pin; + else + ret = -ENOMEM; + } else { + index = alloc_irq_index(devid, nr_irqs); + } + if (index < 0) { + pr_warn("Failed to allocate IRTE\n"); + kfree(data); + goto out_free_parent; + } + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + cfg = irqd_cfg(irq_data); + if (!irq_data || !cfg) { + ret = -EINVAL; + goto out_free_data; + } + + if (i > 0) { + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_data; + } + irq_data->hwirq = (devid << 16) + i; + irq_data->chip_data = data; + irq_data->chip = &amd_ir_chip; + irq_remapping_prepare_irte(data, cfg, info, devid, index, i); + irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); + } + return 0; + +out_free_data: + for (i--; i >= 0; i--) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data) + kfree(irq_data->chip_data); + } + for (i = 0; i < nr_irqs; i++) + free_irte(devid, index + i); +out_free_parent: + irq_domain_free_irqs_common(domain, virq, nr_irqs); + return ret; +} + +static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_2_irte *irte_info; + struct irq_data *irq_data; + struct amd_ir_data *data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + irte_info = &data->irq_2_irte; + free_irte(irte_info->devid, irte_info->index); + kfree(data); + } + } + irq_domain_free_irqs_common(domain, 
virq, nr_irqs); +} + +static void irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct amd_ir_data *data = irq_data->chip_data; + struct irq_2_irte *irte_info = &data->irq_2_irte; + + modify_irte(irte_info->devid, irte_info->index, data->irte_entry); +} + +static void irq_remapping_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct amd_ir_data *data = irq_data->chip_data; + struct irq_2_irte *irte_info = &data->irq_2_irte; + union irte entry; + + entry.val = 0; + modify_irte(irte_info->devid, irte_info->index, data->irte_entry); +} + +static struct irq_domain_ops amd_ir_domain_ops = { + .alloc = irq_remapping_alloc, + .free = irq_remapping_free, + .activate = irq_remapping_activate, + .deactivate = irq_remapping_deactivate, +}; + +static int amd_ir_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct amd_ir_data *ir_data = data->chip_data; + struct irq_2_irte *irte_info = &ir_data->irq_2_irte; + struct irq_cfg *cfg = irqd_cfg(data); + struct irq_data *parent = data->parent_data; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + ir_data->irte_entry.fields.vector = cfg->vector; + ir_data->irte_entry.fields.destination = cfg->dest_apicid; + modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. 
+ */ + send_cleanup_vector(cfg); + + return IRQ_SET_MASK_OK_DONE; +} + +static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) +{ + struct amd_ir_data *ir_data = irq_data->chip_data; + + *msg = ir_data->msi_entry; +} + +static struct irq_chip amd_ir_chip = { + .irq_ack = ir_ack_apic_edge, + .irq_set_affinity = amd_ir_set_affinity, + .irq_compose_msi_msg = ir_compose_msi_msg, +}; + +int amd_iommu_create_irq_domain(struct amd_iommu *iommu) +{ + iommu->ir_domain = irq_domain_add_tree(NULL, &amd_ir_domain_ops, iommu); + if (!iommu->ir_domain) + return -ENOMEM; + + iommu->ir_domain->parent = arch_get_ir_parent_domain(); + iommu->msi_domain = arch_create_msi_irq_domain(iommu->ir_domain); + + return 0; +} #endif diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 450ef5001a65..c17df04d7a7f 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1124,6 +1124,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (ret) return ret; + ret = amd_iommu_create_irq_domain(iommu); + if (ret) + return ret; + /* * Make sure IOMMU is not considered to translate itself. The IVRS * table tells us so, but this is a lie! 
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 72b0fd455e24..0a21142d3639 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -62,6 +62,15 @@ extern u8 amd_iommu_pc_get_max_counters(u16 devid); extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, u64 *value, bool is_write); +#ifdef CONFIG_IRQ_REMAP +extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu); +#else +static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) +{ + return 0; +} +#endif + #define PPR_SUCCESS 0x0 #define PPR_INVALID 0x1 #define PPR_FAILURE 0xf diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 05030e523771..6533e874c9d7 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -398,6 +398,7 @@ struct amd_iommu_fault { struct iommu_domain; +struct irq_domain; /* * This structure contains generic data for IOMMU protection domains @@ -579,6 +580,10 @@ struct amd_iommu { /* The maximum PC banks and counters/bank (PCSup=1) */ u8 max_banks; u8 max_counters; +#ifdef CONFIG_IRQ_REMAP + struct irq_domain *ir_domain; + struct irq_domain *msi_domain; +#endif }; struct devid_map { diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9847613085e1..536f2d8ea41a 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1087,8 +1087,8 @@ static void free_iommu(struct intel_iommu *iommu) if (iommu->irq) { free_irq(iommu->irq, iommu); - irq_set_handler_data(iommu->irq, NULL); dmar_free_hwirq(iommu->irq); + iommu->irq = 0; } if (iommu->qi) { @@ -1642,23 +1642,14 @@ int dmar_set_interrupt(struct intel_iommu *iommu) if (iommu->irq) return 0; - irq = dmar_alloc_hwirq(); - if (irq <= 0) { + irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu); + if (irq > 0) { + iommu->irq = irq; + } else { pr_err("IOMMU: no free vectors\n"); return -EINVAL; } - irq_set_handler_data(irq, iommu); - iommu->irq = irq; - - ret = 
arch_setup_dmar_msi(irq); - if (ret) { - irq_set_handler_data(irq, NULL); - iommu->irq = 0; - dmar_free_hwirq(irq); - return ret; - } - ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu); if (ret) pr_err("IOMMU: can't request irq\n"); diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 5709ae9c3e77..80f1d1486247 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,11 @@ #include "irq_remapping.h" +enum irq_mode { + IRQ_REMAPPING, + IRQ_POSTING, +}; + struct ioapic_scope { struct intel_iommu *iommu; unsigned int id; @@ -31,6 +37,22 @@ struct hpet_scope { unsigned int devfn; }; +struct irq_2_iommu { + struct intel_iommu *iommu; + u16 irte_index; + u16 sub_handle; + u8 irte_mask; + enum irq_mode mode; +}; + +struct intel_ir_data { + struct irq_2_iommu irq_2_iommu; + struct irte irte_entry; + union { + struct msi_msg msi_entry; + }; +}; + #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) #define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8) @@ -50,43 +72,14 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS]; * the dmar_global_lock. */ static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); +static struct irq_domain_ops intel_ir_domain_ops; static int __init parse_ioapics_under_ir(void); -static struct irq_2_iommu *irq_2_iommu(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - return cfg ? 
&cfg->irq_2_iommu : NULL; -} - -static int get_irte(int irq, struct irte *entry) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int index; - - if (!entry || !irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - if (unlikely(!irq_iommu->iommu)) { - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - return -1; - } - - index = irq_iommu->irte_index + irq_iommu->sub_handle; - *entry = *(irq_iommu->iommu->ir_table->base + index); - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - return 0; -} - -static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) +static int alloc_irte(struct intel_iommu *iommu, int irq, + struct irq_2_iommu *irq_iommu, u16 count) { struct ir_table *table = iommu->ir_table; - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - struct irq_cfg *cfg = irq_cfg(irq); unsigned int mask = 0; unsigned long flags; int index; @@ -113,11 +106,11 @@ static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (index < 0) { pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id); } else { - cfg->remapped = 1; irq_iommu->iommu = iommu; irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; irq_iommu->irte_mask = mask; + irq_iommu->mode = IRQ_REMAPPING; } raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); @@ -135,47 +128,9 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) return qi_submit_sync(&desc, iommu); } -static int map_irq_to_irte_handle(int irq, u16 *sub_handle) +static int modify_irte(struct irq_2_iommu *irq_iommu, + struct irte *irte_modified) { - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int index; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - *sub_handle = irq_iommu->sub_handle; - index = irq_iommu->irte_index; - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - return index; -} - -static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 
subhandle) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - cfg->remapped = 1; - irq_iommu->iommu = iommu; - irq_iommu->irte_index = index; - irq_iommu->sub_handle = subhandle; - irq_iommu->irte_mask = 0; - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return 0; -} - -static int modify_irte(int irq, struct irte *irte_modified) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); struct intel_iommu *iommu; unsigned long flags; struct irte *irte; @@ -196,6 +151,9 @@ static int modify_irte(int irq, struct irte *irte_modified) __iommu_flush_cache(iommu, irte, sizeof(*irte)); rc = qi_flush_iec(iommu, index, 0); + + /* Update iommu mode according to the IRTE mode */ + irq_iommu->mode = irte->pst ? IRQ_POSTING : IRQ_REMAPPING; raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); return rc; @@ -242,7 +200,7 @@ static int clear_entries(struct irq_2_iommu *irq_iommu) return 0; iommu = irq_iommu->iommu; - index = irq_iommu->irte_index + irq_iommu->sub_handle; + index = irq_iommu->irte_index; start = iommu->ir_table->base + index; end = start + (1 << irq_iommu->irte_mask); @@ -257,29 +215,6 @@ static int clear_entries(struct irq_2_iommu *irq_iommu) return qi_flush_iec(iommu, index, irq_iommu->irte_mask); } -static int free_irte(int irq) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int rc; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - rc = clear_entries(irq_iommu); - - irq_iommu->iommu = NULL; - irq_iommu->irte_index = 0; - irq_iommu->sub_handle = 0; - irq_iommu->irte_mask = 0; - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return rc; -} - /* * source validation type */ @@ -488,7 +423,6 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, 
INTR_REMAP_PAGE_ORDER); - if (!pages) { pr_err("IR%d: failed to allocate pages of order %d\n", iommu->seq_id, INTR_REMAP_PAGE_ORDER); @@ -502,11 +436,23 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) goto out_free_pages; } + iommu->ir_domain = irq_domain_add_hierarchy(arch_get_ir_parent_domain(), + 0, INTR_REMAP_TABLE_ENTRIES, + NULL, &intel_ir_domain_ops, + iommu); + if (!iommu->ir_domain) { + pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id); + goto out_free_bitmap; + } + iommu->ir_msi_domain = arch_create_msi_irq_domain(iommu->ir_domain); + ir_table->base = page_address(pages); ir_table->bitmap = bitmap; iommu->ir_table = ir_table; return 0; +out_free_bitmap: + kfree(bitmap); out_free_pages: __free_pages(pages, INTR_REMAP_PAGE_ORDER); out_free_table: @@ -517,6 +463,14 @@ out_free_table: static void intel_teardown_irq_remapping(struct intel_iommu *iommu) { if (iommu && iommu->ir_table) { + if (iommu->ir_msi_domain) { + irq_domain_remove(iommu->ir_msi_domain); + iommu->ir_msi_domain = NULL; + } + if (iommu->ir_domain) { + irq_domain_remove(iommu->ir_domain); + iommu->ir_domain = NULL; + } free_pages((unsigned long)iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); kfree(iommu->ir_table->bitmap); @@ -627,6 +581,26 @@ error: return -ENODEV; } +/* + * Set Posted-Interrupts capability. + */ +static inline void set_irq_posting_cap(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + if (!disable_irq_post) { + intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP; + + for_each_iommu(iommu, drhd) + if (!cap_pi_support(iommu->cap)) { + intel_irq_remap_ops.capability &= + ~(1 << IRQ_POSTING_CAP); + break; + } + } +} + static int __init intel_enable_irq_remapping(void) { struct dmar_drhd_unit *drhd; @@ -702,12 +676,7 @@ static int __init intel_enable_irq_remapping(void) irq_remapping_enabled = 1; - /* - * VT-d has a different layout for IO-APIC entries when - * interrupt remapping is enabled. 
So it needs a special routine - * to print IO-APIC entries for debugging purposes too. - */ - x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries; + set_irq_posting_cap(); pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic"); @@ -909,6 +878,12 @@ static void disable_irq_remapping(void) iommu_disable_irq_remapping(iommu); } + + /* + * Clear Posted-Interrupts capability. + */ + if (!disable_irq_post) + intel_irq_remap_ops.capability &= ~(1 << IRQ_POSTING_CAP); } static int reenable_irq_remapping(int eim) @@ -936,6 +911,8 @@ static int reenable_irq_remapping(int eim) if (!setup) goto error; + set_irq_posting_cap(); + return 0; error: @@ -945,8 +922,7 @@ error: return -1; } -static void prepare_irte(struct irte *irte, int vector, - unsigned int dest) +static void prepare_irte(struct irte *irte, int vector, unsigned int dest) { memset(irte, 0, sizeof(*irte)); @@ -966,76 +942,63 @@ static void prepare_irte(struct irte *irte, int vector, irte->redir_hint = 1; } -static int intel_setup_ioapic_entry(int irq, - struct IO_APIC_route_entry *route_entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) +static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info) { - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu; - struct IR_IO_APIC_route_entry *entry; - struct irte irte; - int index; + struct intel_iommu *iommu = NULL; - down_read(&dmar_global_lock); - iommu = map_ioapic_to_ir(ioapic_id); - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - index = -ENODEV; - } else { - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", - ioapic_id); - index = -ENOMEM; - } + if (!info) + return NULL; + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + iommu = map_ioapic_to_ir(info->ioapic_id); + break; + case X86_IRQ_ALLOC_TYPE_HPET: + iommu = map_hpet_to_ir(info->hpet_id); + break; + case 
X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + iommu = map_dev_to_ir(info->msi_dev); + break; + default: + BUG_ON(1); + break; } - up_read(&dmar_global_lock); - if (index < 0) - return index; - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - entry = (struct IR_IO_APIC_route_entry *)route_entry; - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; + return iommu ? 
iommu->ir_domain : NULL; } +static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) +{ + struct intel_iommu *iommu; + + if (!info) + return NULL; + + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + iommu = map_dev_to_ir(info->msi_dev); + if (iommu) + return iommu->ir_msi_domain; + break; + default: + break; + } + + return NULL; +} + +struct irq_remap_ops intel_irq_remap_ops = { + .prepare = intel_prepare_irq_remapping, + .enable = intel_enable_irq_remapping, + .disable = disable_irq_remapping, + .reenable = reenable_irq_remapping, + .enable_faulting = enable_drhd_fault_handling, + .get_ir_irq_domain = intel_get_ir_irq_domain, + .get_irq_domain = intel_get_irq_domain, +}; + /* * Migrate the IO-APIC irq in the presence of intr-remapping. * @@ -1051,170 +1014,282 @@ static int intel_setup_ioapic_entry(int irq, * is used to migrate MSI irq's in the presence of interrupt-remapping. */ static int -intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { + struct intel_ir_data *ir_data = data->chip_data; + struct irte *irte = &ir_data->irte_entry; struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; - struct irte irte; - int err; + struct irq_data *parent = data->parent_data; + int ret; - if (!config_enabled(CONFIG_SMP)) - return -EINVAL; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == 
IRQ_SET_MASK_OK_DONE) + return ret; /* * Atomically updates the IRTE with the new destination, vector * and flushes the interrupt entry cache. */ - modify_irte(irq, &irte); + irte->vector = cfg->vector; + irte->dest_id = IRTE_DEST(cfg->dest_apicid); + + /* Update the hardware only if the interrupt is in remapped mode. */ + if (ir_data->irq_2_iommu.mode == IRQ_REMAPPING) + modify_irte(&ir_data->irq_2_iommu, irte); /* * After this point, all the interrupts will start arriving * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); + send_cleanup_vector(cfg); + + return IRQ_SET_MASK_OK_DONE; +} + +static void intel_ir_compose_msi_msg(struct irq_data *irq_data, + struct msi_msg *msg) +{ + struct intel_ir_data *ir_data = irq_data->chip_data; + + *msg = ir_data->msi_entry; +} + +static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info) +{ + struct intel_ir_data *ir_data = data->chip_data; + struct vcpu_data *vcpu_pi_info = info; + + /* stop posting interrupts, back to remapping mode */ + if (!vcpu_pi_info) { + modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); + } else { + struct irte irte_pi; + + /* + * We are not caching the posted interrupt entry. We + * copy the data from the remapped entry and modify + * the fields which are relevant for posted mode. The + * cached remapped entry is used for switching back to + * remapped mode. 
+ */ + memset(&irte_pi, 0, sizeof(irte_pi)); + dmar_copy_shared_irte(&irte_pi, &ir_data->irte_entry); + + /* Update the posted mode fields */ + irte_pi.p_pst = 1; + irte_pi.p_urgent = 0; + irte_pi.p_vector = vcpu_pi_info->vector; + irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >> + (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT); + irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) & + ~(-1UL << PDA_HIGH_BIT); + + modify_irte(&ir_data->irq_2_iommu, &irte_pi); + } - cpumask_copy(data->affinity, mask); return 0; } -static void intel_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static struct irq_chip intel_ir_chip = { + .irq_ack = ir_ack_apic_edge, + .irq_set_affinity = intel_ir_set_affinity, + .irq_compose_msi_msg = intel_ir_compose_msi_msg, + .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, +}; + +static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, + struct irq_cfg *irq_cfg, + struct irq_alloc_info *info, + int index, int sub_handle) { - struct irq_cfg *cfg; - struct irte irte; - u16 sub_handle = 0; - int ir_index; + struct IR_IO_APIC_route_entry *entry; + struct irte *irte = &data->irte_entry; + struct msi_msg *msg = &data->msi_entry; - cfg = irq_cfg(irq); + prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + switch (info->type) { + case X86_IRQ_ALLOC_TYPE_IOAPIC: + /* Set source-id of interrupt request */ + set_ioapic_sid(irte, info->ioapic_id); + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->ioapic_id, irte->present, irte->fpd, + irte->dst_mode, irte->redir_hint, + irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, + irte->sid, irte->sq, irte->svt); - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); + entry = (struct IR_IO_APIC_route_entry 
*)info->ioapic_entry; + info->ioapic_entry = NULL; + memset(entry, 0, sizeof(*entry)); + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = info->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = info->ioapic_trigger; + entry->polarity = info->ioapic_polarity; + if (info->ioapic_trigger) + entry->mask = 1; /* Mask level triggered irqs. */ + break; - prepare_irte(&irte, cfg->vector, dest); + case X86_IRQ_ALLOC_TYPE_HPET: + case X86_IRQ_ALLOC_TYPE_MSI: + case X86_IRQ_ALLOC_TYPE_MSIX: + if (info->type == X86_IRQ_ALLOC_TYPE_HPET) + set_hpet_sid(irte, info->hpet_id); + else + set_msi_sid(irte, info->msi_dev); - /* Set source-id of interrupt request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(index) | + MSI_ADDR_IR_INDEX2(index); + break; - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); + default: + BUG_ON(1); + break; + } } -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. 
- */ -static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec) +static void intel_free_irq_resources(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) { - struct intel_iommu *iommu; - int index; + struct irq_data *irq_data; + struct intel_ir_data *data; + struct irq_2_iommu *irq_iommu; + unsigned long flags; + int i; - down_read(&dmar_global_lock); - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - index = -ENOENT; - } else { - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", - nvec, pci_name(dev)); - index = -ENOSPC; + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + irq_iommu = &data->irq_2_iommu; + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + clear_entries(irq_iommu); + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + irq_domain_reset_irq_data(irq_data); + kfree(data); } } - up_read(&dmar_global_lock); - - return index; } -static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) +static int intel_irq_remapping_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *arg) { - struct intel_iommu *iommu; - int ret = -ENOENT; + struct intel_iommu *iommu = domain->host_data; + struct irq_alloc_info *info = arg; + struct intel_ir_data *data, *ird; + struct irq_data *irq_data; + struct irq_cfg *irq_cfg; + int i, ret, index; + + if (!info || !iommu) + return -EINVAL; + if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI && + info->type != X86_IRQ_ALLOC_TYPE_MSIX) + return -EINVAL; + + /* + * With IRQ remapping enabled, don't need contiguous CPU vectors + * to support multiple MSI interrupts. 
+ */ + if (info->type == X86_IRQ_ALLOC_TYPE_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_free_parent; down_read(&dmar_global_lock); - iommu = map_dev_to_ir(pdev); - if (iommu) { - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); - ret = 0; - } + index = alloc_irte(iommu, virq, &data->irq_2_iommu, nr_irqs); up_read(&dmar_global_lock); + if (index < 0) { + pr_warn("Failed to allocate IRTE\n"); + kfree(data); + goto out_free_parent; + } + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + irq_cfg = irqd_cfg(irq_data); + if (!irq_data || !irq_cfg) { + ret = -EINVAL; + goto out_free_data; + } + + if (i > 0) { + ird = kzalloc(sizeof(*ird), GFP_KERNEL); + if (!ird) + goto out_free_data; + /* Initialize the common data */ + ird->irq_2_iommu = data->irq_2_iommu; + ird->irq_2_iommu.sub_handle = i; + } else { + ird = data; + } + + irq_data->hwirq = (index << 16) + i; + irq_data->chip_data = ird; + irq_data->chip = &intel_ir_chip; + intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); + irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); + } + return 0; + +out_free_data: + intel_free_irq_resources(domain, virq, i); +out_free_parent: + irq_domain_free_irqs_common(domain, virq, nr_irqs); return ret; } -static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id) +static void intel_irq_remapping_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) { - int ret = -1; - struct intel_iommu *iommu; - int index; - - down_read(&dmar_global_lock); - iommu = map_hpet_to_ir(id); - if (iommu) { - index = alloc_irte(iommu, irq, 1); - if (index >= 0) - ret = 0; - } - 
up_read(&dmar_global_lock); - - return ret; + intel_free_irq_resources(domain, virq, nr_irqs); + irq_domain_free_irqs_common(domain, virq, nr_irqs); } -struct irq_remap_ops intel_irq_remap_ops = { - .prepare = intel_prepare_irq_remapping, - .enable = intel_enable_irq_remapping, - .disable = disable_irq_remapping, - .reenable = reenable_irq_remapping, - .enable_faulting = enable_drhd_fault_handling, - .setup_ioapic_entry = intel_setup_ioapic_entry, - .set_affinity = intel_ioapic_set_affinity, - .free_irq = free_irte, - .compose_msi_msg = intel_compose_msi_msg, - .msi_alloc_irq = intel_msi_alloc_irq, - .msi_setup_irq = intel_msi_setup_irq, - .alloc_hpet_msi = intel_alloc_hpet_msi, +static void intel_irq_remapping_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct intel_ir_data *data = irq_data->chip_data; + + modify_irte(&data->irq_2_iommu, &data->irte_entry); +} + +static void intel_irq_remapping_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct intel_ir_data *data = irq_data->chip_data; + struct irte entry; + + memset(&entry, 0, sizeof(entry)); + modify_irte(&data->irq_2_iommu, &entry); +} + +static struct irq_domain_ops intel_ir_domain_ops = { + .alloc = intel_irq_remapping_alloc, + .free = intel_irq_remapping_free, + .activate = intel_irq_remapping_activate, + .deactivate = intel_irq_remapping_deactivate, }; /* @@ -1280,6 +1355,9 @@ int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool insert) return -EINVAL; if (!ecap_ir_support(iommu->ecap)) return 0; + if (irq_remapping_cap(IRQ_POSTING_CAP) && + !cap_pi_support(iommu->cap)) + return -EBUSY; if (insert) { if (!iommu->ir_table) diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 390079ee1350..2d9993062ded 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -21,21 +22,11 @@ int irq_remap_broken; int disable_sourceid_checking; int 
no_x2apic_optout; +int disable_irq_post = 1; + static int disable_irq_remap; static struct irq_remap_ops *remap_ops; -static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); -static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle); -static int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force); - -static bool irq_remapped(struct irq_cfg *cfg) -{ - return (cfg->remapped == 1); -} - static void irq_remapping_disable_io_apic(void) { /* @@ -49,117 +40,9 @@ static void irq_remapping_disable_io_apic(void) disconnect_bsp_APIC(0); } -static int do_setup_msi_irqs(struct pci_dev *dev, int nvec) -{ - int ret, sub_handle, nvec_pow2, index = 0; - unsigned int irq; - struct msi_desc *msidesc; - - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); - - irq = irq_alloc_hwirqs(nvec, dev_to_node(&dev->dev)); - if (irq == 0) - return -ENOSPC; - - nvec_pow2 = __roundup_pow_of_two(nvec); - for (sub_handle = 0; sub_handle < nvec; sub_handle++) { - if (!sub_handle) { - index = msi_alloc_remapped_irq(dev, irq, nvec_pow2); - if (index < 0) { - ret = index; - goto error; - } - } else { - ret = msi_setup_remapped_irq(dev, irq + sub_handle, - index, sub_handle); - if (ret < 0) - goto error; - } - ret = setup_msi_irq(dev, msidesc, irq, sub_handle); - if (ret < 0) - goto error; - } - return 0; - -error: - irq_free_hwirqs(irq, nvec); - - /* - * Restore altered MSI descriptor fields and prevent just destroyed - * IRQs from tearing down again in default_teardown_msi_irqs() - */ - msidesc->irq = 0; - - return ret; -} - -static int do_setup_msix_irqs(struct pci_dev *dev, int nvec) -{ - int node, ret, sub_handle, index = 0; - struct msi_desc *msidesc; - unsigned int irq; - - node = dev_to_node(&dev->dev); - sub_handle = 0; - - list_for_each_entry(msidesc, &dev->msi_list, list) { - - irq = irq_alloc_hwirq(node); - if (irq == 0) - return -1; - - if (sub_handle == 0) - ret = index = 
msi_alloc_remapped_irq(dev, irq, nvec); - else - ret = msi_setup_remapped_irq(dev, irq, index, sub_handle); - - if (ret < 0) - goto error; - - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) - goto error; - - sub_handle += 1; - irq += 1; - } - - return 0; - -error: - irq_free_hwirq(irq); - return ret; -} - -static int irq_remapping_setup_msi_irqs(struct pci_dev *dev, - int nvec, int type) -{ - if (type == PCI_CAP_ID_MSI) - return do_setup_msi_irqs(dev, nvec); - else - return do_setup_msix_irqs(dev, nvec); -} - -static void eoi_ioapic_pin_remapped(int apic, int pin, int vector) -{ - /* - * Intr-remapping uses pin number as the virtual vector - * in the RTE. Actual vector is programmed in - * intr-remapping table entry. Hence for the io-apic - * EOI we use the pin number. - */ - io_apic_eoi(apic, pin); -} - static void __init irq_remapping_modify_x86_ops(void) { x86_io_apic_ops.disable = irq_remapping_disable_io_apic; - x86_io_apic_ops.set_affinity = set_remapped_irq_affinity; - x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry; - x86_io_apic_ops.eoi_ioapic_pin = eoi_ioapic_pin_remapped; - x86_msi.setup_msi_irqs = irq_remapping_setup_msi_irqs; - x86_msi.setup_hpet_msi = setup_hpet_msi_remapped; - x86_msi.compose_msi_msg = compose_remapped_msi_msg; } static __init int setup_nointremap(char *str) @@ -198,6 +81,15 @@ void set_irq_remapping_broken(void) irq_remap_broken = 1; } +bool irq_remapping_cap(enum irq_remap_cap cap) +{ + if (!remap_ops || disable_irq_post) + return 0; + + return (remap_ops->capability & (1 << cap)); +} +EXPORT_SYMBOL_GPL(irq_remapping_cap); + int __init irq_remapping_prepare(void) { if (disable_irq_remap) @@ -254,113 +146,48 @@ int __init irq_remap_enable_fault_handling(void) return remap_ops->enable_faulting(); } -int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - if (!remap_ops->setup_ioapic_entry) - return -ENODEV; - - 
return remap_ops->setup_ioapic_entry(irq, entry, destination, - vector, attr); -} - -static int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - if (!config_enabled(CONFIG_SMP) || !remap_ops->set_affinity) - return 0; - - return remap_ops->set_affinity(data, mask, force); -} - -void free_remapped_irq(int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (irq_remapped(cfg) && remap_ops->free_irq) - remap_ops->free_irq(irq); -} - -void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (!irq_remapped(cfg)) - native_compose_msi_msg(pdev, irq, dest, msg, hpet_id); - else if (remap_ops->compose_msi_msg) - remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); -} - -static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) -{ - if (!remap_ops->msi_alloc_irq) - return -ENODEV; - - return remap_ops->msi_alloc_irq(pdev, irq, nvec); -} - -static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) -{ - if (!remap_ops->msi_setup_irq) - return -ENODEV; - - return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); -} - -int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - int ret; - - if (!remap_ops->alloc_hpet_msi) - return -ENODEV; - - ret = remap_ops->alloc_hpet_msi(irq, id); - if (ret) - return -EINVAL; - - return default_setup_hpet_msi(irq, id); -} - void panic_if_irq_remap(const char *msg) { if (irq_remapping_enabled) panic(msg); } -static void ir_ack_apic_edge(struct irq_data *data) +void ir_ack_apic_edge(struct irq_data *data) { ack_APIC_irq(); } -static void ir_ack_apic_level(struct irq_data *data) +/** + * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU + * device serving request @info + * @info: interrupt allocation information, used to identify the IOMMU device + * + * It's used to get 
parent irqdomain for HPET and IOAPIC irqdomains. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { - ack_APIC_irq(); - eoi_ioapic_irq(data->irq, irqd_cfg(data)); + if (!remap_ops || !remap_ops->get_ir_irq_domain) + return NULL; + + return remap_ops->get_ir_irq_domain(info); } -static void ir_print_prefix(struct irq_data *data, struct seq_file *p) +/** + * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info + * @info: interrupt allocation information, used to identify the IOMMU device + * + * There will be one PCI MSI/MSIX irqdomain associated with each interrupt + * remapping device, so this interface is used to retrieve the PCI MSI/MSIX + * irqdomain serving request @info. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) { - seq_printf(p, " IR-%s", data->chip->name); -} + if (!remap_ops || !remap_ops->get_irq_domain) + return NULL; -void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ - chip->irq_print_chip = ir_print_prefix; - chip->irq_ack = ir_ack_apic_edge; - chip->irq_eoi = ir_ack_apic_level; - chip->irq_set_affinity = x86_io_apic_ops.set_affinity; -} - -bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip) -{ - if (!irq_remapped(cfg)) - return false; - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - irq_remap_modify_chip_defaults(chip); - return true; + return remap_ops->get_irq_domain(info); } diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 7c70cc29ffe6..039c7af7b190 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -24,19 +24,22 @@ #ifdef CONFIG_IRQ_REMAP -struct IO_APIC_route_entry; -struct io_apic_irq_attr; struct irq_data; -struct cpumask; -struct pci_dev; struct msi_msg; +struct irq_domain; +struct irq_alloc_info; extern int irq_remap_broken; extern int 
disable_sourceid_checking; extern int no_x2apic_optout; extern int irq_remapping_enabled; +extern int disable_irq_post; + struct irq_remap_ops { + /* The supported capabilities */ + int capability; + /* Initializes hardware and makes it ready for remapping interrupts */ int (*prepare)(void); @@ -52,40 +55,23 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); - /* IO-APIC setup routine */ - int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); + /* Get the irqdomain associated the IOMMU device */ + struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *); - /* Set the CPU affinity of a remapped interrupt */ - int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, - bool force); - - /* Free an IRQ */ - int (*free_irq)(int); - - /* Create MSI msg to use for interrupt remapping */ - void (*compose_msi_msg)(struct pci_dev *, - unsigned int, unsigned int, - struct msi_msg *, u8); - - /* Allocate remapping resources for MSI */ - int (*msi_alloc_irq)(struct pci_dev *, int, int); - - /* Setup the remapped MSI irq */ - int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int); - - /* Setup interrupt remapping for an HPET MSI */ - int (*alloc_hpet_msi)(unsigned int, unsigned int); + /* Get the MSI irqdomain associated with the IOMMU device */ + struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); }; extern struct irq_remap_ops intel_irq_remap_ops; extern struct irq_remap_ops amd_iommu_irq_ops; +extern void ir_ack_apic_edge(struct irq_data *data); + #else /* CONFIG_IRQ_REMAP */ #define irq_remapping_enabled 0 #define irq_remap_broken 0 +#define disable_irq_post 1 #endif /* CONFIG_IRQ_REMAP */ diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 5e7559be222a..eb934b0242e0 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -20,7 +20,7 @@ #include "lg.h" /* 
Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ -static unsigned int syscall_vector = SYSCALL_VECTOR; +static unsigned int syscall_vector = IA32_SYSCALL_VECTOR; module_param(syscall_vector, uint, 0444); /* The address of the interrupt handler is split into two bits: */ @@ -333,8 +333,8 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) */ static bool could_be_syscall(unsigned int num) { - /* Normal Linux SYSCALL_VECTOR or reserved vector? */ - return num == SYSCALL_VECTOR || num == syscall_vector; + /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ + return num == IA32_SYSCALL_VECTOR || num == syscall_vector; } /* The syscall vector it wants must be unused by Host. */ @@ -351,7 +351,7 @@ bool check_syscall_vector(struct lguest *lg) int init_interrupts(void) { /* If they want some strange system call vector, reserve it now */ - if (syscall_vector != SYSCALL_VECTOR) { + if (syscall_vector != IA32_SYSCALL_VECTOR) { if (test_bit(syscall_vector, used_vectors) || vector_used_by_percpu_irq(syscall_vector)) { printk(KERN_ERR "lg: couldn't reserve syscall %u\n", @@ -366,7 +366,7 @@ int init_interrupts(void) void free_interrupts(void) { - if (syscall_vector != SYSCALL_VECTOR) + if (syscall_vector != IA32_SYSCALL_VECTOR) clear_bit(syscall_vector, used_vectors); } diff --git a/drivers/media/pci/ivtv/Kconfig b/drivers/media/pci/ivtv/Kconfig index dd6ee57e3a4c..6e5867c57305 100644 --- a/drivers/media/pci/ivtv/Kconfig +++ b/drivers/media/pci/ivtv/Kconfig @@ -57,5 +57,8 @@ config VIDEO_FB_IVTV This is used in the Hauppauge PVR-350 card. There is a driver homepage at . + In order to use this module, you will need to boot with PAT disabled + on x86 systems, using the nopat kernel parameter. + To compile this driver as a module, choose M here: the module will be called ivtvfb. 
diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 9ff1230192e8..4cb365d4ffdc 100644 --- a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -44,8 +44,8 @@ #include #include -#ifdef CONFIG_MTRR -#include +#ifdef CONFIG_X86_64 +#include #endif #include "ivtv-driver.h" @@ -155,12 +155,11 @@ struct osd_info { /* Buffer size */ u32 video_buffer_size; -#ifdef CONFIG_MTRR /* video_base rounded down as required by hardware MTRRs */ unsigned long fb_start_aligned_physaddr; /* video_base rounded up as required by hardware MTRRs */ unsigned long fb_end_aligned_physaddr; -#endif + int wc_cookie; /* Store the buffer offset */ int set_osd_coords_x; @@ -1099,6 +1098,8 @@ static int ivtvfb_init_vidmode(struct ivtv *itv) static int ivtvfb_init_io(struct ivtv *itv) { struct osd_info *oi = itv->osd_info; + /* Find the largest power of two that maps the whole buffer */ + int size_shift = 31; mutex_lock(&itv->serialize_lock); if (ivtv_init_on_first_open(itv)) { @@ -1132,29 +1133,16 @@ static int ivtvfb_init_io(struct ivtv *itv) oi->video_pbase, oi->video_vbase, oi->video_buffer_size / 1024); -#ifdef CONFIG_MTRR - { - /* Find the largest power of two that maps the whole buffer */ - int size_shift = 31; - - while (!(oi->video_buffer_size & (1 << size_shift))) { - size_shift--; - } - size_shift++; - oi->fb_start_aligned_physaddr = oi->video_pbase & ~((1 << size_shift) - 1); - oi->fb_end_aligned_physaddr = oi->video_pbase + oi->video_buffer_size; - oi->fb_end_aligned_physaddr += (1 << size_shift) - 1; - oi->fb_end_aligned_physaddr &= ~((1 << size_shift) - 1); - if (mtrr_add(oi->fb_start_aligned_physaddr, - oi->fb_end_aligned_physaddr - oi->fb_start_aligned_physaddr, - MTRR_TYPE_WRCOMB, 1) < 0) { - IVTVFB_INFO("disabled mttr\n"); - oi->fb_start_aligned_physaddr = 0; - oi->fb_end_aligned_physaddr = 0; - } - } -#endif - + while (!(oi->video_buffer_size & (1 << size_shift))) + size_shift--; + size_shift++; + 
oi->fb_start_aligned_physaddr = oi->video_pbase & ~((1 << size_shift) - 1); + oi->fb_end_aligned_physaddr = oi->video_pbase + oi->video_buffer_size; + oi->fb_end_aligned_physaddr += (1 << size_shift) - 1; + oi->fb_end_aligned_physaddr &= ~((1 << size_shift) - 1); + oi->wc_cookie = arch_phys_wc_add(oi->fb_start_aligned_physaddr, + oi->fb_end_aligned_physaddr - + oi->fb_start_aligned_physaddr); /* Blank the entire osd. */ memset_io(oi->video_vbase, 0, oi->video_buffer_size); @@ -1172,14 +1160,7 @@ static void ivtvfb_release_buffers (struct ivtv *itv) /* Release pseudo palette */ kfree(oi->ivtvfb_info.pseudo_palette); - -#ifdef CONFIG_MTRR - if (oi->fb_end_aligned_physaddr) { - mtrr_del(-1, oi->fb_start_aligned_physaddr, - oi->fb_end_aligned_physaddr - oi->fb_start_aligned_physaddr); - } -#endif - + arch_phys_wc_del(oi->wc_cookie); kfree(oi); itv->osd_info = NULL; } @@ -1284,6 +1265,13 @@ static int __init ivtvfb_init(void) int registered = 0; int err; +#ifdef CONFIG_X86_64 + if (WARN(pat_enabled(), + "ivtvfb needs PAT disabled, boot with nopat kernel parameter\n")) { + return -ENODEV; + } +#endif + if (ivtvfb_card_id < -1 || ivtvfb_card_id >= IVTV_MAX_CARDS) { printk(KERN_ERR "ivtvfb: ivtvfb_card_id parameter is out of range (valid range: -1 - %d)\n", IVTV_MAX_CARDS - 1); diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index ea091bc5ff09..1e09243d5449 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "hyperv_net.h" diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 9118cea91882..35a482d526d9 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "hyperv_net.h" diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index a94dd2c4183a..7eb4109a3df4 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -23,20 
+23,11 @@ */ static DEFINE_SPINLOCK(ht_irq_lock); -struct ht_irq_cfg { - struct pci_dev *dev; - /* Update callback used to cope with buggy hardware */ - ht_irq_update_t *update; - unsigned pos; - unsigned idx; - struct ht_irq_msg msg; -}; - - void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); unsigned long flags; + spin_lock_irqsave(&ht_irq_lock, flags); if (cfg->msg.address_lo != msg->address_lo) { pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx); @@ -55,6 +46,7 @@ void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg) { struct ht_irq_cfg *cfg = irq_get_handler_data(irq); + *msg = cfg->msg; } @@ -86,7 +78,6 @@ void unmask_ht_irq(struct irq_data *data) */ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) { - struct ht_irq_cfg *cfg; int max_irq, pos, irq; unsigned long flags; u32 data; @@ -105,29 +96,9 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) if (idx > max_irq) return -EINVAL; - cfg = kmalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return -ENOMEM; - - cfg->dev = dev; - cfg->update = update; - cfg->pos = pos; - cfg->idx = 0x10 + (idx * 2); - /* Initialize msg to a value that will never match the first write. 
*/ - cfg->msg.address_lo = 0xffffffff; - cfg->msg.address_hi = 0xffffffff; - - irq = irq_alloc_hwirq(dev_to_node(&dev->dev)); - if (!irq) { - kfree(cfg); - return -EBUSY; - } - irq_set_handler_data(irq, cfg); - - if (arch_setup_ht_irq(irq, dev) < 0) { - ht_destroy_irq(irq); - return -EBUSY; - } + irq = arch_setup_ht_irq(idx, pos, dev, update); + if (irq > 0) + dev_dbg(&dev->dev, "irq %d for HT\n", irq); return irq; } @@ -158,13 +129,6 @@ EXPORT_SYMBOL(ht_create_irq); */ void ht_destroy_irq(unsigned int irq) { - struct ht_irq_cfg *cfg; - - cfg = irq_get_handler_data(irq); - irq_set_chip(irq, NULL); - irq_set_handler_data(irq, NULL); - irq_free_hwirq(irq); - - kfree(cfg); + arch_teardown_ht_irq(irq); } EXPORT_SYMBOL(ht_destroy_irq); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index c6dc1dfd25d5..2890ad7cf7c6 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -819,13 +819,6 @@ static void quirk_amd_ioapic(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VIPER_7410, quirk_amd_ioapic); - -static void quirk_ioapic_rmw(struct pci_dev *dev) -{ - if (dev->devfn == 0 && dev->bus->number == 0) - sis_apic_bug = 1; -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SI, PCI_ANY_ID, quirk_ioapic_rmw); #endif /* CONFIG_X86_IO_APIC */ /* diff --git a/drivers/scsi/fnic/fnic_debugfs.c b/drivers/scsi/fnic/fnic_debugfs.c index 5980c10c734d..d6498fabe628 100644 --- a/drivers/scsi/fnic/fnic_debugfs.c +++ b/drivers/scsi/fnic/fnic_debugfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "fnic.h" static struct dentry *fnic_trace_debugfs_root; diff --git a/drivers/scsi/fnic/fnic_trace.c b/drivers/scsi/fnic/fnic_trace.c index 65a9bde26974..4e15c4bf0795 100644 --- a/drivers/scsi/fnic/fnic_trace.c +++ b/drivers/scsi/fnic/fnic_trace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "fnic_io.h" #include "fnic.h" diff --git a/drivers/video/fbdev/amifb.c b/drivers/video/fbdev/amifb.c index 
35f7900a0573..ee3a703acf23 100644 --- a/drivers/video/fbdev/amifb.c +++ b/drivers/video/fbdev/amifb.c @@ -3705,8 +3705,8 @@ default_chipset: * access the videomem with writethrough cache */ info->fix.smem_start = (u_long)ZTWO_PADDR(videomemory); - videomemory = (u_long)ioremap_writethrough(info->fix.smem_start, - info->fix.smem_len); + videomemory = (u_long)ioremap_wt(info->fix.smem_start, + info->fix.smem_len); if (!videomemory) { dev_warn(&pdev->dev, "Unable to map videomem cached writethrough\n"); diff --git a/drivers/video/fbdev/atafb.c b/drivers/video/fbdev/atafb.c index cb9ee2556850..d6ce613e12ad 100644 --- a/drivers/video/fbdev/atafb.c +++ b/drivers/video/fbdev/atafb.c @@ -3185,8 +3185,7 @@ int __init atafb_init(void) /* Map the video memory (physical address given) to somewhere * in the kernel address space. */ - external_screen_base = ioremap_writethrough(external_addr, - external_len); + external_screen_base = ioremap_wt(external_addr, external_len); if (external_vgaiobase) external_vgaiobase = (unsigned long)ioremap(external_vgaiobase, 0x10000); diff --git a/drivers/video/fbdev/hpfb.c b/drivers/video/fbdev/hpfb.c index a1b7e5fa9b09..9476d196f510 100644 --- a/drivers/video/fbdev/hpfb.c +++ b/drivers/video/fbdev/hpfb.c @@ -241,8 +241,8 @@ static int hpfb_init_one(unsigned long phys_base, unsigned long virt_base) fb_info.fix.line_length = fb_width; fb_height = (in_8(fb_regs + HPFB_FBHMSB) << 8) | in_8(fb_regs + HPFB_FBHLSB); fb_info.fix.smem_len = fb_width * fb_height; - fb_start = (unsigned long)ioremap_writethrough(fb_info.fix.smem_start, - fb_info.fix.smem_len); + fb_start = (unsigned long)ioremap_wt(fb_info.fix.smem_start, + fb_info.fix.smem_len); hpfb_defined.xres = (in_8(fb_regs + HPFB_DWMSB) << 8) | in_8(fb_regs + HPFB_DWLSB); hpfb_defined.yres = (in_8(fb_regs + HPFB_DHMSB) << 8) | in_8(fb_regs + HPFB_DHLSB); hpfb_defined.xres_virtual = hpfb_defined.xres; diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 
9db042304df3..f56094cfdeff 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -769,6 +769,14 @@ static inline void __iomem *ioremap_nocache(phys_addr_t offset, size_t size) } #endif +#ifndef ioremap_uc +#define ioremap_uc ioremap_uc +static inline void __iomem *ioremap_uc(phys_addr_t offset, size_t size) +{ + return ioremap_nocache(offset, size); +} +#endif + #ifndef ioremap_wc #define ioremap_wc ioremap_wc static inline void __iomem *ioremap_wc(phys_addr_t offset, size_t size) @@ -777,8 +785,17 @@ static inline void __iomem *ioremap_wc(phys_addr_t offset, size_t size) } #endif +#ifndef ioremap_wt +#define ioremap_wt ioremap_wt +static inline void __iomem *ioremap_wt(phys_addr_t offset, size_t size) +{ + return ioremap_nocache(offset, size); +} +#endif + #ifndef iounmap #define iounmap iounmap + static inline void iounmap(void __iomem *addr) { } diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h index 1b41011643a5..d8f8622fa044 100644 --- a/include/asm-generic/iomap.h +++ b/include/asm-generic/iomap.h @@ -66,6 +66,10 @@ extern void ioport_unmap(void __iomem *); #define ioremap_wc ioremap_nocache #endif +#ifndef ARCH_HAS_IOREMAP_WT +#define ioremap_wt ioremap_nocache +#endif + #ifdef CONFIG_PCI /* Destroy a virtual mapping cookie for a PCI BAR (memory or IO) */ struct pci_dev; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 39f1d6a2b04d..bd910ceaccfa 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -262,6 +262,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define pgprot_writecombine pgprot_noncached #endif +#ifndef pgprot_writethrough +#define pgprot_writethrough pgprot_noncached +#endif + #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 30624954dec5..e9bc9292bd3a 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -185,33 +185,85 @@ static inline int 
dmar_device_remove(void *handle) struct irte { union { + /* Shared between remapped and posted mode*/ struct { - __u64 present : 1, - fpd : 1, - dst_mode : 1, - redir_hint : 1, - trigger_mode : 1, - dlvry_mode : 3, - avail : 4, - __reserved_1 : 4, - vector : 8, - __reserved_2 : 8, - dest_id : 32; + __u64 present : 1, /* 0 */ + fpd : 1, /* 1 */ + __res0 : 6, /* 2 - 7 */ + avail : 4, /* 8 - 11 */ + __res1 : 3, /* 12 - 14 */ + pst : 1, /* 15 */ + vector : 8, /* 16 - 23 */ + __res2 : 40; /* 24 - 63 */ + }; + + /* Remapped mode */ + struct { + __u64 r_present : 1, /* 0 */ + r_fpd : 1, /* 1 */ + dst_mode : 1, /* 2 */ + redir_hint : 1, /* 3 */ + trigger_mode : 1, /* 4 */ + dlvry_mode : 3, /* 5 - 7 */ + r_avail : 4, /* 8 - 11 */ + r_res0 : 4, /* 12 - 15 */ + r_vector : 8, /* 16 - 23 */ + r_res1 : 8, /* 24 - 31 */ + dest_id : 32; /* 32 - 63 */ + }; + + /* Posted mode */ + struct { + __u64 p_present : 1, /* 0 */ + p_fpd : 1, /* 1 */ + p_res0 : 6, /* 2 - 7 */ + p_avail : 4, /* 8 - 11 */ + p_res1 : 2, /* 12 - 13 */ + p_urgent : 1, /* 14 */ + p_pst : 1, /* 15 */ + p_vector : 8, /* 16 - 23 */ + p_res2 : 14, /* 24 - 37 */ + pda_l : 26; /* 38 - 63 */ }; __u64 low; }; union { + /* Shared between remapped and posted mode*/ struct { - __u64 sid : 16, - sq : 2, - svt : 2, - __reserved_3 : 44; + __u64 sid : 16, /* 64 - 79 */ + sq : 2, /* 80 - 81 */ + svt : 2, /* 82 - 83 */ + __res3 : 44; /* 84 - 127 */ + }; + + /* Posted mode*/ + struct { + __u64 p_sid : 16, /* 64 - 79 */ + p_sq : 2, /* 80 - 81 */ + p_svt : 2, /* 82 - 83 */ + p_res3 : 12, /* 84 - 95 */ + pda_h : 32; /* 96 - 127 */ }; __u64 high; }; }; +static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src) +{ + dst->present = src->present; + dst->fpd = src->fpd; + dst->avail = src->avail; + dst->pst = src->pst; + dst->vector = src->vector; + dst->sid = src->sid; + dst->sq = src->sq; + dst->svt = src->svt; +} + +#define PDA_LOW_BIT 26 +#define PDA_HIGH_BIT 32 + enum { IRQ_REMAP_XAPIC_MODE, IRQ_REMAP_X2APIC_MODE, @@ 
-227,6 +279,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg); extern void dmar_msi_write(int irq, struct msi_msg *msg); extern int dmar_set_interrupt(struct intel_iommu *iommu); extern irqreturn_t dmar_fault(int irq, void *dev_id); -extern int arch_setup_dmar_msi(unsigned int irq); +extern int dmar_alloc_hwirq(int id, int node, void *arg); +extern void dmar_free_hwirq(int irq); #endif /* __DMAR_H__ */ diff --git a/include/linux/htirq.h b/include/linux/htirq.h index 70a1dbbf2093..d4a527e58434 100644 --- a/include/linux/htirq.h +++ b/include/linux/htirq.h @@ -1,24 +1,38 @@ #ifndef LINUX_HTIRQ_H #define LINUX_HTIRQ_H +struct pci_dev; +struct irq_data; + struct ht_irq_msg { u32 address_lo; /* low 32 bits of the ht irq message */ u32 address_hi; /* high 32 bits of the it irq message */ }; +typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg); + +struct ht_irq_cfg { + struct pci_dev *dev; + /* Update callback used to cope with buggy hardware */ + ht_irq_update_t *update; + unsigned pos; + unsigned idx; + struct ht_irq_msg msg; +}; + /* Helper functions.. 
*/ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); -struct irq_data; void mask_ht_irq(struct irq_data *data); void unmask_ht_irq(struct irq_data *data); /* The arch hook for getting things started */ -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update); +void arch_teardown_ht_irq(unsigned int irq); /* For drivers of buggy hardware */ -typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq, - struct ht_irq_msg *msg); int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update); #endif /* LINUX_HTIRQ_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a240e61a7700..3665cb331ca1 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -87,6 +87,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) /* * Decoding Capability Register */ +#define cap_pi_support(c) (((c) >> 59) & 1) #define cap_read_drain(c) (((c) >> 55) & 1) #define cap_write_drain(c) (((c) >> 54) & 1) #define cap_max_amask_val(c) (((c) >> 48) & 0x3f) @@ -299,6 +300,8 @@ struct q_inval { #define INTR_REMAP_TABLE_ENTRIES 65536 +struct irq_domain; + struct ir_table { struct irte *base; unsigned long *bitmap; @@ -348,6 +351,8 @@ struct intel_iommu { #ifdef CONFIG_IRQ_REMAP struct ir_table *ir_table; /* Interrupt remapping info */ + struct irq_domain *ir_domain; + struct irq_domain *ir_msi_domain; #endif struct device *iommu_dev; /* IOMMU-sysfs device */ int node; diff --git a/include/linux/io.h b/include/linux/io.h index 986f2bffea1e..fb5a99800e77 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -19,6 +19,7 @@ #define _LINUX_IO_H #include +#include #include #include @@ -111,6 +112,13 @@ static inline void arch_phys_wc_del(int handle) } #define arch_phys_wc_add arch_phys_wc_add +#ifndef arch_phys_wc_index +static inline int 
arch_phys_wc_index(int handle) +{ + return -1; +} +#define arch_phys_wc_index arch_phys_wc_index +#endif #endif #endif /* _LINUX_IO_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 62c6901cab55..48cb7d1aa58f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -327,6 +327,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt + * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @flags: chip specific flags */ struct irq_chip { @@ -369,6 +370,8 @@ struct irq_chip { int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); + int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + unsigned long flags; }; @@ -422,6 +425,7 @@ extern void irq_cpu_online(void); extern void irq_cpu_offline(void); extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); +extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); @@ -467,6 +471,8 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); +extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, + void *vcpu_info); #endif /* Handling of unhandled and spurious interrupts: */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dd1109fb241e..a113a8dc7438 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -93,6 +93,15 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline 
struct irq_desc *irq_data_to_desc(struct irq_data *data) +{ +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + return irq_to_desc(data->irq); +#else + return container_of(data, struct irq_desc, irq_data); +#endif +} + static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eb9a4ea394ab..55016b2151f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -949,6 +949,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return -ENOSYS; } +/** + * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @vcpu_info: The vcpu affinity information + */ +int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) +{ + data = data->parent_data; + if (data->chip->irq_set_vcpu_affinity) + return data->chip->irq_set_vcpu_affinity(data, vcpu_info); + + return -ENOSYS; +} + /** * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt * @data: Pointer to interrupt specific data diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index df553b0af936..b93d434e70bd 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -59,8 +59,6 @@ enum { #include "debug.h" #include "settings.h" -#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e68932bb308e..b1c7e8f46bfb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +/** + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data + * + * This function uses the 
vCPU specific data to set the vCPU + * affinity for an irq. The vCPU specific data is passed from + * outside, such as KVM. One example code path is as below: + * KVM -> IOMMU -> irq_set_vcpu_affinity(). + */ +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + struct irq_data *data; + struct irq_chip *chip; + int ret = -ENOSYS; + + if (!desc) + return -EINVAL; + + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + ret = chip->irq_set_vcpu_affinity(data, vcpu_info); + irq_put_desc_unlock(desc, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); + static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ca3f4aaff707..dd203e276b07 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,7 +7,7 @@ void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = idata->chip; + struct irq_chip *chip = desc->irq_data.chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; @@ -52,6 +52,13 @@ void irq_move_irq(struct irq_data *idata) { bool masked; + /* + * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled, + * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is + * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here. 
+ */ + idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); + if (likely(!irqd_is_setaffinity_pending(idata))) return; diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index 5b3add31f9f1..2c9082ba6137 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -212,5 +212,5 @@ EOF ) } -(ignore_list && syscall_list $(dirname $0)/../arch/x86/syscalls/syscall_32.tbl) | \ +(ignore_list && syscall_list $(dirname $0)/../arch/x86/entry/syscalls/syscall_32.tbl) | \ $* -E -x c - > /dev/null diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c index 6610bd096fc9..d17937b92331 100644 --- a/sound/pci/asihpi/hpioctl.c +++ b/sound/pci/asihpi/hpioctl.c @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef MODULE_FIRMWARE MODULE_FIRMWARE("asihpi/dsp5000.bin"); diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index 4039854560d0..e367b1a85d70 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -9,7 +9,7 @@ endif turbostat : turbostat.c CFLAGS += -Wall -CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/uapi/asm/msr-index.h"' +CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' %: %.c @mkdir -p $(BUILD_OUTPUT) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 9b0d8baf2934..caa60d56d7d1 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := sigreturn single_step_syscall +TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs TARGETS_C_32BIT_ONLY := entry_from_vm86 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) @@ -57,3 +57,6 @@ warn_32bit_failure: echo " yum install glibc-devel.*i686"; \ exit 0; endif + +# Some tests have additional dependencies. 
+sysret_ss_attrs_64: thunks.S diff --git a/tools/testing/selftests/x86/sysret_ss_attrs.c b/tools/testing/selftests/x86/sysret_ss_attrs.c new file mode 100644 index 000000000000..ce42d5a64009 --- /dev/null +++ b/tools/testing/selftests/x86/sysret_ss_attrs.c @@ -0,0 +1,112 @@ +/* + * sysret_ss_attrs.c - test that syscalls return valid hidden SS attributes + * Copyright (c) 2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * On AMD CPUs, SYSRET can return with a valid SS descriptor with + * the hidden attributes set to an unusable state. Make sure the kernel + * doesn't let this happen. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void *threadproc(void *ctx) +{ + /* + * Do our best to cause sleeps on this CPU to exit the kernel and + * re-enter with SS = 0. + */ + while (true) + ; + + return NULL; +} + +#ifdef __x86_64__ +extern unsigned long call32_from_64(void *stack, void (*function)(void)); + +asm (".pushsection .text\n\t" + ".code32\n\t" + "test_ss:\n\t" + "pushl $0\n\t" + "popl %eax\n\t" + "ret\n\t" + ".code64"); +extern void test_ss(void); +#endif + +int main() +{ + /* + * Start a busy-looping thread on the same CPU we're on. + * For simplicity, just stick everything to CPU 0. This will + * fail in some containers, but that's probably okay. 
+ */ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + printf("[WARN]\tsched_setaffinity failed\n"); + + pthread_t thread; + if (pthread_create(&thread, 0, threadproc, 0) != 0) + err(1, "pthread_create"); + +#ifdef __x86_64__ + unsigned char *stack32 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, + -1, 0); + if (stack32 == MAP_FAILED) + err(1, "mmap"); +#endif + + printf("[RUN]\tSyscalls followed by SS validation\n"); + + for (int i = 0; i < 1000; i++) { + /* + * Go to sleep and return using sysret (if we're 64-bit + * or we're 32-bit on AMD on a 64-bit kernel). On AMD CPUs, + * SYSRET doesn't fix up the cached SS descriptor, so the + * kernel needs some kind of workaround to make sure that we + * end the system call with a valid stack segment. This + * can be a confusing failure because the SS *selector* + * is the same regardless. + */ + usleep(2); + +#ifdef __x86_64__ + /* + * On 32-bit, just doing a syscall through glibc is enough + * to cause a crash if our cached SS descriptor is invalid. + * On 64-bit, it's not, so try extra hard. + */ + call32_from_64(stack32 + 4088, test_ss); +#endif + } + + printf("[OK]\tWe survived\n"); + +#ifdef __x86_64__ + munmap(stack32, 4096); +#endif + + return 0; +} diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S new file mode 100644 index 000000000000..ce8a995bbb17 --- /dev/null +++ b/tools/testing/selftests/x86/thunks.S @@ -0,0 +1,67 @@ +/* + * thunks.S - assembly helpers for mixed-bitness code + * Copyright (c) 2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * These are little helpers that make it easier to switch bitness on + * the fly. + */ + + .text + + .global call32_from_64 + .type call32_from_64, @function +call32_from_64: + // rdi: stack to use + // esi: function to call + + // Save registers + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushfq + + // Switch stacks + mov %rsp,(%rdi) + mov %rdi,%rsp + + // Switch to compatibility mode + pushq $0x23 /* USER32_CS */ + pushq $1f + lretq + +1: + .code32 + // Call the function + call *%esi + // Switch back to long mode + jmp $0x33,$1f + .code64 + +1: + // Restore the stack + mov (%rsp),%rsp + + // Restore registers + popfq + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + + ret + +.size call32_from_64, .-call32_from_64