linux/arch/arm/kernel/entry-armv.S

1343 lines
33 KiB
ArmAsm
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm/kernel/entry-armv.S
*
* Copyright (C) 1996,1997,1998 Russell King.
* ARM700 fix by Matthew Godbolt (linux-user@willothewisp.demon.co.uk)
* nommu support by Hyok S. Choi (hyok.choi@samsung.com)
*
* Low-level vector interface routines
*
* Note: there is a StrongARM bug in the STMIA rn, {regs}^ instruction
* that causes it to save wrong values... Be aware!
*/
#include <linux/init.h>
#include <asm/assembler.h>
#include <asm/memory.h>
#include <asm/glue-df.h>
#include <asm/glue-pf.h>
#include <asm/vfpmacros.h>
#include <asm/thread_notify.h>
#include <asm/unwind.h>
#include <asm/unistd.h>
#include <asm/tls.h>
#include <asm/system_info.h>
#include <asm/uaccess-asm.h>
#include "entry-header.S"
#include <asm/probes.h>
/*
* Interrupt handling.
*/
.macro irq_handler, from_user:req
mov r1, sp
ldr_this_cpu r2, irq_stack_ptr, r2, r3
.if \from_user == 0
@
@ If we took the interrupt while running in the kernel, we may already
@ be using the IRQ stack, so revert to the original value in that case.
@
subs r3, r2, r1 @ SP above bottom of IRQ stack?
rsbscs r3, r3, #THREAD_SIZE @ ... and below the top?
#ifdef CONFIG_VMAP_STACK
ldr_va r3, high_memory, cc @ End of the linear region
cmpcc r3, r1 @ Stack pointer was below it?
#endif
bcc 0f @ If not, switch to the IRQ stack
mov r0, r1
bl generic_handle_arch_irq
b 1f
0:
.endif
mov_l r0, generic_handle_arch_irq
bl call_with_stack
1:
.endm
.macro pabt_helper
@ PABORT handler takes pt_regs in r2, fault address in r4 and psr in r5
#ifdef MULTI_PABORT
ldr_va ip, processor, offset=PROCESSOR_PABT_FUNC
bl_r ip
#else
bl CPU_PABORT_HANDLER
#endif
.endm
.macro dabt_helper
@
@ Call the processor-specific abort handler:
@
@ r2 - pt_regs
@ r4 - aborted context pc
@ r5 - aborted context psr
@
@ The abort handler must return the aborted address in r0, and
@ the fault status register in r1. r9 must be preserved.
@
#ifdef MULTI_DABORT
ldr_va ip, processor, offset=PROCESSOR_DABT_FUNC
bl_r ip
#else
bl CPU_DABORT_HANDLER
#endif
.endm
.section .entry.text,"ax",%progbits
/*
* Invalid mode handlers
*/
.macro inv_entry, reason
sub sp, sp, #PT_REGS_SIZE
ARM( stmib sp, {r1 - lr} )
THUMB( stmia sp, {r0 - r12} )
THUMB( str sp, [sp, #S_SP] )
THUMB( str lr, [sp, #S_LR] )
mov r1, #\reason
.endm
__pabt_invalid:
inv_entry BAD_PREFETCH
b common_invalid
ENDPROC(__pabt_invalid)
__dabt_invalid:
inv_entry BAD_DATA
b common_invalid
ENDPROC(__dabt_invalid)
__irq_invalid:
inv_entry BAD_IRQ
b common_invalid
ENDPROC(__irq_invalid)
__und_invalid:
inv_entry BAD_UNDEFINSTR
@
@ XXX fall through to common_invalid
@
@
@ common_invalid - generic code for failed exception (re-entrant version of handlers)
@
common_invalid:
zero_fp
ldmia r0, {r4 - r6}
add r0, sp, #S_PC @ here for interlock avoidance
mov r7, #-1 @ "" "" "" ""
str r4, [sp] @ save preserved r0
stmia r0, {r5 - r7} @ lr_<exception>,
@ cpsr_<exception>, "old_r0"
mov r0, sp
b bad_mode
ENDPROC(__und_invalid)
/*
* SVC mode handlers
*/
#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
#define SPFIX(code...) code
#else
#define SPFIX(code...)
#endif
.macro svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1
UNWIND(.fnstart )
ARM: entry: rework stack realignment code in svc_entry The original Thumb-2 enablement patches updated the stack realignment code in svc_entry to work around the lack of a STMIB instruction in Thumb-2, by subtracting 4 from the frame size, inverting the sense of the misaligment check, and changing to a STMIA instruction and a final stack push of a 4 byte quantity that results in the stack becoming aligned at the end of the sequence. It also pushes and pops R0 to the stack in order to have a temp register that Thumb-2 allows in general purpose ALU instructions, as TST using SP is not permitted. Both are a bit problematic for vmap'ed stacks, as using the stack is only permitted after we decide that we did not overflow the stack, or have already switched to the overflow stack. As for the alignment check: the current approach creates a corner case where, if the initial SUB of SP ends up right at the start of the stack, we will end up subtracting another 8 bytes and overflowing it. This means we would need to add the overflow check *after* the SUB that deliberately misaligns the stack. However, this would require us to keep local state (i.e., whether we performed the subtract or not) across the overflow check, but without any GPRs or stack available. So let's switch to an approach where we don't use the stack, and where the alignment check of the stack pointer occurs in the usual way, as this is guaranteed not to result in overflow. This means we will be able to do the overflow check first. While at it, switch to R1 so the mode stack pointer in R0 remains accessible. Acked-by: Nicolas Pitre <nico@fluxnic.net> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Marc Zyngier <maz@kernel.org> Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
2021-10-17 23:23:47 +08:00
sub sp, sp, #(SVC_REGS_SIZE + \stack_hole)
THUMB( add sp, r1 ) @ get SP in a GPR without
THUMB( sub r1, sp, r1 ) @ using a temp register
.if \overflow_check
UNWIND(.save {r0 - pc} )
do_overflow_check (SVC_REGS_SIZE + \stack_hole)
.endif
#ifdef CONFIG_THUMB2_KERNEL
ARM: entry: rework stack realignment code in svc_entry The original Thumb-2 enablement patches updated the stack realignment code in svc_entry to work around the lack of a STMIB instruction in Thumb-2, by subtracting 4 from the frame size, inverting the sense of the misaligment check, and changing to a STMIA instruction and a final stack push of a 4 byte quantity that results in the stack becoming aligned at the end of the sequence. It also pushes and pops R0 to the stack in order to have a temp register that Thumb-2 allows in general purpose ALU instructions, as TST using SP is not permitted. Both are a bit problematic for vmap'ed stacks, as using the stack is only permitted after we decide that we did not overflow the stack, or have already switched to the overflow stack. As for the alignment check: the current approach creates a corner case where, if the initial SUB of SP ends up right at the start of the stack, we will end up subtracting another 8 bytes and overflowing it. This means we would need to add the overflow check *after* the SUB that deliberately misaligns the stack. However, this would require us to keep local state (i.e., whether we performed the subtract or not) across the overflow check, but without any GPRs or stack available. So let's switch to an approach where we don't use the stack, and where the alignment check of the stack pointer occurs in the usual way, as this is guaranteed not to result in overflow. This means we will be able to do the overflow check first. While at it, switch to R1 so the mode stack pointer in R0 remains accessible. Acked-by: Nicolas Pitre <nico@fluxnic.net> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Marc Zyngier <maz@kernel.org> Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
2021-10-17 23:23:47 +08:00
tst r1, #4 @ test stack pointer alignment
sub r1, sp, r1 @ restore original R1
sub sp, r1 @ restore original SP
#else
SPFIX( tst sp, #4 )
#endif
ARM: entry: rework stack realignment code in svc_entry The original Thumb-2 enablement patches updated the stack realignment code in svc_entry to work around the lack of a STMIB instruction in Thumb-2, by subtracting 4 from the frame size, inverting the sense of the misaligment check, and changing to a STMIA instruction and a final stack push of a 4 byte quantity that results in the stack becoming aligned at the end of the sequence. It also pushes and pops R0 to the stack in order to have a temp register that Thumb-2 allows in general purpose ALU instructions, as TST using SP is not permitted. Both are a bit problematic for vmap'ed stacks, as using the stack is only permitted after we decide that we did not overflow the stack, or have already switched to the overflow stack. As for the alignment check: the current approach creates a corner case where, if the initial SUB of SP ends up right at the start of the stack, we will end up subtracting another 8 bytes and overflowing it. This means we would need to add the overflow check *after* the SUB that deliberately misaligns the stack. However, this would require us to keep local state (i.e., whether we performed the subtract or not) across the overflow check, but without any GPRs or stack available. So let's switch to an approach where we don't use the stack, and where the alignment check of the stack pointer occurs in the usual way, as this is guaranteed not to result in overflow. This means we will be able to do the overflow check first. While at it, switch to R1 so the mode stack pointer in R0 remains accessible. Acked-by: Nicolas Pitre <nico@fluxnic.net> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Marc Zyngier <maz@kernel.org> Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
2021-10-17 23:23:47 +08:00
SPFIX( subne sp, sp, #4 )
ARM( stmib sp, {r1 - r12} )
THUMB( stmia sp, {r0 - r12} ) @ No STMIB in Thumb-2
ldmia r0, {r3 - r5}
ARM: entry: rework stack realignment code in svc_entry The original Thumb-2 enablement patches updated the stack realignment code in svc_entry to work around the lack of a STMIB instruction in Thumb-2, by subtracting 4 from the frame size, inverting the sense of the misaligment check, and changing to a STMIA instruction and a final stack push of a 4 byte quantity that results in the stack becoming aligned at the end of the sequence. It also pushes and pops R0 to the stack in order to have a temp register that Thumb-2 allows in general purpose ALU instructions, as TST using SP is not permitted. Both are a bit problematic for vmap'ed stacks, as using the stack is only permitted after we decide that we did not overflow the stack, or have already switched to the overflow stack. As for the alignment check: the current approach creates a corner case where, if the initial SUB of SP ends up right at the start of the stack, we will end up subtracting another 8 bytes and overflowing it. This means we would need to add the overflow check *after* the SUB that deliberately misaligns the stack. However, this would require us to keep local state (i.e., whether we performed the subtract or not) across the overflow check, but without any GPRs or stack available. So let's switch to an approach where we don't use the stack, and where the alignment check of the stack pointer occurs in the usual way, as this is guaranteed not to result in overflow. This means we will be able to do the overflow check first. While at it, switch to R1 so the mode stack pointer in R0 remains accessible. Acked-by: Nicolas Pitre <nico@fluxnic.net> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Marc Zyngier <maz@kernel.org> Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
2021-10-17 23:23:47 +08:00
add r7, sp, #S_SP @ here for interlock avoidance
mov r6, #-1 @ "" "" "" ""
ARM: entry: rework stack realignment code in svc_entry The original Thumb-2 enablement patches updated the stack realignment code in svc_entry to work around the lack of a STMIB instruction in Thumb-2, by subtracting 4 from the frame size, inverting the sense of the misaligment check, and changing to a STMIA instruction and a final stack push of a 4 byte quantity that results in the stack becoming aligned at the end of the sequence. It also pushes and pops R0 to the stack in order to have a temp register that Thumb-2 allows in general purpose ALU instructions, as TST using SP is not permitted. Both are a bit problematic for vmap'ed stacks, as using the stack is only permitted after we decide that we did not overflow the stack, or have already switched to the overflow stack. As for the alignment check: the current approach creates a corner case where, if the initial SUB of SP ends up right at the start of the stack, we will end up subtracting another 8 bytes and overflowing it. This means we would need to add the overflow check *after* the SUB that deliberately misaligns the stack. However, this would require us to keep local state (i.e., whether we performed the subtract or not) across the overflow check, but without any GPRs or stack available. So let's switch to an approach where we don't use the stack, and where the alignment check of the stack pointer occurs in the usual way, as this is guaranteed not to result in overflow. This means we will be able to do the overflow check first. While at it, switch to R1 so the mode stack pointer in R0 remains accessible. Acked-by: Nicolas Pitre <nico@fluxnic.net> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Marc Zyngier <maz@kernel.org> Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
2021-10-17 23:23:47 +08:00
add r2, sp, #(SVC_REGS_SIZE + \stack_hole)
SPFIX( addne r2, r2, #4 )
str r3, [sp] @ save the "real" r0 copied
@ from the exception stack
mov r3, lr
@
@ We are now ready to fill in the remaining blanks on the stack:
@
@ r2 - sp_svc
@ r3 - lr_svc
@ r4 - lr_<exception>, already fixed up for correct return/restart
@ r5 - spsr_<exception>
@ r6 - orig_r0 (see pt_regs definition in ptrace.h)
@
stmia r7, {r2 - r6}
get_thread_info tsk
uaccess_entry tsk, r0, r1, r2, \uaccess
.if \trace
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
.endif
.endm
.align 5
__dabt_svc:
svc_entry uaccess=0
mov r2, sp
dabt_helper
THUMB( ldr r5, [sp, #S_PSR] ) @ potentially updated CPSR
svc_exit r5 @ return from exception
UNWIND(.fnend )
ENDPROC(__dabt_svc)
.align 5
__irq_svc:
svc_entry
irq_handler from_user=0
#ifdef CONFIG_PREEMPTION
ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
ldr r0, [tsk, #TI_FLAGS] @ get flags
teq r8, #0 @ if preempt count != 0
movne r0, #0 @ force flags to 0
tst r0, #_TIF_NEED_RESCHED
blne svc_preempt
#endif
svc_exit r5, irq = 1 @ return from exception
UNWIND(.fnend )
ENDPROC(__irq_svc)
.ltorg
#ifdef CONFIG_PREEMPTION
svc_preempt:
mov r8, lr
1: bl preempt_schedule_irq @ irq en/disable is done inside
ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
tst r0, #_TIF_NEED_RESCHED
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
reteq r8 @ go again
b 1b
#endif
__und_fault:
@ Correct the PC such that it is pointing at the instruction
@ which caused the fault. If the faulting instruction was ARM
@ the PC will be pointing at the next instruction, and have to
@ subtract 4. Otherwise, it is Thumb, and the PC will be
@ pointing at the second half of the Thumb instruction. We
@ have to subtract 2.
ldr r2, [r0, #S_PC]
sub r2, r2, r1
str r2, [r0, #S_PC]
b do_undefinstr
ENDPROC(__und_fault)
.align 5
__und_svc:
#ifdef CONFIG_KPROBES
@ If a kprobe is about to simulate a "stmdb sp..." instruction,
@ it obviously needs free stack space which then will belong to
@ the saved context.
svc_entry MAX_STACK_SIZE
#else
svc_entry
#endif
mov r1, #4 @ PC correction to apply
ARM: 9030/1: entry: omit FP emulation for UND exceptions taken in kernel mode There are a couple of problems with the exception entry code that deals with FP exceptions (which are reported as UND exceptions) when building the kernel in Thumb2 mode: - the conditional branch to vfp_kmode_exception in vfp_support_entry() may be out of range for its target, depending on how the linker decides to arrange the sections; - when the UND exception is taken in kernel mode, the emulation handling logic is entered via the 'call_fpe' label, which means we end up using the wrong value/mask pairs to match and detect the NEON opcodes. Since UND exceptions in kernel mode are unlikely to occur on a hot path (as opposed to the user mode version which is invoked for VFP support code and lazy restore), we can use the existing undef hook machinery for any kernel mode instruction emulation that is needed, including calling the existing vfp_kmode_exception() routine for unexpected cases. So drop the call to call_fpe, and instead, install an undef hook that will get called for NEON and VFP instructions that trigger an UND exception in kernel mode. While at it, make sure that the PC correction is accurate for the execution mode where the exception was taken, by checking the PSR Thumb bit. Cc: Dmitry Osipenko <digetx@gmail.com> Cc: Kees Cook <keescook@chromium.org> Fixes: eff8728fe698 ("vmlinux.lds.h: Add PGO and AutoFDO input sections") Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Linus Walleij <linus.walleij@linaro.org> Reviewed-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
2020-11-20 01:09:16 +08:00
THUMB( tst r5, #PSR_T_BIT ) @ exception taken in Thumb mode?
THUMB( movne r1, #2 ) @ if so, fix up PC correction
mov r0, sp @ struct pt_regs *regs
bl __und_fault
__und_svc_finish:
get_thread_info tsk
ldr r5, [sp, #S_PSR] @ Get SVC cpsr
svc_exit r5 @ return from exception
UNWIND(.fnend )
ENDPROC(__und_svc)
.align 5
__pabt_svc:
svc_entry
mov r2, sp @ regs
pabt_helper
svc_exit r5 @ return from exception
UNWIND(.fnend )
ENDPROC(__pabt_svc)
.align 5
__fiq_svc:
svc_entry trace=0
mov r0, sp @ struct pt_regs *regs
bl handle_fiq_as_nmi
svc_exit_via_fiq
UNWIND(.fnend )
ENDPROC(__fiq_svc)
/*
* Abort mode handlers
*/
@
@ Taking a FIQ in abort mode is similar to taking a FIQ in SVC mode
@ and reuses the same macros. However in abort mode we must also
@ save/restore lr_abt and spsr_abt to make nested aborts safe.
@
.align 5
__fiq_abt:
svc_entry trace=0
ARM( msr cpsr_c, #ABT_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( mov r0, #ABT_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( msr cpsr_c, r0 )
mov r1, lr @ Save lr_abt
mrs r2, spsr @ Save spsr_abt, abort is now safe
ARM( msr cpsr_c, #SVC_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( mov r0, #SVC_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( msr cpsr_c, r0 )
stmfd sp!, {r1 - r2}
add r0, sp, #8 @ struct pt_regs *regs
bl handle_fiq_as_nmi
ldmfd sp!, {r1 - r2}
ARM( msr cpsr_c, #ABT_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( mov r0, #ABT_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( msr cpsr_c, r0 )
mov lr, r1 @ Restore lr_abt, abort is unsafe
msr spsr_cxsf, r2 @ Restore spsr_abt
ARM( msr cpsr_c, #SVC_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( mov r0, #SVC_MODE | PSR_I_BIT | PSR_F_BIT )
THUMB( msr cpsr_c, r0 )
svc_exit_via_fiq
UNWIND(.fnend )
ENDPROC(__fiq_abt)
/*
* User mode handlers
*
* EABI note: sp_svc is always 64-bit aligned here, so should PT_REGS_SIZE
*/
#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) && (PT_REGS_SIZE & 7)
#error "sizeof(struct pt_regs) must be a multiple of 8"
#endif
.macro usr_entry, trace=1, uaccess=1
UNWIND(.fnstart )
UNWIND(.cantunwind ) @ don't unwind the user space
sub sp, sp, #PT_REGS_SIZE
ARM( stmib sp, {r1 - r12} )
THUMB( stmia sp, {r0 - r12} )
ATRAP( mrc p15, 0, r7, c1, c0, 0)
ATRAP( ldr_va r8, cr_alignment)
ldmia r0, {r3 - r5}
add r0, sp, #S_PC @ here for interlock avoidance
mov r6, #-1 @ "" "" "" ""
str r3, [sp] @ save the "real" r0 copied
@ from the exception stack
@
@ We are now ready to fill in the remaining blanks on the stack:
@
@ r4 - lr_<exception>, already fixed up for correct return/restart
@ r5 - spsr_<exception>
@ r6 - orig_r0 (see pt_regs definition in ptrace.h)
@
@ Also, separately save sp_usr and lr_usr
@
stmia r0, {r4 - r6}
ARM( stmdb r0, {sp, lr}^ )
THUMB( store_user_sp_lr r0, r1, S_SP - S_PC )
.if \uaccess
uaccess_disable ip
.endif
@ Enable the alignment trap while in kernel mode
ATRAP( teq r8, r7)
ATRAP( mcrne p15, 0, r8, c1, c0, 0)
reload_current r7, r8
@
@ Clear FP to mark the first stack frame
@
zero_fp
.if \trace
ARM: fix lockdep unannotated irqs-off warning Wolfram Sang reported an unannotated irqs-off warning from lockdep: WARNING: CPU: 0 PID: 282 at kernel/locking/lockdep.c:3557 check_flags+0x84/0x1f4() DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) CPU: 0 PID: 282 Comm: rcS Tainted: G W 4.1.0-00002-g5b076054611833 #179 Hardware name: Generic Emma Mobile EV2 (Flattened Device Tree) Backtrace: [<c0012c94>] (dump_backtrace) from [<c0012e3c>] (show_stack+0x18/0x1c) r6:c02dcc67 r5:00000009 r4:00000000 r3:00400000 [<c0012e24>] (show_stack) from [<c02510c8>] (dump_stack+0x20/0x28) [<c02510a8>] (dump_stack) from [<c0022c44>] (warn_slowpath_common+0x8c/0xb4) [<c0022bb8>] (warn_slowpath_common) from [<c0022cd8>] (warn_slowpath_fmt+0x38/0x40) r8:c780f470 r7:00000000 r6:00000000 r5:c03b0570 r4:c0b7ec04 [<c0022ca4>] (warn_slowpath_fmt) from [<c004cd38>] (check_flags+0x84/0x1f4) r3:c02e13d8 r2:c02dceaa [<c004ccb4>] (check_flags) from [<c0050e50>] (lock_acquire+0x4c/0xbc) r5:00000000 r4:60000193 [<c0050e04>] (lock_acquire) from [<c0256000>] (_raw_spin_lock+0x34/0x44) r9:000a8d5c r8:00000001 r7:c7806000 r6:c780f460 r5:c03b06a0 r4:c780f460 [<c0255fcc>] (_raw_spin_lock) from [<c005a8cc>] (handle_fasteoi_irq+0x20/0x11c) r4:c780f400 [<c005a8ac>] (handle_fasteoi_irq) from [<c0057a4c>] (generic_handle_irq+0x28/0x38) r6:00000000 r5:c03b038c r4:00000012 r3:c005a8ac [<c0057a24>] (generic_handle_irq) from [<c0057ae4>] (__handle_domain_irq+0x88/0xa8) r4:00000000 r3:00000026 [<c0057a5c>] (__handle_domain_irq) from [<c000a3cc>] (gic_handle_irq+0x40/0x58) r8:10c5347d r7:10c5347d r6:c35b1fb0 r5:c03a6304 r4:c8802000 r3:c35b1fb0 [<c000a38c>] (gic_handle_irq) from [<c0013bc8>] (__irq_usr+0x48/0x60) Exception stack(0xc35b1fb0 to 0xc35b1ff8) 1fa0: 00000061 00000000 000ab736 00000066 1fc0: 00000061 000aa1f0 000a8d54 000a8d54 000a8d88 000a8d5c 000a8cc8 000a8d68 1fe0: 72727272 bef8a528 000398c0 00031334 20000010 ffffffff r6:ffffffff r5:20000010 r4:00031334 r3:00000061 ---[ end trace cb88537fdc8fa202 ]--- possible reason: unannotated irqs-off. irq event stamp: 769 hardirqs last enabled at (769): [<c000f82c>] ret_fast_syscall+0x2c/0x54 hardirqs last disabled at (768): [<c000f80c>] ret_fast_syscall+0xc/0x54 softirqs last enabled at (0): [<c0020ec4>] copy_process.part.65+0x2e8/0x11dc softirqs last disabled at (0): [< (null)>] (null) His kernel configuration had: CONFIG_PROVE_LOCKING=y CONFIG_TRACE_IRQFLAGS=y but no IRQSOFF_TRACER, which means entry from userspace can result in the kernel seeing IRQs off without being notified of that change of state. Change the IRQSOFF ifdef in the usr_entry macro to TRACE_IRQFLAGS instead. Tested-by: Wolfram Sang <wsa+renesas@sang-engineering.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2015-07-03 19:42:36 +08:00
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
ct_user_exit save = 0
.endif
.endm
.macro kuser_cmpxchg_check
#if !defined(CONFIG_CPU_32v6K) && defined(CONFIG_KUSER_HELPERS)
#ifndef CONFIG_MMU
#warning "NPTL on non MMU needs fixing"
#else
@ Make sure our user space atomic helper is restarted
@ if it was interrupted in a critical region. Here we
@ perform a quick test inline since it should be false
@ 99.9999% of the time. The rest is done out of line.
ARM: 9015/2: Define the virtual space of KASan's shadow region Define KASAN_SHADOW_OFFSET,KASAN_SHADOW_START and KASAN_SHADOW_END for the Arm kernel address sanitizer. We are "stealing" lowmem (the 4GB addressable by a 32bit architecture) out of the virtual address space to use as shadow memory for KASan as follows: +----+ 0xffffffff | | | | |-> Static kernel image (vmlinux) BSS and page table | |/ +----+ PAGE_OFFSET | | | | |-> Loadable kernel modules virtual address space area | |/ +----+ MODULES_VADDR = KASAN_SHADOW_END | | | | |-> The shadow area of kernel virtual address. | |/ +----+-> TASK_SIZE (start of kernel space) = KASAN_SHADOW_START the | | shadow address of MODULES_VADDR | | | | | | | | |-> The user space area in lowmem. The kernel address | | | sanitizer do not use this space, nor does it map it. | | | | | | | | | | | | | |/ ------ 0 0 .. TASK_SIZE is the memory that can be used by shared userspace/kernelspace. It us used for userspace processes and for passing parameters and memory buffers in system calls etc. We do not need to shadow this area. KASAN_SHADOW_START: This value begins with the MODULE_VADDR's shadow address. It is the start of kernel virtual space. Since we have modules to load, we need to cover also that area with shadow memory so we can find memory bugs in modules. KASAN_SHADOW_END This value is the 0x100000000's shadow address: the mapping that would be after the end of the kernel memory at 0xffffffff. It is the end of kernel address sanitizer shadow area. It is also the start of the module area. KASAN_SHADOW_OFFSET: This value is used to map an address to the corresponding shadow address by the following formula: shadow_addr = (address >> 3) + KASAN_SHADOW_OFFSET; As you would expect, >> 3 is equal to dividing by 8, meaning each byte in the shadow memory covers 8 bytes of kernel memory, so one bit shadow memory per byte of kernel memory is used. The KASAN_SHADOW_OFFSET is provided in a Kconfig option depending on the VMSPLIT layout of the system: the kernel and userspace can split up lowmem in different ways according to needs, so we calculate the shadow offset depending on this. When kasan is enabled, the definition of TASK_SIZE is not an 8-bit rotated constant, so we need to modify the TASK_SIZE access code in the *.s file. The kernel and modules may use different amounts of memory, according to the VMSPLIT configuration, which in turn determines the PAGE_OFFSET. We use the following KASAN_SHADOW_OFFSETs depending on how the virtual memory is split up: - 0x1f000000 if we have 1G userspace / 3G kernelspace split: - The kernel address space is 3G (0xc0000000) - PAGE_OFFSET is then set to 0x40000000 so the kernel static image (vmlinux) uses addresses 0x40000000 .. 0xffffffff - On top of that we have the MODULES_VADDR which under the worst case (using ARM instructions) is PAGE_OFFSET - 16M (0x01000000) = 0x3f000000 so the modules use addresses 0x3f000000 .. 0x3fffffff - So the addresses 0x3f000000 .. 0xffffffff need to be covered with shadow memory. That is 0xc1000000 bytes of memory. - 1/8 of that is needed for its shadow memory, so 0x18200000 bytes of shadow memory is needed. We "steal" that from the remaining lowmem. - The KASAN_SHADOW_START becomes 0x26e00000, to KASAN_SHADOW_END at 0x3effffff. - Now we can calculate the KASAN_SHADOW_OFFSET for any kernel address as 0x3f000000 needs to map to the first byte of shadow memory and 0xffffffff needs to map to the last byte of shadow memory. Since: SHADOW_ADDR = (address >> 3) + KASAN_SHADOW_OFFSET 0x26e00000 = (0x3f000000 >> 3) + KASAN_SHADOW_OFFSET KASAN_SHADOW_OFFSET = 0x26e00000 - (0x3f000000 >> 3) KASAN_SHADOW_OFFSET = 0x26e00000 - 0x07e00000 KASAN_SHADOW_OFFSET = 0x1f000000 - 0x5f000000 if we have 2G userspace / 2G kernelspace split: - The kernel space is 2G (0x80000000) - PAGE_OFFSET is set to 0x80000000 so the kernel static image uses 0x80000000 .. 0xffffffff. - On top of that we have the MODULES_VADDR which under the worst case (using ARM instructions) is PAGE_OFFSET - 16M (0x01000000) = 0x7f000000 so the modules use addresses 0x7f000000 .. 0x7fffffff - So the addresses 0x7f000000 .. 0xffffffff need to be covered with shadow memory. That is 0x81000000 bytes of memory. - 1/8 of that is needed for its shadow memory, so 0x10200000 bytes of shadow memory is needed. We "steal" that from the remaining lowmem. - The KASAN_SHADOW_START becomes 0x6ee00000, to KASAN_SHADOW_END at 0x7effffff. - Now we can calculate the KASAN_SHADOW_OFFSET for any kernel address as 0x7f000000 needs to map to the first byte of shadow memory and 0xffffffff needs to map to the last byte of shadow memory. Since: SHADOW_ADDR = (address >> 3) + KASAN_SHADOW_OFFSET 0x6ee00000 = (0x7f000000 >> 3) + KASAN_SHADOW_OFFSET KASAN_SHADOW_OFFSET = 0x6ee00000 - (0x7f000000 >> 3) KASAN_SHADOW_OFFSET = 0x6ee00000 - 0x0fe00000 KASAN_SHADOW_OFFSET = 0x5f000000 - 0x9f000000 if we have 3G userspace / 1G kernelspace split, and this is the default split for ARM: - The kernel address space is 1GB (0x40000000) - PAGE_OFFSET is set to 0xc0000000 so the kernel static image uses 0xc0000000 .. 0xffffffff. - On top of that we have the MODULES_VADDR which under the worst case (using ARM instructions) is PAGE_OFFSET - 16M (0x01000000) = 0xbf000000 so the modules use addresses 0xbf000000 .. 0xbfffffff - So the addresses 0xbf000000 .. 0xffffffff need to be covered with shadow memory. That is 0x41000000 bytes of memory. - 1/8 of that is needed for its shadow memory, so 0x08200000 bytes of shadow memory is needed. We "steal" that from the remaining lowmem. - The KASAN_SHADOW_START becomes 0xb6e00000, to KASAN_SHADOW_END at 0xbfffffff. - Now we can calculate the KASAN_SHADOW_OFFSET for any kernel address as 0xbf000000 needs to map to the first byte of shadow memory and 0xffffffff needs to map to the last byte of shadow memory. Since: SHADOW_ADDR = (address >> 3) + KASAN_SHADOW_OFFSET 0xb6e00000 = (0xbf000000 >> 3) + KASAN_SHADOW_OFFSET KASAN_SHADOW_OFFSET = 0xb6e00000 - (0xbf000000 >> 3) KASAN_SHADOW_OFFSET = 0xb6e00000 - 0x17e00000 KASAN_SHADOW_OFFSET = 0x9f000000 - 0x8f000000 if we have 3G userspace / 1G kernelspace with full 1 GB low memory (VMSPLIT_3G_OPT): - The kernel address space is 1GB (0x40000000) - PAGE_OFFSET is set to 0xb0000000 so the kernel static image uses 0xb0000000 .. 0xffffffff. - On top of that we have the MODULES_VADDR which under the worst case (using ARM instructions) is PAGE_OFFSET - 16M (0x01000000) = 0xaf000000 so the modules use addresses 0xaf000000 .. 0xaffffff - So the addresses 0xaf000000 .. 0xffffffff need to be covered with shadow memory. That is 0x51000000 bytes of memory. - 1/8 of that is needed for its shadow memory, so 0x0a200000 bytes of shadow memory is needed. We "steal" that from the remaining lowmem. - The KASAN_SHADOW_START becomes 0xa4e00000, to KASAN_SHADOW_END at 0xaeffffff. - Now we can calculate the KASAN_SHADOW_OFFSET for any kernel address as 0xaf000000 needs to map to the first byte of shadow memory and 0xffffffff needs to map to the last byte of shadow memory. Since: SHADOW_ADDR = (address >> 3) + KASAN_SHADOW_OFFSET 0xa4e00000 = (0xaf000000 >> 3) + KASAN_SHADOW_OFFSET KASAN_SHADOW_OFFSET = 0xa4e00000 - (0xaf000000 >> 3) KASAN_SHADOW_OFFSET = 0xa4e00000 - 0x15e00000 KASAN_SHADOW_OFFSET = 0x8f000000 - The default value of 0xffffffff for KASAN_SHADOW_OFFSET is an error value. We should always match one of the above shadow offsets. When we do this, TASK_SIZE will sometimes get a bit odd values that will not fit into immediate mov assembly instructions. To account for this, we need to rewrite some assembly using TASK_SIZE like this: - mov r1, #TASK_SIZE + ldr r1, =TASK_SIZE or - cmp r4, #TASK_SIZE + ldr r0, =TASK_SIZE + cmp r4, r0 this is done to avoid the immediate #TASK_SIZE that need to fit into a limited number of bits. Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> Cc: Alexander Potapenko <glider@google.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport <rppt@linux.ibm.com> Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Ard Biesheuvel <ardb@kernel.org> # QEMU/KVM/mach-virt/LPAE/8G Tested-by: Florian Fainelli <f.fainelli@gmail.com> # Brahma SoCs Tested-by: Ahmad Fatoum <a.fatoum@pengutronix.de> # i.MX6Q Reported-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Abbott Liu <liuwenliang@huawei.com> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Linus Walleij <linus.walleij@linaro.org> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
2020-10-26 06:53:46 +08:00
ldr r0, =TASK_SIZE
cmp r4, r0
blhs kuser_cmpxchg64_fixup
#endif
#endif
.endm
.align 5
__dabt_usr:
usr_entry uaccess=0
kuser_cmpxchg_check
mov r2, sp
dabt_helper
b ret_from_exception
UNWIND(.fnend )
ENDPROC(__dabt_usr)
.align 5
__irq_usr:
usr_entry
kuser_cmpxchg_check
irq_handler from_user=1
get_thread_info tsk
mov why, #0
ARM: 6952/1: fix lockdep warning of "unannotated irqs-off" This patch fixes the lockdep warning of "unannotated irqs-off"[1]. After entering __irq_usr, arm core will disable interrupt automatically, but __irq_usr does not annotate the irq disable, so lockdep may complain the warning if it has chance to check this in irq handler. This patch adds trace_hardirqs_off in __irq_usr before entering irq_handler to handle the irq, also calls ret_to_user_from_irq to avoid calling disable_irq again. This is also a fix for irq off tracer. [1], lockdep warning log of "unannotated irqs-off" [ 13.804687] ------------[ cut here ]------------ [ 13.809570] WARNING: at kernel/lockdep.c:3335 check_flags+0x78/0x1d0() [ 13.816467] Modules linked in: [ 13.819732] Backtrace: [ 13.822357] [<c01cb42c>] (dump_backtrace+0x0/0x100) from [<c06abb14>] (dump_stack+0x20/0x24) [ 13.831268] r6:c07d8c2c r5:00000d07 r4:00000000 r3:00000000 [ 13.837280] [<c06abaf4>] (dump_stack+0x0/0x24) from [<c01ffc04>] (warn_slowpath_common+0x5c/0x74) [ 13.846649] [<c01ffba8>] (warn_slowpath_common+0x0/0x74) from [<c01ffc48>] (warn_slowpath_null+0x2c/0x34) [ 13.856781] r8:00000000 r7:00000000 r6:c18b8194 r5:60000093 r4:ef182000 [ 13.863708] r3:00000009 [ 13.866485] [<c01ffc1c>] (warn_slowpath_null+0x0/0x34) from [<c0237d84>] (check_flags+0x78/0x1d0) [ 13.875823] [<c0237d0c>] (check_flags+0x0/0x1d0) from [<c023afc8>] (lock_acquire+0x4c/0x150) [ 13.884704] [<c023af7c>] (lock_acquire+0x0/0x150) from [<c06af638>] (_raw_spin_lock+0x4c/0x84) [ 13.893798] [<c06af5ec>] (_raw_spin_lock+0x0/0x84) from [<c01f9a44>] (sched_ttwu_pending+0x58/0x8c) [ 13.903320] r6:ef92d040 r5:00000003 r4:c18b8180 [ 13.908233] [<c01f99ec>] (sched_ttwu_pending+0x0/0x8c) from [<c01f9a90>] (scheduler_ipi+0x18/0x1c) [ 13.917663] r6:ef183fb0 r5:00000003 r4:00000000 r3:00000001 [ 13.923645] [<c01f9a78>] (scheduler_ipi+0x0/0x1c) from [<c01bc458>] (do_IPI+0x9c/0xfc) [ 13.932006] [<c01bc3bc>] (do_IPI+0x0/0xfc) from [<c06b0888>] (__irq_usr+0x48/0xe0) [ 13.939971] Exception stack(0xef183fb0 to 0xef183ff8) [ 13.945281] 3fa0: ffffffc3 0001500c 00000001 0001500c [ 13.953948] 3fc0: 00000050 400b45f0 400d9000 00000000 00000001 400d9600 6474e552 bea05b3c [ 13.962585] 3fe0: 400d96c0 bea059c0 400b6574 400b65d8 20000010 ffffffff [ 13.969573] r6:00000403 r5:fa240100 r4:ffffffff r3:20000010 [ 13.975585] ---[ end trace efc4896ab0fb62cb ]--- [ 13.980468] possible reason: unannotated irqs-off. [ 13.985534] irq event stamp: 1610 [ 13.989044] hardirqs last enabled at (1610): [<c01c703c>] no_work_pending+0x8/0x2c [ 13.997131] hardirqs last disabled at (1609): [<c01c7024>] ret_slow_syscall+0xc/0x1c [ 14.005371] softirqs last enabled at (0): [<c01fe5e4>] copy_process+0x2cc/0xa24 [ 14.013183] softirqs last disabled at (0): [< (null)>] (null) Signed-off-by: Ming Lei <ming.lei@canonical.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2011-06-05 09:24:58 +08:00
b ret_to_user_from_irq
UNWIND(.fnend )
ENDPROC(__irq_usr)
.ltorg
.align 5
__und_usr:
usr_entry uaccess=0
mov r2, r4
mov r3, r5
@ r2 = regs->ARM_pc, which is either 2 or 4 bytes ahead of the
@ faulting instruction depending on Thumb mode.
@ r3 = regs->ARM_cpsr
@
@ The emulation code returns using r9 if it has emulated the
@ instruction, or the more conventional lr if we are to treat
@ this as a real undefined instruction
@
badr r9, ret_from_exception
@ IRQs must be enabled before attempting to read the instruction from
@ user space since that could cause a page/translation fault if the
@ page table was modified by another CPU.
enable_irq
tst r3, #PSR_T_BIT @ Thumb mode?
bne __und_usr_thumb
sub r4, r2, #4 @ ARM instr at LR - 4
1: ldrt r0, [r4]
ARM_BE8(rev r0, r0) @ little endian instruction
uaccess_disable ip
@ r0 = 32-bit ARM instruction which caused the exception
@ r2 = PC value for the following instruction (:= regs->ARM_pc)
@ r4 = PC value for the faulting instruction
@ lr = 32-bit undefined instruction function
badr lr, __und_usr_fault_32
b call_fpe
__und_usr_thumb:
@ Thumb instruction
sub r4, r2, #2 @ First half of thumb instr at LR - 2
#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7
/*
* Thumb-2 instruction handling. Note that because pre-v6 and >= v6 platforms
* can never be supported in a single kernel, this code is not applicable at
* all when __LINUX_ARM_ARCH__ < 6. This allows simplifying assumptions to be
* made about .arch directives.
*/
#if __LINUX_ARM_ARCH__ < 7
/* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */
ldr_va r5, cpu_architecture
cmp r5, #CPU_ARCH_ARMv7
blo __und_usr_fault_16 @ 16bit undefined instruction
/*
* The following code won't get run unless the running CPU really is v7, so
* coding round the lack of ldrht on older arches is pointless. Temporarily
* override the assembler target arch with the minimum required instead:
*/
.arch armv6t2
#endif
2: ldrht r5, [r4]
ARM_BE8(rev16 r5, r5) @ little endian instruction
cmp r5, #0xe800 @ 32bit instruction if xx != 0
blo __und_usr_fault_16_pan @ 16bit undefined instruction
3: ldrht r0, [r2]
ARM_BE8(rev16 r0, r0) @ little endian instruction
uaccess_disable ip
add r2, r2, #2 @ r2 is PC + 2, make it PC + 4
str r2, [sp, #S_PC] @ it's a 2x16bit instr, update
orr r0, r0, r5, lsl #16
badr lr, __und_usr_fault_32
@ r0 = the two 16-bit Thumb instructions which caused the exception
@ r2 = PC value for the following Thumb instruction (:= regs->ARM_pc)
@ r4 = PC value for the first 16-bit Thumb instruction
@ lr = 32bit undefined instruction function
#if __LINUX_ARM_ARCH__ < 7
/* If the target arch was overridden, change it back: */
#ifdef CONFIG_CPU_32v6K
.arch armv6k
#else
.arch armv6
#endif
#endif /* __LINUX_ARM_ARCH__ < 7 */
#else /* !(CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7) */
b __und_usr_fault_16
#endif
UNWIND(.fnend)
ENDPROC(__und_usr)
/*
* The out of line fixup for the ldrt instructions above.
*/
.pushsection .text.fixup, "ax"
.align 2
ARM: 8062/1: Modify ldrt fixup handler to re-execute the userspace instruction We will reach fixup handler when one thread(say cpu0) caused an undefined exception, while another thread(say cpu1) is unmmaping the page. Fixup handler returns to the next userspace instruction which has caused the undef execption, rather than going to the same instruction. ARM ARM says that after undefined exception, the PC will be pointing to the next instruction. ie +4 offset in case of ARM and +2 in case of Thumb And there is no correction offset passed to vector_stub in case of undef exception. File: arch/arm/kernel/entry-armv.S +1085 vector_stub und, UND_MODE During an undefined exception, in normal scenario(ie when ldrt instruction does not cause an abort) after resorting the context in VFP hardware, the PC is modified as show below before jumping to ret_from_exception which is in r9. File: arch/arm/vfp/vfphw.S +169 @ The context stored in the VFP hardware is up to date with this thread vfp_hw_state_valid: tst r1, #FPEXC_EX bne process_exception @ might as well handle the pending @ exception before retrying branch @ out before setting an FPEXC that @ stops us reading stuff VFPFMXR FPEXC, r1 @ Restore FPEXC last sub r2, r2, #4 @ Retry current instruction - if Thumb str r2, [sp, #S_PC] @ mode it's two 16-bit instructions, @ else it's one 32-bit instruction, so @ always subtract 4 from the following @ instruction address. But if ldrt results in an abort, we reach the fixup handler and return to ret_from_execption without correcting the pc. This patch modifes the fixup handler to re-execute the same instruction which caused undefined execption. Signed-off-by: Vinayak Menon <vinayakm.list@gmail.com> Signed-off-by: Arun KS <getarunks@gmail.com> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-05-19 18:43:00 +08:00
4: str r4, [sp, #S_PC] @ retry current instruction
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret r9
.popsection
.pushsection __ex_table,"a"
.long 1b, 4b
#if CONFIG_ARM_THUMB && __LINUX_ARM_ARCH__ >= 6 && CONFIG_CPU_V7
.long 2b, 4b
.long 3b, 4b
#endif
.popsection
/*
* Check whether the instruction is a co-processor instruction.
* If yes, we need to call the relevant co-processor handler.
*
* Note that we don't do a full check here for the co-processor
* instructions; all instructions with bit 27 set are well
* defined. The only instructions that should fault are the
* co-processor instructions. However, we have to watch out
* for the ARM6/ARM7 SWI bug.
*
* NEON is a special case that has to be handled here. Not all
* NEON instructions are co-processor instructions, so we have
* to make a special case of checking for them. Plus, there's
* five groups of them, so we have a table of mask/opcode pairs
* to check against, and if any match then we branch off into the
* NEON handler code.
*
* Emulators may wish to make use of the following registers:
* r0 = instruction opcode (32-bit ARM or two 16-bit Thumb)
* r2 = PC value to resume execution after successful emulation
* r9 = normal "successful" return address
* r10 = this threads thread_info structure
* lr = unrecognised instruction return address
* IRQs enabled, FIQs enabled.
*/
@
@ Fall-through from Thumb-2 __und_usr
@
#ifdef CONFIG_NEON
get_thread_info r10 @ get current thread
adr r6, .LCneon_thumb_opcodes
b 2f
#endif
call_fpe:
get_thread_info r10 @ get current thread
#ifdef CONFIG_NEON
adr r6, .LCneon_arm_opcodes
2: ldr r5, [r6], #4 @ mask value
ldr r7, [r6], #4 @ opcode bits matching in mask
cmp r5, #0 @ end mask?
beq 1f
and r8, r0, r5
cmp r8, r7 @ NEON instruction?
bne 2b
mov r7, #1
strb r7, [r10, #TI_USED_CP + 10] @ mark CP#10 as used
strb r7, [r10, #TI_USED_CP + 11] @ mark CP#11 as used
b do_vfp @ let VFP handler handle this
1:
#endif
tst r0, #0x08000000 @ only CDP/CPRT/LDC/STC have bit 27
tstne r0, #0x04000000 @ bit 26 set on both ARM and Thumb-2
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
reteq lr
and r8, r0, #0x00000f00 @ mask out CP number
mov r7, #1
add r6, r10, r8, lsr #8 @ add used_cp[] array offset first
strb r7, [r6, #TI_USED_CP] @ set appropriate used_cp[]
#ifdef CONFIG_IWMMXT
@ Test if we need to give access to iWMMXt coprocessors
ldr r5, [r10, #TI_FLAGS]
rsbs r7, r8, #(1 << 8) @ CP 0 or 1 only
movscs r7, r5, lsr #(TIF_USING_IWMMXT + 1)
bcs iwmmxt_task_enable
#endif
ARM( add pc, pc, r8, lsr #6 )
THUMB( lsr r8, r8, #6 )
THUMB( add pc, r8 )
nop
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret.w lr @ CP#0
W(b) do_fpe @ CP#1 (FPE)
W(b) do_fpe @ CP#2 (FPE)
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret.w lr @ CP#3
ret.w lr @ CP#4
ret.w lr @ CP#5
ret.w lr @ CP#6
ret.w lr @ CP#7
ret.w lr @ CP#8
ret.w lr @ CP#9
#ifdef CONFIG_VFP
W(b) do_vfp @ CP#10 (VFP)
W(b) do_vfp @ CP#11 (VFP)
#else
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret.w lr @ CP#10 (VFP)
ret.w lr @ CP#11 (VFP)
#endif
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret.w lr @ CP#12
ret.w lr @ CP#13
ret.w lr @ CP#14 (Debug)
ret.w lr @ CP#15 (Control)
#ifdef CONFIG_NEON
.align 6
.LCneon_arm_opcodes:
.word 0xfe000000 @ mask
.word 0xf2000000 @ opcode
.word 0xff100000 @ mask
.word 0xf4000000 @ opcode
.word 0x00000000 @ mask
.word 0x00000000 @ opcode
.LCneon_thumb_opcodes:
.word 0xef000000 @ mask
.word 0xef000000 @ opcode
.word 0xff100000 @ mask
.word 0xf9000000 @ opcode
.word 0x00000000 @ mask
.word 0x00000000 @ opcode
#endif
do_fpe:
add r10, r10, #TI_FPSTATE @ r10 = workspace
ldr_va pc, fp_enter, tmp=r4 @ Call FP module USR entry point
/*
* The FP module is called with these registers set:
* r0 = instruction
* r2 = PC+4
* r9 = normal "successful" return address
* r10 = FP workspace
* lr = unrecognised FP instruction return address
*/
.pushsection .data
.align 2
ENTRY(fp_enter)
.word no_fp
.popsection
ENTRY(no_fp)
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret lr
ENDPROC(no_fp)
__und_usr_fault_32:
mov r1, #4
b 1f
__und_usr_fault_16_pan:
uaccess_disable ip
__und_usr_fault_16:
mov r1, #2
1: mov r0, sp
badr lr, ret_from_exception
b __und_fault
ENDPROC(__und_usr_fault_32)
ENDPROC(__und_usr_fault_16)
.align 5
__pabt_usr:
usr_entry
mov r2, sp @ regs
pabt_helper
UNWIND(.fnend )
/* fall through */
/*
* This is the return code to user mode for abort handlers
*/
ENTRY(ret_from_exception)
UNWIND(.fnstart )
UNWIND(.cantunwind )
get_thread_info tsk
mov why, #0
b ret_to_user
UNWIND(.fnend )
ENDPROC(__pabt_usr)
ENDPROC(ret_from_exception)
.align 5
__fiq_usr:
usr_entry trace=0
kuser_cmpxchg_check
mov r0, sp @ struct pt_regs *regs
bl handle_fiq_as_nmi
get_thread_info tsk
restore_user_regs fast = 0, offset = 0
UNWIND(.fnend )
ENDPROC(__fiq_usr)
/*
* Register switch for ARMv3 and ARMv4 processors
* r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info
* previous and next are guaranteed not to be the same.
*/
ENTRY(__switch_to)
UNWIND(.fnstart )
UNWIND(.cantunwind )
add ip, r1, #TI_CPU_SAVE
ARM( stmia ip!, {r4 - sl, fp, sp, lr} ) @ Store most regs on stack
THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack
THUMB( str sp, [ip], #4 )
THUMB( str lr, [ip], #4 )
ldr r4, [r2, #TI_TP_VALUE]
ldr r5, [r2, #TI_TP_VALUE + 4]
#ifdef CONFIG_CPU_USE_DOMAINS
mrc p15, 0, r6, c3, c0, 0 @ Get domain register
str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register
ldr r6, [r2, #TI_CPU_DOMAIN]
#endif
switch_tls r1, r4, r5, r3, r7
#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_SMP) && \
!defined(CONFIG_STACKPROTECTOR_PER_TASK)
ldr r8, =__stack_chk_guard
.if (TSK_STACK_CANARY > IMM12_MASK)
add r9, r2, #TSK_STACK_CANARY & ~IMM12_MASK
ldr r9, [r9, #TSK_STACK_CANARY & IMM12_MASK]
.else
ldr r9, [r2, #TSK_STACK_CANARY & IMM12_MASK]
.endif
#endif
mov r7, r2 @ Preserve 'next'
#ifdef CONFIG_CPU_USE_DOMAINS
mcr p15, 0, r6, c3, c0, 0 @ Set domain register
#endif
mov r5, r0
add r4, r2, #TI_CPU_SAVE
ldr r0, =thread_notify_head
mov r1, #THREAD_NOTIFY_SWITCH
bl atomic_notifier_call_chain
#if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_SMP) && \
!defined(CONFIG_STACKPROTECTOR_PER_TASK)
str r9, [r8]
#endif
mov r0, r5
#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK)
set_current r7, r8
ldmia r4, {r4 - sl, fp, sp, pc} @ Load all regs saved previously
#else
mov r1, r7
ldmia r4, {r4 - sl, fp, ip, lr} @ Load all regs saved previously
#ifdef CONFIG_VMAP_STACK
@
@ Do a dummy read from the new stack while running from the old one so
@ that we can rely on do_translation_fault() to fix up any stale PMD
@ entries covering the vmalloc region.
@
ldr r2, [ip]
#endif
@ When CONFIG_THREAD_INFO_IN_TASK=n, the update of SP itself is what
@ effectuates the task switch, as that is what causes the observable
@ values of current and current_thread_info to change. When
@ CONFIG_THREAD_INFO_IN_TASK=y, setting current (and therefore
@ current_thread_info) is done explicitly, and the update of SP just
@ switches us to another stack, with few other side effects. In order
@ to prevent this distinction from causing any inconsistencies, let's
@ keep the 'set_current' call as close as we can to the update of SP.
set_current r1, r2
mov sp, ip
ret lr
#endif
UNWIND(.fnend )
ENDPROC(__switch_to)
#ifdef CONFIG_VMAP_STACK
.text
.align 2
__bad_stack:
@
@ We've just detected an overflow. We need to load the address of this
@ CPU's overflow stack into the stack pointer register. We have only one
@ scratch register so let's use a sequence of ADDs including one
@ involving the PC, and decorate them with PC-relative group
@ relocations. As these are ARM only, switch to ARM mode first.
@
@ We enter here with IP clobbered and its value stashed on the mode
@ stack.
@
THUMB( bx pc )
THUMB( nop )
THUMB( .arm )
ldr_this_cpu_armv6 ip, overflow_stack_ptr
str sp, [ip, #-4]! @ Preserve original SP value
mov sp, ip @ Switch to overflow stack
pop {ip} @ Original SP in IP
#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
mov ip, ip @ mov expected by unwinder
push {fp, ip, lr, pc} @ GCC flavor frame record
#else
str ip, [sp, #-8]! @ store original SP
push {fpreg, lr} @ Clang flavor frame record
#endif
UNWIND( ldr ip, [r0, #4] ) @ load exception LR
UNWIND( str ip, [sp, #12] ) @ store in the frame record
ldr ip, [r0, #12] @ reload IP
@ Store the original GPRs to the new stack.
svc_entry uaccess=0, overflow_check=0
UNWIND( .save {sp, pc} )
UNWIND( .save {fpreg, lr} )
UNWIND( .setfp fpreg, sp )
ldr fpreg, [sp, #S_SP] @ Add our frame record
@ to the linked list
#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
ldr r1, [fp, #4] @ reload SP at entry
add fp, fp, #12
#else
ldr r1, [fpreg, #8]
#endif
str r1, [sp, #S_SP] @ store in pt_regs
@ Stash the regs for handle_bad_stack
mov r0, sp
@ Time to die
bl handle_bad_stack
nop
UNWIND( .fnend )
ENDPROC(__bad_stack)
#endif
__INIT
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
/*
* User helpers.
*
* Each segment is 32-byte aligned and will be moved to the top of the high
* vector page. New segments (if ever needed) must be added in front of
* existing ones. This mechanism should be used only for things that are
* really small and justified, and not be abused freely.
*
* See Documentation/arm/kernel_user_helpers.rst for formal definitions.
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
*/
THUMB( .arm )
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
.macro usr_ret, reg
#ifdef CONFIG_ARM_THUMB
bx \reg
#else
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret \reg
#endif
.endm
.macro kuser_pad, sym, size
.if (. - \sym) & 3
.rept 4 - (. - \sym) & 3
.byte 0
.endr
.endif
.rept (\size - (. - \sym)) / 4
.word 0xe7fddef1
.endr
.endm
#ifdef CONFIG_KUSER_HELPERS
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
.align 5
.globl __kuser_helper_start
__kuser_helper_start:
/*
* Due to the length of some sequences, __kuser_cmpxchg64 spans 2 regular
* kuser "slots", therefore 0xffff0f80 is not used as a valid entry point.
*/
__kuser_cmpxchg64: @ 0xffff0f60
#if defined(CONFIG_CPU_32v6K)
stmfd sp!, {r4, r5, r6, r7}
ldrd r4, r5, [r0] @ load old val
ldrd r6, r7, [r1] @ load new val
smp_dmb arm
1: ldrexd r0, r1, [r2] @ load current val
eors r3, r0, r4 @ compare with oldval (1)
eorseq r3, r1, r5 @ compare with oldval (2)
strexdeq r3, r6, r7, [r2] @ store newval if eq
teqeq r3, #1 @ success?
beq 1b @ if no then retry
ARM: 6516/1: Allow SMP_ON_UP to work with Thumb-2 kernels. * __fixup_smp_on_up has been modified with support for the THUMB2_KERNEL case. For THUMB2_KERNEL only, fixups are split into halfwords in case of misalignment, since we can't rely on unaligned accesses working before turning the MMU on. No attempt is made to optimise the aligned case, since the number of fixups is typically small, and it seems best to keep the code as simple as possible. * Add a rotate in the fixup_smp code in order to support CPU_BIG_ENDIAN, as suggested by Nicolas Pitre. * Add an assembly-time sanity-check to ALT_UP() to ensure that the content really is the right size (4 bytes). (No check is done for ALT_SMP(). Possibly, this could be fixed by splitting the two uses ot ALT_SMP() (ALT_SMP...SMP_UP versus ALT_SMP...SMP_UP_B) into two macros. In the first case, ALT_SMP needs to expand to >= 4 bytes, not == 4.) * smp_mpidr.h (which implements ALT_SMP()/ALT_UP() manually due to macro limitations) has not been modified: the affected instruction (mov) has no 16-bit encoding, so the correct instruction size is satisfied in this case. * A "mode" parameter has been added to smp_dmb: smp_dmb arm @ assumes 4-byte instructions (for ARM code, e.g. kuser) smp_dmb @ uses W() to ensure 4-byte instructions for ALT_SMP() This avoids assembly failures due to use of W() inside smp_dmb, when assembling pure-ARM code in the vectors page. There might be a better way to achieve this. * Kconfig: make SMP_ON_UP depend on (!THUMB2_KERNEL || !BIG_ENDIAN) i.e., THUMB2_KERNEL is now supported, but only if !BIG_ENDIAN (The fixup code for Thumb-2 currently assumes little-endian order.) Tested using a single generic realview kernel on: ARM RealView PB-A8 (CONFIG_THUMB2_KERNEL={n,y}) ARM RealView PBX-A9 (SMP) Signed-off-by: Dave Martin <dave.martin@linaro.org> Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2010-12-01 22:39:23 +08:00
smp_dmb arm
rsbs r0, r3, #0 @ set returned val and C flag
ldmfd sp!, {r4, r5, r6, r7}
usr_ret lr
#elif !defined(CONFIG_SMP)
#ifdef CONFIG_MMU
/*
* The only thing that can break atomicity in this cmpxchg64
* implementation is either an IRQ or a data abort exception
* causing another process/thread to be scheduled in the middle of
* the critical sequence. The same strategy as for cmpxchg is used.
*/
stmfd sp!, {r4, r5, r6, lr}
ldmia r0, {r4, r5} @ load old val
ldmia r1, {r6, lr} @ load new val
1: ldmia r2, {r0, r1} @ load current val
eors r3, r0, r4 @ compare with oldval (1)
eorseq r3, r1, r5 @ compare with oldval (2)
2: stmiaeq r2, {r6, lr} @ store newval if eq
rsbs r0, r3, #0 @ set return val and C flag
ldmfd sp!, {r4, r5, r6, pc}
.text
kuser_cmpxchg64_fixup:
@ Called from kuser_cmpxchg_fixup.
@ r4 = address of interrupted insn (must be preserved).
@ sp = saved regs. r7 and r8 are clobbered.
@ 1b = first critical insn, 2b = last critical insn.
@ If r4 >= 1b and r4 <= 2b then saved pc_usr is set to 1b.
mov r7, #0xffff0fff
sub r7, r7, #(0xffff0fff - (0xffff0f60 + (1b - __kuser_cmpxchg64)))
subs r8, r4, r7
rsbscs r8, r8, #(2b - 1b)
strcs r7, [sp, #S_PC]
#if __LINUX_ARM_ARCH__ < 6
bcc kuser_cmpxchg32_fixup
#endif
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret lr
.previous
#else
#warning "NPTL on non MMU needs fixing"
mov r0, #-1
adds r0, r0, #0
usr_ret lr
#endif
#else
#error "incoherent kernel configuration"
#endif
kuser_pad __kuser_cmpxchg64, 64
__kuser_memory_barrier: @ 0xffff0fa0
ARM: 6516/1: Allow SMP_ON_UP to work with Thumb-2 kernels. * __fixup_smp_on_up has been modified with support for the THUMB2_KERNEL case. For THUMB2_KERNEL only, fixups are split into halfwords in case of misalignment, since we can't rely on unaligned accesses working before turning the MMU on. No attempt is made to optimise the aligned case, since the number of fixups is typically small, and it seems best to keep the code as simple as possible. * Add a rotate in the fixup_smp code in order to support CPU_BIG_ENDIAN, as suggested by Nicolas Pitre. * Add an assembly-time sanity-check to ALT_UP() to ensure that the content really is the right size (4 bytes). (No check is done for ALT_SMP(). Possibly, this could be fixed by splitting the two uses ot ALT_SMP() (ALT_SMP...SMP_UP versus ALT_SMP...SMP_UP_B) into two macros. In the first case, ALT_SMP needs to expand to >= 4 bytes, not == 4.) * smp_mpidr.h (which implements ALT_SMP()/ALT_UP() manually due to macro limitations) has not been modified: the affected instruction (mov) has no 16-bit encoding, so the correct instruction size is satisfied in this case. * A "mode" parameter has been added to smp_dmb: smp_dmb arm @ assumes 4-byte instructions (for ARM code, e.g. kuser) smp_dmb @ uses W() to ensure 4-byte instructions for ALT_SMP() This avoids assembly failures due to use of W() inside smp_dmb, when assembling pure-ARM code in the vectors page. There might be a better way to achieve this. * Kconfig: make SMP_ON_UP depend on (!THUMB2_KERNEL || !BIG_ENDIAN) i.e., THUMB2_KERNEL is now supported, but only if !BIG_ENDIAN (The fixup code for Thumb-2 currently assumes little-endian order.) Tested using a single generic realview kernel on: ARM RealView PB-A8 (CONFIG_THUMB2_KERNEL={n,y}) ARM RealView PBX-A9 (SMP) Signed-off-by: Dave Martin <dave.martin@linaro.org> Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2010-12-01 22:39:23 +08:00
smp_dmb arm
usr_ret lr
kuser_pad __kuser_memory_barrier, 32
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
__kuser_cmpxchg: @ 0xffff0fc0
#if __LINUX_ARM_ARCH__ < 6
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
#ifdef CONFIG_MMU
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
/*
* The only thing that can break atomicity in this cmpxchg
* implementation is either an IRQ or a data abort exception
* causing another process/thread to be scheduled in the middle
* of the critical sequence. To prevent this, code is added to
* the IRQ and data abort exception handlers to set the pc back
* to the beginning of the critical section if it is found to be
* within that critical section (see kuser_cmpxchg_fixup).
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
*/
1: ldr r3, [r2] @ load current val
subs r3, r3, r0 @ compare with oldval
2: streq r1, [r2] @ store newval if eq
rsbs r0, r3, #0 @ set return val and C flag
usr_ret lr
.text
kuser_cmpxchg32_fixup:
@ Called from kuser_cmpxchg_check macro.
@ r4 = address of interrupted insn (must be preserved).
@ sp = saved regs. r7 and r8 are clobbered.
@ 1b = first critical insn, 2b = last critical insn.
@ If r4 >= 1b and r4 <= 2b then saved pc_usr is set to 1b.
mov r7, #0xffff0fff
sub r7, r7, #(0xffff0fff - (0xffff0fc0 + (1b - __kuser_cmpxchg)))
subs r8, r4, r7
rsbscs r8, r8, #(2b - 1b)
strcs r7, [sp, #S_PC]
ARM: convert all "mov.* pc, reg" to "bx reg" for ARMv6+ ARMv6 and greater introduced a new instruction ("bx") which can be used to return from function calls. Recent CPUs perform better when the "bx lr" instruction is used rather than the "mov pc, lr" instruction, and this sequence is strongly recommended to be used by the ARM architecture manual (section A.4.1.1). We provide a new macro "ret" with all its variants for the condition code which will resolve to the appropriate instruction. Rather than doing this piecemeal, and miss some instances, change all the "mov pc" instances to use the new macro, with the exception of the "movs" instruction and the kprobes code. This allows us to detect the "mov pc, lr" case and fix it up - and also gives us the possibility of deploying this for other registers depending on the CPU selection. Reported-by: Will Deacon <will.deacon@arm.com> Tested-by: Stephen Warren <swarren@nvidia.com> # Tegra Jetson TK1 Tested-by: Robert Jarzmik <robert.jarzmik@free.fr> # mioa701_bootresume.S Tested-by: Andrew Lunn <andrew@lunn.ch> # Kirkwood Tested-by: Shawn Guo <shawn.guo@freescale.com> Tested-by: Tony Lindgren <tony@atomide.com> # OMAPs Tested-by: Gregory CLEMENT <gregory.clement@free-electrons.com> # Armada XP, 375, 385 Acked-by: Sekhar Nori <nsekhar@ti.com> # DaVinci Acked-by: Christoffer Dall <christoffer.dall@linaro.org> # kvm/hyp Acked-by: Haojian Zhuang <haojian.zhuang@gmail.com> # PXA3xx Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com> # Xen Tested-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> # ARMv7M Tested-by: Simon Horman <horms+renesas@verge.net.au> # Shmobile Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2014-06-30 23:29:12 +08:00
ret lr
.previous
#else
#warning "NPTL on non MMU needs fixing"
mov r0, #-1
adds r0, r0, #0
usr_ret lr
#endif
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
#else
ARM: 6516/1: Allow SMP_ON_UP to work with Thumb-2 kernels. * __fixup_smp_on_up has been modified with support for the THUMB2_KERNEL case. For THUMB2_KERNEL only, fixups are split into halfwords in case of misalignment, since we can't rely on unaligned accesses working before turning the MMU on. No attempt is made to optimise the aligned case, since the number of fixups is typically small, and it seems best to keep the code as simple as possible. * Add a rotate in the fixup_smp code in order to support CPU_BIG_ENDIAN, as suggested by Nicolas Pitre. * Add an assembly-time sanity-check to ALT_UP() to ensure that the content really is the right size (4 bytes). (No check is done for ALT_SMP(). Possibly, this could be fixed by splitting the two uses ot ALT_SMP() (ALT_SMP...SMP_UP versus ALT_SMP...SMP_UP_B) into two macros. In the first case, ALT_SMP needs to expand to >= 4 bytes, not == 4.) * smp_mpidr.h (which implements ALT_SMP()/ALT_UP() manually due to macro limitations) has not been modified: the affected instruction (mov) has no 16-bit encoding, so the correct instruction size is satisfied in this case. * A "mode" parameter has been added to smp_dmb: smp_dmb arm @ assumes 4-byte instructions (for ARM code, e.g. kuser) smp_dmb @ uses W() to ensure 4-byte instructions for ALT_SMP() This avoids assembly failures due to use of W() inside smp_dmb, when assembling pure-ARM code in the vectors page. There might be a better way to achieve this. * Kconfig: make SMP_ON_UP depend on (!THUMB2_KERNEL || !BIG_ENDIAN) i.e., THUMB2_KERNEL is now supported, but only if !BIG_ENDIAN (The fixup code for Thumb-2 currently assumes little-endian order.) Tested using a single generic realview kernel on: ARM RealView PB-A8 (CONFIG_THUMB2_KERNEL={n,y}) ARM RealView PBX-A9 (SMP) Signed-off-by: Dave Martin <dave.martin@linaro.org> Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2010-12-01 22:39:23 +08:00
smp_dmb arm
1: ldrex r3, [r2]
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
subs r3, r3, r0
strexeq r3, r1, [r2]
teqeq r3, #1
beq 1b
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
rsbs r0, r3, #0
/* beware -- each __kuser slot must be 8 instructions max */
ALT_SMP(b __kuser_memory_barrier)
ALT_UP(usr_ret lr)
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
#endif
kuser_pad __kuser_cmpxchg, 32
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
__kuser_get_tls: @ 0xffff0fe0
ldr r0, [pc, #(16 - 8)] @ read TLS, set in kuser_get_tls_init
usr_ret lr
mrc p15, 0, r0, c13, c0, 3 @ 0xffff0fe8 hardware TLS code
kuser_pad __kuser_get_tls, 16
.rep 3
.word 0 @ 0xffff0ff0 software TLS value, then
.endr @ pad up to __kuser_helper_version
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
__kuser_helper_version: @ 0xffff0ffc
.word ((__kuser_helper_end - __kuser_helper_start) >> 5)
.globl __kuser_helper_end
__kuser_helper_end:
#endif
THUMB( .thumb )
[PATCH] ARM: 2651/3: kernel helpers for NPTL support Patch from Nicolas Pitre This patch entirely reworks the kernel assistance for NPTL on ARM. In particular this provides an efficient way to retrieve the TLS value and perform atomic operations without any instruction emulation nor special system call. This even allows for pre ARMv6 binaries to be forward compatible with SMP systems without any penalty. The problematic and performance critical operations are performed through segment of kernel provided user code reachable from user space at a fixed address in kernel memory. Those fixed entry points are within the vector page so we basically get it for free as no extra memory page is required and nothing else may be mapped at that location anyway. This is different from (but doesn't preclude) a full blown VDSO implementation, however a VDSO would prevent some assembly tricks with constants that allows for efficient branching to those code segments. And since those code segments only use a few cycles before returning to user code, the overhead of a VDSO far call would add a significant overhead to such minimalistic operations. The ARM_NR_set_tls syscall also changed number. This is done for two reasons: 1) this patch changes the way the TLS value was previously meant to be retrieved, therefore we ensure whatever library using the old way gets fixed (they only exist in private tree at the moment since the NPTL work is still progressing). 2) the previous number was allocated in a range causing an undefined instruction trap on kernels not supporting that syscall and it was determined that allocating it in a range returning -ENOSYS would be much nicer for libraries trying to determine if the feature is present or not. Signed-off-by: Nicolas Pitre Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-04-30 05:08:33 +08:00
/*
* Vector stubs.
*
* This code is copied to 0xffff1000 so we can use branches in the
* vectors, rather than ldr's. Note that this code must not exceed
* a page size.
*
* Common stub entry macro:
* Enter in IRQ mode, spsr = SVC/USR CPSR, lr = SVC/USR PC
*
* SP points to a minimal amount of processor-private memory, the address
* of which is copied into r0 for the mode specific abort handler.
*/
.macro vector_stub, name, mode, correction=0
.align 5
#ifdef CONFIG_HARDEN_BRANCH_HISTORY
vector_bhb_bpiall_\name:
mcr p15, 0, r0, c7, c5, 6 @ BPIALL
@ isb not needed due to "movs pc, lr" in the vector stub
@ which gives a "context synchronisation".
#endif
vector_\name:
.if \correction
sub lr, lr, #\correction
.endif
@ Save r0, lr_<exception> (parent PC)
stmia sp, {r0, lr} @ save r0, lr
@ Save spsr_<exception> (parent CPSR)
.Lvec_\name:
mrs lr, spsr
str lr, [sp, #8] @ save spsr
@
@ Prepare for SVC32 mode. IRQs remain disabled.
@
mrs r0, cpsr
eor r0, r0, #(\mode ^ SVC_MODE | PSR_ISETSTATE)
msr spsr_cxsf, r0
@
@ the branch table must immediately follow this code
@
and lr, lr, #0x0f
THUMB( adr r0, 1f )
THUMB( ldr lr, [r0, lr, lsl #2] )
mov r0, sp
ARM( ldr lr, [pc, lr, lsl #2] )
movs pc, lr @ branch to handler in SVC mode
ENDPROC(vector_\name)
#ifdef CONFIG_HARDEN_BRANCH_HISTORY
.subsection 1
.align 5
vector_bhb_loop8_\name:
.if \correction
sub lr, lr, #\correction
.endif
@ Save r0, lr_<exception> (parent PC)
stmia sp, {r0, lr}
@ bhb workaround
mov r0, #8
3: W(b) . + 4
subs r0, r0, #1
bne 3b
dsb nsh
@ isb not needed due to "movs pc, lr" in the vector stub
@ which gives a "context synchronisation".
b .Lvec_\name
ENDPROC(vector_bhb_loop8_\name)
.previous
#endif
.align 2
@ handler addresses follow this label
1:
.endm
.section .stubs, "ax", %progbits
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
@ These need to remain at the start of the section so that
@ they are in range of the 'SWI' entries in the vector tables
@ located 4k down.
.L__vector_swi:
.word vector_swi
#ifdef CONFIG_HARDEN_BRANCH_HISTORY
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
.L__vector_bhb_loop8_swi:
.word vector_bhb_loop8_swi
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
.L__vector_bhb_bpiall_swi:
.word vector_bhb_bpiall_swi
#endif
vector_rst:
ARM( swi SYS_ERROR0 )
THUMB( svc #0 )
THUMB( nop )
b vector_und
/*
* Interrupt dispatcher
*/
vector_stub irq, IRQ_MODE, 4
.long __irq_usr @ 0 (USR_26 / USR_32)
.long __irq_invalid @ 1 (FIQ_26 / FIQ_32)
.long __irq_invalid @ 2 (IRQ_26 / IRQ_32)
.long __irq_svc @ 3 (SVC_26 / SVC_32)
.long __irq_invalid @ 4
.long __irq_invalid @ 5
.long __irq_invalid @ 6
.long __irq_invalid @ 7
.long __irq_invalid @ 8
.long __irq_invalid @ 9
.long __irq_invalid @ a
.long __irq_invalid @ b
.long __irq_invalid @ c
.long __irq_invalid @ d
.long __irq_invalid @ e
.long __irq_invalid @ f
/*
* Data abort dispatcher
* Enter in ABT mode, spsr = USR CPSR, lr = USR PC
*/
vector_stub dabt, ABT_MODE, 8
.long __dabt_usr @ 0 (USR_26 / USR_32)
.long __dabt_invalid @ 1 (FIQ_26 / FIQ_32)
.long __dabt_invalid @ 2 (IRQ_26 / IRQ_32)
.long __dabt_svc @ 3 (SVC_26 / SVC_32)
.long __dabt_invalid @ 4
.long __dabt_invalid @ 5
.long __dabt_invalid @ 6
.long __dabt_invalid @ 7
.long __dabt_invalid @ 8
.long __dabt_invalid @ 9
.long __dabt_invalid @ a
.long __dabt_invalid @ b
.long __dabt_invalid @ c
.long __dabt_invalid @ d
.long __dabt_invalid @ e
.long __dabt_invalid @ f
/*
* Prefetch abort dispatcher
* Enter in ABT mode, spsr = USR CPSR, lr = USR PC
*/
vector_stub pabt, ABT_MODE, 4
.long __pabt_usr @ 0 (USR_26 / USR_32)
.long __pabt_invalid @ 1 (FIQ_26 / FIQ_32)
.long __pabt_invalid @ 2 (IRQ_26 / IRQ_32)
.long __pabt_svc @ 3 (SVC_26 / SVC_32)
.long __pabt_invalid @ 4
.long __pabt_invalid @ 5
.long __pabt_invalid @ 6
.long __pabt_invalid @ 7
.long __pabt_invalid @ 8
.long __pabt_invalid @ 9
.long __pabt_invalid @ a
.long __pabt_invalid @ b
.long __pabt_invalid @ c
.long __pabt_invalid @ d
.long __pabt_invalid @ e
.long __pabt_invalid @ f
/*
* Undef instr entry dispatcher
* Enter in UND mode, spsr = SVC/USR CPSR, lr = SVC/USR PC
*/
vector_stub und, UND_MODE
.long __und_usr @ 0 (USR_26 / USR_32)
.long __und_invalid @ 1 (FIQ_26 / FIQ_32)
.long __und_invalid @ 2 (IRQ_26 / IRQ_32)
.long __und_svc @ 3 (SVC_26 / SVC_32)
.long __und_invalid @ 4
.long __und_invalid @ 5
.long __und_invalid @ 6
.long __und_invalid @ 7
.long __und_invalid @ 8
.long __und_invalid @ 9
.long __und_invalid @ a
.long __und_invalid @ b
.long __und_invalid @ c
.long __und_invalid @ d
.long __und_invalid @ e
.long __und_invalid @ f
.align 5
/*=============================================================================
* Address exception handler
*-----------------------------------------------------------------------------
* These aren't too critical.
* (they're not supposed to happen, and won't happen in 32-bit data mode).
*/
vector_addrexcptn:
b vector_addrexcptn
/*=============================================================================
* FIQ "NMI" handler
*-----------------------------------------------------------------------------
* Handle a FIQ using the SVC stack allowing FIQ act like NMI on x86
* systems. This must be the last vector stub, so lets place it in its own
* subsection.
*/
.subsection 2
vector_stub fiq, FIQ_MODE, 4
.long __fiq_usr @ 0 (USR_26 / USR_32)
.long __fiq_svc @ 1 (FIQ_26 / FIQ_32)
.long __fiq_svc @ 2 (IRQ_26 / IRQ_32)
.long __fiq_svc @ 3 (SVC_26 / SVC_32)
.long __fiq_svc @ 4
.long __fiq_svc @ 5
.long __fiq_svc @ 6
.long __fiq_abt @ 7
.long __fiq_svc @ 8
.long __fiq_svc @ 9
.long __fiq_svc @ a
.long __fiq_svc @ b
.long __fiq_svc @ c
.long __fiq_svc @ d
.long __fiq_svc @ e
.long __fiq_svc @ f
ARM: 8515/2: move .vectors and .stubs sections back into the kernel VMA Commit b9b32bf70f2f ("ARM: use linker magic for vectors and vector stubs") updated the linker script to emit the .vectors and .stubs sections into a VMA range that is zero based and disjoint from the normal static kernel region. The reason for that was that this way, the sections can be placed exactly 4 KB apart, while the payload of the .vectors section is only 32 bytes. Since the symbols that are part of the .stubs section are emitted into the kallsyms table, they appear with zero based addresses as well, e.g., 00001004 t vector_rst 00001020 t vector_irq 000010a0 t vector_dabt 00001120 t vector_pabt 000011a0 t vector_und 00001220 t vector_addrexcptn 00001240 t vector_fiq 00001240 T vector_fiq_offset As this confuses perf when it accesses the kallsyms tables, commit 7122c3e9154b ("scripts/link-vmlinux.sh: only filter kernel symbols for arm") implemented a somewhat ugly special case for ARM, where the value of CONFIG_PAGE_OFFSET is passed to scripts/kallsyms, and symbols whose addresses are below it are filtered out. Note that this special case only applies to CONFIG_XIP_KERNEL=n, not because the issue the patch addresses exists only in that case, but because finding a limit below which to apply the filtering is not entirely straightforward. Since the .vectors and .stubs sections contain position independent code that is never executed in place, we can emit it at its most likely runtime VMA (for more recent CPUs), which is 0xffff0000 for the vector table and 0xffff1000 for the stubs. Not only does this fix the perf issue with kallsyms, allowing us to drop the special case in scripts/kallsyms entirely, it also gives debuggers a more realistic view of the address space, and setting breakpoints or single stepping through code in the vector table or the stubs is more likely to work as expected on CPUs that use a high vector address. E.g., 00001240 A vector_fiq_offset ... c0c35000 T __init_begin c0c35000 T __vectors_start c0c35020 T __stubs_start c0c35020 T __vectors_end c0c352e0 T _sinittext c0c352e0 T __stubs_end ... ffff1004 t vector_rst ffff1020 t vector_irq ffff10a0 t vector_dabt ffff1120 t vector_pabt ffff11a0 t vector_und ffff1220 t vector_addrexcptn ffff1240 T vector_fiq (Note that vector_fiq_offset is now an absolute symbol, which kallsyms already ignores by default) The LMA footprint is identical with or without this change, only the VMAs are different: Before: Idx Name Size VMA LMA File off Algn ... 14 .notes 00000024 c0c34020 c0c34020 00a34020 2**2 CONTENTS, ALLOC, LOAD, READONLY, CODE 15 .vectors 00000020 00000000 c0c35000 00a40000 2**1 CONTENTS, ALLOC, LOAD, READONLY, CODE 16 .stubs 000002c0 00001000 c0c35020 00a41000 2**5 CONTENTS, ALLOC, LOAD, READONLY, CODE 17 .init.text 0006b1b8 c0c352e0 c0c352e0 00a452e0 2**5 CONTENTS, ALLOC, LOAD, READONLY, CODE ... After: Idx Name Size VMA LMA File off Algn ... 14 .notes 00000024 c0c34020 c0c34020 00a34020 2**2 CONTENTS, ALLOC, LOAD, READONLY, CODE 15 .vectors 00000020 ffff0000 c0c35000 00a40000 2**1 CONTENTS, ALLOC, LOAD, READONLY, CODE 16 .stubs 000002c0 ffff1000 c0c35020 00a41000 2**5 CONTENTS, ALLOC, LOAD, READONLY, CODE 17 .init.text 0006b1b8 c0c352e0 c0c352e0 00a452e0 2**5 CONTENTS, ALLOC, LOAD, READONLY, CODE ... Acked-by: Nicolas Pitre <nico@linaro.org> Acked-by: Chris Brandt <chris.brandt@renesas.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2016-02-10 18:41:08 +08:00
.globl vector_fiq
.section .vectors, "ax", %progbits
W(b) vector_rst
W(b) vector_und
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_swi )
THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_swi )
W(ldr) pc, .
W(b) vector_pabt
W(b) vector_dabt
W(b) vector_addrexcptn
W(b) vector_irq
W(b) vector_fiq
#ifdef CONFIG_HARDEN_BRANCH_HISTORY
.section .vectors.bhb.loop8, "ax", %progbits
W(b) vector_rst
W(b) vector_bhb_loop8_und
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_loop8_swi )
THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_loop8_swi )
W(ldr) pc, .
W(b) vector_bhb_loop8_pabt
W(b) vector_bhb_loop8_dabt
W(b) vector_addrexcptn
W(b) vector_bhb_loop8_irq
W(b) vector_bhb_loop8_fiq
.section .vectors.bhb.bpiall, "ax", %progbits
W(b) vector_rst
W(b) vector_bhb_bpiall_und
ARM: 9201/1: spectre-bhb: rely on linker to emit cross-section literal loads The assembler does not permit 'LDR PC, <sym>' when the symbol lives in a different section, which is why we have been relying on rather fragile open-coded arithmetic to load the address of the vector_swi routine into the program counter using a single LDR instruction in the SWI slot in the vector table. The literal was moved to a different section to in commit 19accfd373847 ("ARM: move vector stubs") to ensure that the vector stubs page does not need to be mapped readable for user space, which is the case for the vector page itself, as it carries the kuser helpers as well. So the cross-section literal load is open-coded, and this relies on the address of vector_swi to be at the very start of the vector stubs page, and we won't notice if we got it wrong until booting the kernel and see it break. Fortunately, it was guaranteed to break, so this was fragile but not problematic. Now that we have added two other variants of the vector table, we have 3 occurrences of the same trick, and so the size of our ISA/compiler/CPU validation space has tripled, in a way that may cause regressions to only be observed once booting the image in question on a CPU that exercises a particular vector table. So let's switch to true cross section references, and let the linker fix them up like it fixes up all the other cross section references in the vector page. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
2022-04-20 17:06:50 +08:00
ARM( .reloc ., R_ARM_LDR_PC_G0, .L__vector_bhb_bpiall_swi )
THUMB( .reloc ., R_ARM_THM_PC12, .L__vector_bhb_bpiall_swi )
W(ldr) pc, .
W(b) vector_bhb_bpiall_pabt
W(b) vector_bhb_bpiall_dabt
W(b) vector_addrexcptn
W(b) vector_bhb_bpiall_irq
W(b) vector_bhb_bpiall_fiq
#endif
.data
.align 2
.globl cr_alignment
cr_alignment:
.space 4