/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
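
/*
 * Note on the interface (an assumption from the usual kernel prototype,
 * void copy_page(void *to, void *from)): per the C calling convention,
 * %rdi holds the destination and %rsi the source; both point to one
 * 4096-byte page.
 */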

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
/*
 * Some CPUs run faster using the string copy instructions (sane microcode).
 * It is also a lot simpler, so use this when possible. But don't use a
 * streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. The
 * prefetch distance could be varied based on SMP vs. UP.
 */
	ALIGN
ENTRY(copy_page)
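	/*
	 * The REP; MOVSQ copy below is the default. On CPUs with
	 * X86_FEATURE_REP_GOOD, alternatives patching NOPs out the JMP
	 * so execution falls through to the string copy; CPUs without
	 * the feature take the JMP to the unrolled copy_page_regs.
	 */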
	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
	movl	$4096/8, %ecx	/* 512 quadwords, i.e. one 4K page */
	rep	movsq
	ret
ENDPROC(copy_page)
ENTRY(copy_page_regs)
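	/*
	 * Unrolled copy for CPUs without X86_FEATURE_REP_GOOD. %rbx and
	 * %r12 are callee-saved, so spill them to the stack first.
	 */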
	subq	$2*8, %rsp
	movq	%rbx, (%rsp)
	movq	%r12, 1*8(%rsp)
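	/*
	 * Main loop: copy eight quadwords (one 64-byte cacheline) per
	 * iteration, prefetching five cachelines ahead. Run it for all
	 * but the last five cachelines of the page, so the prefetch
	 * never reaches past the end of the source page.
	 */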
	movl	$(4096/64)-5, %ecx
	.p2align 4
.Loop64:
	dec	%rcx
	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	prefetcht0 5*64(%rsi)

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	.Loop64
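
	/*
	 * Tail loop: copy the remaining five cachelines, this time
	 * without prefetching.
	 */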
	movl	$5, %ecx
	.p2align 4
.Loop2:
	decl	%ecx

	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rdi), %rdi
	leaq	64(%rsi), %rsi

	jnz	.Loop2
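
	/* Restore the callee-saved registers and release the stack frame. */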
	movq	(%rsp), %rbx
	movq	1*8(%rsp), %r12
	addq	$2*8, %rsp
	ret
ENDPROC(copy_page_regs)