linux-next/arch/x86/lib/memmove_64.S

/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

#undef memmove

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Registers written: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 and the
 * flags. Apart from the return address consumed by retq, the stack is
 * not touched.
 *
 * Strategy:
 *  - count < 32: handled entirely by the tail code at label 1.
 *  - Otherwise the copy direction is chosen first: forward when
 *    src >= dest or when src + count <= dest, backward when the source
 *    starts below dest and runs into it.
 *  - In either direction, copies of at least 680 bytes whose src and dest
 *    addresses have the same low byte use rep movsq; everything else uses
 *    a loop that moves 32 bytes per iteration through r8-r11.
 *  - Whatever the 32-byte loops leave over (fewer than 32 bytes) is
 *    finished by the tail code at label 1, which performs all of its
 *    loads before any of its stores.
 *
 * NOTE(review): only the backward rep movsq path touches the direction
 * flag (std ... cld). The forward rep movsq/rep movsb paths do not issue
 * cld, so they presumably rely on DF being clear on entry - confirm
 * against the kernel's calling convention.
 */
ENTRY(memmove)
	CFI_STARTPROC

	/*
	 * Set the return value (rax = dest) up front; rax is not written
	 * again. Copies shorter than 32 bytes skip the direction logic and
	 * go straight to the tail code.
	 */
	mov %rdi, %rax
	cmp $0x20, %rdx
	jb	1f

	/*
	 * Decide forward/backward copy mode:
	 *  - src >= dest: copy forward.
	 *  - src < dest and r8 = src + count > dest: copy backward (label 2).
	 *  - src < dest and src + count <= dest: fall through, copy forward.
	 *
	 * NOTE(review): jge/jg are signed condition codes applied to
	 * addresses. They order addresses the same way as jae/ja as long as
	 * both buffers lie in the same half of the address space.
	 */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

	/*
	 * Everything from .Lmemmove_begin_forward up to .Lmemmove_end_forward
	 * is the "original" code range named by the .altinstructions record
	 * at the end of this function.
	 */
.Lmemmove_begin_forward:
	/*
	 * The movsq instruction has a high startup latency,
	 * so sizes below 680 bytes are copied through general registers.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * The movsq instruction is only good for the aligned case: it is
	 * used only when the low bytes of dest and src are equal.
	 */

	cmpb %dil, %sil
	je 4f
3:
	/*
	 * Bias count by -32. Inside the loop rdx is then always
	 * "bytes not yet copied - 32", so the borrow produced by the sub at
	 * the top of the loop means that fewer than 32 bytes will remain
	 * once the current chunk has been copied.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop iteration.
	 * The flags set by the sub are consumed by the jae at the bottom;
	 * the movq and leaq instructions in between leave them unchanged.
	 * All four loads are issued before the four stores.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	/*
	 * Undo the bias: rdx is now the number of bytes still to copy
	 * (0..31) and rsi/rdi point at them. Finish in the tail code.
	 */
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 *
	 * rcx = count / 8 quadwords for rep movsq. Before the copy starts,
	 * r11 is loaded with the last 8 bytes of the source and r10 with the
	 * address of the last 8 bytes of the destination; storing r11 there
	 * afterwards covers the count % 8 bytes that rep movsq does not move.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 *
	 * r11 keeps the first 8 bytes of the source and r10 the start of the
	 * destination. rsi/rdi are moved to the last quadword of each buffer
	 * and the direction flag is set so that rep movsq walks downwards;
	 * cld restores the flag afterwards. The final store of r11 covers
	 * the count % 8 bytes at the head that the quadword copy does not
	 * reach.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 * Same selection as in the forward case: rep movsq (label 7) for
	 * counts of at least 680 bytes when the low bytes of src and dest
	 * are equal, otherwise the 32-byte register loop below.
	 */
	.p2align 4
2:
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
6:
	/*
	 * Calculate copy position to tail: point rsi/rdi just past the end
	 * of each buffer, then bias count by -32 exactly as in the forward
	 * loop.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop iteration.
	 * The borrow from the subq is tested by the jae at the bottom; the
	 * movq and leaq instructions in between do not change the flags.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head: undo the bias so that rdx is the
	 * number of bytes left (0..31), then step rsi/rdi back by that amount
	 * so they point at the start of the buffers, where the uncopied
	 * bytes are.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
	/*
	 * Tail code. On entry rdx < 32 and rsi/rdi point at the first of the
	 * rdx bytes still to copy. Each size class below copies from the
	 * front and from the back of the range with accesses that may
	 * overlap each other, and performs all loads before the first store.
	 */
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes:
	 * the first two and the last two quadwords.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes:
	 * the first and the last quadword.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes:
	 * the first and the last doubleword.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes:
	 * the first and the last word.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte. A count of 0 skips this and copies nothing.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
	/*
	 * Common exit; rax still holds the original dest.
	 */
13:
	retq
	CFI_ENDPROC

	/*
	 * Replacement sequence for the forward copy on CPUs with
	 * X86_FEATURE_ERMS: copy all count bytes forward with rep movsb and
	 * return directly (rax was already set at function entry).
	 */
	.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
	/* Forward moving data. */
	movq %rdx, %rcx
	rep movsb
	retq
.Lmemmove_end_forward_efs:
	.previous

	/*
	 * Alternatives record tying the two sequences together: original
	 * start, replacement start, feature bit, original length and
	 * replacement length. The backward copy has no such replacement.
	 * NOTE(review): the patching itself is done by the alternatives
	 * machinery behind altinstruction_entry, which is not visible in
	 * this file.
	 */
	.section .altinstructions,"a"
	altinstruction_entry .Lmemmove_begin_forward,		\
		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
	.previous
ENDPROC(memmove)
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00			`/*`
			`* Normally compiler builtins are used, but sometimes the compiler calls out`
			`* of line code. Based on asm-i386/string.h.`
			`*`
			`* This assembly file is re-written from memmove_64.c file.`
			`* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>`
			`*/`
			`#define _STRING_C`
			`#include <linux/linkage.h>`
			`#include <asm/dwarf2.h>`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00			`#include <asm/cpufeature.h>`
x86: Make alternative instruction pointers relative This saves a few bytes on x86-64 and means that future patches can apply alternatives to unrelocated code. Signed-off-by: Andy Lutomirski <luto@mit.edu> Link: http://lkml.kernel.org/r/ff64a6b9a1a3860ca4a7b8b6dc7b4754f9491cd7.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-07-13 21:24:10 +08:00			`#include <asm/alternative-asm.h>`
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00
			`#undef memmove`

			`/*`
			`* Implement memmove(). This can handle overlap between src and dst.`
			`*`
			`* Input:`
			`* rdi: dest`
			`* rsi: src`
			`* rdx: count`
			`*`
			`* Output:`
			`* rax: dest`
			`*/`
			`ENTRY(memmove)`
			`CFI_STARTPROC`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00			`/* Handle more 32bytes in loop */`
			`mov %rdi, %rax`
			`cmp $0x20, %rdx`
			`jb 1f`

			`/* Decide forward/backward copy mode */`
			`cmp %rdi, %rsi`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00			`jge .Lmemmove_begin_forward`
			`mov %rsi, %r8`
			`add %rdx, %r8`
			`cmp %rdi, %r8`
			`jg 2f`
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00			`.Lmemmove_begin_forward:`
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00			`/*`
			`* movsq instruction have many startup latency`
			`* so we handle small size by general register.`
			`*/`
			`cmp $680, %rdx`
			`jb 3f`
			`/*`
			`* movsq instruction is only good for aligned case.`
			`*/`

			`cmpb %dil, %sil`
			`je 4f`
			`3:`
			`sub $0x20, %rdx`
			`/*`
			`* We gobble 32byts forward in each loop.`
			`*/`
			`5:`
			`sub $0x20, %rdx`
			`movq 0*8(%rsi), %r11`
			`movq 1*8(%rsi), %r10`
			`movq 2*8(%rsi), %r9`
			`movq 3*8(%rsi), %r8`
			`leaq 4*8(%rsi), %rsi`

			`movq %r11, 0*8(%rdi)`
			`movq %r10, 1*8(%rdi)`
			`movq %r9, 2*8(%rdi)`
			`movq %r8, 3*8(%rdi)`
			`leaq 4*8(%rdi), %rdi`
			`jae 5b`
			`addq $0x20, %rdx`
			`jmp 1f`
			`/*`
			`* Handle data forward by movsq.`
			`*/`
			`.p2align 4`
			`4:`
			`movq %rdx, %rcx`
			`movq -8(%rsi, %rdx), %r11`
			`lea -8(%rdi, %rdx), %r10`
			`shrq $3, %rcx`
			`rep movsq`
			`movq %r11, (%r10)`
			`jmp 13f`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00			`.Lmemmove_end_forward:`

x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00			`/*`
			`* Handle data backward by movsq.`
			`*/`
			`.p2align 4`
			`7:`
			`movq %rdx, %rcx`
			`movq (%rsi), %r11`
			`movq %rdi, %r10`
			`leaq -8(%rsi, %rdx), %rsi`
			`leaq -8(%rdi, %rdx), %rdi`
			`shrq $3, %rcx`
			`std`
			`rep movsq`
			`cld`
			`movq %r11, (%r10)`
			`jmp 13f`

			`/*`
			`* Start to prepare for backward copy.`
			`*/`
			`.p2align 4`
			`2:`
			`cmp $680, %rdx`
			`jb 6f`
			`cmp %dil, %sil`
			`je 7b`
			`6:`
			`/*`
			`* Calculate copy position to tail.`
			`*/`
			`addq %rdx, %rsi`
			`addq %rdx, %rdi`
			`subq $0x20, %rdx`
			`/*`
			`* We gobble 32byts backward in each loop.`
			`*/`
			`8:`
			`subq $0x20, %rdx`
			`movq -1*8(%rsi), %r11`
			`movq -2*8(%rsi), %r10`
			`movq -3*8(%rsi), %r9`
			`movq -4*8(%rsi), %r8`
			`leaq -4*8(%rsi), %rsi`

			`movq %r11, -1*8(%rdi)`
			`movq %r10, -2*8(%rdi)`
			`movq %r9, -3*8(%rdi)`
			`movq %r8, -4*8(%rdi)`
			`leaq -4*8(%rdi), %rdi`
			`jae 8b`
			`/*`
			`* Calculate copy position to head.`
			`*/`
			`addq $0x20, %rdx`
			`subq %rdx, %rsi`
			`subq %rdx, %rdi`
			`1:`
			`cmpq $16, %rdx`
			`jb 9f`
			`/*`
			`* Move data from 16 bytes to 31 bytes.`
			`*/`
			`movq 0*8(%rsi), %r11`
			`movq 1*8(%rsi), %r10`
			`movq -2*8(%rsi, %rdx), %r9`
			`movq -1*8(%rsi, %rdx), %r8`
			`movq %r11, 0*8(%rdi)`
			`movq %r10, 1*8(%rdi)`
			`movq %r9, -2*8(%rdi, %rdx)`
			`movq %r8, -1*8(%rdi, %rdx)`
			`jmp 13f`
			`.p2align 4`
			`9:`
			`cmpq $8, %rdx`
			`jb 10f`
			`/*`
			`* Move data from 8 bytes to 15 bytes.`
			`*/`
			`movq 0*8(%rsi), %r11`
			`movq -1*8(%rsi, %rdx), %r10`
			`movq %r11, 0*8(%rdi)`
			`movq %r10, -1*8(%rdi, %rdx)`
			`jmp 13f`
			`10:`
			`cmpq $4, %rdx`
			`jb 11f`
			`/*`
			`* Move data from 4 bytes to 7 bytes.`
			`*/`
			`movl (%rsi), %r11d`
			`movl -4(%rsi, %rdx), %r10d`
			`movl %r11d, (%rdi)`
			`movl %r10d, -4(%rdi, %rdx)`
			`jmp 13f`
			`11:`
			`cmp $2, %rdx`
			`jb 12f`
			`/*`
			`* Move data from 2 bytes to 3 bytes.`
			`*/`
			`movw (%rsi), %r11w`
			`movw -2(%rsi, %rdx), %r10w`
			`movw %r11w, (%rdi)`
			`movw %r10w, -2(%rdi, %rdx)`
			`jmp 13f`
			`12:`
			`cmp $1, %rdx`
			`jb 13f`
			`/*`
			`* Move data for 1 byte.`
			`*/`
			`movb (%rsi), %r11b`
			`movb %r11b, (%rdi)`
			`13:`
			`retq`
			`CFI_ENDPROC`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00
			`.section .altinstr_replacement,"ax"`
			`.Lmemmove_begin_forward_efs:`
			`/* Forward moving data. */`
			`movq %rdx, %rcx`
			`rep movsb`
			`retq`
			`.Lmemmove_end_forward_efs:`
			`.previous`

			`.section .altinstructions,"a"`
x86: Make alternative instruction pointers relative This saves a few bytes on x86-64 and means that future patches can apply alternatives to unrelocated code. Signed-off-by: Andy Lutomirski <luto@mit.edu> Link: http://lkml.kernel.org/r/ff64a6b9a1a3860ca4a7b8b6dc7b4754f9491cd7.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-07-13 21:24:10 +08:00			`altinstruction_entry .Lmemmove_begin_forward, \`
			`.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \`
			`.Lmemmove_end_forward-.Lmemmove_begin_forward, \`
			`.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs`
x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-05-18 06:29:17 +08:00			`.previous`
x86-64, mem: Convert memmove() to assembly file and fix return value bug memmove_64.c only implements memmove() function which is completely written in inline assembly code. Therefore it doesn't make sense to keep the assembly code in .c file. Currently memmove() doesn't store return value to rax. This may cause issue if caller uses the return value. The patch fixes this issue. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> LKML-Reference: <1295314755-6625-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> 2011-01-18 09:39:15 +08:00			`ENDPROC(memmove)`