2
0
mirror of https://github.com/edk2-porting/linux-next.git synced 2024-12-26 22:24:09 +08:00
linux-next/arch/x86/lib/memcpy_64.S

192 lines
3.3 KiB
ArmAsm
Raw Normal View History

/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* rax original destination
*/
/*
* memcpy_c() - fast string ops (REP MOVSQ) based variant.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax
movl %edx, %ecx
shrl $3, %ecx
andl $7, %edx
rep movsq
movl %edx, %ecx
rep movsb
ret
.Lmemcpy_e:
.previous
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
movq %rdi, %rax
/*
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
* Use 32bit CMP here to avoid long NOP padding.
*/
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
cmp $0x20, %edx
jb .Lhandle_tail
/*
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
* We check whether memory false dependece could occur,
* then jump to corresponding copy mode.
*/
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
cmp %dil, %sil
jl .Lcopy_backward
subl $0x20, %edx
.Lcopy_forward_loop:
subq $0x20, %rdx
/*
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
* Move in blocks of 4x8 bytes:
*/
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq 2*8(%rsi), %r10
movq 3*8(%rsi), %r11
leaq 4*8(%rsi), %rsi
movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, 2*8(%rdi)
movq %r11, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae .Lcopy_forward_loop
addq $0x20, %rdx
jmp .Lhandle_tail
.Lcopy_backward:
/*
* Calculate copy position to tail.
*/
addq %rdx, %rsi
addq %rdx, %rdi
subq $0x20, %rdx
/*
* At most 3 ALU operations in one cycle,
* so append NOPS in the same 16bytes trunk.
*/
.p2align 4
.Lcopy_backward_loop:
subq $0x20, %rdx
movq -1*8(%rsi), %r8
movq -2*8(%rsi), %r9
movq -3*8(%rsi), %r10
movq -4*8(%rsi), %r11
leaq -4*8(%rsi), %rsi
movq %r8, -1*8(%rdi)
movq %r9, -2*8(%rdi)
movq %r10, -3*8(%rdi)
movq %r11, -4*8(%rdi)
leaq -4*8(%rdi), %rdi
jae .Lcopy_backward_loop
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
/*
* Calculate copy position to head.
*/
addq $0x20, %rdx
subq %rdx, %rsi
subq %rdx, %rdi
.Lhandle_tail:
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
cmpq $16, %rdx
jb .Lless_16bytes
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
/*
* Move data from 16 bytes to 31 bytes.
*/
movq 0*8(%rsi), %r8
movq 1*8(%rsi), %r9
movq -2*8(%rsi, %rdx), %r10
movq -1*8(%rsi, %rdx), %r11
movq %r8, 0*8(%rdi)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
retq
.p2align 4
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
.Lless_16bytes:
cmpq $8, %rdx
jb .Lless_8bytes
/*
* Move data from 8 bytes to 15 bytes.
*/
movq 0*8(%rsi), %r8
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
retq
.p2align 4
.Lless_8bytes:
cmpq $4, %rdx
jb .Lless_3bytes
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
/*
* Move data from 4 bytes to 7 bytes.
*/
movl (%rsi), %ecx
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
retq
.p2align 4
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
.Lless_3bytes:
cmpl $0, %edx
je .Lend
/*
* Move data from 1 bytes to 3 bytes.
*/
.Lloop_1:
movb (%rsi), %r8b
movb %r8b, (%rdi)
incq %rdi
incq %rsi
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
decl %edx
jnz .Lloop_1
.Lend:
x86, mem: Optimize memcpy by avoiding memory false dependece All read operations after allocation stage can run speculatively, all write operation will run in program order, and if addresses are different read may run before older write operation, otherwise wait until write commit. However CPU don't check each address bit, so read could fail to recognize different address even they are in different page.For example if rsi is 0xf004, rdi is 0xe008, in following operation there will generate big performance latency. 1. movq (%rsi), %rax 2. movq %rax, (%rdi) 3. movq 8(%rsi), %rax 4. movq %rax, 8(%rdi) If %rsi and rdi were in really the same meory page, there are TRUE read-after-write dependence because instruction 2 write 0x008 and instruction 3 read 0x00c, the two address are overlap partially. Actually there are in different page and no any issues, but without checking each address bit CPU could think they are in the same page, and instruction 3 have to wait for instruction 2 to write data into cache from write buffer, then load data from cache, the cost time read spent is equal to mfence instruction. We may avoid it by tuning operation sequence as follow. 1. movq 8(%rsi), %rax 2. movq %rax, 8(%rdi) 3. movq (%rsi), %rax 4. movq %rax, (%rdi) Instruction 3 read 0x004, instruction 2 write address 0x010, no any dependence. At last on Core2 we gain 1.83x speedup compared with original instruction sequence. In this patch we first handle small size(less 20bytes), then jump to different copy mode. Based on our micro-benchmark small bytes from 1 to 127 bytes, we got up to 2X improvement, and up to 1.5X improvement for 1024 bytes on Corei7. (We use our micro-benchmark, and will do further test according to your requirment) Signed-off-by: Ma Ling <ling.ma@intel.com> LKML-Reference: <1277753065-18610-1-git-send-email-ling.ma@intel.com> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-06-29 03:24:25 +08:00
retq
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
* Some CPUs run faster using the string copy instructions.
* It is also a lot simpler. Use this when possible:
*/
.section .altinstructions, "a"
.align 8
.quad memcpy
.quad .Lmemcpy_c
.word X86_FEATURE_REP_GOOD
/*
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
.byte .Lmemcpy_e - .Lmemcpy_c
.byte .Lmemcpy_e - .Lmemcpy_c
.previous