linux-stable-rt/arch/x86_64/lib/memcpy.S

/* Copyright 2002 Andi Kleen */
	
	#include <asm/cpufeature.h>		
/*
 * memcpy - Copy a memory block.
 *
 * Input:	
 * rdi destination
 * rsi source
 * rdx count
 * 
 * Output:
 * rax original destination
 */	

 	.globl __memcpy
	.globl memcpy
	.p2align 4
__memcpy:
memcpy:		
	pushq %rbx
	movq %rdi,%rax

	movl %edx,%ecx
	shrl $6,%ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx

	movq (%rsi),%r11
	movq 8(%rsi),%r8

	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)

	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10

	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)

	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8

	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)

	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10

	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)

	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

.Lhandle_tail:
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	popq %rbx
	ret
.Lfinal:

	/* Some CPUs run faster using the string copy instructions.
	   It is also a lot simpler. Use this when possible */

	.section .altinstructions,"a"
	.align 8
	.quad  memcpy
	.quad  memcpy_c
	.byte  X86_FEATURE_REP_GOOD
	.byte  .Lfinal-memcpy
	.byte  memcpy_c_end-memcpy_c
	.previous

	.section .altinstr_replacement,"ax"
 /* rdi	destination
  * rsi source
  * rdx count
  */
memcpy_c:
	movq %rdi,%rax
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx	
	rep 
	movsq 
	movl %edx,%ecx
	rep
	movsb
	ret
memcpy_c_end:
	.previous
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`/* Copyright 2002 Andi Kleen */`

			`#include <asm/cpufeature.h>`
			`/*`
			`* memcpy - Copy a memory block.`
			`*`
			`* Input:`
			`* rdi destination`
			`* rsi source`
			`* rdx count`
			`*`
			`* Output:`
			`* rax original destination`
			`*/`

			`.globl __memcpy`
			`.globl memcpy`
			`.p2align 4`
			`__memcpy:`
			`memcpy:`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00			`pushq %rbx`
			`movq %rdi,%rax`

			`movl %edx,%ecx`
			`shrl $6,%ecx`
			`jz .Lhandle_tail`

			`.p2align 4`
			`.Lloop_64:`
			`decl %ecx`

			`movq (%rsi),%r11`
			`movq 8(%rsi),%r8`

			`movq %r11,(%rdi)`
			`movq %r8,1*8(%rdi)`

			`movq 2*8(%rsi),%r9`
			`movq 3*8(%rsi),%r10`

			`movq %r9,2*8(%rdi)`
			`movq %r10,3*8(%rdi)`

			`movq 4*8(%rsi),%r11`
			`movq 5*8(%rsi),%r8`

			`movq %r11,4*8(%rdi)`
			`movq %r8,5*8(%rdi)`

			`movq 6*8(%rsi),%r9`
			`movq 7*8(%rsi),%r10`

			`movq %r9,6*8(%rdi)`
			`movq %r10,7*8(%rdi)`

			`leaq 64(%rsi),%rsi`
			`leaq 64(%rdi),%rdi`
			`jnz .Lloop_64`

			`.Lhandle_tail:`
			`movl %edx,%ecx`
			`andl $63,%ecx`
			`shrl $3,%ecx`
			`jz .Lhandle_7`
			`.p2align 4`
			`.Lloop_8:`
			`decl %ecx`
			`movq (%rsi),%r8`
			`movq %r8,(%rdi)`
			`leaq 8(%rdi),%rdi`
			`leaq 8(%rsi),%rsi`
			`jnz .Lloop_8`

			`.Lhandle_7:`
			`movl %edx,%ecx`
			`andl $7,%ecx`
			`jz .Lende`
			`.p2align 4`
			`.Lloop_1:`
			`movb (%rsi),%r8b`
			`movb %r8b,(%rdi)`
			`incq %rdi`
			`incq %rsi`
			`decl %ecx`
			`jnz .Lloop_1`

			`.Lende:`
			`popq %rbx`
			`ret`
			`.Lfinal:`

			`/* Some CPUs run faster using the string copy instructions.`
			`It is also a lot simpler. Use this when possible */`

			`.section .altinstructions,"a"`
			`.align 8`
			`.quad memcpy`
			`.quad memcpy_c`
			`.byte X86_FEATURE_REP_GOOD`
			`.byte .Lfinal-memcpy`
			`.byte memcpy_c_end-memcpy_c`
			`.previous`

			`.section .altinstr_replacement,"ax"`
			`/* rdi destination`
			`* rsi source`
			`* rdx count`
			`*/`
			`memcpy_c:`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-17 06:20:36 +08:00			`movq %rdi,%rax`
			`movl %edx,%ecx`
			`shrl $3,%ecx`
			`andl $7,%edx`
			`rep`
			`movsq`
			`movl %edx,%ecx`
			`rep`
			`movsb`
			`ret`
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-02-04 04:51:02 +08:00			`memcpy_c_end:`
			`.previous`