arch/arm/arch_string.S

/*
** Copyright 2001, Travis Geiselbrecht. All rights reserved.
** Distributed under the terms of the NewOS License.
*/

#include <asm_defs.h>

//#warning ARM: optimize memcpy
#if 1

/* that should be enough for now */

.align 4
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * In:    r0 = dst, r1 = src, r2 = len (in bytes)
 * Out:   r0 = dst (the value passed in)
 * Uses:  r3, r12 and the condition flags as scratch. r4, r5 and lr are
 *        saved on entry and r6, r7 around the 32 byte loop; all of them
 *        are restored before returning.
 *
 * Strategy:
 *   - len == 0 or dst == src returns immediately.
 *   - if dst lies inside [src, src + len) the copy runs backwards, one byte
 *     at a time, so the source is never overwritten before it is read.
 *   - otherwise the copy runs forwards: bytewise for short or dissimilarly
 *     aligned buffers, else dst is brought to a 16 byte boundary and the
 *     data is moved 32 bytes, then 4 bytes, then 1 byte at a time.
 *
 * Pointers and lengths are unsigned quantities, so every comparison on them
 * uses the unsigned condition codes (hi/lo/hs). The signed ones (gt/lt/ge)
 * give the wrong answer for lengths of 2GB or more and for buffers that
 * straddle address 0x80000000.
 */
FUNCTION(memcpy):
	// check for zero length copy or the same pointer
	cmp		r2, #0
	cmpne	r1, r0
	bxeq	lr

	// save a few registers for use and the return code (input dst)
	stmfd	sp!, {r0, r4, r5, lr}

	// check for forwards overlap (dst > src, distance < len).
	// r3 = dst - src; the second compare only runs if dst is above src,
	// otherwise the flags from the subtract already say "not higher".
	subs	r3, r0, r1
	cmphi	r2, r3
	bhi		.L_forwardoverlap

	// check for a short copy len.
	// 20 bytes is enough so that if a 16 byte alignment needs to happen there is at least a
	//   wordwise copy worth of work to be done.
	cmp		r2, #(16+4)
	blo		.L_bytewise

	// see if they are similarly aligned on 4 byte boundaries
	eor		r3, r0, r1
	tst		r3, #3
	bne		.L_bytewise		// dissimilarly aligned, nothing we can do (for now)

	// check for 16 byte alignment on dst.
	// this will also catch src being not 4 byte aligned, since it is similarly 4 byte
	//   aligned with dst at this point.
	tst		r0, #15
	bne		.L_not16bytealigned

	// check to see if we have at least 32 bytes of data to copy.
	// if not, just revert to wordwise copy
	cmp		r2, #32
	blo		.L_wordwise

.L_bigcopy:
	// copy 32 bytes at a time. src & dst need to be at least 4 byte aligned,
	// and we need at least 32 bytes remaining to copy

	// save r6-r7 for use in the big copy
	stmfd	sp!, {r6-r7}

	// bias the len by -32 so the subtract inside the loop doubles as the
	// "at least 32 more bytes" test and no separate compare is needed
	sub		r2, r2, #32

.L_bigcopy_loop:
	ldmia	r1!, {r4, r5, r6, r7}
	stmia	r0!, {r4, r5, r6, r7}
	ldmia	r1!, {r4, r5, r6, r7}
	subs	r2, r2, #32
	stmia	r0!, {r4, r5, r6, r7}
	bhs		.L_bigcopy_loop		// no borrow: another full 32 bytes remain

	// restore r6-r7
	ldmfd	sp!, {r6-r7}

	// undo the bias; r2 is now the true remaining len (0..31)
	adds	r2, r2, #32
	beq		.L_done

	// less than 4 bytes left?
	cmp		r2, #4
	blo		.L_bytewise

.L_wordwise:
	// copy 4 bytes at a time.
	// src & dst are guaranteed to be word aligned, and at least 4 bytes are left to copy.
	// same biasing trick as the big copy, with 4 instead of 32
	subs	r2, r2, #4

.L_wordwise_loop:
	ldr		r3, [r1], #4
	subs	r2, r2, #4
	str		r3, [r0], #4
	bhs		.L_wordwise_loop	// no borrow: another full word remains

	// correct the remaining len and test for completion
	adds	r2, r2, #4
	beq		.L_done

.L_bytewise:
	// simple bytewise copy; r2 is at least 1 whenever we get here
	ldrb	r3, [r1], #1
	subs	r2, r2, #1
	strb	r3, [r0], #1
	bhi		.L_bytewise			// loop while the remaining len is non-zero

.L_done:
	// load dst for return and restore r4,r5
//#if ARM_ARCH_LEVEL >= 5
//	ldmfd	sp!, {r0, r4, r5, pc}
//#else
	ldmfd	sp!, {r0, r4, r5, lr}
	bx		lr
//#endif

.L_not16bytealigned:
	// dst is not 16 byte aligned, so we will copy up to 15 bytes to get it aligned.
	// src is guaranteed to be similarly word aligned with dst.

	// set the condition flags based on the alignment.
	// the top nibble of r12 becomes (16 - (dst & 15)), the number of bytes
	// needed to reach the boundary, and lands in the flags as
	//   N = 8 bytes, Z = 4 bytes, C = 2 bytes, V = 1 byte
	lsl		r12, r0, #28
	rsb		r12, r12, #0
	msr		CPSR_f, r12				// move into NZCV fields in CPSR

	// move as many bytes as necessary to get the dst aligned.
	// smallest unit first, so each following access is naturally aligned.
	ldrvsb	r3, [r1], #1			// V set
	ldrcsh	r4, [r1], #2			// C set
	ldreq	r5, [r1], #4			// Z set

	strvsb	r3, [r0], #1
	strcsh	r4, [r0], #2
	streq	r5, [r0], #4

	ldmmiia	r1!, {r3-r4}			// N set
	stmmiia	r0!, {r3-r4}

	// fix the remaining len
	sub		r2, r2, r12, lsr #28

	// test to see what we should do now.
	// len was at least 20 and at most 15 bytes were consumed, so at least
	// one full word is still left for the wordwise copy.
	cmp		r2, #32
	bhs		.L_bigcopy
	b		.L_wordwise

	// src and dest overlap 'forwards' or dst > src
.L_forwardoverlap:

	// do a bytewise reverse copy for now.
	// point both registers one past the end of their buffers
	add		r1, r1, r2
	add		r0, r0, r2

.L_bytewisereverse:
	// simple bytewise reverse copy.
	// pre-decrement addressing: the first byte moved is the last byte of the
	// buffer (index len - 1) and the final one is index 0.
	ldrb	r3, [r1, #-1]!
	subs	r2, r2, #1
	strb	r3, [r0, #-1]!
	bhi		.L_bytewisereverse

	b		.L_done

FUNCTION_END(memcpy)
#endif