;==============================================================================
; File:        memcpy.s
; Description: XScale-optimised implementations of <string.h> memory- and
;              string-related functions
;
; Written by Adrian Lees as part of the Aemulor project and released,
; unsupported into the Public Domain.
;==============================================================================

		AREA	|ARM$$code|,CODE,READONLY

		EXPORT	memcpy
		EXPORT	memmove
		EXPORT	memset
		EXPORT	strcpy
		EXPORT	strlen


; Copy/move a block of memory (memcpy and memmove share one entry point)
;
; entry	a1 -> destination
;	a2 -> source
;	a3 = size in bytes
; exit	a1 -> destination (a1 is never written; a4 is the working pointer)
;
; The copy direction is chosen from the pointer order so that overlapping
; regions are handled:
;   destination <  source : copy forwards, ascending addresses ('down')
;   destination >= source : copy backwards, starting at the end of both
;
; a2-a4, ip and lr are used as scratch.  lr, v1 and v2 are pushed here and
; popped on every exit path; paths that need v3 push and pop it themselves.

memcpy
memmove		STR	lr,[sp,#-4]!	;push lr, v1, v2 (one word each)
		STR	v1,[sp,#-4]!
		STR	v2,[sp,#-4]!

		;are we copying/moving up/down in memory?

		CMP	a1,a2
		BLO	down		;dest below source => forward copy

		ADD	a2,a2,a3	;a2 -> one past the end of the source
		ADD	a4,a1,a3	;a4 -> one past the end of the destination

		;are the source and destination both word-aligned?

		ANDS	ip,a2,#3	;ip = offset of source end within its word
		TSTEQ	a4,#3		;dest end only tested if source end aligned
		BNE	up_unaligned	;(ip is still the source offset there)

up_aligned	;copying up in memory, we need to run backwards
		;  in case the two regions overlap (allowed by memmove)
		;
		;a2/a4 are word-aligned end pointers, a3 = bytes remaining.
		;Blocks of 32 are only used when at least 128 bytes remain,
		;blocks of 16 when at least 32 remain.

		CMP	a3,#32
		BLO	ucp4
		CMP	a3,#128
		BLO	ucp16lp

		;32 bytes per pass.  SUBS leaves HS when a full block was
		;available; every load/store is conditional on that, so the
		;final (failing) pass through the loop body copies nothing.

ucp32lp		SUBS	a3,a3,#32
		PLD	[a2,#-36]	;preload hint below the read pointer
		LDRHS	v1,[a2,#-4]!
		LDRHS	v2,[a2,#-4]!
		LDRHS	ip,[a2,#-4]!
		LDRHS	lr,[a2,#-4]!
		STRHS	v1,[a4,#-4]!
		STRHS	v2,[a4,#-4]!
		STRHS	ip,[a4,#-4]!
		STRHS	lr,[a4,#-4]!
		LDRHS	v1,[a2,#-4]!
		LDRHS	v2,[a2,#-4]!
		LDRHS	ip,[a2,#-4]!
		LDRHS	lr,[a2,#-4]!
		STRHS	v1,[a4,#-4]!
		STRHS	v2,[a4,#-4]!
		STRHS	ip,[a4,#-4]!
		STRHS	lr,[a4,#-4]!
		BHI	ucp32lp		;loop while the count is still non-zero
		ADDLO	a3,a3,#32	;went negative: undo the last subtraction

		;16 bytes per pass, same scheme as above

ucp16lp		SUBS	a3,a3,#16
		LDRHS	v1,[a2,#-4]!
		LDRHS	v2,[a2,#-4]!
		LDRHS	ip,[a2,#-4]!
		LDRHS	lr,[a2,#-4]!
		STRHS	v1,[a4,#-4]!
		STRHS	v2,[a4,#-4]!
		STRHS	ip,[a4,#-4]!
		STRHS	lr,[a4,#-4]!
		BHI	ucp16lp
		ADDLO	a3,a3,#16

		;one word per pass; the load for the next pass is issued
		;before the branch, each step conditional on the count

ucp4		SUBS	a3,a3,#4
		LDRHS	ip,[a2,#-4]!
ucp4lp		STRHS	ip,[a4,#-4]!
		SUBHSS	a3,a3,#4
		LDRHS	ip,[a2,#-4]!
		BHS	ucp4lp
		ADD	a3,a3,#4	;a3 = 0..3 bytes left

		;one byte per pass.  Also entered from up_unaligned when the
		;whole request is no longer than the alignment run.

ucp1		SUBS	a3,a3,#1
		LDRHSB	ip,[a2,#-1]!
ucp1lp		STRHSB	ip,[a4,#-1]!
		SUBHSS	a3,a3,#1
		LDRHSB	ip,[a2,#-1]!
		BHS	ucp1lp

		LDR	v2,[sp],#4	;pop in reverse order of the pushes
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4	;return: saved lr is loaded into pc

up_unaligned	;source and/or destination unaligned
		;
		;copying up in memory, we need to run backwards
		;align source to the word boundary below its end pointer
		;
		;ip = nof bytes til aligned (0..3; 0 when only the
		;     destination is unaligned)
		;
		;The three LDRB/STRB pairs below form a jump table: each pair
		;is two instructions (8 bytes), and pc reads as the address of
		;the ADDHI plus 8, i.e. the first LDRB.  Adding (3-ip)*8 skips
		;3-ip pairs, so exactly ip bytes are copied.  With ip = 0 all
		;three pairs are skipped.

		RSB	v1,ip,#3	;jump table index
		SUBS	a3,a3,ip	;check length not equalled/exceeded
		ADDLS	a3,a3,ip	;restore
		ADDHI	pc,pc,v1,LSL #3
		B	ucp1		;size <= ip: plain byte copy finishes it
		LDRB	v1,[a2,#-1]!
		STRB	v1,[a4,#-1]!
		LDRB	v1,[a2,#-1]!
		STRB	v1,[a4,#-1]!
		LDRB	v1,[a2,#-1]!
		STRB	v1,[a4,#-1]!

		;source is now word-aligned; is the destination as well?

		ANDS	ip,a4,#3
		BEQ	up_aligned

		;No: ip (1..3) bytes of the destination word containing a4-1
		;are still to be written.  Round a4 down to that word and
		;fetch it so its upper bytes can be merged back in.

		LDR	lr,[a4,-ip]!	;=> use post-dec when storing
		STR	v3,[sp,#-4]!	;free another register

		CMP	ip,#2		;ip = 1 falls through to ucp_sh1
		BHI	ucp_sh3
		BEQ	ucp_sh2

		;Backward copy, source word-aligned, destination pointer one
		;byte above a word boundary.
		;
		;entry	a2 -> aligned source (end pointer, pre-decremented)
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the three bytes already decided for the next output
		;word in bits 8-31; the word is completed with the top byte of
		;the next source word (LSR #24), and the other three source
		;bytes (LSL #8) become the new lr.  The byte/bit pairing used
		;throughout matches little-endian memory order.

ucp_sh1		SUBS	a3,a3,#32
		BIC	lr,lr,#&FF	;keep upper 3 dest bytes, clear bits 0-7
		BLO	ucp32sh1fin
ucp32sh1lp	PLD	[a2,#-36]
		LDR	v1,[a2,#-4]!	;first group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #24
		MOV	v1,v1,LSL #8
		ORR	v1,v1,v2,LSR #24
		MOV	v2,v2,LSL #8
		ORR	v2,v2,v3,LSR #24
		MOV	v3,v3,LSL #8
		ORR	v3,v3,ip,LSR #24
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #8	;carry 3 bytes into the next word
		LDR	v1,[a2,#-4]!	;second group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #24
		MOV	v1,v1,LSL #8
		ORR	v1,v1,v2,LSR #24
		MOV	v2,v2,LSL #8
		ORR	v2,v2,v3,LSR #24
		MOV	v3,v3,LSL #8
		ORR	v3,v3,ip,LSR #24
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #8
		SUBS	a3,a3,#32
		BHS	ucp32sh1lp
ucp32sh1fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2,#-4]!
		BLO	ucp4sh1fin
ucp4sh1lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSR #24
		STR	lr,[a4],#-4
		MOV	lr,v1,LSL #8
		LDRHS	v1,[a2,#-4]!	;fetch ahead only if another word is due
		BHS	ucp4sh1lp
ucp4sh1fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

ucp1sh1		;we arrive here with 3 bytes still in lr
		;  and a further 0-3 bytes to copy
		;
		;The missing low byte of the last word comes from the source
		;if any bytes remain, otherwise from the destination itself so
		;that the byte below the copied region is written back as is.

		CMP	a3,#1
		LDRHSB	v1,[a2,#-1]!	;a3 >= 1: next source byte
		LDRLOB	v1,[a4]		;a3 =  0: existing destination byte
		LDRHIB	v2,[a2,#-1]!	;a3 >= 2
		CMP	a3,#2
		LDRHIB	v3,[a2,#-1]!	;a3 =  3
		ORR	lr,lr,v1
		STR	lr,[a4],#-4
		STRHSB	v2,[a4,#3]	;a3 >= 2: byte just below the stored word
		STRHIB	v3,[a4,#2]	;a3 =  3: and the one below that

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4

		;Backward copy, source word-aligned, destination pointer two
		;bytes above a word boundary.
		;
		;entry	a2 -> aligned source (end pointer, pre-decremented)
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the two bytes already decided for the next output
		;word in bits 16-31; the word is completed with the upper half
		;of the next source word (LSR #16), and the lower half
		;(LSL #16) becomes the new lr.

ucp_sh2		SUBS	a3,a3,#32
		BIC	lr,lr,#&FF00	;keep upper 2 dest bytes, clear bits 0-15
		BIC	lr,lr,#&00FF
		BLO	ucp32sh2fin
ucp32sh2lp	PLD	[a2,#-36]
		LDR	v1,[a2,#-4]!	;first group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #16
		MOV	v1,v1,LSL #16
		ORR	v1,v1,v2,LSR #16
		MOV	v2,v2,LSL #16
		ORR	v2,v2,v3,LSR #16
		MOV	v3,v3,LSL #16
		ORR	v3,v3,ip,LSR #16
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #16	;carry 2 bytes into the next word
		LDR	v1,[a2,#-4]!	;second group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #16
		MOV	v1,v1,LSL #16
		ORR	v1,v1,v2,LSR #16
		MOV	v2,v2,LSL #16
		ORR	v2,v2,v3,LSR #16
		MOV	v3,v3,LSL #16
		ORR	v3,v3,ip,LSR #16
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #16
		SUBS	a3,a3,#32
		BHS	ucp32sh2lp
ucp32sh2fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2,#-4]!
		BLO	ucp4sh2fin
ucp4sh2lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSR #16
		STR	lr,[a4],#-4
		MOV	lr,v1,LSL #16
		LDRHS	v1,[a2,#-4]!	;fetch ahead only if another word is due
		BHS	ucp4sh2lp
ucp4sh2fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

ucp1sh2		;we arrive here with 2 bytes still in lr
		;  and a further 0-3 bytes to copy
		;
		;Each of the two missing low bytes comes from the source if
		;enough bytes remain, otherwise from the destination itself so
		;that bytes below the copied region are written back as is.

		CMP	a3,#1
		LDRHSB	v1,[a2,#-1]!	;a3 >= 1: source byte for bits 8-15
		LDRLOB	v1,[a4,#1]	;a3 =  0: existing destination byte
		LDRHIB	v2,[a2,#-1]!	;a3 >= 2: source byte for bits 0-7
		LDRLSB	v2,[a4]		;a3 <= 1: existing destination byte
		CMP	a3,#2
		LDRHIB	v3,[a2,#-1]!	;a3 =  3
		ORR	lr,lr,v1,LSL #8
		ORR	lr,lr,v2
		STR	lr,[a4],#-4
		STRHIB	v3,[a4,#3]	;a3 =  3: byte just below the stored word

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4

		;Backward copy, source word-aligned, destination pointer three
		;bytes above a word boundary.
		;
		;entry	a2 -> aligned source (end pointer, pre-decremented)
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the one byte already decided for the next output
		;word in bits 24-31; the word is completed with the upper
		;three bytes of the next source word (LSR #8), and its lowest
		;byte (LSL #24) becomes the new lr.

ucp_sh3		SUBS	a3,a3,#32
		AND	lr,lr,#&FF000000	;keep only the top dest byte
		BLO	ucp32sh3fin
ucp32sh3lp	PLD	[a2,#-36]
		LDR	v1,[a2,#-4]!	;first group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #8
		MOV	v1,v1,LSL #24
		ORR	v1,v1,v2,LSR #8
		MOV	v2,v2,LSL #24
		ORR	v2,v2,v3,LSR #8
		MOV	v3,v3,LSL #24
		ORR	v3,v3,ip,LSR #8
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #24	;carry 1 byte into the next word
		LDR	v1,[a2,#-4]!	;second group of 4 source words
		LDR	v2,[a2,#-4]!
		LDR	v3,[a2,#-4]!
		LDR	ip,[a2,#-4]!
		ORR	lr,lr,v1,LSR #8
		MOV	v1,v1,LSL #24
		ORR	v1,v1,v2,LSR #8
		MOV	v2,v2,LSL #24
		ORR	v2,v2,v3,LSR #8
		MOV	v3,v3,LSL #24
		ORR	v3,v3,ip,LSR #8
		STR	lr,[a4],#-4
		STR	v1,[a4],#-4
		STR	v2,[a4],#-4
		STR	v3,[a4],#-4
		MOV	lr,ip,LSL #24
		SUBS	a3,a3,#32
		BHS	ucp32sh3lp
ucp32sh3fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2,#-4]!
		BLO	ucp4sh3fin
ucp4sh3lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSR #8
		STR	lr,[a4],#-4
		MOV	lr,v1,LSL #24
		LDRHS	v1,[a2,#-4]!	;fetch ahead only if another word is due
		BHS	ucp4sh3lp
ucp4sh3fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

ucp1sh3		;we arrive here with 1 byte still in lr
		;  and a further 0-3 bytes to copy
		;
		;Each of the three missing low bytes comes from the source if
		;enough bytes remain, otherwise from the destination itself so
		;that bytes below the copied region are written back as is.
		;Everything fits into this one word, so no byte stores follow.

		CMP	a3,#1
		LDRHSB	v1,[a2,#-1]!	;a3 >= 1: source byte for bits 16-23
		LDRLOB	v1,[a4,#2]	;a3 =  0: existing destination byte
		LDRHIB	v2,[a2,#-1]!	;a3 >= 2: source byte for bits 8-15
		LDRLSB	v2,[a4,#1]	;a3 <= 1: existing destination byte
		CMP	a3,#2
		LDRHIB	v3,[a2,#-1]!	;a3 =  3: source byte for bits 0-7
		LDRLSB	v3,[a4]		;a3 <= 2: existing destination byte
		ORR	lr,lr,v1,LSL #16
		ORR	lr,lr,v2,LSL #8
		ORR	lr,lr,v3
		STR	lr,[a4]

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4


down		;copying down in memory (destination below source), so the
		; copy runs forwards through ascending addresses
		;
		; entry	a1 -> destination, a2 -> source, a3 = size in bytes
		;	lr, v1, v2 already pushed by the memcpy/memmove entry
		;
		; are the source and destination both word-aligned?

		MOV	a4,a1		;a4 = working dest pointer, a1 is kept
		ANDS	ip,a2,#3	;ip = offset of source within its word
		TSTEQ	a1,#3		;dest only tested if source is aligned
		BNE	down_unaligned

		;Blocks of 32 are only used when at least 128 bytes remain,
		;blocks of 16 when at least 32 remain.

down_aligned	CMP	a3,#32
		BLO	dcp4
		CMP	a3,#128
		BLO	dcp16lp

		;32 bytes per pass.  SUBS leaves HS when a full block was
		;available; every load/store is conditional on that, so the
		;final (failing) pass through the loop body copies nothing.

dcp32lp		SUBS	a3,a3,#32
		PLD	[a2,#32]	;preload hint ahead of the read pointer
		LDRHS	v1,[a2],#4
		LDRHS	v2,[a2],#4
		LDRHS	ip,[a2],#4
		LDRHS	lr,[a2],#4
		STRHS	v1,[a4],#4
		STRHS	v2,[a4],#4
		STRHS	ip,[a4],#4
		STRHS	lr,[a4],#4
		LDRHS	v1,[a2],#4
		LDRHS	v2,[a2],#4
		LDRHS	ip,[a2],#4
		LDRHS	lr,[a2],#4
		STRHS	v1,[a4],#4
		STRHS	v2,[a4],#4
		STRHS	ip,[a4],#4
		STRHS	lr,[a4],#4
		BHI	dcp32lp		;loop while the count is still non-zero
		ADDLO	a3,a3,#32	;went negative: undo the last subtraction

		;16 bytes per pass, same scheme as above

dcp16lp		SUBS	a3,a3,#16
		LDRHS	v1,[a2],#4
		LDRHS	v2,[a2],#4
		LDRHS	ip,[a2],#4
		LDRHS	lr,[a2],#4
		STRHS	v1,[a4],#4
		STRHS	v2,[a4],#4
		STRHS	ip,[a4],#4
		STRHS	lr,[a4],#4
		BHI	dcp16lp
		ADDLO	a3,a3,#16

		;one word per pass; the load for the next pass is issued
		;before the branch, each step conditional on the count

dcp4		SUBS	a3,a3,#4
		LDRHS	ip,[a2],#4
dcp4lp		STRHS	ip,[a4],#4
		SUBHSS	a3,a3,#4
		LDRHS	ip,[a2],#4
		BHS	dcp4lp
		ADD	a3,a3,#4	;a3 = 0..3 bytes left

		;one byte per pass.  Also entered from down_unaligned when
		;the whole request is no longer than the alignment run.

dcp1		SUBS	a3,a3,#1
		LDRHSB	ip,[a2],#1
dcp1lp		STRHSB	ip,[a4],#1
		SUBHSS	a3,a3,#1
		LDRHSB	ip,[a2],#1
		BHS	dcp1lp

		LDR	v2,[sp],#4	;pop in reverse order of the pushes
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4	;return: saved lr is loaded into pc

down_unaligned	;align source to next word boundary
		;
		;ip = offset of the source within its word (0..3; 0 when only
		;     the destination is unaligned)
		;
		;The instructions after 'B dcp1' form a jump table with
		;8-byte (two instruction) entries, indexed by ip.  pc reads as
		;the address of the ADDHI plus 8, i.e. the 'offset = 0' entry.
		;Entering at 'offset = n' falls through the remaining
		;LDRB/STRB pairs and so copies 4-n bytes.

		RSB	v1,ip,#4	;nof bytes til src aligned
		SUBS	a3,a3,v1	;check length not equalled/exceeded
		ADDLS	a3,a3,v1	;restore
		ADDHI	pc,pc,ip,LSL #3
		B	dcp1		;size <= v1: plain byte copy finishes it
		ADD	a3,a3,#4	;offset = 0, restore
		B	down_chkdest
		LDRB	v1,[a2],#1	;offset = 1
		STRB	v1,[a4],#1
		LDRB	v1,[a2],#1	;offset = 2
		STRB	v1,[a4],#1
		LDRB	v1,[a2],#1	;offset = 3
		STRB	v1,[a4],#1

down_chkdest	;is the destination also word-aligned now?

		ANDS	ip,a4,#3
		BEQ	down_aligned

		;No: the word containing a4 has ip (1..3) low bytes that must
		;be kept.  Round a4 down to that word and fetch it so those
		;bytes can be merged back in.

		LDR	lr,[a4,-ip]!
		STR	v3,[sp,#-4]!	;free another register

		CMP	ip,#2		;ip = 1 falls through to dcp_sh1
		BHI	dcp_sh3
		BEQ	dcp_sh2

		;Forward copy, source word-aligned, destination pointer one
		;byte above a word boundary.
		;
		;entry	a2 -> aligned source
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the one byte already decided for the next output
		;word in bits 0-7; the word is completed with the lower three
		;bytes of the next source word (LSL #8), and its top byte
		;(LSR #24) becomes the new lr.  The byte/bit pairing used
		;throughout matches little-endian memory order.

dcp_sh1		SUBS	a3,a3,#32
		AND	lr,lr,#&FF	;keep only the lowest dest byte
		BLO	dcp32sh1fin
dcp32sh1lp	PLD	[a2,#32]
		LDR	v1,[a2],#4	;first group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #8
		MOV	v1,v1,LSR #24
		ORR	v1,v1,v2,LSL #8
		MOV	v2,v2,LSR #24
		ORR	v2,v2,v3,LSL #8
		MOV	v3,v3,LSR #24
		ORR	v3,v3,ip,LSL #8
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #24	;carry 1 byte into the next word
		LDR	v1,[a2],#4	;second group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #8
		MOV	v1,v1,LSR #24
		ORR	v1,v1,v2,LSL #8
		MOV	v2,v2,LSR #24
		ORR	v2,v2,v3,LSL #8
		MOV	v3,v3,LSR #24
		ORR	v3,v3,ip,LSL #8
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #24
		SUBS	a3,a3,#32
		BHS	dcp32sh1lp
dcp32sh1fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2],#4
		BLO	dcp4sh1fin
dcp4sh1lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSL #8
		STR	lr,[a4],#4
		MOV	lr,v1,LSR #24
		LDRHS	v1,[a2],#4	;fetch ahead only if another word is due
		BHS	dcp4sh1lp
dcp4sh1fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

dcp1sh1		;we arrive here with 1 byte still in lr
		;  and a further 0-3 bytes to copy
		;
		;Each of the three missing upper bytes comes from the source
		;if enough bytes remain, otherwise from the destination itself
		;so that bytes above the copied region are written back as is.
		;Everything fits into this one word, so no byte stores follow.

		CMP	a3,#1
		LDRHSB	v1,[a2],#1	;a3 >= 1: source byte for bits 8-15
		LDRLOB	v1,[a4,#1]	;a3 =  0: existing destination byte
		LDRHIB	v2,[a2],#1	;a3 >= 2: source byte for bits 16-23
		LDRLSB	v2,[a4,#2]	;a3 <= 1: existing destination byte
		CMP	a3,#2
		LDRHIB	v3,[a2],#1	;a3 =  3: source byte for bits 24-31
		LDRLSB	v3,[a4,#3]	;a3 <= 2: existing destination byte
		ORR	lr,lr,v1,LSL #8
		ORR	lr,lr,v2,LSL #16
		ORR	lr,lr,v3,LSL #24
		STR	lr,[a4]

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4


		;Forward copy, source word-aligned, destination pointer two
		;bytes above a word boundary.
		;
		;entry	a2 -> aligned source
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the two bytes already decided for the next output
		;word in bits 0-15; the word is completed with the lower half
		;of the next source word (LSL #16), and the upper half
		;(LSR #16) becomes the new lr.

dcp_sh2		SUBS	a3,a3,#32
		BIC	lr,lr,#&FF000000	;keep lower 2 dest bytes,
		BIC	lr,lr,#&00FF0000	;  clear bits 16-31
		BLO	dcp32sh2fin
dcp32sh2lp	PLD	[a2,#32]
		LDR	v1,[a2],#4	;first group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #16
		MOV	v1,v1,LSR #16
		ORR	v1,v1,v2,LSL #16
		MOV	v2,v2,LSR #16
		ORR	v2,v2,v3,LSL #16
		MOV	v3,v3,LSR #16
		ORR	v3,v3,ip,LSL #16
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #16	;carry 2 bytes into the next word
		LDR	v1,[a2],#4	;second group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #16
		MOV	v1,v1,LSR #16
		ORR	v1,v1,v2,LSL #16
		MOV	v2,v2,LSR #16
		ORR	v2,v2,v3,LSL #16
		MOV	v3,v3,LSR #16
		ORR	v3,v3,ip,LSL #16
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #16
		SUBS	a3,a3,#32
		BHS	dcp32sh2lp
dcp32sh2fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2],#4
		BLO	dcp4sh2fin
dcp4sh2lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSL #16
		STR	lr,[a4],#4
		MOV	lr,v1,LSR #16
		LDRHS	v1,[a2],#4	;fetch ahead only if another word is due
		BHS	dcp4sh2lp
dcp4sh2fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

dcp1sh2		;we arrive here with 2 bytes still in lr
		;  and a further 0-3 bytes to copy
		;
		;Each of the two missing upper bytes comes from the source if
		;enough bytes remain, otherwise from the destination itself so
		;that bytes above the copied region are written back as is.

		CMP	a3,#1
		LDRLOB	v1,[a4,#2]	;a3 =  0: existing destination byte
		LDRHSB	v1,[a2],#1	;a3 >= 1: source byte for bits 16-23
		LDRLSB	v2,[a4,#3]	;a3 <= 1: existing destination byte
		LDRHIB	v2,[a2],#1	;a3 >= 2: source byte for bits 24-31
		CMP	a3,#2
		LDRHIB	v3,[a2],#1	;a3 =  3
		ORR	lr,lr,v1,LSL #16
		ORR	lr,lr,v2,LSL #24
		STR	lr,[a4],#4
		STRHIB	v3,[a4],#1	;a3 =  3: first byte of the next word

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4

		;Forward copy, source word-aligned, destination pointer three
		;bytes above a word boundary.
		;
		;entry	a2 -> aligned source
		;	a4 -> aligned destination word to be completed
		;	a3 =  bytes remaining
		;	lr =  current contents of the word at a4
		;	v3 has been pushed (popped again on exit)
		;
		;lr holds the three bytes already decided for the next output
		;word in bits 0-23; the word is completed with the lowest byte
		;of the next source word (LSL #24), and its upper three bytes
		;(LSR #8) become the new lr.

dcp_sh3		SUBS	a3,a3,#32
		BIC	lr,lr,#&FF000000	;keep lower 3 dest bytes
		BLO	dcp32sh3fin
dcp32sh3lp	PLD	[a2,#32]
		LDR	v1,[a2],#4	;first group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #24
		MOV	v1,v1,LSR #8
		ORR	v1,v1,v2,LSL #24
		MOV	v2,v2,LSR #8
		ORR	v2,v2,v3,LSL #24
		MOV	v3,v3,LSR #8
		ORR	v3,v3,ip,LSL #24
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #8	;carry 3 bytes into the next word
		LDR	v1,[a2],#4	;second group of 4 source words
		LDR	v2,[a2],#4
		LDR	v3,[a2],#4
		LDR	ip,[a2],#4
		ORR	lr,lr,v1,LSL #24
		MOV	v1,v1,LSR #8
		ORR	v1,v1,v2,LSL #24
		MOV	v2,v2,LSR #8
		ORR	v2,v2,v3,LSL #24
		MOV	v3,v3,LSR #8
		ORR	v3,v3,ip,LSL #24
		STR	lr,[a4],#4
		STR	v1,[a4],#4
		STR	v2,[a4],#4
		STR	v3,[a4],#4
		MOV	lr,ip,LSR #8
		SUBS	a3,a3,#32
		BHS	dcp32sh3lp
dcp32sh3fin	ADD	a3,a3,#32	;a3 = 0..31 bytes left

		;one source word per pass while at least 4 bytes remain

		SUBS	a3,a3,#4
		LDRHS	v1,[a2],#4
		BLO	dcp4sh3fin
dcp4sh3lp	SUBS	a3,a3,#4
		ORR	lr,lr,v1,LSL #24
		STR	lr,[a4],#4
		MOV	lr,v1,LSR #8
		LDRHS	v1,[a2],#4	;fetch ahead only if another word is due
		BHS	dcp4sh3lp
dcp4sh3fin	ADD	a3,a3,#4	;a3 = 0..3 bytes left

dcp1sh3		;we arrive here with 3 bytes still in lr
		;  and a further 0-3 bytes to copy
		;
		;The missing top byte of the last word comes from the source
		;if any bytes remain, otherwise from the destination itself so
		;that the byte above the copied region is written back as is.

		CMP	a3,#1
		LDRHSB	v1,[a2],#1	;a3 >= 1: next source byte
		LDRLOB	v1,[a4,#3]	;read 4th from dest buffer
		LDRHIB	v2,[a2],#1	;a3 >= 2
		CMP	a3,#2
		LDRHIB	v3,[a2],#1	;a3 =  3
		ORR	lr,lr,v1,LSL #24
		STR	lr,[a4],#4
		STRHSB	v2,[a4]		;a3 >= 2: first byte of the next word
		STRHIB	v3,[a4,#1]	;a3 =  3: and the one after it

		LDR	v3,[sp],#4	;pop v3, v2, v1 and return
		LDR	v2,[sp],#4
		LDR	v1,[sp],#4
		LDR	pc,[sp],#4


; Fill a block of memory
;
; entry	a1 -> destination buffer
;	a2 =  char used to fill buffer
;	a3 =  nof chars to be written
; exit	a1 -> destination buffer

memset		AND	a2,a2,#&FF
		MOV	a4,a1
		ORR	a2,a2,a2,LSL #8
		ANDS	ip,a1,#3
		ORR	a2,a2,a2,LSL #16
		BNE	mset_unaligned

mset_aligned	CMP	a3,#64
		BLO	mset16_lp

mset64_lp	SUBS	a3,a3,#64
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		BHI	mset64_lp
		ADDLO	a3,a3,#64

mset16_lp	SUBS	a3,a3,#16
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		STRHS	a2,[a4],#4
		BHI	mset16_lp
		ADDLO	a3,a3,#16

		;at most 15 bytes left

		SUBS	a3,a3,#4
		STRHS	a2,[a4],#4
		SUBHSS	a3,a3,#4
		STRHS	a2,[a4],#4
		SUBHSS	a3,a3,#4
		STRHS	a2,[a4],#4
		ADDLO	a3,a3,#4

		;at most 3 bytes left

		SUBS	a3,a3,#1
		STRHSB	a2,[a4],#1
		SUBHSS	a3,a3,#1
		STRHSB	a2,[a4],#1
		SUBHSS	a3,a3,#1
		STRHSB	a2,[a4],#1
		MOV	pc,lr

mset_unaligned	RSB	ip,ip,#4	;nof bytes til word aligned
		CMP	ip,a3
		MOVHI	ip,a3		;but don't overrun

		SUBS	ip,ip,#1
		SUBHS	a3,a3,#1
		STRHSB	a2,[a4],#1
		SUBHSS	ip,ip,#1
		SUBHS	a3,a3,#1
		STRHSB	a2,[a4],#1
		SUBHSS	ip,ip,#1
		SUBHS	a3,a3,#1
		STRHSB	a2,[a4],#1

		TEQ	a3,#0		;finished already?
		MOVEQ	pc,lr
		B	mset_aligned


; ----- Remember that string-handling routines can only read one
;       word ahead without risking a data abort :-(

; Copy a NUL-terminated string
;
; entry	a1 -> destination buffer
;	a2 -> string
; exit	a1 -> destination buffer (original value, reloaded from the stack)
;
; a1 is advanced as the write pointer, so the incoming value is pushed
; first.  a3, a4 and ip are scratch; a4 holds the constant &01010101.
;
; Zero-byte test used below (w = word under test, t = scratch):
;	SUBS	t,w,a4		borrow out (CC, result NE) => zero byte
;	EORCS	t,t,w
;	BICCSS	t,a4,t		NE => a borrow crossed a byte boundary
; EQ after the three instructions means w contains no zero byte; NE means
; it contains at least one.

strcpy		;strongly biased in favour of word-aligned source and destination

		STR	a1,[sp,#-4]!	;save the return value
		MOV	a4,#1
		ORR	ip,a1,a2	;combined low bits of both pointers
		ORR	a4,a4,a4,LSL #8
		TST	ip,#3
		LDREQ	ip,[a2],#4	;both aligned: fetch the first word
		ORR	a4,a4,a4,LSL #16	;a4 = &01010101
		BNE	strcpy_misaligned

		;Word loop, unrolled four times.  ip and a3 alternate as the
		;word under test; the other one is the scratch for the test
		;and then receives the next word.

strcpy_lp	SUBS	a3,ip,a4	;3 instruction test for zero byte in word
		EORCS	a3,a3,ip
		BICCSS	a3,a4,a3
		LDREQ	a3,[a2],#4	;safe to read next word now
		STREQ	ip,[a1],#4
		BNE	strcpy_fin	;final word is in ip

		SUBS	ip,a3,a4	;3 instruction test for zero byte in word
		EORCS	ip,ip,a3
		BICCSS	ip,a4,ip
		LDREQ	ip,[a2],#4	;safe to read next word now
		STREQ	a3,[a1],#4
		BNE	strcpy_a3	;final word is in a3

		SUBS	a3,ip,a4	;3 instruction test for zero byte in word
		EORCS	a3,a3,ip
		BICCSS	a3,a4,a3
		LDREQ	a3,[a2],#4	;safe to read next word now
		STREQ	ip,[a1],#4
		BNE	strcpy_fin	;final word is in ip

		SUBS	ip,a3,a4	;3 instruction test for zero byte in word
		EORCS	ip,ip,a3
		BICCSS	ip,a4,ip
		LDREQ	ip,[a2],#4	;safe to read next word now
		STREQ	a3,[a1],#4
		BEQ	strcpy_lp

strcpy_a3	MOV	ip,a3		;move the final word to ip

		;we still need to store the 1 to 4 bytes used in the last word
		;Bytes are taken from the low end of ip; each further step is
		;conditional on the byte just stored being non-zero.

strcpy_fin	STRB	ip,[a1],#1	;store the first byte of the last word
					; (this may be the terminator)
		TST	ip,#&FF
		MOVNE	ip,ip,LSR #8
		STRNEB	ip,[a1],#1
		TSTNE	ip,#&FF
		MOVNE	ip,ip,LSR #8
		STRNEB	ip,[a1],#1
		TSTNE	ip,#&FF
		MOVNE	ip,ip,LSR #8
		STRNEB	ip,[a1],#1

		;finished (we copied the terminator byte before checking its value)

		LDR	a1,[sp],#4	;pop the original destination pointer
		MOV	pc,lr

		;strcpy: source and/or destination not word-aligned.
		;
		;entry	a1 -> destination (original a1 is on the stack)
		;	a2 -> source
		;	a4 =  &01010101

strcpy_misaligned
		TST	a2,#3
		LDRNEB	ip,[a2],#1	;source unaligned: fetch the first byte
		BEQ	strcpy_chkdest	;source aligned: only dest is off

		;copy individual bytes until source ptr is word-aligned or finished
		; (between 1 and 3 iterations performed)
		;
		;ip is the byte about to be stored and a2 already points past
		;it.  The flags end up NE only if a2 is still unaligned and ip
		;is non-zero; the byte is stored in either case.

strcpy_align_lp	TST	a2,#3		;is source word-aligned yet
		TEQNE	ip,#0		;  or is this the end?
		STRB	ip,[a1],#1
		LDRNEB	ip,[a2],#1
		BNE	strcpy_align_lp

		TEQ	ip,#0		;end of string reached?
		LDREQ	a1,[sp],#4	;yes: pop original destination, return
		MOVEQ	pc,lr

strcpy_chkdest	;is destination also word-aligned now?

		ANDS	a3,a1,#3	;a3 = dest offset within its word
		LDREQ	ip,[a2],#4	;aligned: fetch first word and join
		BEQ	strcpy_lp	;  the word-aligned loop
		CMP	a3,#2		;a3 = 1 falls through to strcpy1
		BHI	strcpy3
		BEQ	strcpy2

		;strcpy: source word-aligned, destination one byte above a
		;word boundary.
		;
		;entry	a1 -> destination (original a1 is on the stack)
		;	a2 -> aligned source
		;	a4 =  &01010101
		;
		;One byte is always pending for the low end of the next output
		;word: first the byte already in memory just below a1, later
		;the top byte of the previous source word.  a3 and ip take
		;turns holding the current source word and the pending byte;
		;lr is pushed and used as scratch for the zero-byte test (EQ
		;after SUBS/EORCS/BICCSS = no zero byte) and for the merged
		;output word.

strcpy1		LDR	a3,[a2],#4	;first source word
		LDRB	ip,[a1,#-1]!	;pending byte; a1 now word-aligned
		STR	lr,[sp,#-4]!	;need another register

strcpy1_lp	SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy1_fin
		ORR	lr,ip,a3,LSL #8	;pending byte + low 3 bytes of a3
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #24	;top byte of a3 becomes pending

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy1_ip
		ORR	lr,a3,ip,LSL #8
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #24

		SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy1_fin
		ORR	lr,ip,a3,LSL #8
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #24

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy1_ip
		ORR	lr,a3,ip,LSL #8
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #24
		B	strcpy1_lp

strcpy1_ip	MOV	lr,a3		;swap a3 and ip so that the final
		MOV	a3,ip		;  word is in a3 and the pending
		MOV	ip,lr		;  byte in ip

strcpy1_fin	;a3 contains the last loaded word which contains a 0 byte
		;ip contains (in its LSB) the next byte to be written to dest
		;
		;Bytes of a3 are stored from the low end; each further step is
		;conditional on the byte just stored being non-zero.

		STRB	ip,[a1],#1
		STRB	a3,[a1],#1
		TST	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1

		;finished

		LDR	a1,[sp,#4]	;original destination (pushed first)
		LDR	pc,[sp],#8	;return via saved lr, drop both words


		;strcpy: source word-aligned, destination two bytes above a
		;word boundary.
		;
		;entry	a1 -> destination (original a1 is on the stack)
		;	a2 -> aligned source
		;	a4 =  &01010101
		;
		;Two bytes are always pending for the low half of the next
		;output word: first the two bytes already in memory just below
		;a1, later the upper half of the previous source word.  a3 and
		;ip take turns holding the current source word and the pending
		;bytes; lr is pushed and used as scratch for the zero-byte
		;test (EQ after SUBS/EORCS/BICCSS = no zero byte) and for the
		;merged output word.

strcpy2		LDR	ip,[a1,#-2]!	;word containing dest; a1 now aligned
		LDR	a3,[a2],#4	;first source word
		STR	lr,[sp,#-4]!	;need another register
		BIC	ip,ip,#&FF000000	;keep only the 2 low bytes
		BIC	ip,ip,#&00FF0000

strcpy2_lp	SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy2_fin
		ORR	lr,ip,a3,LSL #16	;pending + low half of a3
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #16	;upper half of a3 becomes pending

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy2_ip
		ORR	lr,a3,ip,LSL #16
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #16

		SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy2_fin
		ORR	lr,ip,a3,LSL #16
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #16

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy2_ip
		ORR	lr,a3,ip,LSL #16
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #16
		B	strcpy2_lp

strcpy2_ip	MOV	lr,a3		;swap a3 and ip so that the final
		MOV	a3,ip		;  word is in a3 and the pending
		MOV	ip,lr		;  bytes in ip

strcpy2_fin	;a3 contains the last loaded word which contains a 0 byte
		;ip contains (in its 2 LSBs) the 2 next bytes to be written to dest
		;
		;Bytes of a3 are stored from the low end; each further step is
		;conditional on the byte just stored being non-zero.

		MOV	lr,ip,LSR #8	;second pending byte
		STRB	ip,[a1],#1
		STRB	lr,[a1],#1
		STRB	a3,[a1],#1
		TST	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1

		;finished

		LDR	a1,[sp,#4]	;original destination (pushed first)
		LDR	pc,[sp],#8	;return via saved lr, drop both words


		;strcpy: source word-aligned, destination three bytes above a
		;word boundary.
		;
		;entry	a1 -> destination (original a1 is on the stack)
		;	a2 -> aligned source
		;	a4 =  &01010101
		;
		;Three bytes are always pending for the low end of the next
		;output word: first the three bytes already in memory just
		;below a1, later the upper three bytes of the previous source
		;word.  a3 and ip take turns holding the current source word
		;and the pending bytes; lr is pushed and used as scratch for
		;the zero-byte test (EQ after SUBS/EORCS/BICCSS = no zero
		;byte) and for the merged output word.

strcpy3		LDR	ip,[a1,#-3]!	;word containing dest; a1 now aligned
		LDR	a3,[a2],#4	;first source word
		STR	lr,[sp,#-4]!	;need another register
		BIC	ip,ip,#&FF000000	;keep only the 3 low bytes

strcpy3_lp	SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy3_fin
		ORR	lr,ip,a3,LSL #24	;pending + lowest byte of a3
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #8	;upper 3 bytes of a3 become pending

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy3_ip
		ORR	lr,a3,ip,LSL #24
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #8

		SUBS	lr,a3,a4	;zero byte in a3?
		EORCS	lr,lr,a3
		BICCSS	lr,a4,lr
		BNE	strcpy3_fin
		ORR	lr,ip,a3,LSL #24
		LDR	ip,[a2],#4
		STR	lr,[a1],#4
		MOV	a3,a3,LSR #8

		SUBS	lr,ip,a4	;zero byte in ip?
		EORCS	lr,lr,ip
		BICCSS	lr,a4,lr
		BNE	strcpy3_ip
		ORR	lr,a3,ip,LSL #24
		LDR	a3,[a2],#4
		STR	lr,[a1],#4
		MOV	ip,ip,LSR #8
		B	strcpy3_lp

strcpy3_ip	MOV	lr,a3		;swap a3 and ip so that the final
		MOV	a3,ip		;  word is in a3 and the pending
		MOV	ip,lr		;  bytes in ip

strcpy3_fin	;a3 contains the last loaded word which contains a 0 byte
		;ip contains (in its 3 LSBs) the 3 next bytes to be written to dest
		;
		;The pending bytes and the first byte of a3 go out as one
		;word; further bytes of a3 follow one at a time, each step
		;conditional on the previous byte being non-zero.

		ORR	lr,ip,a3,LSL #24
		STR	lr,[a1],#4
		TST	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1
		TSTNE	a3,#&FF
		MOVNE	a3,a3,LSR #8
		STRNEB	a3,[a1],#1

		;finished

		LDR	a1,[sp,#4]	;original destination (pushed first)
		LDR	pc,[sp],#8	;return via saved lr, drop both words

		;start strlen on a 32-byte boundary
		;NOTE(review): presumably to match the cache line size of the
		;target core - confirm
		ALIGN	32


; Determine the length of a NUL-terminated string
;
; entry	a1 -> NUL-terminated string
; exit	a1 = length excluding terminator
;
; a2 is the read pointer, a3 and ip alternate as the word under test and
; a4 holds the constant &01010101.  Nothing is pushed on the stack.
;
; Zero-byte test used below (w = word under test, t = scratch):
;	SUBS	t,w,a4		borrow out (CC, result NE) => zero byte
;	EORCS	t,t,w
;	BICCSS	t,a4,t		NE => a borrow crossed a byte boundary
; EQ after the three instructions means w contains no zero byte; NE means
; it contains at least one.

strlen		ANDS	a3,a1,#3	;a3 = offset of string within its word
		MOV	a4,#1
		LDR	ip,[a1,-a3]	;read 1st word, guaranteed to be aligned
		ORR	a4,a4,a4,LSL #8
		MOV	a2,a1
		ORR	a4,a4,a4,LSL #16	;a4 = &01010101
		BNE	strlen_align	;flags still from the ANDS above

		;Word loop, unrolled four times.  a2 points at the word
		;currently under test.

strlen_lp	SUBS	a3,ip,a4	;3 instruction test for zero byte in word
		EORCS	a3,a3,ip
		BICCSS	a3,a4,a3
		LDREQ	a3,[a2,#4]!	;safe to read next word now
		BNE	strlen_end	;final word is in ip

		SUBS	ip,a3,a4	;3 instruction test for zero byte in word
		EORCS	ip,ip,a3
		BICCSS	ip,a4,ip
		LDREQ	ip,[a2,#4]!	;safe to read next word now
		BNE	strlen_end_a3	;final word is in a3

		SUBS	a3,ip,a4	;3 instruction test for zero byte in word
		EORCS	a3,a3,ip
		BICCSS	a3,a4,a3
		LDREQ	a3,[a2,#4]!	;safe to read next word now
		BNE	strlen_end	;final word is in ip

		SUBS	ip,a3,a4	;3 instruction test for zero byte in word
		EORCS	ip,ip,a3
		BICCSS	ip,a4,ip
		LDREQ	ip,[a2,#4]!	;safe to read next word now
		BEQ	strlen_lp

strlen_end_a3	MOV	ip,a3		;move the final word to ip

		;ip = word containing the terminator, a2 -> that word.
		;Add one for each non-zero byte below the first zero byte.

strlen_end	SUB	a1,a2,a1	;excludes last word read
		TST	ip,#&FF
		ADDNE	a1,a1,#1
		TSTNE	ip,#&FF00
		ADDNE	a1,a1,#1
		TSTNE	ip,#&FF0000
		ADDNE	a1,a1,#1
		MOV	pc,lr

		;String does not start on a word boundary.  ip holds the
		;aligned word containing the start; only the bytes from
		;offset a3 upwards belong to the string.  a2 is advanced past
		;each non-zero byte; at strlen3 the flags are NE on every
		;path that still has to test the top byte.

strlen_align	CMP	a3,#2
		BHI	strlen3		;offset 3: only the top byte to test
		BEQ	strlen2		;offset 2: test bytes 2 and 3
		TST	ip,#&FF00	;offset 1: test byte 1 first
		MOVEQ	a1,#0		;empty string
		MOVEQ	pc,lr
		ADD	a2,a2,#1
strlen2		TST	ip,#&FF0000
		ADDNE	a2,a2,#1
strlen3		TSTNE	ip,#&FF000000
		LDRNE	ip,[a2,#1]!	;no zero yet: a2+1 is the next aligned word
		SUBEQ	a1,a2,a1	;zero found: length = bytes skipped
		MOVEQ	pc,lr
		B	strlen_lp

		END
