/*	$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $	*/

/*
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Allen Briggs for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_multiprocessor.h"
#include "opt_cpuoptions.h"

#include "assym.h"

#include <machine/asm.h>

#include <arm/locore.h>

#if defined(__XSCALE__) || defined(_ARM_ARCH_6)
/*
 * armv6 and v7 have pld and strd so they can use the xscale
 * bcopyinout as well.
 */
#include "bcopyinout_xscale.S"
#else

RCSID("$NetBSD: bcopyinout.S,v 1.23 2022/10/20 06:58:38 skrll Exp $")

	.text
	.align	0

/*
 * Push and pop r4-r11, which the copy routines below use as working
 * registers.  stmdb/ldmia with sp writeback are the same instructions as
 * the stack-oriented stmfd/ldmfd spellings.  Every SAVE_REGS has to be
 * matched by a RESTORE_REGS on each path that returns to the caller.
 */
#define SAVE_REGS	stmdb	sp!, {r4-r11}
#define RESTORE_REGS	ldmia	sp!, {r4-r11}

/*
 * PREFETCH(rx,o) expands to nothing in this file.
 *
 * This part of the file is only assembled when neither __XSCALE__ nor
 * _ARM_ARCH_6 is defined (those configurations include
 * bcopyinout_xscale.S instead), so a pld-based variant guarded by the
 * same test could never be selected here.  The PREFETCH() call sites in
 * the copy routines are kept to mark where a prefetch hint belongs.
 */
#define PREFETCH(rx,o)

/*
 * copyin
 *
 * r0 = user space address (source)
 * r1 = kernel space address (destination)
 * r2 = length in bytes
 *
 * Copies bytes from user space to kernel space.
 *
 * Returns 0 in r0 when the whole range was copied, or when the length is
 * zero.  If an access faults while the handler below is installed, control
 * resumes at .Lcopyfault, which returns without writing r0; the error code
 * is presumably placed in r0 by the fault handler before it redirects
 * here -- confirm against the abort handler.
 *
 * Register use after the prologue:
 *	r4	= value produced by GET_CURPCB (presumably the current PCB)
 *	r5	= previous PCB_ONFAULT value, restored on every exit path
 *	r3	= scratch
 *	r6-r11	= data scratch
 *
 * We save/restore r4-r11.
 *
 * Every user space read uses ldrt/ldrbt so that the access is made with
 * user-mode permissions; kernel space writes use plain str/strb/stmia.
 */
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	/*
	 * Remember the previous fault handler in r5 and install
	 * .Lcopyfault for the duration of the copy.
	 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow path.
	 * The compare is signed, so a length with the top bit set also
	 * goes to the byte-at-a-time path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
	 *
	 * Computed jump: in ARM state pc reads as the address of the ldr
	 * plus 8, which is the first .word below, so the table is indexed
	 * by (r1 & 3):
	 *	0 -> already aligned
	 *	1 -> copy 3 bytes
	 *	2 -> copy 2 bytes
	 *	3 -> copy 1 byte
	 * The branch after the ldr is never executed; it only fills the
	 * slot between the ldr and the table.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lialend
	.word	.Lialend
	.word	.Lial3
	.word	.Lial2
	.word	.Lial1
.Lial3:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lial2:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lial1:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lialend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * If source is not aligned, finish slow.
	 * (The destination is word aligned here; word loads are only
	 * used when the source is word aligned as well.)
	 */
	ands	r3, r0, #0x03
	bne	.Licleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Licleanup8

	/*
	 * Align destination to cacheline boundary.
	 * If source and destination are nicely aligned, this can be a big
	 * win.  If not, it's still cheaper to copy in groups of 32 even if
	 * we don't get the nice cacheline alignment.
	 *
	 * r1 is word aligned here, so (r1 & 0x1f) is a multiple of 4 and
	 * is used directly as the byte offset into the table that starts
	 * at pc + 8.  An offset of 4 enters at .Lical28 (28 bytes to the
	 * next 32-byte boundary), 8 at .Lical24, and so on down to an
	 * offset of 28 entering at .Lical4.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Licaligned
	.word	.Licaligned
	.word	.Lical28
	.word	.Lical24
	.word	.Lical20
	.word	.Lical16
	.word	.Lical12
	.word	.Lical8
	.word	.Lical4
.Lical28:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical24:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical20:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical16:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical12:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical8:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical4:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Licaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/*
	 * Copy a cacheline: eight single-word user loads (there is no
	 * multi-register form of ldrt), stored as 2 + 6 words.
	 */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x40
	bge	.Licaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline (the final one, outside the loop) */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x08
	blt	.Liprecleanup

	/* Copy 8 bytes per iteration while at least 8 bytes remain. */
.Licleanup8:
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	sub	r2, r2, #8
	stmia	r1!, {r8, r9}
	cmp	r2, #8
	bge	.Licleanup8

.Liprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Liout

	/*
	 * Byte-at-a-time tail.  r2 is non-zero on every entry.  The table
	 * (again at pc + 8) is indexed by (r2 & 3):
	 *	0 -> .Lic4 (4 bytes)	1 -> .Lic1 (1 byte)
	 *	2 -> .Lic2 (2 bytes)	3 -> .Lic3 (3 bytes)
	 * so after the first pass the remainder is a multiple of 4.  Only
	 * the last byte copy uses subs, which sets the flags tested by the
	 * bne at .Licend.
	 */
.Licleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Licend
	.word	.Lic4
	.word	.Lic1
	.word	.Lic2
	.word	.Lic3
.Lic4:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic3:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lic2:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic1:	ldrbt	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Licend:
	bne	.Licleanup

	/* Success: return 0 after putting back the previous fault handler. */
.Liout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET

	/*
	 * Fault landing pad, also installed by copyout and kcopy.  It
	 * relies on r4 (PCB) and r5 (previous handler) still holding the
	 * values set up in the prologue, and on r4-r11 still being on the
	 * stack.  r0 is returned as found on arrival here.
	 */
.Lcopyfault:
	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyin)

/*
 * copyout
 *
 * r0 = kernel space address (source)
 * r1 = user space address (destination)
 * r2 = length in bytes
 *
 * Copies bytes from kernel space to user space.
 *
 * Returns 0 in r0 when the whole range was copied, or when the length is
 * zero.  If an access faults while the handler below is installed, control
 * resumes at .Lcopyfault (defined in copyin), which returns without
 * writing r0; the error code is presumably placed in r0 by the fault
 * handler before it redirects there -- confirm against the abort handler.
 *
 * Register use after the prologue:
 *	r4	= value produced by GET_CURPCB (presumably the current PCB)
 *	r5	= previous PCB_ONFAULT value, restored on every exit path
 *	r3	= scratch
 *	r6-r11	= data scratch
 *
 * We save/restore r4-r11.
 *
 * Every user space write uses strt/strbt so that the access is made with
 * user-mode permissions; kernel space reads use plain ldr/ldrb/ldmia.
 */

ENTRY(copyout)
	/*
	 * Quick exit if length is zero.  Return through RETc() so that
	 * this exit uses the same return sequence as the RET at the end
	 * of the function (and as the matching exit in copyin).
	 */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	/*
	 * Remember the previous fault handler in r5 and install
	 * .Lcopyfault for the duration of the copy.
	 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow path.
	 * The compare is signed, so a length with the top bit set also
	 * goes to the byte-at-a-time path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * Align destination to word boundary.
	 *
	 * Computed jump: in ARM state pc reads as the address of the ldr
	 * plus 8, which is the first .word below, so the table is indexed
	 * by (r1 & 3):
	 *	0 -> already aligned
	 *	1 -> copy 3 bytes
	 *	2 -> copy 2 bytes
	 *	3 -> copy 1 byte
	 * The branch after the ldr is never executed; it only fills the
	 * slot between the ldr and the table.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lalend
	.word	.Lalend
	.word	.Lal3
	.word	.Lal2
	.word	.Lal1
.Lal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lalend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * If source is not aligned, finish slow.
	 * (The destination is word aligned here; word loads are only
	 * used when the source is word aligned as well.)
	 */
	ands	r3, r0, #0x03
	bne	.Lcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lcleanup8

	/*
	 * Align destination to cacheline boundary.
	 * Only r1 is examined: the source is known to be word aligned at
	 * this point but is not necessarily cacheline aligned.
	 *
	 * r1 is word aligned here, so (r1 & 0x1f) is a multiple of 4 and
	 * is used directly as the byte offset into the table that starts
	 * at pc + 8.  An offset of 4 enters at .Lcal28 (28 bytes to the
	 * next 32-byte boundary), 8 at .Lcal24, and so on down to an
	 * offset of 28 entering at .Lcal4.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lcaligned
	.word	.Lcaligned
	.word	.Lcal28
	.word	.Lcal24
	.word	.Lcal20
	.word	.Lcal16
	.word	.Lcal12
	.word	.Lcal8
	.word	.Lcal4
.Lcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal8:	ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal4:	ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/*
	 * Copy a cacheline: 6 + 2 words loaded with ldmia, written with
	 * eight single-word user stores (there is no multi-register form
	 * of strt).
	 */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x40
	bge	.Lcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline (the final one, outside the loop) */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x08
	blt	.Lprecleanup

	/* Copy 8 bytes per iteration while at least 8 bytes remain. */
.Lcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	cmp	r2, #8
	bge	.Lcleanup8

.Lprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lout

	/*
	 * Byte-at-a-time tail.  r2 is non-zero on every entry.  The table
	 * (again at pc + 8) is indexed by (r2 & 3):
	 *	0 -> .Lc4 (4 bytes)	1 -> .Lc1 (1 byte)
	 *	2 -> .Lc2 (2 bytes)	3 -> .Lc3 (3 bytes)
	 * so after the first pass the remainder is a multiple of 4.  Only
	 * the last byte copy uses subs, which sets the flags tested by the
	 * bne at .Lcend.
	 */
.Lcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lcend
	.word	.Lc4
	.word	.Lc1
	.word	.Lc2
	.word	.Lc3
.Lc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strbt	r7, [r1], #1
.Lcend:
	bne	.Lcleanup

	/* Success: return 0 after putting back the previous fault handler. */
.Lout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(copyout)

/*
 * kcopy
 *
 * r0 = kernel space source address
 * r1 = kernel space destination address
 * r2 = length in bytes
 *
 * Copies bytes from kernel space to kernel space, aborting on page fault.
 *
 * Copy of copyout, but without the ldrt/strt instructions: all accesses
 * are plain ldr/str/ldm/stm.
 *
 * Returns 0 in r0 when the whole range was copied, or when the length is
 * zero.  If an access faults while the handler below is installed, control
 * resumes at .Lcopyfault (defined in copyin), which returns without
 * writing r0; the error code is presumably placed in r0 by the fault
 * handler before it redirects there -- confirm against the abort handler.
 *
 * Register use after the prologue:
 *	r4	= value produced by GET_CURPCB (presumably the current PCB)
 *	r5	= previous PCB_ONFAULT value, restored on every exit path
 *	r3	= scratch
 *	r6-r11	= data scratch
 *
 * We save/restore r4-r11.
 */

ENTRY(kcopy)
	/*
	 * Quick exit if length is zero.  Return through RETc() so that
	 * this exit uses the same return sequence as the RET at the end
	 * of the function (and as the matching exit in copyin).
	 */
	teq	r2, #0
	moveq	r0, #0
	RETc(eq)

	SAVE_REGS
	GET_CURPCB(r4)

	/*
	 * Remember the previous fault handler in r5 and install
	 * .Lcopyfault for the duration of the copy.
	 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow path.
	 * The compare is signed, so a length with the top bit set also
	 * goes to the byte-at-a-time path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * Align destination to word boundary.
	 *
	 * Computed jump: in ARM state pc reads as the address of the ldr
	 * plus 8, which is the first .word below, so the table is indexed
	 * by (r1 & 3):
	 *	0 -> already aligned
	 *	1 -> copy 3 bytes
	 *	2 -> copy 2 bytes
	 *	3 -> copy 1 byte
	 * The branch after the ldr is never executed; it only fills the
	 * slot between the ldr and the table.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkalend
	.word	.Lkalend
	.word	.Lkal3
	.word	.Lkal2
	.word	.Lkal1
.Lkal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkalend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * If source is not aligned, finish slow.
	 * (The destination is word aligned here; word loads are only
	 * used when the source is word aligned as well.)
	 */
	ands	r3, r0, #0x03
	bne	.Lkcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lkcleanup8

	/*
	 * Align destination to cacheline boundary.
	 * Only r1 is examined: the source is known to be word aligned at
	 * this point but is not necessarily cacheline aligned.
	 *
	 * r1 is word aligned here, so (r1 & 0x1f) is a multiple of 4 and
	 * is used directly as the byte offset into the table that starts
	 * at pc + 8.  An offset of 4 enters at .Lkcal28 (28 bytes to the
	 * next 32-byte boundary), 8 at .Lkcal24, and so on down to an
	 * offset of 28 entering at .Lkcal4.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lkcaligned
	.word	.Lkcaligned
	.word	.Lkcal28
	.word	.Lkcal24
	.word	.Lkcal20
	.word	.Lkcal16
	.word	.Lkcal12
	.word	.Lkcal8
	.word	.Lkcal4
.Lkcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal8:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal4:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lkcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline: 6 + 2 words in, 2 + 4 + 2 words out */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6, r7}
	ldmia	r0!, {r6, r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6, r7}

	cmp	r2, #0x40
	bge	.Lkcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline (the final one, outside the loop) */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6-r7}
	ldmia	r0!, {r6-r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6-r7}

	cmp	r2, #0x08
	blt	.Lkprecleanup

	/* Copy 8 bytes per iteration while at least 8 bytes remain. */
.Lkcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	stmia	r1!, {r8-r9}
	cmp	r2, #8
	bge	.Lkcleanup8

.Lkprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lkout

	/*
	 * Byte-at-a-time tail.  r2 is non-zero on every entry.  The table
	 * (again at pc + 8) is indexed by (r2 & 3):
	 *	0 -> .Lkc4 (4 bytes)	1 -> .Lkc1 (1 byte)
	 *	2 -> .Lkc2 (2 bytes)	3 -> .Lkc3 (3 bytes)
	 * so after the first pass the remainder is a multiple of 4.  Only
	 * the last byte copy uses subs, which sets the flags tested by the
	 * bne at .Lkcend.
	 */
.Lkcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkcend
	.word	.Lkc4
	.word	.Lkc1
	.word	.Lkc2
	.word	.Lkc3
.Lkc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Lkcend:
	bne	.Lkcleanup

	/* Success: return 0 after putting back the previous fault handler. */
.Lkout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	RET
END(kcopy)
#endif	/* !__XSCALE__ && !_ARM_ARCH_6 */

/*
 * int badaddr_read_1(const uint8_t *src, uint8_t *dest)
 *
 * r0 = src
 * r1 = dest
 *
 * Probes src by copying one 8-bit value from it to dest.  Returns 0 on
 * success, else EFAULT if a page fault occurred.  On the fault path this
 * code does not write r0 itself; the value is presumably supplied by the
 * fault handler before it resumes at the recovery label -- confirm
 * against the abort handler.
 *
 * r2 holds the value produced by GET_CURPCB and ip the previous
 * PCB_ONFAULT value, which is put back on both paths.  Only r2, r3 and
 * ip are used as scratch, so nothing is saved on the stack.
 */
ENTRY(badaddr_read_1)
	GET_CURPCB(r2)
	adr	r3, .Lbadaddr_read_1_done	/* recovery address */
	ldr	ip, [r2, #PCB_ONFAULT]		/* previous handler */
	str	r3, [r2, #PCB_ONFAULT]

	/*
	 * The access is bracketed by nops exactly as before; presumably
	 * this keeps a late abort from the load inside the window where
	 * the handler is installed -- TODO confirm.
	 */
	nop
	nop
	nop
	ldrb	r3, [r0]
	nop
	nop
	nop
	strb	r3, [r1]
	mov	r0, #0		/* No fault */
.Lbadaddr_read_1_done:
	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_1)

/*
 * int badaddr_read_2(const uint16_t *src, uint16_t *dest)
 *
 * r0 = src
 * r1 = dest
 *
 * Probes src by copying one 16-bit value from it to dest.  Returns 0 on
 * success, else EFAULT if a page fault occurred.  On the fault path this
 * code does not write r0 itself; the value is presumably supplied by the
 * fault handler before it resumes at the recovery label -- confirm
 * against the abort handler.
 *
 * r2 holds the value produced by GET_CURPCB and ip the previous
 * PCB_ONFAULT value, which is put back on both paths.  Only r2, r3 and
 * ip are used as scratch, so nothing is saved on the stack.
 */
ENTRY(badaddr_read_2)
	GET_CURPCB(r2)
	adr	r3, .Lbadaddr_read_2_done	/* recovery address */
	ldr	ip, [r2, #PCB_ONFAULT]		/* previous handler */
	str	r3, [r2, #PCB_ONFAULT]

	/*
	 * The access is bracketed by nops exactly as before; presumably
	 * this keeps a late abort from the load inside the window where
	 * the handler is installed -- TODO confirm.
	 */
	nop
	nop
	nop
	ldrh	r3, [r0]
	nop
	nop
	nop
	strh	r3, [r1]
	mov	r0, #0		/* No fault */
.Lbadaddr_read_2_done:
	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_2)

/*
 * int badaddr_read_4(const uint32_t *src, uint32_t *dest)
 *
 * r0 = src
 * r1 = dest
 *
 * Probes src by copying one 32-bit value from it to dest.  Returns 0 on
 * success, else EFAULT if a page fault occurred.  On the fault path this
 * code does not write r0 itself; the value is presumably supplied by the
 * fault handler before it resumes at the recovery label -- confirm
 * against the abort handler.
 *
 * r2 holds the value produced by GET_CURPCB and ip the previous
 * PCB_ONFAULT value, which is put back on both paths.  Only r2, r3 and
 * ip are used as scratch, so nothing is saved on the stack.
 */
ENTRY(badaddr_read_4)
	GET_CURPCB(r2)
	adr	r3, .Lbadaddr_read_4_done	/* recovery address */
	ldr	ip, [r2, #PCB_ONFAULT]		/* previous handler */
	str	r3, [r2, #PCB_ONFAULT]

	/*
	 * The access is bracketed by nops exactly as before; presumably
	 * this keeps a late abort from the load inside the window where
	 * the handler is installed -- TODO confirm.
	 */
	nop
	nop
	nop
	ldr	r3, [r0]
	nop
	nop
	nop
	str	r3, [r1]
	mov	r0, #0		/* No fault */
.Lbadaddr_read_4_done:
	str	ip, [r2, #PCB_ONFAULT]
	RET
END(badaddr_read_4)