/*
  ramloader1100.s - a RAM program loader for NXP LPC1100 devices.
  Copyright 2013 Marc Prager
 
  ramloader is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 
  ramloader is published in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
  You should have received a copy of the GNU General Public License along with ramloader.
  If not see <http://www.gnu.org/licenses/>
 */

/*
Version info: $Rev: 473 $

Conventions "RAMLOADER ABI"
============================

- NO STACK USE, only one level of function calls allowed (using lr).
- init-Routines may use all registers
- r0 as argument/result and local variable
- r1 as local variable in main and command handlers
- r2 as local variable in main and command handlers
- r3 = checksum in ramloader.write (local variable)
- r4/r5 for uart0Read/uart0Write local variables
- r6 = RAM address, relative to RAM_BASE (new)
- r7 = local variable in main and command handlers 
- high registers unused with the exception of lr and pc.

I stay with the RESET clock settings of 12MHz IRC and use the fractional baud rate generator to get close
to 115200bd. 12MHz = 115200bd * 16 * 4 * (1+5/8) + small error.
This is done with the following settings:
  UART.dll = 4
  UART.mulval = 8
  UART.divaddval = 5

This ramloader uses the vector table offset register (VTOR) to use the vector table of the downloaded code
in RAM.

ramloader uses/keeps the same pins as the LPC800 bootloader:
	PIO1_6 = RXD
	PIO1_7 = TXD
	PIO0_1 = /BOOT (must be HIGH)
	PIO0_0 = /RESET

ramloader ALWAYS loads your SP and program entry point from address 0 (=remapped 0x1000 0000). It cannot remap other
RAM addresses.
*/

.syntax unified
.cpu cortex-m0plus

# definitions for LPC11xx
# RAM base and size are kept as log2 values so that the code can build them with
# 'movs rX,#1' + 'lsls rX,rX,#..._LOG' instead of a literal pool load.
@ .set	RAM_BASE,	0x10000000
.set	RAM_BASE_LOG,	28		@ RAM_BASE = 1<<28
@ .set	RAM_SIZE,	0x1000		@ 4 KiB (0x1000 bytes), needed for 'fill' operation
.set	RAM_SIZE_LOG,	12		@ RAM_SIZE = 1<<12

@ CCLK and BAUD are informational only: nothing in this file references them,
@ the UART divider values are written as plain numbers in _start.
.set	CCLK,		12*1000*1000

.set	BAUD,		115200

################################################################################

@ SYSCON
@ Names with a leading '_' are byte offsets that are added to a base address;
@ names without it are absolute addresses.
.equ	SYSCON_BASE,		0x40048000
	.equ	_SYSMEMREMAP,	0x00		@ not referenced by the code in this file
	.equ	_PRESETCTRL,	0x04		@ only used inside a commented-out section of ramloader.execute
	.equ	_UARTCLKDIV,	0x98		@ written with 1 by _start and with 0 by ramloader.execute

.equ	_SYSCON_SYSAHBCLKCTRL,	0x80		@ not referenced by the code in this file
.equ	SYSAHBCLKCTRL,		0x40048080	@ absolute address, equals SYSCON_BASE+0x80
	.equ	CLK_UART,	12		@ UART clock enable
	.equ	CLK_IOCON,	16		@ IOCON clock enable

@ power-down configuration block; none of these four symbols is referenced by the code in this file
.equ	PD_BASE,		SYSCON_BASE+0x230
	.equ	_PDSLEEPCFG,	0x0
	.equ	_PDAWAKECFG,	0x4
	.equ	_PDRUNCFG,	0x8

@ IOCON
@ The code addresses these two registers relative to IOCON_BASE+0x80.
.equ	IOCON_BASE,	0x40044000
	.equ	_PIO_RXD,	0xA4
	.equ	_PIO_TXD,	0xA8

@ UART0
@ _RBR, _THR and _DLL share offset 0x00; the code writes _DLL/_DLM only after
@ setting DLAB in _LCR.
.equ	UART_BASE,	0x40008000
	.equ	_RBR,		0x00
	.equ	_THR,		0x00
	.equ	_LCR,		0x0C
	.equ	_DLL,		0x00
	.equ	_DLM,		0x04
	.equ	_LSR,		0x14
	.equ	_FDR,		0x28
	.equ	_FCR,		0x08
	.equ	DLAB,		1<<7
	.equ	LCR_8N1,	3<<0
	.equ	FCR_FIFOEN,	1<<0


@ Vector table remapping
.equ	VTOR,		0xE000ED08

################################################################################

.text
.thumb
vectorTable:
	@ Only the first two vector table words are provided; the remaining entries
	@ are left out on purpose (see the note in front of the .org directive below).
	initialSP:	.word	0		@ ramloader doesn't use any stack
	@ _start is not declared with .thumb_func, so the Thumb bit (bit 0) of the
	@ reset vector is set by hand with the '+1'.
	initialPC:	.word	_start+1

@ No need to waste FLASH here, we're not using interrupts and do not cause exceptions.
.org 0x20	@ just behind the NXP valid code checksum area

.global _start
@ _start - reset entry point (the vector table stores _start+1).
@
@ Brings up UART0 at 115200 bd, 8N1 from the 12 MHz reset clock, prints the
@ welcome banner and then branches to 'main'. Never returns.
@ Init code may use all registers; no stack is used.
@
@ Order of initialization: IOCON clock -> pin functions -> UART clocks.
@ The pins are routed to the UART before the UART is clocked, as the LPC111x
@ user manual asks for in the UART "basic configuration" section.
_start:
		@ 1. enable the AHB clock for IOCON only (needed to write the pin registers)
		ldr r1,=SYSAHBCLKCTRL
		ldr r0,[r1]
		ldr r2,=1<<CLK_IOCON
		orrs r0,r0,r2
		str r0,[r1]			@ r0 keeps the current SYSAHBCLKCTRL value

		@ 2. configure PINS, same pins as ISP boot loader
		ldr r3,=IOCON_BASE+0x80
		ldr r2,=1<<0 | 3<<6		@ function 1, digital port
		str r2,[r3,#+_PIO_RXD-0x80]
		str r2,[r3,#+_PIO_TXD-0x80]

		@ 3. now enable the AHB clock for the UART ...
		ldr r2,=1<<CLK_UART
		orrs r0,r0,r2
		str r0,[r1]

		@ ... and the UART peripheral clock divider
		ldr r1,=SYSCON_BASE+0x80
		movs r0,#1			@ 0 means: disabled, 1=divide by 1, ...
		str r0,[r1,#+_UARTCLKDIV-0x80]

		@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
		@@@@@@@@@@ UART initialization

		@ peripheral reset is deasserted (1!) by default

		@ configure baud rate: 12MHz / (16 * 4 * (1+5/8)) is close to 115200
		ldr r1,=UART_BASE
		movs r0,#DLAB			@ open the divisor latches
		str r0,[r1,#+_LCR]
		movs r0,#4
		str r0,[r1,#_DLL]		@ integral divider 4
		movs r0,#0
		str r0,[r1,#_DLM]
		movs r0,#5 + (8<<4)
		str r0,[r1,#_FDR]		@ fractional divider (1+5/8): divaddval=5, mulval=8

		@ configure frame bits (this also clears DLAB again)
		movs r0,#LCR_8N1		@ 8bits, no parity, 1 stop
		str r0,[r1,#+_LCR]
		movs r0,#FCR_FIFOEN
		str r0,[r1,#+_FCR]

		@@@@@@@@@@ output welcome message
		@ r3 walks the zero-terminated string; uart0Write only touches r4/r5.
		ldr r3,=0f
	1:	ldrb r0,[r3]
		tst r0,r0			@ end of string ?
		beq main			@ yes: enter the command loop
		bl uart0Write
		adds r3,r3,#1
		b 1b

	0:	.ascii	"ramloader 4.1 preliminary ($Rev: 473 $) for LPC1100, (C) Marc Prager 2013\n"
		.ascii	"ram at 0x10000000\n"
		.asciz	"<RAMLOADER: hardware reset>\n"
		.align 2

.func
.thumb_func
@ main - command loop of the ramloader.
@
@ Reads one command byte from the UART and dispatches through a table of
@ 16-bit branch instructions. Every handler ends with a branch back to main.
@ Command codes above 7 are silently ignored.
main:
		bl uart0Read
		movs r1,#7	@ codes>7 not allowed: ignore
		cmp r0,r1
		bhi main

		@ calculate jump destination
		@ Each table entry is one 16-bit instruction, hence the factor 2.
		@ 'add r0,pc' relies on PC reading as the address of that instruction + 4,
		@ which is main.table because exactly one 16-bit instruction (mov pc,r0)
		@ sits in between. Do not insert or remove instructions here.
		lsls r0,r0,#1
		add r0,pc
		mov pc,r0
		
		@ 16 bit instructions jump table
	main.table:
		b.n ramloader.hello		@ cmd 0				-> <0x41>
		b.n ramloader.reset_address	@ cmd 1				-> <0xA5>
		b.n ramloader.write		@ cmd 2 <256 data bytes>	-> <sumL> <sumH>
		b.n ramloader.read		@ cmd 3				-> <256 data bytes>
		b.n ramloader.execute		@ cmd 4				-> print message, undo UART setup and start the program in RAM
		b.n ramloader.sync		@ cmd 5 <code>			-> ~<code>
		b.n ramloader.fill		@ cmd 6 <fillByte>		-> 'F'	fill ram (4k)
		b.n ramloader.setAddress	@ cmd 7 <32-bit-addr, LE>	-> 'A' set address

.endfunc

.align 2

.func
.thumb_func
@ uart0Read - blocking read of one character from UART0.
@   out:     r0 = word read from the receive buffer register
@   scratch: r4 (UART base address); lr is the return address
uart0Read:
		ldr	r4, =UART_BASE
	.Luart0Read_wait:
		ldr	r0, [r4, #_LSR]		@ line status, bit 0 = receiver data ready
		lsrs	r0, r0, #1		@ move bit 0 into the carry flag
		bcc	.Luart0Read_wait	@ carry clear: nothing received so far
		ldr	r0, [r4, #_RBR]		@ fetch the character
		bx	lr
.endfunc

.func
.thumb_func
@ uart0Write - blocking write of one character to UART0.
@   in:      r0 = character (left unchanged)
@   scratch: r4 (UART base address), r5 (line status); lr is the return address
uart0Write:
		ldr	r4, =UART_BASE
	.Luart0Write_wait:
		ldr	r5, [r4, #_LSR]		@ line status, bit 5 = transmit holding register empty
		lsrs	r5, r5, #6		@ move bit 5 into the carry flag
		bcc	.Luart0Write_wait	@ carry clear: holding register still occupied
		str	r0, [r4, #_THR]		@ hand the character to the transmitter
		bx	lr
.endfunc

.func
.thumb_func
@ uart0Sync - busy-wait until the UART0 transmitter has sent everything.
@   scratch: r4 (UART base address), r5 (line status); lr is the return address
uart0Sync:
		ldr	r4, =UART_BASE		@ constant, so loaded once in front of the loop
	.Luart0Sync_wait:
		ldr	r5, [r4, #_LSR]		@ line status, bit 6 = transmitter empty
		lsrs	r5, r5, #7		@ move bit 6 into the carry flag
		bcc	.Luart0Sync_wait	@ carry clear: still shifting out data
		bx	lr
.endfunc

.func
.thumb_func
@ ramloader.hello (cmd 0) - answer with the single byte 0x41 ('A'),
@ then return to the command loop.
ramloader.hello:
		movs	r0, #0x41		@ 'A'
		bl	uart0Write
		b.n	main
.endfunc

.func
.thumb_func
@ ramloader.reset_address (cmd 1) - set the RAM offset r6 back to 0 and
@ acknowledge with the byte 0xA5, then return to the command loop.
ramloader.reset_address:
		movs	r0, #0xA5		@ acknowledge byte
		movs	r6, #0			@ r6 = offset relative to RAM_BASE
		bl	uart0Write
		b.n	main
.endfunc

.func
.thumb_func
@ ramloader.write (cmd 2) - receive bytes from the UART into RAM.
@
@ Stores received bytes at RAM_BASE + r6, advancing r6, until the low 8 bits
@ of r6 become zero (256 bytes when r6 started on a 256-byte boundary).
@ Afterwards the byte sum is sent back, low byte first, then bits 15..8.
@   r3 = running checksum, r6 = RAM offset (kept for the next command),
@   r7 = RAM_BASE, r0 = scratch
ramloader.write:
		movs	r3, #0			@ checksum starts at 0
		movs	r7, #1
		lsls	r7, r7, #RAM_BASE_LOG	@ r7 = RAM_BASE
	.Lwrite_next:
		bl	uart0Read
		adds	r3, r3, r0		@ accumulate checksum
		strb	r0, [r7, r6]		@ store byte at RAM_BASE + offset
		adds	r6, r6, #1
		lsls	r0, r6, #24		@ Z set exactly when the low 8 bits of r6 are zero
		bne	.Lwrite_next

		@ report the checksum
		mov	r0, r3			@ low byte (the UART transmits only 8 bits)
		bl	uart0Write
		lsrs	r0, r3, #8		@ next higher byte
		bl	uart0Write
		b.n	main
.endfunc

.func
.thumb_func
@ ramloader.read (cmd 3) - send RAM contents over the UART.
@
@ Sends the bytes at RAM_BASE + r6, advancing r6, until the low 8 bits of r6
@ become zero (256 bytes when r6 started on a 256-byte boundary).
@   r6 = RAM offset (kept for the next command), r7 = RAM_BASE, r0 = scratch
ramloader.read:
		movs	r7, #1
		lsls	r7, r7, #RAM_BASE_LOG	@ r7 = RAM_BASE
	.Lread_next:
		ldrb	r0, [r7, r6]		@ fetch byte at RAM_BASE + offset
		bl	uart0Write
		adds	r6, r6, #1
		lsls	r0, r6, #24		@ Z set exactly when the low 8 bits of r6 are zero
		bne	.Lread_next

		b.n	main
.endfunc


.func
.thumb_func
@ ramloader.sync (cmd 5) - read one byte and send it back with its low
@ 8 bits inverted, then return to the command loop.
@   r1 = inversion mask, r0 = received / answered byte
ramloader.sync:
		bl	uart0Read
		movs	r1, #0xFF
		eors	r0, r0, r1		@ flip bits 7..0 only
		bl	uart0Write

		b.n	main
.endfunc

.func
.thumb_func
@ ramloader.fill (cmd 6) - fill the whole RAM with one byte value.
@
@ Reads the fill byte from the UART, writes it to every address in
@ [RAM_BASE, RAM_BASE+RAM_SIZE) and answers with 'F'.
@   r0 = fill byte, r1 = write pointer, r2 = end address (exclusive)
@
@ The end address must be RAM_BASE+RAM_SIZE. Comparing the pointer against
@ RAM_SIZE alone can never match a pointer that starts at RAM_BASE, which
@ would let the loop run past the end of RAM.
ramloader.fill:
		bl uart0Read			@ get fill byte
		movs r1,#1
		lsls r1,r1,#RAM_BASE_LOG	@ r1 = RAM_BASE
		movs r2,#1
		lsls r2,r2,#RAM_SIZE_LOG	@ r2 = RAM_SIZE
		adds r2,r2,r1			@ r2 = RAM_BASE+RAM_SIZE
	1:	strb r0,[r1]
		adds r1,r1,#1
		cmp r1,r2
		bne 1b

		movs r0,#'F'
		bl uart0Write
		b.n main
.endfunc

.func
.thumb_func
@ ramloader.setAddress (cmd 7) - set the RAM offset r6 from a 32-bit address.
@
@ Receives four bytes, least significant first. Each byte is ORed into the
@ low bits of r6 and r6 is then rotated right by 8, so after four rounds the
@ first byte ends up in bits 7..0 and the last one in bits 31..24.
@ RAM_BASE is subtracted because r6 is kept relative to RAM_BASE.
@ Answers with 'A'.
@   r1 = rotate count (8), r0 = received byte / scratch
ramloader.setAddress:
		movs	r1, #8			@ rotate amount, survives uart0Read
		movs	r6, #0

		@ four identical receive steps: LSB first, MSB last
	.rept 4
		bl	uart0Read
		orrs	r6, r6, r0
		rors	r6, r6, r1
	.endr

		@ convert the absolute address into an offset from RAM_BASE
		movs	r0, #1
		lsls	r0, r0, #RAM_BASE_LOG
		subs	r6, r6, r0

		movs	r0, #'A'
		bl	uart0Write

		b	main
.endfunc

.ltorg

message:	.asciz	"<RAMLOADER: remap lower 512 bytes, reset peripherals then start program>\n"
		.align 2

.func
.thumb_func
@ ramloader.execute (cmd 4) - leave the ramloader and start the program in RAM.
@
@ Steps:
@   1. print 'message' and wait until the UART has really sent it
@   2. busy-wait some more so the TX line is stable
@   3. write the pin, UART and UART clock divider registers touched by _start
@   4. point VTOR at RAM_BASE, load SP from [RAM_BASE] and jump to [RAM_BASE+4]
@ Does not return.
@   r3 = string pointer, r0/r1 = scratch, r4/r5 used by the UART helpers
ramloader.execute:
		@ Say goodbye and stop serial port...

		ldr r3,=message
	1:	ldrb r0,[r3]
		tst r0,r0
		beq 2f			@ terminating zero reached
		bl uart0Write
		adds r3,r3,#1
		b 1b

		@ wait until last message is sent
		@ (the 2: label has to sit here, otherwise this call is never reached)
	2:	bl uart0Sync

		@ wait some more to let UART0 TX level stabilize before disconnecting
		@ (2^20 loop iterations)
		movs r0,#1
		lsls r0,r0,#20
	3:	subs r0,r0,#1
		bne 3b

		@ undo the initializations



		@ configure PINS
		@ NOTE(review): _start only touched bits 0 and 7:6 of these registers;
		@ please check this value against the documented reset value of the
		@ LPC1100 IOCON pin registers.
		ldr r1,=IOCON_BASE+0x80
		ldr r0,=0<<0 | 3<<8	@ function 0, I2C-mode 3 (reset default)
		str r0,[r1,#+_PIO_RXD-0x80]
		str r0,[r1,#+_PIO_TXD-0x80]

		@ configure baud rate
		ldr r1,=UART_BASE
		movs r0,#DLAB
		str r0,[r1,#+_LCR]
		movs r0,#1
		str r0,[r1,#_DLL]	@ integral divider 1 (reset default)
		movs r0,#0
		str r0,[r1,#_DLM]
		movs r0,#1<<4		@ reset default
		str r0,[r1,#_FDR]

		@configure frame bits
		movs r0,#0
		str r0,[r1,#+_LCR]	@ also clears DLAB again
		str r0,[r1,#+_FCR]

		@ UART clock divider was 0 (= UART clock disabled) before _start ran
		ldr r1,=SYSCON_BASE+0x80
		movs r0,#0
		str r0,[r1,#+_UARTCLKDIV-0x80]

/*
		@ honor interrupt table entries
		ldr r0,=SYSCON_BASE
		ldr r1,=0x1FF7		@ bits 0..12, except #3 = UART0
		str r1,[r0,#+_PRESETCTRL]	@ reset UART0
		adds r1,r1,#1<<3		@ replaces a | 1<<3
		str r1,[r0,#+_PRESETCTRL]	@ deassert reset of UART0
*/
		@ ldr r0,=RAM_BASE
		movs r0,#1
		lsls r0,r0,#RAM_BASE_LOG
		@ NOTE(review): this relies on the core implementing VTOR. Please
		@ confirm for the targeted LPC1100 parts; _SYSMEMREMAP is defined in
		@ this file but never written.
		ldr r1,=VTOR
		str r0,[r1]		@ use the vector table of the program in RAM

		ldr r1,[r0,#+0]		@ initial stack pointer
		mov sp,r1		@ initial stack pointer
		ldr r1,[r0,#+4]		@ jump to start of program
		bx r1
		@ never reached !
.endfunc

.end


