This repository has been archived on 2025-02-01. You can view files and clone it, but cannot push or open issues or pull requests.
reprapfirmware-dc42/IsqrtThumb.s79
David Crocker 710d218c9a Version 1.00n
Add R parameter to M552 command to allow HTTP port to be configured
When listing SD card files, ignore files with a leading '.' in the
filename (e.g. files that Mac computers add to store metadata)
2015-03-06 09:37:08 +00:00

128 lines
4.9 KiB
Text

RSEG ICODE:CODE(2)
CODE16
public isqrt
isqrt:
; ---------------------------------------------------------------------------
; isqrt - integer square root of an unsigned 64 bit value (16 bit Thumb code)
;
; In:     R0 = low 32 bits of the input, R1 = high 32 bits of the input
; Out:    R0 = 32 bit result: the largest RESULT with RESULT*RESULT <= INPUT
; Saved:  R2-R7 (pushed on entry, popped on exit)
; Clobb:  R1 and the condition flags
; NOTE(review): the register interface looks like a C-callable
;         "uint32_t isqrt(uint64_t)" under the ARM procedure call standard -
;         confirm against the caller.
;
; Algorithm (bit-by-bit square root, two input bits consumed per iteration):
;       INPUT    = 64 bit input
;       RESIDUAL = 96 bit variable
;       RESULT   = 32 bit accumulated result
;       Clear RESULT
;       Set RESIDUAL = INPUT (top 32 bits will be clear)
;       Repeat 32 times:
;       {
;           RESIDUAL = RESIDUAL << 2
;           RESULT   = RESULT << 1
;           if ((RESULT << 1) + 1) <= (RESIDUAL >> 64) then
;           {   RESIDUAL = RESIDUAL - (((RESULT << 1) + 1) << 64)
;               RESULT   = RESULT + 1
;           }
;       }
;       Return RESULT
;
; The comparison and subtraction always act on the part of RESIDUAL that has
; been shifted out above the remaining input bits. That part is never shifted
; back down: it is simply held in its own register(s).
;
; For optimisation the 32 iterations are divided into 2 sets of 16 iterations:
;   Isqrt_Loop_High  consumes the high input word (R1); the working residual
;                    fits in the single register R2, so 32 bit arithmetic
;                    is enough.
;   Isqrt_Loop_Low   consumes the low input word (R0); the working residual
;                    is the 64 bit pair R1:R2 (R1 is free again because the
;                    first loop has shifted all of its bits out).
; A third loop, Isqrt_Loop_32, is used instead of both when the high input
; word is zero: only 16 iterations are needed and the result fits in 16 bits.
;
; The number of iterations actually needed is half the number of significant
; bits in the input, but the overhead of working that out would on average
; cost more time than it saves.
;
; Flag dependency: each "lsl r4,r4,#1" shifts out a zero bit (RESULT never has
; its top bit set before the shift), so it leaves the carry flag clear. The
; first SBC of each iteration relies on that clear carry to subtract the extra
; 1, i.e. it computes residual - RESULT - 1.
; NOTE(review): this also relies on the register-to-register MOVs between the
; LSL and the SBC not setting the carry flag - confirm for the target
; assembler/encoding before reordering anything here.
;
; Register usage:
;       R0 = Low 32 bits of input
;       R1 = High 32 bits of input - re-used for the high word of the working
;            residual in Isqrt_Loop_Low
;       R2 = Working residual (low word of it in Isqrt_Loop_Low)
;       R3 = Loop counter
;       R4 = Result accumulator
;       R5 = Candidate new residual (low word)
;       R6 = Candidate new residual (high word, Isqrt_Loop_Low only)
;       R7 = Zero constant - because Thumb has no immediate form of SBC
; ---------------------------------------------------------------------------
        push    {r2,r3,r4,r5,r6,r7}     ; Save every register changed except the entry/exit registers R0 and R1

; Set up initial variables
        mov     r2,#0                   ; Clear the working residual
        mov     r4,r2                   ; Clear the result
        mov     r7,r2                   ; R7 = constant 0
        mov     r3,#16                  ; 16 iterations per loop

; See if we have any high bits
        cmp     r1,r7                   ; Are the high 32 bits of the input clear?
        beq     Isqrt_Loop_32           ; If so only the low 32 bits need processing

; ---- First 16 iterations: consume the high input word (R1) into R2 ----
Isqrt_Loop_High:
        add     r1,r1,r1                ; Shift R2:R1 left by 2 bits, one bit at a time,
        adc     r2,r2                   ; ... carrying the top bit of R1 into R2
        add     r1,r1,r1                ; ...
        adc     r2,r2                   ; ...
        lsl     r4,r4,#1                ; RESULT <<= 1 (bit shifted out is 0, so carry is left clear)
        mov     r5,r2                   ; Work on a copy: Thumb SBC/SUB overwrite their first operand
        sbc     r5,r4                   ; R5 = residual - RESULT - 1 (the -1 comes from the clear carry)
        bcc     Loopback_High           ; Borrow: residual is too small, leave the new result bit at 0
        sub     r5,r5,r4                ; R5 = residual - ((RESULT << 1) + 1)
        bcc     Loopback_High           ; Borrow: residual is too small, leave the new result bit at 0
        mov     r2,r5                   ; No borrow: accept the reduced residual
        add     r4,r4,#1                ; ... and set the new result bit
Loopback_High:
        sub     r3,r3,#1                ; Count this iteration
        bne     Isqrt_Loop_High         ; Loop 16 times

; R1 must now be clear (because it has been left shifted 32 times).
; Continue with 64 bit arithmetic on the working residual, re-using R1 for its
; high 32 bits (R2 keeps the low 32 bits).
        mov     r3,#16                  ; 16 iterations per loop

; ---- Second 16 iterations: consume the low input word (R0) into R1:R2 ----
Isqrt_Loop_Low:
        add     r0,r0,r0                ; Shift the 96 bit value R1:R2:R0 left by 2 bits,
        adc     r2,r2                   ; ... one bit at a time, carrying R0 -> R2 -> R1
        adc     r1,r1                   ; ...
        add     r0,r0,r0                ; ...
        adc     r2,r2                   ; ...
        adc     r1,r1                   ; ...
        lsl     r4,r4,#1                ; RESULT <<= 1 (bit shifted out is 0, so carry is left clear)
        mov     r5,r2                   ; Work on a copy of the residual low word
        mov     r6,r1                   ; ... and of the residual high word
        sbc     r5,r4                   ; R6:R5 = residual - RESULT - 1 (the -1 comes from the clear carry)
        sbc     r6,r7                   ; ... propagate the borrow into the high word
        bcc     Loopback_Low            ; Borrow: residual is too small, leave the new result bit at 0
        sbc     r5,r4                   ; Carry is set here (no borrow), so this subtracts exactly RESULT
        sbc     r6,r7                   ; R6:R5 = residual - ((RESULT << 1) + 1)
        bcc     Loopback_Low            ; Borrow: residual is too small, leave the new result bit at 0
        mov     r2,r5                   ; No borrow: accept the reduced residual
        mov     r1,r6                   ; ...
        add     r4,r4,#1                ; ... and set the new result bit
Loopback_Low:
        sub     r3,r3,#1                ; Count this iteration
        bne     Isqrt_Loop_Low          ; Loop 16 times

        mov     r0,r4                   ; Result back into R0
        pop     {r2,r3,r4,r5,r6,r7}     ; Recover registers
        bx      r14                     ; Return

; ---- Short path: used instead of both loops above when the high input word is zero ----
Isqrt_Loop_32:
        add     r0,r0,r0                ; Shift R2:R0 left by 2 bits, one bit at a time,
        adc     r2,r2                   ; ... carrying the top bit of R0 into R2
        add     r0,r0,r0                ; ...
        adc     r2,r2                   ; ...
        lsl     r4,r4,#1                ; RESULT <<= 1 (bit shifted out is 0, so carry is left clear)
        mov     r5,r2                   ; Work on a copy: Thumb SBC/SUB overwrite their first operand
        sbc     r5,r4                   ; R5 = residual - RESULT - 1 (the -1 comes from the clear carry)
        bcc     Loopback_32             ; Borrow: residual is too small, leave the new result bit at 0
        sub     r5,r5,r4                ; R5 = residual - ((RESULT << 1) + 1)
        bcc     Loopback_32             ; Borrow: residual is too small, leave the new result bit at 0
        mov     r2,r5                   ; No borrow: accept the reduced residual
        add     r4,r4,#1                ; ... and set the new result bit
Loopback_32:
        sub     r3,r3,#1                ; Count this iteration
        bne     Isqrt_Loop_32           ; Loop 16 times

        mov     r0,r4                   ; Result back into R0
        pop     {r2,r3,r4,r5,r6,r7}     ; Recover registers
        bx      r14                     ; Return
END