For my next trick, je vous presentez, logarithme naturel (in fixed point 8.8 format):
Code:
lognat:
;Input: H.L needs to be on [1,2]
;Output: H.L if z flag is set, else if nz, no result
;avg speed: 677 t-states
dec h
dec h
jr nz,$+8
inc l \ dec l
ret nz
ld l,177
ret
inc h
ret nz
ld b,h
ld c,l
ld e,l
ld d,8
add hl,hl
add hl,hl
add hl,de
ex de,hl
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ adc a,a
ld h,a \ ld l,b
sla h \ jr c,$+3 \ ld l,c
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
rl l
ld a,h
adc a,b
ld l,a
ld h,b
cp a
ret
It only works accurately on [1,2], where I designed the algorithm. It is pretty fast, returning the result in 677 t-states or less, but its range is fairly limited (to 257 values).
I have been working on another routine that actually works on (0,128), but it takes upwards of about 1500 t-states and there are some accuracy issues I want to fix before posting.
EDIT:
I have the routine with a little better speed outlook and much better accuracy outside the [1,2] range than before (worst case is 2/256).
Code:
lognat:
;Input: H.L needs to be on (0,128.0)
;Output: H.L if c flag set
; returns nc if input is negative (HL not modified)
;Error:
; The error on the outputs is as follows:
; 20592 inputs are exact
; 12075 inputs are off by 1/256
; 100 inputs are off by 2/256
; So all 32767 inputs are within 2/256, with average error being <1/683 which is smaller than 1/256.
;Size: 177 bytes
;Speed: average speed is less than 1250 t-states
ld a,h \ or l \ jr nz,$+5
ld h,80h \ ret
dec h
dec h
jr nz,$+9
inc l \ dec l
jr nz,normalizeln-1
ld l,177
ret
inc h
jr nz,normalizeln
ld b,h
ld c,l
ld e,l
ld d,8
add hl,hl
add hl,hl
add hl,de
ex de,hl
call HL_Div_DE
ld h,a \ ld l,b
sla h \ jr c,$+3 \ ld l,c
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
add hl,hl \ jr c,$+3 \ add hl,bc
rl l
ld a,h
adc a,b
ld h,b
ld l,a
scf
ret
HL_Div_DE:
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ jr nc,$+3 \ add hl,de \ adc a,a
add hl,hl \ sbc hl,de \ adc a,a \ ret
inc h
normalizeln:
xor a
inc h \ ret m
ld d,a \ ld e,a
ld a,l
jr z,toosmall
inc e \ srl h \ rra \ jr nz,$-4
rla \ rl h
dec e
stepin:
ld l,a
push de
call lognat
pop de
;now multiply DE by 355, then divide by 2 (rounding)
ld b,d \ ld c,e \ ld a,d
ex de,hl
add hl,hl
add hl,hl ;4
add hl,bc ;5
add hl,hl ;10
add hl,bc ;11
add hl,hl ;22
add hl,hl
add hl,hl
add hl,hl
add hl,bc
add hl,hl
add hl,bc
sra h \ rr l
adc hl,de
scf
ret
toosmall:
dec d
dec e \ add a,a \ jr nc,$-2
inc h
jp stepin
EDIT2:
Here are a bunch of fixed point 8.8 routines, mostly optimised for speed.
B.C*D.E→H.L
Code:
BC_Times_DE:
; BHLA is the 32-bit result
ld a,b
or a
ld hl,0
ld b,h
;1
add a,a
jr nc,$+4
ld h,d
ld l,e
;2
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
;227+10b-7p
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,b
;===
;AHL is the result of B*DE*256
push hl
ld h,b
ld l,b
ld b,a
ld a,c
ld c,h
;1
add a,a
jr nc,$+4
ld h,d
ld l,e
;2
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
;227+10b-7p
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
add hl,hl
rla
jr nc,$+4
add hl,de
adc a,c
pop de
;Now BDE*256+AHL
ld c,a
ld a,l
ld l,h
ld h,c
add hl,de
ret nc
inc b
;BHLA is the 32-bit result
ret
D.E/B.C→D.E
Code:
DE_Div_BC_88:
;Inputs:
; DE,BC are 8.8 Fixed Point numbers
;Outputs:
; DE is the 8.8 Fixed Point result (rounded to the least significant bit)
ld a,8
ld hl,0
Loop1:
rl d
adc hl,hl
sbc hl,bc
jr nc,$+3
add hl,bc
dec a
jr nz,Loop1
ld d,e
ld e,a
ld a,16
jp $+6
DivLoop:
add hl,bc
dec a
ret z
sla e
rl d
adc hl,hl
jr c,overflow
sbc hl,bc
jr c,DivLoop
inc e
jp DivLoop+1
overflow:
or a
sbc hl,bc
inc e
jp DivLoop
sqrt(H.L)→H.L
Code:
SqrtHL_prec12:
;input: HL
;Output: HL
;183 bytes
xor a
ld b,a
ld e,l
ld l,h
ld h,a
add hl,hl
add hl,hl
cp h
jr nc,$+5
dec h
ld a,4
add hl,hl
add hl,hl
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
ld l,e
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
add a,a \ ld c,a
add hl,hl
add hl,hl
jr nc,$+6
sub h \ jp $+6
sub h
jr nc,$+6
inc c \ inc c
cpl
ld h,a
ld a,l
ld l,h
add a,a
ld h,a
adc hl,hl
adc hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
;iteration 9
add hl,hl \ add hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
add hl,hl \ add hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
add hl,hl \ add hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
add hl,hl \ add hl,hl
sll c \ rl b
sbc hl,bc
jr nc,$+3
add hl,bc
sbc a,a \ add a,a \ inc a \ add a,c \ ld c,a
;12th iteration completed
; output in BC
srl b \ rr c
ld h,b
ld l,c
ret
2H.L→DEH.L
Code:
pow2:
;Inputs:
; HL is the 8.8 fixed point number 'x' for 2^x
;Outputs:
; DEHL is the 24.8 fixed point result. If there was overflow exceeding 2^24, then this value is set to the max.
ld a,l
or a
push hl ;save H for later, H is the integer part of the power
ld hl,1
jr z,integerpow
scf ;set the carry flag so that a bit is rotated into a. This will act as our counter.
;wait until we come across the lowest bit. Also note that we
rra
jr nc,$-1
ld hl,2*256
powloop:
push af
call FPSqrtHL ;returns in HL
pop af
srl a
jr z,integerpow
jr nc,powloop
add hl,hl
jp powloop
integerpow:
pop bc
;Now b is the integer part for 2^x that we need to multiply HL by.
ld de,0
ld a,b
or a
ret z
add hl,hl
rl e \ rl d \ jr c,wayoverflow
djnz $-7
ret
wayoverflow:
ld hl,-1
ld d,h
ld e,l
ret
log2(H.L)→H.L
Code:
Log_2_88_size:
;Inputs:
; HL is an unsigned 8.8 fixed point number.
;Outputs:
; HL is the signed 8.8 fixed point value of log base 2 of the input.
;Example:
; pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
;averages about 39 t-states slower than original
;62 bytes
ex de,hl
ld hl,0
ld a,d
ld c,8
or a
jr z,DE_lessthan_1
srl d
jr z,logloop-1
inc l
rr e
jr $-7
DE_lessthan_1:
ld a,e
dec hl
or a
ret z
inc l
dec l
add a,a
jr nc,$-2
ld e,a
inc d
logloop:
add hl,hl
push hl
ld h,d
ld l,e
ld a,e
ld b,8
add hl,hl
rla
jr nc,$+5
add hl,de
adc a,0
djnz $-7
ld e,h
ld d,a
pop hl
rr a ;this is NOT supposed to be rra, we need the z flag affected
jr z,$+7
srl d
rr e
inc l
dec c
jr nz,logloop
ret
sin(D.E)→H.L (uses the BC_Times_DE routine above)
Code:
ine_88:
;Inputs: de
push de
sra d \ rr e
ld b,d \ ld c,e
call BC_Times_DE
push hl ;x^2/4
sra h \ rr l
ex de,hl
ld b,d \ ld c,e
call BC_Times_DE
sra h \ rr l
inc h
ex (sp),hl ;x^4/128+1 is on stack, HL=x^2/4
xor a
ld d,a
ld b,h
ld c,l
add hl,hl \ rla
add hl,hl \ rla
add hl,bc \ adc a,d
ld b,h
ld c,l
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,bc \ adc a,d
ld e,l
ld l,h
ld h,a
rl e
adc hl,hl
rl e
jr nc,$+3
inc hl
pop de
ex hl,de
or a
sbc hl,de
ex de,hl
pop bc
jp BC_Times_DE
EDIT3: If you are willing to use a 256 byte LUT, this is a much, much faster fixed point lg() routine that I thought of on my walk home:
Code:
lg_88:
;Input: HL is a fixed point number
;Output: lg(H.L)->H.L
;Speed: Avg: 340
ld de,lgLUT
ld b,0
ld a,h
or a
ret m
ld a,l
jr z,$+8
inc b \ srl h \ rra \ jr nz,$-4
or a \ jr nz,$+6
ld hl,8000h \ ret
rra \ inc b \ jr nc,$-2
;A is the element to look up in the LUT
ld l,a
ld c,h
dec b
add hl,hl
add hl,de
ld e,(hl)
inc hl
ld d,(hl)
ex de,hl
add hl,bc
ret
lglut:
.dw $F800
.dw $F996
.dw $FA52
.dw $FACF
.dw $FB2C
.dw $FB76
.dw $FBB3
.dw $FBE8
.dw $FC16
.dw $FC3F
.dw $FC64
.dw $FC86
.dw $FCA5
.dw $FCC1
.dw $FCDC
.dw $FCF4
.dw $FD0B
.dw $FD21
.dw $FD36
.dw $FD49
.dw $FD5C
.dw $FD6D
.dw $FD7E
.dw $FD8E
.dw $FD9D
.dw $FDAC
.dw $FDBA
.dw $FDC8
.dw $FDD5
.dw $FDE2
.dw $FDEE
.dw $FDFA
.dw $FE06
.dw $FE11
.dw $FE1C
.dw $FE26
.dw $FE31
.dw $FE3B
.dw $FE44
.dw $FE4E
.dw $FE57
.dw $FE60
.dw $FE69
.dw $FE71
.dw $FE7A
.dw $FE82
.dw $FE8A
.dw $FE92
.dw $FE9A
.dw $FEA1
.dw $FEA9
.dw $FEB0
.dw $FEB7
.dw $FEBE
.dw $FEC5
.dw $FECB
.dw $FED2
.dw $FED8
.dw $FEDF
.dw $FEE5
.dw $FEEB
.dw $FEF1
.dw $FEF7
.dw $FEFD
.dw $FF03
.dw $FF09
.dw $FF0E
.dw $FF14
.dw $FF19
.dw $FF1E
.dw $FF24
.dw $FF29
.dw $FF2E
.dw $FF33
.dw $FF38
.dw $FF3D
.dw $FF42
.dw $FF47
.dw $FF4B
.dw $FF50
.dw $FF55
.dw $FF59
.dw $FF5E
.dw $FF62
.dw $FF67
.dw $FF6B
.dw $FF6F
.dw $FF74
.dw $FF78
.dw $FF7C
.dw $FF80
.dw $FF84
.dw $FF88
.dw $FF8C
.dw $FF90
.dw $FF94
.dw $FF98
.dw $FF9B
.dw $FF9F
.dw $FFA3
.dw $FFA7
.dw $FFAA
.dw $FFAE
.dw $FFB2
.dw $FFB5
.dw $FFB9
.dw $FFBC
.dw $FFC0
.dw $FFC3
.dw $FFC6
.dw $FFCA
.dw $FFCD
.dw $FFD0
.dw $FFD4
.dw $FFD7
.dw $FFDA
.dw $FFDD
.dw $FFE0
.dw $FFE4
.dw $FFE7
.dw $FFEA
.dw $FFED
.dw $FFF0
.dw $FFF3
.dw $FFF6
.dw $FFF9
.dw $FFFC
.dw $FFFF
It averages about 340 t-states and should be accurate to every bit. Being this fast, it might be better to combine this with the very fast multiplication routine above to get log base x Or for example, you can incorporate a natural log routine by simply multiplying lg(x) by 1/lg(e) (which is approximately 355/512):
Code:
ln_88:
;Input: HL is a fixed point number
;Output: ln(H.L)->H.L
;Speed: Avg: 340+(325 worst case)
call lg_88
;now signed multiply HL by 355, then divide by 2 (rounding)
ld de,0
bit 7,h
jr z,$+9
dec e \ xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ld b,h
ld c,l
xor a
add hl,hl
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,hl \ rla
add hl,bc \ adc a,d
add hl,hl \ rla
add hl,bc \ adc a,d
sra a \ rr h
ld l,h
ld h,a
inc e
ret nz
xor a \ sub l \ ld l,a
sbc a,a \ sub h \ ld h,a
ret
The overhead being at most 325 t-states, bringing it to an average of 665 t-states for ln() on (0,128)