- eZ80 Optimized Routines
- 30 Jan 2015 06:49:53 am
- Last edited by Xeda112358 on 24 Mar 2015 01:36:10 pm; edited 6 times in total
Share the most optimized eZ80 routines!
Can anybody do better? (Usually: Yes.)
16-bit unsigned multiplication returning 32-bit product. It expects to be called in Z80 mode.
Code:
As a note, I found Karatsuba multiplication to be less efficient up to 24-bits and I have yet to write a 32-bit routine. EDIT:It is more efficient at 32-bits, but only by little
Remember, a lot of the optimized Z80 routines work well here, but some of them can actually be reworked to be even more efficient than a direct translation.
EDIT: 5-Feb-15 The following are UNTESTED.
Here is a 16-bit GCD algorithm.
Code:
32-bit multiplication:
Code:
sqrt16:
Code:
EDIT: 9-Feb-15
This one performs A*HL/255. Be warned that this does not work on certain boundary values. For example, A=85 would imply division by 3, but if you input HL as divisible by 3, you get one less than the actual result. So 9*85/255 should return 3, but this routine returns 2 instead.
Code:
One way to almost remedy the roud-off issue, (I think except for HL=65535) is:
Code:
This comes at a cost of 21cc and 8 bytes.
The rounding version works fairly well and only adds 5cc and 5 bytes:
Code:
edit: fixed and improved ↑
24-bit integer square root that expects ADL mode. This can also be used for a fixed point square root, where the upper 16-bits make the 8.8 fixed point number, the lower 8 bits are zero (or if the previous operation had extended precision, load that). The output would be in D.E
Code:
DE/BC
Code:
Can anybody do better? (Usually: Yes.)
16-bit unsigned multiplication returning 32-bit product. It expects to be called in Z80 mode.
Code:
mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;54 or 56 t-states
;;30 bytes
ld d,c \ ld e,l \ mlt de \ push de ;11
ld d,h \ ld e,b \ mlt de ;8
ld a,l \ ld l,c \ ld c,a ;3
mlt hl \ mlt bc \ add hl,bc ;13
jr nc,$+3 \ inc de \ pop bc ;6
ld a,b \ add a,l \ ld b,a ;3
ld a,e \ adc a,h \ ld e,a ;3
ret nc \ inc d \ ret ;7 (+2 for carry)
As a note, I found Karatsuba multiplication to be less efficient up to 24-bits and I have yet to write a 32-bit routine. EDIT:It is more efficient at 32-bits, but only by little
Remember, a lot of the optimized Z80 routines work well here, but some of them can actually be reworked to be even more efficient than a direct translation.
EDIT: 5-Feb-15 The following are UNTESTED.
Here is a 16-bit GCD algorithm.
Code:
GCD16:
;;Expect Z80 mode
;;Inputs: HL,DE
;;Output: HL
;; BC=0
;; DE=0
;; A=0
;Binary GCD algorithm
;About 432cc on average (average of 500 iterations with randomized inputs on [0,65535]
;78 bytes
xor a
ld b,a
ld c,a
sbc hl,bc
ex de,hl
ret z
sbc hl,bc
ex de,hl
ret z
;factor out cofactor powers of two
ld a,l \ or e \ rra \ jr nc,$+16
srl h \ rr l
srl d \ rr e
inc b
ld a,l \ or e \ rra \ jr nc,$-12
.loop:
;factor out powers of 2 from hl
ld a,l \ rra \ ld a,h \ jr c,$+10
rra \ rr l \ bit 0,l \ jr z,$-5 \ ld h,a
;factor out powers of 2 from de
ld a,e \ rra \ ld a,d \ jr c,$+10
rra \ rr e \ bit 0,e \ jr z,$-5 \ ld d,a
xor a
sbc hl,de
jr z,finish
jr nc,loop
add hl,de
or a
ex de,hl
sbc hl,de
jr loop
.finish:
ex de,hl
dec b
inc b
ret z
add hl,hl \ djnz $-1 \ ret
32-bit multiplication:
Code:
mul32:
;;Expects Z80 mode
;;Inputs: ix points to the first 32-bit number
;; ix+4 points to the next 32-bit number
;;Outputs: ix+8 points to the 64-bit result
;;Algorithm: Karatsuba
;348cc to 375cc
;compute z0
ld hl,(ix) ;5
ld bc,(ix+4) ;5
call mul16 ;59+2
ld (ix+8),bc ;5
ld (ix+10),de ;5
;compute z2
ld hl,(ix+2) ;5
ld bc,(ix+6) ;5
call mul16 ;59+2
ld (ix+12),bc ;5
ld (ix+14),de ;5
;compute z1, most complicated as it requires 17-bits of info for each factor
ld hl,(ix+2) ;5
ld bc,(ix) ;5
add hl,bc ;1
rla ;1
ld b,h ;1
ld c,l ;1
ld hl,(ix+6) ;5
ld de,(ix+4) ;5
add hl,de ;1
rla ;1
push hl ;3
push bc ;3
push af ;3
call mul16 ;59+2
pop af ;3
and 3 ;2
ex de,hl ;1
pop de ;3
jr z,$+7 ;label0 ;3|(6+1)
;bit 0 means add [stack0] to the upper 16 bits
;bit 1 means add [stack1] to the upper 16 bits
;both mean add 2^32
jp po,$+5 \ or 4 ;--
rra \ jr nc,$+7 ;4+4
rrca \ add hl,bc \ adc a,0 \ rlca ;--
;
srl a \ pop de \ jr nc,$+5 ;8+2
add hl,bc \ adc a,0 ;
ld d,b \ ld e,c ;2
;z1 = AHLDE-z0-z2
ld bc,(ix+8) \ ex de,hl \ sbc hl,bc ;8
ld bc,(ix+10) \ ex de,hl \ sbc hl,bc \ sbc a,0 ;10
ld bc,(ix+12) \ ex de,hl \ sbc hl,bc ;8
ld bc,(ix+14) \ ex de,hl \ sbc hl,bc \ sbc a,0 ;10
ex de,hl ;1
ld bc,(ix+10) \ add hl,bc \ ld (ix+10),hl \ ex de,hl ;12
ld bc,(ix+12) \ adc hl,bc \ ld (ix+12),hl \ adc a,0 ;13
ret z \ ld hl,(ix+14) \ add a,l \ ld l,a ;7+16
jr nc,$+3 \ inc h \ ld (ix+14),hl \ ret ;--
sqrt16:
Code:
sqrt16:
;;Expext Z80 mode
;;Inputs: HL
;;Output: A
;Unrolled, speed optimized.
;At most 112 clock cycles
;111 bytes.
xor a \ ld c,a \ ld d,a \ ld e,l \ ld l,h \ ld h,c
add hl,hl \ add hl,hl \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
sla c \ ld a,c \ add a,a \ add hl,hl \ add hl,hl
jr nc,$+5 \ sub h \ jr $+5 \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a
ld e,h \ ld h,l \ ld l,c \ ld a,l
add hl,hl \ rl e \ rl d
add hl,hl \ rl e \ rl d
sbc hl,de \ rla \ ret
EDIT: 9-Feb-15
This one performs A*HL/255. Be warned that this does not work on certain boundary values. For example, A=85 would imply division by 3, but if you input HL as divisible by 3, you get one less than the actual result. So 9*85/255 should return 3, but this routine returns 2 instead.
Code:
fastMul_Ndiv255:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;; A is a copy of the upper byte of the result
;; DE is the product of the input A,H
;; B is a copy of E
;; C is a copy of the upper byte of the product of inputs A,L
;;32cc
;;18 bytes
ld d,a
ld e,h
ld h,d
mlt hl
mlt de
;15
;DE
; DE
; HL
; HL
ld c,h
ld b,e
ld a,d
add hl,bc \ adc a,0
add hl,de \ adc a,0
ld l,h \ ld h,a
ret
;17
One way to almost remedy the roud-off issue, (I think except for HL=65535) is:
Code:
fastMul_Ndiv255_fix:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;52cc
;;26 bytes
push af
ld d,a
ld e,l
ld l,d
mlt hl
mlt de
;HL
; HL
; DE
; DE
ld c,d
ld b,l
ld a,h
add hl,de \ adc a,0
add hl,bc \ adc a,0
ld d,l \ ld l,h \ ld h,a
pop af \ add a,e
ret nc
adc a,d
ret nc
inc hl
ret
This comes at a cost of 21cc and 8 bytes.
The rounding version works fairly well and only adds 5cc and 5 bytes:
Code:
fastMul_Ndiv255_round:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;37cc
;;23 bytes
ld d,a
ld e,l
ld l,d
mlt hl
mlt de
;15
;HL
; HL
; DE
; DE
ld c,d
ld b,l
ld a,h
add hl,de \ adc a,0
adc hl,bc \ adc a,0
sla l
ld l,h \ ld h,a
jr nc,$+3
inc hl
ret
;22
edit: fixed and improved ↑
24-bit integer square root that expects ADL mode. This can also be used for a fixed point square root, where the upper 16-bits make the 8.8 fixed point number, the lower 8 bits are zero (or if the previous operation had extended precision, load that). The output would be in D.E
Code:
sqrt24:
;;Expects ADL mode
;;Inputs: HL
;;Outputs: DE is the integer square root
;; HL is the difference inputHL-DE^2
;; c flag reset
xor a \ ld b,l \ push bc \ ld b,a \ ld d,a \ ld c,a \ ld l,a \ ld e,a
;Iteration 1
add hl,hl \ rl c \ add hl,hl \ rl c
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 2
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 3
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 4
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 5
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 6
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 7
add hl,hl \ rl c \ add hl,hl \ rl c \ rl b
ex de,hl \ add hl,hl \ push hl \ sbc hl,bc \ jr nc,$+8
ld a,h \ cpl \ ld b,a
ld a,l \ cpl \ ld c,a
pop hl
jr nc,$+4 \ inc hl \ inc hl
ex de,hl
;Iteration 8
add hl,hl \ ld l,c \ ld h,b \ adc hl,hl \ adc hl,hl
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 9
pop af
rla \ adc hl,hl \ rla \ adc hl,hl
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 10
rla \ adc hl,hl \ rla \ adc hl,hl
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
rla \ adc hl,hl \ rla \ adc hl,hl
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
rla \ adc hl,hl \ rla \ adc hl,hl
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,$+6 \ sbc hl,de \ inc de \ inc de
rr d \ rr e \ ret
DE/BC
Code:
div16:
;;Inputs: DE is the numerator, BC is the divisor
;;Outputs: DE is the result
;; A is a copy of E
;; HL is the remainder
;; BC is not changed
;140 bytes
;145cc
xor a \ sbc hl,hl
ld a,d
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ cpl \ ld d,a
ld a,e
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc
rla \ cpl \ ld e,a
ret