» Goto page 1, 2  Next
Share the most optimized eZ80 routines!
Can anybody do better? (Usually: Yes.)

16-bit unsigned multiplication returning 32-bit product. It expects to be called in Z80 mode.

Code:

mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;54 or 56 t-states
;;30 bytes
ld d,c \ ld e,l \ mlt de \ push de ;11
ld d,h \ ld e,b \ mlt de           ;8
ld a,l \ ld l,c \ ld c,a           ;3
mlt hl \ mlt bc \ add hl,bc        ;13
jr nc,\$+3 \ inc de \ pop bc        ;6
ld a,b \ add a,l \ ld b,a          ;3
ld a,e \ adc a,h \ ld e,a          ;3
ret nc \ inc d \ ret               ;7 (+2 for carry)

As a note, I found Karatsuba multiplication to be less efficient up to 24-bits and I have yet to write a 32-bit routine. EDIT:It is more efficient at 32-bits, but only by little

Remember, a lot of the optimized Z80 routines work well here, but some of them can actually be reworked to be even more efficient than a direct translation.
EDIT: 5-Feb-15 The following are UNTESTED.

Here is a 16-bit GCD algorithm.

Code:

GCD16:
;;Expect Z80 mode
;;Inputs: HL,DE
;;Output: HL
;;        BC=0
;;        DE=0
;;        A=0
;Binary GCD algorithm
;About 432cc on average (average of 500 iterations with randomized inputs on [0,65535]
;78 bytes
xor a
ld b,a
ld c,a
sbc hl,bc
ex de,hl
ret z
sbc hl,bc
ex de,hl
ret z

;factor out cofactor powers of two
ld a,l \ or e \ rra \ jr nc,\$+16
srl h \ rr l
srl d \ rr e
inc b
ld a,l \ or e \ rra \ jr nc,\$-12
.loop:
;factor out powers of 2 from hl
ld a,l \ rra \ ld a,h \ jr c,\$+10
rra \ rr l \ bit 0,l \ jr z,\$-5 \ ld h,a
;factor out powers of 2 from de
ld a,e \ rra \ ld a,d \ jr c,\$+10
rra \ rr e \ bit 0,e \ jr z,\$-5 \ ld d,a

xor a
sbc hl,de
jr z,finish
jr nc,loop
or a
ex de,hl
sbc hl,de
jr loop
.finish:
ex de,hl
dec b
inc b
ret z
add hl,hl \ djnz \$-1 \ ret

32-bit multiplication:

Code:

mul32:
;;Expects Z80 mode
;;Inputs:  ix points to the first 32-bit number
;;         ix+4 points to the next 32-bit number
;;Outputs: ix+8 points to the 64-bit result
;;Algorithm: Karatsuba
;348cc to 375cc

;compute z0
ld hl,(ix)        ;5
ld bc,(ix+4)      ;5
call mul16        ;59+2
ld (ix+8),bc      ;5
ld (ix+10),de     ;5
;compute z2
ld hl,(ix+2)      ;5
ld bc,(ix+6)      ;5
call mul16        ;59+2
ld (ix+12),bc     ;5
ld (ix+14),de     ;5
;compute z1, most complicated as it requires 17-bits of info for each factor
ld hl,(ix+2)      ;5
ld bc,(ix)        ;5
rla               ;1
ld b,h            ;1
ld c,l            ;1

ld hl,(ix+6)      ;5
ld de,(ix+4)      ;5
rla               ;1

push hl           ;3
push bc           ;3
push af           ;3
call mul16        ;59+2
pop af            ;3
and 3             ;2
ex de,hl          ;1
pop de            ;3
jr z,\$+7 ;label0  ;3|(6+1)
;bit 0 means add [stack0] to the upper 16 bits
;bit 1 means add [stack1] to the upper 16 bits
jp po,\$+5 \ or 4                     ;--
rra \ jr nc,\$+7                      ;4+4
;
srl a \ pop de \ jr nc,\$+5           ;8+2
ld d,b \ ld e,c                      ;2

;z1 = AHLDE-z0-z2
ld bc,(ix+8) \ ex de,hl \ sbc hl,bc              ;8
ld bc,(ix+10) \ ex de,hl \ sbc hl,bc \ sbc a,0   ;10

ld bc,(ix+12) \ ex de,hl \ sbc hl,bc             ;8
ld bc,(ix+14) \ ex de,hl \ sbc hl,bc \ sbc a,0   ;10
ex de,hl                                         ;1

ld bc,(ix+10) \ add hl,bc \ ld (ix+10),hl \ ex de,hl  ;12
ld bc,(ix+12) \ adc hl,bc \ ld (ix+12),hl \ adc a,0   ;13
ret z \ ld hl,(ix+14) \ add a,l \ ld l,a              ;7+16
jr nc,\$+3 \ inc h \ ld (ix+14),hl \ ret               ;--

sqrt16:

Code:

sqrt16:
;;Expext Z80 mode
;;Inputs: HL
;;Output: A
;Unrolled, speed optimized.
;At most 112 clock cycles
;111 bytes.
xor a \ ld c,a \ ld d,a \ ld e,l \ ld l,h \ ld h,c

add hl,hl \ add hl,hl \ sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla
sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

jr nc,\$+5 \ sub h \ jr \$+5 \ sub h \ jr nc,\$+5 \ inc c \ cpl \ ld h,a

ld e,h \ ld h,l \ ld l,c \ ld a,l
add hl,hl \ rl e \ rl d
add hl,hl \ rl e \ rl d
sbc hl,de \ rla \ ret

EDIT: 9-Feb-15
This one performs A*HL/255. Be warned that this does not work on certain boundary values. For example, A=85 would imply division by 3, but if you input HL as divisible by 3, you get one less than the actual result. So 9*85/255 should return 3, but this routine returns 2 instead.

Code:

fastMul_Ndiv255:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;         A is a copy of the upper byte of the result
;;         DE is the product of the input A,H
;;         B is a copy of E
;;         C is a copy of the upper byte of the product of inputs A,L
;;32cc
;;18 bytes
ld d,a
ld e,h
ld h,d
mlt hl
mlt de
;15
;DE
; DE
; HL
;  HL
ld c,h
ld b,e
ld a,d
ld l,h \ ld h,a
ret
;17

One way to almost remedy the roud-off issue, (I think except for HL=65535) is:

Code:

fastMul_Ndiv255_fix:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;52cc
;;26 bytes
push af
ld d,a
ld e,l
ld l,d
mlt hl
mlt de
;HL
; HL
; DE
;  DE
ld c,d
ld b,l
ld a,h
ld d,l \ ld l,h \ ld h,a
ret nc
ret nc
inc hl
ret

This comes at a cost of 21cc and 8 bytes.
The rounding version works fairly well and only adds 5cc and 5 bytes:

Code:

fastMul_Ndiv255_round:
;;Expects Z80 mode
;;Description: Multiplies HL by A/255
;;Inputs: A,HL
;;Outputs: HL is the result
;;37cc
;;23 bytes
ld d,a
ld e,l
ld l,d
mlt hl
mlt de
;15
;HL
; HL
; DE
;  DE
ld c,d
ld b,l
ld a,h
sla l
ld l,h \ ld h,a
jr nc,\$+3
inc hl
ret
;22

edit: fixed and improved ↑

24-bit integer square root that expects ADL mode. This can also be used for a fixed point square root, where the upper 16-bits make the 8.8 fixed point number, the lower 8 bits are zero (or if the previous operation had extended precision, load that). The output would be in D.E

Code:

sqrt24:
;;Inputs: HL
;;Outputs: DE is the integer square root
;;         HL is the difference inputHL-DE^2
;;         c flag reset
xor a \ ld b,l \ push bc \ ld b,a \ ld d,a \ ld c,a \ ld l,a \ ld e,a
;Iteration 1
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 2
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 3
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 4
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 5
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a
;Iteration 6
add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e
sub c \ jr nc,\$+6 \ inc e \ inc e \ cpl \ ld c,a

;Iteration 7
add hl,hl \ rl c \ add hl,hl \ rl c \ rl b
ex de,hl \ add hl,hl \ push hl \ sbc hl,bc \ jr nc,\$+8
ld a,h \ cpl \ ld b,a
ld a,l \ cpl \ ld c,a
pop hl
jr nc,\$+4 \ inc hl \ inc hl
ex de,hl
;Iteration 8
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,\$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 9
pop af
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,\$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 10
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,\$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,\$+6 \ sbc hl,de \ inc de \ inc de
;Iteration 11
ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl
jr nc,\$+6 \ sbc hl,de \ inc de \ inc de
rr d \ rr e \ ret

DE/BC

Code:

div16:
;;Inputs: DE is the numerator, BC is the divisor
;;Outputs: DE is the result
;;         A is a copy of E
;;         HL is the remainder
;;         BC is not changed
;140 bytes
;145cc
xor a \ sbc hl,hl

ld a,d
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ cpl \ ld d,a

ld a,e
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ adc hl,hl \ sbc hl,bc \ jr nc,\$+3 \ add hl,bc
rla \ cpl \ ld e,a
ret
As some of you may know, it's not possible to access the top 8 bits of a 24-bit register directly, which nullifies some of the methods we've used for 16-bit registers. But thankfully, the ALU is no longer slow for 16/24-bit data, so some things can be optimized in different ways than before.

Check if HL is 0:

Code:
;sets zero/sign
;4 bytes, 4 cycles
or a,a
sbc hl,de

Check if BC is 0:

Code:
;sets HL=BC, sets zero/sign, preserves carry
;4 bytes, 4 cycles
sbc hl,hl

Compare HL to DE:

Code:
;sets all flags from comparison
;4 bytes, 4 cycles
or a,a
sbc hl,de
I know I saw from some of calc84's code that to set HL to zero:

Code:

or a  \ sbc hl,hl

That is 3cc and destroys some flags, but is faster than ld hl,0 in ADL mode (same speed in Z80). Plus, if you know that the c flag is reset, you can do just "sbc hl,hl" for 2 bytes, 2cc instead of 3 or 4 for Z80 and ADL respectively.
Xeda, I could use an ADL-mode PRNG routine. It should generate at least 16-bits at a time. (It'd be great if I could specify a maximum to scale the number, too.) I could just use your Rand24 routine, and just add .S to any instructions that need it, but I suspect the MLT instruction would be very useful for that routine.

Honestly, I'm using ADL mode for robotfindskitten because it makes reading a datafile from the archive a lot easier, and I don't feel like trying to manage short/long mode transitions. Furthermore, you can't use the OS interrupt handler in short mode, which would mean writing driver routines I haven't completed yet.

I'd also like to use the RTC as an entropy source. Here's an ugly routine I threw together quickly to convert the current time into a 32-bit integer.

Code:

;------ GetRtcTimeLinear -------------------------------------------------------
GetRtcTimeLinear:
; WARNING: THIS CODE WILL SOMETIMES (RARELY) READ THE TIME INCORRECTLY!
; It does not freeze the time before reading it.
; Not a problem for entropy-purposes, but it is a problem if you need the actual
; time.
;
; This monstrous routine will return the time as a 32-bit number of seconds in
; DE:HL.
; At least, I think.
; Inputs:
;  - None
; Output:
;  - DE:HL
; Destroys:
;  - AF
;  - BC
;  - IX maybe?
;  - ???
; Minutes into seconds
ld   de, (mpRtcMinutes)
ld   d, 60
mlt   de
; Hours into seconds
ld   hl, (mpRtcHours)
ld   h, 225
mlt   hl
ld   de, (mpRtcSeconds)
; Convert into DE:HL format
push   hl
ld   b, 8
rtcHighBitLoop:
djnz   rtcHighBitLoop
pop   hl
ld   e, a
ld   d, 0
push   de
push   hl
; Days into seconds.
ld   hl, (mpRtcDays)
ld   bc, (60 * 60 * 24) / 2   ; Half
call   MultBcByDe
push   bc
pop   hl
ex   de, hl
adc.s   hl, hl         ; ... and double
ex   de, hl
pop   bc
ex   de, hl
pop   bc
ex   de, hl
; Done
ret

;------ MultBcByDe -------------------------------------------------------------
MultBcByDe:
; Multiplies BC (16-bit) by HL (16-bit).
; From Xeda
; Inputs:
;  - HL (16-bits)
;  - BC (16-bits)
; Output:
;  - DE:BC
; Destroys:
;  - HL
;  - AF
ld   d, c   \   ld   e, l   \   mlt   de   \   push   de
ld   d, h   \   ld   e, b   \   mlt   de
ld   a, l   \   ld   l, c   \   ld   c, a
mlt   hl
mlt   bc
jr   nc, \$ + 3 \   inc   de
pop   bc
ld   a, b   \   add   a, l   \   ld   b, a
ld   a, e   \   adc   a, h   \   ld   e, a
ret   nc
inc   d
ret

Another source of entropy I'm thinking of using is the 24-bit checksum of RAM.

Code:
ld   bc, 3
ld   iy, ((1024 * 256) + (320 * 240 * 2)) / 256
ld   hl, 0D00000h
_:   xor   a
_:   ld   de, (hl)
dec   a
jr   nz, -_
dec   iy
ld   a, iyl
or   iyh
jr   nz, --_

Optimizations?
Oh goody, I've been working on PRNGs all morning and decided to finally check my email Using the RTC is an excellent idea of course.

Here is a 24-bit LFSR which is almost certainly not going to be good for a game (it *could* be, but only in certain situations):

Code:

LFSR24:
;;Input: (seed1) is non-zero
;;Output: (seed1) updated, HL is the result
;;Destroys: A
;;30cc (-3cc if using SMC)
#IF smc == 0
ld hl,(seed1)
#ELSE
seed1 = \$+1
ld hl,1
#ENDIF
ld a,l
rla
and %00010111
jp po,\$+5
inc l
ld (seed1),hl
ret

It is fast, and I use it for some noisiness. It has period 2^24-1. Next, you could use an LCG like the following:

Code:

lcg16777213:
;;x=11875x+137 mod (2^24-3)
;;Returns HL
#IF smc == 0
ld hl,(seed24)
#ELSE
seed24 = \$+1
ld hl,10101
#ENDIF
;multiply seed by 95, first
xor a \ ld d,a
ld e,a
push hl
ld bc,(seed24)
sbc hl,bc
sbc a,d
or a
pop bc
sbc hl,bc
sbc a,e
;AHL * 125
;=AHL*128-AHL*2-AHL
ld e,a
push de \ push hl
ld e,a
push de \ push hl
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
ld e,a
pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
ld c,137
ld c,b
ex de,hl
ex de,hl
;mod 2^16-3
ex de,hl
ld b,h
ld c,l
ex de,hl
sbc hl,de \ jr nc,\$+5 \ dec hl \ dec hl \ dec hl
ld (seed24),hl
ret

This has a period of 2^24-3. The problem of course, is that it doesn't provide a full 24-bit range of values. However, this should be fairly suitable for games, especially with the extra noise you are adding.

For those not using RTC noise, I suggest using the LFSR for noise by combining the two routines into one:

Code:

prng24:
;;Returns HL
;;uses seed24 and seed1 for an LCG and LFSR respectively
;;period length is (2^24-1)(2^24-3) = 281,474,909,601,795
#IF smc == 0
ld hl,(seed24)
#ELSE
seed24 = \$+1
ld hl,10101
#ENDIF
;multiply seed by 95, first
xor a \ ld d,a
ld e,a
push hl
ld bc,(seed24)
sbc hl,bc
sbc a,d
or a
pop bc
sbc hl,bc
sbc a,e
;AHL * 125
;=AHL*128-AHL*2-AHL
ld e,a
push de \ push hl
ld e,a
push de \ push hl
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
add hl,hl \ rla \ rl d
ld e,a
pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
pop bc \ sbc hl,bc \ ex de,hl \ pop bc \ sbc hl,bc \ ex de,hl
ld c,137
ld c,b
ex de,hl
ex de,hl
;mod 2^16-3
ex de,hl
ld b,h
ld c,l
ex de,hl
sbc hl,de \ jr nc,\$+5 \ dec hl \ dec hl \ dec hl
ld (seed24),hl
ex de,hl
#IF smc == 0
ld hl,(seed1)
#ELSE
seed1 = \$+1
ld hl,1
#ENDIF
ld a,l
rla
and %00010111
jp po,\$+5
inc l
ld (seed1),hl
ret

I will try to remember to calculate timings later. I hope this helps (and works, as I did not test it).

EDIT:
I made some routines that will probably be more useful for you, DrDnar, but they all expect to be called in Z80 mode. CALL.IS should suffice.

The RNG I included here has a much smaller cycle length (about 1/65536 the size), but feel free to replace it with any of your own, as long as it has some of its output in HL. Included are routines to put a range on the output (8-bit and 16-bit ranges), as well as an optimized rand10 which generates a random int from 0 to 9. I used that to generate random TI floats

Code:

;eZ80
;smc = 1
;inc_rand        = 1
;inc_randrange8  = 1
;inc_randrange16 = 1
;inc_rand10      = 1

;;random.inc
;;Random Number Generator Routines
;;Author:
;;    Zeda Thomas (xedaelnara@gmail.com)
;;    +All parts of this document may be used for any purpose.
;;    +Give some credit to the author, even if only mentioned in the source code.
;;      ++This is mainly so that any issues with the codes or algorithms can be reported.
;;      ++If this license info is unchanged and included in this file, that will suffice.
;;Disclaimer:
;;    +These are NOT cryptographically secure. These are quality PRNGs for games and simulation.
;;Routines:
;;  routine         in         out              mode    size    speed
;;=====================================================================
;;  rand            ---         HL               Z80    53      65~67
;;  randrange8      B           A in [0,B-1]     Z80    10      85~87
;;  randrange16     BC          HL in [0,BC-1]   Z80    10      142~146
;;  rand10          ---         A in [0,9]       Z80    16      88~90
;;  randTI          HL          (hl)             Z80    27      1540~1568
;;  mlt16           HL,BC       DEBC             Z80    30      54~56

#IF inc_rand==1
rand:
;;Input:
;;  (seed1) is a non-zero 16-bit int
;;  (seed2) is a 16-bit int
;;Output:
;;  HL is the pseudo-random number
;;  DE is the output of the LFSR
;;  BC is the previous seed of the Lehmer PRNG
;;  (seed1) is the output of the LFSR
;;  (seed2) is the output of the Lehmer PRNG
;;Destroys: A
;;Notes:
;;  This uses a 16-bit Lehmer PRNG, and an LFSR
;;  The period is 4,294,901,760 (65536*65535)
;;  Technically,the Lehmer PRNG here can have values from 1 to 65536. In this application, we store 65536 as 0.
;;Speed:
;;    65+2a, a is 0 or 1
;;      probability a= 1 is 38/65536
;;    Average: 65.00115966796875
;;    Best:65
;;    Worst:67
;;53 bytes
#IF smc == 0
ld hl,(seed2)       ;5
#ELSE
seed2 = \$+1
ld hl,0             ;3
#ENDIF
;multiply by 75
ld a,h
or l
jr nz,nospecial
ld hl,-74
jr writeseed2
nospecial:
ld b,h
ld c,75
ld h,c
mlt hl
mlt bc
ld a,c
ld h,a
ld c,b
ld b,0
sbc hl,bc
jr nc,\$+5
inc hl \ inc hl \ inc hl
writeseed2:
ld (seed2),hl
ex de,hl
#IF smc == 0
ld hl,(seed1)
#ELSE
seed1 = \$+1
ld hl,1
#ENDIF
ld a,h
and %10110100
jp po,\$+4
inc l
ld (seed1),hl
ret
#ENDIF
#IF inc_randrange8==1
randrange8:
;;Input: B
;;Output: A,H is a number on [0,B-1]
;;Example, if B is 10, this outputs an int from 0 to 9
;;85cc~87
;;10 bytes
push bc     ;3
call rand   ;5+
pop bc      ;3
ld l,b
mlt hl
ld a,h
ret
#ENDIF
#IF inc_randrange16==1
randrange16:
;;Input: BC is the exclusive upperbound
;;Output:HL is an int less than BC
;;142cc~146cc
push bc
call rand
pop bc
call mul16
ex de,hl
ret
mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;54 or 56 t-states
;;30 bytes
ld d,c \ ld e,l \ mlt de \ push de ;11
ld d,h \ ld e,b \ mlt de           ;8
ld a,l \ ld l,c \ ld c,a           ;3
mlt hl \ mlt bc \ add hl,bc        ;13
jr nc,\$+3 \ inc de \ pop bc        ;6
ld a,b \ add a,l \ ld b,a          ;3
ld a,e \ adc a,h \ ld e,a          ;3
ret nc \ inc d \ ret               ;7 (+2 for carry)
#ENDIF
#IF inc_randTI==1
randTI:
;;Inputs: HL points to where the float should be written (does not allocate RAM).
;;Outputs: Stores a random floating point number on [0,1) to where HL points
;;1540~1568
inc_rand10 = 1
ld (hl),0 \ inc hl
ld (hl),7Fh \ inc hl
ld b,7
randTIloop:
push bc
push hl
call rand10
pop hl
rrd
push hl
call rand10
pop hl
rrd
pop bc
djnz randTIloop
ret
#ENDIF
#IF inc_rand10==1
rand10:
;;Output: A is an int from 0 to 9.
;;88cc~90cc
;;16 bytes
call rand
xor a
ld b,h
ld c,l
ret
#ENDIF

Absolute Value:

Inputs: HL
Outputs: abs(HL)

Code:
AbsHL:
ld de,0
ret p
ex de,hl
sbc hl,de
ret
MateoConLechuga wrote:
Absolute Value:

Inputs: HL
Outputs: abs(HL)

Code:
AbsHL:
ld de,0
ret p
ex de,hl
sbc hl,de
ret

I believe this is an improvement:

Code:
AbsHL:
ex de,hl
AbsDE:
or a,a
sbc hl,hl
sbc hl,de
ret p
ex de,hl
ret

At least, it's an improvement across uniform inputs. It's actually slightly slower (4 cycles) for nonnegative inputs, but it's much faster (21 cycles) for negative inputs.
Runer112 wrote:

It's actually slightly slower (4 cycles) for nonnegative inputs...

A slight modification can fix that at no cost to the negative case or size, but destroying A:

Code:

absHL:
ex de,hl
absDE:
ld a,h
or a
ret p
sbc hl,hl
sbc hl,de
ret
Xeda112358 wrote:
A slight modification can fix that at no cost to the negative case or size, but destroying A:

Code:

absHL:
ex de,hl
absDE:
ld a,h
or a
ret p
sbc hl,hl
sbc hl,de
ret

Hm, that's neat! Forgot about the 1's complement inversion and conversion to 2's complement. Technically that measures the sign of the middle byte though, yet it obviously still works. Nice job.
I made a simple text routine for fonts up to 8 pixels wide. It assumes a font in the following format:

Code:
Font_Width           = 8 ; Width in pixels (any integer from 1-8)
Font_Height          = 14 ; Height in pixels (any integer from 1-240)
Font_HSpacing        = 0 ; Space between characters in pixels
Font_VSpacing        = 0 ; Space between lines in pixels
Font_CharData:
.db \$00,\$00,\$FE,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$FE,\$00,\$00 ; Char 00
.db \$00,\$00,\$FE,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$C6,\$FE,\$00,\$00 ; Char 01
etc.

The text itself should be in this format:

Code:
txt_HelloWorld:
.db "Hello world!",1
.db "The byte \$01 denotes a newline",1
.db "The byte \$00 terminates the string",0

Here is the code itself: http://pastebin.com/XKEYt9vR

Hope this is useful to someone!
EDIT 1-31-2016: It turns out that the multiplication and division routines I had posted here didn't work for many cases, specifically negative numbers. I have fixed both. (Also now neither of them destroy IX and IY!)

I'm currently working on some 16.8 fixed-point math routines. Addition and subtraction are obviously very easy, and here are routines I wrote for multiplication and division. I'm absolutely sure they could be optimized.
Pastebin!

Neither of these has been extensively tested, so there could be minor issues.

Also, here are some neat tricks!

HLu→A

Code:
dec sp
push hl
inc sp
pop af

Byte shift HL up

Code:
push hl
dec sp
pop hl
inc sp

"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
pop hl
dec sp \ dec sp \ dec sp

They are pretty simple and obvious, but I thought I'd put them here anyway.
Here's a pretty nice line routine that doesn't care what order the x or y coordinates are. Works in 8bpp mode. It also doesn't use ix or iy, so it is compatible with ZDS C and other things. No clipping is implemented; that will be with an alternate routine that clips the line before it draws it.
It is fairly well optimized, but then again I don't know much about optimization. Enjoy!

Inputs:

Code:
de=x, hl=x, b=y, c=y, a=indexed color

Code:

Code:
_line:
ld (color),a
ld (color2),a
ld a,c
ld (y1),a
push de
push hl
push bc
or a,a
sbc hl,de
ld a,\$03
jr nc,+_
ld a,\$0B
_:  ld (xStep),a
ld (xStep2),a
ex de,hl
or a,a
sbc hl,hl
sbc hl,de
jp p,+_
ex de,hl
_:  ld (dx),hl
push hl
ld (dx1),hl
ld (dx12),hl
or a,a
sbc hl,hl
ex de,hl
sbc hl,hl
ld e,b
ld l,c
or a,a
sbc hl,de
ld a,\$3C
jr nc,+_
inc a
_:   ld (yStep),a
ld (yStep2),a
ex de,hl
or a,a
sbc hl,hl
sbc hl,de
jp p,+_
ex de,hl
_:   ld (dy),hl
ld (dy1),hl
ld (dy12),hl
pop de
pop af
ld hl,(dy)
or a,a
sbc hl,de
pop de
pop bc
ld hl,0
jr nc,changeYLoop
changeXLoop:
push hl
ld l,a
ld h,lcdWidth/2
mlt hl
push bc
ld bc,vram
color: =\$+1
ld (hl),0
pop bc
push bc
pop hl
or a,a
sbc hl,de
pop hl
ret z
xStep:
nop
push de
dy1: =\$+1
ld de,0
or a,a
jp m,+_
dx: =\$+1
ld de,0
or a,a
sbc hl,de
jp c,+_
yStep:
nop
dx1: =\$+1
ld de,0
sbc hl,de
_:
pop de
jp changeXLoop
changeYLoop:
push bc
push hl
ld l,a
ld h,lcdWidth/2
mlt hl
ld bc,vram
color2: =\$+1
ld (hl),0
pop hl
pop bc
y1: =\$+1
cp a,0
ret z
yStep2:
nop
push de
dx12: =\$+1
ld de,0
or a,a
jp m,+_
dy: =\$+1
ld de,0
or a,a
sbc hl,de
jp c,+_
xStep2:
nop
dy12: =\$+1
ld de,0
sbc hl,de
_:
pop de
jp changeYLoop
While working on my 3D renderer I wrote some simple LUT-based trigonometry routines. It takes input in register A where 256 = 360°. The result is in 16.8 fixed-point, though you could just change the lookup table to return any other format. Here you go:

EDIT: I think it's pretty well optimized, but I wouldn't say it's perfect. I don't really like needing to use B to store the top two bits of A...

Code:
; Destroys A B DE
; Result in HL
CosA:
SinA:
ld b,a
and a,%00111111
ld hl,3 \ ld h,a \ mlt hl
bit 6,b
jr z,_
ex de,hl
ld hl,63*3
sbc hl,de
_:   ld de,SinCosLUT
bit 7,b
jr z,_
ld de,(hl)
or a,a \ sbc hl,hl ; carry could have been set by ADD HL,DE four lines earlier
sbc hl,de
ret
_:   ld hl,(hl) \ ret

SinCosLUT: ; 192 bytes (64 items, 3 bytes each)
; This could probably be made to be 2 bytes per item, but it would be harder to clear out the MSByte of the output
.dl 0
.dl 6
.dl 13
.dl 19
.dl 25
.dl 31
.dl 38
.dl 44
.dl 50
.dl 56
.dl 62
.dl 68
.dl 74
.dl 80
.dl 86
.dl 92
.dl 98
.dl 104
.dl 109
.dl 115
.dl 121
.dl 126
.dl 132
.dl 137
.dl 142
.dl 147
.dl 152
.dl 157
.dl 162
.dl 167
.dl 172
.dl 177
.dl 181
.dl 185
.dl 190
.dl 194
.dl 198
.dl 202
.dl 206
.dl 209
.dl 213
.dl 216
.dl 220
.dl 223
.dl 226
.dl 229
.dl 231
.dl 234
.dl 237
.dl 239
.dl 241
.dl 243
.dl 245
.dl 247
.dl 248
.dl 250
.dl 251
.dl 252
.dl 253
.dl 254
.dl 255
.dl 255
.dl 256
.dl 256
Hactar wrote:
"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
pop hl
dec sp \ dec sp \ dec sp
This is unsafe if interrupts are enabled! If an interrupt triggers, the interrupt handling data will overwrite the thing on the stack you're trying to preserve.
DrDnar wrote:
Hactar wrote:
"Pop grab" (pop something off the stack, but leave it on the stack)

Code:
pop hl
dec sp \ dec sp \ dec sp
This is unsafe if interrupts are enabled! If an interrupt triggers, the interrupt handling data will overwrite the thing on the stack you're trying to preserve.

Just use pop/push. It doesn't have this interrupt problem, it's smaller, and (for non-index registers) it's faster.

By the way, you don't need to make up a name for this operation. It already has a name: "peek."
MateoConLechuga wrote:
Here's a pretty nice line routine that doesn't care what order the x or y coordinates are. Works in 8bpp mode. It also doesn't use ix or iy, so it is compatible with ZDS C and other things. No clipping is implemented; that will be with an alternate routine that clips the line before it draws it.
It is fairly well optimized, but then again I don't know much about optimization. Enjoy!

Mateo, I optimised a few things :

Code:
_line:  ld (color),a    ; de,b =x,y     hl,c=x,y     a=indexed color
ld (color2),a
ld a,c
ld (y1),a
ld (nde+1),hl
push de
push bc
or a,a
sbc hl,de
ld a,\$03
jr nc,+_
ld a,\$0B
_:      ld (xStep),a
ld (xStep2),a
ex de,hl
or a,a
sbc hl,hl
sbc hl,de
jp p,+_
ex de,hl
_:      ld (dx),hl
push hl
ld (dx1),hl
ld (dx12),hl
or a,a
sbc hl,hl
ex de,hl
sbc hl,hl
ld e,b
ld l,c
or a,a
sbc hl,de
ld a,30
ld (yStep),a
ld (yStep2),a
ex de,hl
or a,a
sbc hl,hl
sbc hl,de
jp p,+_
ex de,hl
_:      ld (dy),hl
ld (dy1),hl
ld (dy12),hl
pop de
pop af
srl h
rr l
sbc hl,de
pop bc
ld hl,0
jr nc,changeYLoop
changeXLoop:
push hl
ld l,a
ld h,160
mlt hl
ld de,\$d40000
color: =\$+1
ld (hl),0
sbc hl,hl
ld h,b
ld l,c
or a,a
nde:    ld de,000
sbc hl,de
pop hl
ret z
xStep:  nop
dy1: =\$+1
ld de,0
or a,a
jp m,changeXLoop
dx: =\$+1
ld de,0
or a,a
sbc hl,de
jr c,changeXLoop
yStep:  nop
dx1: =\$+1
ld de,0
sbc hl,de
jr changeXLoop
changeYLoop:
push hl
ld l,a
ld h,160
mlt hl
ld de,\$d40000
color2: =\$+1
ld (hl),0
pop hl
y1: =\$+1
cp a,0
ret z
yStep2:  nop
dx12: =\$+1
ld de,0
or a,a
jp m,changeYLoop
dy: =\$+1
ld de,0
or a,a
sbc hl,de
jr c,changeYLoop
xStep2: nop
dy12: =\$+1
ld de,0
sbc hl,de
jr changeYLoop
Does anyone have a good routine to get the positive difference between a and c?
tmwilliamlin168 wrote:
Does anyone have a good routine to get the positive difference between a and c?

I've just started learning assembly, but I think "sub c \ ret nc \ neg \ ret" is optimal for unsigned numbers. For the difference between two signed numbers, I think you'd need to negate (a-c) if ((a was <0) xor (c was <0) xor (a-c is <0)).

There's a difference between unsigned and signed, because, for example, when a is 0xFF and c is 0x00, the unsigned difference is 255, but the signed difference is 1. "add 128 \ ld e,a \ ld a,c \ add 128 \ sub e \ ret c \ neg \ ret" works, but there's definitely something better.
lirtosiast wrote:
tmwilliamlin168 wrote:
Does anyone have a good routine to get the positive difference between a and c?

I've just started learning assembly, but I think "sub c \ ret nc \ neg \ ret" is optimal for unsigned numbers. For the difference between two signed numbers, I think you'd need to negate (a-c) if ((a was <0) xor (c was <0) xor (a-c is <0)).

So if c is bigger than a, and you do sub c, then the carry flag will be set?
c = 10
a = 5
5-10 will set the carry because we go past 0. The other way around (10-5) won't reset the carry because we didn't go past zero so there is no carry.

lirtosiast: I believe your routine should work for unsigned numbers as is as well.

Register to Join the Conversation
Have your own thoughts to add to this or any other topic? Want to ask a question, offer a suggestion, share your own programs and projects, upload a file to the file archives, get help with calculator and computer programming, or simply chat with like-minded coders and tech and calculator enthusiasts via the site-wide AJAX SAX widget? Registration for a free Cemetech account only takes a minute.

»
» Goto page 1, 2  Next
» All times are UTC - 5 Hours

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum