Kerim
Member
In general, using more memory may let the code run faster. This applies in this division.
One may notice that the trick that lets the code of the previous thread “AVR Code for Dividing 16 Bits by 8-Bit Constant” be relatively fast is that the long division of reg_Kd [=(2*256*256/reg_N+1)/2] is made already by the compiler. This could be repeated for all possible values of reg_N [2 to 255] and the results will be listed on a table [it takes 256 words and its first two entries could be any numbers).
One may notice that the trick that lets the code of the previous thread “AVR Code for Dividing 16 Bits by 8-Bit Constant” be relatively fast is that the long division of reg_Kd [=(2*256*256/reg_N+1)/2] is made already by the compiler. This could be repeated for all possible values of reg_N [2 to 255] and the results will be listed on a table [it takes 256 words and its first two entries could be any numbers).
Code:
;======================================================
; Division of 16-bit register {A} by 8-bit register {N}
;======================================================
; {R}={A}/{N} = r17:r16 / {N} = r17:r16 * {Kd} /256 /256 then rounding
; {A} dividend in r17:r16 [0 to 65535]
; {N} constant divisor in r22 [N=0 to 255], see Note_1, Note_2 below
; {R} result in r21:r20 , {A}/{N} rounded
; {Kd} = round(256*256/{N},0), the division constant [from KdTable:]
; {Kr} = INT[ (1-{N})/2 ] , the rounding constant
; Used registers: ZH,ZL,r19,r18,r14,r13,r1,r0
; 49, 59 or 60 cycles = 42c|52c|53c code + 4c RET + 3c RCALL
D16var8:
TST r22 ;1abc
BREQ DOoverF ;1abc
CPI r22, 1 ;1abc
BREQ DO_same ;1abc
LDI ZH, high(KdTable) ;1abc, address in words, 0x??00
MOV ZL, r22 ;1abc
; *2 [from word address to byte address]
LSL ZL ;1abc
ROL ZH ;1abc
LPM r13, Z+ ;3abc
LPM r14, Z ;3abc, 14abc
; r14:r13 = reg_Kd
; multiplicand in r17:r16 = {A}
; multiplier in r14:r13 = {Kd}
; mul. result in r21:r20:r19:r18
; valid result in r21:r20
MUL r17, r14 ;2abc
MOVW r21:r20, r1:r0 ;1abc
MUL r16, r13 ;2abc
MOVW r19:r18, r1:r0 ;1abc
MUL r17, r13 ;2abc
CLR r13 ;1abc
ADD r19, r0 ;1abc
ADC r20, r1 ;1abc
ADC r21, r13 ;1abc
MUL r14, r16 ;2abc
ADD r19, r0 ;1abc
ADC r20, r1 ;1abc
ADC r21, r13 ;1abc +17abc= 31abc
; {B} = r21:r20 = r17:r16 * r14:r13 /256/256 = {A}*{Kd}/256/256
; {R} = {B} or {B}+1
; for rounding
MOV r13, r22 ;1abc
; r13 = {N}
MUL r20, r13 ;2abc
MOVW r19:r18, r1:r0 ;1abc
MUL r21, r13 ;2abc
ADD r19, r0 ;1abc, +7abc= 38abc
; {C} = r19:r18 = {B}*{N} = r21:r20 * r13
; the following conditions were deduced empirically
; if( Carry_1=0, {R}={B}, if({D2}>0, {R}={B}, {R}={B}+1 ) )
SUB r18, r16 ;1abc
SBC r19, r17 ;1abc
; {D1} = r19:r18 = {C} - {A} = r19:r18 - r17:r16
BRCC DIV_ret ;2a|1bc +4a=[42a]
; if Carry_1=0, {R}={B}
DEC r13 ;1bc
LSR r13 ;1bc
; r13 = - reg_Kr
ADD r18, r13 ;1bc
CLR r13 ;1bc
ADC r19, r13 ;1bc
; {D2} = r19:r18 = {D1} + {-Kr} = r19:r18 + {-Kr}
BRPL DIV_ret ;2b|1c, +10b=[52b]
; if {D2} positive, {R}={B}
SUBI r20, low(-1) ;1c
SBCI r21, high(-1) ;1c, +11c=[53c]
; {R}={B}+1
DIV_ret:
RET ;4
; {R} = r21:r20 = {B} or {B}+1 [ {A}/{N} rounded ]
DOoverF:
SER r20
SER r21
RET
; {R} = r21:r20 = 0xFFFF
DO_same:
MOV r20, r16
MOV r21, r17
RET
;====================================
.org 0x??00
KdTable:
.dw 0,0,32768,21845,16384,13107,10923,9362
.dw 8192,7282,6554,5958,5461,5041,4681,4369
.dw 4096,3855,3641,3449,3277,3121,2979,2849
.dw 2731,2621,2521,2427,2341,2260,2185,2114
.dw 2048,1986,1928,1872,1820,1771,1725,1680
.dw 1638,1598,1560,1524,1489,1456,1425,1394
.dw 1365,1337,1311,1285,1260,1237,1214,1192
.dw 1170,1150,1130,1111,1092,1074,1057,1040
.dw 1024,1008,993,978,964,950,936,923
.dw 910,898,886,874,862,851,840,830
.dw 819,809,799,790,780,771,762,753
.dw 745,736,728,720,712,705,697,690
.dw 683,676,669,662,655,649,643,636
.dw 630,624,618,612,607,601,596,590
.dw 585,580,575,570,565,560,555,551
.dw 546,542,537,533,529,524,520,516
.dw 512,508,504,500,496,493,489,485
.dw 482,478,475,471,468,465,462,458
.dw 455,452,449,446,443,440,437,434
.dw 431,428,426,423,420,417,415,412
.dw 410,407,405,402,400,397,395,392
.dw 390,388,386,383,381,379,377,374
.dw 372,370,368,366,364,362,360,358
.dw 356,354,352,350,349,347,345,343
.dw 341,340,338,336,334,333,331,329
.dw 328,326,324,323,321,320,318,317
.dw 315,314,312,311,309,308,306,305
.dw 303,302,301,299,298,297,295,294
.dw 293,291,290,289,287,286,285,284
.dw 282,281,280,279,278,277,275,274
.dw 273,272,271,270,269,267,266,265
.dw 264,263,262,261,260,259,258,257
/*
Note_1:
If N=0, the returned result [r21:r20] is 65535 [the highest possible number for it]. But this could be replaced by setting an overflow flag for example.
Note_2:
In case reg_N [r22] won’t be 0 or 1 for sure, testing r22 at the beginning of the code could be removed, also the code of DOoverF: and DO_same:
*/