#include "asm.h" /* ** divu_t divu64_c(unsigned dh, unsigned dl, unsigned dv); ** ** This routine is based on Algorithm D in Donald E. Knuth's ** "The Art of Computer Programming", Vol. 2, section 4.3.1. ** The extra checks in step D3 have been omitted as they ** would take more time to do than they would save. ** ** There are three normalization to choose one from. The first, ** while fastest in terms of execution cycles, suffers from a ** bad case of three-word-instructionitis. The second uses about ** 60% more execution cycle, but in some cases may be faster due ** to fewer instruction fetch delays. The third is the standard ** bit-by-bit approach. Unlike the first two, its time depends ** on the data. At best it is much faster, but at worst takes ** more than twice the time of the other two. */ .text .global divu64_c divu64_c: lea -24(sp), sp // Make room on stack for registers moveml d2-d7, (sp) // Save regsisters movel 36(sp), d3 // Divisor in d3 movel 32(sp), d1 // Dividend in d2,d1 movel 28(sp), d2 // One word dividend? beq simple // Yep, handle it cmpl d3, d2 // Overflow, divide by zero? bcc ovflow // Yep, report it // Normalize the divisor: cmpl #0x10000, d3 scs d5 andl #16, d5 lsll d5, d3 cmpl #0x1000000, d3 scs d4 andl #8, d4 orl d4, d5 lsll d4, d3 cmpl #0x10000000, d3 scs d4 andl #4, d4 orl d4, d5 lsll d4, d3 cmpl #0x40000000, d3 scs d4 andl #2, d4 orl d4, d5 lsll d4, d3 spl d4 andl #1, d4 orl d4, d5 lsll d4, d3 /* Alternate version of normalization: movel #1, d6 movel #16, d7 clrl d5 1: lsll d7, d6 cmpl d6, d3 scs d4 andl d7, d4 lsll d4, d3 orl d4, d5 lsrl #1, d7 bneb 1b */ /* Standard version: clrl d5 tstl d3 bmib 2f 1: addl #1, d5 lsll #1, d3 bplb 1b 2: */ // Calculate exponent movel #32, d4 subl d5, d4 // Align dividend to normalized divisor lsll d5, d2 movel d1, d0 lsrl d4, d0 orl d0, d2 lsll d5, d1 // d3 now holds the normalized divisor. d2 and d1 hold the // aligned dividend. d5 holds the normalization constant. // The quotient is built up in d0, the remainder in d1. // Estimate first digit movel d2, d0 movel d3, d6 swap d3 // It's convenient to have a swapped divisor divuw d3, d0 // Trial division by high digit of divisor svs d7 // Quotient too large? extbl d7 // If so, force to 0xffff orl d7, d0 movel d3, d7 muluw d0, d6 // Low partial product muluw d0, d7 // High partial product swap d6 // Put low digits in place clrl d4 movew d6, d4 // Seperate high 16 bits and eorl d4, d6 // low 16 bits of low part addl d4, d7 // Form full product // Form remainder; check for digit too large & correct subl d6, d1 subxl d7, d2 bplb 4f // Good estimate? movel d3, d6 // Nope, split divisor so we can do (~32%) movew d6, d4 // an add-back (divisor halves still swapped eorl d4, d6 // and upper half d4 still 0 from above) 3: subl #1, d0 // Decrement quotient digit addl d6, d1 // and adjust remainder addxl d4, d2 bmib 3b // Two big? (at most! <1%) 4: swap d0 // Put digit in place // Estimate second digit movel d1, d6 movew d2, d6 swap d6 divuw d3, d6 // Trial division by high digit of divisor svs d7 // Quotient too large? extbl d7 // If so, force to 0xffff orl d7, d6 movew d6, d0 // Combine with upper digit movel d3, d7 swap d3 // Get back unswapped divisor muluw d3, d6 // Low partial product muluw d0, d7 // High partial product swap d7 movew d7, d4 // (Upper 16 bits d4 still 0) eorl d4, d7 addl d7, d6 // Form full product clrl d7 addxl d7, d4 // Form remainder; check for digit too large & correct subl d6, d1 subxl d4, d2 bplb 7f // Good estimate? 
6:      subl    #1, d0                  // Decrement quotient digit             (~32%)
        addl    d3, d1                  // and adjust remainder
        addxl   d7, d2                  // (d7 0 from above)
        bmib    6b                      // Too big? (at most! < 1%)
7:
// Correct remainder for normalized divisor
        lsrl    d5, d1

done:
        moveml  (sp), d2-d7
        lea     24(sp), sp
        rts

simple:
        tstl    d3                      // Divide by zero?
        beqb    ovflow                  // Yep, don't trap
        movel   d1, d0                  // REMU takes too long
        divul   d3, d0                  // We'll do it for ourselves!
        mulul   d0, d3                  // and save almost 30 clocks!
        subl    d3, d1
        brab    done

ovflow:
        movel   #-1, d0                 // Return all ones
        movel   d0, d1                  // and impossible remainder
        brab    done
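
/*
** For reference only: a minimal C sketch of the operation this routine
** implements, assuming divu_t is a quotient/remainder pair (the field
** names q and r below are hypothetical; the real definition lives in
** asm.h and may differ).  The assembly above produces the same result
** using 16-bit digit estimates per Algorithm D rather than native
** 64-bit arithmetic, and returns all ones on overflow or divide by zero.
**
**   divu_t divu64_ref(unsigned dh, unsigned dl, unsigned dv)
**   {
**       unsigned long long n = ((unsigned long long)dh << 32) | dl;
**       divu_t res;
**
**       if (dv == 0 || dh >= dv) {          // divide by zero or quotient overflow
**           res.q = res.r = ~0u;            // all ones, impossible remainder
**       } else {
**           res.q = (unsigned)(n / dv);     // 32-bit quotient
**           res.r = (unsigned)(n % dv);     // 32-bit remainder
**       }
**       return res;
**   }
*/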