#include "asm.h" /* ** divu_t divu64_c(unsigned dh, unsigned dl, unsigned dv); ** ** This routine is based on Algorithm D in Donald E. Knuth's ** "The Art of Computer Programming", Vol. 2, section 4.3.1. ** The extra checks in step D3 have been omitted as they ** would take more time to do than they would save. ** ** There are three normalization to choose one from. The first, ** while fastest in terms of execution cycles, suffers from a ** bad case of three-word-instructionitis. The second uses about ** 60% more execution cycle, but in some cases may be faster due ** to fewer instruction fetch delays. The third is the standard ** bit-by-bit approach. Unlike the first two, its time depends ** on the data. At best it is much faster, but at worst takes ** more than twice the time of the other two. */ .text .global divu64_c divu64_c: lea -24(sp), sp // Make room on stack for registers moveml d2-d7, (sp) // Save regsisters movel 36(sp), d3 // Divisor in d3 movel 32(sp), d1 // Dividend in d2,d1 movel 28(sp), d2 // One word dividend? beq simple // Yep, handle it cmpl d3, d2 // Overflow, divide by zero? bcc ovflow // Yep, report it // Normalize the divisor: cmpl #0x10000, d3 scs d5 andl #16, d5 lsll d5, d3 cmpl #0x1000000, d3 scs d4 andl #8, d4 orl d4, d5 lsll d4, d3 cmpl #0x10000000, d3 scs d4 andl #4, d4 orl d4, d5 lsll d4, d3 cmpl #0x40000000, d3 scs d4 andl #2, d4 orl d4, d5 lsll d4, d3 spl d4 andl #1, d4 orl d4, d5 lsll d4, d3 /* Alternate version of normalization: movel #1, d6 movel #16, d7 clrl d5 1: lsll d7, d6 cmpl d6, d3 scs d4 andl d7, d4 lsll d4, d3 orl d4, d5 lsrl #1, d7 bneb 1b */ /* Standard version: clrl d5 tstl d3 bmib 2f 1: addl #1, d5 lsll #1, d3 bplb 1b 2: */ // Calculate exponent movel #32, d4 subl d5, d4 // Align dividend to normalized divisor lsll d5, d2 movel d1, d0 lsrl d4, d0 orl d0, d2 lsll d5, d1 // d3 now holds the normalized divisor. d2 and d1 hold the // aligned dividend. d5 holds the normalization constant. // The quotient is built up in d0, the remainder in d1. // Estimate first digit movel d2, d0 movel d3, d6 swap d3 // It's convenient to have a swapped divisor divuw d3, d0 // Trial division by high digit of divisor svs d7 // Quotient too large? extbl d7 // If so, force to 0xffff orl d7, d0 movel d3, d7 muluw d0, d6 // Low partial product muluw d0, d7 // High partial product swap d6 // Put low digits in place clrl d4 movew d6, d4 // Seperate high 16 bits and eorl d4, d6 // low 16 bits of low part addl d4, d7 // Form full product // Form remainder; check for digit too large & correct subl d6, d1 subxl d7, d2 bplb 4f // Good estimate? movel d3, d6 // Nope, split divisor so we can do (~32%) movew d6, d4 // an add-back (divisor halves still swapped eorl d4, d6 // and upper half d4 still 0 from above) 3: subl #1, d0 // Decrement quotient digit addl d6, d1 // and adjust remainder addxl d4, d2 bmib 3b // Two big? (at most! <1%) 4: swap d0 // Put digit in place // Estimate second digit movel d1, d6 movew d2, d6 swap d6 divuw d3, d6 // Trial division by high digit of divisor svs d7 // Quotient too large? extbl d7 // If so, force to 0xffff orl d7, d6 movew d6, d0 // Combine with upper digit movel d3, d7 swap d3 // Get back unswapped divisor muluw d3, d6 // Low partial product muluw d0, d7 // High partial product swap d7 movew d7, d4 // (Upper 16 bits d4 still 0) eorl d4, d7 addl d7, d6 // Form full product clrl d7 addxl d7, d4 // Form remainder; check for digit too large & correct subl d6, d1 subxl d4, d2 bplb 7f // Good estimate? 
6:      subl    #1, d0                  // Decrement quotient digit             (~32%)
        addl    d3, d1                  // and adjust remainder
        addxl   d7, d2                  // (d7 0 from above)
        bmib    6b                      // Too big? (at most! < 1%)
7:
// Correct remainder for normalized divisor
        lsrl    d5, d1

done:
        moveml  (sp), d2-d7
        lea     24(sp), sp
        rts

simple:
        tstl    d3                      // Divide by zero?
        beqb    ovflow                  // Yep, don't trap
        movel   d1, d0                  // REMU takes too long
        divul   d3, d0                  // We'll do it for ourselves!
        mulul   d0, d3                  // and save almost 30 clocks!
        subl    d3, d1
        brab    done

ovflow:
        movel   #-1, d0                 // Return all ones
        movel   d0, d1                  // and impossible remainder
        brab    done
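
/*
** For reference only: a minimal C sketch of the operation this routine
** implements, assuming divu_t is a quotient/remainder pair (the field
** names q and r below are hypothetical; the real definition lives in
** asm.h and may differ).  The assembly above produces the same result
** using 16-bit digit estimates per Algorithm D rather than native
** 64-bit arithmetic, and returns all ones on overflow or divide by zero.
**
**   divu_t divu64_ref(unsigned dh, unsigned dl, unsigned dv)
**   {
**       unsigned long long n = ((unsigned long long)dh << 32) | dl;
**       divu_t res;
**
**       if (dv == 0 || dh >= dv) {          // divide by zero or quotient overflow
**           res.q = res.r = ~0u;            // all ones, impossible remainder
**       } else {
**           res.q = (unsigned)(n / dv);     // 32-bit quotient
**           res.r = (unsigned)(n % dv);     // 32-bit remainder
**       }
**       return res;
**   }
*/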