*******************************************************************************
*									      *
*  64/32 BIT DIVISION (UNSIGNED)					      *
*  01/30/07 (dkc)							      *
*									      *
*  This C64 subroutine does 64/32 bit division.  The calling sequence of the  *
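*  subroutine is:                                                             *
*                                                                             *
*     address of dividend (A[0], A[1]) => a4                                  *
*     address of quotient (B[0], B[1]) => b4                                  *
*     divisor => a6                                                           *
*                                                                             *
*  A[0] and B[0] are the high-order words; A[1] and B[1] are the low-order    *
*  words.  Only the quotient is stored; the remainder is not returned.        *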
*									      *
*******************************************************************************
	.global _div64_32
	.text
;------------------------------------------------------------------------------
; _div64_32 -- unsigned 64-bit / 32-bit division, 64-bit quotient
;
; In:	a4 = address of dividend; word 0 (A[0]) is the high word and
;	     word 1 (A[1]) the low word
;	b4 = address of quotient; word 0 (B[0]) receives the high word and
;	     word 1 (B[1]) the low word
;	a6 = 32-bit divisor
;	b3 = return address (the routine returns with "b b3")
;
; Method (as implemented below):
;   1. shift = lmbd(divisor as a 64-bit value) - lmbd(dividend), i.e. how far
;      the divisor must move left to line up with the dividend's leading 1.
;   2. If A[0] == 0 and A[1] < divisor, store a zero quotient and return.
;   3. D = divisor << shift, kept as D[0] (high) : D[1] (low); -D is formed
;      once so that the loop only has to add.
;   4. Loop shift+1 times on the pair a10:a4 (A[0]:A[1]):
;	  if A >= D:  A = ((A - D) << 1) | 1
;	  else:	      A =   A << 1
;      The quotient accumulates in the low shift+1 bits of a10:a4.
;   5. Extract those low shift+1 bits with extu and store them.
;
; Registers written: a0-a9, b0-b2, b5-b7, b9.  a10 and a11 are pushed with
; b15 before the loop and popped after it (not on the early-return path).
;
; NOTE(review): no check for a zero divisor is visible in this routine.
; NOTE(review): the loop decides A < D from the sign of the high word of
;   A - D and shifts A left within 64 bits; this looks like it requires
;   bit 63 of the dividend to be clear -- confirm against callers.
; NOTE(review): b15 is moved one word at a time while a10/a11 are pushed and
;   popped -- confirm this is acceptable for the stack alignment the
;   surrounding system expects.
;------------------------------------------------------------------------------
_div64_32:
	ldw.d1 *a4, a1	      ;  a1 = A[0] (high word of dividend)
||	lmbd.l1 1, a6, a5     ;  a5 = lmbd(1, divisor)
||	mvk.s1 32, a3	      ;  a3 = 32

	ldw.d1 *+a4[1], a4    ;  a4 = A[1] (low word); the pointer in a4 is overwritten
||	add.l1 a5, a3, a5     ;  a5 = lmbd(1, divisor) + 32 (divisor viewed as 64 bits)

	nop 4		      ;  wait for both loads to complete

	lmbd.l1 1, a1, a0     ;  a0 = lmbd(1, A[0])
||	lmbd.l2x 1, a4, b0    ;  b0 = lmbd(1, A[1])
||	zero.s1 a2	      ;  clear the early-return flag

  [!a1] add.s1x b0, a3, a0    ;  if A[0] == 0: a0 = lmbd(1, A[1]) + 32
||[!a1] cmpltu.l1 a4, a6, a2  ;  if A[0] == 0: a2 = (A[1] < divisor), unsigned

	sub.l1 a5, a0, a8     ;  a8 = shift = lmbd(divisor) - lmbd(dividend)
||	mv.l2x a3, b0	      ;  b0 = 32
||	mpy.m1 a7, 0, a7      ;  a7 = 0 (clear D[0])
|| [a2] b.s2 askip	      ;  dividend < divisor: go store a zero quotient
||	mv.s1 a1, a5	      ;  a5 = A[0]

; The next five execute packets are the delay slots of the branch above, so
; they run on the early-return path as well.
	cmplt.l1 a8, a3, a2   ;  a2 = (shift < 32); replaces the early-return flag
||	sub.l2x b0, a8, b0    ;  b0 = 32 - shift
||	sub.s1 a8, a3, a9     ;  a9 = shift - 32
|| [a2] mpy.m1 a5, 0, a5      ;  early return (old a2): quotient high word = 0
|| [a2] subab.d1 a4, a4, a4   ;  early return (old a2): quotient low word = 0
||	mvk.s2 32, b5	      ;  b5 = 32

  [!a2] mv.l1 a6, a7	      ;  shift >= 32: D[0] = divisor
||[!a2] mpy.m1 a6, 0, a6      ;  shift >= 32: D[1] = 0
|| [a2] shl.s1 a6, a8, a6     ;  shift < 32: D[1] = divisor << shift
|| [a2] shru.s2x a6, b0, b0   ;  shift < 32: b0 = divisor >> (32 - shift)

  [!a2] shl.s1 a7, a9, a7     ;  shift >= 32: D[0] = divisor << (shift - 32)
|| [a2] mv.l1x b0, a7	      ;  shift < 32: D[0] = divisor >> (32 - shift)
||	mvk.s2 63, b0	      ;  b0 = 63
||	mv.l2x a8, b2	      ;  b2 = shift
||	subab.d1 a9, a9, a9   ;  a9 = 0

	not.l1 a7, a0	      ;  a0 = ~D[0]
||	not.s1 a6, a8	      ;  a8 = ~D[1] (a8 no longer holds shift after this)
||	mv.l2x a8, b2	      ;  b2 = shift again (loop counter; loop runs shift + 1 times)
||	subab.d2 b0, b2, b0   ;  b0 = 63 - shift
||	mvk.s2 31, b6	      ;  b6 = 31
||	addab.d1 a9, 1, a1    ;  a1 = 1

	cmplt.l2 b0, b5, b1   ;  b1 = (63 - shift < 32), i.e. shift > 31
||	shl.s2 b0, 5, b9      ;  b9 = (63 - shift) << 5
||	subab.d2 b6, b2, b6   ;  b6 = 31 - shift
||	addu.l1 a9:a8, a1, a9:a8  ;   a9:a8 = ~D[1] + 1 (low word of -D, carry in a9)
*
	and.l1 a9, 1, a9      ;  isolate carry bit
||	mv.s1 a5, a10	      ;  a10 = A[0] (working high word)
||	mpy.m1 a5, 0, a5      ;  a5 = 0 (upper part of the a5:a4 long used in the loop)
||	or.l2 b9, b0, b9      ;  b9 = ((63-shift) << 5) | (63-shift), extu field pair
||	shl.s2 b6, 5, b7      ;  b7 = (31 - shift) << 5
||	stw.d2 a10, *b15--    ;  save a10 (stores the value from before this packet)

	add.l1 a0, a9, a9     ;  a9 = ~D[0] + carry (high word of -D)
||	cmpgt.l2 b0, b5, b0   ;  b0 = (63 - shift > 32), i.e. shift < 31
||	or.s2 b6, b7, b6      ;  b6 = ((31-shift) << 5) | (31-shift), extu field pair
||	zero.s1 a1	      ;  a1 = 0
||	stw.d2 a11, *b15--    ;  save a11
*****************
*  begin loop	*
*****************
aloop	addu.l1 a5:a4, a8, a1:a0  ;  a1:a0 = A[1] + low word of -D; carry lands in a1
||	shru.s1 a4, 31, a3    ;  a3 = MSB of A[1]
||	addab.d1 a10, a10, a6 ;  a6 = A[0] << 1
|| [b2] b.s2 aloop	      ;  loop again while count != 0 (takes effect after this pass)
|| [b2] sub.l2 b2, 1, b2      ;  decrement loop count

	and.l1 a1, 1, a7      ;  a7 = carry out of the low-word add
||	addab.d1 a10, a9, a1  ;  a1 = A[0] + high word of -D
||	shl.s1 a4, 1, a11     ;  a11 = A[1] << 1

	add.l1 a1, a7, a1     ;  a1 = high word of (A - D)
||	or.s1 a3, a6, a3      ;  a3 = (A[0] << 1) | MSB of A[1]

	cmplt.l1 a1, 0, a2    ;  a2 = (high word of A - D is negative), taken as A < D
||	shl.s1 a1:a0, 1, a1:a0	;  shift the difference left by 1 as a long
||	addab.d1 a1, a1, a7   ;  a7 = (high word of A - D) << 1

   [a2] addab.d1 a3, 0, a10   ;  A < D: A[0] = (A[0] << 1) | MSB of A[1]
|| [a2] mv.s1 a11, a4	      ;  A < D: A[1] = A[1] << 1 (quotient bit 0 shifted in)
||	or.l1 a0, 1, a0       ;  a0 = (low word of (A - D) << 1) | 1
||	and.l2x a1, 1, b7     ;  b7 = bit carried from the low word into a1 by the shift
||	mpy.m1x b9, 1, a0     ;  a0 = b9 for the post-loop extu; written a cycle after the or

  [!a2] or.l1x a7, b7, a10    ;  A >= D: A[0] = high word of ((A - D) << 1)
||[!a2] mv.s1 a0, a4	      ;  A >= D: A[1] = (low word of (A - D) << 1) | 1
||	subab.d1 a1, a1, a1   ;  a1 = 0
*****************
*  end loop	*
*****************
   [b1] extu.s1 a10, a0, a5   ;  shift > 31: a5 = low (shift - 31) bits of A[0]
||	mv.l1x b6, a0	      ;  a0 = (31-shift)::(31-shift) field pair
||	ldw.d2 *++b15[1], a11 ;  restore a11

  [!b1] zero.l1 a5	      ;  shift <= 31: quotient high word = 0
|| [b0] extu.s1 a4, a0, a4    ;  shift < 31: a4 = low (shift + 1) bits of A[1]
||	ldw.d2 *++b15[1], a10 ;  restore a10

askip	b.s2 b3		      ;  return to caller
||	stw.d2 a5, *b4	      ;  B[0] = quotient high word

	stw.d2 a4, *+b4[1]    ;  B[1] = quotient low word

	nop 4		      ;  remaining branch delay slots
	.end