*******************************************************************************
*									      *
*  128/64 BIT DIVISION (UNSIGNED)					      *
*  01/30/07 (dkc)							      *
*									      *
*  This C64 subroutine does 128/64 bit division.  The calling sequence of the *
*  subroutine is:							      *
*									      *
*     dividend (A[0], A[1], A[2], A[3]) => a4, b4, a6, b6		      *
*     address of 128-bit quotient (four words) => a8			      *
*     divisor (D[0], D[1]) => b8, a10					      *
*									      *
*******************************************************************************
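*
*  For reference, a C-level sketch of the interface (illustrative only).  It
*  assumes the TI C6000 C calling convention, which passes these arguments in
*  a4, b4, a6, b6, a8, b8, a10 (matching the register map above) and prefixes
*  C identifiers with an underscore, and it assumes the quotient is written
*  as four 32-bit words at the address passed in a8, most significant word
*  first.  The prototype and variable names below are hypothetical.
*
*      /* dividend = A0:A1:A2:A3, divisor = D0:D1, most significant word first */
*      extern void div128_64(unsigned A0, unsigned A1,
*                            unsigned A2, unsigned A3,
*                            unsigned *quotient,        /* out: quotient[0..3] */
*                            unsigned D0, unsigned D1);
*
*      unsigned q[4];
*      div128_64(0, 0, 0, 100, q, 0, 7);    /* expect q[3] == 14, q[0..2] == 0 */
*
*  Inside the routine the two divisor words are held as D[2] and D[3] of a
*  128-bit working divisor D[0..3] (D[0] and D[1] start out zero), so the
*  divisor can be shifted left across 128 bits to line up with the dividend
*  before the subtract-and-shift loop.
*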
	.global _div128_64
	.text
_div128_64:
	mv.l1 a4, a1	      ;  load A[0]
||	mv.s1x b4, a4	      ;  load A[1]
||	addab.d1 a6, 0, a13   ;  load A[2]
||	mv.l2x a8, b4	      ;  load address of quotient
||	mvk.s2 32, b9	      ;  load 32
||	stw.d2 a13, *b15--    ;  save a13

	lmbd.l2 1, b8, b5     ;  left-most bit detection
||	lmbd.l1 1, a10, a5    ;  left-most bit detection
||	mv.s1x b6, a3	      ;  load A[3]
||	mv.s2x a13, b2	      ;  load A[2]
||	addab.d1 a10, 0, a6   ;  load D[3]
||	addab.d2 b8, 0, b1    ;  load D[2]

  [!b1] add.l2x b5, a5, b5    ;  divisor left-most bit detection
||	cmpltu.l1x a13, b8, a2	;  compare A[2] to D[2]
||	stw.d2 a12, *b15--    ;  save a12
||	or.s1 a1, a4, a9      ;  A[0] | A[1]
||	addab.d1 a1, 0, a5    ;  load A[0]

	lmbd.l1 1, a5, a0     ;  left-most bit detection (A[0])
||	lmbd.l2x 1, a4, b0    ;  left-most bit detection (A[1])
||	sub.s2 b2, b8, b2     ;  compare A[2] to D[2]
||	mv.s1x b5, a7	      ;  load divisor left-most bit detection
||	addab.d1 a3, 0, a12   ;  load A[3]
||	stw.d2 b11, *b15--    ;  save b11
||	mpy.m2 b11, 0, b11    ;  zero D[0]

  [!a1] add.s1x b0, a0, a0    ;  left-most bit detection (A[0] and A[1])
||[!b2] cmpltu.l1 a12, a6, a2  ;  compare A[3] to D[3]
||	lmbd.l2x 1, a13, b5   ;  left-most bit detection (A[2])
||	addab.d1 a9, 0, a1    ;  load A[0] | A[1]
||	stw.d2 b10, *b15--    ;  save b10
||	mpy.m2 b10, 0, b10    ;  zero D[1]

  [!a1] add.s1x a0, b5, a0    ;  left-most bit detection (A[0], A[1], and A[2])
||	or.l1 a1, a13, a1     ;  A[0] | A[1] | A[2]
||	lmbd.l2x 1, a12, b5   ;  left-most bit detection (A[3])
||	stw.d2 b12, *b15--    ;  save b12
|| [a1] subab.d1 a2, a2, a2   ;  load zero

  [!a1] add.l1x a0, b5, a0    ;  dividend left-most bit detection
||	addk.s1 64, a7	      ;  divisor left-most bit detection + 64
|| [a2] subab.d1 a4, a4, a4   ;  load 0
|| [a2] mpy.m1 a13, 0, a13    ;  load 0

	subab.d1 a7, a0, a8   ;  shift = (divisor lmbd + 64) - dividend lmbd
||	mv.l1x b8, a7	      ;  load D[2]
|| [a2] b.s2 zskip	      ;  return zero
|| [a2] mpy.m1 a5, 0, a5      ;  load 0
||	mvk.s1 32, a3	      ;  load 32

  [!a2] cmplt.l1 a8, a3, a2   ;  compare shift to 32
|| [a2] subab.d1 a2, a2, a2   ;  load zero
||	sub.l2x b9, a8, b0    ;  32 - shift
||	sub.s1 a8, a3, a9     ;  shift - 32
||	mvk.s2 32, b5	      ;  load 32
|| [a2] mpy.m1 a12, 0, a12    ;  load 0

  [!a2] addab.d1 a6, 0, a7    ;  D[2] = D[3]
||[!a2] mpy.m1 a6, 0, a6      ;  clear D[3]
||[!a2] addab.d2 b10, 0, b11  ;  D[0] = D[1]
||[!a2] mv.l2 b8, b10	      ;  D[1] = D[2]
|| [a2] shru.s2x a6, b0, b2   ;  D[2] = D[3] >> (32-shift)
|| [a2] b.s1 askip	      ;  branch if shift < 32

   [a2] shl.s1 a7, a8, a7     ;  D[2] << shift
|| [a2] shru.s2 b10, b0, b1   ;  D[0] = D[1] >> (32-shift)
||	mv.l2x a8, b8	      ;  load shift
|| [a2] addab.d1 a7, 0, a0    ;  save D[2]
|| [a2] mv.l1x b0, a1	      ;  load 32-shift

   [a2] or.l1x a7, b2, a7     ;  load D[2]
|| [a2] shl.s2 b11, b8, b11   ;  D[0] << shift
|| [a2] shru.s1 a0, a1, a0    ;  D[1] = D[2] >> (32-shift)

   [a2] or.l2 b11, b1, b11    ;  load D[0]
|| [a2] shl.s2 b10, b8, b10   ;  D[1] << shift
|| [a2] shl.s1 a6, a8, a6     ;  D[3] << shift

   [a2] or.l2x b10, a0, b10   ;  load D[1]
||	cmplt.l1 a9, a3, a2   ;  compare shift-32 to 32
||[!a2] add.s2 b0, b5, b0     ;  64 - shift
||[!a2] sub.s1 a9, a3, a9     ;  shift - 64
||[!a2] subab.d2 b8, b5, b8   ;  shift - 32

	mvk.s2 3, b12	      ;  load 3
*
  [!a2] mpy.m1 a7, 0, a7      ;  clear D[2]
||[!a2] addab.d2 b10, 0, b11  ;  D[0] = D[1]
||[!a2] mv.l2x a7, b10	      ;  D[1] = D[2]
|| [a2] shru.s2 b10, b0, b1   ;  D[0] = D[1] >> (64-shift)
|| [a2] b.s1 askip	      ;  branch if shift < 64
|| [a2] mv.l1x b0, a1	      ;  load 64-shift

   [a2] shl.s2 b11, b8, b11   ;  D[0] << (shift-32)
|| [a2] shru.s1 a7, a1, a0    ;  D[1] = D[2] >> (64-shift)
|| [a2] mv.l1x b8, a1	      ;  load shift - 32

   [a2] or.l2 b11, b1, b11    ;  load D[0]
|| [a2] shl.s2 b10, b8, b10   ;  D[1] << (shift-32)
|| [a2] shl.s1 a7, a1, a7     ;  D[2] << (shift-32)

   [a2] or.l2x b10, a0, b10   ;  load D[1]
||	cmplt.l1 a9, a3, a2   ;  compare shift-64 to 32
||[!a2] add.s2 b0, b5, b0     ;  96 - shift
||[!a2] sub.s1 a9, a3, a9     ;  shift - 96
||[!a2] subab.d2 b8, b5, b8   ;  shift - 64

	mv.l1x b8, a0	      ;  load shift - 64
||	subab.d2 b12, 1, b12  ;  load 2

	nop
*
  [!a2] addab.d2 b10, 0, b11  ;  D[0] = D[1]
||[!a2] mpy.m2 b10, 0, b10    ;  D[1] = 0
|| [a2] shru.s2 b10, b0, b1   ;  D[0] = D[1] >> (96-shift)
||	sub.l2 b8, b5, b2     ;  shift - 96

   [a2] shl.s1x b11, a0, a0   ;  D[0] << (shift-64)
||[!a2] shl.s2 b11, b2, b11   ;  D[0] << (shift-96)
||	subab.d2 b12, 1, b12  ;  load 1
||[!a2] add.l2 b0, b5, b0     ;  128 - shift

   [a2] or.l2x b1, a0, b11    ;  load D[0]
|| [a2] shl.s2 b10, b8, b10   ;  D[1] << (shift-64)
||[!a2] subab.d2 b12, 1, b12  ;  load 0

askip	not.l1 a7, a0	      ;  invert D[2]
||	not.s1 a6, a8	      ;  invert D[3]
||	subab.d1 a9, a9, a9   ;  load 0
||	mpy.m1 a1, 0, a1      ;  load 0
||	mv.l2x a8, b2	      ;  load shift
||	sub.s2 b0, 1, b0      ;  127-shift

	add.l1 a9:a8, 1, a9:a8	;  -D[3]
||	mpy.m1 a3, 0, a3      ;  load 0
||	shl.s2 b0, 5, b9      ;  (127-shift) << 5

	add.l1 a1:a0, a9, a1:a0  ;  -D[2]
||	not.s1x b10, a2       ;  invert D[1]
||	or.l2 b9, b0, b9      ;  (127-shift)::(127-shift)

	add.l1 a3:a2, a1, a3:a2  ;  -D[1]
||	addab.d1 a0, 0, a9    ;  load -D[2]
||	not.s1x b11, a0       ;  invert D[0]

	add.l1 a0, a3, a0     ;  -D[0]
||	mv.l2x a2, b10	      ;  load -D[1]
||	mv.s1 a13, a14	      ;  save A[2]
||	mpy.m1 a13, 0, a13    ;  load 0
||	stw.d2 a14, *b15--    ;  save a14

	mv.l2x a0, b11	      ;  load -D[0]
||	zero.s1 a1	      ;  load 0
||	stw.d2 a15, *b15--    ;  save a15
*****************
*  begin loop	*
*****************
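*
*  Each pass of this loop is one conditional subtract-and-shift step of
*  binary long division.  Roughly, in C (a and d are hypothetical 128-bit
*  values: a holds the working dividend with quotient bits collecting at its
*  low end, d is the left-aligned divisor; the real code keeps each in four
*  32-bit registers, with d pre-negated so the subtract is done with adds
*  and explicit carry propagation):
*
*      delta = a - d;
*      if (delta >= 0)
*          a = (delta << 1) | 1;    /* divisor fits: keep difference, quotient bit 1 */
*      else
*          a = a << 1;              /* divisor too large: quotient bit 0 */
*
*  After the loop the quotient bits are extracted and stored.
*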
aloop	addu.l1 a13:a12, a8, a1:a0  ;  A[3] - D[3]
||	subab.d1 a15, a15, a15	 ;  load 0
||	shru.s2x a14, 31, b7  ;  isolate MSB of A[2]
||	shru.s1 a12, 31, a3   ;  isolate MSB of A[3]

	addu.l1 a15:a14, a1, a15:a14  ;  A[2] + carry
||	shru.s2x a0, 31, b0   ;  isolate MSB of delta[3]
||	addab.d1 a0, a0, a0   ;  (A[3]-D[3]) << 1
||	shl.s1 a14, 1, a13    ;  A[2] << 1
||	mpy.m1 a7, 0, a7      ;  load 0

	addu.l1 a15:a14, a9, a15:a14  ;  A[2] - D[2]
||	addab.d1 a4, 0, a6    ;  load A[1]
||	or.s1 a0, 1, a0       ;  (A[3]-D[3])<<1 | 1
||	shru.s2x a4, 31, b5   ;  isolate MSB of A[1]

	addu.l1 a7:a6, a15, a7:a6  ;  A[1] + carry
||	addab.d1 a14, a14, a1 ;  (A[2]-D[2]) << 1
||	shru.s2x a14, 31, b1   ;  isolate MSB of delta[2]
||	mv.s1x b10, a14       ;  load D[1]

	addu.l1 a7:a6, a14, a7:a6   ;  A[1] - D[1]
||	or.s1x a1, b0, a1     ;  (A[2]-D[2])<<1 | LSB
||	addab.d1 a12, a12, a12	;  A[3] << 1
|| [b2] b.s2 aloop	      ;  conditional branch to loop beginning

	add.l1 a5, a7, a7     ;  A[0] + carry
||	shru.s1 a6, 31, a5    ;  isolate MSB of delta[1]
||	addab.d1 a6, a6, a6   ;  (A[1]-D[1]) << 1
||	shl.s2x a5, 1, b6     ;  A[0] << 1

	add.l1x a7, b11, a7   ;  A[0] - D[0]
||	or.s1 a13, a3, a14    ;  (A[2]<<1) | LSB
||	addab.d1 a4, a4, a4   ;  A[1] << 1
||	or.l2 b6, b5, b6      ;  (A[0]<<1) | LSB

	cmplt.l1 a7, 0, a2    ;  compare delta to 0
||	addab.d1 a7, a7, a7   ;  (A[0]-D[0]) << 1
||	or.s1x a6, b1, a6     ;  (A[1]-D[1])<<1 | LSB
||	or.l2x b7, a4, b7     ;  (A[1]<<1) | LSB
||	mpy.m1 a1, 0, a1      ;  load 0

  [!a2] or.l1 a7, a5, a5      ;  A[0] = (A[0]-D[0])<<1 | LSB
|| [a2] mv.s1x b7, a4	      ;  A[1] = (A[1]<<1) | LSB
||[!a2] addab.d1 a1, 0, a14   ;  A[2] = (A[2]-D[2])<<1 | LSB
||	mpy.m1 a13, 0, a13    ;  load 0

  [!a2] addab.d1 a0, 0, a12   ;  A[3] = (A[3]-D[3])<<1 | 1
||[!a2] mv.l1 a6, a4	      ;  A[1] = (A[1]-D[1])<<1 | LSB
|| [a2] mv.s1x b6, a5	      ;  A[0] = (A[0]<<1) | LSB
|| [b2] sub.l2 b2, 1, b2      ;  decrement loop count
*****************
*  end loop	*
*****************
	cmpeq.l2 b12, 3, b1   ;  compare flag to 3
||	mv.l1x b9, a0	      ;  load quotient extract constant
||	mv.s1 a14, a13	      ;  load A[2]

   [b1] extu.s1 a12, a0, a12  ;  extract quotient bits of A[3]
|| [b1] zero.l1 a13	      ;  A[2] = 0
|| [b1] subab.d1 a4, a4, a4   ;  A[1] = 0
|| [b1] mpy.m1 a5, 0, a5      ;  A[0] = 0
||	cmpeq.l2 b12, 2, b1   ;  compare flag to 2

   [b1] extu.s1 a13, a0, a13  ;  extract quotient bits of A[2]
|| [b1] zero.l1 a4	      ;  A[1] = 0
|| [b1] subab.d1 a5, a5, a5   ;  A[0] = 0
||	cmpeq.l2 b12, 1, b1   ;  compare flag to 1

   [b1] extu.s1 a4, a0, a4    ;  extract quotient bits of A[1]
|| [b1] zero.l1 a5	      ;  A[0] = 0
||	cmpeq.l2 b12, 0, b1   ;  compare flag to 0
||	ldw.d2 *++b15[1], a15 ;  restore a15

   [b1] extu.s1 a5, a0, a5    ;  extract quotient bits of A[0]
||	ldw.d2 *++b15[1], a14 ;  restore a14

zskip	ldw.d2 *++b15[1], b12 ;  restore b12

	ldw.d2 *++b15[1], b10 ;  restore b10

	ldw.d2 *++b15[1], b11 ;  restore b11

	ldw.d2 *++b15[1], a12 ;  restore a12
||	b.s2 b3		      ;  return to caller

	ldw.d2 *++b15[1], a13 ;  restore a13

	stw.d2 a5, *b4	      ;  store quotient

	stw.d2 a4, *+b4[1]    ;  store quotient

	stw.d2 a13, *+b4[2]   ;  store quotient
||	mv.l1 a12, a0	      ;  save quotient

	stw.d2 a0, *+b4[3]    ;  store quotient
	.end