*******************************************************************************
*									      *
*  FIND LIMB								      *
*  09/03/10 (dkc)							      *
*									      *
*  This C64 subroutine finds a limb in S given its parity vector.  The calling*
*  sequence of the subroutine is:					      *
*									      *
*     I[0] (order/2+2) => a4						      *
*     I[1] (order/2+2) => b4						      *
*     amount to shift off LSB's of order => a6                                *
*     inner loop count => b6						      *
*     delta => a8							      *
*     address of G => b8						      *
*									      *
*  Note:  The shift amount must be small enough so that there are significant *
*	  bits left in the upper word of the order.			      *
*									      *
*******************************************************************************
	.global _limb
	.ref _sv
	.text
*******************************************************************************
*  _limb								      *
*									      *
*  Register roles, as read from the code below:				      *
*     a5:a4   = G[0]:G[1], working value (G[0] is the upper word: its bit 0   *
*		is shifted into bit 31 of G[1] on a right shift)	      *
*     a31:a30 = H[0]:H[1], starting value for the next outer pass; delta is   *
*		added to it once per outer pass				      *
*     a7:a6   = scratch for the 3*G[1]+c sum (a7 receives the carry)	      *
*     a3      = c = 1,  a17 = shift amount,  a8 = delta			      *
*     a0      = G[1]&1 for the current step (forced to 1 on the last pass)    *
*     a19     = word loaded from sv,  a23 = oldg,  a2 = sv[index]^oldg	      *
*     a1      = set once (G[0]>>shift) > 2 is seen in the inner loop	      *
*     a20/a21 = copies of G[1]/G[0], updated only while a1 is clear	      *
*     b16     = address of sv[0],  b18 = address of sv[index]		      *
*     b6      = j-1,  b2 = inner loop counter				      *
*     b0      = "solution found" flag,	b26 = G[1]&1 sampled on the last pass *
*									      *
*  Returns a4 = 1 after storing G[0], G[1], H[0], H[1] to b8[0..3], or a4 = 0 *
*  (nothing stored) once (H[0]>>shift) > 2.				      *
*									      *
*  The code is hand scheduled: results of mpy/rotl (.M units) and ldw are     *
*  consumed a fixed number of packets after issue, and every branch is	      *
*  followed by five packets that still execute.  Do not reorder packets.      *
*******************************************************************************
_limb:	mvkl.s2 _sv, b16      ; load address of parity vector (low half)
||	sub.l2 b6, 1, b6      ; j-1
||	mvk.d1 1, a3	      ; c=1

	mvkh.s2 _sv, b16      ; load address of parity vector (high half)
||	mv.d1 a6, a17	      ; save shift amount (a6 is reused as scratch)
||	mpy.m1 a31, 0, a31    ; clear carry (upper word of H[1]+delta)

	mv.s1x b4, a30	      ; H[1]=I[1]
||	mv.l1 a4, a5	      ; G[0]=I[0]
||	ldw.d2 *b16, a19      ; load sv[0] (consumed by the xor below)

	mpy.m1 a23, 0, a23    ; oldg=0
||	mv.s1 a30, a4	      ; G[1]=H[1] (value before delta is added)
||	addu.l1 a31:a30, a8, a31:a30  ; H[1]=H[1]+delta, carry into a31
||	mv.l2 b6, b2	      ; load j-1

	mv.d2 b16, b18	      ; index=0 (b18 = address of sv[0])
||	add.l1 a5, a31, a31   ; H[0]=G[0]+carry (completes H=H+delta)

	mpy.m1 a1, 0, a1      ; clear flag (a1, set when G[0]>>shift > 2)

*  First step of the first pass; these three packets are repeated after the
*  outer-loop branch below.
	and.l1 a4, 1, a0      ; G[1]&1
||	mpy.m1 a7, 0, a7      ; clear upper word
||	shl.s2x a5, 1, b9     ; G[0]+G[0]
||	mv.s1 a4, a6	      ; load G[1]
||	mvk.d2 1, b26	      ; set flag (b26, and-ed into b0 in the loop)

  [!a0] shru.s1 a4, 1, a4     ; G[1]>>1
||	xor.l1 a19, a23, a2   ; sv[index]^oldg
||[!a0] mvk.d1 1, a23	      ; oldg=1
|| [a0] mpy.m1 a23, 0, a23    ; oldg=0
||[!a0] addaw.d2 b18, 1, b18  ; index=index+1
||	mvk.s2 1, b0	      ; set flag (b0, "solution found")

  [!a0] shl.s1 a5, 31, a6     ; G[0]<<31
||[!a2] zero.s2 b2	      ; clear inner loop count (sv[index]^oldg is 0)
|| [a0] add.d1x a5, b9, a5    ; G[0]+G[0]+G[0]
|| [a0] addu.l1 a7:a6, a4, a7:a6  ; G[1]+G[1]
|| [a2] ldw.d2 *b18, a19      ; load sv[index]
***********************
*  begin outer loop   *
***********************
***********************
*  begin inner loop   *
***********************
*  Each pass finishes one step on G (a0 clear: G=G>>1; a0 set: G=3*G+c) and
*  starts the next one.  The branch in the second packet is decided on b2
*  as it stands there; the five packets after it always execute.
cloop:
  [!a0] or.d1 a4, a6, a4      ; G[1]=G[1]|(G[0]<<31)
||[!a0] shru.s1 a5, 1, a5     ; G[0]=G[0]>>1
|| [a0] addu.l1 a7:a6, a4, a7:a6  ; G[1]+G[1]+G[1]
||[!a2] zero.l2 b0	      ; clear flag (sv[index]^oldg was 0)

   [a0] addu.l1 a7:a6, a3, a7:a6  ; G[1]+G[1]+G[1]+1
||	shru.s1 a31, a17, a29 ; shift off LSB's of H[0]
||[!a0] and.d1 a4, 1, a18     ; check if G[1] is even
||	mpy.m1 a7, 0, a7      ; clear upper word (lands after a7 is read below)
|| [b2] b.s2 cloop	      ; conditional branch to loop beginning

   [a0] add.s1 a7, a5, a5     ; G[0] (add carry from 3*G[1]+1)
|| [a0] mv.l1 a6, a4	      ; load G[1]
|| [a0] and.d1 a6, 1, a18     ; check if G[1] is even
||[!b2] mpy.m1 a27, 0, a27    ; clear MSB's
*
   [b2] shru.s1 a5, a17, a27  ; shift off LSB's of G[0]
||[!a1] rotl.m1 a4, 0, a20    ; save G[1]
|| [b2] and.l1 a4, 1, a0      ; G[1]&1
||[!b2] mvk.d1 1, a0	      ; set flag (last pass: force a0=1)

  [!a1] cmpgtu.l1 a27, 2, a1  ; compare to 2 (a1 = G[0]>>shift > 2)
||	shl.s1 a5, 1, a28     ; G[0]+G[0]
||	rotl.m1 a4, 0, a6     ; load G[1]
||[!b2] rotl.m2x a18, 0, b26  ; save G[1]&1
|| [b2] sub.l2 b2, 1, b2      ; decrement loop count

  [!a0] shru.s1 a4, 1, a4     ; G[1]>>1
||[!a0] xor.l1 a19, a23, a2   ; sv[index]^oldg
||[!a0] mvk.d1 1, a23	      ; oldg=1
||[!a0] addaw.d2 b18, 1, b18  ; index=index+1
||[!a1] rotl.m1 a5, 0, a21    ; save G[0]
|| [a1] mpy.m2 b2, 0, b2      ; exit loop
|| [a1] zero.s2 b0	      ; clear flag (limit exceeded, no solution)

  [!a0] shl.s1 a5, 31, a6     ; G[0]<<31
||[!a2] zero.s2 b2	      ; clear inner loop count
|| [a0] add.d1 a5, a28, a5    ; G[0]+G[0]+G[0]
|| [a0] addu.l1 a7:a6, a4, a7:a6  ; G[1]+G[1]
|| [a0] mpy.m1 a23, 0, a23    ; oldg=0
|| [a2] ldw.d2 *b18, a19      ; load sv[index]
||[!a1] and.l2 b0, b26, b0    ; "and" conditions
||	rotl.m2x a5, 0, b5    ; save G[0]
*********************
*  end inner loop   *
*********************
*  Reached when the inner-loop branch is not taken.  With b0 set, the branch
*  to cskip is issued and a2 is forced to 1 so that the outer-loop branch in
*  the next packet is not taken.  With b0 clear, a2 decides whether another
*  starting value is tried.  The H update below is skipped when b0 is set,
*  so cskip stores H unchanged.
   [b0] b.s2 cskip	      ; solution found
|| [b0] mvk.s1 1, a2	      ; set outer loop count (blocks branch below)
||[!b0] cmpgtu.l1 a29, 2, a2  ; compare H[0]>>shift to 2 (a2=1 ends search)
||[!b0] ldw.d2 *b16, a19      ; load sv[0] for the next pass

  [!a2] b.s2 cloop	      ; conditional branch to loop beginning (outer)
||	mv.s1 a31, a5	      ; G[0]=H[0]
||	mv.d1 a30, a4	      ; G[1]=H[1]
||	mpy.m1 a23, 0, a23    ; oldg=0
||[!b0] zero.l1 a31	      ; clear carry bit
||	mv.l2 b6, b2	      ; load j-1

*  The next five packets execute whether or not the branch to cloop is taken;
*  they repeat the set-up done before the first pass.  If the branch to cskip
*  was issued instead, control leaves before the fifth of them.
  [!b0] addu.l1 a31:a30, a8, a31:a30  ; H[1]=H[1]+delta, carry into a31
||	mv.l2 b16, b18	      ; load address of parity vector (index=0)

  [!b0] add.l1 a5, a31, a31   ; H[0]=G[0]+carry (completes H=H+delta)
||	mpy.m1 a1, 0, a1      ; clear flag

	and.l1 a4, 1, a0      ; G[1]&1
||	mpy.m1 a7, 0, a7      ; clear upper word
||	shl.s2x a5, 1, b9     ; G[0]+G[0]
||	mv.s1 a4, a6	      ; load G[1]
||	mvk.d2 1, b26	      ; set flag

  [!a0] shru.s1 a4, 1, a4     ; G[1]>>1
||	xor.l1 a19, a23, a2   ; sv[index]^oldg
||[!a0] mvk.d1 1, a23	      ; oldg=1
|| [a0] mpy.m1 a23, 0, a23    ; oldg=0
||[!a0] addaw.d2 b18, 1, b18  ; index=index+1
||	mvk.s2 1, b0	      ; set flag

  [!a0] shl.s1 a5, 31, a6     ; G[0]<<31
||[!a2] zero.s2 b2	      ; clear inner loop count
|| [a0] add.d1x a5, b9, a5    ; G[0]+G[0]+G[0]
|| [a0] addu.l1 a7:a6, a4, a7:a6  ; G[1]+G[1]
|| [a2] ldw.d2 *b18, a19      ; load sv[index]
********************
*  end outer loop  *
********************
*  Fall-through: neither branch above was taken, so no solution was found.
	b.s2 b3 	      ; return
||	mvk.d1 0, a4	      ; clear flag (return value 0)

	nop 5
*************
*  return   *
*************
*  Solution found: the four stores and the nop fill the return branch's
*  delay slots.
cskip:	b.s2 b3 	      ; return
||	stw.d2 a21, *b8       ; store G[0]

	stw.d2 a20, *+b8[1]   ; store G[1]
||	mvk.d1 1, a4	      ; set flag (return value 1)

	stw.d2 a31, *+b8[2]   ; store H[0]

	stw.d2 a30, *+b8[3]   ; store H[1]

	nop 2

	.end