*  FIND 3N+C CYCLES							      *
*  04/15/13 (dkc)							      *
*									      *
*  This C64 subroutine finds 3n+c cycles.  The calling sequence of the sub-   *
*  routine is;								      *
*									      *
*     K[0] => a4							      *
*     K[1] => b4							      *
*     maximum allowable value of first word of an odd element => a6	      *
*     c => b6								      *
*     address to store output n value (one in the cycle) => a8		      *
*									      *
*  (K[0], K[1]) is an input 64-bit odd n value (positive or negative).	The   *
*  input n and c values must be in the range so that |3*n+c| is less than or  *
*  equal to 0x7fffffffffffffff.  If the maximum allowable first word in a     *
*  subsequent even n value in the sequence is exceeded, the subroutine	      *
*  terminates and 0 is returned (c must be in the range so this even n value  *
*  can be computed without overflowing the signed 64-bit words).  Every other *
*  initial even n value in the sequence is checked to determine if the maximum*
*  allowable value has been exceeded, so c must be in the range so that       *
*  3*[(3*n+c)/2]+c can be computed without overflowing the signed 64-bit      *
*  words.  If the second word of an initial even element in the 3n+c sequence *
*  is 0, the subroutine terminates and an error code of 2 is returned.	(To   *
*  reduce execution time, the algorithm is coded on the assumption that the   *
*  second word is not 0.  The probability that the second word is 0 is very   *
*  small.)  Otherwise, a value of 1 is returned (indicating that a cycle has  *
*  been found).  An element in the cycle is returned in the output array.     *
*  Robert Floyd's cycle-finding algorithm is used.  Each iteration of the     *
*  loop takes 22 instruction cycles to execute, so the subroutine is much     *
*  faster than most C code (depending on what compiler and chip the code is   *
*  being run on).							      *
*									      *
*******************************************************************************
       .global _cycle
       .text
_cycle:zero.l2 b19			; load 0
||     mvk.s2 1, b31			; load 1
||     mv.d2 b4, b18			; load K[1]
||     mvk.d1 2, a22			; load error code

       addu.l2 b19:b18, b18, b21:b20	; K[1]*2
||     rotl.m2x a4, 0, b16		; load K[0]
||     shl.s2 b31, 31, b31		; load mask
||     mv.l1x b6, a24			; load c

       addu.l2 b21:b20, b18, b21:b20	; K[1]*3
||     mvk.s2 32, b29			; load 32
||     mvk.s1 32, a29			; load 32

       addu.l2 b21:b20, b6, b21:b20	; K[1]=K[1]*3+c
||     mv.l1x b31, a31			; load mask

       addab.d2 b16, b16, b17		; K[0]*2
||     shr.s2 b20, 31, b23		; load MSB of K[1]
||     bitr.m2 b20, b0			; bit-reverse K[1]

       addab.d2 b17, b16, b16		; K[0]*3

 [!b0] b.s2 b3				; return
||     addab.d2 b16, b21, b16		; K[0]=K[0]*3+carry
||     lmbd.l2 1, b0, b30		; count bits
||     mpy.m2 b2, 0, b2 		; load 0

 [!b0] mvk.s1 2, a4			; return error code
||     and.d2 b16, b31, b1		; isolate sign
||     sub.s2 b29, b30, b28		; 32-count
||     neg.l2 b16, b17			; -K[0]-carry

 [!b1] addab.d2 b16, 0, b17		; load K[0]
||[b1] add.l2 b17, b23, b17		; -K[0]
||     shru.s2 b20, b30, b20		; K[1]>>count

  [b0] cmpltu.l2x a6, b17, b2		; compare to maximum
||     shl.s2 b16, b28, b28		; K[0]<<(32-count)

  [b2] mpy.m1 a4, 0, a4 		; return 0
||     shr.s2 b16, b30, b16		; K[0]=K[0]>>count
||     or.l2 b20, b28, b20		; K[1]=K[1]|(K[0]<<(32-count))

  [b2] b.s2 b3				; return
||     mpy.m1 a19, 0, a19		; load 0
***
       mv.l1x b20, a18			; load K[1]

       mv.s1x b16, a16			; load K[0]
||     addu.l1 a19:a18, a18, a21:a20	; K[1]*2

       addu.l1 a21:a20, a18, a21:a20	; K[1]*3

       addu.l1 a21:a20, a24, a21:a20	; K[1]=K[1]*3+c

       addab.d1 a16, a16, a17		; K[0]*2
||     shr.s1 a20, 31, a23		; load MSB of K[1]
||     bitr.m1 a20, a0			; bit-reverse K[1]

       addab.d1 a17, a16, a16		; K[0]*3

 [!a0] b.s2 b3				; return
||     addab.d1 a16, a21, a16		; K[0]=K[0]*3+carry
||     lmbd.l1 1, a0, a30		; count bits
||     mpy.m1 a2, 0, a2 		; load 0

 [!a0] rotl.m1 a22, 0, a4		; return error code
||     sub.s1 a29, a30, a28		; 32-count
||     and.d1 a16, a31, a1		; isolate sign
||     neg.l1 a16, a17			; -K[0]-carry

 [!a1] addab.d1 a16, 0, a17		; load K[0]
||[a1] add.l1 a17, a23, a17		; -K[0]
||     shru.s1 a20, a30, a20		; K[1]>>count

  [a0] cmpltu.l1 a6, a17, a2		; compare to maximum
||     shl.s1 a16, a28, a28		; K[0]<<(32-count)
||     mpy.m1 a19, 0, a19		; load 0

  [a2] mpy.m1 a4, 0, a4 		; return 0
||     shr.s1 a16, a30, a16		; K[0]=K[0]>>count
||     or.l1 a20, a28, a20		; K[1]=K[1]|(K[0]<<(32-count))
||     mpy.m2 b19, 0, b19		; load 0

  [a2] b.s2 b3				; return
||     mv.s1 a20, a18			; load K[1]
||     mv.l2 b20, b18			; load K[1]
***
       addu.l1 a19:a18, a18, a21:a20	; K[1]*2
||     addu.l2 b19:b18, b18, b21:b20	; K[1]*2

       addu.l1 a21:a20, a18, a21:a20	; K[1]*3
||     addu.l2 b21:b20, b18, b21:b20	; K[1]*3

       addu.l1 a21:a20, a24, a21:a20	; K[1]=K[1]*3+c
||     addu.l2 b21:b20, b6, b21:b20	; K[1]=K[1]*3+c

       addab.d1 a16, a16, a17		; K[0]*2
||     bitr.m1 a20, a0			; bit-reverse K[1]
||     addab.d2 b16, b16, b17		; K[0]*2
||     shr.s2 b20, 31, b23		; load MSB of K[1]
||     bitr.m2 b20, b0			; bit-reverse K[1]

       addab.d1 a17, a16, a16		; K[0]*3
||     mvk.s1 1, a2			; load 1
||     addab.d2 b17, b16, b16		; K[0]*3
||     mvk.s2 1, b2			; load 1

 [!a0] sub.s2 b2, b2, b2		; clear flag
||     addab.d1 a16, a21, a16		; K[0]=K[0]*3+carry
||     lmbd.l1 1, a0, a30		; count bits
||[!b0] mpy.m2 b2, 0, b2		 ; clear flag
||     addab.d2 b16, b21, b16		; K[0]=K[0]*3+carry
||     lmbd.l2 1, b0, b30		; count bits
***************
*  begin loop *
***************
aloop:
 [!a0] rotl.m1 a22, 0, a4		; return error code
||     sub.l1 a29, a30, a28		; 32-count
||     shru.s1 a20, a30, a20		; K[1]>>count
||[!b0] mvk.d1 2, a4			 ; return error code
||     and.d2 b16, b31, b1		; isolate sign
||     sub.s2 b29, b30, b28		; 32-count
||     neg.l2 b16, b17			; -K[0]-carry
||     mpy.m2 b1, 0, b1 		; load 0

       shl.s1 a16, a28, a28		; K[0]<<(32-count)
||[!b1] addab.d2 b16, 0, b17		 ; load K[0]
||[b1] add.l2 b17, b23, b17		; -K[0]
||     shru.s2 b20, b30, b20		; K[1]>>count

       shr.s1 a16, a30, a16		; K[0]=K[0]>>count
||     or.l1 a20, a28, a20		; K[1]=K[1]|(K[0]<<(32-count))
||     mpy.m1 a19, 0, a19		; load 0
||[b2] cmpltu.l2x a6, b17, b1		; compare to maximum
||     shl.s2 b16, b28, b28		; K[0]<<(32-count)

       addab.d1 a20, 0, a18		; load K[1]
||[b1] mpy.m1 a4, 0, a4 		; return 0
||     shr.s2 b16, b30, b16		; K[0]=K[0]>>count
||     or.l2 b20, b28, b20		; K[1]=K[1]|(K[0]<<(32-count))
||     mpy.m2 b19, 0, b19		; load 0
||[b1] subab.d2 b2, b2, b2		; load 0

 [!b2] b.s2 b3				; return
||     addu.l1 a19:a18, a18, a21:a20	; K[1]*2
||     mv.l2 b20, b18			; load K[1]
||     mvk.s1 1, a0			; load 1

       addu.l1 a21:a20, a18, a21:a20	; K[1]*3

       addu.l1 a21:a20, a24, a21:a20	; K[1]=K[1]*3+c

       addab.d1 a16, a16, a17		; K[0]*2
||     shr.s1 a20, 31, a23		; load MSB of K[1]
||     bitr.m1 a20, a0			; bit-reverse K[1]
||[b2] mv.l1 a20, a0			; load K[1]

 [!a0] b.s2 b3				; return
||     addab.d1 a17, a16, a16		; K[0]*3

       addab.d1 a16, a21, a16		; K[0]=K[0]*3+carry
||     lmbd.l1 1, a0, a30		; count bits

 [!a0] rotl.m1 a22, 0, a4		; return error code
||     sub.s1 a29, a30, a28		; 32-count
||     and.d1 a16, a31, a1		; isolate sign
||     neg.l1 a16, a17			; -K[0]-carry

 [!a1] addab.d1 a16, 0, a17		; load K[0]
||[a1] add.l1 a17, a23, a17		; -K[0]
||     shru.s1 a20, a30, a20		; K[1]>>count

       cmpltu.l1 a6, a17, a2		; compare to maximum
||     shl.s1 a16, a28, a28		; K[0]<<(32-count)
||     mvk.d1 1, a1			; load 1

       shr.s1 a16, a30, a16		; K[0]=K[0]>>count
||     or.l1 a20, a28, a20		; K[1]=K[1]|(K[0]<<(32-count))
||     subab.d1 a19, a19, a19		; load 0
||[a2] rotl.m1x b16, 0, a16		; load K[0]

 [!a2] cmpeq.l1x a20, b20, a1		; compare K[1] values
||[a2] b.s2 b3				; return
||[a2] mpy.m1 a4, 0, a4 		; return 0
||     addab.d1 a20, 0, a18		; load K[1]

  [a1] cmpeq.l1x a16, b16, a1		; compare K[0] values

 [!a1] b.s2 aloop			; branch if not equal
||     addu.l1 a19:a18, a18, a21:a20	; K[1]*2
||     addu.l2 b19:b18, b18, b21:b20	; K[1]*2
||[a1] stw.d1 a20, *+a8[1];		; save K[1]
||[a1] mvk.s1 1, a4			; return 1
||[a2] mpy.m1 a4, 0, a4 		; return 0

       addu.l1 a21:a20, a18, a21:a20	; K[1]*3
||     addu.l2 b21:b20, b18, b21:b20	; K[1]*3
||[a1] stw.d1 a16, *a8			; save K[0]
||[a1] b.s2 b3				; return

 [!a1] addu.l1 a21:a20, a24, a21:a20	; K[1]=K[1]*3+c
||     addu.l2 b21:b20, b6, b21:b20	; K[1]=K[1]*3+c

       addab.d1 a16, a16, a17		; K[0]*2
||     bitr.m1 a20, a0			; bit-reverse K[1]
||     addab.d2 b16, b16, b17		; K[0]*2
||     shr.s2 b20, 31, b23		; load MSB of K[1]
||     bitr.m2 b20, b0			; bit-reverse K[1]

 [!a1] addab.d1 a17, a16, a16		; K[0]*3
||     mvk.s1 1, a2			; load 1
||     addab.d2 b17, b16, b16		; K[0]*3
||     mvk.s2 1, b2			; load 1

 [!a0] sub.s2 b2, b2, b2		; clear flag
||     addab.d1 a16, a21, a16		; K[0]=K[0]*3+carry
||     lmbd.l1 1, a0, a30		; count bits
||[!b0] mpy.m2 b2, 0, b2		 ; clear flag
||     addab.d2 b16, b21, b16		; K[0]=K[0]*3+carry
||     lmbd.l2 1, b0, b30		; count bits
****************
*  end loop    *
****************
       nop
       .end