* FIND 3N+C CYCLES *
* 04/15/13 (dkc) *
* *
* This C64 subroutine finds 3n+c cycles. The calling sequence of the sub- *
* routine is; *
* *
* K[0] => a4 *
* K[1] => b4 *
* maximum allowable value of first word of an odd element => a6 *
* c => b6 *
* address to store output n value (one in the cycle) => a8 *
* *
* (K[0], K[1]) is an input 64-bit odd n value (positive or negative). The *
* input n and c values must be in the range so that |3*n+c| is less than or *
* equal to 0x7fffffffffffffff. If the maximum allowable first word in a *
* subsequent even n value in the sequence is exceeded, the subroutine *
* terminates and 0 is returned (c must be in the range so this even n value *
* can be computed without overflowing the signed 64-bit words). Every other *
* initial even n value in the sequence is checked to determine if the maximum*
* allowable value has been exceeded, so c must be in the range so that *
* 3*[(3*n+c)/2]+c can be computed without overflowing the signed 64-bit *
* words. If the second word of an initial even element in the 3n+c sequence *
* is 0, the subroutine terminates and an error code of 2 is returned. (To *
* reduce execution time, the algorithm is coded on the assumption that the *
* second word is not 0. The probability that the second word is 0 is very *
* small.) Otherwise, a value of 1 is returned (indicating that a cycle has *
* been found). An element in the cycle is returned in the output array. *
* Robert Floyd's cycle-finding algorithm is used. Each iteration of the *
* loop takes 22 instruction cycles to execute, so the subroutine is much *
* faster than most C code (depending on what compiler and chip the code is *
* being run on). *
* *
*******************************************************************************
.global _cycle
.text
_cycle:zero.l2 b19 ; load 0
|| mvk.s2 1, b31 ; load 1
|| mv.d2 b4, b18 ; load K[1]
|| mvk.d1 2, a22 ; load error code
addu.l2 b19:b18, b18, b21:b20 ; K[1]*2
|| rotl.m2x a4, 0, b16 ; load K[0]
|| shl.s2 b31, 31, b31 ; load mask
|| mv.l1x b6, a24 ; load c
addu.l2 b21:b20, b18, b21:b20 ; K[1]*3
|| mvk.s2 32, b29 ; load 32
|| mvk.s1 32, a29 ; load 32
addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c
|| mv.l1x b31, a31 ; load mask
addab.d2 b16, b16, b17 ; K[0]*2
|| shr.s2 b20, 31, b23 ; load MSB of K[1]
|| bitr.m2 b20, b0 ; bit-reverse K[1]
addab.d2 b17, b16, b16 ; K[0]*3
[!b0] b.s2 b3 ; return
|| addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry
|| lmbd.l2 1, b0, b30 ; count bits
|| mpy.m2 b2, 0, b2 ; load 0
[!b0] mvk.s1 2, a4 ; return error code
|| and.d2 b16, b31, b1 ; isolate sign
|| sub.s2 b29, b30, b28 ; 32-count
|| neg.l2 b16, b17 ; -K[0]-carry
[!b1] addab.d2 b16, 0, b17 ; load K[0]
||[b1] add.l2 b17, b23, b17 ; -K[0]
|| shru.s2 b20, b30, b20 ; K[1]>>count
[b0] cmpltu.l2x a6, b17, b2 ; compare to maximum
|| shl.s2 b16, b28, b28 ; K[0]<<(32-count)
[b2] mpy.m1 a4, 0, a4 ; return 0
|| shr.s2 b16, b30, b16 ; K[0]=K[0]>>count
|| or.l2 b20, b28, b20 ; K[1]=K[1]|(K[0]<<(32-count))
[b2] b.s2 b3 ; return
|| mpy.m1 a19, 0, a19 ; load 0
***
mv.l1x b20, a18 ; load K[1]
mv.s1x b16, a16 ; load K[0]
|| addu.l1 a19:a18, a18, a21:a20 ; K[1]*2
addu.l1 a21:a20, a18, a21:a20 ; K[1]*3
addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c
addab.d1 a16, a16, a17 ; K[0]*2
|| shr.s1 a20, 31, a23 ; load MSB of K[1]
|| bitr.m1 a20, a0 ; bit-reverse K[1]
addab.d1 a17, a16, a16 ; K[0]*3
[!a0] b.s2 b3 ; return
|| addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry
|| lmbd.l1 1, a0, a30 ; count bits
|| mpy.m1 a2, 0, a2 ; load 0
[!a0] rotl.m1 a22, 0, a4 ; return error code
|| sub.s1 a29, a30, a28 ; 32-count
|| and.d1 a16, a31, a1 ; isolate sign
|| neg.l1 a16, a17 ; -K[0]-carry
[!a1] addab.d1 a16, 0, a17 ; load K[0]
||[a1] add.l1 a17, a23, a17 ; -K[0]
|| shru.s1 a20, a30, a20 ; K[1]>>count
[a0] cmpltu.l1 a6, a17, a2 ; compare to maximum
|| shl.s1 a16, a28, a28 ; K[0]<<(32-count)
|| mpy.m1 a19, 0, a19 ; load 0
[a2] mpy.m1 a4, 0, a4 ; return 0
|| shr.s1 a16, a30, a16 ; K[0]=K[0]>>count
|| or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count))
|| mpy.m2 b19, 0, b19 ; load 0
[a2] b.s2 b3 ; return
|| mv.s1 a20, a18 ; load K[1]
|| mv.l2 b20, b18 ; load K[1]
***
addu.l1 a19:a18, a18, a21:a20 ; K[1]*2
|| addu.l2 b19:b18, b18, b21:b20 ; K[1]*2
addu.l1 a21:a20, a18, a21:a20 ; K[1]*3
|| addu.l2 b21:b20, b18, b21:b20 ; K[1]*3
addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c
|| addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c
addab.d1 a16, a16, a17 ; K[0]*2
|| bitr.m1 a20, a0 ; bit-reverse K[1]
|| addab.d2 b16, b16, b17 ; K[0]*2
|| shr.s2 b20, 31, b23 ; load MSB of K[1]
|| bitr.m2 b20, b0 ; bit-reverse K[1]
addab.d1 a17, a16, a16 ; K[0]*3
|| mvk.s1 1, a2 ; load 1
|| addab.d2 b17, b16, b16 ; K[0]*3
|| mvk.s2 1, b2 ; load 1
[!a0] sub.s2 b2, b2, b2 ; clear flag
|| addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry
|| lmbd.l1 1, a0, a30 ; count bits
||[!b0] mpy.m2 b2, 0, b2 ; clear flag
|| addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry
|| lmbd.l2 1, b0, b30 ; count bits
***************
* begin loop *
***************
aloop:
[!a0] rotl.m1 a22, 0, a4 ; return error code
|| sub.l1 a29, a30, a28 ; 32-count
|| shru.s1 a20, a30, a20 ; K[1]>>count
||[!b0] mvk.d1 2, a4 ; return error code
|| and.d2 b16, b31, b1 ; isolate sign
|| sub.s2 b29, b30, b28 ; 32-count
|| neg.l2 b16, b17 ; -K[0]-carry
|| mpy.m2 b1, 0, b1 ; load 0
shl.s1 a16, a28, a28 ; K[0]<<(32-count)
||[!b1] addab.d2 b16, 0, b17 ; load K[0]
||[b1] add.l2 b17, b23, b17 ; -K[0]
|| shru.s2 b20, b30, b20 ; K[1]>>count
shr.s1 a16, a30, a16 ; K[0]=K[0]>>count
|| or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count))
|| mpy.m1 a19, 0, a19 ; load 0
||[b2] cmpltu.l2x a6, b17, b1 ; compare to maximum
|| shl.s2 b16, b28, b28 ; K[0]<<(32-count)
addab.d1 a20, 0, a18 ; load K[1]
||[b1] mpy.m1 a4, 0, a4 ; return 0
|| shr.s2 b16, b30, b16 ; K[0]=K[0]>>count
|| or.l2 b20, b28, b20 ; K[1]=K[1]|(K[0]<<(32-count))
|| mpy.m2 b19, 0, b19 ; load 0
||[b1] subab.d2 b2, b2, b2 ; load 0
[!b2] b.s2 b3 ; return
|| addu.l1 a19:a18, a18, a21:a20 ; K[1]*2
|| mv.l2 b20, b18 ; load K[1]
|| mvk.s1 1, a0 ; load 1
addu.l1 a21:a20, a18, a21:a20 ; K[1]*3
addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c
addab.d1 a16, a16, a17 ; K[0]*2
|| shr.s1 a20, 31, a23 ; load MSB of K[1]
|| bitr.m1 a20, a0 ; bit-reverse K[1]
||[b2] mv.l1 a20, a0 ; load K[1]
[!a0] b.s2 b3 ; return
|| addab.d1 a17, a16, a16 ; K[0]*3
addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry
|| lmbd.l1 1, a0, a30 ; count bits
[!a0] rotl.m1 a22, 0, a4 ; return error code
|| sub.s1 a29, a30, a28 ; 32-count
|| and.d1 a16, a31, a1 ; isolate sign
|| neg.l1 a16, a17 ; -K[0]-carry
[!a1] addab.d1 a16, 0, a17 ; load K[0]
||[a1] add.l1 a17, a23, a17 ; -K[0]
|| shru.s1 a20, a30, a20 ; K[1]>>count
cmpltu.l1 a6, a17, a2 ; compare to maximum
|| shl.s1 a16, a28, a28 ; K[0]<<(32-count)
|| mvk.d1 1, a1 ; load 1
shr.s1 a16, a30, a16 ; K[0]=K[0]>>count
|| or.l1 a20, a28, a20 ; K[1]=K[1]|(K[0]<<(32-count))
|| subab.d1 a19, a19, a19 ; load 0
||[a2] rotl.m1x b16, 0, a16 ; load K[0]
[!a2] cmpeq.l1x a20, b20, a1 ; compare K[1] values
||[a2] b.s2 b3 ; return
||[a2] mpy.m1 a4, 0, a4 ; return 0
|| addab.d1 a20, 0, a18 ; load K[1]
[a1] cmpeq.l1x a16, b16, a1 ; compare K[0] values
[!a1] b.s2 aloop ; branch if not equal
|| addu.l1 a19:a18, a18, a21:a20 ; K[1]*2
|| addu.l2 b19:b18, b18, b21:b20 ; K[1]*2
||[a1] stw.d1 a20, *+a8[1]; ; save K[1]
||[a1] mvk.s1 1, a4 ; return 1
||[a2] mpy.m1 a4, 0, a4 ; return 0
addu.l1 a21:a20, a18, a21:a20 ; K[1]*3
|| addu.l2 b21:b20, b18, b21:b20 ; K[1]*3
||[a1] stw.d1 a16, *a8 ; save K[0]
||[a1] b.s2 b3 ; return
[!a1] addu.l1 a21:a20, a24, a21:a20 ; K[1]=K[1]*3+c
|| addu.l2 b21:b20, b6, b21:b20 ; K[1]=K[1]*3+c
addab.d1 a16, a16, a17 ; K[0]*2
|| bitr.m1 a20, a0 ; bit-reverse K[1]
|| addab.d2 b16, b16, b17 ; K[0]*2
|| shr.s2 b20, 31, b23 ; load MSB of K[1]
|| bitr.m2 b20, b0 ; bit-reverse K[1]
[!a1] addab.d1 a17, a16, a16 ; K[0]*3
|| mvk.s1 1, a2 ; load 1
|| addab.d2 b17, b16, b16 ; K[0]*3
|| mvk.s2 1, b2 ; load 1
[!a0] sub.s2 b2, b2, b2 ; clear flag
|| addab.d1 a16, a21, a16 ; K[0]=K[0]*3+carry
|| lmbd.l1 1, a0, a30 ; count bits
||[!b0] mpy.m2 b2, 0, b2 ; clear flag
|| addab.d2 b16, b21, b16 ; K[0]=K[0]*3+carry
|| lmbd.l2 1, b0, b30 ; count bits
****************
* end loop *
****************
nop
.end