#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

#------------------------------------------------------------------------
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                 const BN_ULONG *np, const BN_ULONG *n0, int num)
#
# Montgomery multiplication: rp[] = ap[] * bp[] / R mod np[], num limbs.
# Returns 1 on success, 0 if num < 4 (caller falls back to generic code).
#
# In:   a0 = rp, a1 = ap, a2 = bp, a3 = np, a4 = &n0, a5 = num
# Out:  v0 = 1 (done) or 0 (num too small, nothing touched)
# Uses: t0..t12 as scratch; s3 = outer index i, s4 = inner index j,
#       s5 = per-iteration Montgomery factor m; t7 walks the temporary
#       vector tp[] carved out of the stack below the register save area.
# Stack: 48-byte register save frame (ra,s3,s4,s5,fp), then 8*(num+2)
#       bytes for tp[], with sp rounded down to a 4096-byte boundary.
# The `.set noreorder` regions are hand-scheduled for EV5/EV6 dual issue;
# the #L0/#U0/#L1/#U1 comments record the intended issue slots — do not
# reorder instructions inside them.
#------------------------------------------------------------------------
.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)		# allocate register save frame
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp			# fp = frame base, restored at .Lexit
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	a5,a5			# num is a C int: sign-extend to 64 bits
	mov	0,v0			# presume failure
	cmplt	a5,4,AT
	bne	AT,.Lexit		# num < 4: bail out

	ldq	t1,0(a1)	# ap[0]
	s8addq	a5,16,AT		# AT = 8*num+16, room for tp[num+2]
	ldq	t4,8(a1)
	subq	sp,AT,sp
	ldq	t5,0(a2)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	a4,0(a4)		# a4 = n0 value (was pointer)
	and	sp,AT,sp		# align tp[] to 4096-byte boundary

	# --- first iteration of the outer loop (i = 0), unrolled ---
	mulq	t1,t5,t0		# lo(ap[0]*bp[0])
	ldq	t3,0(a3)	# np[0]
	umulh	t1,t5,t1		# hi(ap[0]*bp[0])
	ldq	t6,8(a3)

	mulq	t0,a4,s5		# m = tp[0]*n0 mod 2^64

	mulq	t3,s5,t2		# lo(np[0]*m)
	umulh	t3,s5,t3		# hi(np[0]*m)

	addq	t2,t0,t2		# np[0]*m + tp[0] (low limb dies)
	cmpult	t2,t0,AT		# carry out
	addq	t3,AT,t3

	mulq	t4,t5,t8		# lo(ap[1]*bp[0])
	mov	2,s4			# j = 2
	umulh	t4,t5,t9		# hi(ap[1]*bp[0])
	mov	sp,t7			# t7 = tp
	mulq	t6,s5,t10		# lo(np[1]*m)
	s8addq	s4,a1,t4		# &ap[2]
	umulh	t6,s5,t11		# hi(np[1]*m)
	s8addq	s4,a3,t6		# &np[2]

.align	4
.L1st:					# 1st pass: tp[j-1] = ap[j-1]*bp[0] + np[j-1]*m
	.set	noreorder
	ldq	t4,0(t4)
	addl	s4,1,s4
	ldq	t6,0(t6)
	lda	t7,8(t7)

	addq	t8,t1,t0		# accumulate ap[j]*bp[0] with carry-in
	mulq	t4,t5,t8
	cmpult	t0,t1,AT
	addq	t10,t3,t2		# accumulate np[j]*m with carry-in
	mulq	t6,s5,t10

	addq	t9,AT,t1
	cmpult	t2,t3,v0
	cmplt	s4,a5,t12
	umulh	t4,t5,t9

	addq	t11,v0,t3
	addq	t2,t0,t2		# fold the two columns together
	s8addq	s4,a1,t4
	umulh	t6,s5,t11

	cmpult	t2,t0,v0
	addq	t3,v0,t3
	s8addq	s4,a3,t6
	stq	t2,-8(t7)

	nop				# scheduling pad
	unop
	bne	t12,.L1st
	.set	reorder

	# flush final carries of the 1st pass into tp[num-1], tp[num], tp[num+1]
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t2,t0,t2
	cmpult	t2,t0,v0
	addq	t3,v0,t3
	stq	t2,0(t7)		# tp[num-1]

	addq	t3,t1,t3
	cmpult	t3,t1,AT
	stq	t3,8(t7)		# tp[num]
	stq	AT,16(t7)		# tp[num+1] = overflow bit

	mov	1,s3			# i = 1
.align	4
.Louter:				# outer loop over bp[i], i = 1..num-1
	s8addq	s3,a2,t5		# &bp[i]
	ldq	t1,0(a1)	# ap[0]
	ldq	t4,8(a1)	# ap[1]
	ldq	t5,0(t5)	# bp[i]
	ldq	t3,0(a3)	# np[0]
	ldq	t6,8(a3)	# np[1]
	ldq	t12,0(sp)	# tp[0]

	mulq	t1,t5,t0		# ap[0]*bp[i]
	umulh	t1,t5,t1

	addq	t0,t12,t0		# += tp[0]
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	mulq	t0,a4,s5		# m = (tp[0]+ap[0]*bp[i])*n0 mod 2^64

	mulq	t3,s5,t2		# np[0]*m
	umulh	t3,s5,t3

	addq	t2,t0,t2		# low limb dies; keep carry
	cmpult	t2,t0,AT
	mov	2,s4			# j = 2
	addq	t3,AT,t3

	mulq	t4,t5,t8		# ap[1]*bp[i]
	mov	sp,t7			# t7 = tp
	umulh	t4,t5,t9
	mulq	t6,s5,t10		# np[1]*m
	s8addq	s4,a1,t4		# &ap[2]
	umulh	t6,s5,t11
.align	4
.Linner:				# tp[j-1] = tp[j] + ap[j-1]*bp[i] + np[j-1]*m
	.set	noreorder
	ldq	t12,8(t7)	#L0
	nop			#U1
	ldq	t4,0(t4)	#L1
	s8addq	s4,a3,t6	#U0

	ldq	t6,0(t6)	#L0
	nop			#U1
	addq	t8,t1,t0	#L1
	lda	t7,8(t7)

	mulq	t4,t5,t8	#U1
	cmpult	t0,t1,AT	#L0
	addq	t10,t3,t2	#L1
	addl	s4,1,s4

	mulq	t6,s5,t10	#U1
	addq	t9,AT,t1	#L0
	addq	t0,t12,t0	#L1
	cmpult	t2,t3,v0	#U0

	umulh	t4,t5,t9	#U1
	cmpult	t0,t12,AT	#L0
	addq	t2,t0,t2	#L1
	addq	t11,v0,t3	#U0

	umulh	t6,s5,t11	#U1
	s8addq	s4,a1,t4	#L0
	cmpult	t2,t0,v0	#L1
	cmplt	s4,a5,t12	#U0	# borrow t12

	addq	t1,AT,t1	#L0
	addq	t3,v0,t3	#U1
	stq	t2,-8(t7)	#L1
	bne	t12,.Linner	#U0
	.set	reorder

	# flush final carries of this outer iteration
	ldq	t12,8(t7)
	addq	t8,t1,t0
	addq	t10,t3,t2
	cmpult	t0,t1,AT
	cmpult	t2,t3,v0
	addq	t9,AT,t1
	addq	t11,v0,t3

	addq	t0,t12,t0		# += tp[num-1]
	cmpult	t0,t12,AT
	addq	t1,AT,t1

	ldq	t12,16(t7)	# tp[num+1] (previous overflow bit)
	addq	t2,t0,s4
	cmpult	s4,t0,v0
	addq	t3,v0,t3
	addq	t3,t1,t2
	stq	s4,0(t7)		# tp[num-1]
	cmpult	t2,t1,t3
	addq	t2,t12,t2		# fold in previous overflow
	cmpult	t2,t12,AT
	addl	s3,1,s3			# i++
	addq	t3,AT,t3
	stq	t2,8(t7)		# tp[num]
	cmplt	s3,a5,t12	# borrow t12
	stq	t3,16(t7)		# tp[num+1] = new overflow bit
	bne	t12,.Louter

	# conditional subtraction: rp[] = tp[] - np[], remembering borrow
	s8addq	a5,sp,t12	# &tp[num]
	mov	a0,a2		# put rp aside
	mov	sp,t7
	mov	sp,a1
	mov	0,t1		# clear borrow bit

.align	4
.Lsub:	ldq	t0,0(t7)
	ldq	t2,0(a3)
	lda	t7,8(t7)
	lda	a3,8(a3)
	subq	t0,t2,t2	# tp[i]-np[i]
	cmpult	t0,t2,AT	# borrow from the limb subtraction
	subq	t2,t1,t0	# propagate incoming borrow
	cmpult	t2,t0,t1
	or	t1,AT,t1	# combined borrow out
	stq	t0,0(a0)
	cmpult	t7,t12,v0
	lda	a0,8(a0)
	bne	v0,.Lsub

	subq	t3,t1,t1	# handle upmost overflow bit
	mov	sp,t7
	mov	a2,a0		# restore rp

	# t1 == 0 means tp >= np: keep tp-np, else keep tp; always zap tp
.align	4
.Lcopy:	ldq	t4,0(t7)	# conditional copy
	ldq	t6,0(a0)
	lda	t7,8(t7)
	lda	a0,8(a0)
	cmoveq	t1,t6,t4	# if (t1 == 0) take rp[i] (= tp[i]-np[i])
	stq	zero,-8(t7)	# zap tp
	cmpult	t7,t12,AT
	stq	t4,-8(a0)
	bne	AT,.Lcopy

	mov	1,v0		# success

.Lexit:
	.set	noreorder
	mov	fp,sp		# pop tp[] and the save frame together
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont

.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro@openssl.org>"
.align	2