// ChaCha20 for AArch64 (GNU as syntax, run through the C preprocessor).
// This part holds the shared constant tables and the scalar routine
// ChaCha20_ctr32, followed by the first instructions of ChaCha20_neon.
#ifndef __KERNEL__
# include "arm_arch.h"
.hidden OPENSSL_armcap_P
#endif
.text
.align 5
// The two quads below are the ASCII text "expand 32-byte k" when stored
// little-endian; they form state words 0-3.
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Per-lane counter offsets, loaded into a vector register by the NEON code.
.Lone:
.long 1,2,3,4
// Byte indices used with tbl by the NEON code to rotate every 32-bit lane
// right by 24 bits (equivalently left by 8).
.Lrot24:
.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align 2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Register usage as read from the code below:
//   x0 = output pointer          (written 64 bytes at a time, then bytewise)
//   x1 = input pointer           (XORed with the generated key stream)
//   x2 = length in bytes         (0 returns immediately via .Labort)
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to 16 bytes of counter block (x4 is reused as loop counter)
//
// Lengths of 192 bytes or more are handed to .LChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P (non-kernel builds only).
// Otherwise the scalar loop below produces one 64-byte block per
// .Loop_outer iteration.
//
// Stack: 96-byte frame for x29/x30 and x19-x28, plus 64 bytes of scratch
// at [sp] used only by the partial-block tail.
//----------------------------------------------------------------------
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
    cbz x2,.Labort
    cmp x2,#192
    b.lo .Lshort
#ifndef __KERNEL__
    adrp x17,OPENSSL_armcap_P
    ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
    tst w17,#ARMV7_NEON
    b.ne .LChaCha20_neon
#endif
.Lshort:
    .inst 0xd503233f // paciasp
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0
    adr x5,.Lsigma
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub sp,sp,#64
    // x22-x28 and x30 keep the 16 input state words, two 32-bit words per
    // 64-bit register, for the whole function.
    ldp x22,x23,[x5] // load sigma
    ldp x24,x25,[x3] // load key
    ldp x26,x27,[x3,#16]
    ldp x28,x30,[x4] // load counter
#ifdef __AARCH64EB__
    // Big-endian builds: swap the two 32-bit halves of each loaded pair.
    ror x24,x24,#32
    ror x25,x25,#32
    ror x26,x26,#32
    ror x27,x27,#32
    ror x28,x28,#32
    ror x30,x30,#32
#endif
.Loop_outer:
    // Working copy of the state, one word per register:
    //   w5-w8 = words 0-3, w9-w12 = words 4-7,
    //   w13-w16 = words 8-11, w17/w19/w20/w21 = words 12-15.
    mov w5,w22 // unpack key block
    lsr x6,x22,#32
    mov w7,w23
    lsr x8,x23,#32
    mov w9,w24
    lsr x10,x24,#32
    mov w11,w25
    lsr x12,x25,#32
    mov w13,w26
    lsr x14,x26,#32
    mov w15,w27
    lsr x16,x27,#32
    mov w17,w28
    lsr x19,x28,#32
    mov w20,w30
    lsr x21,x30,#32
    mov x4,#10 // 10 iterations, each a column pass plus a diagonal pass
    // The flags set here stay live until the b.lo/b.hi further down; no
    // instruction in between writes the condition flags.
    subs x2,x2,#64
.Loop:
    sub x4,x4,#1
    // Column pass on (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15).
    // Rotations are written as ror by 16, 20, 24 and 25.
    add w5,w5,w9
    add w6,w6,w10
    add w7,w7,w11
    add w8,w8,w12
    eor w17,w17,w5
    eor w19,w19,w6
    eor w20,w20,w7
    eor w21,w21,w8
    ror w17,w17,#16
    ror w19,w19,#16
    ror w20,w20,#16
    ror w21,w21,#16
    add w13,w13,w17
    add w14,w14,w19
    add w15,w15,w20
    add w16,w16,w21
    eor w9,w9,w13
    eor w10,w10,w14
    eor w11,w11,w15
    eor w12,w12,w16
    ror w9,w9,#20
    ror w10,w10,#20
    ror w11,w11,#20
    ror w12,w12,#20
    add w5,w5,w9
    add w6,w6,w10
    add w7,w7,w11
    add w8,w8,w12
    eor w17,w17,w5
    eor w19,w19,w6
    eor w20,w20,w7
    eor w21,w21,w8
    ror w17,w17,#24
    ror w19,w19,#24
    ror w20,w20,#24
    ror w21,w21,#24
    add w13,w13,w17
    add w14,w14,w19
    add w15,w15,w20
    add w16,w16,w21
    eor w9,w9,w13
    eor w10,w10,w14
    eor w11,w11,w15
    eor w12,w12,w16
    ror w9,w9,#25
    ror w10,w10,#25
    ror w11,w11,#25
    ror w12,w12,#25
    // Diagonal pass on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
    add w5,w5,w10
    add w6,w6,w11
    add w7,w7,w12
    add w8,w8,w9
    eor w21,w21,w5
    eor w17,w17,w6
    eor w19,w19,w7
    eor w20,w20,w8
    ror w21,w21,#16
    ror w17,w17,#16
    ror w19,w19,#16
    ror w20,w20,#16
    add w15,w15,w21
    add w16,w16,w17
    add w13,w13,w19
    add w14,w14,w20
    eor w10,w10,w15
    eor w11,w11,w16
    eor w12,w12,w13
    eor w9,w9,w14
    ror w10,w10,#20
    ror w11,w11,#20
    ror w12,w12,#20
    ror w9,w9,#20
    add w5,w5,w10
    add w6,w6,w11
    add w7,w7,w12
    add w8,w8,w9
    eor w21,w21,w5
    eor w17,w17,w6
    eor w19,w19,w7
    eor w20,w20,w8
    ror w21,w21,#24
    ror w17,w17,#24
    ror w19,w19,#24
    ror w20,w20,#24
    add w15,w15,w21
    add w16,w16,w17
    add w13,w13,w19
    add w14,w14,w20
    eor w10,w10,w15
    eor w11,w11,w16
    eor w12,w12,w13
    eor w9,w9,w14
    ror w10,w10,#25
    ror w11,w11,#25
    ror w12,w12,#25
    ror w9,w9,#25
    cbnz x4,.Loop

    // Add the input state back in. Even words use a 32-bit add; odd words
    // use a 64-bit add of the shifted-down upper half.
    add w5,w5,w22 // accumulate key block
    add x6,x6,x22,lsr#32
    add w7,w7,w23
    add x8,x8,x23,lsr#32
    add w9,w9,w24
    add x10,x10,x24,lsr#32
    add w11,w11,w25
    add x12,x12,x25,lsr#32
    add w13,w13,w26
    add x14,x14,x26,lsr#32
    add w15,w15,w27
    add x16,x16,x27,lsr#32
    add w17,w17,w28
    add x19,x19,x28,lsr#32
    add w20,w20,w30
    add x21,x21,x30,lsr#32
    // Flags from subs above: fewer than 64 bytes were left, go bytewise.
    b.lo .Ltail

    // Full block: pack word pairs into 64-bit registers while loading
    // 64 bytes of input into the registers that have just been consumed.
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    eor x5,x5,x6
    eor x7,x7,x8
    eor x9,x9,x10
    eor x11,x11,x12
    eor x13,x13,x14
    eor x15,x15,x16
    eor x17,x17,x19
    eor x20,x20,x21
    stp x5,x7,[x0,#0] // store output
    // 64-bit add on the register holding counter-block words 0 and 1.
    add x28,x28,#1 // increment counter
    stp x9,x11,[x0,#16]
    stp x13,x15,[x0,#32]
    stp x17,x20,[x0,#48]
    add x0,x0,#64
    // Flags still from subs: loop while bytes remain after this block.
    b.hi .Loop_outer

    // Exact multiple of 64: the stack scratch was never written, so it is
    // not wiped on this path.
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    .inst 0xd50323bf // autiasp
.Labort:
    ret

.align 4
.Ltail:
    add x2,x2,#64 // undo the subs: x2 = bytes remaining, 1..63
// .Less_than_64 is also branched to from .Ltail_neon with the final
// scalar block still unpacked in w5-w21 and x2 = bytes remaining.
.Less_than_64:
    // Point x1, x0 and x4 at the END of input, output and stack scratch
    // and negate x2, so the byte loop can count up to zero. x0 is biased
    // by -1 because the loop increments x2 before the store.
    sub x0,x0,#1
    add x1,x1,x2
    add x0,x0,x2
    add x4,sp,x2
    neg x2,x2
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    // Spill the 64-byte key stream block to the stack scratch.
    stp x5,x7,[sp,#0]
    stp x9,x11,[sp,#16]
    stp x13,x15,[sp,#32]
    stp x17,x20,[sp,#48]
.Loop_tail:
    ldrb w10,[x1,x2] // input byte
    ldrb w11,[x4,x2] // key stream byte
    add x2,x2,#1
    eor w10,w10,w11
    strb w10,[x0,x2]
    cbnz x2,.Loop_tail
    // Zero the scratch so no key stream bytes are left on the stack.
    stp xzr,xzr,[sp,#0]
    stp xzr,xzr,[sp,#16]
    stp xzr,xzr,[sp,#32]
    stp xzr,xzr,[sp,#48]
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    .inst 0xd50323bf // autiasp
    ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32

// ChaCha20_neon: entered through .LChaCha20_neon from ChaCha20_ctr32 with
// x0-x4 unchanged; exported as a global symbol only in __KERNEL__ builds.
// The rest of the prologue follows immediately after these instructions.
#ifdef __KERNEL__
.globl ChaCha20_neon
#endif
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
.LChaCha20_neon:
    .inst 0xd503233f // paciasp
    stp x29,x30,[sp,#-96]!
// ChaCha20_neon, continued. The function label, paciasp and the push of
// x29/x30 come directly before this point.
//
// Register usage as read from the code below:
//   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
//   x3 = pointer to 32 bytes of key, x4 = pointer to 16-byte counter block.
// Lengths of 512 bytes or more branch to .L512_or_more_neon. Otherwise
// every .Loop_outer_neon iteration produces 320 bytes: one block in the
// scalar registers plus four blocks in NEON registers.
    add x29,sp,#0
    adr x5,.Lsigma
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    cmp x2,#512
    b.hs .L512_or_more_neon
    sub sp,sp,#64
    // Scalar copy of the input state in x22-x28,x30 (two words each);
    // vector copy in v0 (sigma), v1-v2 (key) and v3 (counter block).
    ldp x22,x23,[x5] // load sigma
    ld1 {v0.4s},[x5],#16
    ldp x24,x25,[x3] // load key
    ldp x26,x27,[x3,#16]
    ld1 {v1.4s,v2.4s},[x3]
    ldp x28,x30,[x4] // load counter
    ld1 {v3.4s},[x4]
    stp d8,d9,[sp] // meet ABI requirements
    // x5 now points just past .Lsigma: v8 = .Lone (1,2,3,4), v9 = .Lrot24.
    ld1 {v8.4s,v9.4s},[x5]
#ifdef __AARCH64EB__
    rev64 v0.4s,v0.4s
    ror x24,x24,#32
    ror x25,x25,#32
    ror x26,x26,#32
    ror x27,x27,#32
    ror x28,x28,#32
    ror x30,x30,#32
#endif
.Loop_outer_neon:
    // Vector layout: each register holds ONE state word for four blocks.
    //   v16,v20,v24,v28 = words 0-3     v17,v21,v25,v29 = words 4-7
    //   v18,v22,v26,v30 = words 8-11    v19,v23,v27,v31 = words 12-15
    // Scalar layout: w5-w8, w9-w12, w13-w16, w17/w19/w20/w21 = words 0-15
    // of a fifth block. Vector and scalar instructions are interleaved.
    dup v16.4s,v0.s[0] // unpack key block
    mov w5,w22
    dup v20.4s,v0.s[1]
    lsr x6,x22,#32
    dup v24.4s,v0.s[2]
    mov w7,w23
    dup v28.4s,v0.s[3]
    lsr x8,x23,#32
    dup v17.4s,v1.s[0]
    mov w9,w24
    dup v21.4s,v1.s[1]
    lsr x10,x24,#32
    dup v25.4s,v1.s[2]
    mov w11,w25
    dup v29.4s,v1.s[3]
    lsr x12,x25,#32
    dup v19.4s,v3.s[0]
    mov w13,w26
    dup v23.4s,v3.s[1]
    lsr x14,x26,#32
    dup v27.4s,v3.s[2]
    mov w15,w27
    dup v31.4s,v3.s[3]
    lsr x16,x27,#32
    // Word 12 of the four vector blocks gets the per-lane offsets in v8;
    // the scalar block uses the counter in x28 without an offset.
    add v19.4s,v19.4s,v8.4s
    mov w17,w28
    dup v18.4s,v2.s[0]
    lsr x19,x28,#32
    dup v22.4s,v2.s[1]
    mov w20,w30
    dup v26.4s,v2.s[2]
    lsr x21,x30,#32
    dup v30.4s,v2.s[3]
    mov x4,#10 // 10 iterations, each a column pass plus a diagonal pass
    // Flags set here stay live until b.lo .Ltail_neon / b.hi below.
    subs x2,x2,#320
.Loop_neon:
    sub x4,x4,#1
    // Column pass. Vector rotations: rev32 on halfwords swaps the 16-bit
    // halves of each lane; ushr+sli pairs build the rotation from two
    // shifts; tbl with v9 applies the .Lrot24 byte shuffle.
    add v16.4s,v16.4s,v17.4s
    add w5,w5,w9
    add v20.4s,v20.4s,v21.4s
    add w6,w6,w10
    add v24.4s,v24.4s,v25.4s
    add w7,w7,w11
    add v28.4s,v28.4s,v29.4s
    add w8,w8,w12
    eor v19.16b,v19.16b,v16.16b
    eor w17,w17,w5
    eor v23.16b,v23.16b,v20.16b
    eor w19,w19,w6
    eor v27.16b,v27.16b,v24.16b
    eor w20,w20,w7
    eor v31.16b,v31.16b,v28.16b
    eor w21,w21,w8
    rev32 v19.8h,v19.8h
    ror w17,w17,#16
    rev32 v23.8h,v23.8h
    ror w19,w19,#16
    rev32 v27.8h,v27.8h
    ror w20,w20,#16
    rev32 v31.8h,v31.8h
    ror w21,w21,#16
    add v18.4s,v18.4s,v19.4s
    add w13,w13,w17
    add v22.4s,v22.4s,v23.4s
    add w14,w14,w19
    add v26.4s,v26.4s,v27.4s
    add w15,w15,w20
    add v30.4s,v30.4s,v31.4s
    add w16,w16,w21
    eor v4.16b,v17.16b,v18.16b
    eor w9,w9,w13
    eor v5.16b,v21.16b,v22.16b
    eor w10,w10,w14
    eor v6.16b,v25.16b,v26.16b
    eor w11,w11,w15
    eor v7.16b,v29.16b,v30.16b
    eor w12,w12,w16
    ushr v17.4s,v4.4s,#20
    ror w9,w9,#20
    ushr v21.4s,v5.4s,#20
    ror w10,w10,#20
    ushr v25.4s,v6.4s,#20
    ror w11,w11,#20
    ushr v29.4s,v7.4s,#20
    ror w12,w12,#20
    sli v17.4s,v4.4s,#12
    add w5,w5,w9
    sli v21.4s,v5.4s,#12
    add w6,w6,w10
    sli v25.4s,v6.4s,#12
    add w7,w7,w11
    sli v29.4s,v7.4s,#12
    add w8,w8,w12
    add v16.4s,v16.4s,v17.4s
    eor w17,w17,w5
    add v20.4s,v20.4s,v21.4s
    eor w19,w19,w6
    add v24.4s,v24.4s,v25.4s
    eor w20,w20,w7
    add v28.4s,v28.4s,v29.4s
    eor w21,w21,w8
    eor v4.16b,v19.16b,v16.16b
    ror w17,w17,#24
    eor v5.16b,v23.16b,v20.16b
    ror w19,w19,#24
    eor v6.16b,v27.16b,v24.16b
    ror w20,w20,#24
    eor v7.16b,v31.16b,v28.16b
    ror w21,w21,#24
    tbl v19.16b,{v4.16b},v9.16b
    add w13,w13,w17
    tbl v23.16b,{v5.16b},v9.16b
    add w14,w14,w19
    tbl v27.16b,{v6.16b},v9.16b
    add w15,w15,w20
    tbl v31.16b,{v7.16b},v9.16b
    add w16,w16,w21
    add v18.4s,v18.4s,v19.4s
    eor w9,w9,w13
    add v22.4s,v22.4s,v23.4s
    eor w10,w10,w14
    add v26.4s,v26.4s,v27.4s
    eor w11,w11,w15
    add v30.4s,v30.4s,v31.4s
    eor w12,w12,w16
    eor v4.16b,v17.16b,v18.16b
    ror w9,w9,#25
    eor v5.16b,v21.16b,v22.16b
    ror w10,w10,#25
    eor v6.16b,v25.16b,v26.16b
    ror w11,w11,#25
    eor v7.16b,v29.16b,v30.16b
    ror w12,w12,#25
    ushr v17.4s,v4.4s,#25
    ushr v21.4s,v5.4s,#25
    ushr v25.4s,v6.4s,#25
    ushr v29.4s,v7.4s,#25
    sli v17.4s,v4.4s,#7
    sli v21.4s,v5.4s,#7
    sli v25.4s,v6.4s,#7
    sli v29.4s,v7.4s,#7
    // Diagonal pass: same operations with the word 4-7, 8-11 and 12-15
    // registers paired one, two and three positions further round.
    add v16.4s,v16.4s,v21.4s
    add w5,w5,w10
    add v20.4s,v20.4s,v25.4s
    add w6,w6,w11
    add v24.4s,v24.4s,v29.4s
    add w7,w7,w12
    add v28.4s,v28.4s,v17.4s
    add w8,w8,w9
    eor v31.16b,v31.16b,v16.16b
    eor w21,w21,w5
    eor v19.16b,v19.16b,v20.16b
    eor w17,w17,w6
    eor v23.16b,v23.16b,v24.16b
    eor w19,w19,w7
    eor v27.16b,v27.16b,v28.16b
    eor w20,w20,w8
    rev32 v31.8h,v31.8h
    ror w21,w21,#16
    rev32 v19.8h,v19.8h
    ror w17,w17,#16
    rev32 v23.8h,v23.8h
    ror w19,w19,#16
    rev32 v27.8h,v27.8h
    ror w20,w20,#16
    add v26.4s,v26.4s,v31.4s
    add w15,w15,w21
    add v30.4s,v30.4s,v19.4s
    add w16,w16,w17
    add v18.4s,v18.4s,v23.4s
    add w13,w13,w19
    add v22.4s,v22.4s,v27.4s
    add w14,w14,w20
    eor v4.16b,v21.16b,v26.16b
    eor w10,w10,w15
    eor v5.16b,v25.16b,v30.16b
    eor w11,w11,w16
    eor v6.16b,v29.16b,v18.16b
    eor w12,w12,w13
    eor v7.16b,v17.16b,v22.16b
    eor w9,w9,w14
    ushr v21.4s,v4.4s,#20
    ror w10,w10,#20
    ushr v25.4s,v5.4s,#20
    ror w11,w11,#20
    ushr v29.4s,v6.4s,#20
    ror w12,w12,#20
    ushr v17.4s,v7.4s,#20
    ror w9,w9,#20
    sli v21.4s,v4.4s,#12
    add w5,w5,w10
    sli v25.4s,v5.4s,#12
    add w6,w6,w11
    sli v29.4s,v6.4s,#12
    add w7,w7,w12
    sli v17.4s,v7.4s,#12
    add w8,w8,w9
    add v16.4s,v16.4s,v21.4s
    eor w21,w21,w5
    add v20.4s,v20.4s,v25.4s
    eor w17,w17,w6
    add v24.4s,v24.4s,v29.4s
    eor w19,w19,w7
    add v28.4s,v28.4s,v17.4s
    eor w20,w20,w8
    eor v4.16b,v31.16b,v16.16b
    ror w21,w21,#24
    eor v5.16b,v19.16b,v20.16b
    ror w17,w17,#24
    eor v6.16b,v23.16b,v24.16b
    ror w19,w19,#24
    eor v7.16b,v27.16b,v28.16b
    ror w20,w20,#24
    tbl v31.16b,{v4.16b},v9.16b
    add w15,w15,w21
    tbl v19.16b,{v5.16b},v9.16b
    add w16,w16,w17
    tbl v23.16b,{v6.16b},v9.16b
    add w13,w13,w19
    tbl v27.16b,{v7.16b},v9.16b
    add w14,w14,w20
    add v26.4s,v26.4s,v31.4s
    eor w10,w10,w15
    add v30.4s,v30.4s,v19.4s
    eor w11,w11,w16
    add v18.4s,v18.4s,v23.4s
    eor w12,w12,w13
    add v22.4s,v22.4s,v27.4s
    eor w9,w9,w14
    eor v4.16b,v21.16b,v26.16b
    ror w10,w10,#25
    eor v5.16b,v25.16b,v30.16b
    ror w11,w11,#25
    eor v6.16b,v29.16b,v18.16b
    ror w12,w12,#25
    eor v7.16b,v17.16b,v22.16b
    ror w9,w9,#25
    ushr v21.4s,v4.4s,#25
    ushr v25.4s,v5.4s,#25
    ushr v29.4s,v6.4s,#25
    ushr v17.4s,v7.4s,#25
    sli v21.4s,v4.4s,#7
    sli v25.4s,v5.4s,#7
    sli v29.4s,v6.4s,#7
    sli v17.4s,v7.4s,#7
    cbnz x4,.Loop_neon

    // Re-add the per-lane counter offsets to word 12 while the data is
    // still in word-per-register form; the base counter in v3 is added
    // after the transpose.
    add v19.4s,v19.4s,v8.4s
    // 4x4 transposes (zip on 32-bit lanes, then on 64-bit lanes) turn the
    // word-per-register layout into block-per-register-group layout:
    // v16-v19 = first vector block, v20-v23, v24-v27, v28-v31 = the rest.
    zip1 v4.4s,v16.4s,v20.4s // transpose data
    zip1 v5.4s,v24.4s,v28.4s
    zip2 v6.4s,v16.4s,v20.4s
    zip2 v7.4s,v24.4s,v28.4s
    zip1 v16.2d,v4.2d,v5.2d
    zip2 v20.2d,v4.2d,v5.2d
    zip1 v24.2d,v6.2d,v7.2d
    zip2 v28.2d,v6.2d,v7.2d
    zip1 v4.4s,v17.4s,v21.4s
    zip1 v5.4s,v25.4s,v29.4s
    zip2 v6.4s,v17.4s,v21.4s
    zip2 v7.4s,v25.4s,v29.4s
    zip1 v17.2d,v4.2d,v5.2d
    zip2 v21.2d,v4.2d,v5.2d
    zip1 v25.2d,v6.2d,v7.2d
    zip2 v29.2d,v6.2d,v7.2d
    zip1 v4.4s,v18.4s,v22.4s
    add w5,w5,w22 // accumulate key block
    zip1 v5.4s,v26.4s,v30.4s
    add x6,x6,x22,lsr#32
    zip2 v6.4s,v18.4s,v22.4s
    add w7,w7,w23
    zip2 v7.4s,v26.4s,v30.4s
    add x8,x8,x23,lsr#32
    zip1 v18.2d,v4.2d,v5.2d
    add w9,w9,w24
    zip2 v22.2d,v4.2d,v5.2d
    add x10,x10,x24,lsr#32
    zip1 v26.2d,v6.2d,v7.2d
    add w11,w11,w25
    zip2 v30.2d,v6.2d,v7.2d
    add x12,x12,x25,lsr#32
    zip1 v4.4s,v19.4s,v23.4s
    add w13,w13,w26
    zip1 v5.4s,v27.4s,v31.4s
    add x14,x14,x26,lsr#32
    zip2 v6.4s,v19.4s,v23.4s
    add w15,w15,w27
    zip2 v7.4s,v27.4s,v31.4s
    add x16,x16,x27,lsr#32
    zip1 v19.2d,v4.2d,v5.2d
    add w17,w17,w28
    zip2 v23.2d,v4.2d,v5.2d
    add x19,x19,x28,lsr#32
    zip1 v27.2d,v6.2d,v7.2d
    add w20,w20,w30
    zip2 v31.2d,v6.2d,v7.2d
    add x21,x21,x30,lsr#32
    // Flags from subs x2,x2,#320: fewer than 320 bytes were left.
    b.lo .Ltail_neon

    // Full 320 bytes: the scalar block is written first, then the four
    // vector blocks, each XORed with the matching 64 bytes of input.
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add v16.4s,v16.4s,v0.4s // accumulate key block
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add v17.4s,v17.4s,v1.4s
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add v18.4s,v18.4s,v2.4s
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add v19.4s,v19.4s,v3.4s
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
    eor x5,x5,x6
    add v20.4s,v20.4s,v0.4s
    eor x7,x7,x8
    add v21.4s,v21.4s,v1.4s
    eor x9,x9,x10
    add v22.4s,v22.4s,v2.4s
    eor x11,x11,x12
    add v23.4s,v23.4s,v3.4s
    eor x13,x13,x14
    eor v16.16b,v16.16b,v4.16b
    movi v4.4s,#5
    eor x15,x15,x16
    eor v17.16b,v17.16b,v5.16b
    eor x17,x17,x19
    eor v18.16b,v18.16b,v6.16b
    eor x20,x20,x21
    eor v19.16b,v19.16b,v7.16b
    // Advance both counters by the five blocks just produced.
    add v8.4s,v8.4s,v4.4s // += 5
    ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
    stp x5,x7,[x0,#0] // store output
    add x28,x28,#5 // increment counter
    stp x9,x11,[x0,#16]
    stp x13,x15,[x0,#32]
    stp x17,x20,[x0,#48]
    add x0,x0,#64
    st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
    add v24.4s,v24.4s,v0.4s
    add v25.4s,v25.4s,v1.4s
    add v26.4s,v26.4s,v2.4s
    add v27.4s,v27.4s,v3.4s
    ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
    eor v20.16b,v20.16b,v4.16b
    eor v21.16b,v21.16b,v5.16b
    eor v22.16b,v22.16b,v6.16b
    eor v23.16b,v23.16b,v7.16b
    st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
    add v28.4s,v28.4s,v0.4s
    add v29.4s,v29.4s,v1.4s
    add v30.4s,v30.4s,v2.4s
    add v31.4s,v31.4s,v3.4s
    ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
    eor v24.16b,v24.16b,v16.16b
    eor v25.16b,v25.16b,v17.16b
    eor v26.16b,v26.16b,v18.16b
    eor v27.16b,v27.16b,v19.16b
    st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
    eor v28.16b,v28.16b,v20.16b
    eor v29.16b,v29.16b,v21.16b
    eor v30.16b,v30.16b,v22.16b
    eor v31.16b,v31.16b,v23.16b
    st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
    // Flags still from subs x2,x2,#320: loop while bytes remain.
    b.hi .Loop_outer_neon

    ldp d8,d9,[sp] // meet ABI requirements
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    .inst 0xd50323bf // autiasp
    ret

.align 4
.Ltail_neon:
    add x2,x2,#320 // undo the subs: x2 = bytes remaining, 1..319
    // d8/d9 are restored now because the scratch at [sp] that holds them
    // may be overwritten with key stream by the tail code.
    ldp d8,d9,[sp] // meet ABI requirements
    cmp x2,#64
    // Under 64 bytes: finish with the scalar block through the
    // .Less_than_64 label, which is not defined in this function.
    b.lo .Less_than_64
    add x5,x5,x6,lsl#32 // pack
    add x7,x7,x8,lsl#32
    ldp x6,x8,[x1,#0] // load input
    add x9,x9,x10,lsl#32
    add x11,x11,x12,lsl#32
    ldp x10,x12,[x1,#16]
    add x13,x13,x14,lsl#32
    add x15,x15,x16,lsl#32
    ldp x14,x16,[x1,#32]
    add x17,x17,x19,lsl#32
    add x20,x20,x21,lsl#32
    ldp x19,x21,[x1,#48]
    add x1,x1,#64
#ifdef __AARCH64EB__
    rev x5,x5
    rev x7,x7
    rev x9,x9
    rev x11,x11
    rev x13,x13
    rev x15,x15
    rev x17,x17
    rev x20,x20
#endif
    eor x5,x5,x6
    eor x7,x7,x8
    eor x9,x9,x10
    eor x11,x11,x12
    eor x13,x13,x14
    eor x15,x15,x16
    eor x17,x17,x19
    eor x20,x20,x21
    stp x5,x7,[x0,#0] // store output
    add v16.4s,v16.4s,v0.4s // accumulate key block
    stp x9,x11,[x0,#16]
    add v17.4s,v17.4s,v1.4s
    stp x13,x15,[x0,#32]
    add v18.4s,v18.4s,v2.4s
    stp x17,x20,[x0,#48]
    add v19.4s,v19.4s,v3.4s
    add x0,x0,#64
    // Each b.eq below tests the most recent cmp x2,#64: exactly 64 bytes
    // were left before the block just written, so nothing remains.
    b.eq .Ldone_neon
    sub x2,x2,#64
    cmp x2,#64
    // At every b.lo .Last_neon, v16-v19 hold the finished key stream of
    // the next unused vector block.
    b.lo .Last_neon
    ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
    eor v16.16b,v16.16b,v4.16b
    eor v17.16b,v17.16b,v5.16b
    eor v18.16b,v18.16b,v6.16b
    eor v19.16b,v19.16b,v7.16b
    st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
    b.eq .Ldone_neon
    add v16.4s,v20.4s,v0.4s
    add v17.4s,v21.4s,v1.4s
    sub x2,x2,#64
    add v18.4s,v22.4s,v2.4s
    cmp x2,#64
    add v19.4s,v23.4s,v3.4s
    b.lo .Last_neon
    ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
    eor v20.16b,v16.16b,v4.16b
    eor v21.16b,v17.16b,v5.16b
    eor v22.16b,v18.16b,v6.16b
    eor v23.16b,v19.16b,v7.16b
    st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
    b.eq .Ldone_neon
    add v16.4s,v24.4s,v0.4s
    add v17.4s,v25.4s,v1.4s
    sub x2,x2,#64
    add v18.4s,v26.4s,v2.4s
    cmp x2,#64
    add v19.4s,v27.4s,v3.4s
    b.lo .Last_neon
    ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
    eor v24.16b,v16.16b,v4.16b
    eor v25.16b,v17.16b,v5.16b
    eor v26.16b,v18.16b,v6.16b
    eor v27.16b,v19.16b,v7.16b
    st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
    b.eq .Ldone_neon
    add v16.4s,v28.4s,v0.4s
    add v17.4s,v29.4s,v1.4s
    add v18.4s,v30.4s,v2.4s
    add v19.4s,v31.4s,v3.4s
    sub x2,x2,#64
.Last_neon:
    // Spill the last key stream block to the stack scratch and XOR the
    // remaining x2 bytes one at a time.
    st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
    // Pointers are moved to the END of each buffer and x2 is negated so
    // the loop counts up to zero; x0 is biased by -1 because x2 is
    // incremented before the store.
    sub x0,x0,#1
    add x1,x1,x2
    add x0,x0,x2
    add x4,sp,x2
    neg x2,x2
.Loop_tail_neon:
    ldrb w10,[x1,x2] // input byte
    ldrb w11,[x4,x2] // key stream byte
    add x2,x2,#1
    eor w10,w10,w11
    strb w10,[x0,x2]
    cbnz x2,.Loop_tail_neon
    // Zero the scratch so no key stream bytes are left on the stack.
    stp xzr,xzr,[sp,#0]
    stp xzr,xzr,[sp,#16]
    stp xzr,xzr,[sp,#32]
    stp xzr,xzr,[sp,#48]
.Ldone_neon:
    ldp x19,x20,[x29,#16]
    add sp,sp,#64
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    .inst 0xd50323bf // autiasp
    ret
.size ChaCha20_neon,.-ChaCha20_neon

// ChaCha20_512_neon: only its label and first two instructions sit here;
// the rest of the routine follows immediately after.
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
    .inst 0xd503233f // paciasp
    stp x29,x30,[sp,#-96]!
add x29,sp,#0 adr x5,.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] .L512_or_more_neon: sub sp,sp,#128+64 eor v7.16b,v7.16b,v7.16b ldp x22,x23,[x5] // load sigma ld1 {v0.4s},[x5],#16 ldp x24,x25,[x3] // load key ldp x26,x27,[x3,#16] ld1 {v1.4s,v2.4s},[x3] ldp x28,x30,[x4] // load counter ld1 {v3.4s},[x4] ld1 {v7.s}[0],[x5] add x3,x5,#16 // .Lrot24 #ifdef __AARCH64EB__ rev64 v0.4s,v0.4s ror x24,x24,#32 ror x25,x25,#32 ror x26,x26,#32 ror x27,x27,#32 ror x28,x28,#32 ror x30,x30,#32 #endif add v3.4s,v3.4s,v7.4s // += 1 stp q0,q1,[sp,#0] // off-load key block, invariant part add v3.4s,v3.4s,v7.4s // not typo str q2,[sp,#32] add v4.4s,v3.4s,v7.4s add v5.4s,v4.4s,v7.4s add v6.4s,v5.4s,v7.4s shl v7.4s,v7.4s,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub x2,x2,#512 // not typo .Loop_outer_512_neon: mov v8.16b,v0.16b mov v12.16b,v0.16b mov v16.16b,v0.16b mov v20.16b,v0.16b mov v24.16b,v0.16b mov v28.16b,v0.16b mov v9.16b,v1.16b mov w5,w22 // unpack key block mov v13.16b,v1.16b lsr x6,x22,#32 mov v17.16b,v1.16b mov w7,w23 mov v21.16b,v1.16b lsr x8,x23,#32 mov v25.16b,v1.16b mov w9,w24 mov v29.16b,v1.16b lsr x10,x24,#32 mov v11.16b,v3.16b mov w11,w25 mov v15.16b,v4.16b lsr x12,x25,#32 mov v19.16b,v5.16b mov w13,w26 mov v23.16b,v6.16b lsr x14,x26,#32 mov v10.16b,v2.16b mov w15,w27 mov v14.16b,v2.16b lsr x16,x27,#32 add v27.4s,v11.4s,v7.4s // +4 mov w17,w28 add v31.4s,v15.4s,v7.4s // +4 lsr x19,x28,#32 mov v18.16b,v2.16b mov w20,w30 mov v22.16b,v2.16b lsr x21,x30,#32 mov v26.16b,v2.16b stp q3,q4,[sp,#48] // off-load key block, variable part mov v30.16b,v2.16b stp q5,q6,[sp,#80] mov x4,#5 ld1 {v6.4s},[x3] subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 add v8.4s,v8.4s,v9.4s add w5,w5,w9 add v12.4s,v12.4s,v13.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 add v20.4s,v20.4s,v21.4s add w8,w8,w12 add 
v24.4s,v24.4s,v25.4s eor w17,w17,w5 add v28.4s,v28.4s,v29.4s eor w19,w19,w6 eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 rev32 v11.8h,v11.8h add w13,w13,w17 rev32 v15.8h,v15.8h add w14,w14,w19 rev32 v19.8h,v19.8h add w15,w15,w20 rev32 v23.8h,v23.8h add w16,w16,w21 rev32 v27.8h,v27.8h eor w9,w9,w13 rev32 v31.8h,v31.8h eor w10,w10,w14 add v10.4s,v10.4s,v11.4s eor w11,w11,w15 add v14.4s,v14.4s,v15.4s eor w12,w12,w16 add v18.4s,v18.4s,v19.4s ror w9,w9,#20 add v22.4s,v22.4s,v23.4s ror w10,w10,#20 add v26.4s,v26.4s,v27.4s ror w11,w11,#20 add v30.4s,v30.4s,v31.4s ror w12,w12,#20 eor v0.16b,v9.16b,v10.16b add w5,w5,w9 eor v1.16b,v13.16b,v14.16b add w6,w6,w10 eor v2.16b,v17.16b,v18.16b add w7,w7,w11 eor v3.16b,v21.16b,v22.16b add w8,w8,w12 eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 ushr v9.4s,v0.4s,#20 eor w20,w20,w7 ushr v13.4s,v1.4s,#20 eor w21,w21,w8 ushr v17.4s,v2.4s,#20 ror w17,w17,#24 ushr v21.4s,v3.4s,#20 ror w19,w19,#24 ushr v25.4s,v4.4s,#20 ror w20,w20,#24 ushr v29.4s,v5.4s,#20 ror w21,w21,#24 sli v9.4s,v0.4s,#12 add w13,w13,w17 sli v13.4s,v1.4s,#12 add w14,w14,w19 sli v17.4s,v2.4s,#12 add w15,w15,w20 sli v21.4s,v3.4s,#12 add w16,w16,w21 sli v25.4s,v4.4s,#12 eor w9,w9,w13 sli v29.4s,v5.4s,#12 eor w10,w10,w14 add v8.4s,v8.4s,v9.4s eor w11,w11,w15 add v12.4s,v12.4s,v13.4s eor w12,w12,w16 add v16.4s,v16.4s,v17.4s ror w9,w9,#25 add v20.4s,v20.4s,v21.4s ror w10,w10,#25 add v24.4s,v24.4s,v25.4s ror w11,w11,#25 add v28.4s,v28.4s,v29.4s ror w12,w12,#25 eor v11.16b,v11.16b,v8.16b add w5,w5,w10 eor v15.16b,v15.16b,v12.16b add w6,w6,w11 eor v19.16b,v19.16b,v16.16b add w7,w7,w12 eor v23.16b,v23.16b,v20.16b add w8,w8,w9 eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 tbl 
v11.16b,{v11.16b},v6.16b eor w19,w19,w7 tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 add v10.4s,v10.4s,v11.4s add w15,w15,w21 add v14.4s,v14.4s,v15.4s add w16,w16,w17 add v18.4s,v18.4s,v19.4s add w13,w13,w19 add v22.4s,v22.4s,v23.4s add w14,w14,w20 add v26.4s,v26.4s,v27.4s eor w10,w10,w15 add v30.4s,v30.4s,v31.4s eor w11,w11,w16 eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 ushr v9.4s,v0.4s,#25 add w5,w5,w10 ushr v13.4s,v1.4s,#25 add w6,w6,w11 ushr v17.4s,v2.4s,#25 add w7,w7,w12 ushr v21.4s,v3.4s,#25 add w8,w8,w9 ushr v25.4s,v4.4s,#25 eor w21,w21,w5 ushr v29.4s,v5.4s,#25 eor w17,w17,w6 sli v9.4s,v0.4s,#7 eor w19,w19,w7 sli v13.4s,v1.4s,#7 eor w20,w20,w8 sli v17.4s,v2.4s,#7 ror w21,w21,#24 sli v21.4s,v3.4s,#7 ror w17,w17,#24 sli v25.4s,v4.4s,#7 ror w19,w19,#24 sli v29.4s,v5.4s,#7 ror w20,w20,#24 ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#12 eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#12 eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#12 ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#12 ror w11,w11,#25 ext v27.16b,v27.16b,v27.16b,#12 ror w12,w12,#25 ext v31.16b,v31.16b,v31.16b,#12 ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 ext v25.16b,v25.16b,v25.16b,#4 ext v29.16b,v29.16b,v29.16b,#4 add v8.4s,v8.4s,v9.4s add w5,w5,w9 add 
v12.4s,v12.4s,v13.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 add v20.4s,v20.4s,v21.4s add w8,w8,w12 add v24.4s,v24.4s,v25.4s eor w17,w17,w5 add v28.4s,v28.4s,v29.4s eor w19,w19,w6 eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 rev32 v11.8h,v11.8h add w13,w13,w17 rev32 v15.8h,v15.8h add w14,w14,w19 rev32 v19.8h,v19.8h add w15,w15,w20 rev32 v23.8h,v23.8h add w16,w16,w21 rev32 v27.8h,v27.8h eor w9,w9,w13 rev32 v31.8h,v31.8h eor w10,w10,w14 add v10.4s,v10.4s,v11.4s eor w11,w11,w15 add v14.4s,v14.4s,v15.4s eor w12,w12,w16 add v18.4s,v18.4s,v19.4s ror w9,w9,#20 add v22.4s,v22.4s,v23.4s ror w10,w10,#20 add v26.4s,v26.4s,v27.4s ror w11,w11,#20 add v30.4s,v30.4s,v31.4s ror w12,w12,#20 eor v0.16b,v9.16b,v10.16b add w5,w5,w9 eor v1.16b,v13.16b,v14.16b add w6,w6,w10 eor v2.16b,v17.16b,v18.16b add w7,w7,w11 eor v3.16b,v21.16b,v22.16b add w8,w8,w12 eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 ushr v9.4s,v0.4s,#20 eor w20,w20,w7 ushr v13.4s,v1.4s,#20 eor w21,w21,w8 ushr v17.4s,v2.4s,#20 ror w17,w17,#24 ushr v21.4s,v3.4s,#20 ror w19,w19,#24 ushr v25.4s,v4.4s,#20 ror w20,w20,#24 ushr v29.4s,v5.4s,#20 ror w21,w21,#24 sli v9.4s,v0.4s,#12 add w13,w13,w17 sli v13.4s,v1.4s,#12 add w14,w14,w19 sli v17.4s,v2.4s,#12 add w15,w15,w20 sli v21.4s,v3.4s,#12 add w16,w16,w21 sli v25.4s,v4.4s,#12 eor w9,w9,w13 sli v29.4s,v5.4s,#12 eor w10,w10,w14 add v8.4s,v8.4s,v9.4s eor w11,w11,w15 add v12.4s,v12.4s,v13.4s eor w12,w12,w16 add v16.4s,v16.4s,v17.4s ror w9,w9,#25 add v20.4s,v20.4s,v21.4s ror w10,w10,#25 add v24.4s,v24.4s,v25.4s ror w11,w11,#25 add v28.4s,v28.4s,v29.4s ror w12,w12,#25 eor v11.16b,v11.16b,v8.16b add w5,w5,w10 eor v15.16b,v15.16b,v12.16b add w6,w6,w11 eor v19.16b,v19.16b,v16.16b add w7,w7,w12 eor v23.16b,v23.16b,v20.16b add 
w8,w8,w9 eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 add v10.4s,v10.4s,v11.4s add w15,w15,w21 add v14.4s,v14.4s,v15.4s add w16,w16,w17 add v18.4s,v18.4s,v19.4s add w13,w13,w19 add v22.4s,v22.4s,v23.4s add w14,w14,w20 add v26.4s,v26.4s,v27.4s eor w10,w10,w15 add v30.4s,v30.4s,v31.4s eor w11,w11,w16 eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 ushr v9.4s,v0.4s,#25 add w5,w5,w10 ushr v13.4s,v1.4s,#25 add w6,w6,w11 ushr v17.4s,v2.4s,#25 add w7,w7,w12 ushr v21.4s,v3.4s,#25 add w8,w8,w9 ushr v25.4s,v4.4s,#25 eor w21,w21,w5 ushr v29.4s,v5.4s,#25 eor w17,w17,w6 sli v9.4s,v0.4s,#7 eor w19,w19,w7 sli v13.4s,v1.4s,#7 eor w20,w20,w8 sli v17.4s,v2.4s,#7 ror w21,w21,#24 sli v21.4s,v3.4s,#7 ror w17,w17,#24 sli v25.4s,v4.4s,#7 ror w19,w19,#24 sli v29.4s,v5.4s,#7 ror w20,w20,#24 ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#4 eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#4 eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#4 ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#4 ror w11,w11,#25 ext v27.16b,v27.16b,v27.16b,#4 ror w12,w12,#25 ext v31.16b,v31.16b,v31.16b,#4 ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 ext 
v25.16b,v25.16b,v25.16b,#12 ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_upper_neon add w5,w5,w22 // accumulate key block add x6,x6,x22,lsr#32 add w7,w7,w23 add x8,x8,x23,lsr#32 add w9,w9,w24 add x10,x10,x24,lsr#32 add w11,w11,w25 add x12,x12,x25,lsr#32 add w13,w13,w26 add x14,x14,x26,lsr#32 add w15,w15,w27 add x16,x16,x27,lsr#32 add w17,w17,w28 add x19,x19,x28,lsr#32 add w20,w20,w30 add x21,x21,x30,lsr#32 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 ldp x6,x8,[x1,#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 ldp x10,x12,[x1,#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 ldp x14,x16,[x1,#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 ldp x19,x21,[x1,#48] add x1,x1,#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor x15,x15,x16 eor x17,x17,x19 eor x20,x20,x21 stp x5,x7,[x0,#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 stp x9,x11,[x0,#16] mov w7,w23 lsr x8,x23,#32 stp x13,x15,[x0,#32] mov w9,w24 lsr x10,x24,#32 stp x17,x20,[x0,#48] add x0,x0,#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 lsr x14,x26,#32 mov w15,w27 lsr x16,x27,#32 mov w17,w28 lsr x19,x28,#32 mov w20,w30 lsr x21,x30,#32 mov x4,#5 .Loop_lower_neon: sub x4,x4,#1 add v8.4s,v8.4s,v9.4s add w5,w5,w9 add v12.4s,v12.4s,v13.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 add v20.4s,v20.4s,v21.4s add w8,w8,w12 add v24.4s,v24.4s,v25.4s eor w17,w17,w5 add v28.4s,v28.4s,v29.4s eor w19,w19,w6 eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 rev32 v11.8h,v11.8h add w13,w13,w17 rev32 v15.8h,v15.8h add w14,w14,w19 rev32 v19.8h,v19.8h add w15,w15,w20 rev32 v23.8h,v23.8h add 
w16,w16,w21 rev32 v27.8h,v27.8h eor w9,w9,w13 rev32 v31.8h,v31.8h eor w10,w10,w14 add v10.4s,v10.4s,v11.4s eor w11,w11,w15 add v14.4s,v14.4s,v15.4s eor w12,w12,w16 add v18.4s,v18.4s,v19.4s ror w9,w9,#20 add v22.4s,v22.4s,v23.4s ror w10,w10,#20 add v26.4s,v26.4s,v27.4s ror w11,w11,#20 add v30.4s,v30.4s,v31.4s ror w12,w12,#20 eor v0.16b,v9.16b,v10.16b add w5,w5,w9 eor v1.16b,v13.16b,v14.16b add w6,w6,w10 eor v2.16b,v17.16b,v18.16b add w7,w7,w11 eor v3.16b,v21.16b,v22.16b add w8,w8,w12 eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 ushr v9.4s,v0.4s,#20 eor w20,w20,w7 ushr v13.4s,v1.4s,#20 eor w21,w21,w8 ushr v17.4s,v2.4s,#20 ror w17,w17,#24 ushr v21.4s,v3.4s,#20 ror w19,w19,#24 ushr v25.4s,v4.4s,#20 ror w20,w20,#24 ushr v29.4s,v5.4s,#20 ror w21,w21,#24 sli v9.4s,v0.4s,#12 add w13,w13,w17 sli v13.4s,v1.4s,#12 add w14,w14,w19 sli v17.4s,v2.4s,#12 add w15,w15,w20 sli v21.4s,v3.4s,#12 add w16,w16,w21 sli v25.4s,v4.4s,#12 eor w9,w9,w13 sli v29.4s,v5.4s,#12 eor w10,w10,w14 add v8.4s,v8.4s,v9.4s eor w11,w11,w15 add v12.4s,v12.4s,v13.4s eor w12,w12,w16 add v16.4s,v16.4s,v17.4s ror w9,w9,#25 add v20.4s,v20.4s,v21.4s ror w10,w10,#25 add v24.4s,v24.4s,v25.4s ror w11,w11,#25 add v28.4s,v28.4s,v29.4s ror w12,w12,#25 eor v11.16b,v11.16b,v8.16b add w5,w5,w10 eor v15.16b,v15.16b,v12.16b add w6,w6,w11 eor v19.16b,v19.16b,v16.16b add w7,w7,w12 eor v23.16b,v23.16b,v20.16b add w8,w8,w9 eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 add v10.4s,v10.4s,v11.4s add w15,w15,w21 add v14.4s,v14.4s,v15.4s add w16,w16,w17 add v18.4s,v18.4s,v19.4s add w13,w13,w19 add v22.4s,v22.4s,v23.4s add w14,w14,w20 add v26.4s,v26.4s,v27.4s eor w10,w10,w15 add 
v30.4s,v30.4s,v31.4s eor w11,w11,w16 eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 ushr v9.4s,v0.4s,#25 add w5,w5,w10 ushr v13.4s,v1.4s,#25 add w6,w6,w11 ushr v17.4s,v2.4s,#25 add w7,w7,w12 ushr v21.4s,v3.4s,#25 add w8,w8,w9 ushr v25.4s,v4.4s,#25 eor w21,w21,w5 ushr v29.4s,v5.4s,#25 eor w17,w17,w6 sli v9.4s,v0.4s,#7 eor w19,w19,w7 sli v13.4s,v1.4s,#7 eor w20,w20,w8 sli v17.4s,v2.4s,#7 ror w21,w21,#24 sli v21.4s,v3.4s,#7 ror w17,w17,#24 sli v25.4s,v4.4s,#7 ror w19,w19,#24 sli v29.4s,v5.4s,#7 ror w20,w20,#24 ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#12 eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#12 eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#12 ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#12 ror w11,w11,#25 ext v27.16b,v27.16b,v27.16b,#12 ror w12,w12,#25 ext v31.16b,v31.16b,v31.16b,#12 ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#4 ext v13.16b,v13.16b,v13.16b,#4 ext v17.16b,v17.16b,v17.16b,#4 ext v21.16b,v21.16b,v21.16b,#4 ext v25.16b,v25.16b,v25.16b,#4 ext v29.16b,v29.16b,v29.16b,#4 add v8.4s,v8.4s,v9.4s add w5,w5,w9 add v12.4s,v12.4s,v13.4s add w6,w6,w10 add v16.4s,v16.4s,v17.4s add w7,w7,w11 add v20.4s,v20.4s,v21.4s add w8,w8,w12 add v24.4s,v24.4s,v25.4s eor w17,w17,w5 add v28.4s,v28.4s,v29.4s eor w19,w19,w6 eor v11.16b,v11.16b,v8.16b eor w20,w20,w7 eor v15.16b,v15.16b,v12.16b eor w21,w21,w8 eor v19.16b,v19.16b,v16.16b ror w17,w17,#16 eor v23.16b,v23.16b,v20.16b ror w19,w19,#16 eor v27.16b,v27.16b,v24.16b ror w20,w20,#16 eor v31.16b,v31.16b,v28.16b ror w21,w21,#16 rev32 v11.8h,v11.8h add w13,w13,w17 
rev32 v15.8h,v15.8h add w14,w14,w19 rev32 v19.8h,v19.8h add w15,w15,w20 rev32 v23.8h,v23.8h add w16,w16,w21 rev32 v27.8h,v27.8h eor w9,w9,w13 rev32 v31.8h,v31.8h eor w10,w10,w14 add v10.4s,v10.4s,v11.4s eor w11,w11,w15 add v14.4s,v14.4s,v15.4s eor w12,w12,w16 add v18.4s,v18.4s,v19.4s ror w9,w9,#20 add v22.4s,v22.4s,v23.4s ror w10,w10,#20 add v26.4s,v26.4s,v27.4s ror w11,w11,#20 add v30.4s,v30.4s,v31.4s ror w12,w12,#20 eor v0.16b,v9.16b,v10.16b add w5,w5,w9 eor v1.16b,v13.16b,v14.16b add w6,w6,w10 eor v2.16b,v17.16b,v18.16b add w7,w7,w11 eor v3.16b,v21.16b,v22.16b add w8,w8,w12 eor v4.16b,v25.16b,v26.16b eor w17,w17,w5 eor v5.16b,v29.16b,v30.16b eor w19,w19,w6 ushr v9.4s,v0.4s,#20 eor w20,w20,w7 ushr v13.4s,v1.4s,#20 eor w21,w21,w8 ushr v17.4s,v2.4s,#20 ror w17,w17,#24 ushr v21.4s,v3.4s,#20 ror w19,w19,#24 ushr v25.4s,v4.4s,#20 ror w20,w20,#24 ushr v29.4s,v5.4s,#20 ror w21,w21,#24 sli v9.4s,v0.4s,#12 add w13,w13,w17 sli v13.4s,v1.4s,#12 add w14,w14,w19 sli v17.4s,v2.4s,#12 add w15,w15,w20 sli v21.4s,v3.4s,#12 add w16,w16,w21 sli v25.4s,v4.4s,#12 eor w9,w9,w13 sli v29.4s,v5.4s,#12 eor w10,w10,w14 add v8.4s,v8.4s,v9.4s eor w11,w11,w15 add v12.4s,v12.4s,v13.4s eor w12,w12,w16 add v16.4s,v16.4s,v17.4s ror w9,w9,#25 add v20.4s,v20.4s,v21.4s ror w10,w10,#25 add v24.4s,v24.4s,v25.4s ror w11,w11,#25 add v28.4s,v28.4s,v29.4s ror w12,w12,#25 eor v11.16b,v11.16b,v8.16b add w5,w5,w10 eor v15.16b,v15.16b,v12.16b add w6,w6,w11 eor v19.16b,v19.16b,v16.16b add w7,w7,w12 eor v23.16b,v23.16b,v20.16b add w8,w8,w9 eor v27.16b,v27.16b,v24.16b eor w21,w21,w5 eor v31.16b,v31.16b,v28.16b eor w17,w17,w6 tbl v11.16b,{v11.16b},v6.16b eor w19,w19,w7 tbl v15.16b,{v15.16b},v6.16b eor w20,w20,w8 tbl v19.16b,{v19.16b},v6.16b ror w21,w21,#16 tbl v23.16b,{v23.16b},v6.16b ror w17,w17,#16 tbl v27.16b,{v27.16b},v6.16b ror w19,w19,#16 tbl v31.16b,{v31.16b},v6.16b ror w20,w20,#16 add v10.4s,v10.4s,v11.4s add w15,w15,w21 add v14.4s,v14.4s,v15.4s add w16,w16,w17 add v18.4s,v18.4s,v19.4s add w13,w13,w19 add 
v22.4s,v22.4s,v23.4s add w14,w14,w20 add v26.4s,v26.4s,v27.4s eor w10,w10,w15 add v30.4s,v30.4s,v31.4s eor w11,w11,w16 eor v0.16b,v9.16b,v10.16b eor w12,w12,w13 eor v1.16b,v13.16b,v14.16b eor w9,w9,w14 eor v2.16b,v17.16b,v18.16b ror w10,w10,#20 eor v3.16b,v21.16b,v22.16b ror w11,w11,#20 eor v4.16b,v25.16b,v26.16b ror w12,w12,#20 eor v5.16b,v29.16b,v30.16b ror w9,w9,#20 ushr v9.4s,v0.4s,#25 add w5,w5,w10 ushr v13.4s,v1.4s,#25 add w6,w6,w11 ushr v17.4s,v2.4s,#25 add w7,w7,w12 ushr v21.4s,v3.4s,#25 add w8,w8,w9 ushr v25.4s,v4.4s,#25 eor w21,w21,w5 ushr v29.4s,v5.4s,#25 eor w17,w17,w6 sli v9.4s,v0.4s,#7 eor w19,w19,w7 sli v13.4s,v1.4s,#7 eor w20,w20,w8 sli v17.4s,v2.4s,#7 ror w21,w21,#24 sli v21.4s,v3.4s,#7 ror w17,w17,#24 sli v25.4s,v4.4s,#7 ror w19,w19,#24 sli v29.4s,v5.4s,#7 ror w20,w20,#24 ext v10.16b,v10.16b,v10.16b,#8 add w15,w15,w21 ext v14.16b,v14.16b,v14.16b,#8 add w16,w16,w17 ext v18.16b,v18.16b,v18.16b,#8 add w13,w13,w19 ext v22.16b,v22.16b,v22.16b,#8 add w14,w14,w20 ext v26.16b,v26.16b,v26.16b,#8 eor w10,w10,w15 ext v30.16b,v30.16b,v30.16b,#8 eor w11,w11,w16 ext v11.16b,v11.16b,v11.16b,#4 eor w12,w12,w13 ext v15.16b,v15.16b,v15.16b,#4 eor w9,w9,w14 ext v19.16b,v19.16b,v19.16b,#4 ror w10,w10,#25 ext v23.16b,v23.16b,v23.16b,#4 ror w11,w11,#25 ext v27.16b,v27.16b,v27.16b,#4 ror w12,w12,#25 ext v31.16b,v31.16b,v31.16b,#4 ror w9,w9,#25 ext v9.16b,v9.16b,v9.16b,#12 ext v13.16b,v13.16b,v13.16b,#12 ext v17.16b,v17.16b,v17.16b,#12 ext v21.16b,v21.16b,v21.16b,#12 ext v25.16b,v25.16b,v25.16b,#12 ext v29.16b,v29.16b,v29.16b,#12 cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block ldp q0,q1,[sp,#0] add x6,x6,x22,lsr#32 ldp q2,q3,[sp,#32] add w7,w7,w23 ldp q4,q5,[sp,#64] add x8,x8,x23,lsr#32 ldr q6,[sp,#96] add v8.4s,v8.4s,v0.4s add w9,w9,w24 add v12.4s,v12.4s,v0.4s add x10,x10,x24,lsr#32 add v16.4s,v16.4s,v0.4s add w11,w11,w25 add v20.4s,v20.4s,v0.4s add x12,x12,x25,lsr#32 add v24.4s,v24.4s,v0.4s add w13,w13,w26 add v28.4s,v28.4s,v0.4s add x14,x14,x26,lsr#32 
add v10.4s,v10.4s,v2.4s add w15,w15,w27 add v14.4s,v14.4s,v2.4s add x16,x16,x27,lsr#32 add v18.4s,v18.4s,v2.4s add w17,w17,w28 add v22.4s,v22.4s,v2.4s add x19,x19,x28,lsr#32 add v26.4s,v26.4s,v2.4s add w20,w20,w30 add v30.4s,v30.4s,v2.4s add x21,x21,x30,lsr#32 add v27.4s,v27.4s,v7.4s // +4 add x5,x5,x6,lsl#32 // pack add v31.4s,v31.4s,v7.4s // +4 add x7,x7,x8,lsl#32 add v11.4s,v11.4s,v3.4s ldp x6,x8,[x1,#0] // load input add v15.4s,v15.4s,v4.4s add x9,x9,x10,lsl#32 add v19.4s,v19.4s,v5.4s add x11,x11,x12,lsl#32 add v23.4s,v23.4s,v6.4s ldp x10,x12,[x1,#16] add v27.4s,v27.4s,v3.4s add x13,x13,x14,lsl#32 add v31.4s,v31.4s,v4.4s add x15,x15,x16,lsl#32 add v9.4s,v9.4s,v1.4s ldp x14,x16,[x1,#32] add v13.4s,v13.4s,v1.4s add x17,x17,x19,lsl#32 add v17.4s,v17.4s,v1.4s add x20,x20,x21,lsl#32 add v21.4s,v21.4s,v1.4s ldp x19,x21,[x1,#48] add v25.4s,v25.4s,v1.4s add x1,x1,#64 add v29.4s,v29.4s,v1.4s #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 rev x9,x9 rev x11,x11 rev x13,x13 rev x15,x15 rev x17,x17 rev x20,x20 #endif ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 eor x11,x11,x12 eor x13,x13,x14 eor v8.16b,v8.16b,v0.16b eor x15,x15,x16 eor v9.16b,v9.16b,v1.16b eor x17,x17,x19 eor v10.16b,v10.16b,v2.16b eor x20,x20,x21 eor v11.16b,v11.16b,v3.16b ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 stp x5,x7,[x0,#0] // store output add x28,x28,#7 // increment counter stp x9,x11,[x0,#16] stp x13,x15,[x0,#32] stp x17,x20,[x0,#48] add x0,x0,#64 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 eor v12.16b,v12.16b,v0.16b eor v13.16b,v13.16b,v1.16b eor v14.16b,v14.16b,v2.16b eor v15.16b,v15.16b,v3.16b st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 eor v16.16b,v16.16b,v8.16b ldp q0,q1,[sp,#0] eor v17.16b,v17.16b,v9.16b ldp q2,q3,[sp,#32] eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 ld1 
{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 eor v24.16b,v24.16b,v16.16b eor v25.16b,v25.16b,v17.16b eor v26.16b,v26.16b,v18.16b eor v27.16b,v27.16b,v19.16b st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 shl v8.4s,v7.4s,#1 // 4 -> 8 eor v28.16b,v28.16b,v20.16b eor v29.16b,v29.16b,v21.16b eor v30.16b,v30.16b,v22.16b eor v31.16b,v31.16b,v23.16b st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 add v3.4s,v3.4s,v8.4s // += 8 add v4.4s,v4.4s,v8.4s add v5.4s,v5.4s,v8.4s add v6.4s,v6.4s,v8.4s b.hs .Loop_outer_512_neon adds x2,x2,#512 ushr v7.4s,v7.4s,#1 // 4 -> 2 ldp d10,d11,[sp,#128+16] // meet ABI requirements ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp q0,q0,[sp,#0] // wipe off-load area stp q0,q0,[sp,#32] stp q0,q0,[sp,#64] b.eq .Ldone_512_neon sub x3,x3,#16 // .Lone cmp x2,#192 add sp,sp,#128 sub v3.4s,v3.4s,v7.4s // -= 2 ld1 {v8.4s,v9.4s},[x3] b.hs .Loop_outer_neon ldp d8,d9,[sp,#0] // meet ABI requirements eor v1.16b,v1.16b,v1.16b eor v2.16b,v2.16b,v2.16b eor v3.16b,v3.16b,v3.16b eor v4.16b,v4.16b,v4.16b eor v5.16b,v5.16b,v5.16b eor v6.16b,v6.16b,v6.16b b .Loop_outer .Ldone_512_neon: ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 .inst 0xd50323bf // autiasp ret .size ChaCha20_512_neon,.-ChaCha20_512_neon