#include "arm_arch.h" #if __ARM_MAX_ARCH__>=8 .arch armv8-a+crypto .text .globl aes_gcm_enc_128_kernel .type aes_gcm_enc_128_kernel,%function .align 4 aes_gcm_enc_128_kernel: cbz x1, .L128_enc_ret stp x19, x20, [sp, #-112]! mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif ldp x13, x14, [x8, #160] //load rk10 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif ld1 {v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b lsr x5, x1, #3 //byte_len mov x15, x5 ld1 {v18.4s}, [x8], #16 //load rk0 add x4, x0, x1, lsr #3 //end_input_ptr sub x5, x5, #1 //byte_len - 1 lsr x12, x11, #32 ldr q15, [x3, #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif fmov d1, x10 //CTR block 1 rev w12, w12 //rev_ctr32 add w12, w12, #1 //increment rev_ctr32 orr w11, w11, w11 ld1 {v19.4s}, [x8], #16 //load rk1 rev w9, w12 //CTR block 1 add w12, w12, #1 //CTR block 1 fmov d3, x10 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 1 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 fmov d2, x10 //CTR block 2 orr x9, x11, x9, lsl #32 //CTR block 2 add w12, w12, #1 //CTR block 2 fmov v2.d[1], x9 //CTR block 2 rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 ld1 {v20.4s}, [x8], #16 //load rk2 add w12, w12, #1 //CTR block 3 fmov v3.d[1], x9 //CTR block 3 ldr q14, [x3, #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ld1 {v21.4s}, [x8], #16 //load rk3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 ldr q12, [x3, #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 ld1 {v22.4s}, [x8], #16 //load rk4 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 ld1 {v23.4s}, [x8], #16 //load rk5 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 ld1 {v24.4s}, [x8], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 ld1 {v25.4s}, [x8], #16 //load rk7 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 ldr q13, [x3, #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 eor v17.16b, v17.16b, v9.16b //h4k | h3k aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 ld1 {v27.4s}, [x8], #16 //load rk9 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) trn2 v16.2d, v12.2d, v13.2d //h2l | h1l aese v3.16b, 
v22.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 add x5, x5, x0 aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 cmp x0, x5 //check if we have <= 4 blocks aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v3.16b, v25.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v0.16b, v25.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v2.16b, v27.16b //AES block 2 - round 9 aese v0.16b, v27.16b //AES block 0 - round 9 eor v16.16b, v16.16b, v8.16b //h2k | h1k aese v1.16b, v27.16b //AES block 1 - round 9 aese v3.16b, v27.16b //AES block 3 - round 9 b.ge .L128_enc_tail //handle tail ldp x6, x7, [x0, #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif ldp x21, x22, [x0, #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif ldp x19, x20, [x0, #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif eor x6, x6, x13 //AES block 0 - round 10 low eor x7, x7, x14 //AES block 0 - round 10 high eor x21, x21, x13 //AES block 2 - round 10 low fmov d4, x6 //AES block 0 - mov low eor x19, x19, x13 //AES block 1 - round 10 low eor x22, x22, x14 //AES block 2 - round 10 high fmov v4.d[1], x7 //AES block 0 - mov high fmov d5, x19 //AES block 1 - mov low eor x20, x20, x14 //AES block 1 - round 10 high eor x23, x23, x13 //AES block 3 - round 10 low fmov v5.d[1], x20 //AES block 1 - mov high fmov d6, x21 //AES block 2 - mov low eor x24, x24, x14 //AES block 3 - round 10 high rev w9, w12 //CTR block 4 fmov v6.d[1], x22 //AES block 2 - mov high orr x9, x11, x9, lsl #32 //CTR block 4 eor v4.16b, v4.16b, v0.16b //AES block 0 - result fmov d0, x10 //CTR block 4 add w12, w12, #1 //CTR block 4 fmov v0.d[1], x9 //CTR block 4 rev w9, w12 //CTR block 5 eor v5.16b, v5.16b, v1.16b //AES block 1 - result fmov d1, x10 //CTR block 5 orr x9, x11, x9, lsl #32 //CTR block 5 add w12, w12, #1 //CTR block 5 add x0, x0, #64 //AES input_ptr update fmov v1.d[1], x9 //CTR block 5 fmov d7, x23 //AES block 3 - mov low rev w9, w12 //CTR block 6 st1 { v4.16b}, [x2], #16 //AES block 0 - store result fmov v7.d[1], x24 //AES block 3 - mov high orr x9, x11, x9, lsl #32 //CTR block 6 add w12, w12, #1 //CTR block 6 eor v6.16b, v6.16b, v2.16b //AES block 2 - result st1 { v5.16b}, [x2], #16 //AES block 1 - store result fmov d2, x10 //CTR block 6 cmp x0, x5 //check if we have <= 8 blocks 
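    //Note on the round structure: the last round key (rk10, cached in x13/x14)
    //is folded into the plaintext XOR (the "round 10 low/high" eors above), so
    //each block only runs aese/aesmc for rounds 0-9 before being XORed with
    //the masked input.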
    fmov v2.d[1], x9  //CTR block 6
    rev w9, w12  //CTR block 7
    st1 { v6.16b}, [x2], #16  //AES block 2 - store result
    orr x9, x11, x9, lsl #32  //CTR block 7
    eor v7.16b, v7.16b, v3.16b  //AES block 3 - result
    st1 { v7.16b}, [x2], #16  //AES block 3 - store result
    b.ge .L128_enc_prepretail  //do prepretail
.L128_enc_main_loop:  //main loop start
    ldp x23, x24, [x0, #48]  //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
    rev x23, x23
    rev x24, x24
#endif
    rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
    rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)
    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0
    fmov d3, x10  //CTR block 4k+3
    ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
    rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)
    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
    add w12, w12, #1  //CTR block 4k+3
    fmov v3.d[1], x9  //CTR block 4k+3
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
    mov d31, v6.d[1]  //GHASH block 4k+2 - mid
    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
    mov d30, v5.d[1]  //GHASH block 4k+1 - mid
    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
    eor v4.16b, v4.16b, v11.16b  //PRE 1
    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
    eor x24, x24, x14  //AES block 4k+3 - round 10 high
    pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high
    eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid
    ldp x6, x7, [x0, #0]  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
    rev w9, w12  //CTR block 4k+8
    eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid
    mov d8, v4.d[1]  //GHASH block 4k - mid
    orr x9, x11, x9, lsl #32  //CTR block 4k+8
    pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
    add w12, w12, #1  //CTR block 4k+8
    mov d10, v17.d[1]  //GHASH block 4k - mid
    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2
    pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
    eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid
    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2
    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3
    eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high
    pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low
    pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid
    rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid
    pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low
    ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid
    pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
    eor x7, x7, x14  //AES block 4k+4 - round 10 high
    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
    mov d30, v7.d[1]  //GHASH block 4k+3 - mid
    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low
    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
    eor x6, x6, x13  //AES block 4k+4 - round 10 low
    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
    eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid
    pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high
    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3
    eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high
    pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid
    pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
    movi v8.8b, #0xc2
    pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
    eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low
    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4
    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2
    shl d8, d8, #56  //mod_constant
    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4
    eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high
    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5
    ldp x19, x20, [x0, #16]  //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
    rev x19, x19
    rev x20, x20
#endif
    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3
    eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid
    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
    ldp x21, x22, [x0, #32]  //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
    rev x21, x21
    rev x22, x22
#endif
    pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low
    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
    eor x19, x19, x13  //AES block 4k+5 - round 10 low
    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4
    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid
    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6
    eor x23, x23, x13  //AES block 4k+3 - round 10 low
    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
    eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up
    fmov d4, x6  //AES block 4k+4 - mov low
    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6
    fmov v4.d[1], x7  //AES block 4k+4 - mov high
    add x0, x0, #64  //AES input_ptr update
    fmov d7, x23  //AES block 4k+3 - mov low
    ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment
    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5
    fmov d5, x19  //AES block 4k+5 - mov low
    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7
    eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up
    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6
    eor x20, x20, x14  //AES block 4k+5 - round 10 high
    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
    fmov v5.d[1], x20  //AES block 4k+5 - mov high
    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
    fmov v7.d[1], x24  //AES block 4k+3 - mov high
    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
    cmp x0, x5  //.LOOP CONTROL
    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8
    eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid
    aese v0.16b, v27.16b  //AES block 4k+4 - round 9
    eor x21, x21, x13  //AES block 4k+6 - round 10 low
    eor x22, x22, x14  //AES block 4k+6 - round 10 high
    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7
    fmov d6, x21  //AES block 4k+6 - mov low
    aese v1.16b, v27.16b  //AES block 4k+5 - round 9
    fmov v6.d[1], x22  //AES block 4k+6 - mov high
    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7
    eor v4.16b, v4.16b, v0.16b  //AES block 4k+4 - result
    fmov d0, x10  //CTR block 4k+8
    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8
    fmov v0.d[1], x9  //CTR block 4k+8
    rev w9, w12  //CTR block 4k+9
    eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid
    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8
    eor v5.16b, v5.16b, v1.16b  //AES block 4k+5 - result
    add w12, w12, #1  //CTR block 4k+9
    orr x9, x11, x9, lsl #32  //CTR block 4k+9
    fmov d1, x10  //CTR block 4k+9
    pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low
    fmov v1.d[1], x9  //CTR block 4k+9
    rev w9, w12  //CTR block 4k+10
    aese v2.16b, v27.16b  //AES block 4k+6 - round 9
    st1 { v4.16b}, [x2], #16  //AES block 4k+4 - store result
    eor v6.16b, v6.16b, v2.16b  //AES block 4k+6 - result
    orr x9, x11, x9, lsl #32  //CTR block 4k+10
    aese v3.16b, v27.16b  //AES block 4k+7 - round 9
    add w12, w12, #1  //CTR block 4k+10
    ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment
    fmov d2, x10  //CTR block 4k+10
    eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
    st1 { v5.16b}, [x2], #16  //AES block 4k+5 - store result
    fmov v2.d[1], x9  //CTR block 4k+10
    st1 { v6.16b}, [x2], #16  //AES block 4k+6 - store result
    rev w9, w12  //CTR block 4k+11
    orr x9, x11, x9, lsl #32  //CTR block 4k+11
    eor v7.16b, v7.16b, v3.16b  //AES block 4k+3 - result
    eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
    st1 { v7.16b}, [x2], #16  //AES block 4k+3 - store result
    b.lt .L128_enc_main_loop
.L128_enc_prepretail:  //PREPRETAIL
    rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
    fmov d3, x10  //CTR block 4k+3
    rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)
    ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
    add w12, w12, #1  //CTR block 4k+3
    fmov v3.d[1], x9  //CTR block 4k+3
    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
    rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)
    pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low
    rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)
    eor v4.16b, v4.16b, v11.16b  //PRE 1
    pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high
    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
    mov d30, v5.d[1]  //GHASH block 4k+1 - mid
    pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
    mov d8, v4.d[1]  //GHASH block 4k - mid
    mov d31, v6.d[1]  //GHASH block 4k+2 - mid
    mov d10, v17.d[1]  //GHASH block 4k - mid
    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
    eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid
    eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid
    pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
    eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid
    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1
    pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low
    pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
    ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid
    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0
    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
    mov d30, v7.d[1]  //GHASH block 4k+3 - mid
    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
    eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high
    pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid
    pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
    eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid
    pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high
    pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low
    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
    eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high
    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2
    pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
    movi v8.8b, #0xc2
    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
    eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low
    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2
    pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
    eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid
    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3
    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2
    eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high
    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3
    eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid
    shl d8, d8, #56  //mod_constant
    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low
    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4
    pmull v28.1q, v9.1d, v8.1d
    eor v10.16b, v10.16b, v9.16b  //karatsuba tidy up
    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4
    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
    ext v9.16b, v9.16b, v9.16b, #8
    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3
    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
    eor v10.16b, v10.16b, v11.16b
    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6
    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4
    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5
    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
    eor v10.16b, v10.16b, v28.16b
    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5
    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6
    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6
    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
    eor v10.16b, v10.16b, v9.16b
    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7
    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7
    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7
    pmull v28.1q, v10.1d, v8.1d
    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
    ext v10.16b, v10.16b, v10.16b, #8
    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8
    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
    eor v11.16b, v11.16b, v28.16b
    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8
    aese v3.16b, v27.16b  //AES block 4k+7 - round 9
    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8
    aese v0.16b, v27.16b  //AES block 4k+4 - round 9
    aese v1.16b, v27.16b  //AES block 4k+5 - round 9
    eor v11.16b, v11.16b, v10.16b
    aese v2.16b, v27.16b  //AES block 4k+6 - round 9
.L128_enc_tail:  //TAIL
    sub x5, x4, x0  //main_end_input_ptr is number of bytes left to process
    ldp x6, x7, [x0], #16  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    cmp x5, #48
    ext v8.16b, v11.16b, v11.16b, #8  //prepare final partial tag
    eor x6, x6, x13  //AES block 4k+4 - round 10 low
    eor x7, x7, x14  //AES block 4k+4 - round 10 high
    fmov d4, x6  //AES block 4k+4 - mov low
    fmov v4.d[1], x7  //AES block 4k+4 - mov high
    eor v5.16b, v4.16b, v0.16b  //AES block 4k+4 - result
    b.gt .L128_enc_blocks_more_than_3
    sub w12, w12, #1
    movi v11.8b, #0
    mov v3.16b, v2.16b
    cmp x5, #32
    mov v2.16b, v1.16b
    movi v9.8b, #0
    movi v10.8b, #0
    b.gt .L128_enc_blocks_more_than_2
    mov v3.16b, v1.16b
    cmp x5, #16
    sub w12, w12, #1
    b.gt .L128_enc_blocks_more_than_1
    sub w12, w12, #1
    b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:  //blocks left > 3
    st1 { v5.16b}, [x2], #16  //AES final-3 block - store result
    ldp x6, x7, [x0], #16  //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    rev64 v4.16b, v5.16b  //GHASH final-3 block
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag
    eor x7, x7, x14  //AES final-2 block - round 10 high
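    //In the tail, v8 carries the partial GHASH tag into the next block's
    //byte-reversed input ("feed in partial tag") and is then zeroed
    //("suppress further partial tag feed in"), so each remaining block is
    //accumulated exactly once.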
    eor x6, x6, x13  //AES final-2 block - round 10 low
    fmov d5, x6  //AES final-2 block - mov low
    movi v8.8b, #0  //suppress further partial tag feed in
    fmov v5.d[1], x7  //AES final-2 block - mov high
    pmull v11.1q, v4.1d, v15.1d  //GHASH final-3 block - low
    mov d22, v4.d[1]  //GHASH final-3 block - mid
    pmull2 v9.1q, v4.2d, v15.2d  //GHASH final-3 block - high
    mov d10, v17.d[1]  //GHASH final-3 block - mid
    eor v5.16b, v5.16b, v1.16b  //AES final-2 block - result
    eor v22.8b, v22.8b, v4.8b  //GHASH final-3 block - mid
    pmull v10.1q, v22.1d, v10.1d  //GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:  //blocks left > 2
    st1 { v5.16b}, [x2], #16  //AES final-2 block - store result
    rev64 v4.16b, v5.16b  //GHASH final-2 block
    ldp x6, x7, [x0], #16  //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag
    eor x6, x6, x13  //AES final-1 block - round 10 low
    fmov d5, x6  //AES final-1 block - mov low
    eor x7, x7, x14  //AES final-1 block - round 10 high
    pmull2 v20.1q, v4.2d, v14.2d  //GHASH final-2 block - high
    fmov v5.d[1], x7  //AES final-1 block - mov high
    mov d22, v4.d[1]  //GHASH final-2 block - mid
    pmull v21.1q, v4.1d, v14.1d  //GHASH final-2 block - low
    eor v9.16b, v9.16b, v20.16b  //GHASH final-2 block - high
    eor v22.8b, v22.8b, v4.8b  //GHASH final-2 block - mid
    eor v5.16b, v5.16b, v2.16b  //AES final-1 block - result
    eor v11.16b, v11.16b, v21.16b  //GHASH final-2 block - low
    pmull v22.1q, v22.1d, v17.1d  //GHASH final-2 block - mid
    movi v8.8b, #0  //suppress further partial tag feed in
    eor v10.16b, v10.16b, v22.16b  //GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:  //blocks left > 1
    st1 { v5.16b}, [x2], #16  //AES final-1 block - store result
    rev64 v4.16b, v5.16b  //GHASH final-1 block
    ldp x6, x7, [x0], #16  //AES final block - load input low & high
#ifdef __AARCH64EB__
    rev x6, x6
    rev x7, x7
#endif
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag
    eor x7, x7, x14  //AES final block - round 10 high
    eor x6, x6, x13  //AES final block - round 10 low
    fmov d5, x6  //AES final block - mov low
    pmull2 v20.1q, v4.2d, v13.2d  //GHASH final-1 block - high
    fmov v5.d[1], x7  //AES final block - mov high
    mov d22, v4.d[1]  //GHASH final-1 block - mid
    pmull v21.1q, v4.1d, v13.1d  //GHASH final-1 block - low
    eor v22.8b, v22.8b, v4.8b  //GHASH final-1 block - mid
    eor v5.16b, v5.16b, v3.16b  //AES final block - result
    ins v22.d[1], v22.d[0]  //GHASH final-1 block - mid
    pmull2 v22.1q, v22.2d, v16.2d  //GHASH final-1 block - mid
    eor v11.16b, v11.16b, v21.16b  //GHASH final-1 block - low
    eor v9.16b, v9.16b, v20.16b  //GHASH final-1 block - high
    eor v10.16b, v10.16b, v22.16b  //GHASH final-1 block - mid
    movi v8.8b, #0  //suppress further partial tag feed in
.L128_enc_blocks_less_than_1:  //blocks left <= 1
    and x1, x1, #127  //bit_length %= 128
    mvn x13, xzr  //rk10_l = 0xffffffffffffffff
    mvn x14, xzr  //rk10_h = 0xffffffffffffffff
    sub x1, x1, #128  //bit_length -= 128
    neg x1, x1  //bit_length = 128 - #bits in input (in range [1,128])
    and x1, x1, #127  //bit_length %= 128
    lsr x14, x14, x1  //rk10_h is mask for top 64b of last block
    cmp x1, #64
    csel x6, x13, x14, lt
    csel x7, x14, xzr, lt
    fmov d0, x6  //ctr0b is mask for last block
    fmov v0.d[1], x7
    and v5.16b, v5.16b, v0.16b  //possibly partial last block has zeroes in highest bits
    rev64 v4.16b, v5.16b  //GHASH final block
    eor v4.16b, v4.16b, v8.16b  //feed in partial tag
    mov d8, v4.d[1]  //GHASH final block - mid
    pmull v21.1q, v4.1d, v12.1d  //GHASH final block - low
    ld1 { v18.16b}, [x2]  //load existing bytes where the possibly partial last block is to be stored
    eor v8.8b, v8.8b, v4.8b  //GHASH final block - mid
#ifndef __AARCH64EB__
    rev w9, w12
#else
    mov w9, w12
#endif
    pmull2 v20.1q, v4.2d, v12.2d  //GHASH final block - high
    pmull v8.1q, v8.1d, v16.1d  //GHASH final block - mid
    eor v11.16b, v11.16b, v21.16b  //GHASH final block - low
    eor v9.16b, v9.16b, v20.16b  //GHASH final block - high
    eor v10.16b, v10.16b, v8.16b  //GHASH final block - mid
    movi v8.8b, #0xc2
    eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up
    shl d8, d8, #56  //mod_constant
    eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up
    pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid
    ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment
    eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid
    eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid
    pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low
    ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment
    bif v5.16b, v18.16b, v0.16b  //insert existing bytes in top end of result before storing
    eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
    st1 { v5.16b}, [x2]  //store all 16B
    str w9, [x16, #12]  //store the updated counter
    eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    mov x0, x15
    st1 { v11.16b }, [x3]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    ldp x19, x20, [sp], #112
    ret
.L128_enc_ret:
    mov w0, #0x0
    ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
    cbz x1, .L128_dec_ret
    stp x19, x20, [sp, #-112]!
    mov x16, x4
    mov x8, x5
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp d8, d9, [sp, #48]
    stp d10, d11, [sp, #64]
    stp d12, d13, [sp, #80]
    stp d14, d15, [sp, #96]
    lsr x5, x1, #3  //byte_len
    mov x15, x5
    ldp x10, x11, [x16]  //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
    rev x10, x10
    rev x11, x11
#endif
    ldp x13, x14, [x8, #160]  //load rk10
#ifdef __AARCH64EB__
    ror x14, x14, #32
    ror x13, x13, #32
#endif
    sub x5, x5, #1  //byte_len - 1
    ld1 {v18.4s}, [x8], #16  //load rk0
    and x5, x5, #0xffffffffffffffc0  //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
    ld1 { v0.16b}, [x16]  //special case vector load initial counter so we can start first AES block as quickly as possible
    ldr q13, [x3, #64]  //load h2l | h2h
#ifndef __AARCH64EB__
    ext v13.16b, v13.16b, v13.16b, #8
#endif
    lsr x12, x11, #32
    fmov d2, x10  //CTR block 2
    ld1 {v19.4s}, [x8], #16  //load rk1
    orr w11, w11, w11
    rev w12, w12  //rev_ctr32
    fmov d1, x10  //CTR block 1
    add w12, w12, #1  //increment rev_ctr32
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 0
    rev w9, w12  //CTR block 1
    orr x9, x11, x9, lsl #32  //CTR block 1
    ld1 {v20.4s}, [x8], #16  //load rk2
    add w12, w12, #1  //CTR block 1
    fmov v1.d[1], x9  //CTR block 1
    rev w9, w12  //CTR block 2
    add w12, w12, #1  //CTR block 2
    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 1
    orr x9, x11, x9, lsl #32  //CTR block 2
    fmov v2.d[1], x9  //CTR block 2
    rev w9, w12  //CTR block 3
    fmov d3, x10  //CTR block 3
    orr x9, x11, x9, lsl #32  //CTR block 3
    add w12, w12, #1  //CTR block 3
    fmov v3.d[1], x9  //CTR block 3
    add x4, x0, x1, lsr #3  //end_input_ptr
    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 0
    ld1 {v21.4s}, [x8], #16  //load rk3
    aese v0.16b, v20.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 2
    ld1 {v22.4s}, [x8], #16  //load rk4
    aese v2.16b, v18.16b
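    //Decryption follows the same CTR keystream schedule as encryption; GHASH
    //is computed over the ciphertext as it is loaded, so no extra hashing
    //pass over the data is needed.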
    aesmc v2.16b, v2.16b  //AES block 2 - round 0
    ld1 {v23.4s}, [x8], #16  //load rk5
    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 1
    ld1 {v24.4s}, [x8], #16  //load rk6
    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 0
    aese v2.16b, v19.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 1
    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 2
    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 1
    ld1 { v11.16b}, [x3]
    ext v11.16b, v11.16b, v11.16b, #8
    rev64 v11.16b, v11.16b
    aese v0.16b, v21.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 3
    ld1 {v25.4s}, [x8], #16  //load rk7
    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 3
    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 2
    aese v2.16b, v20.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 2
    ld1 {v26.4s}, [x8], #16  //load rk8
    aese v1.16b, v22.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 4
    aese v3.16b, v21.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 3
    aese v2.16b, v21.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 3
    ldr q14, [x3, #80]  //load h3l | h3h
#ifndef __AARCH64EB__
    ext v14.16b, v14.16b, v14.16b, #8
#endif
    aese v0.16b, v22.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 4
    ld1 {v27.4s}, [x8], #16  //load rk9
    aese v1.16b, v23.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 5
    aese v2.16b, v22.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 4
    aese v3.16b, v22.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 4
    aese v0.16b, v23.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 5
    aese v2.16b, v23.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 5
    ldr q12, [x3, #32]  //load h1l | h1h
#ifndef __AARCH64EB__
    ext v12.16b, v12.16b, v12.16b, #8
#endif
    aese v3.16b, v23.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 5
    aese v0.16b, v24.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 6
    aese v1.16b, v24.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 6
    aese v3.16b, v24.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 6
    aese v2.16b, v24.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 6
    trn1 v8.2d, v12.2d, v13.2d  //h2h | h1h
    ldr q15, [x3, #112]  //load h4l | h4h
#ifndef __AARCH64EB__
    ext v15.16b, v15.16b, v15.16b, #8
#endif
    trn2 v16.2d, v12.2d, v13.2d  //h2l | h1l
    add x5, x5, x0
    aese v1.16b, v25.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 7
    aese v2.16b, v25.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 7
    aese v0.16b, v25.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 7
    eor v16.16b, v16.16b, v8.16b  //h2k | h1k
    aese v3.16b, v25.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 7
    aese v1.16b, v26.16b
    aesmc v1.16b, v1.16b  //AES block 1 - round 8
    trn2 v17.2d, v14.2d, v15.2d  //h4l | h3l
    aese v2.16b, v26.16b
    aesmc v2.16b, v2.16b  //AES block 2 - round 8
    aese v3.16b, v26.16b
    aesmc v3.16b, v3.16b  //AES block 3 - round 8
    aese v0.16b, v26.16b
    aesmc v0.16b, v0.16b  //AES block 0 - round 8
    trn1 v9.2d, v14.2d, v15.2d  //h4h | h3h
    aese v2.16b, v27.16b  //AES block 2 - round 9
    aese v3.16b, v27.16b  //AES block 3 - round 9
    aese v0.16b, v27.16b  //AES block 0 - round 9
    cmp x0, x5  //check if we have <= 4 blocks
    aese v1.16b, v27.16b  //AES block 1 - round 9
    eor v17.16b, v17.16b, v9.16b  //h4k | h3k
    b.ge .L128_dec_tail  //handle tail
    ld1 {v4.16b, v5.16b}, [x0], #32  //AES block 0 - load ciphertext; AES block 1 - load ciphertext
    eor v1.16b, v5.16b, v1.16b  //AES block 1 - result
    ld1 {v6.16b}, [x0], #16  //AES block 2 - load ciphertext
    eor v0.16b, v4.16b, v0.16b  //AES block 0 - result
    rev64 v4.16b, v4.16b  //GHASH block 0
    rev w9, w12  //CTR block 4
    orr x9, x11, x9, lsl #32  //CTR block 4
    add w12, w12, #1  //CTR block 4
    ld1 {v7.16b}, [x0], #16  //AES block 3 - load ciphertext
    rev64 v5.16b, v5.16b  //GHASH block 1
    mov x19, v1.d[0]  //AES block 1 - mov low
    mov x20, v1.d[1]  //AES block 1 - mov high
    mov x6, v0.d[0]  //AES block 0 - mov low
    cmp x0, x5  //check if we have <= 8 blocks
    mov x7, v0.d[1]  //AES block 0 - mov high
    fmov d0, x10  //CTR block 4
    fmov v0.d[1], x9  //CTR block 4
    rev w9, w12  //CTR block 5
    eor x19, x19, x13  //AES block 1 - round 10 low
#ifdef __AARCH64EB__
    rev x19, x19
#endif
    fmov d1, x10  //CTR block 5
    add w12, w12, #1  //CTR block 5
    orr x9, x11, x9, lsl #32  //CTR block 5
    fmov v1.d[1], x9  //CTR block 5
    rev w9, w12  //CTR block 6
    add w12, w12, #1  //CTR block 6
    orr x9, x11, x9, lsl #32  //CTR block 6
    eor x20, x20, x14  //AES block 1 - round 10 high
#ifdef __AARCH64EB__
    rev x20, x20
#endif
    eor x6, x6, x13  //AES block 0 - round 10 low
#ifdef __AARCH64EB__
    rev x6, x6
#endif
    eor v2.16b, v6.16b, v2.16b  //AES block 2 - result
    eor x7, x7, x14  //AES block 0 - round 10 high
#ifdef __AARCH64EB__
    rev x7, x7
#endif
    stp x6, x7, [x2], #16  //AES block 0 - store result
    stp x19, x20, [x2], #16  //AES block 1 - store result
    b.ge .L128_dec_prepretail  //do prepretail
.L128_dec_main_loop:  //main loop start
    eor v3.16b, v7.16b, v3.16b  //AES block 4k+3 - result
    ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
    mov x21, v2.d[0]  //AES block 4k+2 - mov low
    pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high
    mov x22, v2.d[1]  //AES block 4k+2 - mov high
    aese v1.16b, v18.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
    fmov d2, x10  //CTR block 4k+6
    rev64 v6.16b, v6.16b  //GHASH block 4k+2
    fmov v2.d[1], x9  //CTR block 4k+6
    rev w9, w12  //CTR block 4k+7
    mov x23, v3.d[0]  //AES block 4k+3 - mov low
    eor v4.16b, v4.16b, v11.16b  //PRE 1
    mov d30, v5.d[1]  //GHASH block 4k+1 - mid
    aese v1.16b, v19.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
    rev64 v7.16b, v7.16b  //GHASH block 4k+3
    pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low
    mov x24, v3.d[1]  //AES block 4k+3 - mov high
    orr x9, x11, x9, lsl #32  //CTR block 4k+7
    pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
    fmov d3, x10  //CTR block 4k+7
    eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid
    aese v1.16b, v20.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2
    fmov v3.d[1], x9  //CTR block 4k+7
    aese v2.16b, v18.16b
    aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0
    mov d10, v17.d[1]  //GHASH block 4k - mid
    pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
    eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low
    pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
    aese v1.16b, v21.16b
    aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
    mov d8, v4.d[1]  //GHASH block 4k - mid
    aese v3.16b, v18.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
    eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high
    aese v0.16b, v18.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
    pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low
    eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid
    aese v3.16b, v19.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1
    eor x23, x23, x13  //AES block 4k+3 - round 10 low
#ifdef __AARCH64EB__
    rev x23, x23
#endif
    pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid
    eor x22, x22, x14  //AES block 4k+2 - round 10 high
#ifdef __AARCH64EB__
    rev x22, x22
#endif
    mov d31, v6.d[1]  //GHASH block 4k+2 - mid
    aese v0.16b, v19.16b
    aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
    eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low
    pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid
    aese v3.16b, v20.16b
    aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2
    eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid
    aese v0.16b, v20.16b
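    //The accumulated 256-bit GHASH product (high/mid/low) is reduced with the
    //GCM polynomial via the folded constant 0xc2 shifted into the top byte
    //(movi/shl below): high is multiplied by the constant and folded into
    //mid, then mid into low (the "MODULO" steps).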
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 mov d30, v7.d[1] //GHASH block 4k+3 - mid aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid eor x24, x24, x14 //AES block 4k+3 - round 10 high #ifdef __AARCH64EB__ rev x24, x24 #endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 eor x21, x21, x13 //AES block 4k+2 - round 10 low #ifdef __AARCH64EB__ rev x21, x21 #endif aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 movi v8.8b, #0xc2 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 stp x21, x22, [x2], #16 //AES block 4k+2 - store result pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 add w12, w12, #1 //CTR block 4k+7 aese v0.16b, v25.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 shl d8, d8, #56 //mod_constant aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 stp x23, x24, [x2], #16 //AES block 4k+3 - store result aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 rev w9, w12 //CTR block 4k+8 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment aese v0.16b, v27.16b //AES block 4k+4 - round 9 orr x9, x11, x9, lsl #32 //CTR block 4k+8 aese v3.16b, v22.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up aese v1.16b, v27.16b //AES block 4k+5 - round 9 aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext add w12, w12, #1 //CTR block 4k+8 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 rev64 v5.16b, v5.16b //GHASH block 4k+5 eor v10.16b, v10.16b, v9.16b //MODULO - fold 
into mid mov x7, v0.d[1] //AES block 4k+4 - mov high aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 mov x6, v0.d[0] //AES block 4k+4 - mov low aese v3.16b, v25.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 fmov d0, x10 //CTR block 4k+8 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low fmov v0.d[1], x9 //CTR block 4k+8 rev w9, w12 //CTR block 4k+9 aese v2.16b, v27.16b //AES block 4k+6 - round 9 orr x9, x11, x9, lsl #32 //CTR block 4k+9 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 eor x7, x7, x14 //AES block 4k+4 - round 10 high #ifdef __AARCH64EB__ rev x7, x7 #endif eor v11.16b, v11.16b, v8.16b //MODULO - fold into low mov x20, v1.d[1] //AES block 4k+5 - mov high eor x6, x6, x13 //AES block 4k+4 - round 10 low #ifdef __AARCH64EB__ rev x6, x6 #endif eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result mov x19, v1.d[0] //AES block 4k+5 - mov low add w12, w12, #1 //CTR block 4k+9 aese v3.16b, v27.16b //AES block 4k+7 - round 9 fmov d1, x10 //CTR block 4k+9 cmp x0, x5 //.LOOP CONTROL rev64 v4.16b, v4.16b //GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low fmov v1.d[1], x9 //CTR block 4k+9 rev w9, w12 //CTR block 4k+10 add w12, w12, #1 //CTR block 4k+10 eor x20, x20, x14 //AES block 4k+5 - round 10 high #ifdef __AARCH64EB__ rev x20, x20 #endif stp x6, x7, [x2], #16 //AES block 4k+4 - store result eor x19, x19, x13 //AES block 4k+5 - round 10 low #ifdef __AARCH64EB__ rev x19, x19 #endif stp x19, x20, [x2], #16 //AES block 4k+5 - store result orr x9, x11, x9, lsl #32 //CTR block 4k+10 b.lt .L128_dec_main_loop .L128_dec_prepretail: //PREPRETAIL ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 mov x21, v2.d[0] //AES block 4k+2 - mov low mov d30, v5.d[1] //GHASH block 4k+1 - mid aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 mov x22, v2.d[1] //AES block 4k+2 - mov high eor v4.16b, v4.16b, v11.16b //PRE 1 fmov d2, x10 //CTR block 4k+6 rev64 v6.16b, v6.16b //GHASH block 4k+2 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 fmov v2.d[1], x9 //CTR block 4k+6 rev w9, w12 //CTR block 4k+7 mov x23, v3.d[0] //AES block 4k+3 - mov low eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low mov d10, v17.d[1] //GHASH block 4k - mid mov x24, v3.d[1] //AES block 4k+3 - mov high aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 mov d31, v6.d[1] //GHASH block 4k+2 - mid aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 orr x9, x11, x9, lsl #32 //CTR block 4k+7 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low mov d8, v4.d[1] //GHASH block 4k - mid fmov d3, x10 //CTR block 4k+7 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 fmov v3.d[1], x9 //CTR block 4k+7 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid rev64 v7.16b, v7.16b //GHASH block 4k+3 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid eor v11.16b, v11.16b, 
v29.16b //GHASH block 4k+1 - low pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high mov d30, v7.d[1] //GHASH block 4k+3 - mid aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high movi v8.8b, #0xc2 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 eor x23, x23, x13 //AES block 4k+3 - round 10 low #ifdef __AARCH64EB__ rev x23, x23 #endif pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid eor x21, x21, x13 //AES block 4k+2 - round 10 low #ifdef __AARCH64EB__ rev x21, x21 #endif eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 shl d8, d8, #56 //mod_constant aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 aese v3.16b, v22.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 aese v0.16b, v25.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 aese v1.16b, v27.16b //AES block 4k+5 - round 9 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low eor x24, x24, x14 //AES block 4k+3 - round 10 high #ifdef __AARCH64EB__ rev x24, x24 #endif aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment aese v3.16b, v25.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 eor v11.16b, v11.16b, v8.16b 
//MODULO - fold into low aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 eor x22, x22, x14 //AES block 4k+2 - round 10 high #ifdef __AARCH64EB__ rev x22, x22 #endif aese v0.16b, v27.16b //AES block 4k+4 - round 9 stp x21, x22, [x2], #16 //AES block 4k+2 - store result aese v2.16b, v27.16b //AES block 4k+6 - round 9 add w12, w12, #1 //CTR block 4k+7 stp x23, x24, [x2], #16 //AES block 4k+3 - store result aese v3.16b, v27.16b //AES block 4k+7 - round 9 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low .L128_dec_tail: //TAIL sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result mov x7, v0.d[1] //AES block 4k+4 - mov high mov x6, v0.d[0] //AES block 4k+4 - mov low cmp x5, #48 eor x7, x7, x14 //AES block 4k+4 - round 10 high #ifdef __AARCH64EB__ rev x7, x7 #endif ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag eor x6, x6, x13 //AES block 4k+4 - round 10 low #ifdef __AARCH64EB__ rev x6, x6 #endif b.gt .L128_dec_blocks_more_than_3 mov v3.16b, v2.16b sub w12, w12, #1 movi v11.8b, #0 movi v9.8b, #0 mov v2.16b, v1.16b movi v10.8b, #0 cmp x5, #32 b.gt .L128_dec_blocks_more_than_2 cmp x5, #16 mov v3.16b, v1.16b sub w12, w12, #1 b.gt .L128_dec_blocks_more_than_1 sub w12, w12, #1 b .L128_dec_blocks_less_than_1 .L128_dec_blocks_more_than_3: //blocks left > 3 rev64 v4.16b, v5.16b //GHASH final-3 block ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag mov d10, v17.d[1] //GHASH final-3 block - mid stp x6, x7, [x2], #16 //AES final-3 block - store result eor v0.16b, v5.16b, v1.16b //AES final-2 block - result mov d22, v4.d[1] //GHASH final-3 block - mid mov x7, v0.d[1] //AES final-2 block - mov high pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low mov x6, v0.d[0] //AES final-2 block - mov low pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid movi v8.8b, #0 //suppress further partial tag feed in eor x7, x7, x14 //AES final-2 block - round 10 high #ifdef __AARCH64EB__ rev x7, x7 #endif pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid eor x6, x6, x13 //AES final-2 block - round 10 low #ifdef __AARCH64EB__ rev x6, x6 #endif .L128_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag eor v0.16b, v5.16b, v2.16b //AES final-1 block - result stp x6, x7, [x2], #16 //AES final-2 block - store result mov d22, v4.d[1] //GHASH final-2 block - mid pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high mov x6, v0.d[0] //AES final-1 block - mov low mov x7, v0.d[1] //AES final-1 block - mov high eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid movi v8.8b, #0 //suppress further partial tag feed in pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid eor x6, x6, x13 //AES final-1 block - round 10 low #ifdef __AARCH64EB__ rev x6, x6 #endif eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid eor x7, x7, x14 //AES final-1 block - round 10 high #ifdef __AARCH64EB__ rev x7, x7 #endif .L128_dec_blocks_more_than_1: //blocks left > 1 rev64 
v4.16b, v5.16b //GHASH final-1 block ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag mov d22, v4.d[1] //GHASH final-1 block - mid eor v0.16b, v5.16b, v3.16b //AES final block - result eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid stp x6, x7, [x2], #16 //AES final-1 block - store result mov x6, v0.d[0] //AES final block - mov low mov x7, v0.d[1] //AES final block - mov high ins v22.d[1], v22.d[0] //GHASH final-1 block - mid pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid movi v8.8b, #0 //suppress further partial tag feed in eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high eor x7, x7, x14 //AES final block - round 10 high #ifdef __AARCH64EB__ rev x7, x7 #endif eor x6, x6, x13 //AES final block - round 10 low #ifdef __AARCH64EB__ rev x6, x6 #endif eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid .L128_dec_blocks_less_than_1: //blocks left <= 1 mvn x14, xzr //rk10_h = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 mvn x13, xzr //rk10_l = 0xffffffffffffffff sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 //bit_length %= 128 lsr x14, x14, x1 //rk10_h is mask for top 64b of last block cmp x1, #64 csel x10, x14, xzr, lt csel x9, x13, x14, lt fmov d0, x9 //ctr0b is mask for last block mov v0.d[1], x10 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits rev64 v4.16b, v5.16b //GHASH final block eor v4.16b, v4.16b, v8.16b //feed in partial tag ldp x4, x5, [x2] //load existing bytes we need to not overwrite and x7, x7, x10 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high mov d8, v4.d[1] //GHASH final block - mid eor v8.8b, v8.8b, v4.8b //GHASH final block - mid eor v9.16b, v9.16b, v20.16b //GHASH final block - high pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid pmull v21.1q, v4.1d, v12.1d //GHASH final block - low bic x4, x4, x9 //mask out low existing bytes and x6, x6, x9 #ifndef __AARCH64EB__ rev w9, w12 #else mov w9, w12 #endif eor v10.16b, v10.16b, v8.16b //GHASH final block - mid movi v8.8b, #0xc2 eor v11.16b, v11.16b, v21.16b //GHASH final block - low bic x5, x5, x10 //mask out high existing bytes shl d8, d8, #56 //mod_constant eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up orr x6, x6, x4 str w9, [x16, #12] //store the updated counter orr x7, x7, x5 stp x6, x7, [x2] ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment eor v11.16b, v11.16b, v8.16b //MODULO - fold into low eor v11.16b, v11.16b, v10.16b //MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 st1 { v11.16b }, [x3] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L128_dec_ret: mov w0, #0x0 ret .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel .globl aes_gcm_enc_192_kernel .type aes_gcm_enc_192_kernel,%function .align 
4 aes_gcm_enc_192_kernel: cbz x1, .L192_enc_ret stp x19, x20, [sp, #-112]! mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif ldp x13, x14, [x8, #192] //load rk12 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif ld1 {v18.4s}, [x8], #16 //load rk0 ld1 {v19.4s}, [x8], #16 //load rk1 ld1 {v20.4s}, [x8], #16 //load rk2 lsr x12, x11, #32 ld1 {v21.4s}, [x8], #16 //load rk3 orr w11, w11, w11 ld1 {v22.4s}, [x8], #16 //load rk4 rev w12, w12 //rev_ctr32 add w12, w12, #1 //increment rev_ctr32 fmov d3, x10 //CTR block 3 rev w9, w12 //CTR block 1 add w12, w12, #1 //CTR block 1 fmov d1, x10 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 add w12, w12, #1 //CTR block 2 fmov d2, x10 //CTR block 2 orr x9, x11, x9, lsl #32 //CTR block 2 fmov v2.d[1], x9 //CTR block 2 rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 ld1 {v23.4s}, [x8], #16 //load rk5 fmov v3.d[1], x9 //CTR block 3 ld1 {v24.4s}, [x8], #16 //load rk6 ld1 {v25.4s}, [x8], #16 //load rk7 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ldr q15, [x3, #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 ld1 {v27.4s}, [x8], #16 //load rk9 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 ld1 {v28.4s}, [x8], #16 //load rk10 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 ldr q12, [x3, #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 ld1 {v29.4s}, [x8], #16 //load rk11 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 ldr q14, [x3, #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v3.16b, v22.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese 
v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 ldr q13, [x3, #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v0.16b, v25.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l aese v3.16b, v25.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 10 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 10 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 10 lsr x5, x1, #3 //byte_len mov x15, x5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 10 sub x5, x5, #1 //byte_len - 1 eor v16.16b, v16.16b, v8.16b //h2k | h1k and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) eor v17.16b, v17.16b, v9.16b //h4k | h3k aese v2.16b, v29.16b //AES block 2 - round 11 add x4, x0, x1, lsr #3 //end_input_ptr add x5, x5, x0 aese v1.16b, v29.16b //AES block 1 - round 11 cmp x0, x5 //check if we have <= 4 blocks aese v0.16b, v29.16b //AES block 0 - round 11 add w12, w12, #1 //CTR block 3 aese v3.16b, v29.16b //AES block 3 - round 11 b.ge .L192_enc_tail //handle tail rev w9, w12 //CTR block 4 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif orr x9, x11, x9, lsl #32 //CTR block 4 ldp x21, x22, [x0, #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif ldp x19, x20, [x0, #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif add x0, x0, #64 //AES input_ptr update cmp x0, x5 //check if we have <= 8 blocks eor x6, x6, x13 //AES block 0 - round 12 low eor x7, x7, x14 //AES block 0 - round 12 high eor x22, x22, x14 //AES block 2 - round 12 high fmov d4, x6 //AES block 0 - mov low eor x24, x24, x14 //AES block 3 - round 12 high fmov v4.d[1], x7 //AES block 0 - mov high eor x21, x21, x13 //AES block 2 - round 12 low eor x19, x19, x13 //AES block 1 - round 12 low fmov d5, x19 //AES block 1 - mov low eor x20, x20, x14 //AES block 1 - round 12 high fmov v5.d[1], x20 //AES block 1 - mov high eor x23, x23, x13 //AES block 3 - round 12 low fmov d6, x21 //AES block 2 - mov low add w12, w12, #1 //CTR block 4 eor v4.16b, v4.16b, v0.16b //AES block 0 - result fmov d0, x10 //CTR block 4 fmov v0.d[1], x9 //CTR block 4 rev w9, w12 //CTR block 5 orr x9, x11, x9, lsl #32 //CTR block 5 add w12, w12, #1 //CTR block 5 fmov d7, x23 //AES block 3 - mov low st1 { v4.16b}, [x2], 
	fmov v6.d[1], x22 //AES block 2 - mov high
	eor v5.16b, v5.16b, v1.16b //AES block 1 - result
	fmov d1, x10 //CTR block 5
	st1 { v5.16b}, [x2], #16 //AES block 1 - store result
	fmov v7.d[1], x24 //AES block 3 - mov high
	fmov v1.d[1], x9 //CTR block 5
	rev w9, w12 //CTR block 6
	orr x9, x11, x9, lsl #32 //CTR block 6
	add w12, w12, #1 //CTR block 6
	eor v6.16b, v6.16b, v2.16b //AES block 2 - result
	fmov d2, x10 //CTR block 6
	fmov v2.d[1], x9 //CTR block 6
	rev w9, w12 //CTR block 7
	orr x9, x11, x9, lsl #32 //CTR block 7
	st1 { v6.16b}, [x2], #16 //AES block 2 - store result
	eor v7.16b, v7.16b, v3.16b //AES block 3 - result
	st1 { v7.16b}, [x2], #16 //AES block 3 - store result
	b.ge .L192_enc_prepretail //do prepretail
.L192_enc_main_loop: //main loop start
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
	rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	fmov d3, x10 //CTR block 4k+3
	rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	fmov v3.d[1], x9 //CTR block 4k+3
	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
	rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
	ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	eor v4.16b, v4.16b, v11.16b //PRE 1
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
	rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	eor x24, x24, x14 //AES block 4k+3 - round 12 high
	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d8, v4.d[1] //GHASH block 4k - mid
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
	eor x21, x21, x13 //AES block 4k+6 - round 12 low
	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor x19, x19, x13 //AES block 4k+5 - round 12 low
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	mov d31, v6.d[1] //GHASH block 4k+2 - mid
	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
	mov d4, v5.d[1] //GHASH block 4k+1 - mid
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
	mov d10, v17.d[1] //GHASH block 4k - mid
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	eor x20, x20, x14 //AES block 4k+5 - round 12 high
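	//Each GHASH block is a 128x128-bit carryless multiply split in the
	//Karatsuba style: pmull/pmull2 form the low and high 64x64 halves,
	//and a third multiply of the XOR-folded halves (the "mid" terms,
	//against the pre-XORed hash powers in v16/v17) supplies the cross
	//products, which are accumulated across all four blocks.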
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
	add w12, w12, #1 //CTR block 4k+3
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
	eor x22, x22, x14 //AES block 4k+6 - round 12 high
	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
	eor x23, x23, x13 //AES block 4k+3 - round 12 low
	mov d30, v7.d[1] //GHASH block 4k+3 - mid
	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	rev w9, w12 //CTR block 4k+8
	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
	orr x9, x11, x9, lsl #32 //CTR block 4k+8
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
	add x0, x0, #64 //AES input_ptr update
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	movi v8.8b, #0xc2
	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	eor x7, x7, x14 //AES block 4k+4 - round 12 high
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
	eor x6, x6, x13 //AES block 4k+4 - round 12 low
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
	shl d8, d8, #56 //mod_constant
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
	fmov d5, x19 //AES block 4k+5 - mov low
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	fmov v5.d[1], x20 //AES block 4k+5 - mov high
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	cmp x0, x5 //.LOOP CONTROL
	fmov d4, x6 //AES block 4k+4 - mov low
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	fmov v4.d[1], x7 //AES block 4k+4 - mov high
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	fmov d7, x23 //AES block 4k+3 - mov low
	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	add w12, w12, #1 //CTR block 4k+8
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
	fmov v7.d[1], x24 //AES block 4k+3 - mov high
	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	fmov d6, x21 //AES block 4k+6 - mov low
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
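	//The MODULO steps reduce the 256-bit GHASH accumulator modulo the
	//GCM polynomial x^128 + x^7 + x^2 + x + 1: 0xc2 shifted into the
	//top byte (shl d8, #56) is the reflected reduction constant, and
	//it is folded in twice - high into mid, then mid into low.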
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
	aese v0.16b, v29.16b //AES block 4k+4 - round 11
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
	eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
	fmov d0, x10 //CTR block 4k+8
	aese v1.16b, v29.16b //AES block 4k+5 - round 11
	fmov v0.d[1], x9 //CTR block 4k+8
	rev w9, w12 //CTR block 4k+9
	pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	fmov v6.d[1], x22 //AES block 4k+6 - mov high
	st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
	orr x9, x11, x9, lsl #32 //CTR block 4k+9
	eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
	add w12, w12, #1 //CTR block 4k+9
	fmov d1, x10 //CTR block 4k+9
	aese v2.16b, v29.16b //AES block 4k+6 - round 11
	fmov v1.d[1], x9 //CTR block 4k+9
	rev w9, w12 //CTR block 4k+10
	add w12, w12, #1 //CTR block 4k+10
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	orr x9, x11, x9, lsl #32 //CTR block 4k+10
	st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
	eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
	aese v3.16b, v29.16b //AES block 4k+7 - round 11
	eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
	fmov d2, x10 //CTR block 4k+10
	st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
	fmov v2.d[1], x9 //CTR block 4k+10
	rev w9, w12 //CTR block 4k+11
	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	orr x9, x11, x9, lsl #32 //CTR block 4k+11
	eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result
	st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result
	b.lt .L192_enc_main_loop
.L192_enc_prepretail: //PREPRETAIL
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
	fmov d3, x10 //CTR block 4k+3
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	add w12, w12, #1 //CTR block 4k+3
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
	fmov v3.d[1], x9 //CTR block 4k+3
	eor v4.16b, v4.16b, v11.16b //PRE 1
	mov d10, v17.d[1] //GHASH block 4k - mid
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
	rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d8, v4.d[1] //GHASH block 4k - mid
	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	mov d4, v5.d[1] //GHASH block 4k+1 - mid
	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
	mov d31, v6.d[1] //GHASH block 4k+2 - mid
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	mov d30, v7.d[1] //GHASH block 4k+3 - mid
	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	movi v8.8b, #0xc2
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v9.16b //karatsuba tidy up
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	shl d8, d8, #56 //mod_constant
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	eor v10.16b, v10.16b, v11.16b
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
	pmull v30.1q, v9.1d, v8.1d
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	ext v9.16b, v9.16b, v9.16b, #8
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v30.16b
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
	eor v10.16b, v10.16b, v9.16b
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
	pmull v30.1q, v10.1d, v8.1d
	ext v10.16b, v10.16b, v10.16b, #8
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	eor v11.16b, v11.16b, v30.16b
	aese v0.16b, v29.16b //AES block 4k+4 - round 11
	aese v3.16b, v29.16b //AES block 4k+7 - round 11
	aese v2.16b, v29.16b //AES block 4k+6 - round 11
	aese v1.16b, v29.16b //AES block 4k+5 - round 11
	eor v11.16b, v11.16b, v10.16b
.L192_enc_tail: //TAIL
	sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
	ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor x6, x6, x13 //AES block 4k+4 - round 12 low
	eor x7, x7, x14 //AES block 4k+4 - round 12 high
	fmov d4, x6 //AES block 4k+4 - mov low
	fmov v4.d[1], x7 //AES block 4k+4 - mov high
	cmp x5, #48
	eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result
	ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
	b.gt .L192_enc_blocks_more_than_3
	sub w12, w12, #1
	movi v10.8b, #0
	mov v3.16b, v2.16b
	movi v9.8b, #0
	cmp x5, #32
	mov v2.16b, v1.16b
	movi v11.8b, #0
	b.gt .L192_enc_blocks_more_than_2
	sub w12, w12, #1
	mov v3.16b, v1.16b
	cmp x5, #16
	b.gt .L192_enc_blocks_more_than_1
	sub w12, w12, #1
	b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3: //blocks left > 3
	st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
	ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b //GHASH final-3 block
	eor x6, x6, x13 //AES final-2 block - round 12 low
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	eor x7, x7, x14 //AES final-2 block - round 12 high
	fmov d5, x6 //AES final-2 block - mov low
	fmov v5.d[1], x7 //AES final-2 block - mov high
	mov d22, v4.d[1] //GHASH final-3 block - mid
	pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
	mov d10, v17.d[1] //GHASH final-3 block - mid
	eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
	movi v8.8b, #0 //suppress further partial tag feed in
	pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
	pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
	eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
.L192_enc_blocks_more_than_2: //blocks left > 2
	st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
	rev64 v4.16b, v5.16b //GHASH final-2 block
	ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	eor x7, x7, x14 //AES final-1 block - round 12 high
	pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
	mov d22, v4.d[1] //GHASH final-2 block - mid
	pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
	eor x6, x6, x13 //AES final-1 block - round 12 low
	fmov d5, x6 //AES final-1 block - mov low
	fmov v5.d[1], x7 //AES final-1 block - mov high
	eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
	eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
	eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
	pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
	movi v8.8b, #0 //suppress further partial tag feed in
	eor v5.16b, v5.16b, v2.16b //AES final-1 block - result
	eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
.L192_enc_blocks_more_than_1: //blocks left > 1
	st1 { v5.16b}, [x2], #16 //AES final-1 block - store result
	ldp x6, x7, [x0], #16 //AES final block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b //GHASH final-1 block
	eor x6, x6, x13 //AES final block - round 12 low
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
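	//Each tail block folds the running tag into its GHASH input exactly
	//once ("feed in partial tag"); v8 is zeroed immediately afterwards
	//so later tail blocks do not absorb it a second time.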
	movi v8.8b, #0 //suppress further partial tag feed in
	mov d22, v4.d[1] //GHASH final-1 block - mid
	eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
	eor x7, x7, x14 //AES final block - round 12 high
	fmov d5, x6 //AES final block - mov low
	pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
	fmov v5.d[1], x7 //AES final block - mov high
	ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
	eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
	pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
	pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
	eor v5.16b, v5.16b, v3.16b //AES final block - result
	eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
	eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
.L192_enc_blocks_less_than_1: //blocks left <= 1
	ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif
	and x1, x1, #127 //bit_length %= 128
	sub x1, x1, #128 //bit_length -= 128
	mvn x14, xzr //rk12_h = 0xffffffffffffffff
	neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
	mvn x13, xzr //rk12_l = 0xffffffffffffffff
	and x1, x1, #127 //bit_length %= 128
	lsr x14, x14, x1 //rk12_h is mask for top 64b of last block
	cmp x1, #64
	csel x6, x13, x14, lt
	csel x7, x14, xzr, lt
	fmov d0, x6 //ctr0b is mask for last block
	fmov v0.d[1], x7
	and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
	rev64 v4.16b, v5.16b //GHASH final block
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	mov d8, v4.d[1] //GHASH final block - mid
	pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
	pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
	eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
	eor v11.16b, v11.16b, v21.16b //GHASH final block - low
	eor v9.16b, v9.16b, v20.16b //GHASH final block - high
	pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
	eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
	movi v8.8b, #0xc2
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	shl d8, d8, #56 //mod_constant
	bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
	pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
	str w9, [x16, #12] //store the updated counter
	st1 { v5.16b}, [x2] //store all 16B
	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15
	st1 { v11.16b }, [x3]
	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret
.L192_enc_ret:
	mov w0, #0x0
	ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
.globl aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
	cbz x1, .L192_dec_ret
	stp x19, x20, [sp, #-112]!
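	//The decrypt kernel mirrors the encrypt kernel above, except that
	//GHASH must be computed over the incoming ciphertext: ciphertext
	//vectors are kept whole for rev64/pmull, and plaintext leaves via
	//general-purpose registers where the last round key (rk12 in
	//x13/x14) is applied.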
	mov x16, x4
	mov x8, x5
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]
	add x4, x0, x1, lsr #3 //end_input_ptr
	ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	ldp x13, x14, [x8, #192] //load rk12
#ifdef __AARCH64EB__
	ror x13, x13, #32
	ror x14, x14, #32
#endif
	ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
	ld1 {v18.4s}, [x8], #16 //load rk0
	lsr x5, x1, #3 //byte_len
	mov x15, x5
	ld1 {v19.4s}, [x8], #16 //load rk1
	lsr x12, x11, #32
	orr w11, w11, w11
	fmov d3, x10 //CTR block 3
	rev w12, w12 //rev_ctr32
	fmov d1, x10 //CTR block 1
	add w12, w12, #1 //increment rev_ctr32
	ld1 {v20.4s}, [x8], #16 //load rk2
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 0
	rev w9, w12 //CTR block 1
	add w12, w12, #1 //CTR block 1
	orr x9, x11, x9, lsl #32 //CTR block 1
	ld1 {v21.4s}, [x8], #16 //load rk3
	fmov v1.d[1], x9 //CTR block 1
	rev w9, w12 //CTR block 2
	add w12, w12, #1 //CTR block 2
	fmov d2, x10 //CTR block 2
	orr x9, x11, x9, lsl #32 //CTR block 2
	fmov v2.d[1], x9 //CTR block 2
	rev w9, w12 //CTR block 3
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 1
	orr x9, x11, x9, lsl #32 //CTR block 3
	fmov v3.d[1], x9 //CTR block 3
	ld1 {v22.4s}, [x8], #16 //load rk4
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 2
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 0
	ld1 {v23.4s}, [x8], #16 //load rk5
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 0
	ldr q15, [x3, #112] //load h4l | h4h
#ifndef __AARCH64EB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 0
	ldr q13, [x3, #64] //load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 1
	ldr q14, [x3, #80] //load h3l | h3h
#ifndef __AARCH64EB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 1
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 1
	ldr q12, [x3, #32] //load h1l | h1h
#ifndef __AARCH64EB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 2
	ld1 {v24.4s}, [x8], #16 //load rk6
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 3
	ld1 {v25.4s}, [x8], #16 //load rk7
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 2
	ld1 {v26.4s}, [x8], #16 //load rk8
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 2
	ld1 {v27.4s}, [x8], #16 //load rk9
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 3
	ld1 { v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 3
	add w12, w12, #1 //CTR block 3
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 3
	trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 4
	ld1 {v28.4s}, [x8], #16 //load rk10
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 4
	trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 4
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 4
	trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 5
	ld1 {v29.4s}, [x8], #16 //load rk11
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 5
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 5
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 5
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 6
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 6
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 6
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 7
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 7
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 7
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 6
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 8
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 7
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 9
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 9
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 8
	sub x5, x5, #1 //byte_len - 1
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 8
	and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 10
	add x5, x5, x0
	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 9
	cmp x0, x5 //check if we have <= 4 blocks
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 9
	trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
	aese v3.16b, v29.16b //AES block 3 - round 11
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 10
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 10
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 10
	eor v16.16b, v16.16b, v8.16b //h2k | h1k
	aese v2.16b, v29.16b //AES block 2 - round 11
	aese v1.16b, v29.16b //AES block 1 - round 11
	eor v17.16b, v17.16b, v9.16b //h4k | h3k
	aese v0.16b, v29.16b //AES block 0 - round 11
	b.ge .L192_dec_tail //handle tail
	ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
	eor v1.16b, v5.16b, v1.16b //AES block 1 - result
	eor v0.16b, v4.16b, v0.16b //AES block 0 - result
	rev w9, w12 //CTR block 4
	ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext
	mov x19, v1.d[0] //AES block 1 - mov low
	mov x20, v1.d[1] //AES block 1 - mov high
	mov x6, v0.d[0] //AES block 0 - mov low
	orr x9, x11, x9, lsl #32 //CTR block 4
	add w12, w12, #1 //CTR block 4
	mov x7, v0.d[1] //AES block 0 - mov high
	rev64 v4.16b, v4.16b //GHASH block 0
	fmov d0, x10 //CTR block 4
	rev64 v5.16b, v5.16b //GHASH block 1
	cmp x0, x5 //check if we have <= 8 blocks
	eor x19, x19, x13 //AES block 1 - round 12 low
#ifdef __AARCH64EB__
	rev x19, x19
#endif
	fmov v0.d[1], x9 //CTR block 4
	rev w9, w12 //CTR block 5
	orr x9, x11, x9, lsl #32 //CTR block 5
	fmov d1, x10 //CTR block 5
	eor x20, x20, x14 //AES block 1 - round 12 high
#ifdef __AARCH64EB__
	rev x20, x20
#endif
	add w12, w12, #1 //CTR block 5
	fmov v1.d[1], x9 //CTR block 5
	eor x6, x6, x13 //AES block 0 - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	rev w9, w12 //CTR block 6
	eor x7, x7, x14 //AES block 0 - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
	stp x6, x7, [x2], #16 //AES block 0 - store result
	orr x9, x11, x9, lsl #32 //CTR block 6
	stp x19, x20, [x2], #16 //AES block 1 - store result
	add w12, w12, #1 //CTR block 6
	eor v2.16b, v6.16b, v2.16b //AES block 2 - result
	b.ge .L192_dec_prepretail //do prepretail
.L192_dec_main_loop: //main loop start
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	mov x21, v2.d[0] //AES block 4k+2 - mov low
	mov x22, v2.d[1] //AES block 4k+2 - mov high
	eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
	rev64 v7.16b, v7.16b //GHASH block 4k+3
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
	fmov d2, x10 //CTR block 4k+6
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	eor v4.16b, v4.16b, v11.16b //PRE 1
	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
	fmov v2.d[1], x9 //CTR block 4k+6
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	mov x24, v3.d[1] //AES block 4k+3 - mov high
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
	mov x23, v3.d[0] //AES block 4k+3 - mov low
	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
	fmov d3, x10 //CTR block 4k+7
	mov d8, v4.d[1] //GHASH block 4k - mid
	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d10, v17.d[1] //GHASH block 4k - mid
	rev w9, w12 //CTR block 4k+7
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
	orr x9, x11, x9, lsl #32 //CTR block 4k+7
	fmov v3.d[1], x9 //CTR block 4k+7
	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	mov d4, v5.d[1] //GHASH block 4k+1 - mid
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
	eor x22, x22, x14 //AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
	rev x22, x22
#endif
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	rev64 v6.16b, v6.16b //GHASH block 4k+2
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
	eor x21, x21, x13 //AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
	rev x21, x21
#endif
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
	mov d31, v6.d[1] //GHASH block 4k+2 - mid
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
	mov d30, v7.d[1] //GHASH block 4k+3 - mid
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
	movi v8.8b, #0xc2
	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
	shl d8, d8, #56 //mod_constant
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
	eor x23, x23, x13 //AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
	rev x23, x23
#endif
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	aese v0.16b, v29.16b //AES block 4k+4 - round 11
	add w12, w12, #1 //CTR block 4k+7
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
	ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
	aese v1.16b, v29.16b //AES block 4k+5 - round 11
	ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
	rev w9, w12 //CTR block 4k+8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
	stp x21, x22, [x2], #16 //AES block 4k+2 - store result
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
	cmp x0, x5 //.LOOP CONTROL
	eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
	eor x24, x24, x14 //AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
	rev x24, x24
#endif
	eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
	orr x9, x11, x9, lsl #32 //CTR block 4k+8
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
	pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	mov x19, v1.d[0] //AES block 4k+5 - mov low
	mov x6, v0.d[0] //AES block 4k+4 - mov low
	stp x23, x24, [x2], #16 //AES block 4k+3 - store result
	rev64 v5.16b, v5.16b //GHASH block 4k+5
	aese v2.16b, v29.16b //AES block 4k+6 - round 11
	mov x7, v0.d[1] //AES block 4k+4 - mov high
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
	mov x20, v1.d[1] //AES block 4k+5 - mov high
	fmov d0, x10 //CTR block 4k+8
	add w12, w12, #1 //CTR block 4k+8
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
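	//Decrypted blocks are moved to x-register pairs (mov ... v0.d[0/1])
	//and stored with stp rather than st1; this is presumably so the
	//final round-key XOR and the optional __AARCH64EB__ byte reversal
	//can both be done cheaply on the scalar side.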
	fmov v0.d[1], x9 //CTR block 4k+8
	rev w9, w12 //CTR block 4k+9
	eor x6, x6, x13 //AES block 4k+4 - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	orr x9, x11, x9, lsl #32 //CTR block 4k+9
	eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
	fmov d1, x10 //CTR block 4k+9
	add w12, w12, #1 //CTR block 4k+9
	eor x19, x19, x13 //AES block 4k+5 - round 12 low
#ifdef __AARCH64EB__
	rev x19, x19
#endif
	fmov v1.d[1], x9 //CTR block 4k+9
	rev w9, w12 //CTR block 4k+10
	eor x20, x20, x14 //AES block 4k+5 - round 12 high
#ifdef __AARCH64EB__
	rev x20, x20
#endif
	eor x7, x7, x14 //AES block 4k+4 - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
	stp x6, x7, [x2], #16 //AES block 4k+4 - store result
	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	add w12, w12, #1 //CTR block 4k+10
	rev64 v4.16b, v4.16b //GHASH block 4k+4
	orr x9, x11, x9, lsl #32 //CTR block 4k+10
	aese v3.16b, v29.16b //AES block 4k+7 - round 11
	stp x19, x20, [x2], #16 //AES block 4k+5 - store result
	b.lt .L192_dec_main_loop
.L192_dec_prepretail: //PREPRETAIL
	mov x22, v2.d[1] //AES block 4k+2 - mov high
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	mov x21, v2.d[0] //AES block 4k+2 - mov low
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	mov d10, v17.d[1] //GHASH block 4k - mid
	eor v4.16b, v4.16b, v11.16b //PRE 1
	fmov d2, x10 //CTR block 4k+6
	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
	mov x23, v3.d[0] //AES block 4k+3 - mov low
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
	mov x24, v3.d[1] //AES block 4k+3 - mov high
	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d8, v4.d[1] //GHASH block 4k - mid
	fmov d3, x10 //CTR block 4k+7
	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	rev64 v6.16b, v6.16b //GHASH block 4k+2
	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
	fmov v2.d[1], x9 //CTR block 4k+6
	rev w9, w12 //CTR block 4k+7
	orr x9, x11, x9, lsl #32 //CTR block 4k+7
	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	mov d4, v5.d[1] //GHASH block 4k+1 - mid
	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	eor x24, x24, x14 //AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
	rev x24, x24
#endif
	fmov v3.d[1], x9 //CTR block 4k+7
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
	eor x21, x21, x13 //AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
	rev x21, x21
#endif
	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
	eor x22, x22, x14 //AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
	rev x22, x22
#endif
	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	eor x23, x23, x13 //AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
	rev x23, x23
#endif
	stp x21, x22, [x2], #16 //AES block 4k+2 - store result
	rev64 v7.16b, v7.16b //GHASH block 4k+3
	stp x23, x24, [x2], #16 //AES block 4k+3 - store result
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high
	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
	add w12, w12, #1 //CTR block 4k+7
	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
	mov d31, v6.d[1] //GHASH block 4k+2 - mid
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid
	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
	mov d30, v7.d[1] //GHASH block 4k+3 - mid
	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid
	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low
	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	movi v8.8b, #0xc2
	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
	shl d8, d8, #56 //mod_constant
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid
	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
	pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	aese v0.16b, v29.16b
	eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
	aese v2.16b, v29.16b
	aese v1.16b, v29.16b
	aese v3.16b, v29.16b
	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
.L192_dec_tail: //TAIL
	sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
	ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
	eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result
	mov x7, v0.d[1] //AES block 4k+4 - mov high
	mov x6, v0.d[0] //AES block 4k+4 - mov low
	ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
	cmp x5, #48
	eor x7, x7, x14 //AES block 4k+4 - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
	eor x6, x6, x13 //AES block 4k+4 - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	b.gt .L192_dec_blocks_more_than_3
	movi v11.8b, #0
	movi v9.8b, #0
	mov v3.16b, v2.16b
	mov v2.16b, v1.16b
	sub w12, w12, #1
	movi v10.8b, #0
	cmp x5, #32
	b.gt .L192_dec_blocks_more_than_2
	mov v3.16b, v1.16b
	cmp x5, #16
	sub w12, w12, #1
	b.gt .L192_dec_blocks_more_than_1
	sub w12, w12, #1
	b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3: //blocks left > 3
	rev64 v4.16b, v5.16b //GHASH final-3 block
	ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext
	stp x6, x7, [x2], #16 //AES final-3 block - store result
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	eor v0.16b, v5.16b, v1.16b //AES final-2 block - result
	pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
	mov x6, v0.d[0] //AES final-2 block - mov low
	mov d22, v4.d[1] //GHASH final-3 block - mid
	mov x7, v0.d[1] //AES final-2 block - mov high
	mov d10, v17.d[1] //GHASH final-3 block - mid
	eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
	pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
	eor x6, x6, x13 //AES final-2 block - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	movi v8.8b, #0 //suppress further partial tag feed in
	pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
	eor x7, x7, x14 //AES final-2 block - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
.L192_dec_blocks_more_than_2: //blocks left > 2
	rev64 v4.16b, v5.16b //GHASH final-2 block
	ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	movi v8.8b, #0 //suppress further partial tag feed in
	eor v0.16b, v5.16b, v2.16b //AES final-1 block - result
	mov d22, v4.d[1] //GHASH final-2 block - mid
	pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
	stp x6, x7, [x2], #16 //AES final-2 block - store result
	eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
	mov x7, v0.d[1] //AES final-1 block - mov high
	eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
	mov x6, v0.d[0] //AES final-1 block - mov low
	pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
	pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
	eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
	eor x7, x7, x14 //AES final-1 block - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
	eor x6, x6, x13 //AES final-1 block - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
.L192_dec_blocks_more_than_1: //blocks left > 1
	rev64 v4.16b, v5.16b //GHASH final-1 block
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext
	mov d22, v4.d[1] //GHASH final-1 block - mid
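	//The final-block path below masks the possibly partial last block:
	//a byte mask is built from bit_length % 128, the unused tail is
	//zeroed before the block enters GHASH, and the existing bytes at
	//the destination are merged back so nothing past the message end
	//is overwritten.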
	pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
	eor v0.16b, v5.16b, v3.16b //AES final block - result
	stp x6, x7, [x2], #16 //AES final-1 block - store result
	eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
	eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
	pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
	mov x7, v0.d[1] //AES final block - mov high
	ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
	mov x6, v0.d[0] //AES final block - mov low
	pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
	movi v8.8b, #0 //suppress further partial tag feed in
	eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
	eor x7, x7, x14 //AES final block - round 12 high
#ifdef __AARCH64EB__
	rev x7, x7
#endif
	eor x6, x6, x13 //AES final block - round 12 low
#ifdef __AARCH64EB__
	rev x6, x6
#endif
	eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
.L192_dec_blocks_less_than_1: //blocks left <= 1
	mvn x13, xzr //rk12_l = 0xffffffffffffffff
	ldp x4, x5, [x2] //load existing bytes we need to not overwrite
	and x1, x1, #127 //bit_length %= 128
	sub x1, x1, #128 //bit_length -= 128
	neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
	and x1, x1, #127 //bit_length %= 128
	mvn x14, xzr //rk12_h = 0xffffffffffffffff
	lsr x14, x14, x1 //rk12_h is mask for top 64b of last block
	cmp x1, #64
	csel x9, x13, x14, lt
	csel x10, x14, xzr, lt
	fmov d0, x9 //ctr0b is mask for last block
	and x6, x6, x9
	bic x4, x4, x9 //mask out low existing bytes
	orr x6, x6, x4
	mov v0.d[1], x10
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif
	and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
	str w9, [x16, #12] //store the updated counter
	rev64 v4.16b, v5.16b //GHASH final block
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	bic x5, x5, x10 //mask out high existing bytes
	and x7, x7, x10
	pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
	mov d8, v4.d[1] //GHASH final block - mid
	pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
	eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
	eor v9.16b, v9.16b, v20.16b //GHASH final block - high
	pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
	eor v11.16b, v11.16b, v21.16b //GHASH final block - low
	eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
	movi v8.8b, #0xc2
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	shl d8, d8, #56 //mod_constant
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up
	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	orr x7, x7, x5
	stp x6, x7, [x2]
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
	pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15
	st1 { v11.16b }, [x3]
	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret
.L192_dec_ret:
	mov w0, #0x0
	ret
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
.globl aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
	cbz x1, .L256_enc_ret
	stp x19, x20, [sp, #-112]!
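	//AES-256 variant: 14 rounds, so rk0-rk13 are applied with
	//aese/aesmc (the final aese with rk13 has no aesmc) and the last
	//round key rk14 is loaded into x13/x14 and XORed in as scalar
	//values when blocks enter and leave the vector lanes.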
mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] add x4, x0, x1, lsr #3 //end_input_ptr lsr x5, x1, #3 //byte_len mov x15, x5 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif ldp x13, x14, [x8, #224] //load rk14 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 //byte_len - 1 ld1 {v18.4s}, [x8], #16 //load rk0 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 {v19.4s}, [x8], #16 //load rk1 add x5, x5, x0 lsr x12, x11, #32 fmov d2, x10 //CTR block 2 orr w11, w11, w11 rev w12, w12 //rev_ctr32 cmp x0, x5 //check if we have <= 4 blocks fmov d1, x10 //CTR block 1 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 add w12, w12, #1 //increment rev_ctr32 rev w9, w12 //CTR block 1 fmov d3, x10 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 1 add w12, w12, #1 //CTR block 1 ld1 {v20.4s}, [x8], #16 //load rk2 fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 add w12, w12, #1 //CTR block 2 orr x9, x11, x9, lsl #32 //CTR block 2 ld1 {v21.4s}, [x8], #16 //load rk3 fmov v2.d[1], x9 //CTR block 2 rev w9, w12 //CTR block 3 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 orr x9, x11, x9, lsl #32 //CTR block 3 fmov v3.d[1], x9 //CTR block 3 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ld1 {v22.4s}, [x8], #16 //load rk4 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 ld1 {v23.4s}, [x8], #16 //load rk5 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 ld1 {v24.4s}, [x8], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 ldr q14, [x3, #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 ld1 {v25.4s}, [x8], #16 //load rk7 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 ldr q13, [x3, #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 ld1 {v27.4s}, [x8], #16 //load rk9 aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 ldr q15, [x3, #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 ld1 {v28.4s}, [x8], #16 //load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 ld1 {v29.4s}, [x8], #16 //load rk11 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 add w12, w12, #1 //CTR block 3 aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 ld1 { v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v3.16b, v22.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v1.16b, v23.16b aesmc v1.16b, v1.16b 
//AES block 1 - round 5 aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 ld1 {v30.4s}, [x8], #16 //load rk12 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 ldr q12, [x3, #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 ld1 {v31.4s}, [x8], #16 //load rk13 aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h aese v0.16b, v25.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v3.16b, v25.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 10 aese v1.16b, v29.16b aesmc v1.16b, v1.16b //AES block 1 - round 11 aese v2.16b, v29.16b aesmc v2.16b, v2.16b //AES block 2 - round 11 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 10 aese v1.16b, v30.16b aesmc v1.16b, v1.16b //AES block 1 - round 12 aese v2.16b, v30.16b aesmc v2.16b, v2.16b //AES block 2 - round 12 aese v0.16b, v29.16b aesmc v0.16b, v0.16b //AES block 0 - round 11 eor v17.16b, v17.16b, v9.16b //h4k | h3k aese v3.16b, v29.16b aesmc v3.16b, v3.16b //AES block 3 - round 11 aese v2.16b, v31.16b //AES block 2 - round 13 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h aese v0.16b, v30.16b aesmc v0.16b, v0.16b //AES block 0 - round 12 aese v3.16b, v30.16b aesmc v3.16b, v3.16b //AES block 3 - round 12 aese v1.16b, v31.16b //AES block 1 - round 13 aese v0.16b, v31.16b //AES block 0 - round 13 aese v3.16b, v31.16b //AES block 3 - round 13 eor v16.16b, v16.16b, v8.16b //h2k | h1k b.ge .L256_enc_tail //handle tail ldp x19, x20, [x0, #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif rev w9, w12 //CTR block 4 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif ldp x21, x22, [x0, #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif add x0, x0, #64 //AES input_ptr update eor x19, x19, x13 //AES block 1 - round 14 low eor x20, x20, x14 //AES block 1 - round 14 high fmov d5, x19 //AES block 1 - mov low eor x6, x6, x13 //AES block 0 - round 14 low eor x7, x7, x14 //AES block 0 - round 14 high eor x24, x24, x14 //AES block 3 - round 14 high fmov d4, x6 //AES block 0 - mov low cmp x0, x5 //check if we have <= 8 blocks fmov v4.d[1], x7 //AES block 0 - mov high eor x23, x23, x13 //AES 
block 3 - round 14 low eor x21, x21, x13 //AES block 2 - round 14 low fmov v5.d[1], x20 //AES block 1 - mov high fmov d6, x21 //AES block 2 - mov low add w12, w12, #1 //CTR block 4 orr x9, x11, x9, lsl #32 //CTR block 4 fmov d7, x23 //AES block 3 - mov low eor x22, x22, x14 //AES block 2 - round 14 high fmov v6.d[1], x22 //AES block 2 - mov high eor v4.16b, v4.16b, v0.16b //AES block 0 - result fmov d0, x10 //CTR block 4 fmov v0.d[1], x9 //CTR block 4 rev w9, w12 //CTR block 5 add w12, w12, #1 //CTR block 5 eor v5.16b, v5.16b, v1.16b //AES block 1 - result fmov d1, x10 //CTR block 5 orr x9, x11, x9, lsl #32 //CTR block 5 fmov v1.d[1], x9 //CTR block 5 rev w9, w12 //CTR block 6 st1 { v4.16b}, [x2], #16 //AES block 0 - store result fmov v7.d[1], x24 //AES block 3 - mov high orr x9, x11, x9, lsl #32 //CTR block 6 eor v6.16b, v6.16b, v2.16b //AES block 2 - result st1 { v5.16b}, [x2], #16 //AES block 1 - store result add w12, w12, #1 //CTR block 6 fmov d2, x10 //CTR block 6 fmov v2.d[1], x9 //CTR block 6 st1 { v6.16b}, [x2], #16 //AES block 2 - store result rev w9, w12 //CTR block 7 orr x9, x11, x9, lsl #32 //CTR block 7 eor v7.16b, v7.16b, v3.16b //AES block 3 - result st1 { v7.16b}, [x2], #16 //AES block 3 - store result b.ge .L256_enc_prepretail //do prepretail .L256_enc_main_loop: //main loop start aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 fmov d3, x10 //CTR block 4k+3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 fmov v3.d[1], x9 //CTR block 4k+3 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 eor v4.16b, v4.16b, v11.16b //PRE 1 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 eor x23, x23, x13 //AES block 4k+7 - round 14 low aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 mov d10, v17.d[1] //GHASH block 4k - mid pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high eor x22, x22, x14 //AES block 4k+6 - round 14 high mov d8, v4.d[1] //GHASH block 4k - mid aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high mov d4, v5.d[1] //GHASH block 4k+1 - mid aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 aese v3.16b, 
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
mov d8, v6.d[1] //GHASH block 4k+2 - mid
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
aese v2.16b, v22.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
aese v3.16b, v22.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
aese v0.16b, v25.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
rev x19, x19
rev x20, x20
#endif
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
mov d4, v7.d[1] //GHASH block 4k+3 - mid
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
eor x19, x19, x13 //AES block 4k+5 - round 14 low
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
aese v3.16b, v25.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
eor x21, x21, x13 //AES block 4k+6 - round 14 low
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
movi v8.8b, #0xc2
pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
fmov d5, x19 //AES block 4k+5 - mov low
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev x6, x6
rev x7, x7
#endif
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
shl d8, d8, #56 //mod_constant
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
add w12, w12, #1 //CTR block 4k+3
aese v0.16b, v29.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
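// Editorial note (added): the MODULO steps interleaved below reduce the
// 256-bit Karatsuba result (v9:v10:v11 = high:mid:low) back to 128 bits.
// The constant built by movi/shl (0xc2 << 56) is the bit-reflected encoding
// of the GCM polynomial x^128 + x^7 + x^2 + x + 1; a sketch of the fold:
//   mid ^= high ^ low                      (karatsuba tidy up)
//   mid ^= pmull(high_lo, C) ^ swap(high)  (fold top half into mid)
//   low ^= pmull(mid_lo, C)  ^ swap(mid)   (fold mid into low)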
add x0, x0, #64 //AES input_ptr update
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
rev w9, w12 //CTR block 4k+8
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
eor x6, x6, x13 //AES block 4k+4 - round 14 low
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
eor x7, x7, x14 //AES block 4k+4 - round 14 high
fmov d4, x6 //AES block 4k+4 - mov low
orr x9, x11, x9, lsl #32 //CTR block 4k+8
eor v7.16b, v9.16b, v7.16b //MODULO - fold into mid
aese v0.16b, v30.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
eor x20, x20, x14 //AES block 4k+5 - round 14 high
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
eor x24, x24, x14 //AES block 4k+7 - round 14 high
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
add w12, w12, #1 //CTR block 4k+8
aese v0.16b, v31.16b //AES block 4k+4 - round 13
fmov v4.d[1], x7 //AES block 4k+4 - mov high
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
fmov d7, x23 //AES block 4k+7 - mov low
aese v1.16b, v31.16b //AES block 4k+5 - round 13
fmov v5.d[1], x20 //AES block 4k+5 - mov high
fmov d6, x21 //AES block 4k+6 - mov low
cmp x0, x5 //.LOOP CONTROL
fmov v6.d[1], x22 //AES block 4k+6 - mov high
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
fmov d0, x10 //CTR block 4k+8
fmov v0.d[1], x9 //CTR block 4k+8
rev w9, w12 //CTR block 4k+9
add w12, w12, #1 //CTR block 4k+9
eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
fmov d1, x10 //CTR block 4k+9
orr x9, x11, x9, lsl #32 //CTR block 4k+9
aese v3.16b, v30.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
fmov v1.d[1], x9 //CTR block 4k+9
aese v2.16b, v31.16b //AES block 4k+6 - round 13
rev w9, w12 //CTR block 4k+10
st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result
orr x9, x11, x9, lsl #32 //CTR block 4k+10
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
fmov v7.d[1], x24 //AES block 4k+7 - mov high
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
add w12, w12, #1 //CTR block 4k+10
aese v3.16b, v31.16b //AES block 4k+7 - round 13
eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
fmov d2, x10 //CTR block 4k+10
st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
fmov v2.d[1], x9 //CTR block 4k+10
rev w9, w12 //CTR block 4k+11
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
orr x9, x11, x9, lsl #32 //CTR block 4k+11
eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result
st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result
b.lt .L256_enc_main_loop
.L256_enc_prepretail: //PREPRETAIL
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
fmov d3, x10 //CTR block 4k+3
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)
fmov v3.d[1], x9 //CTR block 4k+3
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
eor v4.16b, v4.16b, v11.16b //PRE 1
rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)
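// Editorial note (added): .L256_enc_prepretail folds the last four blocks of
// ciphertext into the tag while running the AES rounds for the next four
// counter blocks, but loads no new plaintext and stores nothing; the tail
// code then consumes the pre-encrypted counter blocks v0-v3. PRE 0/PRE 1
// above rotate the running tag (v11) into the same orientation as the
// rev64'd data and XOR it into the first GHASH block.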
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
mov d10, v17.d[1] //GHASH block 4k - mid
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
mov d8, v4.d[1] //GHASH block 4k - mid
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
mov d4, v5.d[1] //GHASH block 4k+1 - mid
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
mov d8, v6.d[1] //GHASH block 4k+2 - mid
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
aese v3.16b, v22.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
add w12, w12, #1 //CTR block 4k+3
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
aese v2.16b, v22.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
mov d4, v7.d[1] //GHASH block 4k+3 - mid
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
eor v4.8b, v4.8b, v7.8b //GHASH block 4k+3 - mid
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
pmull v4.1q, v4.1d, v16.1d //GHASH block 4k+3 - mid
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
movi v8.8b, #0xc2
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
aese v0.16b, v25.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
aese v3.16b, v25.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
shl d8, d8, #56 //mod_constant
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+3 - mid
pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
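// Editorial note (added): v12-v15 hold H^1..H^4, loaded from the Htable at
// x3 as hl|hh pairs, while v16/v17 hold the precomputed Karatsuba "keys"
// (h2k|h1k and h4k|h3k - the XOR of each power's two 64-bit halves) built
// with trn1/trn2/eor at kernel entry and used by the "mid" multiplies.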
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
eor v10.16b, v10.16b, v9.16b //karatsuba tidy up
pmull v4.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
eor v10.16b, v10.16b, v11.16b //MODULO - karatsuba tidy up
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
eor v10.16b, v10.16b, v4.16b //MODULO - fold into mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
aese v0.16b, v29.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
aese v0.16b, v30.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
pmull v4.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
aese v3.16b, v30.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
aese v1.16b, v31.16b //AES block 4k+5 - round 13
eor v11.16b, v11.16b, v4.16b //MODULO - fold into low
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
aese v3.16b, v31.16b //AES block 4k+7 - round 13
aese v0.16b, v31.16b //AES block 4k+4 - round 13
aese v2.16b, v31.16b //AES block 4k+6 - round 13
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
.L256_enc_tail: //TAIL
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev x6, x6
rev x7, x7
#endif
eor x6, x6, x13 //AES block 4k+4 - round 14 low
eor x7, x7, x14 //AES block 4k+4 - round 14 high
cmp x5, #48
fmov d4, x6 //AES block 4k+4 - mov low
fmov v4.d[1], x7 //AES block 4k+4 - mov high
eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result
b.gt .L256_enc_blocks_more_than_3
cmp x5, #32
mov v3.16b, v2.16b
movi v11.8b, #0
movi v9.8b, #0
sub w12, w12, #1
mov v2.16b, v1.16b
movi v10.8b, #0
b.gt .L256_enc_blocks_more_than_2
mov v3.16b, v1.16b
sub w12, w12, #1
cmp x5, #16
b.gt .L256_enc_blocks_more_than_1
sub w12, w12, #1
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3: //blocks left > 3
st1 { v5.16b}, [x2], #16 //AES final-3 block - store result
ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
rev x6, x6
rev x7, x7
#endif
rev64 v4.16b, v5.16b //GHASH final-3 block
eor x6, x6, x13 //AES final-2 block - round 14 low
eor v4.16b, v4.16b, v8.16b //feed in partial tag
eor x7, x7, x14 //AES final-2 block - round 14 high
mov d22, v4.d[1] //GHASH final-3 block - mid
fmov d5, x6 //AES final-2 block - mov low
fmov v5.d[1], x7 //AES final-2 block - mov high
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
movi v8.8b, #0 //suppress further partial tag feed in
mov d10, v17.d[1] //GHASH final-3 block - mid
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
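// Editorial note (added): the tail handles the remaining 1-4 blocks one at a
// time. "final-3" means three more full blocks follow, so this block is
// multiplied by H^4 (v15), the next by H^3, and so on down to H for the last
// block; the partial tag is XORed in once ("feed in partial tag") and v8 is
// then zeroed so it is not fed in again for later blocks.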
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
.L256_enc_blocks_more_than_2: //blocks left > 2
st1 { v5.16b}, [x2], #16 //AES final-2 block - store result
ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
rev x6, x6
rev x7, x7
#endif
rev64 v4.16b, v5.16b //GHASH final-2 block
eor x6, x6, x13 //AES final-1 block - round 14 low
eor v4.16b, v4.16b, v8.16b //feed in partial tag
fmov d5, x6 //AES final-1 block - mov low
eor x7, x7, x14 //AES final-1 block - round 14 high
fmov v5.d[1], x7 //AES final-1 block - mov high
movi v8.8b, #0 //suppress further partial tag feed in
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
mov d22, v4.d[1] //GHASH final-2 block - mid
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
eor v5.16b, v5.16b, v2.16b //AES final-1 block - result
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
.L256_enc_blocks_more_than_1: //blocks left > 1
st1 { v5.16b}, [x2], #16 //AES final-1 block - store result
rev64 v4.16b, v5.16b //GHASH final-1 block
ldp x6, x7, [x0], #16 //AES final block - load input low & high
#ifdef __AARCH64EB__
rev x6, x6
rev x7, x7
#endif
eor v4.16b, v4.16b, v8.16b //feed in partial tag
movi v8.8b, #0 //suppress further partial tag feed in
eor x6, x6, x13 //AES final block - round 14 low
mov d22, v4.d[1] //GHASH final-1 block - mid
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
eor x7, x7, x14 //AES final block - round 14 high
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
fmov d5, x6 //AES final block - mov low
fmov v5.d[1], x7 //AES final block - mov high
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
eor v5.16b, v5.16b, v3.16b //AES final block - result
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
.L256_enc_blocks_less_than_1: //blocks left <= 1
and x1, x1, #127 //bit_length %= 128
mvn x13, xzr //rk14_l = 0xffffffffffffffff
sub x1, x1, #128 //bit_length -= 128
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
mvn x14, xzr //rk14_h = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128
lsr x14, x14, x1 //rk14_h is mask for top 64b of last block
cmp x1, #64
csel x6, x13, x14, lt //mask for low 64b of last block
csel x7, x14, xzr, lt //mask for high 64b of last block
fmov d0, x6 //ctr0b is mask for last block
fmov v0.d[1], x7
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v4.16b, v5.16b //GHASH final block
eor v4.16b, v4.16b, v8.16b //feed in partial tag
bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
mov d8, v4.d[1] //GHASH final block - mid
#ifndef __AARCH64EB__
rev w9, w12
#else
mov w9, w12
#endif
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
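// Editorial note (added): the csel pair above builds a 128-bit byte mask for
// the possibly partial last block: when fewer than 64 bits are missing, the
// low lane stays all-ones and the high lane keeps only the valid bits;
// otherwise the low lane takes the partial mask and the high lane is zero.
// bif later merges the untouched destination bytes back in before the store.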
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
movi v8.8b, #0xc2
eor v4.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
shl d8, d8, #56 //mod_constant
eor v10.16b, v10.16b, v4.16b //MODULO - karatsuba tidy up
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
str w9, [x16, #12] //store the updated counter
st1 { v5.16b}, [x2] //store all 16B
eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8 //swap tag halves back to storage order
rev64 v11.16b, v11.16b //undo the byte-reversal applied at load
mov x0, x15 //return byte_len
st1 { v11.16b }, [x3] //store updated tag
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
.globl aes_gcm_dec_256_kernel
.type aes_gcm_dec_256_kernel,%function
.align 4
aes_gcm_dec_256_kernel:
cbz x1, .L256_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr x5, x1, #3 //byte_len
mov x15, x5
ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
rev x10, x10
rev x11, x11
#endif
ldp x13, x14, [x8, #224] //load rk14
#ifdef __AARCH64EB__
ror x14, x14, #32
ror x13, x13, #32
#endif
ld1 {v18.4s}, [x8], #16 //load rk0
sub x5, x5, #1 //byte_len - 1
ld1 {v19.4s}, [x8], #16 //load rk1
and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add x4, x0, x1, lsr #3 //end_input_ptr
ld1 {v20.4s}, [x8], #16 //load rk2
lsr x12, x11, #32
ld1 {v21.4s}, [x8], #16 //load rk3
orr w11, w11, w11 //ensure top 32 bits of x11 are zero
ld1 {v22.4s}, [x8], #16 //load rk4
add x5, x5, x0
rev w12, w12 //rev_ctr32
add w12, w12, #1 //increment rev_ctr32
fmov d3, x10 //CTR block 3
rev w9, w12 //CTR block 1
add w12, w12, #1 //CTR block 1
fmov d1, x10 //CTR block 1
orr x9, x11, x9, lsl #32 //CTR block 1
ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible
fmov v1.d[1], x9 //CTR block 1
rev w9, w12 //CTR block 2
add w12, w12, #1 //CTR block 2
fmov d2, x10 //CTR block 2
orr x9, x11, x9, lsl #32 //CTR block 2
fmov v2.d[1], x9 //CTR block 2
rev w9, w12 //CTR block 3
orr x9, x11, x9, lsl #32 //CTR block 3
ld1 {v23.4s}, [x8], #16 //load rk5
fmov v3.d[1], x9 //CTR block 3
add w12, w12, #1 //CTR block 3
ld1 {v24.4s}, [x8], #16 //load rk6
ld1 {v25.4s}, [x8], #16 //load rk7
ld1 {v26.4s}, [x8], #16 //load rk8
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
ldr q14, [x3, #80] //load h3l | h3h
#ifndef __AARCH64EB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
ldr q15, [x3, #112] //load h4l | h4h
#ifndef __AARCH64EB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
ldr q13, [x3, #64] //load h2l | h2h
#ifndef __AARCH64EB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
ld1 {v27.4s}, [x8], #16 //load rk9
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
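// Editorial note (added): decryption drives the identical CTR keystream
// pipeline (aese/aesmc with the encryption key schedule); only the GHASH
// input changes - it is taken from the loaded ciphertext rather than the
// output. The last round key (rk14, held in x13/x14) is folded in with plain
// eor on general registers when results are moved out for storing.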
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
ld1 { v11.16b}, [x3] //load current tag
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
ld1 {v28.4s}, [x8], #16 //load rk10
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
ld1 {v29.4s}, [x8], #16 //load rk11
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
ldr q12, [x3, #32] //load h1l | h1h
#ifndef __AARCH64EB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
ld1 {v30.4s}, [x8], #16 //load rk12
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
cmp x0, x5 //check if we have <= 4 blocks
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v3.16b, v22.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v2.16b, v22.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v0.16b, v25.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v3.16b, v25.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
ld1 {v31.4s}, [x8], #16 //load rk13
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 9
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 9
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 10
aese v0.16b, v29.16b
aesmc v0.16b, v0.16b //AES block 0 - round 11
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 10
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 3 - round 11
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 1 - round 11
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 2 - round 11
trn1 v9.2d, v14.2d, v15.2d //h4h | h3h
trn2 v17.2d, v14.2d, v15.2d //h4l | h3l
trn1 v8.2d, v12.2d, v13.2d //h2h | h1h
trn2 v16.2d, v12.2d, v13.2d //h2l | h1l
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 1 - round 12
aese v0.16b, v30.16b
aesmc v0.16b, v0.16b //AES block 0 - round 12
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 2 - round 12
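// Editorial note (added): the trn1/trn2 pairs above gather the high and low
// 64-bit halves of H^1..H^4 side by side; the eor just below then forms the
// Karatsuba keys h4k|h3k (v17) and h2k|h1k (v16) - each power's two halves
// XORed together - consumed by the "mid" multiplies in the loops.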
aese v3.16b, v30.16b
aesmc v3.16b, v3.16b //AES block 3 - round 12
eor v17.16b, v17.16b, v9.16b //h4k | h3k
aese v1.16b, v31.16b //AES block 1 - round 13
aese v2.16b, v31.16b //AES block 2 - round 13
eor v16.16b, v16.16b, v8.16b //h2k | h1k
aese v3.16b, v31.16b //AES block 3 - round 13
aese v0.16b, v31.16b //AES block 0 - round 13
b.ge .L256_dec_tail //handle tail
ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext
rev w9, w12 //CTR block 4
eor v0.16b, v4.16b, v0.16b //AES block 0 - result
eor v1.16b, v5.16b, v1.16b //AES block 1 - result
rev64 v5.16b, v5.16b //GHASH block 1
ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext
mov x7, v0.d[1] //AES block 0 - mov high
mov x6, v0.d[0] //AES block 0 - mov low
rev64 v4.16b, v4.16b //GHASH block 0
add w12, w12, #1 //CTR block 4
fmov d0, x10 //CTR block 4
orr x9, x11, x9, lsl #32 //CTR block 4
fmov v0.d[1], x9 //CTR block 4
rev w9, w12 //CTR block 5
add w12, w12, #1 //CTR block 5
mov x19, v1.d[0] //AES block 1 - mov low
orr x9, x11, x9, lsl #32 //CTR block 5
mov x20, v1.d[1] //AES block 1 - mov high
eor x7, x7, x14 //AES block 0 - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
eor x6, x6, x13 //AES block 0 - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
stp x6, x7, [x2], #16 //AES block 0 - store result
fmov d1, x10 //CTR block 5
ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext
fmov v1.d[1], x9 //CTR block 5
rev w9, w12 //CTR block 6
add w12, w12, #1 //CTR block 6
eor x19, x19, x13 //AES block 1 - round 14 low
#ifdef __AARCH64EB__
rev x19, x19
#endif
orr x9, x11, x9, lsl #32 //CTR block 6
eor x20, x20, x14 //AES block 1 - round 14 high
#ifdef __AARCH64EB__
rev x20, x20
#endif
stp x19, x20, [x2], #16 //AES block 1 - store result
eor v2.16b, v6.16b, v2.16b //AES block 2 - result
cmp x0, x5 //check if we have <= 8 blocks
b.ge .L256_dec_prepretail //do prepretail
.L256_dec_main_loop: //main loop start
mov x21, v2.d[0] //AES block 4k+2 - mov low
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
mov x22, v2.d[1] //AES block 4k+2 - mov high
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
fmov d2, x10 //CTR block 4k+6
fmov v2.d[1], x9 //CTR block 4k+6
eor v4.16b, v4.16b, v11.16b //PRE 1
rev w9, w12 //CTR block 4k+7
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
mov x24, v3.d[1] //AES block 4k+3 - mov high
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
mov x23, v3.d[0] //AES block 4k+3 - mov low
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
mov d8, v4.d[1] //GHASH block 4k - mid
fmov d3, x10 //CTR block 4k+7
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
orr x9, x11, x9, lsl #32 //CTR block 4k+7
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
fmov v3.d[1], x9 //CTR block 4k+7
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
eor x22, x22, x14 //AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
rev x22, x22
#endif
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
mov d10, v17.d[1] //GHASH block 4k - mid
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
rev64 v6.16b, v6.16b //GHASH block 4k+2
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
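// Editorial note (added): in the decrypt loop the final AES round is applied
// in general registers: each decrypted block (still missing the last round
// key) is moved out with mov from the vector lanes, XORed with the rk14
// halves in x13/x14, and stored with stp, letting the stores retire while
// the NEON pipes continue with GHASH and the next counter blocks.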
eor x21, x21, x13 //AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
rev x21, x21
#endif
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
rev64 v7.16b, v7.16b //GHASH block 4k+3
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
eor x23, x23, x13 //AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
rev x23, x23
#endif
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
eor x24, x24, x14 //AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
rev x24, x24
#endif
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
aese v2.16b, v22.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
mov d4, v5.d[1] //GHASH block 4k+1 - mid
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
add w12, w12, #1 //CTR block 4k+7
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
mov d8, v6.d[1] //GHASH block 4k+2 - mid
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
rev w9, w12 //CTR block 4k+8
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
add w12, w12, #1 //CTR block 4k+8
aese v3.16b, v22.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
aese v0.16b, v25.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
mov d6, v7.d[1] //GHASH block 4k+3 - mid
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
orr x9, x11, x9, lsl #32 //CTR block 4k+8
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid
movi v8.8b, #0xc2
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low
aese v0.16b, v29.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
aese v3.16b, v25.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
shl d8, d8, #56 //mod_constant
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid
aese v0.16b, v30.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
aese v0.16b, v31.16b //AES block 4k+4 - round 13
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
mov x7, v0.d[1] //AES block 4k+4 - mov high
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
aese v1.16b, v31.16b //AES block 4k+5 - round 13
mov x6, v0.d[0] //AES block 4k+4 - mov low
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
fmov d0, x10 //CTR block 4k+8
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
fmov v0.d[1], x9 //CTR block 4k+8
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result
rev w9, w12 //CTR block 4k+9
aese v2.16b, v31.16b //AES block 4k+6 - round 13
orr x9, x11, x9, lsl #32 //CTR block 4k+9
cmp x0, x5 //.LOOP CONTROL
add w12, w12, #1 //CTR block 4k+9
eor x6, x6, x13 //AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
eor x7, x7, x14 //AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
mov x20, v1.d[1] //AES block 4k+5 - mov high
eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
aese v3.16b, v30.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
mov x19, v1.d[0] //AES block 4k+5 - mov low
fmov d1, x10 //CTR block 4k+9
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
fmov v1.d[1], x9 //CTR block 4k+9
rev w9, w12 //CTR block 4k+10
add w12, w12, #1 //CTR block 4k+10
aese v3.16b, v31.16b //AES block 4k+7 - round 13
orr x9, x11, x9, lsl #32 //CTR block 4k+10
rev64 v5.16b, v5.16b //GHASH block 4k+5
eor x20, x20, x14 //AES block 4k+5 - round 14 high
#ifdef __AARCH64EB__
rev x20, x20
#endif
stp x6, x7, [x2], #16 //AES block 4k+4 - store result
eor x19, x19, x13 //AES block 4k+5 - round 14 low
#ifdef __AARCH64EB__
rev x19, x19
#endif
stp x19, x20, [x2], #16 //AES block 4k+5 - store result
rev64 v4.16b, v4.16b //GHASH block 4k+4
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
b.lt .L256_dec_main_loop
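// Editorial note (added): as in the encrypt path, the prepretail below makes
// a final GHASH pass over the four ciphertext blocks still in flight while
// computing the AES rounds for the next counter blocks, loading no new data;
// the tail then pairs those keystream blocks with the last 1-4 input blocks.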
.L256_dec_prepretail: //PREPRETAIL
ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
mov x21, v2.d[0] //AES block 4k+2 - mov low
eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result
aese v0.16b, v18.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
mov x22, v2.d[1] //AES block 4k+2 - mov high
aese v1.16b, v18.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
fmov d2, x10 //CTR block 4k+6
fmov v2.d[1], x9 //CTR block 4k+6
rev w9, w12 //CTR block 4k+7
eor v4.16b, v4.16b, v11.16b //PRE 1
rev64 v6.16b, v6.16b //GHASH block 4k+2
orr x9, x11, x9, lsl #32 //CTR block 4k+7
mov x23, v3.d[0] //AES block 4k+3 - mov low
aese v1.16b, v19.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
mov x24, v3.d[1] //AES block 4k+3 - mov high
pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
mov d8, v4.d[1] //GHASH block 4k - mid
fmov d3, x10 //CTR block 4k+7
pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
fmov v3.d[1], x9 //CTR block 4k+7
aese v2.16b, v18.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
mov d10, v17.d[1] //GHASH block 4k - mid
aese v0.16b, v19.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
aese v2.16b, v19.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
rev64 v7.16b, v7.16b //GHASH block 4k+3
aese v3.16b, v18.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high
pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
aese v3.16b, v19.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
mov d4, v5.d[1] //GHASH block 4k+1 - mid
aese v0.16b, v20.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 2
aese v1.16b, v20.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low
aese v2.16b, v20.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
aese v0.16b, v21.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
mov d8, v6.d[1] //GHASH block 4k+2 - mid
aese v3.16b, v20.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
aese v0.16b, v22.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
aese v3.16b, v21.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid
pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
aese v0.16b, v23.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low
aese v3.16b, v22.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid
pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high
aese v3.16b, v23.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid
aese v2.16b, v21.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
aese v1.16b, v21.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 3
eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high
pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
aese v2.16b, v22.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
mov d6, v7.d[1] //GHASH block 4k+3 - mid
aese v1.16b, v22.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid
aese v2.16b, v23.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid
aese v1.16b, v23.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
aese v3.16b, v24.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid
aese v2.16b, v24.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
aese v0.16b, v24.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
movi v8.8b, #0xc2
aese v1.16b, v24.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low
pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid
aese v3.16b, v25.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 7
eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high
aese v1.16b, v25.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
aese v0.16b, v25.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 8
aese v2.16b, v25.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
shl d8, d8, #56 //mod_constant
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 9
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 9
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 9
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 10
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
eor x22, x22, x14 //AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
rev x22, x22
#endif
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
eor x23, x23, x13 //AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
rev x23, x23
#endif
aese v2.16b, v29.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 11
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
aese v0.16b, v29.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 11
add w12, w12, #1 //CTR block 4k+7
aese v1.16b, v29.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 11
eor x21, x21, x13 //AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
rev x21, x21
#endif
aese v2.16b, v30.16b
aesmc v2.16b, v2.16b //AES block 4k+6 - round 12
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
eor x24, x24, x14 //AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
rev x24, x24
#endif
aese v3.16b, v29.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 11
stp x21, x22, [x2], #16 //AES block 4k+2 - store result
aese v1.16b, v30.16b
aesmc v1.16b, v1.16b //AES block 4k+5 - round 12
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
aese v0.16b, v30.16b
aesmc v0.16b, v0.16b //AES block 4k+4 - round 12
stp x23, x24, [x2], #16 //AES block 4k+3 - store result
aese v3.16b, v30.16b
aesmc v3.16b, v3.16b //AES block 4k+7 - round 12
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
aese v1.16b, v31.16b //AES block 4k+5 - round 13
aese v0.16b, v31.16b //AES block 4k+4 - round 13
aese v3.16b, v31.16b //AES block 4k+7 - round 13
aese v2.16b, v31.16b //AES block 4k+6 - round 13
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
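// Editorial note (added): the tail dispatch below compares the remaining
// byte count against 48/32/16 to decide how many full blocks are left; the
// mov v3/v2 shuffles re-point the pre-computed keystream blocks so the right
// one meets each remaining ciphertext block, and w12 is walked back for
// counter values that end up unused.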
.L256_dec_tail: //TAIL
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext
eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result
mov x6, v0.d[0] //AES block 4k+4 - mov low
mov x7, v0.d[1] //AES block 4k+4 - mov high
ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
cmp x5, #48
eor x6, x6, x13 //AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
eor x7, x7, x14 //AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
b.gt .L256_dec_blocks_more_than_3
sub w12, w12, #1
mov v3.16b, v2.16b
movi v10.8b, #0
movi v11.8b, #0
cmp x5, #32
movi v9.8b, #0
mov v2.16b, v1.16b
b.gt .L256_dec_blocks_more_than_2
sub w12, w12, #1
mov v3.16b, v1.16b
cmp x5, #16
b.gt .L256_dec_blocks_more_than_1
sub w12, w12, #1
b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_3: //blocks left > 3
rev64 v4.16b, v5.16b //GHASH final-3 block
ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext
stp x6, x7, [x2], #16 //AES final-3 block - store result
mov d10, v17.d[1] //GHASH final-3 block - mid
eor v4.16b, v4.16b, v8.16b //feed in partial tag
eor v0.16b, v5.16b, v1.16b //AES final-2 block - result
mov d22, v4.d[1] //GHASH final-3 block - mid
mov x6, v0.d[0] //AES final-2 block - mov low
mov x7, v0.d[1] //AES final-2 block - mov high
eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid
movi v8.8b, #0 //suppress further partial tag feed in
pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high
pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
eor x6, x6, x13 //AES final-2 block - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low
eor x7, x7, x14 //AES final-2 block - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
.L256_dec_blocks_more_than_2: //blocks left > 2
rev64 v4.16b, v5.16b //GHASH final-2 block
ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext
eor v4.16b, v4.16b, v8.16b //feed in partial tag
stp x6, x7, [x2], #16 //AES final-2 block - store result
eor v0.16b, v5.16b, v2.16b //AES final-1 block - result
mov d22, v4.d[1] //GHASH final-2 block - mid
pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid
mov x6, v0.d[0] //AES final-1 block - mov low
mov x7, v0.d[1] //AES final-1 block - mov high
eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low
movi v8.8b, #0 //suppress further partial tag feed in
pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid
eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
eor x6, x6, x13 //AES final-1 block - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
eor x7, x7, x14 //AES final-1 block - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
.L256_dec_blocks_more_than_1: //blocks left > 1
stp x6, x7, [x2], #16 //AES final-1 block - store result
rev64 v4.16b, v5.16b //GHASH final-1 block
ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext
eor v4.16b, v4.16b, v8.16b //feed in partial tag
movi v8.8b, #0 //suppress further partial tag feed in
mov d22, v4.d[1] //GHASH final-1 block - mid
eor v0.16b, v5.16b, v3.16b //AES final block - result
pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low
mov x6, v0.d[0] //AES final block - mov low
ins v22.d[1], v22.d[0] //GHASH final-1 block - mid
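// Editorial note (added): in .L256_dec_blocks_less_than_1 below, a byte mask
// derived from bit_length trims the possibly partial last block: the masked
// ciphertext feeds GHASH, while bic/orr on x4-x7 splice the decrypted bytes
// into the existing destination bytes so no memory past the message is
// clobbered.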
mov x7, v0.d[1] //AES final block - mov high
pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid
eor x6, x6, x13 //AES final block - round 14 low
#ifdef __AARCH64EB__
rev x6, x6
#endif
eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low
eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high
eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
eor x7, x7, x14 //AES final block - round 14 high
#ifdef __AARCH64EB__
rev x7, x7
#endif
.L256_dec_blocks_less_than_1: //blocks left <= 1
and x1, x1, #127 //bit_length %= 128
mvn x14, xzr //rk14_h = 0xffffffffffffffff
sub x1, x1, #128 //bit_length -= 128
mvn x13, xzr //rk14_l = 0xffffffffffffffff
ldp x4, x5, [x2] //load existing bytes we need to not overwrite
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
and x1, x1, #127 //bit_length %= 128
lsr x14, x14, x1 //rk14_h is mask for top 64b of last block
cmp x1, #64
csel x9, x13, x14, lt //mask for low 64b of last block
csel x10, x14, xzr, lt //mask for high 64b of last block
fmov d0, x9 //ctr0b is mask for last block
and x6, x6, x9
mov v0.d[1], x10
bic x4, x4, x9 //mask out low existing bytes
#ifndef __AARCH64EB__
rev w9, w12
#else
mov w9, w12
#endif
bic x5, x5, x10 //mask out high existing bytes
orr x6, x6, x4
and x7, x7, x10
orr x7, x7, x5
and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v4.16b, v5.16b //GHASH final block
eor v4.16b, v4.16b, v8.16b //feed in partial tag
pmull v21.1q, v4.1d, v12.1d //GHASH final block - low
mov d8, v4.d[1] //GHASH final block - mid
eor v8.8b, v8.8b, v4.8b //GHASH final block - mid
pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high
pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid
eor v9.16b, v9.16b, v20.16b //GHASH final block - high
eor v11.16b, v11.16b, v21.16b //GHASH final block - low
eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
movi v8.8b, #0xc2
eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
shl d8, d8, #56 //mod_constant
eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up
pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid
eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid
pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
eor v11.16b, v11.16b, v8.16b //MODULO - fold into low
stp x6, x7, [x2] //store decrypted last block
str w9, [x16, #12] //store the updated counter
eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8 //swap tag halves back to storage order
rev64 v11.16b, v11.16b //undo the byte-reversal applied at load
mov x0, x15 //return byte_len
st1 { v11.16b }, [x3] //store updated tag
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif