// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
//go:build gc && !purego
#include "textflag.h"
// func polyHashADInternal<>()
//
// polyHashADInternal absorbs a byte string into a fresh Poly1305 accumulator.
// It is a register-convention helper (NOSPLIT, zero frame, no Go arguments);
// the caller in this file sets the registers up before CALLing it.
//
// In:
//   CX      = pointer to the bytes to hash (the caller loads ad_base here)
//   R9      = number of bytes to hash (the caller loads ad_len here)
//   (BP)    = low  64 bits of the multiplier r (the caller stores the clamped key at (BP))
//   8(BP)   = high 64 bits of the multiplier r
// Out:
//   R10:R11:R12 = accumulator h (R10 lowest limb, R12 holds the bits from 2^128 up)
// Clobbers:
//   AX, DX, R8, R13, R14, R15, flags; CX and R9 are consumed on the generic path.
//
// Every absorbed block is followed by the same 38-instruction sequence:
//   1. schoolbook multiply h * r into the four limbs t0..t3 = R13, R14, R15, R8
//   2. fold everything above bit 130 back in: h = (t mod 2^130) + 5 * (t >> 130)
// The fold is a partial reduction only; this routine never performs the final
// conditional subtraction of 2^130 - 5.
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
// Start from h = 0.
XORQ R10, R10
XORQ R11, R11
XORQ R12, R12
// Fast path for an input of exactly 13 bytes; any other length goes through
// the generic chunk loop.
// NOTE(review): 13 is presumably the TLS record additional-data size — confirm.
CMPQ R9, $0x0d
JNE hashADLoop
// Load bytes 0..7 into the low limb. The second load reads bytes 5..12 (so it
// stays inside the 13-byte input) and the 24-bit right shift discards bytes
// 5..7, leaving bytes 8..12 in the low 40 bits of the high limb.
MOVQ (CX), R10
MOVQ 5(CX), R11
SHRQ $0x18, R11
// Set the 2^128 bit, exactly as the generic paths do with ADCQ $0x01, R12.
MOVQ $0x00000001, R12
// --- multiply: t = h * r ---
// R15 = r0 (kept for the r0*h2 product), DX:AX = r0 * h0.
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
// t0 = lo(r0*h0), t1 = hi(r0*h0).
MOVQ AX, R13
MOVQ DX, R14
// DX:AX = r0 * h1.
MOVQ (BP), AX
MULQ R11
// R15 = r0 * h2. The flags IMULQ leaves behind are overwritten by the ADDQ
// that follows before any ADCQ reads them.
IMULQ R12, R15
// t1 += lo(r0*h1); t2 = r0*h2 + hi(r0*h1) + carry.
ADDQ AX, R14
ADCQ DX, R15
// R8 = r1 (kept for the r1*h2 product), DX:AX = r1 * h0.
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
// t1 += lo(r1*h0); the carry is folded into the high half, which is parked in
// R10 (h0 is no longer needed).
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
// DX:AX = r1 * h1.
MOVQ 8(BP), AX
MULQ R11
// t2 += lo(r1*h1), carry into DX.
ADDQ AX, R15
ADCQ $0x00, DX
// R8 = r1 * h2.
IMULQ R12, R8
// t2 += parked hi(r1*h0); t3 = r1*h2 + hi(r1*h1) + carry.
ADDQ R10, R15
ADCQ DX, R8
// --- reduce: h = (t mod 2^130) + 5 * (t >> 130) ---
// h0 = t0, h1 = t1, h2 = t2 & 3 (bits 128..129).
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
// R14:R13 = t3 : (t2 & ~3), i.e. 4 * c where c = t >> 130.
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
// R8:R15 = (t3:t2) >> 2 = c. The three-operand SHRQ is a double-precision
// shift that pulls the low bits of R8 into the top of R15.
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
// h += 4*c
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
// h += c, for a total of 5*c.
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
RET
hashADLoop:
// Hash in 16 byte chunks
// Fewer than 16 bytes left: handle the remainder (possibly empty) in hashADTail.
CMPQ R9, $0x10
JB hashADTail
// h += 16-byte block, plus 1 in the top limb (the 2^128 bit) via the carry chain.
ADDQ (CX), R10
ADCQ 8(CX), R11
ADCQ $0x01, R12
// Advance the input pointer and decrease the remaining length. LEAQ leaves the
// flags alone; SUBQ's flags are not consumed (the loop re-tests with CMPQ).
LEAQ 16(CX), CX
SUBQ $0x10, R9
// --- multiply: t = h * r (same sequence as in the 13-byte path) ---
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// --- reduce: h = (t mod 2^130) + 5 * (t >> 130) ---
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
JMP hashADLoop
hashADTail:
// Nothing left over: h is already final for this input.
CMPQ R9, $0x00
JE hashADDone
// Hash last < 16 byte tail
// R14:R13 collects the remaining bytes as a little-endian 128-bit value with
// the unused high bytes left at zero. R15 is cleared so that the byte load
// below leaves its upper 56 bits zero.
XORQ R13, R13
XORQ R14, R14
XORQ R15, R15
// Point CX one past the last input byte; the loop walks backwards from there.
ADDQ R9, CX
hashADTailLoop:
// Shift the 128-bit value R14:R13 left by one byte (the three-operand SHLQ
// moves the top byte of R13 into the bottom of R14), then insert the next
// byte, taken from the end of the input, at the bottom.
SHLQ $0x08, R13, R14
SHLQ $0x08, R13
MOVB -1(CX), R15
XORQ R15, R13
DECQ CX
// DECQ sets ZF when the count reaches zero, which ends the loop.
DECQ R9
JNE hashADTailLoop
// h += zero-extended tail block, plus the same 2^128 bit used for full blocks.
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
// --- multiply: t = h * r (same sequence as in the 13-byte path) ---
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// --- reduce: h = (t mod 2^130) + 5 * (t >> 130) ---
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
hashADDone:
// Return with the partially reduced accumulator in R10:R11:R12.
RET
// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Open(SB), $288-97
// For aligned stack access
MOVQ SP, BP
ADDQ $0x20, BP
ANDQ $-32, BP
MOVQ dst_base+0(FP), DI
MOVQ key_base+24(FP), R8
MOVQ src_base+48(FP), SI
MOVQ src_len+56(FP), BX
MOVQ ad_base+72(FP), CX
VZEROUPPER
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VBROADCASTI128 16(R8), Y14
VBROADCASTI128 32(R8), Y12
VBROADCASTI128 48(R8), Y4
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimization, for very short buffers
CMPQ BX, $0xc0
JBE openAVX2192
CMPQ BX, $0x00000140
JBE openAVX2320
// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
VMOVDQA Y14, 32(BP)
VMOVDQA Y12, 64(BP)
VMOVDQA Y4, 192(BP)
MOVQ $0x0000000a, R9
openAVX2PreparePolyKey:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
DECQ R9
JNE openAVX2PreparePolyKey
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD 32(BP), Y14, Y14
VPADDD 64(BP), Y12, Y12
VPADDD 192(BP), Y4, Y4
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for the first 64 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
// Hash AD + first 64 bytes
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
XORQ CX, CX
openAVX2InitialHash64:
ADDQ (SI)(CX*1), R10
ADCQ 8(SI)(CX*1), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ $0x10, CX
CMPQ CX, $0x40
JNE openAVX2InitialHash64
// Decrypt the first 64 bytes
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y14, Y14
VMOVDQU Y0, (DI)
VMOVDQU Y14, 32(DI)
LEAQ 64(SI), SI
LEAQ 64(DI), DI
SUBQ $0x40, BX
openAVX2MainLoop:
CMPQ BX, $0x00000200
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
XORQ CX, CX
openAVX2InternalLoop:
ADDQ (SI)(CX*1), R10
ADCQ 8(SI)(CX*1), R11
ADCQ $0x01, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
ADDQ 16(SI)(CX*1), R10
ADCQ 24(SI)(CX*1), R11
ADCQ $0x01, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 32(SI)(CX*1), R10
ADCQ 40(SI)(CX*1), R11
ADCQ $0x01, R12
LEAQ 48(CX), CX
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
CMPQ CX, $0x000001e0
JNE openAVX2InternalLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
ADDQ 480(SI), R10
ADCQ 488(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
// and here
ADDQ 496(SI), R10
ADCQ 504(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
VPXOR 384(SI), Y0, Y0
VPXOR 416(SI), Y14, Y14
VPXOR 448(SI), Y12, Y12
VPXOR 480(SI), Y4, Y4
VMOVDQU Y0, 384(DI)
VMOVDQU Y14, 416(DI)
VMOVDQU Y12, 448(DI)
VMOVDQU Y4, 480(DI)
LEAQ 512(SI), SI
LEAQ 512(DI), DI
SUBQ $0x00000200, BX
JMP openAVX2MainLoop
openAVX2MainLoopDone:
// Handle the various tail sizes efficiently
TESTQ BX, BX
JE openSSEFinalize
CMPQ BX, $0x80
JBE openAVX2Tail128
CMPQ BX, $0x00000100
JBE openAVX2Tail256
CMPQ BX, $0x00000180
JBE openAVX2Tail384
JMP openAVX2Tail512
openSSEFinalize:
// Hash in the PT, AAD lengths
ADDQ ad_len+80(FP), R10
ADCQ src_len+56(FP), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Final reduce
MOVQ R10, R13
MOVQ R11, R14
MOVQ R12, R15
SUBQ $-5, R10
SBBQ $-1, R11
SBBQ $0x03, R12
CMOVQCS R13, R10
CMOVQCS R14, R11
CMOVQCS R15, R12
// Add in the "s" part of the key
ADDQ 16(BP), R10
ADCQ 24(BP), R11
// Finally, constant time compare to the tag at the end of the message
XORQ AX, AX
MOVQ $0x00000001, DX
XORQ (SI), R10
XORQ 8(SI), R11
ORQ R11, R10
CMOVQEQ DX, AX
// Return true iff tags are equal
MOVB AX, ret+96(FP)
RET
openSSETail16:
TESTQ BX, BX
JE openSSEFinalize
// We can safely load the CT from the end, because it is padded with the MAC
MOVQ BX, R9
SHLQ $0x04, R9
LEAQ ·andMask<>+0(SB), R13
MOVOU (SI), X12
ADDQ BX, SI
PAND -16(R13)(R9*1), X12
MOVO X12, 64(BP)
MOVQ X12, R13
MOVQ 72(BP), R14
PXOR X1, X12
// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
MOVQ X12, R8
MOVB R8, (DI)
PSRLDQ $0x01, X12
INCQ DI
DECQ BX
JNE openSSETail16Store
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
JMP openSSEFinalize
openAVX2192:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VMOVDQA Y4, Y2
VMOVDQA Y1, Y15
MOVQ $0x0000000a, R9
openAVX2192InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
DECQ R9
JNE openAVX2192InnerCipherLoop
VPADDD Y6, Y0, Y0
VPADDD Y6, Y5, Y5
VPADDD Y10, Y14, Y14
VPADDD Y10, Y9, Y9
VPADDD Y8, Y12, Y12
VPADDD Y8, Y13, Y13
VPADDD Y2, Y4, Y4
VPADDD Y15, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
openAVX2ShortOpen:
// Hash
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
openAVX2ShortOpenLoop:
CMPQ BX, $0x20
JB openAVX2ShortTail32
SUBQ $0x20, BX
// Load for hashing
ADDQ (SI), R10
ADCQ 8(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ 16(SI), R10
ADCQ 24(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Load for decryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
LEAQ 32(DI), DI
// Shift stream left
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
VMOVDQA Y5, Y4
VMOVDQA Y9, Y5
VMOVDQA Y13, Y9
VMOVDQA Y1, Y13
VMOVDQA Y6, Y1
VMOVDQA Y10, Y6
JMP openAVX2ShortOpenLoop
openAVX2ShortTail32:
CMPQ BX, $0x10
VMOVDQA X0, X1
JB openAVX2ShortDone
SUBQ $0x10, BX
// Load for hashing
ADDQ (SI), R10
ADCQ 8(SI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Load for decryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
openAVX2ShortDone:
VZEROUPPER
JMP openSSETail16
openAVX2320:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y14, Y7
VMOVDQA Y12, Y11
VMOVDQA Y4, Y15
MOVQ $0x0000000a, R9
openAVX2320InnerCipherLoop:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
DECQ R9
JNE openAVX2320InnerCipherLoop
VMOVDQA ·chacha20Constants<>+0(SB), Y3
VPADDD Y3, Y0, Y0
VPADDD Y3, Y5, Y5
VPADDD Y3, Y6, Y6
VPADDD Y7, Y14, Y14
VPADDD Y7, Y9, Y9
VPADDD Y7, Y10, Y10
VPADDD Y11, Y12, Y12
VPADDD Y11, Y13, Y13
VPADDD Y11, Y8, Y8
VMOVDQA ·avx2IncMask<>+0(SB), Y3
VPADDD Y15, Y4, Y4
VPADDD Y3, Y15, Y15
VPADDD Y15, Y1, Y1
VPADDD Y3, Y15, Y15
VPADDD Y15, Y2, Y2
// Clamp and store poly key
VPERM2I128 $0x02, Y0, Y14, Y3
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
VPERM2I128 $0x02, Y6, Y10, Y13
VPERM2I128 $0x02, Y8, Y2, Y1
VPERM2I128 $0x13, Y6, Y10, Y6
VPERM2I128 $0x13, Y8, Y2, Y10
JMP openAVX2ShortOpen
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y5
VMOVDQA 32(BP), Y9
VMOVDQA 64(BP), Y13
VMOVDQA 192(BP), Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y1
VMOVDQA Y1, Y4
XORQ R9, R9
MOVQ BX, CX
ANDQ $-16, CX
TESTQ CX, CX
JE openAVX2Tail128LoopB
openAVX2Tail128LoopA:
ADDQ (SI)(R9*1), R10
ADCQ 8(SI)(R9*1), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
openAVX2Tail128LoopB:
ADDQ $0x10, R9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
CMPQ R9, CX
JB openAVX2Tail128LoopA
CMPQ R9, $0xa0
JNE openAVX2Tail128LoopB
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y13, Y13
VPADDD Y4, Y1, Y1
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
openAVX2TailLoop:
CMPQ BX, $0x20
JB openAVX2Tail
SUBQ $0x20, BX
// Load for decryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
LEAQ 32(DI), DI
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
JMP openAVX2TailLoop
openAVX2Tail:
CMPQ BX, $0x10
VMOVDQA X0, X1
JB openAVX2TailDone
SUBQ $0x10, BX
// Load for decryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
LEAQ 16(DI), DI
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
openAVX2TailDone:
VZEROUPPER
JMP openSSETail16
openAVX2Tail256:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
// Compute the number of iterations that will hash data
MOVQ BX, 224(BP)
MOVQ BX, CX
SUBQ $0x80, CX
SHRQ $0x04, CX
MOVQ $0x0000000a, R9
CMPQ CX, $0x0a
CMOVQGT R9, CX
MOVQ SI, BX
XORQ R9, R9
openAVX2Tail256LoopA:
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
openAVX2Tail256LoopB:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
INCQ R9
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
CMPQ R9, CX
JB openAVX2Tail256LoopA
CMPQ R9, $0x0a
JNE openAVX2Tail256LoopB
MOVQ BX, R9
SUBQ SI, BX
MOVQ BX, CX
MOVQ 224(BP), BX
openAVX2Tail256Hash:
ADDQ $0x10, CX
CMPQ CX, BX
JGT openAVX2Tail256HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
JMP openAVX2Tail256Hash
openAVX2Tail256HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
VPERM2I128 $0x02, Y0, Y14, Y6
VPERM2I128 $0x02, Y12, Y4, Y10
VPERM2I128 $0x13, Y0, Y14, Y8
VPERM2I128 $0x13, Y12, Y4, Y2
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR (SI), Y6, Y6
VPXOR 32(SI), Y10, Y10
VPXOR 64(SI), Y8, Y8
VPXOR 96(SI), Y2, Y2
VMOVDQU Y6, (DI)
VMOVDQU Y10, 32(DI)
VMOVDQU Y8, 64(DI)
VMOVDQU Y2, 96(DI)
LEAQ 128(SI), SI
LEAQ 128(DI), DI
SUBQ $0x80, BX
JMP openAVX2TailLoop
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
// Compute the number of iterations that will hash two blocks of data
MOVQ BX, 224(BP)
MOVQ BX, CX
SUBQ $0x00000100, CX
SHRQ $0x04, CX
ADDQ $0x06, CX
MOVQ $0x0000000a, R9
CMPQ CX, $0x0a
CMOVQGT R9, CX
MOVQ SI, BX
XORQ R9, R9
openAVX2Tail384LoopB:
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
openAVX2Tail384LoopA:
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
ADDQ (BX), R10
ADCQ 8(BX), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(BX), BX
INCQ R9
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
CMPQ R9, CX
JB openAVX2Tail384LoopB
CMPQ R9, $0x0a
JNE openAVX2Tail384LoopA
MOVQ BX, R9
SUBQ SI, BX
MOVQ BX, CX
MOVQ 224(BP), BX
openAVX2Tail384Hash:
ADDQ $0x10, CX
CMPQ CX, BX
JGT openAVX2Tail384HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
JMP openAVX2Tail384Hash
openAVX2Tail384HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y3
VPERM2I128 $0x02, Y13, Y1, Y7
VPERM2I128 $0x13, Y5, Y9, Y11
VPERM2I128 $0x13, Y13, Y1, Y15
VPXOR 128(SI), Y3, Y3
VPXOR 160(SI), Y7, Y7
VPXOR 192(SI), Y11, Y11
VPXOR 224(SI), Y15, Y15
VMOVDQU Y3, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y11, 192(DI)
VMOVDQU Y15, 224(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
LEAQ 256(SI), SI
LEAQ 256(DI), DI
SUBQ $0x00000100, BX
JMP openAVX2TailLoop
openAVX2Tail512:
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
XORQ CX, CX
MOVQ SI, R9
openAVX2Tail512LoopB:
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
openAVX2Tail512LoopA:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 16(R9), R10
ADCQ 24(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(R9), R9
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
INCQ CX
CMPQ CX, $0x04
JLT openAVX2Tail512LoopB
CMPQ CX, $0x0a
JNE openAVX2Tail512LoopA
MOVQ BX, CX
SUBQ $0x00000180, CX
ANDQ $-16, CX
openAVX2Tail512HashLoop:
TESTQ CX, CX
JE openAVX2Tail512HashEnd
ADDQ (R9), R10
ADCQ 8(R9), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(R9), R9
SUBQ $0x10, CX
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
LEAQ 384(SI), SI
LEAQ 384(DI), DI
SUBQ $0x00000180, BX
JMP openAVX2TailLoop
// chacha20Constants is a 32-byte read-only table holding the same four
// 32-bit words twice, once per 128-bit lane of a YMM register.
// Stored little-endian, the words 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574 spell the ASCII string "expand 32-byte k".
// The code in this file loads it into Y0 to start a block and adds it back
// with VPADDD after the rounds.
DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
// avx2InitMask is a 32-byte read-only table that is zero except for a 1 in
// the quadword at offset 16, i.e. the lowest dword of the upper 128-bit lane.
// chacha20Poly1305Seal adds it (VPADDD) to Y4 right after broadcasting
// 48(R8) into both lanes, so the upper lane ends up one higher than the
// lower lane in its first dword.
// NOTE(review): that dword is presumably the ChaCha20 block counter, giving
// the two lanes consecutive blocks - confirm against the key layout.
DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
// rol16 is a 32-byte VPSHUFB control mask (same 16 bytes in both lanes).
// In memory order the indices are 02 03 00 01 | 06 07 04 05 | ..., so each
// destination dword takes source bytes 2,3,0,1: every 32-bit word is
// rotated left by 16 bits.
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA ·rol16<>+16(SB)/8, $0x0504070601000302
DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
// rol8 is a 32-byte VPSHUFB control mask (same 16 bytes in both lanes).
// In memory order the indices are 03 00 01 02 | 07 04 05 06 | ..., so each
// destination dword takes source bytes 3,0,1,2: every 32-bit word is
// rotated left by 8 bits.
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA ·rol8<>+16(SB)/8, $0x0605040702010003
DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
// polyClampMask is a 32-byte AND mask applied (VPAND) to the freshly
// generated poly key before it is stored at (BP).
// The low 16 bytes clear selected bits of the part that the hashing code
// later reads from (BP) and 8(BP) as its multiplier; the high 16 bytes are
// all ones, so that half of the key passes through unchanged.
// NOTE(review): the low-half pattern looks like the standard Poly1305 "r"
// clamp from RFC 8439 - confirm.
DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
// avx2IncMask is a 32-byte read-only table with the value 2 in the lowest
// quadword of each 128-bit lane and zero elsewhere.
// Added with VPADDD, it bumps the first dword of both lanes by 2. The code
// uses it to derive each successive counter register (Y4 -> Y1 -> Y2 -> Y3)
// from the previous one, each YMM register covering two blocks.
DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
// andMask is a 240-byte read-only table of fifteen 16-byte masks.
// The mask at byte offset 16*(k-1) has its k lowest bytes set to 0xff and
// the remaining 16-k bytes zero, for k = 1..15.
// NOTE(review): presumably ANDed with a 16-byte load to keep only the first
// k bytes of a partial final block - the use site is outside this part of
// the file, so confirm there.
// Masks keeping 1..8 bytes (high quadword is zero):
DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
DATA ·andMask<>+8(SB)/8, $0x0000000000000000
DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
DATA ·andMask<>+24(SB)/8, $0x0000000000000000
DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+40(SB)/8, $0x0000000000000000
DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+56(SB)/8, $0x0000000000000000
DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+72(SB)/8, $0x0000000000000000
DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+88(SB)/8, $0x0000000000000000
DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+104(SB)/8, $0x0000000000000000
DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+120(SB)/8, $0x0000000000000000
// Masks keeping 9..15 bytes (low quadword is all ones):
DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Seal(SB), $288-96
MOVQ SP, BP
ADDQ $0x20, BP
ANDQ $-32, BP
MOVQ dst_base+0(FP), DI
MOVQ key_base+24(FP), R8
MOVQ src_base+48(FP), SI
MOVQ src_len+56(FP), BX
MOVQ ad_base+72(FP), CX
VZEROUPPER
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VBROADCASTI128 16(R8), Y14
VBROADCASTI128 32(R8), Y12
VBROADCASTI128 48(R8), Y4
VPADDD ·avx2InitMask<>+0(SB), Y4, Y4
// Special optimizations, for very short buffers
CMPQ BX, $0x000000c0
JBE seal192AVX2
CMPQ BX, $0x00000140
JBE seal320AVX2
// For the general case prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA Y14, 32(BP)
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA Y12, 64(BP)
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y4, 96(BP)
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VMOVDQA Y1, 128(BP)
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
MOVQ $0x0000000a, R9
sealAVX2IntroLoop:
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y3, Y3, Y3
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y3, Y3, Y3
DECQ R9
JNE sealAVX2IntroLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VPERM2I128 $0x13, Y12, Y4, Y12
VPERM2I128 $0x02, Y0, Y14, Y4
VPERM2I128 $0x13, Y0, Y14, Y0
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y4, Y4
VMOVDQA Y4, (BP)
// Hash AD
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
// Can store at least 320 bytes
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y12, Y12
VMOVDQU Y0, (DI)
VMOVDQU Y12, 32(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 64(SI), Y0, Y0
VPXOR 96(SI), Y14, Y14
VPXOR 128(SI), Y12, Y12
VPXOR 160(SI), Y4, Y4
VMOVDQU Y0, 64(DI)
VMOVDQU Y14, 96(DI)
VMOVDQU Y12, 128(DI)
VMOVDQU Y4, 160(DI)
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 192(SI), Y0, Y0
VPXOR 224(SI), Y14, Y14
VPXOR 256(SI), Y12, Y12
VPXOR 288(SI), Y4, Y4
VMOVDQU Y0, 192(DI)
VMOVDQU Y14, 224(DI)
VMOVDQU Y12, 256(DI)
VMOVDQU Y4, 288(DI)
MOVQ $0x00000140, CX
SUBQ $0x00000140, BX
LEAQ 320(SI), SI
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, Y15, Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, Y15, Y3, Y4
CMPQ BX, $0x80
JBE sealAVX2SealHash
VPXOR (SI), Y0, Y0
VPXOR 32(SI), Y14, Y14
VPXOR 64(SI), Y12, Y12
VPXOR 96(SI), Y4, Y4
VMOVDQU Y0, 320(DI)
VMOVDQU Y14, 352(DI)
VMOVDQU Y12, 384(DI)
VMOVDQU Y4, 416(DI)
SUBQ $0x80, BX
LEAQ 128(SI), SI
MOVQ $0x00000008, CX
MOVQ $0x00000002, R9
CMPQ BX, $0x80
JBE sealAVX2Tail128
CMPQ BX, $0x00000100
JBE sealAVX2Tail256
CMPQ BX, $0x00000180
JBE sealAVX2Tail384
CMPQ BX, $0x00000200
JBE sealAVX2Tail512
// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y3, Y3, Y3
VMOVDQA Y15, 224(BP)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VMOVDQA 224(BP), Y15
VMOVDQA Y13, 224(BP)
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x0c, Y11, Y13
VPSRLD $0x14, Y11, Y11
VPXOR Y13, Y11, Y11
VPADDD Y11, Y7, Y7
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y3, Y15, Y15
VPXOR Y15, Y11, Y11
VPSLLD $0x07, Y11, Y13
VPSRLD $0x19, Y11, Y11
VPXOR Y13, Y11, Y11
VMOVDQA 224(BP), Y13
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
SUBQ $0x10, DI
MOVQ $0x00000009, CX
JMP sealAVX2InternalLoopStart
sealAVX2MainLoop:
VMOVDQU ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
MOVQ $0x0000000a, CX
sealAVX2InternalLoop:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
sealAVX2InternalLoopStart:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
ADDQ 32(DI), R10
ADCQ 40(DI), R11
ADCQ $0x01, R12
LEAQ 48(DI), DI
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
DECQ CX
JNE sealAVX2InternalLoop
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
VMOVDQA Y15, 224(BP)
// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
VPERM2I128 $0x02, Y0, Y14, Y15
VPERM2I128 $0x13, Y0, Y14, Y14
VPERM2I128 $0x02, Y12, Y4, Y0
VPERM2I128 $0x13, Y12, Y4, Y12
VPXOR (SI), Y15, Y15
VPXOR 32(SI), Y0, Y0
VPXOR 64(SI), Y14, Y14
VPXOR 96(SI), Y12, Y12
VMOVDQU Y15, (DI)
VMOVDQU Y0, 32(DI)
VMOVDQU Y14, 64(DI)
VMOVDQU Y12, 96(DI)
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
// and here
ADDQ -16(DI), R10
ADCQ -8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
VPXOR 384(SI), Y0, Y0
VPXOR 416(SI), Y14, Y14
VPXOR 448(SI), Y12, Y12
VPXOR 480(SI), Y4, Y4
VMOVDQU Y0, 384(DI)
VMOVDQU Y14, 416(DI)
VMOVDQU Y12, 448(DI)
VMOVDQU Y4, 480(DI)
LEAQ 512(SI), SI
SUBQ $0x00000200, BX
CMPQ BX, $0x00000200
JG sealAVX2MainLoop
// Tail can only hash 480 bytes
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
MOVQ $0x0000000a, CX
MOVQ $0x00000000, R9
CMPQ BX, $0x80
JBE sealAVX2Tail128
CMPQ BX, $0x00000100
JBE sealAVX2Tail256
CMPQ BX, $0x00000180
JBE sealAVX2Tail384
JMP sealAVX2Tail512
// sealSSETail encrypts and hashes the final partial block of BX bytes; when BX is zero it
// goes straight to sealSSEFinalize. X1 is used as the keystream for this block
// (sealAVX2ShortTail32 copies X0 into X1 before jumping here).
// NOTE(review): presumably BX is in 1..15 on the non-zero path - confirm against the callers.
sealSSETail:
TESTQ BX, BX
JE sealSSEFinalize
// We can only load the PT one byte at a time to avoid read after end of buffer
// R9 = BX*16 is the byte offset used further down to pick a 16-byte entry of andMask
MOVQ BX, R9
SHLQ $0x04, R9
LEAQ ·andMask<>+0(SB), R13
MOVQ BX, CX
// Point SI at the last plaintext byte; the load loop walks backwards from there
LEAQ -1(SI)(BX*1), SI
XORQ R15, R15
XORQ R8, R8
XORQ AX, AX
sealSSETailLoadLoop:
// Shift the 128-bit value R8:R15 left by one byte, then merge the next byte into the low end.
// Walking backwards leaves the first plaintext byte in the lowest byte of R15.
SHLQ $0x08, R15, R8
SHLQ $0x08, R15
MOVB (SI), AX
XORQ AX, R15
LEAQ -1(SI), SI
DECQ CX
JNE sealSSETailLoadLoop
// Spill the assembled plaintext to scratch at 64(BP) so it can be XORed into X1
MOVQ R15, 64(BP)
MOVQ R8, 72(BP)
PXOR 64(BP), X1
// A full 16 bytes are written at (DI); the bytes past BX are later covered by the
// 16-byte tag that sealSSEFinalize stores at DI after DI has been advanced by BX.
MOVOU X1, (DI)
// Mask with the andMask entry at index BX-1 before hashing
// NOTE(review): presumably this entry keeps only the low BX bytes - confirm against the andMask data
MOVOU -16(R13)(R9*1), X12
PAND X12, X1
MOVQ X1, R13
PSRLDQ $0x08, X1
MOVQ X1, R14
// acc (R10:R11:R12) += masked block, plus 1 in the 2^128 limb
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits in R10:R11:R12 and add the
// bits above them times 5, computed as 4*c (R15&^3 : R8) plus c ((R15:R8) >> 2).
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Advance the output pointer past the BX ciphertext bytes; the tag is written there next
ADDQ BX, DI
// sealSSEFinalize hashes the length block, performs the final reduction of the
// accumulator R10:R11:R12, adds the key words at 16(BP)/24(BP) and stores the
// 16-byte tag at (DI) before returning.
sealSSEFinalize:
// Hash in the buffer lengths
// The block is ad_len in the low 64 bits and src_len in the high 64 bits, plus 1 in the 2^128 limb
ADDQ ad_len+80(FP), R10
ADCQ src_len+56(FP), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Final reduce
// Subtract 2^130-5 (limbs -5, -1, 3) from a copy; if the subtraction borrows (CF set)
// the accumulator was already below the modulus, so the saved value is restored.
MOVQ R10, R13
MOVQ R11, R14
MOVQ R12, R15
SUBQ $-5, R10
SBBQ $-1, R11
SBBQ $0x03, R12
CMOVQCS R13, R10
CMOVQCS R14, R11
CMOVQCS R15, R12
// Add in the "s" part of the key
// Only the low 128 bits are kept; the carry out of R11 is discarded
ADDQ 16(BP), R10
ADCQ 24(BP), R11
// Finally store the tag at the end of the message
MOVQ R10, (DI)
MOVQ R11, 8(DI)
RET
// seal192AVX2 runs two YMM-wide ChaCha states side by side: (Y0, Y14, Y12, Y4) and
// (Y5, Y9, Y13, Y1). The second state is a copy of the first with its last row
// advanced by avx2IncMask. After 10 double rounds the result provides the clamped
// Poly1305 key stored at (BP) and up to 192 bytes of keystream, then control falls
// through to sealAVX2ShortSeal.
seal192AVX2:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
// Keep copies of the starting rows (Y6, Y10, Y8, Y2, Y15) for the feed-forward add after the rounds
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VMOVDQA Y4, Y2
VMOVDQA Y1, Y15
// R9 = 10 loop iterations, each performing two rounds
MOVQ $0x0000000a, R9
sealAVX2192InnerCipherLoop:
// Quarter-round on state 1: add/xor/rotate by 16, 12, 8 and 7 bits.
// The 16- and 8-bit rotates use VPSHUFB tables; 12 and 7 use a shift pair with Y3 as scratch.
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Same quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Rotate rows 1, 2, 3 by 4, 8, 12 bytes so the next quarter-round works on the diagonals
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
// Second quarter-round on state 1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Second quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Undo the row rotation (12, 8, 4 bytes) to restore the original layout
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
DECQ R9
JNE sealAVX2192InnerCipherLoop
// Feed-forward: add the saved starting rows back into both states
VPADDD Y6, Y0, Y0
VPADDD Y6, Y5, Y5
VPADDD Y10, Y14, Y14
VPADDD Y10, Y9, Y9
VPADDD Y8, Y12, Y12
VPADDD Y8, Y13, Y13
VPADDD Y2, Y4, Y4
VPADDD Y15, Y1, Y1
// Y3 = low 128-bit lanes of Y0 and Y14: the 32 bytes used as the Poly1305 key
VPERM2I128 $0x02, Y0, Y14, Y3
// Clamp and store poly key
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 192 bytes
// Regroup lanes so the remaining keystream sits in Y0, Y14, Y12, Y4, Y5, Y9 in output order
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
// sealAVX2ShortSeal hashes the additional data, then falls into sealAVX2SealHash
// with CX = 0 (no ciphertext waiting to be hashed).
sealAVX2ShortSeal:
// Hash aad
// R9 carries the AD length into polyHashADInternal
MOVQ ad_len+80(FP), R9
CALL polyHashADInternal<>(SB)
XORQ CX, CX
// sealAVX2SealHash hashes ciphertext already written at (DI), 16 bytes per pass, until
// fewer than 16 pending bytes remain in CX; it then continues at sealAVX2ShortSealLoop.
sealAVX2SealHash:
// itr1 holds the number of bytes encrypted but not yet hashed
CMPQ CX, $0x10
JB sealAVX2ShortSealLoop
// acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// 16 fewer bytes pending; move the output pointer past the block just hashed
SUBQ $0x10, CX
ADDQ $0x10, DI
JMP sealAVX2SealHash
// sealAVX2ShortSealLoop processes 32 bytes per pass while at least 32 bytes remain in BX:
// XOR input at (SI) with the keystream in Y0, store at (DI), hash the two 16-byte halves,
// then move the next keystream register into Y0. With fewer than 32 bytes left it
// branches to sealAVX2ShortTail32.
sealAVX2ShortSealLoop:
CMPQ BX, $0x20
JB sealAVX2ShortTail32
SUBQ $0x20, BX
// Load for encryption
VPXOR (SI), Y0, Y0
VMOVDQU Y0, (DI)
LEAQ 32(SI), SI
// Now can hash
// First half: acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP), MULX form; the product is collected in R13:R14:R15:R8
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Second half: same add, multiply and reduce for the 16 bytes at 16(DI)
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
// Shift stream left
// Each register takes the value of the next one in the chain
// Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <- Y13 <- Y1 <- Y6 <- Y10
VMOVDQA Y14, Y0
VMOVDQA Y12, Y14
VMOVDQA Y4, Y12
VMOVDQA Y5, Y4
VMOVDQA Y9, Y5
VMOVDQA Y13, Y9
VMOVDQA Y1, Y13
VMOVDQA Y6, Y1
VMOVDQA Y10, Y6
JMP sealAVX2ShortSealLoop
// sealAVX2ShortTail32 handles the case of fewer than 32 bytes left in BX. If at least
// 16 remain it encrypts and hashes one 16-byte block using the low lane of Y0. In either
// case X1 is left holding the next unused 16 keystream bytes for sealSSETail.
sealAVX2ShortTail32:
CMPQ BX, $0x10
// The copy sits between CMPQ and JB, so X1 is loaded on both paths; JB still tests the CMPQ result
VMOVDQA X0, X1
JB sealAVX2ShortDone
SUBQ $0x10, BX
// Load for encryption
VPXOR (SI), X0, X12
VMOVDQU X12, (DI)
LEAQ 16(SI), SI
// Hash
// acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP), MULX form; the product is collected in R13:R14:R15:R8
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
// Bring the high 128-bit lane of Y0 down to the low lane and hand it over in X1
VPERM2I128 $0x11, Y0, Y0, Y0
VMOVDQA X0, X1
sealAVX2ShortDone:
// Clear the upper YMM halves before continuing in the SSE tail code
VZEROUPPER
JMP sealSSETail
// seal320AVX2 runs three YMM-wide ChaCha states side by side: (Y0, Y14, Y12, Y4),
// (Y5, Y9, Y13, Y1) and (Y6, Y10, Y8, Y2). States 2 and 3 are copies of state 1 with the
// last row advanced by one and two avx2IncMask steps. After 10 double rounds the result
// provides the clamped Poly1305 key stored at (BP) and up to 320 bytes of keystream,
// then control jumps to sealAVX2ShortSeal.
seal320AVX2:
VMOVDQA Y0, Y5
VMOVDQA Y14, Y9
VMOVDQA Y12, Y13
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VMOVDQA Y0, Y6
VMOVDQA Y14, Y10
VMOVDQA Y12, Y8
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
// Keep the starting rows 1 and 2 (Y7, Y11) and the first state's last row (Y15) for the feed-forward add
VMOVDQA Y14, Y7
VMOVDQA Y12, Y11
VMOVDQA Y4, Y15
// R9 = 10 loop iterations, each performing two rounds
MOVQ $0x0000000a, R9
sealAVX2320InnerCipherLoop:
// Quarter-round on state 1: add/xor/rotate by 16, 12, 8 and 7 bits (Y3 is scratch)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Quarter-round on state 3
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
// Rotate rows 1, 2, 3 by 4, 8, 12 bytes so the next quarter-round works on the diagonals
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
// Second quarter-round on state 1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Second quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Second quarter-round on state 3
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
// Undo the row rotation (12, 8, 4 bytes) to restore the original layout
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
DECQ R9
JNE sealAVX2320InnerCipherLoop
// Feed-forward: row 0 gets chacha20Constants, rows 1 and 2 get the copies saved in Y7 and Y11
VMOVDQA ·chacha20Constants<>+0(SB), Y3
VPADDD Y3, Y0, Y0
VPADDD Y3, Y5, Y5
VPADDD Y3, Y6, Y6
VPADDD Y7, Y14, Y14
VPADDD Y7, Y9, Y9
VPADDD Y7, Y10, Y10
VPADDD Y11, Y12, Y12
VPADDD Y11, Y13, Y13
VPADDD Y11, Y8, Y8
// Last row: Y15 starts as state 1's saved row and is stepped by avx2IncMask for states 2 and 3
VMOVDQA ·avx2IncMask<>+0(SB), Y3
VPADDD Y15, Y4, Y4
VPADDD Y3, Y15, Y15
VPADDD Y15, Y1, Y1
VPADDD Y3, Y15, Y15
VPADDD Y15, Y2, Y2
// Clamp and store poly key
// Y3 = low 128-bit lanes of Y0 and Y14: the 32 bytes used as the Poly1305 key
VPERM2I128 $0x02, Y0, Y14, Y3
VPAND ·polyClampMask<>+0(SB), Y3, Y3
VMOVDQA Y3, (BP)
// Stream for up to 320 bytes
// Regroup lanes into the register order consumed by the "Shift stream left" chain in
// sealAVX2ShortSealLoop: Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10
VPERM2I128 $0x13, Y0, Y14, Y0
VPERM2I128 $0x13, Y12, Y4, Y14
VPERM2I128 $0x02, Y5, Y9, Y12
VPERM2I128 $0x02, Y13, Y1, Y4
VPERM2I128 $0x13, Y5, Y9, Y5
VPERM2I128 $0x13, Y13, Y1, Y9
VPERM2I128 $0x02, Y6, Y10, Y13
VPERM2I128 $0x02, Y8, Y2, Y1
VPERM2I128 $0x13, Y6, Y10, Y6
VPERM2I128 $0x13, Y8, Y2, Y10
JMP sealAVX2ShortSeal
// sealAVX2Tail128 generates one more YMM-wide ChaCha state (128 bytes of keystream)
// while hashing ciphertext already written at (DI), then continues at sealAVX2ShortSealLoop.
// The state is rebuilt from chacha20Constants, the rows saved at 32(BP) and 64(BP), and the
// row saved at 192(BP) advanced by avx2IncMask.
// CX and R9 are set by the code that jumps here: LoopA runs while CX stays positive after
// its decrement, then LoopB alone runs R9 more times.
sealAVX2Tail128:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA 32(BP), Y14
VMOVDQA 64(BP), Y12
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
// Y1 keeps the starting last row for the feed-forward add
VMOVDQA Y4, Y1
sealAVX2Tail128LoopA:
// Extra 16-byte hash step taken only on LoopA passes
// acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail128LoopB:
// One double round of ChaCha interleaved with hashing 32 bytes at (DI).
// First quarter-round: add/xor/rotate by 16, 12, 8 and 7 bits (Y3 is scratch)
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Hash the 16 bytes at (DI): add, multiply by the key words, reduce
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Rotate rows 1, 2, 3 by 4, 8, 12 bytes so the next quarter-round works on the diagonals
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x0c, Y4, Y4, Y4
// Second quarter-round
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Hash the 16 bytes at 16(DI): add, multiply by the key words, reduce
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
// Undo the row rotation (12, 8, 4 bytes)
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x04, Y4, Y4, Y4
// Loop control: signed tests, so LoopA repeats while CX > 0 and LoopB while R9 >= 0 after the decrements
DECQ CX
JG sealAVX2Tail128LoopA
DECQ R9
JGE sealAVX2Tail128LoopB
// Feed-forward into Y5, Y9, Y13, Y1, then regroup lanes into Y0, Y14, Y12, Y4 as keystream
VPADDD ·chacha20Constants<>+0(SB), Y0, Y5
VPADDD 32(BP), Y14, Y9
VPADDD 64(BP), Y12, Y13
VPADDD Y1, Y4, Y1
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2ShortSealLoop
// sealAVX2Tail256 generates two more YMM-wide ChaCha states (256 bytes of keystream)
// while hashing ciphertext already written at (DI). It encrypts the first 128 bytes
// itself and leaves the other 128 bytes of keystream in Y0, Y14, Y12, Y4 before jumping
// to sealAVX2SealHash with CX = 128 bytes written but not yet hashed.
// CX and R9 on entry are set by the code that jumps here and control the LoopA/LoopB passes.
sealAVX2Tail256:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA ·chacha20Constants<>+0(SB), Y5
VMOVDQA 32(BP), Y14
VMOVDQA 32(BP), Y9
VMOVDQA 64(BP), Y12
VMOVDQA 64(BP), Y13
// Last rows: the row saved at 192(BP) advanced by one and two avx2IncMask steps
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
// Y7 and Y11 keep the starting last rows for the feed-forward add
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
sealAVX2Tail256LoopA:
// Extra 16-byte hash step taken only on LoopA passes
// acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail256LoopB:
// One double round of ChaCha on both states, interleaved with hashing 32 bytes at (DI).
// Quarter-round on state 1 (Y0, Y14, Y12, Y4); Y3 is scratch
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Quarter-round on state 2 (Y5, Y9, Y13, Y1)
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Hash the 16 bytes at (DI): add, multiply by the key words, reduce
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Rotate rows 1, 2, 3 by 4, 8, 12 bytes so the next quarter-round works on the diagonals
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
// Second quarter-round on state 1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Second quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Hash the 16 bytes at 16(DI): add, multiply by the key words, reduce
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
// Undo the row rotation (12, 8, 4 bytes)
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
// Loop control: signed tests, so LoopA repeats while CX > 0 and LoopB while R9 >= 0 after the decrements
DECQ CX
JG sealAVX2Tail256LoopA
DECQ R9
JGE sealAVX2Tail256LoopB
// Feed-forward: add the starting rows back into both states
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
// Regroup state 1 into 128 bytes of keystream, XOR with the input at (SI) and store at (DI)
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
// CX = 128 bytes written but not yet hashed; DI is left in place so sealAVX2SealHash hashes them
MOVQ $0x00000080, CX
LEAQ 128(SI), SI
SUBQ $0x80, BX
// State 2 becomes the pending keystream in Y0, Y14, Y12, Y4
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
JMP sealAVX2SealHash
// sealAVX2Tail384 generates three more YMM-wide ChaCha states (384 bytes of keystream)
// while hashing ciphertext already written at (DI). It encrypts the first 256 bytes
// itself and leaves the other 128 bytes of keystream in Y0, Y14, Y12, Y4 before jumping
// to sealAVX2SealHash with CX = 256 bytes written but not yet hashed.
// CX and R9 on entry are set by the code that jumps here and control the LoopA/LoopB passes.
sealAVX2Tail384:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
// Last rows: the row saved at 192(BP) advanced by one, two and three avx2IncMask steps
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
// Y7, Y11 and Y15 keep the starting last rows for the feed-forward add
VMOVDQA Y4, Y7
VMOVDQA Y1, Y11
VMOVDQA Y2, Y15
sealAVX2Tail384LoopA:
// Extra 16-byte hash step taken only on LoopA passes
// acc (R10:R11:R12) += 16 bytes at (DI), plus 1 in the 2^128 limb
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
// acc *= key words at (BP) and 8(BP); the product is collected in R13:R14:R15:R8
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
// Partial reduction mod 2^130-5: keep the low 130 bits and add the bits above them times 5 (4*c + c)
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail384LoopB:
// One double round of ChaCha on all three states, interleaved with hashing 32 bytes at (DI).
// Quarter-round on state 1 (Y0, Y14, Y12, Y4); Y3 is scratch
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Quarter-round on state 2 (Y5, Y9, Y13, Y1)
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Quarter-round on state 3 (Y6, Y10, Y8, Y2)
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
// Hash the 16 bytes at (DI): add, multiply by the key words, reduce
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Rotate rows 1, 2, 3 by 4, 8, 12 bytes so the next quarter-round works on the diagonals
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
// Second quarter-round on state 1
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x0c, Y14, Y3
VPSRLD $0x14, Y14, Y14
VPXOR Y3, Y14, Y14
VPADDD Y14, Y0, Y0
VPXOR Y0, Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPADDD Y4, Y12, Y12
VPXOR Y12, Y14, Y14
VPSLLD $0x07, Y14, Y3
VPSRLD $0x19, Y14, Y14
VPXOR Y3, Y14, Y14
// Second quarter-round on state 2
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x0c, Y9, Y3
VPSRLD $0x14, Y9, Y9
VPXOR Y3, Y9, Y9
VPADDD Y9, Y5, Y5
VPXOR Y5, Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPADDD Y1, Y13, Y13
VPXOR Y13, Y9, Y9
VPSLLD $0x07, Y9, Y3
VPSRLD $0x19, Y9, Y9
VPXOR Y3, Y9, Y9
// Second quarter-round on state 3
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x0c, Y10, Y3
VPSRLD $0x14, Y10, Y10
VPXOR Y3, Y10, Y10
VPADDD Y10, Y6, Y6
VPXOR Y6, Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPADDD Y2, Y8, Y8
VPXOR Y8, Y10, Y10
VPSLLD $0x07, Y10, Y3
VPSRLD $0x19, Y10, Y10
VPXOR Y3, Y10, Y10
// Hash the 16 bytes at 16(DI): add, multiply by the key words, reduce
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 32(DI), DI
// Undo the row rotation (12, 8, 4 bytes)
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
// Loop control: signed tests, so LoopA repeats while CX > 0 and LoopB while R9 >= 0 after the decrements
DECQ CX
JG sealAVX2Tail384LoopA
DECQ R9
JGE sealAVX2Tail384LoopB
// Feed-forward: add the starting rows back into all three states
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD Y7, Y4, Y4
VPADDD Y11, Y1, Y1
VPADDD Y15, Y2, Y2
// Regroup state 1 into 128 bytes of keystream, XOR with the input at (SI) and store at (DI)
VPERM2I128 $0x02, Y0, Y14, Y3
VPERM2I128 $0x02, Y12, Y4, Y7
VPERM2I128 $0x13, Y0, Y14, Y11
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR (SI), Y3, Y3
VPXOR 32(SI), Y7, Y7
VPXOR 64(SI), Y11, Y11
VPXOR 96(SI), Y15, Y15
VMOVDQU Y3, (DI)
VMOVDQU Y7, 32(DI)
VMOVDQU Y11, 64(DI)
VMOVDQU Y15, 96(DI)
// Same for state 2, covering bytes 128..255
VPERM2I128 $0x02, Y5, Y9, Y3
VPERM2I128 $0x02, Y13, Y1, Y7
VPERM2I128 $0x13, Y5, Y9, Y11
VPERM2I128 $0x13, Y13, Y1, Y15
VPXOR 128(SI), Y3, Y3
VPXOR 160(SI), Y7, Y7
VPXOR 192(SI), Y11, Y11
VPXOR 224(SI), Y15, Y15
VMOVDQU Y3, 128(DI)
VMOVDQU Y7, 160(DI)
VMOVDQU Y11, 192(DI)
VMOVDQU Y15, 224(DI)
// CX = 256 bytes written but not yet hashed; DI is left in place so sealAVX2SealHash hashes them
MOVQ $0x00000100, CX
LEAQ 256(SI), SI
SUBQ $0x00000100, BX
// State 3 becomes the pending keystream in Y0, Y14, Y12, Y4
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
JMP sealAVX2SealHash
sealAVX2Tail512:
VMOVDQA ·chacha20Constants<>+0(SB), Y0
VMOVDQA Y0, Y5
VMOVDQA Y0, Y6
VMOVDQA Y0, Y7
VMOVDQA 32(BP), Y14
VMOVDQA Y14, Y9
VMOVDQA Y14, Y10
VMOVDQA Y14, Y11
VMOVDQA 64(BP), Y12
VMOVDQA Y12, Y13
VMOVDQA Y12, Y8
VMOVDQA Y12, Y15
VMOVDQA 192(BP), Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y4
VPADDD ·avx2IncMask<>+0(SB), Y4, Y1
VPADDD ·avx2IncMask<>+0(SB), Y1, Y2
VPADDD ·avx2IncMask<>+0(SB), Y2, Y3
VMOVDQA Y4, 96(BP)
VMOVDQA Y1, 128(BP)
VMOVDQA Y2, 160(BP)
VMOVDQA Y3, 192(BP)
sealAVX2Tail512LoopA:
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), AX
MOVQ AX, R15
MULQ R10
MOVQ AX, R13
MOVQ DX, R14
MOVQ (BP), AX
MULQ R11
IMULQ R12, R15
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), AX
MOVQ AX, R8
MULQ R10
ADDQ AX, R14
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 8(BP), AX
MULQ R11
ADDQ AX, R15
ADCQ $0x00, DX
IMULQ R12, R8
ADDQ R10, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
LEAQ 16(DI), DI
sealAVX2Tail512LoopB:
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
ADDQ (DI), R10
ADCQ 8(DI), R11
ADCQ $0x01, R12
MOVQ (BP), DX
MOVQ DX, R15
MULXQ R10, R13, R14
IMULQ R12, R15
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
MULXQ R10, R10, AX
ADDQ R10, R14
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
VPALIGNR $0x04, Y14, Y14, Y14
VPALIGNR $0x04, Y9, Y9, Y9
VPALIGNR $0x04, Y10, Y10, Y10
VPALIGNR $0x04, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x0c, Y4, Y4, Y4
VPALIGNR $0x0c, Y1, Y1, Y1
VPALIGNR $0x0c, Y2, Y2, Y2
VPALIGNR $0x0c, Y3, Y3, Y3
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
VPSHUFB ·rol16<>+0(SB), Y4, Y4
VPSHUFB ·rol16<>+0(SB), Y1, Y1
VPSHUFB ·rol16<>+0(SB), Y2, Y2
VPSHUFB ·rol16<>+0(SB), Y3, Y3
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
// Add the 16 bytes at 16(DI) to the accumulator R10 (low), R11 (middle),
// R12 (high) as a 128-bit integer; the high limb also receives +1, so the
// value added is those 128 bits plus 2^128.
ADDQ 16(DI), R10
ADCQ 24(DI), R11
ADCQ $0x01, R12
// Multiply the accumulator by the two 64-bit words stored at 0(BP) and 8(BP).
// The product is collected in R13 (lowest), R14, R15, R8 (highest).
// MULXQ reads one factor implicitly from DX and leaves the flags untouched,
// which is why an ADDQ ... MULXQ ... ADCQ sequence is valid below.
MOVQ (BP), DX
MOVQ DX, R15
// R14:R13 = 0(BP) * R10
MULXQ R10, R13, R14
// R15 = low 64 bits of 0(BP) * R12
IMULQ R12, R15
// DX:AX = 0(BP) * R11
MULXQ R11, AX, DX
ADDQ AX, R14
ADCQ DX, R15
MOVQ 8(BP), DX
// AX:R10 = 8(BP) * R10
MULXQ R10, R10, AX
ADDQ R10, R14
// R8:R11 = 8(BP) * R11; carry from the previous ADDQ is consumed next
MULXQ R11, R11, R8
ADCQ R11, R15
ADCQ $0x00, R8
// DX = low 64 bits of 8(BP) * R12
IMULQ R12, DX
ADDQ AX, R15
ADCQ DX, R8
// Reduce the four-limb product to three limbs: keep the low 130 bits
// (R12 = R15 & 3), then add the bits above position 130 once multiplied by 4
// (R15 & ~3, R8) and once as-is ((R8:R15) >> 2). The sum of both is 5 times
// the overflow, matching a modulus of 2^130 - 5.
MOVQ R13, R10
MOVQ R14, R11
MOVQ R15, R12
ANDQ $0x03, R12
MOVQ R15, R13
ANDQ $-4, R13
MOVQ R8, R14
// R15 = (R8:R15) >> 2, then R8 >>= 2
SHRQ $0x02, R8, R15
SHRQ $0x02, R8
ADDQ R13, R10
ADCQ R14, R11
ADCQ $0x00, R12
ADDQ R15, R10
ADCQ R8, R11
ADCQ $0x00, R12
// Advance DI by 32 bytes.
LEAQ 32(DI), DI
// Rotate every 32-bit lane of b (Y14, Y9, Y10, Y11) left by 12:
// (b << 12) ^ (b >> 20). Y15 is spilled to 224(BP) so it can serve as the
// temporary, and is reloaded once the four rotations are done.
VMOVDQA Y15, 224(BP)
VPSLLD $0x0c, Y14, Y15
VPSRLD $0x14, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x0c, Y9, Y15
VPSRLD $0x14, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x0c, Y10, Y15
VPSRLD $0x14, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x0c, Y11, Y15
VPSRLD $0x14, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
// a += b   (a: Y0, Y5, Y6, Y7)
VPADDD Y14, Y0, Y0
VPADDD Y9, Y5, Y5
VPADDD Y10, Y6, Y6
VPADDD Y11, Y7, Y7
// d ^= a   (d: Y4, Y1, Y2, Y3)
VPXOR Y0, Y4, Y4
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
// Byte shuffle of d using the mask at ·rol8<>
// (NOTE(review): presumably a rotate-left by 8 of each 32-bit lane).
VPSHUFB ·rol8<>+0(SB), Y4, Y4
VPSHUFB ·rol8<>+0(SB), Y1, Y1
VPSHUFB ·rol8<>+0(SB), Y2, Y2
VPSHUFB ·rol8<>+0(SB), Y3, Y3
// c += d   (c: Y12, Y13, Y8, Y15)
VPADDD Y4, Y12, Y12
VPADDD Y1, Y13, Y13
VPADDD Y2, Y8, Y8
VPADDD Y3, Y15, Y15
// b ^= c
VPXOR Y12, Y14, Y14
VPXOR Y13, Y9, Y9
VPXOR Y8, Y10, Y10
VPXOR Y15, Y11, Y11
// Rotate every 32-bit lane of b left by 7: (b << 7) ^ (b >> 25),
// again borrowing Y15 via the 224(BP) spill slot.
VMOVDQA Y15, 224(BP)
VPSLLD $0x07, Y14, Y15
VPSRLD $0x19, Y14, Y14
VPXOR Y15, Y14, Y14
VPSLLD $0x07, Y9, Y15
VPSRLD $0x19, Y9, Y9
VPXOR Y15, Y9, Y9
VPSLLD $0x07, Y10, Y15
VPSRLD $0x19, Y10, Y10
VPXOR Y15, Y10, Y10
VPSLLD $0x07, Y11, Y15
VPSRLD $0x19, Y11, Y11
VPXOR Y15, Y11, Y11
VMOVDQA 224(BP), Y15
// Rotate each 128-bit half of b by 12 bytes, c by 8 and d by 4. Together with
// the earlier 4/8/12 rotation in this loop body, every register has now been
// rotated by a full 16 bytes, i.e. the lanes are back in their original order.
VPALIGNR $0x0c, Y14, Y14, Y14
VPALIGNR $0x0c, Y9, Y9, Y9
VPALIGNR $0x0c, Y10, Y10, Y10
VPALIGNR $0x0c, Y11, Y11, Y11
VPALIGNR $0x08, Y12, Y12, Y12
VPALIGNR $0x08, Y13, Y13, Y13
VPALIGNR $0x08, Y8, Y8, Y8
VPALIGNR $0x08, Y15, Y15, Y15
VPALIGNR $0x04, Y4, Y4, Y4
VPALIGNR $0x04, Y1, Y1, Y1
VPALIGNR $0x04, Y2, Y2, Y2
VPALIGNR $0x04, Y3, Y3, Y3
// Loop control: two counters drive two back-edges.
// While CX is still positive after the decrement, branch to
// sealAVX2Tail512LoopA (label is above this excerpt).
DECQ CX
JG sealAVX2Tail512LoopA
// Otherwise keep branching to sealAVX2Tail512LoopB until R9 goes negative.
DECQ R9
JGE sealAVX2Tail512LoopB
// Feed-forward additions after the rounds.
// The four "a" registers get the ·chacha20Constants<> table added.
VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
// The four "b" registers all get the value saved at 32(BP) and the four "c"
// registers the value saved at 64(BP).
// NOTE(review): presumably the two halves of the initial key state - confirm.
VPADDD 32(BP), Y14, Y14
VPADDD 32(BP), Y9, Y9
VPADDD 32(BP), Y10, Y10
VPADDD 32(BP), Y11, Y11
VPADDD 64(BP), Y12, Y12
VPADDD 64(BP), Y13, Y13
VPADDD 64(BP), Y8, Y8
VPADDD 64(BP), Y15, Y15
// Each "d" register has its own saved value (96/128/160/192(BP)).
// NOTE(review): presumably the per-group counter/nonce rows - confirm.
VPADDD 96(BP), Y4, Y4
VPADDD 128(BP), Y1, Y1
VPADDD 160(BP), Y2, Y2
VPADDD 192(BP), Y3, Y3
// Y15 is needed as scratch below; park its value in the 224(BP) spill slot.
VMOVDQA Y15, 224(BP)
// First group (Y0, Y14, Y12, Y4): VPERM2I128 $0x02 pairs the low 128-bit
// halves of its two register operands and $0x13 pairs the high halves. Each
// 32-byte result is XORed with the input at (SI) and stored to (DI).
VPERM2I128 $0x02, Y0, Y14, Y15
VPXOR (SI), Y15, Y15
VMOVDQU Y15, (DI)
VPERM2I128 $0x02, Y12, Y4, Y15
VPXOR 32(SI), Y15, Y15
VMOVDQU Y15, 32(DI)
VPERM2I128 $0x13, Y0, Y14, Y15
VPXOR 64(SI), Y15, Y15
VMOVDQU Y15, 64(DI)
VPERM2I128 $0x13, Y12, Y4, Y15
VPXOR 96(SI), Y15, Y15
VMOVDQU Y15, 96(DI)
// Second group (Y5, Y9, Y13, Y1) -> bytes 128..255. Y0/Y14/Y12/Y4 are free
// now and are reused as the permutation destinations.
VPERM2I128 $0x02, Y5, Y9, Y0
VPERM2I128 $0x02, Y13, Y1, Y14
VPERM2I128 $0x13, Y5, Y9, Y12
VPERM2I128 $0x13, Y13, Y1, Y4
VPXOR 128(SI), Y0, Y0
VPXOR 160(SI), Y14, Y14
VPXOR 192(SI), Y12, Y12
VPXOR 224(SI), Y4, Y4
VMOVDQU Y0, 128(DI)
VMOVDQU Y14, 160(DI)
VMOVDQU Y12, 192(DI)
VMOVDQU Y4, 224(DI)
// Third group (Y6, Y10, Y8, Y2) -> bytes 256..383.
VPERM2I128 $0x02, Y6, Y10, Y0
VPERM2I128 $0x02, Y8, Y2, Y14
VPERM2I128 $0x13, Y6, Y10, Y12
VPERM2I128 $0x13, Y8, Y2, Y4
VPXOR 256(SI), Y0, Y0
VPXOR 288(SI), Y14, Y14
VPXOR 320(SI), Y12, Y12
VPXOR 352(SI), Y4, Y4
VMOVDQU Y0, 256(DI)
VMOVDQU Y14, 288(DI)
VMOVDQU Y12, 320(DI)
VMOVDQU Y4, 352(DI)
// Bookkeeping for the 0x180 (384) bytes just written: CX = 384, SI advances
// by 384 and BX shrinks by 384. DI is left unchanged here.
// NOTE(review): presumably sealAVX2SealHash uses CX/DI to hash the bytes
// that were just stored - confirm at the jump target.
MOVQ $0x00000180, CX
LEAQ 384(SI), SI
SUBQ $0x00000180, BX
// Fourth group (Y7, Y11, saved Y15 at 224(BP), Y3): only regrouped into
// Y0, Y14, Y12, Y4 here; it is neither XORed with input nor stored before the
// jump.
VPERM2I128 $0x02, Y7, Y11, Y0
VPERM2I128 $0x02, 224(BP), Y3, Y14
VPERM2I128 $0x13, Y7, Y11, Y12
VPERM2I128 $0x13, 224(BP), Y3, Y4
JMP sealAVX2SealHash
The pages are generated with Golds v0.8.4. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PRs and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.