// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.

//go:build gc && !purego

#include "textflag.h"

// func polyHashADInternal<>()
//
// polyHashADInternal absorbs the additional data (AD) into a fresh Poly1305
// accumulator. It is a file-local helper with a register-based contract: it
// declares no stack frame ($0), takes no stack arguments and is NOSPLIT.
//
//   In:   CX = pointer to the AD bytes
//         R9 = AD length in bytes
//         BP = pointer to the 128-bit multiplier, read as two 64-bit words at
//              0(BP) and 8(BP); the callers in this file store the clamped
//              Poly1305 key there before calling
//   Out:  R10, R11, R12 = accumulator limbs (low, middle, high). The value is
//         folded modulo 2^130 - 5 after every block but is not necessarily
//         fully reduced.
//   Clobbers: AX, DX, R8, R13, R14, R15 and the flags. The generic path also
//         modifies CX and R9 (R9 ends at 0); the 13-byte path leaves both
//         untouched.
//
// Every block, including a short final one, is treated as a zero-padded
// 16-byte little-endian value with the 2^128 bit added (the 1 added to the
// high limb R12).
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// Start from a zero accumulator: R10 = low, R11 = middle, R12 = high limb.
	XORQ  R10, R10
	XORQ  R11, R11
	XORQ  R12, R12
	// Dedicated path for an AD of exactly 13 bytes; every other length takes
	// the generic loop below.
	// NOTE(review): 13 is presumably the AD length of a common caller (e.g. a
	// fixed-size record header) - the reason is not stated in this file.
	CMPQ  R9, $0x0d
	JNE   hashADLoop
	// Load the 13 bytes as one zero-padded 16-byte block without reading past
	// the end of the buffer: R10 = bytes 0..7, and R11 = bytes 5..12 shifted
	// right by 24 bits so that only bytes 8..12 remain, in its low 40 bits.
	MOVQ  (CX), R10
	MOVQ  5(CX), R11
	SHRQ  $0x18, R11
	// High limb = 1: the same 2^128 bit the generic loop adds to every block.
	// A plain MOVQ is enough because the accumulator was zero.
	MOVQ  $0x00000001, R12
	// Multiply the accumulator by the 128-bit value at (BP). The product is
	// collected in R13 (bits 0..63), R14 (64..127), R15 (128..191) and
	// R8 (192 and up). MULQ multiplies AX by its operand and leaves the
	// 128-bit result in DX:AX.
	// Part 1 - low multiplier word, 0(BP). R15 keeps a copy of the word for
	// the product with the high limb; IMULQ keeps only the low 64 bits of that
	// product.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	// Part 2 - high multiplier word, 8(BP). R8 keeps a copy of the word for
	// the product with the high limb. After the first MULQ the low limb is no
	// longer needed, so R10 is reused to hold the upper half of that product.
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	// Reduce modulo 2^130 - 5. Bits 0..129 of the product become the new
	// accumulator (R12 keeps only the two lowest bits of R15). The rest,
	// c = product >> 130, is folded back in as 5*c in two additions:
	//   4*c = R8:(R15 & -4), i.e. the bits above 130 left in place, and
	//   c   = R8:R15 shifted right by two bits (the three-operand SHRQ shifts
	//         R15 right while filling its top bits from R8).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	RET

hashADLoop:
	// Hash in 16 byte chunks
	CMPQ  R9, $0x10
	JB    hashADTail
	// Add the next 16-byte block plus the 2^128 bit (the $0x01 added to the
	// high limb) to the accumulator, then step CX and R9 past the block.
	ADDQ  (CX), R10
	ADCQ  8(CX), R11
	ADCQ  $0x01, R12
	LEAQ  16(CX), CX
	SUBQ  $0x10, R9
	// Accumulator *= value at (BP): same multiply sequence as in the 13-byte
	// path above, producing R13, R14, R15, R8.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	// Reduce modulo 2^130 - 5: keep bits 0..129 and add 5 * (product >> 130).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	JMP   hashADLoop

hashADTail:
	// Nothing left over: the AD length was a multiple of 16 (possibly zero).
	CMPQ R9, $0x00
	JE   hashADDone

	// Hash last < 16 byte tail
	// R13 (low) and R14 (high) collect the tail as a zero-padded 128-bit
	// little-endian value. R15 is cleared first because the byte loads below
	// only replace its low 8 bits, so it always holds a zero-extended byte.
	// CX is moved to one past the last tail byte because the loop reads
	// backwards.
	XORQ R13, R13
	XORQ R14, R14
	XORQ R15, R15
	ADDQ R9, CX

hashADTailLoop:
	// Shift R14:R13 left by one byte (the three-operand SHLQ shifts R14 left
	// while filling its low bits from the top of R13), then insert the next
	// byte, walking from the last tail byte down to the first.
	SHLQ  $0x08, R13, R14
	SHLQ  $0x08, R13
	MOVB  -1(CX), R15
	XORQ  R15, R13
	DECQ  CX
	// The loop ends when this decrement brings R9 to zero (ZF set).
	DECQ  R9
	JNE   hashADTailLoop
	// Add the zero-padded tail plus the 2^128 bit, exactly like a full block.
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x01, R12
	// Accumulator *= value at (BP): same multiply sequence as in the 13-byte
	// path above, producing R13, R14, R15, R8.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	// Reduce modulo 2^130 - 5: keep bits 0..129 and add 5 * (product >> 130).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

hashADDone:
	// Result is in R10, R11, R12.
	RET

// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Open(SB), $288-97
	// For aligned stack access
	MOVQ           SP, BP
	ADDQ           $0x20, BP
	ANDQ           $-32, BP
	MOVQ           dst_base+0(FP), DI
	MOVQ           key_base+24(FP), R8
	MOVQ           src_base+48(FP), SI
	MOVQ           src_len+56(FP), BX
	MOVQ           ad_base+72(FP), CX
	VZEROUPPER
	VMOVDQU        ·chacha20Constants<>+0(SB), Y0
	VBROADCASTI128 16(R8), Y14
	VBROADCASTI128 32(R8), Y12
	VBROADCASTI128 48(R8), Y4
	VPADDD         ·avx2InitMask<>+0(SB), Y4, Y4

	// Special optimization, for very short buffers
	CMPQ BX, $0xc0
	JBE  openAVX2192
	CMPQ BX, $0x00000140
	JBE  openAVX2320

	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA Y14, 32(BP)
	VMOVDQA Y12, 64(BP)
	VMOVDQA Y4, 192(BP)
	MOVQ    $0x0000000a, R9

openAVX2PreparePolyKey:
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x04, Y4, Y4, Y4
	DECQ       R9
	JNE        openAVX2PreparePolyKey
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     32(BP), Y14, Y14
	VPADDD     64(BP), Y12, Y12
	VPADDD     192(BP), Y4, Y4
	VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for the first 64 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14

	// Hash AD + first 64 bytes
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)
	XORQ CX, CX

openAVX2InitialHash64:
	ADDQ  (SI)(CX*1), R10
	ADCQ  8(SI)(CX*1), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	ADDQ  $0x10, CX
	CMPQ  CX, $0x40
	JNE   openAVX2InitialHash64

	// Decrypt the first 64 bytes
	VPXOR   (SI), Y0, Y0
	VPXOR   32(SI), Y14, Y14
	VMOVDQU Y0, (DI)
	VMOVDQU Y14, 32(DI)
	LEAQ    64(SI), SI
	LEAQ    64(DI), DI
	SUBQ    $0x40, BX

openAVX2MainLoop:
	CMPQ BX, $0x00000200
	JB   openAVX2MainLoopDone

	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	XORQ    CX, CX

openAVX2InternalLoop:
	ADDQ     (SI)(CX*1), R10
	ADCQ     8(SI)(CX*1), R11
	ADCQ     $0x01, R12
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	ADDQ     16(SI)(CX*1), R10
	ADCQ     24(SI)(CX*1), R11
	ADCQ     $0x01, R12
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	ADDQ     32(SI)(CX*1), R10
	ADCQ     40(SI)(CX*1), R11
	ADCQ     $0x01, R12
	LEAQ     48(CX), CX
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	CMPQ     CX, $0x000001e0
	JNE      openAVX2InternalLoop
	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD   32(BP), Y14, Y14
	VPADDD   32(BP), Y9, Y9
	VPADDD   32(BP), Y10, Y10
	VPADDD   32(BP), Y11, Y11
	VPADDD   64(BP), Y12, Y12
	VPADDD   64(BP), Y13, Y13
	VPADDD   64(BP), Y8, Y8
	VPADDD   64(BP), Y15, Y15
	VPADDD   96(BP), Y4, Y4
	VPADDD   128(BP), Y1, Y1
	VPADDD   160(BP), Y2, Y2
	VPADDD   192(BP), Y3, Y3
	VMOVDQA  Y15, 224(BP)

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	ADDQ       480(SI), R10
	ADCQ       488(SI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPERM2I128 $0x13, Y0, Y14, Y14
	VPERM2I128 $0x02, Y12, Y4, Y0
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPXOR      (SI), Y15, Y15
	VPXOR      32(SI), Y0, Y0
	VPXOR      64(SI), Y14, Y14
	VPXOR      96(SI), Y12, Y12
	VMOVDQU    Y15, (DI)
	VMOVDQU    Y0, 32(DI)
	VMOVDQU    Y14, 64(DI)
	VMOVDQU    Y12, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      128(SI), Y0, Y0
	VPXOR      160(SI), Y14, Y14
	VPXOR      192(SI), Y12, Y12
	VPXOR      224(SI), Y4, Y4
	VMOVDQU    Y0, 128(DI)
	VMOVDQU    Y14, 160(DI)
	VMOVDQU    Y12, 192(DI)
	VMOVDQU    Y4, 224(DI)

	// and here
	ADDQ       496(SI), R10
	ADCQ       504(SI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR      256(SI), Y0, Y0
	VPXOR      288(SI), Y14, Y14
	VPXOR      320(SI), Y12, Y12
	VPXOR      352(SI), Y4, Y4
	VMOVDQU    Y0, 256(DI)
	VMOVDQU    Y14, 288(DI)
	VMOVDQU    Y12, 320(DI)
	VMOVDQU    Y4, 352(DI)
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	VPXOR      384(SI), Y0, Y0
	VPXOR      416(SI), Y14, Y14
	VPXOR      448(SI), Y12, Y12
	VPXOR      480(SI), Y4, Y4
	VMOVDQU    Y0, 384(DI)
	VMOVDQU    Y14, 416(DI)
	VMOVDQU    Y12, 448(DI)
	VMOVDQU    Y4, 480(DI)
	LEAQ       512(SI), SI
	LEAQ       512(DI), DI
	SUBQ       $0x00000200, BX
	JMP        openAVX2MainLoop

openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ BX, BX
	JE    openSSEFinalize
	CMPQ  BX, $0x80
	JBE   openAVX2Tail128
	CMPQ  BX, $0x00000100
	JBE   openAVX2Tail256
	CMPQ  BX, $0x00000180
	JBE   openAVX2Tail384
	JMP   openAVX2Tail512

openSSEFinalize:
	// Hash in the PT, AAD lengths
	ADDQ  ad_len+80(FP), R10
	ADCQ  src_len+56(FP), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Final reduce
	MOVQ    R10, R13
	MOVQ    R11, R14
	MOVQ    R12, R15
	SUBQ    $-5, R10
	SBBQ    $-1, R11
	SBBQ    $0x03, R12
	CMOVQCS R13, R10
	CMOVQCS R14, R11
	CMOVQCS R15, R12

	// Add in the "s" part of the key
	ADDQ 16(BP), R10
	ADCQ 24(BP), R11

	// Finally, constant time compare to the tag at the end of the message
	XORQ    AX, AX
	MOVQ    $0x00000001, DX
	XORQ    (SI), R10
	XORQ    8(SI), R11
	ORQ     R11, R10
	CMOVQEQ DX, AX

	// Return true iff tags are equal
	MOVB AX, ret+96(FP)
	RET

openSSETail16:
	TESTQ BX, BX
	JE    openSSEFinalize

	// We can safely load the CT from the end, because it is padded with the MAC
	MOVQ  BX, R9
	SHLQ  $0x04, R9
	LEAQ  ·andMask<>+0(SB), R13
	MOVOU (SI), X12
	ADDQ  BX, SI
	PAND  -16(R13)(R9*1), X12
	MOVO  X12, 64(BP)
	MOVQ  X12, R13
	MOVQ  72(BP), R14
	PXOR  X1, X12

	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
	MOVQ   X12, R8
	MOVB   R8, (DI)
	PSRLDQ $0x01, X12
	INCQ   DI
	DECQ   BX
	JNE    openSSETail16Store
	ADDQ   R13, R10
	ADCQ   R14, R11
	ADCQ   $0x01, R12
	MOVQ   (BP), AX
	MOVQ   AX, R15
	MULQ   R10
	MOVQ   AX, R13
	MOVQ   DX, R14
	MOVQ   (BP), AX
	MULQ   R11
	IMULQ  R12, R15
	ADDQ   AX, R14
	ADCQ   DX, R15
	MOVQ   8(BP), AX
	MOVQ   AX, R8
	MULQ   R10
	ADDQ   AX, R14
	ADCQ   $0x00, DX
	MOVQ   DX, R10
	MOVQ   8(BP), AX
	MULQ   R11
	ADDQ   AX, R15
	ADCQ   $0x00, DX
	IMULQ  R12, R8
	ADDQ   R10, R15
	ADCQ   DX, R8
	MOVQ   R13, R10
	MOVQ   R14, R11
	MOVQ   R15, R12
	ANDQ   $0x03, R12
	MOVQ   R15, R13
	ANDQ   $-4, R13
	MOVQ   R8, R14
	SHRQ   $0x02, R8, R15
	SHRQ   $0x02, R8
	ADDQ   R13, R10
	ADCQ   R14, R11
	ADCQ   $0x00, R12
	ADDQ   R15, R10
	ADCQ   R8, R11
	ADCQ   $0x00, R12
	JMP    openSSEFinalize

openAVX2192:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VMOVDQA Y4, Y2
	VMOVDQA Y1, Y15
	MOVQ    $0x0000000a, R9

openAVX2192InnerCipherLoop:
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	DECQ       R9
	JNE        openAVX2192InnerCipherLoop
	VPADDD     Y6, Y0, Y0
	VPADDD     Y6, Y5, Y5
	VPADDD     Y10, Y14, Y14
	VPADDD     Y10, Y9, Y9
	VPADDD     Y8, Y12, Y12
	VPADDD     Y8, Y13, Y13
	VPADDD     Y2, Y4, Y4
	VPADDD     Y15, Y1, Y1
	VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for up to 192 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9

openAVX2ShortOpen:
	// Hash
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)

openAVX2ShortOpenLoop:
	CMPQ BX, $0x20
	JB   openAVX2ShortTail32
	SUBQ $0x20, BX

	// Load for hashing
	ADDQ  (SI), R10
	ADCQ  8(SI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	ADDQ  16(SI), R10
	ADCQ  24(SI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Load for decryption
	VPXOR   (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ    32(SI), SI
	LEAQ    32(DI), DI

	// Shift stream left
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	VMOVDQA Y5, Y4
	VMOVDQA Y9, Y5
	VMOVDQA Y13, Y9
	VMOVDQA Y1, Y13
	VMOVDQA Y6, Y1
	VMOVDQA Y10, Y6
	JMP     openAVX2ShortOpenLoop

openAVX2ShortTail32:
	CMPQ    BX, $0x10
	VMOVDQA X0, X1
	JB      openAVX2ShortDone
	SUBQ    $0x10, BX

	// Load for hashing
	ADDQ  (SI), R10
	ADCQ  8(SI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Load for decryption
	VPXOR      (SI), X0, X12
	VMOVDQU    X12, (DI)
	LEAQ       16(SI), SI
	LEAQ       16(DI), DI
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA    X0, X1

openAVX2ShortDone:
	VZEROUPPER
	JMP openSSETail16

openAVX2320:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y14, Y7
	VMOVDQA Y12, Y11
	VMOVDQA Y4, Y15
	MOVQ    $0x0000000a, R9

openAVX2320InnerCipherLoop:
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ     R9
	JNE      openAVX2320InnerCipherLoop
	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
	VPADDD   Y3, Y0, Y0
	VPADDD   Y3, Y5, Y5
	VPADDD   Y3, Y6, Y6
	VPADDD   Y7, Y14, Y14
	VPADDD   Y7, Y9, Y9
	VPADDD   Y7, Y10, Y10
	VPADDD   Y11, Y12, Y12
	VPADDD   Y11, Y13, Y13
	VPADDD   Y11, Y8, Y8
	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
	VPADDD   Y15, Y4, Y4
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y1, Y1
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y2, Y2

	// Clamp and store poly key
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA    Y3, (BP)

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
	VPERM2I128 $0x02, Y6, Y10, Y13
	VPERM2I128 $0x02, Y8, Y2, Y1
	VPERM2I128 $0x13, Y6, Y10, Y6
	VPERM2I128 $0x13, Y8, Y2, Y10
	JMP        openAVX2ShortOpen

openAVX2Tail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	VMOVDQA ·chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y13
	VMOVDQA 192(BP), Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y1
	VMOVDQA Y1, Y4
	XORQ    R9, R9
	MOVQ    BX, CX
	ANDQ    $-16, CX
	TESTQ   CX, CX
	JE      openAVX2Tail128LoopB

openAVX2Tail128LoopA:
	ADDQ  (SI)(R9*1), R10
	ADCQ  8(SI)(R9*1), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

openAVX2Tail128LoopB:
	ADDQ       $0x10, R9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y1, Y1, Y1
	CMPQ       R9, CX
	JB         openAVX2Tail128LoopA
	CMPQ       R9, $0xa0
	JNE        openAVX2Tail128LoopB
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     32(BP), Y9, Y9
	VPADDD     64(BP), Y13, Y13
	VPADDD     Y4, Y1, Y1
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4

openAVX2TailLoop:
	CMPQ BX, $0x20
	JB   openAVX2Tail
	SUBQ $0x20, BX

	// Load for decryption
	VPXOR   (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ    32(SI), SI
	LEAQ    32(DI), DI
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	JMP     openAVX2TailLoop

openAVX2Tail:
	CMPQ    BX, $0x10
	VMOVDQA X0, X1
	JB      openAVX2TailDone
	SUBQ    $0x10, BX

	// Load for decryption
	VPXOR      (SI), X0, X12
	VMOVDQU    X12, (DI)
	LEAQ       16(SI), SI
	LEAQ       16(DI), DI
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA    X0, X1

openAVX2TailDone:
	VZEROUPPER
	JMP openSSETail16

openAVX2Tail256:
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

	// Compute the number of iterations that will hash data
	MOVQ    BX, 224(BP)
	MOVQ    BX, CX
	SUBQ    $0x80, CX
	SHRQ    $0x04, CX
	MOVQ    $0x0000000a, R9
	CMPQ    CX, $0x0a
	CMOVQGT R9, CX
	MOVQ    SI, BX
	XORQ    R9, R9

openAVX2Tail256LoopA:
	ADDQ  (BX), R10
	ADCQ  8(BX), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(BX), BX

openAVX2Tail256LoopB:
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	INCQ     R9
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	CMPQ     R9, CX
	JB       openAVX2Tail256LoopA
	CMPQ     R9, $0x0a
	JNE      openAVX2Tail256LoopB
	MOVQ     BX, R9
	SUBQ     SI, BX
	MOVQ     BX, CX
	MOVQ     224(BP), BX

openAVX2Tail256Hash:
	ADDQ  $0x10, CX
	CMPQ  CX, BX
	JGT   openAVX2Tail256HashEnd
	ADDQ  (R9), R10
	ADCQ  8(R9), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(R9), R9
	JMP   openAVX2Tail256Hash

openAVX2Tail256HashEnd:
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     Y7, Y4, Y4
	VPADDD     Y11, Y1, Y1
	VPERM2I128 $0x02, Y0, Y14, Y6
	VPERM2I128 $0x02, Y12, Y4, Y10
	VPERM2I128 $0x13, Y0, Y14, Y8
	VPERM2I128 $0x13, Y12, Y4, Y2
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      (SI), Y6, Y6
	VPXOR      32(SI), Y10, Y10
	VPXOR      64(SI), Y8, Y8
	VPXOR      96(SI), Y2, Y2
	VMOVDQU    Y6, (DI)
	VMOVDQU    Y10, 32(DI)
	VMOVDQU    Y8, 64(DI)
	VMOVDQU    Y2, 96(DI)
	LEAQ       128(SI), SI
	LEAQ       128(DI), DI
	SUBQ       $0x80, BX
	JMP        openAVX2TailLoop

openAVX2Tail384:
	// Need to decrypt up to 384 bytes - prepare six blocks
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)

	// Compute the number of iterations that will hash two blocks of data
	MOVQ    BX, 224(BP)
	MOVQ    BX, CX
	SUBQ    $0x00000100, CX
	SHRQ    $0x04, CX
	ADDQ    $0x06, CX
	MOVQ    $0x0000000a, R9
	CMPQ    CX, $0x0a
	CMOVQGT R9, CX
	MOVQ    SI, BX
	XORQ    R9, R9

openAVX2Tail384LoopB:
	ADDQ  (BX), R10
	ADCQ  8(BX), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(BX), BX

openAVX2Tail384LoopA:
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	ADDQ     (BX), R10
	ADCQ     8(BX), R11
	ADCQ     $0x01, R12
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	LEAQ     16(BX), BX
	INCQ     R9
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	CMPQ     R9, CX
	JB       openAVX2Tail384LoopB
	CMPQ     R9, $0x0a
	JNE      openAVX2Tail384LoopA
	MOVQ     BX, R9
	SUBQ     SI, BX
	MOVQ     BX, CX
	MOVQ     224(BP), BX

openAVX2Tail384Hash:
	ADDQ  $0x10, CX
	CMPQ  CX, BX
	JGT   openAVX2Tail384HashEnd
	ADDQ  (R9), R10
	ADCQ  8(R9), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(R9), R9
	JMP   openAVX2Tail384Hash

openAVX2Tail384HashEnd:
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     96(BP), Y4, Y4
	VPADDD     128(BP), Y1, Y1
	VPADDD     160(BP), Y2, Y2
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      (SI), Y3, Y3
	VPXOR      32(SI), Y7, Y7
	VPXOR      64(SI), Y11, Y11
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y3, (DI)
	VMOVDQU    Y7, 32(DI)
	VMOVDQU    Y11, 64(DI)
	VMOVDQU    Y15, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y3
	VPERM2I128 $0x02, Y13, Y1, Y7
	VPERM2I128 $0x13, Y5, Y9, Y11
	VPERM2I128 $0x13, Y13, Y1, Y15
	VPXOR      128(SI), Y3, Y3
	VPXOR      160(SI), Y7, Y7
	VPXOR      192(SI), Y11, Y11
	VPXOR      224(SI), Y15, Y15
	VMOVDQU    Y3, 128(DI)
	VMOVDQU    Y7, 160(DI)
	VMOVDQU    Y11, 192(DI)
	VMOVDQU    Y15, 224(DI)
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	LEAQ       256(SI), SI
	LEAQ       256(DI), DI
	SUBQ       $0x00000100, BX
	JMP        openAVX2TailLoop

openAVX2Tail512:
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	XORQ    CX, CX
	MOVQ    SI, R9

openAVX2Tail512LoopB:
	ADDQ  (R9), R10
	ADCQ  8(R9), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(R9), R9

openAVX2Tail512LoopA:
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	ADDQ     (R9), R10
	ADCQ     8(R9), R11
	ADCQ     $0x01, R12
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	ADDQ     16(R9), R10
	ADCQ     24(R9), R11
	ADCQ     $0x01, R12
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	LEAQ     32(R9), R9
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	INCQ     CX
	CMPQ     CX, $0x04
	JLT      openAVX2Tail512LoopB
	CMPQ     CX, $0x0a
	JNE      openAVX2Tail512LoopA
	MOVQ     BX, CX
	SUBQ     $0x00000180, CX
	ANDQ     $-16, CX

openAVX2Tail512HashLoop:
	TESTQ CX, CX
	JE    openAVX2Tail512HashEnd
	ADDQ  (R9), R10
	ADCQ  8(R9), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(R9), R9
	SUBQ  $0x10, CX
	JMP   openAVX2Tail512HashLoop

openAVX2Tail512HashEnd:
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     32(BP), Y11, Y11
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     64(BP), Y15, Y15
	VPADDD     96(BP), Y4, Y4
	VPADDD     128(BP), Y1, Y1
	VPADDD     160(BP), Y2, Y2
	VPADDD     192(BP), Y3, Y3
	VMOVDQA    Y15, 224(BP)
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPERM2I128 $0x13, Y0, Y14, Y14
	VPERM2I128 $0x02, Y12, Y4, Y0
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPXOR      (SI), Y15, Y15
	VPXOR      32(SI), Y0, Y0
	VPXOR      64(SI), Y14, Y14
	VPXOR      96(SI), Y12, Y12
	VMOVDQU    Y15, (DI)
	VMOVDQU    Y0, 32(DI)
	VMOVDQU    Y14, 64(DI)
	VMOVDQU    Y12, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      128(SI), Y0, Y0
	VPXOR      160(SI), Y14, Y14
	VPXOR      192(SI), Y12, Y12
	VPXOR      224(SI), Y4, Y4
	VMOVDQU    Y0, 128(DI)
	VMOVDQU    Y14, 160(DI)
	VMOVDQU    Y12, 192(DI)
	VMOVDQU    Y4, 224(DI)
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR      256(SI), Y0, Y0
	VPXOR      288(SI), Y14, Y14
	VPXOR      320(SI), Y12, Y12
	VPXOR      352(SI), Y4, Y4
	VMOVDQU    Y0, 256(DI)
	VMOVDQU    Y14, 288(DI)
	VMOVDQU    Y12, 320(DI)
	VMOVDQU    Y4, 352(DI)
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	LEAQ       384(SI), SI
	LEAQ       384(DI), DI
	SUBQ       $0x00000180, BX
	JMP        openAVX2TailLoop

// chacha20Constants is a 32-byte read-only table holding the four 32-bit
// words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574. Stored little-endian,
// their bytes spell the ASCII string "expand 32-byte k".
// The 16-byte pattern is stored twice (offsets 0-15 and 16-31) so that a
// single 256-bit load puts the same four words in both 128-bit lanes of a
// YMM register.
DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32

// avx2InitMask is a 32-byte read-only table that is zero everywhere except
// for the value 1 at byte offset 16, i.e. in the first word of the upper
// 128-bit lane. Adding it with VPADDD to a register whose two lanes hold the
// same broadcast 16 bytes leaves the lower lane unchanged and increments the
// first 32-bit word of the upper lane by one.
// NOTE(review): that word is presumably the ChaCha20 block counter, making the
// two lanes consecutive blocks - confirm against the key/state layout.
DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32

// rol16 is a 32-byte VPSHUFB control mask. Within every 32-bit word the source
// byte indices are, from lowest destination byte to highest, 2, 3, 0, 1
// (then 6, 7, 4, 5 and so on), which swaps the two 16-bit halves of each word,
// i.e. rotates each 32-bit word by 16 bits.
// The 16-byte pattern is repeated so both 128-bit lanes of a YMM register are
// shuffled the same way.
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA ·rol16<>+16(SB)/8, $0x0504070601000302
DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·rol16<>(SB), RODATA|NOPTR, $32

// rol8 is a 32-byte VPSHUFB control mask. Within every 32-bit word the source
// byte indices are, from lowest destination byte to highest, 3, 0, 1, 2
// (then 7, 4, 5, 6 and so on): each byte moves up one position and the top
// byte wraps to the bottom, i.e. each 32-bit word is rotated left by 8 bits.
// The 16-byte pattern is repeated so both 128-bit lanes of a YMM register are
// shuffled the same way.
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA ·rol8<>+16(SB)/8, $0x0605040702010003
DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL ·rol8<>(SB), RODATA|NOPTR, $32

// polyClampMask is a 32-byte AND mask applied with VPAND to the Poly1305 key
// material before it is stored ("Clamp and store poly key").
// Bytes 0-15: 0x0ffffffc0ffffffc0ffffffc0fffffff as a 128-bit little-endian
// value - clears the top 4 bits of every 32-bit word and the low 2 bits of
// words 1-3.
// Bytes 16-31: all ones, so the upper 128 bits pass through the AND unchanged.
DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32

// avx2IncMask is a 32-byte read-only table holding the value 2 in the first
// word of each 128-bit lane (byte offsets 0 and 16) and zero elsewhere.
// Adding it with VPADDD steps the first 32-bit word of both lanes by 2; the
// code chains such additions to derive one register from the previous one
// (e.g. Y4 -> Y1 -> Y2 -> Y3).
// NOTE(review): the step is presumably 2 because each YMM register carries two
// ChaCha20 blocks (one per lane) - confirm against the state layout.
DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32

// andMask is a 240-byte read-only table of fifteen 16-byte masks.
// Entry k (k = 0..14) starts at byte offset 16*k and has its lowest k+1 bytes
// set to 0xff and the remaining bytes zero, so the mask that keeps the low
// n bytes (1 <= n <= 15) of a 128-bit value lives at offset 16*(n-1).
// Each entry is written as two 8-byte halves: low quadword first, then high.
// NOTE(review): presumably used to zero the unused bytes of a partial final
// 16-byte block; the consumer is not visible in this part of the file.
DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
DATA ·andMask<>+8(SB)/8, $0x0000000000000000
DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
DATA ·andMask<>+24(SB)/8, $0x0000000000000000
DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+40(SB)/8, $0x0000000000000000
DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+56(SB)/8, $0x0000000000000000
DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+72(SB)/8, $0x0000000000000000
DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+88(SB)/8, $0x0000000000000000
DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+104(SB)/8, $0x0000000000000000
DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+120(SB)/8, $0x0000000000000000
DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL ·andMask<>(SB), RODATA|NOPTR, $240

// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
// Requires: AVX, AVX2, BMI2, CMOV, SSE2
TEXT ·chacha20Poly1305Seal(SB), $288-96
	MOVQ           SP, BP
	ADDQ           $0x20, BP
	ANDQ           $-32, BP
	MOVQ           dst_base+0(FP), DI
	MOVQ           key_base+24(FP), R8
	MOVQ           src_base+48(FP), SI
	MOVQ           src_len+56(FP), BX
	MOVQ           ad_base+72(FP), CX
	VZEROUPPER
	VMOVDQU        ·chacha20Constants<>+0(SB), Y0
	VBROADCASTI128 16(R8), Y14
	VBROADCASTI128 32(R8), Y12
	VBROADCASTI128 48(R8), Y4
	VPADDD         ·avx2InitMask<>+0(SB), Y4, Y4

	// Special optimizations, for very short buffers
	CMPQ BX, $0x000000c0
	JBE  seal192AVX2
	CMPQ BX, $0x00000140
	JBE  seal320AVX2

	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA Y14, 32(BP)
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA Y12, 64(BP)
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, 96(BP)
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y1, 128(BP)
	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)
	MOVQ    $0x0000000a, R9

sealAVX2IntroLoop:
	VMOVDQA    Y15, 224(BP)
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y15
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y15
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y15
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y15
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y15
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y15
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VMOVDQA    224(BP), Y15
	VMOVDQA    Y13, 224(BP)
	VPADDD     Y11, Y7, Y7
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
	VPADDD     Y3, Y15, Y15
	VPXOR      Y15, Y11, Y11
	VPSLLD     $0x0c, Y11, Y13
	VPSRLD     $0x14, Y11, Y11
	VPXOR      Y13, Y11, Y11
	VPADDD     Y11, Y7, Y7
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
	VPADDD     Y3, Y15, Y15
	VPXOR      Y15, Y11, Y11
	VPSLLD     $0x07, Y11, Y13
	VPSRLD     $0x19, Y11, Y11
	VPXOR      Y13, Y11, Y11
	VMOVDQA    224(BP), Y13
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPALIGNR   $0x04, Y10, Y10, Y10
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x0c, Y2, Y2, Y2
	VPALIGNR   $0x04, Y11, Y11, Y11
	VPALIGNR   $0x08, Y15, Y15, Y15
	VPALIGNR   $0x0c, Y3, Y3, Y3
	VMOVDQA    Y15, 224(BP)
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y15
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y15
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y15
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y15
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y15
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y15
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VMOVDQA    224(BP), Y15
	VMOVDQA    Y13, 224(BP)
	VPADDD     Y11, Y7, Y7
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
	VPADDD     Y3, Y15, Y15
	VPXOR      Y15, Y11, Y11
	VPSLLD     $0x0c, Y11, Y13
	VPSRLD     $0x14, Y11, Y11
	VPXOR      Y13, Y11, Y11
	VPADDD     Y11, Y7, Y7
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
	VPADDD     Y3, Y15, Y15
	VPXOR      Y15, Y11, Y11
	VPSLLD     $0x07, Y11, Y13
	VPSRLD     $0x19, Y11, Y11
	VPXOR      Y13, Y11, Y11
	VMOVDQA    224(BP), Y13
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y1, Y1, Y1
	VPALIGNR   $0x0c, Y10, Y10, Y10
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x04, Y2, Y2, Y2
	VPALIGNR   $0x0c, Y11, Y11, Y11
	VPALIGNR   $0x08, Y15, Y15, Y15
	VPALIGNR   $0x04, Y3, Y3, Y3
	DECQ       R9
	JNE        sealAVX2IntroLoop
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     32(BP), Y11, Y11
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     64(BP), Y15, Y15
	VPADDD     96(BP), Y4, Y4
	VPADDD     128(BP), Y1, Y1
	VPADDD     160(BP), Y2, Y2
	VPADDD     192(BP), Y3, Y3
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPERM2I128 $0x02, Y0, Y14, Y4
	VPERM2I128 $0x13, Y0, Y14, Y0

	// Clamp and store poly key
	VPAND   ·polyClampMask<>+0(SB), Y4, Y4
	VMOVDQA Y4, (BP)

	// Hash AD
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)

	// Can store at least 320 bytes
	VPXOR      (SI), Y0, Y0
	VPXOR      32(SI), Y12, Y12
	VMOVDQU    Y0, (DI)
	VMOVDQU    Y12, 32(DI)
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      64(SI), Y0, Y0
	VPXOR      96(SI), Y14, Y14
	VPXOR      128(SI), Y12, Y12
	VPXOR      160(SI), Y4, Y4
	VMOVDQU    Y0, 64(DI)
	VMOVDQU    Y14, 96(DI)
	VMOVDQU    Y12, 128(DI)
	VMOVDQU    Y4, 160(DI)
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR      192(SI), Y0, Y0
	VPXOR      224(SI), Y14, Y14
	VPXOR      256(SI), Y12, Y12
	VPXOR      288(SI), Y4, Y4
	VMOVDQU    Y0, 192(DI)
	VMOVDQU    Y14, 224(DI)
	VMOVDQU    Y12, 256(DI)
	VMOVDQU    Y4, 288(DI)
	MOVQ       $0x00000140, CX
	SUBQ       $0x00000140, BX
	LEAQ       320(SI), SI
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, Y15, Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, Y15, Y3, Y4
	CMPQ       BX, $0x80
	JBE        sealAVX2SealHash
	VPXOR      (SI), Y0, Y0
	VPXOR      32(SI), Y14, Y14
	VPXOR      64(SI), Y12, Y12
	VPXOR      96(SI), Y4, Y4
	VMOVDQU    Y0, 320(DI)
	VMOVDQU    Y14, 352(DI)
	VMOVDQU    Y12, 384(DI)
	VMOVDQU    Y4, 416(DI)
	SUBQ       $0x80, BX
	LEAQ       128(SI), SI
	MOVQ       $0x00000008, CX
	MOVQ       $0x00000002, R9
	CMPQ       BX, $0x80
	JBE        sealAVX2Tail128
	CMPQ       BX, $0x00000100
	JBE        sealAVX2Tail256
	CMPQ       BX, $0x00000180
	JBE        sealAVX2Tail384
	CMPQ       BX, $0x00000200
	JBE        sealAVX2Tail512

	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
	VMOVDQA  ·chacha20Constants<>+0(SB), Y0
	VMOVDQA  Y0, Y5
	VMOVDQA  Y0, Y6
	VMOVDQA  Y0, Y7
	VMOVDQA  32(BP), Y14
	VMOVDQA  Y14, Y9
	VMOVDQA  Y14, Y10
	VMOVDQA  Y14, Y11
	VMOVDQA  64(BP), Y12
	VMOVDQA  Y12, Y13
	VMOVDQA  Y12, Y8
	VMOVDQA  Y12, Y15
	VMOVDQA  192(BP), Y4
	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD   ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD   ·avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA  Y4, 96(BP)
	VMOVDQA  Y1, 128(BP)
	VMOVDQA  Y2, 160(BP)
	VMOVDQA  Y3, 192(BP)
	VMOVDQA  Y15, 224(BP)
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VMOVDQA  224(BP), Y15
	VMOVDQA  Y13, 224(BP)
	VPADDD   Y11, Y7, Y7
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	VPADDD   Y3, Y15, Y15
	VPXOR    Y15, Y11, Y11
	VPSLLD   $0x0c, Y11, Y13
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y13, Y11, Y11
	VPADDD   Y11, Y7, Y7
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y3, Y15, Y15
	VPXOR    Y15, Y11, Y11
	VPSLLD   $0x07, Y11, Y13
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y13, Y11, Y11
	VMOVDQA  224(BP), Y13
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y3, Y3, Y3
	VMOVDQA  Y15, 224(BP)
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VMOVDQA  224(BP), Y15
	VMOVDQA  Y13, 224(BP)
	VPADDD   Y11, Y7, Y7
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	VPADDD   Y3, Y15, Y15
	VPXOR    Y15, Y11, Y11
	VPSLLD   $0x0c, Y11, Y13
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y13, Y11, Y11
	VPADDD   Y11, Y7, Y7
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y3, Y15, Y15
	VPXOR    Y15, Y11, Y11
	VPSLLD   $0x07, Y11, Y13
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y13, Y11, Y11
	VMOVDQA  224(BP), Y13
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y3, Y3, Y3
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	SUBQ     $0x10, DI
	MOVQ     $0x00000009, CX
	JMP      sealAVX2InternalLoopStart

sealAVX2MainLoop:
	// Top of the 512-byte-per-iteration loop: rebuild four working states.
	// Y0/Y5/Y6/Y7 get the constants row, Y14/Y9/Y10/Y11 the row saved at
	// 32(BP), and Y12/Y13/Y8/Y15 the row saved at 64(BP).
	VMOVDQU ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15

	// Counter rows: start from the last counter row used (192(BP)) and step
	// it by avx2IncMask once per state, giving Y4, Y1, Y2, Y3.
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3

	// Keep the four counter rows; they are added back after the rounds, and
	// 192(BP) is the starting point for the next batch of counters.
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)

	// CX = number of passes through sealAVX2InternalLoop.
	MOVQ    $0x0000000a, CX

sealAVX2InternalLoop:
	// Scalar Poly1305 work on output already written at DI is interleaved
	// with the vector ChaCha20 rounds. This first part absorbs the 16 bytes
	// at (DI) and performs, for all four states, the first half of a
	// quarter-round: a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12.
	//
	// Poly1305 accumulator: R10 (low), R11, R12 (high).
	// Key limbs: r0 = (BP), r1 = 8(BP).

	// acc += 16-byte block at (DI), plus 1 in the third limb (bit 128).
	ADDQ    (DI), R10
	ADCQ    8(DI), R11
	ADCQ    $0x01, R12
	VPADDD  Y14, Y0, Y0
	VPADDD  Y9, Y5, Y5
	VPADDD  Y10, Y6, Y6
	VPADDD  Y11, Y7, Y7

	// acc * r0 -> R13 (low), R14, R15. MULXQ takes DX as implicit operand.
	MOVQ    (BP), DX
	MOVQ    DX, R15
	MULXQ   R10, R13, R14
	IMULQ   R12, R15
	MULXQ   R11, AX, DX
	ADDQ    AX, R14
	ADCQ    DX, R15
	VPXOR   Y0, Y4, Y4
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	VPSHUFB ·rol16<>+0(SB), Y4, Y4
	VPSHUFB ·rol16<>+0(SB), Y1, Y1
	VPSHUFB ·rol16<>+0(SB), Y2, Y2
	VPSHUFB ·rol16<>+0(SB), Y3, Y3

	// acc * r1, accumulated one limb higher: result now in R13,R14,R15,R8.
	MOVQ    8(BP), DX
	MULXQ   R10, R10, AX
	ADDQ    R10, R14
	MULXQ   R11, R11, R8
	ADCQ    R11, R15
	ADCQ    $0x00, R8
	VPADDD  Y4, Y12, Y12
	VPADDD  Y1, Y13, Y13
	VPADDD  Y2, Y8, Y8
	VPADDD  Y3, Y15, Y15
	VPXOR   Y12, Y14, Y14
	VPXOR   Y13, Y9, Y9
	VPXOR   Y8, Y10, Y10
	VPXOR   Y15, Y11, Y11
	IMULQ   R12, DX
	ADDQ    AX, R15
	ADCQ    DX, R8

	// All 16 YMM registers are live, so spill Y15 to 224(BP) and use it as
	// the scratch register for the rotate-left-by-12 (shift left 12, shift
	// right 20, combine).
	VMOVDQA Y15, 224(BP)
	VPSLLD  $0x0c, Y14, Y15
	VPSRLD  $0x14, Y14, Y14
	VPXOR   Y15, Y14, Y14
	VPSLLD  $0x0c, Y9, Y15
	VPSRLD  $0x14, Y9, Y9
	VPXOR   Y15, Y9, Y9
	VPSLLD  $0x0c, Y10, Y15
	VPSRLD  $0x14, Y10, Y10
	VPXOR   Y15, Y10, Y10
	VPSLLD  $0x0c, Y11, Y15
	VPSRLD  $0x14, Y11, Y11
	VPXOR   Y15, Y11, Y11
	VMOVDQA 224(BP), Y15

	// Partial reduction of the product: keep bits 0..129 (R13, R14 and the
	// low two bits of R15) as the new accumulator, then add the part above
	// bit 130, c, back in as 5*c: once as 4*c (R15 with its low two bits
	// cleared, paired with R8) and once as c (R8:R15 shifted right by 2).
	MOVQ    R13, R10
	MOVQ    R14, R11
	MOVQ    R15, R12
	ANDQ    $0x03, R12
	MOVQ    R15, R13
	ANDQ    $-4, R13
	MOVQ    R8, R14
	SHRQ    $0x02, R8, R15
	SHRQ    $0x02, R8
	ADDQ    R13, R10
	ADCQ    R14, R11
	ADCQ    $0x00, R12
	ADDQ    R15, R10
	ADCQ    R8, R11
	ADCQ    $0x00, R12

sealAVX2InternalLoopStart:
	// Remainder of one loop pass. Also the entry point used by the code
	// before the main loop, which jumps here with CX = 9 and DI moved back by
	// 16 so that 16(DI) is the first byte still to be hashed.
	//
	// Second half of the first quarter-round set:
	// a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7.
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3

	// Poly1305: acc += 16-byte block at 16(DI), plus the bit-128 pad.
	ADDQ     16(DI), R10
	ADCQ     24(DI), R11
	ADCQ     $0x01, R12
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15

	// acc * r0 (r0 at (BP)) -> R13, R14, R15.
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11

	// Rotate-left-by-7 with Y15 spilled to 224(BP) and reused as scratch.
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15

	// acc * r1 (r1 at 8(BP)), accumulated one limb higher.
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8

	// Rotate the b, c and d rows by 4, 8 and 12 bytes within each 128-bit
	// lane so the next quarter-round set works on the diagonals.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x04, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	VPALIGNR $0x0c, Y3, Y3, Y3

	// Diagonal quarter-rounds, first half (rotations by 16 and 12).
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y3, Y3

	// Partial reduction: keep the low 130 bits and add 5 * (bits above 130).
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11

	// Poly1305: acc += 16-byte block at 32(DI), plus the bit-128 pad; then
	// advance DI past the 48 bytes absorbed by one full pass.
	ADDQ     32(DI), R10
	ADCQ     40(DI), R11
	ADCQ     $0x01, R12
	LEAQ     48(DI), DI
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x0c, Y14, Y15
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x0c, Y9, Y15
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x0c, Y10, Y15
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x0c, Y11, Y15
	VPSRLD   $0x14, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     (BP), DX
	MOVQ     DX, R15
	MULXQ    R10, R13, R14
	IMULQ    R12, R15
	MULXQ    R11, AX, DX
	ADDQ     AX, R14
	ADCQ     DX, R15

	// Diagonal quarter-rounds, second half (rotations by 8 and 7).
	VPADDD   Y14, Y0, Y0
	VPADDD   Y9, Y5, Y5
	VPADDD   Y10, Y6, Y6
	VPADDD   Y11, Y7, Y7
	VPXOR    Y0, Y4, Y4
	VPXOR    Y5, Y1, Y1
	VPXOR    Y6, Y2, Y2
	VPXOR    Y7, Y3, Y3
	MOVQ     8(BP), DX
	MULXQ    R10, R10, AX
	ADDQ     R10, R14
	MULXQ    R11, R11, R8
	ADCQ     R11, R15
	ADCQ     $0x00, R8
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	VPADDD   Y4, Y12, Y12
	VPADDD   Y1, Y13, Y13
	VPADDD   Y2, Y8, Y8
	VPADDD   Y3, Y15, Y15
	IMULQ    R12, DX
	ADDQ     AX, R15
	ADCQ     DX, R8
	VPXOR    Y12, Y14, Y14
	VPXOR    Y13, Y9, Y9
	VPXOR    Y8, Y10, Y10
	VPXOR    Y15, Y11, Y11
	VMOVDQA  Y15, 224(BP)
	VPSLLD   $0x07, Y14, Y15
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y15, Y14, Y14
	VPSLLD   $0x07, Y9, Y15
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y15, Y9, Y9
	VPSLLD   $0x07, Y10, Y15
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y15, Y10, Y10
	VPSLLD   $0x07, Y11, Y15
	VPSRLD   $0x19, Y11, Y11
	VPXOR    Y15, Y11, Y11
	VMOVDQA  224(BP), Y15
	MOVQ     R13, R10
	MOVQ     R14, R11
	MOVQ     R15, R12
	ANDQ     $0x03, R12
	MOVQ     R15, R13
	ANDQ     $-4, R13
	MOVQ     R8, R14
	SHRQ     $0x02, R8, R15
	SHRQ     $0x02, R8
	ADDQ     R13, R10
	ADCQ     R14, R11
	ADCQ     $0x00, R12
	ADDQ     R15, R10
	ADCQ     R8, R11
	ADCQ     $0x00, R12

	// Undo the lane rotation (12, 8, 4 bytes) so the rows line up in columns
	// again for the next pass.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x0c, Y11, Y11, Y11
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x08, Y15, Y15, Y15
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	VPALIGNR $0x04, Y3, Y3, Y3
	DECQ     CX
	JNE      sealAVX2InternalLoop

	// Rounds finished: add the initial state back into each working state
	// (constants, the rows saved at 32(BP) and 64(BP), and the per-state
	// counter rows saved at 96..192(BP)).
	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD   32(BP), Y14, Y14
	VPADDD   32(BP), Y9, Y9
	VPADDD   32(BP), Y10, Y10
	VPADDD   32(BP), Y11, Y11
	VPADDD   64(BP), Y12, Y12
	VPADDD   64(BP), Y13, Y13
	VPADDD   64(BP), Y8, Y8
	VPADDD   64(BP), Y15, Y15
	VPADDD   96(BP), Y4, Y4
	VPADDD   128(BP), Y1, Y1
	VPADDD   160(BP), Y2, Y2
	VPADDD   192(BP), Y3, Y3

	// Spill Y15 so it can serve as a temporary while the output is assembled;
	// the spilled value is read back from 224(BP) for the last 128 bytes.
	VMOVDQA  Y15, 224(BP)

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	// (ten loop passes absorb 48 bytes each). First 16 of those 32 bytes:
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Advance DI past both 16-byte blocks now; the second one is absorbed
	// further down through a negative offset. The stores below are relative
	// to this advanced DI.
	LEAQ       32(DI), DI

	// First state: regroup the 128-bit lanes of the row registers with
	// VPERM2I128, XOR with 128 bytes of input at SI, store at DI.
	// Y15 is free as a temporary because its row was spilled to 224(BP).
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPERM2I128 $0x13, Y0, Y14, Y14
	VPERM2I128 $0x02, Y12, Y4, Y0
	VPERM2I128 $0x13, Y12, Y4, Y12
	VPXOR      (SI), Y15, Y15
	VPXOR      32(SI), Y0, Y0
	VPXOR      64(SI), Y14, Y14
	VPXOR      96(SI), Y12, Y12
	VMOVDQU    Y15, (DI)
	VMOVDQU    Y0, 32(DI)
	VMOVDQU    Y14, 64(DI)
	VMOVDQU    Y12, 96(DI)

	// Second state -> bytes 128..255.
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      128(SI), Y0, Y0
	VPXOR      160(SI), Y14, Y14
	VPXOR      192(SI), Y12, Y12
	VPXOR      224(SI), Y4, Y4
	VMOVDQU    Y0, 128(DI)
	VMOVDQU    Y14, 160(DI)
	VMOVDQU    Y12, 192(DI)
	VMOVDQU    Y4, 224(DI)

	// and here: the second 16 of the remaining 32 bytes. DI was already
	// advanced by 32, hence the -16 offset.
	ADDQ       -16(DI), R10
	ADCQ       -8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Third state -> bytes 256..383.
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR      256(SI), Y0, Y0
	VPXOR      288(SI), Y14, Y14
	VPXOR      320(SI), Y12, Y12
	VPXOR      352(SI), Y4, Y4
	VMOVDQU    Y0, 256(DI)
	VMOVDQU    Y14, 288(DI)
	VMOVDQU    Y12, 320(DI)
	VMOVDQU    Y4, 352(DI)

	// Fourth state -> bytes 384..511. Its third row is read from the spill
	// slot at 224(BP) because Y15 was reused as a temporary above.
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	VPXOR      384(SI), Y0, Y0
	VPXOR      416(SI), Y14, Y14
	VPXOR      448(SI), Y12, Y12
	VPXOR      480(SI), Y4, Y4
	VMOVDQU    Y0, 384(DI)
	VMOVDQU    Y14, 416(DI)
	VMOVDQU    Y12, 448(DI)
	VMOVDQU    Y4, 480(DI)

	// Consume 512 input bytes. DI is not advanced here: it moves only as
	// data is hashed, so the 512 bytes just written are hashed during the
	// next iteration or by the tail code. Loop while more than 512 bytes
	// remain in BX (JG is a signed comparison).
	LEAQ       512(SI), SI
	SUBQ       $0x00000200, BX
	CMPQ       BX, $0x00000200
	JG         sealAVX2MainLoop

	// Tail can only hash 480 bytes, so absorb two 16-byte blocks (32 bytes)
	// of the output written above before handing over to a tail routine.
	// First block, at (DI):
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Second block, at 16(DI):
	ADDQ  16(DI), R10
	ADCQ  24(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  32(DI), DI

	// CX and R9 are the loop counters read by the tail routines. Pick the
	// routine from the number of input bytes left in BX (unsigned compares).
	MOVQ  $0x0000000a, CX
	MOVQ  $0x00000000, R9
	CMPQ  BX, $0x80
	JBE   sealAVX2Tail128
	CMPQ  BX, $0x00000100
	JBE   sealAVX2Tail256
	CMPQ  BX, $0x00000180
	JBE   sealAVX2Tail384
	JMP   sealAVX2Tail512

sealSSETail:
	// Encrypt and hash the final partial block of BX bytes; nothing to do if
	// BX is zero.
	// NOTE(review): the code below packs the bytes into one 128-bit value, so
	// it assumes BX < 16 here - confirm against the callers.
	TESTQ BX, BX
	JE    sealSSEFinalize

	// We can only load the PT one byte at a time to avoid read after end of buffer
	// R9  = BX * 16: byte offset one past the 16-byte andMask entry used later
	// R13 = base address of the andMask table
	// CX  = byte counter for the load loop
	// SI  = address of the last input byte (the loop walks backwards)
	// R8:R15 = 128-bit value being assembled; AX receives each byte
	MOVQ BX, R9
	SHLQ $0x04, R9
	LEAQ ·andMask<>+0(SB), R13
	MOVQ BX, CX
	LEAQ -1(SI)(BX*1), SI
	XORQ R15, R15
	XORQ R8, R8
	XORQ AX, AX

sealSSETailLoadLoop:
	// Shift the 128-bit value R8:R15 left by one byte (the three-operand SHLQ
	// shifts R8 and fills it from the top of R15), then insert the next input
	// byte at the bottom. Reading from the last byte backwards leaves the
	// first input byte in the lowest position.
	SHLQ   $0x08, R15, R8
	SHLQ   $0x08, R15
	MOVB   (SI), AX
	XORQ   AX, R15
	LEAQ   -1(SI), SI
	DECQ   CX
	JNE    sealSSETailLoadLoop

	// Move the assembled bytes through the scratch slot at 64(BP) and XOR
	// them into X1 (presumably the next 16 keystream bytes - the AVX2 short
	// path copies X0 into X1 before jumping to sealSSETail).
	MOVQ   R15, 64(BP)
	MOVQ   R8, 72(BP)
	PXOR   64(BP), X1

	// A full 16 bytes are stored even though only BX are ciphertext; the
	// surplus lies where the 16-byte tag is written once DI is advanced by BX.
	MOVOU  X1, (DI)

	// Mask X1 with andMask entry BX-1 before hashing (presumably this keeps
	// the low BX bytes and zeroes the rest - the table is defined elsewhere).
	MOVOU  -16(R13)(R9*1), X12
	PAND   X12, X1
	MOVQ   X1, R13
	PSRLDQ $0x08, X1
	MOVQ   X1, R14

	// Poly1305: acc += masked block, plus 1 in the third limb (bit 128).
	ADDQ   R13, R10
	ADCQ   R14, R11
	ADCQ   $0x01, R12

	// acc *= r using MULQ (implicit AX/DX): product in R13, R14, R15, R8.
	MOVQ   (BP), AX
	MOVQ   AX, R15
	MULQ   R10
	MOVQ   AX, R13
	MOVQ   DX, R14
	MOVQ   (BP), AX
	MULQ   R11
	IMULQ  R12, R15
	ADDQ   AX, R14
	ADCQ   DX, R15
	MOVQ   8(BP), AX
	MOVQ   AX, R8
	MULQ   R10
	ADDQ   AX, R14
	ADCQ   $0x00, DX
	MOVQ   DX, R10
	MOVQ   8(BP), AX
	MULQ   R11
	ADDQ   AX, R15
	ADCQ   $0x00, DX
	IMULQ  R12, R8
	ADDQ   R10, R15
	ADCQ   DX, R8

	// Partial reduction: keep the low 130 bits and add 5 * (bits above 130).
	MOVQ   R13, R10
	MOVQ   R14, R11
	MOVQ   R15, R12
	ANDQ   $0x03, R12
	MOVQ   R15, R13
	ANDQ   $-4, R13
	MOVQ   R8, R14
	SHRQ   $0x02, R8, R15
	SHRQ   $0x02, R8
	ADDQ   R13, R10
	ADCQ   R14, R11
	ADCQ   $0x00, R12
	ADDQ   R15, R10
	ADCQ   R8, R11
	ADCQ   $0x00, R12

	// DI now points just past the ciphertext, where the tag goes.
	ADDQ   BX, DI

sealSSEFinalize:
	// Hash in the buffer lengths
	// The last Poly1305 block is ad_len in the low 64 bits and src_len in the
	// high 64 bits, with the usual 1 added in the third limb.
	ADDQ  ad_len+80(FP), R10
	ADCQ  src_len+56(FP), R11
	ADCQ  $0x01, R12

	// acc *= r (r0 at (BP), r1 at 8(BP)): product in R13, R14, R15, R8.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Partial reduction: keep the low 130 bits and add 5 * (bits above 130).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Final reduce
	// Subtract p = 2^130 - 5, whose limbs are -5, -1 and 3. If that borrows
	// (carry set), the accumulator was already below p, so the saved copy in
	// R13/R14/R15 is restored by the conditional moves.
	MOVQ    R10, R13
	MOVQ    R11, R14
	MOVQ    R12, R15
	SUBQ    $-5, R10
	SBBQ    $-1, R11
	SBBQ    $0x03, R12
	CMOVQCS R13, R10
	CMOVQCS R14, R11
	CMOVQCS R15, R12

	// Add in the "s" part of the key
	// Only the low 128 bits are kept; the carry out of R11 is dropped.
	ADDQ 16(BP), R10
	ADCQ 24(BP), R11

	// Finally store the tag at the end of the message
	MOVQ R10, (DI)
	MOVQ R11, 8(DI)
	RET

seal192AVX2:
	// Short-input path working on two states. On entry Y0, Y14, Y12 and Y4
	// hold the four rows of the first state (set up before this label).
	// Second state: the same rows in Y5, Y9, Y13, with counter row Y1 equal
	// to Y4 stepped by avx2IncMask.
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1

	// Keep untouched copies of the initial rows (Y6, Y10, Y8) and of both
	// counter rows (Y2, Y15) for the addition after the rounds.
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VMOVDQA Y4, Y2
	VMOVDQA Y1, Y15

	// R9 = number of passes through the round loop.
	MOVQ    $0x0000000a, R9

sealAVX2192InnerCipherLoop:
	// One pass = quarter-rounds on both states, a lane rotation, quarter-
	// rounds again, and the inverse rotation. Y3 is the scratch register for
	// the rotate-by-12 and rotate-by-7 steps.

	// First state (Y0, Y14, Y12, Y4), rotations 16, 12, 8, 7.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Second state (Y5, Y9, Y13, Y1).
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Rotate rows b, c, d by 4, 8, 12 bytes per 128-bit lane (diagonalize).
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1

	// Quarter-rounds on the rotated rows, first state.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Quarter-rounds on the rotated rows, second state.
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Undo the lane rotation.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	DECQ       R9
	JNE        sealAVX2192InnerCipherLoop

	// Add the initial state back: Y6, Y10, Y8 are the saved first three rows,
	// Y2 and Y15 the saved counter rows of the first and second state.
	VPADDD     Y6, Y0, Y0
	VPADDD     Y6, Y5, Y5
	VPADDD     Y10, Y14, Y14
	VPADDD     Y10, Y9, Y9
	VPADDD     Y8, Y12, Y12
	VPADDD     Y8, Y13, Y13
	VPADDD     Y2, Y4, Y4
	VPADDD     Y15, Y1, Y1

	// Y3 = one 128-bit lane from each of Y0 and Y14: the 32 bytes that become
	// the Poly1305 key.
	VPERM2I128 $0x02, Y0, Y14, Y3

	// Clamp and store poly key
	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA Y3, (BP)

	// Stream for up to 192 bytes
	// Regroup the remaining lanes into consecutive 32-byte keystream chunks,
	// in the order the short seal loop consumes them: Y0, Y14, Y12, Y4, Y5, Y9.
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9

sealAVX2ShortSeal:
	// Hash aad
	// R9 = additional-data length. polyHashADInternal reads the data through
	// CX (presumably loaded before this point - it is not set here) and
	// leaves the Poly1305 accumulator in R10, R11, R12.
	MOVQ ad_len+80(FP), R9
	CALL polyHashADInternal<>(SB)

	// No output has been written yet on this path: CX = 0 bytes waiting to
	// be hashed, so the hash loop below falls straight through.
	XORQ CX, CX

sealAVX2SealHash:
	// CX (named itr1 in the generator) holds the number of bytes encrypted
	// but not yet hashed. Absorb them 16 at a time from DI until fewer than
	// 16 remain, then continue in the short seal loop.
	CMPQ  CX, $0x10
	JB    sealAVX2ShortSealLoop

	// Poly1305: acc += 16-byte block at (DI), plus 1 in the third limb.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12

	// acc *= r (r0 at (BP), r1 at 8(BP)): product in R13, R14, R15, R8.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Partial reduction: keep the low 130 bits and add 5 * (bits above 130).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// 16 fewer bytes pending; move the hash/output pointer forward.
	SUBQ  $0x10, CX
	ADDQ  $0x10, DI
	JMP   sealAVX2SealHash

sealAVX2ShortSealLoop:
	// Encrypt and hash 32 bytes per pass with the keystream chunk in Y0,
	// while at least 32 input bytes remain in BX.
	CMPQ BX, $0x20
	JB   sealAVX2ShortTail32
	SUBQ $0x20, BX

	// Load for encryption
	// XOR 32 input bytes with the keystream and write the result to DI.
	VPXOR   (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ    32(SI), SI

	// Now can hash
	// First 16-byte block of the ciphertext just written.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12

	// Second 16-byte block.
	ADDQ  16(DI), R10
	ADCQ  24(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  32(DI), DI

	// Shift stream left
	// The keystream chunks form a queue Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <-
	// Y13 <- Y1 <- Y6 <- Y10; move each one up a slot so Y0 holds the next.
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	VMOVDQA Y5, Y4
	VMOVDQA Y9, Y5
	VMOVDQA Y13, Y9
	VMOVDQA Y1, Y13
	VMOVDQA Y6, Y1
	VMOVDQA Y10, Y6
	JMP     sealAVX2ShortSealLoop

sealAVX2ShortTail32:
	// Fewer than 32 bytes remain. Copy the low keystream lane into X1, the
	// register sealSSETail XORs the final partial block with. VMOVDQA leaves
	// the flags alone, so JB still tests the CMPQ result.
	CMPQ    BX, $0x10
	VMOVDQA X0, X1
	JB      sealAVX2ShortDone
	SUBQ    $0x10, BX

	// Load for encryption
	// At least 16 bytes remain: encrypt one 16-byte block using X12 as the
	// destination so X0/Y0 stays intact.
	VPXOR   (SI), X0, X12
	VMOVDQU X12, (DI)
	LEAQ    16(SI), SI

	// Hash
	// Absorb the 16 ciphertext bytes just written.
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       16(DI), DI

	// Bring the upper 128-bit lane of Y0 down (imm 0x11 selects the high lane
	// for both halves) and hand it to the SSE tail through X1.
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA    X0, X1

sealAVX2ShortDone:
	// Clear the upper halves of the YMM registers before continuing in the
	// legacy-SSE tail code, which finishes the last (< 16 byte) block and
	// the tag.
	VZEROUPPER
	JMP sealSSETail

seal320AVX2:
	// Short-input path working on three states. On entry Y0, Y14, Y12 and Y4
	// hold the four rows of the first state (set up before this label).
	// Second state: Y5, Y9, Y13, Y1 with the counter row stepped once.
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1

	// Third state: Y6, Y10, Y8, Y2 with the counter row stepped twice.
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2

	// Untouched copies of the second row (Y7), third row (Y11) and first
	// counter row (Y15) for the addition after the rounds. The constants row
	// is reloaded from memory there instead of being saved.
	VMOVDQA Y14, Y7
	VMOVDQA Y12, Y11
	VMOVDQA Y4, Y15

	// R9 = number of passes through the round loop.
	MOVQ    $0x0000000a, R9

sealAVX2320InnerCipherLoop:
	// One pass = quarter-rounds on all three states, a lane rotation,
	// quarter-rounds again, and the inverse rotation. Y3 is the scratch
	// register for the rotate-by-12 and rotate-by-7 steps.

	// First state (Y0, Y14, Y12, Y4), rotations 16, 12, 8, 7.
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14

	// Second state (Y5, Y9, Y13, Y1).
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9

	// Third state (Y6, Y10, Y8, Y2).
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10

	// Rotate rows b, c, d by 4, 8, 12 bytes per 128-bit lane (diagonalize).
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2

	// Quarter-rounds on the rotated rows, first state.
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14

	// Quarter-rounds on the rotated rows, second state.
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9

	// Quarter-rounds on the rotated rows, third state.
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10

	// Undo the lane rotation.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ     R9
	JNE      sealAVX2320InnerCipherLoop

	// Add the initial state back: constants (reloaded into Y3), then the
	// saved second row (Y7) and third row (Y11).
	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
	VPADDD   Y3, Y0, Y0
	VPADDD   Y3, Y5, Y5
	VPADDD   Y3, Y6, Y6
	VPADDD   Y7, Y14, Y14
	VPADDD   Y7, Y9, Y9
	VPADDD   Y7, Y10, Y10
	VPADDD   Y11, Y12, Y12
	VPADDD   Y11, Y13, Y13
	VPADDD   Y11, Y8, Y8

	// Counter rows: Y15 starts as the first state's counter and is stepped
	// by avx2IncMask (in Y3) before each of the next two additions.
	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
	VPADDD   Y15, Y4, Y4
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y1, Y1
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y2, Y2

	// Clamp and store poly key
	// Y3 = one 128-bit lane from each of Y0 and Y14, masked and stored as the
	// 32-byte Poly1305 key at (BP).
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA    Y3, (BP)

	// Stream for up to 320 bytes
	// Regroup the remaining lanes into 32-byte keystream chunks, in the order
	// the short seal loop consumes them:
	// Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10.
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
	VPERM2I128 $0x02, Y6, Y10, Y13
	VPERM2I128 $0x02, Y8, Y2, Y1
	VPERM2I128 $0x13, Y6, Y10, Y6
	VPERM2I128 $0x13, Y8, Y2, Y10
	JMP        sealAVX2ShortSeal

sealAVX2Tail128:
	// At most 128 input bytes remain (both callers in this routine branch
	// here on BX <= 0x80). Build a single working state from the constants
	// and the rows saved at 32(BP) and 64(BP), with the counter row at
	// 192(BP) stepped by avx2IncMask. Y1 keeps a copy of the counter row for
	// the addition after the rounds.
	// CX and R9 are loop counters set by the caller.
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA 32(BP), Y14
	VMOVDQA 64(BP), Y12
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VMOVDQA Y4, Y1

sealAVX2Tail128LoopA:
	// Extra Poly1305 step, executed only while CX is still positive: absorb
	// the 16 bytes at (DI) and advance DI by 16, then fall into LoopB.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12

	// acc *= r (r0 at (BP), r1 at 8(BP)): product in R13, R14, R15, R8.
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Partial reduction: keep the low 130 bits and add 5 * (bits above 130).
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

sealAVX2Tail128LoopB:
	// One pass = two quarter-round sets on the single state (the second on
	// lane-rotated rows) interleaved with hashing 32 bytes at DI.
	// With the counter values set by the callers in this routine (CX=8, R9=2
	// or CX=10, R9=0) the pass runs 10 times in total.

	// Quarter-round, rotations 16, 12, 8, 7; Y3 is scratch.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Poly1305: absorb the 16 bytes at (DI).
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Rotate rows b, c, d by 4, 8, 12 bytes per 128-bit lane (diagonalize).
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x0c, Y4, Y4, Y4

	// Quarter-round on the rotated rows.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Poly1305: absorb the 16 bytes at 16(DI), then advance DI by 32.
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       32(DI), DI

	// Undo the lane rotation.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x04, Y4, Y4, Y4

	// While CX stays positive, go round through LoopA (16 extra hashed bytes
	// per pass); afterwards repeat LoopB alone while R9 stays non-negative.
	DECQ       CX
	JG         sealAVX2Tail128LoopA
	DECQ       R9
	JGE        sealAVX2Tail128LoopB

	// Add the initial state back, writing the sums to Y5, Y9, Y13, Y1
	// (Y1 held the saved counter row).
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y5
	VPADDD     32(BP), Y14, Y9
	VPADDD     64(BP), Y12, Y13
	VPADDD     Y1, Y4, Y1

	// Regroup the lanes into four 32-byte keystream chunks in Y0, Y14, Y12,
	// Y4 - the first four slots of the queue the short seal loop consumes.
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP        sealAVX2ShortSealLoop

sealAVX2Tail256:
	// Set up two ChaCha states: rows Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1.
	// Rows 0..2 are loaded from the constants table, 32(BP) and 64(BP).
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA ·chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA 64(BP), Y13

	// Row 3: take the row stored at 192(BP) and add avx2IncMask once for the
	// first state and twice for the second.
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1

	// Keep copies of the initial row 3 values; they are added back after the
	// rounds.
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

sealAVX2Tail256LoopA:
	// Hash one extra 16-byte block at (DI) into the Poly1305 accumulator
	// (R10, R11, R12), advance DI by 16, then fall through into LoopB.

	// Accumulator += 16 bytes at (DI), with 1 added into the top limb.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12

	// Multiply by the two 64-bit words at (BP) and 8(BP); the product is
	// collected in R13, R14, R15, R8 (low to high).
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Reduce: keep the low 130 bits and fold the bits above them (c) back in
	// as 4*c + c.
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

sealAVX2Tail256LoopB:
	// One ChaCha double round on two states (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1),
	// interleaved with two 16-byte Poly1305 block updates at (DI) and 16(DI).
	// NOTE(review): CX and R9 are loop counters set before entry to
	// sealAVX2Tail256, outside this chunk.

	// Column round, state 1: rotations of 16, 12, 8 and 7 bits (Y3 is scratch).
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Column round, state 2.
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Poly1305 block 1: accumulator (R10, R11, R12) += 16 bytes at (DI) with 1
	// added into the top limb, then multiply by the words at (BP)/8(BP) and
	// reduce (the bits above bit 130 are folded back in as 4*c + c).
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Rotate rows 1..3 of both states within each 128-bit lane (4/8/12 bytes)
	// so the next quarter rounds operate on the diagonals.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1

	// Diagonal round, state 1.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Diagonal round, state 2.
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Poly1305 block 2: same sequence for the 16 bytes at 16(DI).
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Both blocks hashed: advance the hash pointer by 32 bytes.
	LEAQ       32(DI), DI

	// Undo the diagonal rotation of rows 1..3.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1

	// Loop control: iterate via LoopA (one extra hashed block) while CX stays
	// positive after the decrement, then via LoopB while R9 stays non-negative.
	DECQ       CX
	JG         sealAVX2Tail256LoopA
	DECQ       R9
	JGE        sealAVX2Tail256LoopB

	// Rounds done: add the initial state back in (constants, 32(BP), 64(BP) and
	// the row 3 copies kept in Y7/Y11).
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     Y7, Y4, Y4
	VPADDD     Y11, Y1, Y1

	// Regroup state 1 by 128-bit lane ($0x02 pairs the low lanes, $0x13 the high
	// lanes), XOR with 128 bytes read from (SI) and store the result at (DI).
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      (SI), Y3, Y3
	VPXOR      32(SI), Y7, Y7
	VPXOR      64(SI), Y11, Y11
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y3, (DI)
	VMOVDQU    Y7, 32(DI)
	VMOVDQU    Y11, 64(DI)
	VMOVDQU    Y15, 96(DI)

	// 128 bytes were just written: CX = 128, advance SI by 128, subtract 128
	// from BX. NOTE(review): presumably CX is the number of freshly written
	// bytes that sealAVX2SealHash (outside this chunk) still has to hash, and
	// BX is the remaining length; confirm.
	MOVQ       $0x00000080, CX
	LEAQ       128(SI), SI
	SUBQ       $0x80, BX

	// Leave the regrouped second state in Y0, Y14, Y12, Y4 for the code at
	// sealAVX2SealHash.
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP        sealAVX2SealHash

sealAVX2Tail384:
	// Set up three ChaCha states: rows Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1 and
	// Y6/Y10/Y8/Y2. Rows 0..2 are the constants table, 32(BP) and 64(BP),
	// loaded once and copied.
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8

	// Row 3: the row stored at 192(BP) plus avx2IncMask once, twice and three
	// times for the three states.
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2

	// Keep copies of the initial row 3 values; they are added back after the
	// rounds.
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11
	VMOVDQA Y2, Y15

sealAVX2Tail384LoopA:
	// Hash one extra 16-byte block at (DI) into the Poly1305 accumulator
	// (R10, R11, R12), advance DI by 16, then fall through into LoopB.

	// Accumulator += 16 bytes at (DI), with 1 added into the top limb.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12

	// Multiply by the two 64-bit words at (BP) and 8(BP); the product is
	// collected in R13, R14, R15, R8 (low to high).
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Reduce: keep the low 130 bits and fold the bits above them (c) back in
	// as 4*c + c.
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

sealAVX2Tail384LoopB:
	// One ChaCha double round on three states (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1 and
	// Y6/Y10/Y8/Y2), interleaved with two 16-byte Poly1305 block updates at
	// (DI) and 16(DI).
	// NOTE(review): CX and R9 are loop counters set before entry to
	// sealAVX2Tail384, outside this chunk.

	// Column round, state 1: rotations of 16, 12, 8 and 7 bits (Y3 is scratch).
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Column round, state 2.
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Column round, state 3.
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y3
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y3, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y3
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y3, Y10, Y10

	// Poly1305 block 1: accumulator (R10, R11, R12) += 16 bytes at (DI) with 1
	// added into the top limb, then multiply by the words at (BP)/8(BP) and
	// reduce (the bits above bit 130 are folded back in as 4*c + c).
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Rotate rows 1..3 of all three states within each 128-bit lane (4/8/12
	// bytes) so the next quarter rounds operate on the diagonals.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x04, Y10, Y10, Y10
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPALIGNR   $0x0c, Y2, Y2, Y2

	// Diagonal round, state 1.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14

	// Diagonal round, state 2.
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9

	// Diagonal round, state 3.
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y3
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y3, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y3
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y3, Y10, Y10

	// Poly1305 block 2: same sequence for the 16 bytes at 16(DI).
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Both blocks hashed: advance the hash pointer by 32 bytes.
	LEAQ       32(DI), DI

	// Undo the diagonal rotation of rows 1..3.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x0c, Y10, Y10, Y10
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	VPALIGNR   $0x04, Y2, Y2, Y2

	// Loop control: iterate via LoopA (one extra hashed block) while CX stays
	// positive after the decrement, then via LoopB while R9 stays non-negative.
	DECQ       CX
	JG         sealAVX2Tail384LoopA
	DECQ       R9
	JGE        sealAVX2Tail384LoopB

	// Rounds done: add the initial state back in (constants, 32(BP), 64(BP) and
	// the row 3 copies kept in Y7/Y11/Y15).
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     Y7, Y4, Y4
	VPADDD     Y11, Y1, Y1
	VPADDD     Y15, Y2, Y2

	// State 1: regroup by 128-bit lane ($0x02 pairs the low lanes, $0x13 the
	// high lanes), XOR with bytes 0..127 at (SI) and store at (DI).
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      (SI), Y3, Y3
	VPXOR      32(SI), Y7, Y7
	VPXOR      64(SI), Y11, Y11
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y3, (DI)
	VMOVDQU    Y7, 32(DI)
	VMOVDQU    Y11, 64(DI)
	VMOVDQU    Y15, 96(DI)

	// State 2: same for bytes 128..255.
	VPERM2I128 $0x02, Y5, Y9, Y3
	VPERM2I128 $0x02, Y13, Y1, Y7
	VPERM2I128 $0x13, Y5, Y9, Y11
	VPERM2I128 $0x13, Y13, Y1, Y15
	VPXOR      128(SI), Y3, Y3
	VPXOR      160(SI), Y7, Y7
	VPXOR      192(SI), Y11, Y11
	VPXOR      224(SI), Y15, Y15
	VMOVDQU    Y3, 128(DI)
	VMOVDQU    Y7, 160(DI)
	VMOVDQU    Y11, 192(DI)
	VMOVDQU    Y15, 224(DI)

	// 256 bytes were just written: CX = 256, advance SI by 256, subtract 256
	// from BX. NOTE(review): presumably CX is the number of freshly written
	// bytes that sealAVX2SealHash (outside this chunk) still has to hash, and
	// BX is the remaining length; confirm.
	MOVQ       $0x00000100, CX
	LEAQ       256(SI), SI
	SUBQ       $0x00000100, BX

	// Leave the regrouped third state in Y0, Y14, Y12, Y4 for the code at
	// sealAVX2SealHash.
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	JMP        sealAVX2SealHash

sealAVX2Tail512:
	// Set up four ChaCha states: rows Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1,
	// Y6/Y10/Y8/Y2 and Y7/Y11/Y15/Y3. Rows 0..2 are the constants table,
	// 32(BP) and 64(BP), loaded once and copied.
	VMOVDQA ·chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15

	// Row 3: the row stored at 192(BP) plus avx2IncMask one to four times.
	VMOVDQA 192(BP), Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3

	// All 16 YMM registers are needed for the rounds, so the initial row 3
	// values are saved to memory at 96..192(BP); 192(BP) is overwritten with
	// the newest value.
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)

sealAVX2Tail512LoopA:
	// Hash one extra 16-byte block at (DI) into the Poly1305 accumulator
	// (R10, R11, R12), advance DI by 16, then fall through into LoopB.

	// Accumulator += 16 bytes at (DI), with 1 added into the top limb.
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12

	// Multiply by the two 64-bit words at (BP) and 8(BP); the product is
	// collected in R13, R14, R15, R8 (low to high).
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8

	// Reduce: keep the low 130 bits and fold the bits above them (c) back in
	// as 4*c + c.
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

sealAVX2Tail512LoopB:
	// One ChaCha double round on four states processed in lock-step
	// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3), interleaved
	// with two 16-byte Poly1305 block updates at (DI) and 16(DI).
	// Every YMM register holds state, so Y15 is spilled to 224(BP) whenever a
	// scratch register is needed for the shift-based rotates.
	// NOTE(review): CX and R9 are loop counters set before entry to
	// sealAVX2Tail512, outside this chunk.

	// Column round, first half: a += b; d ^= a; d rotated by 16; c += d; b ^= c.
	VPADDD     Y14, Y0, Y0
	VPADDD     Y9, Y5, Y5
	VPADDD     Y10, Y6, Y6
	VPADDD     Y11, Y7, Y7
	VPXOR      Y0, Y4, Y4
	VPXOR      Y5, Y1, Y1
	VPXOR      Y6, Y2, Y2
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
	VPADDD     Y4, Y12, Y12
	VPADDD     Y1, Y13, Y13
	VPADDD     Y2, Y8, Y8
	VPADDD     Y3, Y15, Y15
	VPXOR      Y12, Y14, Y14
	VPXOR      Y13, Y9, Y9
	VPXOR      Y8, Y10, Y10
	VPXOR      Y15, Y11, Y11

	// b rotated left by 12 (shift pair + XOR), using Y15 as scratch.
	VMOVDQA    Y15, 224(BP)
	VPSLLD     $0x0c, Y14, Y15
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPSLLD     $0x0c, Y9, Y15
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPSLLD     $0x0c, Y10, Y15
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPSLLD     $0x0c, Y11, Y15
	VPSRLD     $0x14, Y11, Y11
	VPXOR      Y15, Y11, Y11
	VMOVDQA    224(BP), Y15

	// Poly1305 block 1: accumulator (R10, R11, R12) += 16 bytes at (DI), with 1
	// added into the top limb.
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12

	// Multiply by the words at (BP) and 8(BP) using MULXQ (multiplicand in DX,
	// flags untouched); the product is collected in R13, R14, R15, R8.
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8

	// Reduce: keep the low 130 bits and fold the bits above them (c) back in
	// as 4*c + c.
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Column round, second half: a += b; d ^= a; d rotated by 8; c += d; b ^= c.
	VPADDD     Y14, Y0, Y0
	VPADDD     Y9, Y5, Y5
	VPADDD     Y10, Y6, Y6
	VPADDD     Y11, Y7, Y7
	VPXOR      Y0, Y4, Y4
	VPXOR      Y5, Y1, Y1
	VPXOR      Y6, Y2, Y2
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
	VPADDD     Y4, Y12, Y12
	VPADDD     Y1, Y13, Y13
	VPADDD     Y2, Y8, Y8
	VPADDD     Y3, Y15, Y15
	VPXOR      Y12, Y14, Y14
	VPXOR      Y13, Y9, Y9
	VPXOR      Y8, Y10, Y10
	VPXOR      Y15, Y11, Y11

	// b rotated left by 7, using Y15 as scratch.
	VMOVDQA    Y15, 224(BP)
	VPSLLD     $0x07, Y14, Y15
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPSLLD     $0x07, Y9, Y15
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPSLLD     $0x07, Y10, Y15
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPSLLD     $0x07, Y11, Y15
	VPSRLD     $0x19, Y11, Y11
	VPXOR      Y15, Y11, Y11
	VMOVDQA    224(BP), Y15

	// Rotate rows 1..3 of all four states within each 128-bit lane (4/8/12
	// bytes) so the next quarter rounds operate on the diagonals.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x04, Y10, Y10, Y10
	VPALIGNR   $0x04, Y11, Y11, Y11
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x08, Y15, Y15, Y15
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPALIGNR   $0x0c, Y2, Y2, Y2
	VPALIGNR   $0x0c, Y3, Y3, Y3

	// Diagonal round, first half (the rotate-by-12 step follows the second
	// Poly1305 block below).
	VPADDD     Y14, Y0, Y0
	VPADDD     Y9, Y5, Y5
	VPADDD     Y10, Y6, Y6
	VPADDD     Y11, Y7, Y7
	VPXOR      Y0, Y4, Y4
	VPXOR      Y5, Y1, Y1
	VPXOR      Y6, Y2, Y2
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
	VPADDD     Y4, Y12, Y12
	VPADDD     Y1, Y13, Y13
	VPADDD     Y2, Y8, Y8
	VPADDD     Y3, Y15, Y15
	VPXOR      Y12, Y14, Y14
	VPXOR      Y13, Y9, Y9
	VPXOR      Y8, Y10, Y10
	VPXOR      Y15, Y11, Y11

	// Poly1305 block 2: same add / multiply / reduce sequence for the 16 bytes
	// at 16(DI).
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12

	// Both blocks hashed: advance the hash pointer by 32 bytes.
	LEAQ       32(DI), DI

	// b rotated left by 12, using Y15 as scratch.
	VMOVDQA    Y15, 224(BP)
	VPSLLD     $0x0c, Y14, Y15
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPSLLD     $0x0c, Y9, Y15
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPSLLD     $0x0c, Y10, Y15
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPSLLD     $0x0c, Y11, Y15
	VPSRLD     $0x14, Y11, Y11
	VPXOR      Y15, Y11, Y11
	VMOVDQA    224(BP), Y15

	// Diagonal round, second half: rotations by 8 and 7.
	VPADDD     Y14, Y0, Y0
	VPADDD     Y9, Y5, Y5
	VPADDD     Y10, Y6, Y6
	VPADDD     Y11, Y7, Y7
	VPXOR      Y0, Y4, Y4
	VPXOR      Y5, Y1, Y1
	VPXOR      Y6, Y2, Y2
	VPXOR      Y7, Y3, Y3
	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
	VPADDD     Y4, Y12, Y12
	VPADDD     Y1, Y13, Y13
	VPADDD     Y2, Y8, Y8
	VPADDD     Y3, Y15, Y15
	VPXOR      Y12, Y14, Y14
	VPXOR      Y13, Y9, Y9
	VPXOR      Y8, Y10, Y10
	VPXOR      Y15, Y11, Y11
	VMOVDQA    Y15, 224(BP)
	VPSLLD     $0x07, Y14, Y15
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y15, Y14, Y14
	VPSLLD     $0x07, Y9, Y15
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y15, Y9, Y9
	VPSLLD     $0x07, Y10, Y15
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y15, Y10, Y10
	VPSLLD     $0x07, Y11, Y15
	VPSRLD     $0x19, Y11, Y11
	VPXOR      Y15, Y11, Y11
	VMOVDQA    224(BP), Y15

	// Undo the diagonal rotation of rows 1..3.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x0c, Y10, Y10, Y10
	VPALIGNR   $0x0c, Y11, Y11, Y11
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x08, Y15, Y15, Y15
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	VPALIGNR   $0x04, Y2, Y2, Y2
	VPALIGNR   $0x04, Y3, Y3, Y3

	// Loop control: iterate via LoopA (one extra hashed block) while CX stays
	// positive after the decrement, then via LoopB while R9 stays non-negative.
	DECQ       CX
	JG         sealAVX2Tail512LoopA
	DECQ       R9
	JGE        sealAVX2Tail512LoopB

	// Rounds done: add the initial state back in (constants, 32(BP), 64(BP) and
	// the row 3 values saved at 96..192(BP)).
	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     32(BP), Y11, Y11
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     64(BP), Y15, Y15
	VPADDD     96(BP), Y4, Y4
	VPADDD     128(BP), Y1, Y1
	VPADDD     160(BP), Y2, Y2
	VPADDD     192(BP), Y3, Y3

	// Park Y15 (row 2 of state 4) at 224(BP) so Y15 can serve as scratch while
	// state 1 is regrouped by 128-bit lane, XORed with bytes 0..127 at (SI) and
	// stored at (DI).
	VMOVDQA    Y15, 224(BP)
	VPERM2I128 $0x02, Y0, Y14, Y15
	VPXOR      (SI), Y15, Y15
	VMOVDQU    Y15, (DI)
	VPERM2I128 $0x02, Y12, Y4, Y15
	VPXOR      32(SI), Y15, Y15
	VMOVDQU    Y15, 32(DI)
	VPERM2I128 $0x13, Y0, Y14, Y15
	VPXOR      64(SI), Y15, Y15
	VMOVDQU    Y15, 64(DI)
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y15, 96(DI)

	// State 2: bytes 128..255.
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	VPXOR      128(SI), Y0, Y0
	VPXOR      160(SI), Y14, Y14
	VPXOR      192(SI), Y12, Y12
	VPXOR      224(SI), Y4, Y4
	VMOVDQU    Y0, 128(DI)
	VMOVDQU    Y14, 160(DI)
	VMOVDQU    Y12, 192(DI)
	VMOVDQU    Y4, 224(DI)

	// State 3: bytes 256..383.
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	VPXOR      256(SI), Y0, Y0
	VPXOR      288(SI), Y14, Y14
	VPXOR      320(SI), Y12, Y12
	VPXOR      352(SI), Y4, Y4
	VMOVDQU    Y0, 256(DI)
	VMOVDQU    Y14, 288(DI)
	VMOVDQU    Y12, 320(DI)
	VMOVDQU    Y4, 352(DI)

	// 384 bytes were just written: CX = 384, advance SI by 384, subtract 384
	// from BX. NOTE(review): presumably CX is the number of freshly written
	// bytes that sealAVX2SealHash (outside this chunk) still has to hash, and
	// BX is the remaining length; confirm.
	MOVQ       $0x00000180, CX
	LEAQ       384(SI), SI
	SUBQ       $0x00000180, BX

	// Leave the regrouped fourth state in Y0, Y14, Y12, Y4 for the code at
	// sealAVX2SealHash; its row 2 is read back from the 224(BP) spill slot.
	VPERM2I128 $0x02, Y7, Y11, Y0
	VPERM2I128 $0x02, 224(BP), Y3, Y14
	VPERM2I128 $0x13, Y7, Y11, Y12
	VPERM2I128 $0x13, 224(BP), Y3, Y4
	JMP        sealAVX2SealHash