// Code generated by command: go run encode_asm.go -pkg base64 -out ../base64/encode_amd64.s -stubs ../base64/encode_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func encodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
//
// Vectorized base64 encoding kernel: every loop iteration reads 24 bytes of
// src and stores 32 bytes to dst. The first result is the number of bytes
// stored to dst, the second the number of bytes consumed from src; whatever
// remains in src is left untouched.
//
// Frame layout ($0-72: no locals, 72 bytes of arguments and results):
//   dst slice at 0(FP), src slice at 24(FP), lut at 48(FP),
//   results at 56(FP) and 64(FP).
//
// Register roles:
//   AX  dst base pointer
//   DX  src base pointer
//   SI  lut pointer
//   DI  src bytes remaining (starts at len(src), -24 per iteration)
//   CX  scratch for constants first, then the dst byte offset (+32 per iteration)
//   BX  src byte offset (+24 per iteration)
//   Y3  low 16 bytes of lut, replicated in both 128-bit lanes
//   Y4  0x33 (51) in every byte
//   Y5  0x19 (25) in every byte
//
// NOTE(review): the body runs at least once with no length check. The first
// iteration reads src[0:28] and stores dst[0:32], and if len(src) < 24 the
// SUBQ on DI wraps so the unsigned exit test does not fire. Callers presumably
// guarantee a minimum src length and enough dst capacity - confirm in the Go
// wrapper.
TEXT ·encodeAVX2(SB), NOSPLIT, $0-72
	MOVQ         dst_base+0(FP), AX
	MOVQ         src_base+24(FP), DX
	MOVQ         lut+48(FP), SI
	MOVQ         src_len+32(FP), DI
	// Build the two byte constants: place the value in byte 0 of an XMM
	// register through CL, then broadcast that byte to all 32 lanes.
	MOVB         $0x33, CL
	PINSRB       $0x00, CX, X4
	VPBROADCASTB X4, Y4
	MOVB         $0x19, CL
	PINSRB       $0x00, CX, X5
	VPBROADCASTB X5, Y5
	// CX is reused from here on as the dst offset; both offsets start at 0.
	XORQ         CX, CX
	XORQ         BX, BX

	// Load the 16-byte LUT into both lanes of the register ($0x44 selects
	// quadwords 0,1,0,1).
	// NOTE(review): the memory operand of VPERMQ is 256 bits wide, so 32 bytes
	// are read at (SI) even though only the low 16 are kept - confirm the
	// table behind lut is at least 32 bytes long.
	VPERMQ $0x44, (SI), Y3

	// Load the first block using a mask to avoid potential fault: the address
	// is src-4, and the mask disables dword 0, so the 4 bytes before src are
	// not read and that dword of Y0 is zeroed. Dwords 1-7 receive src[0:28].
	VMOVDQU    b64_enc_load<>+0(SB), Y0
	VPMASKMOVD -4(DX)(BX*1), Y0, Y0

loop:
	// Here Y0 holds the 32 bytes at src+BX-4; the 24 bytes to encode sit at
	// vector bytes 4..27 (12 per 128-bit lane after the shuffle).
	// Expand each 3-byte group (b0,b1,b2) into the 4 bytes b1,b0,b2,b1.
	VPSHUFB  b64_enc_shuf<>+0(SB), Y0, Y0
	// Second and fourth 6-bit fields of each dword: isolate them, then shift
	// even words left by 4 and odd words left by 8 (the blend takes odd words
	// from the <<8 copy), leaving them in bytes 1 and 3.
	VPAND    b64_enc_mask1<>+0(SB), Y0, Y1
	VPSLLW   $0x08, Y1, Y2
	VPSLLW   $0x04, Y1, Y1
	VPBLENDW $0xaa, Y2, Y1, Y2
	// First and third 6-bit fields: isolate them, then take the high half of
	// an unsigned 16-bit multiply (x0x0040 == >>10 on even words, x0x0400 ==
	// >>6 on odd words), leaving them in bytes 0 and 2.
	VPAND    b64_enc_mask2<>+0(SB), Y0, Y1
	VPMULHUW b64_enc_mult<>+0(SB), Y1, Y0
	// Y0 = one 6-bit value (0..63) per byte, in output order.
	VPOR     Y0, Y2, Y0
	// Reduce each value to a LUT index: saturating (v - 51), plus 1 when
	// v > 25 (subtracting the 0xFF compare mask adds 1). That gives 0 for
	// 0..25, 1 for 26..51 and 2..13 for 52..63.
	VPSUBUSB Y4, Y0, Y1
	VPCMPGTB Y5, Y0, Y2
	VPSUBB   Y2, Y1, Y1
	// Fetch the LUT byte for each index and add it to the 6-bit value. The
	// LUT contents are supplied by the caller (presumably per-alphabet
	// offsets to the output characters - not visible here).
	VPSHUFB  Y1, Y3, Y1
	VPADDB   Y0, Y1, Y0
	// Store 32 output bytes and advance: +32 dst, +24 src, -24 remaining.
	VMOVDQU  Y0, (AX)(CX*1)
	ADDQ     $0x20, CX
	ADDQ     $0x18, BX
	SUBQ     $0x18, DI
	// Stop (unsigned compare) once fewer than 32 source bytes remain; the
	// next load below reads src[BX-4 : BX+28].
	CMPQ     DI, $0x20
	JB       done
	VMOVDQU  -4(DX)(BX*1), Y0
	JMP      loop

done:
	// Results: bytes stored to dst, then bytes consumed from src.
	MOVQ CX, ret+56(FP)
	MOVQ BX, ret1+64(FP)
	// Clear the upper YMM halves before returning to non-AVX code.
	VZEROUPPER
	RET

// b64_enc_load is the dword mask for the initial VPMASKMOVD load in
// encodeAVX2. The sign bit is clear in dword 0 and set in dwords 1-7, so the
// load skips the first 4 bytes at its address and reads the following 28.
DATA b64_enc_load<>+0(SB)/8, $0x8000000000000000
DATA b64_enc_load<>+8(SB)/8, $0x8000000080000000
DATA b64_enc_load<>+16(SB)/8, $0x8000000080000000
DATA b64_enc_load<>+24(SB)/8, $0x8000000080000000
GLOBL b64_enc_load<>(SB), RODATA|NOPTR, $32

// b64_enc_shuf is the VPSHUFB control vector used at the top of the encode
// loop. Within each 128-bit lane it expands four 3-byte groups (b0,b1,b2)
// into four dwords laid out as b1,b0,b2,b1.
//   low lane  (first 16 bytes): source indices 5,4,6,5, 8,7,9,8, ...  -> lane bytes 4..15
//   high lane (last 16 bytes):  source indices 1,0,2,1, 4,3,5,4, ...  -> lane bytes 0..11
DATA b64_enc_shuf<>+0(SB)/8, $0x0809070805060405
DATA b64_enc_shuf<>+8(SB)/8, $0x0e0f0d0e0b0c0a0b
DATA b64_enc_shuf<>+16(SB)/8, $0x0405030401020001
DATA b64_enc_shuf<>+24(SB)/8, $0x0a0b090a07080607
GLOBL b64_enc_shuf<>(SB), RODATA|NOPTR, $32

// b64_enc_mask1 is 0x003f03f0 repeated in every dword: bits 4-9 of each even
// 16-bit word and bits 0-5 of each odd word. The encode loop ANDs it with the
// shuffled input, then shifts the result left (by 4 and 8 respectively).
DATA b64_enc_mask1<>+0(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+8(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+16(SB)/8, $0x003f03f0003f03f0
DATA b64_enc_mask1<>+24(SB)/8, $0x003f03f0003f03f0
GLOBL b64_enc_mask1<>(SB), RODATA|NOPTR, $32

// b64_enc_mask2 is 0x0fc0fc00 repeated in every dword: bits 10-15 of each
// even 16-bit word and bits 6-11 of each odd word. The encode loop ANDs it
// with the shuffled input before the VPMULHUW by b64_enc_mult.
DATA b64_enc_mask2<>+0(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+8(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+16(SB)/8, $0x0fc0fc000fc0fc00
DATA b64_enc_mask2<>+24(SB)/8, $0x0fc0fc000fc0fc00
GLOBL b64_enc_mask2<>(SB), RODATA|NOPTR, $32

// b64_enc_mult is 0x04000040 repeated in every dword: the 16-bit multipliers
// 0x0040 (even words) and 0x0400 (odd words). With VPMULHUW, which keeps the
// high 16 bits of the unsigned product, these act as right shifts by 10 and
// by 6 respectively.
DATA b64_enc_mult<>+0(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+8(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+16(SB)/8, $0x0400004004000040
DATA b64_enc_mult<>+24(SB)/8, $0x0400004004000040
GLOBL b64_enc_mult<>(SB), RODATA|NOPTR, $32