// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"
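
// The routines below implement vectorized base64 decoding in the style of
// Muła and Lemire ("Faster Base64 Encoding and Decoding Using AVX2
// Instructions"). Each loop iteration loads 32 input characters, validates
// them against nibble-indexed character-class tables, translates ASCII to
// 6-bit values by adding per-class offsets, and packs the result into 24
// output bytes using two multiply-add steps and a byte shuffle.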

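// High-nibble character classes for validation, duplicated across both
// 128-bit lanes. Byte i holds the class bits of characters whose high
// nibble is i; a byte is invalid when its high- and low-nibble classes
// share a set bit (checked with VPTEST in the loop below).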
DATA b64_dec_lut_hi<>+0(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+8(SB)/8, $0x1010101010101010
DATA b64_dec_lut_hi<>+16(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+24(SB)/8, $0x1010101010101010
GLOBL b64_dec_lut_hi<>(SB), RODATA|NOPTR, $32

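// VPMADDUBSW multiplier: each byte pair (a, b) of decoded 6-bit values
// becomes the 12-bit word a*0x40 + b, merging two characters at a time.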
DATA b64_dec_madd1<>+0(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+8(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+16(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+24(SB)/8, $0x0140014001400140
GLOBL b64_dec_madd1<>(SB), RODATA|NOPTR, $32

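// VPMADDWD multiplier: adjacent 12-bit words (w0, w1) become the 24-bit
// dword w0*0x1000 + w1, so each dword holds one decoded 3-byte group.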
DATA b64_dec_madd2<>+0(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+8(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+16(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+24(SB)/8, $0x0001100000011000
GLOBL b64_dec_madd2<>(SB), RODATA|NOPTR, $32

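// Shuffle applied to the upper lane after VEXTRACTI128: it moves that
// lane's leading decoded bytes into dword 3, where VPBLENDD splices them
// onto the packed lower lane.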
DATA b64_dec_shuf_lo<>+0(SB)/8, $0x0000000000000000
DATA b64_dec_shuf_lo<>+8(SB)/8, $0x0600010200000000
GLOBL b64_dec_shuf_lo<>(SB), RODATA|NOPTR, $16

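// Per-lane shuffle that gathers the 3 payload bytes of each dword into
// contiguous output (the dwords are little-endian, hence the 2,1,0 order).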
DATA b64_dec_shuf<>+0(SB)/8, $0x090a040506000102
DATA b64_dec_shuf<>+8(SB)/8, $0x000000000c0d0e08
DATA b64_dec_shuf<>+16(SB)/8, $0x0c0d0e08090a0405
DATA b64_dec_shuf<>+24(SB)/8, $0x0000000000000000
GLOBL b64_dec_shuf<>(SB), RODATA|NOPTR, $32

// func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
TEXT ·decodeAVX2(SB), NOSPLIT, $0-72
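	// AX = &dst[0], DX = &src[0], SI = lut, DI = len(src).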
	MOVQ         dst_base+0(FP), AX
	MOVQ         src_base+24(FP), DX
	MOVQ         lut+48(FP), SI
	MOVQ         src_len+32(FP), DI
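	// Y8 = 0x2f ('/') broadcast. It doubles as the '/' comparand and as
	// the nibble mask for shuffle indices: VPSHUFB only reads bits 0-3
	// and bit 7 of each index byte, so masking with 0x2f acts like 0x0f.
	// CX counts dst bytes written, BX counts src bytes consumed.
	// Y6 = offset ("roll") table from the 16 bytes at lut, Y4 = low-nibble
	// class table from the 16 bytes at lut+16, each copied to both lanes
	// by VPERMQ $0x44; Y5 = high-nibble class table, Y7 = zero for the
	// final blend.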
	MOVB         $0x2f, CL
	PINSRB       $0x00, CX, X8
	VPBROADCASTB X8, Y8
	XORQ         CX, CX
	XORQ         BX, BX
	VPXOR        Y7, Y7, Y7
	VPERMQ       $0x44, (SI), Y6
	VPERMQ       $0x44, 16(SI), Y4
	VMOVDQA      b64_dec_lut_hi<>+0(SB), Y5

loop:
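	// Load 32 base64 characters.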
	VMOVDQU      (DX)(BX*1), Y0
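	// Validate: look up class bits by low nibble (Y3, via the table in Y4)
	// and by high nibble (Y9, via Y5). Any byte whose two classes
	// intersect is not a valid base64 character, so VPTEST clears ZF and
	// the loop exits with the counts accumulated so far.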
	VPSRLD       $0x04, Y0, Y2
	VPAND        Y8, Y0, Y3
	VPSHUFB      Y3, Y4, Y3
	VPAND        Y8, Y2, Y2
	VPSHUFB      Y2, Y5, Y9
	VPTEST       Y9, Y3
	JNE          done
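	// Translate ASCII to 6-bit values: compare against '/' and add the
	// resulting all-ones mask (-1) to the high-nibble index so that '/'
	// selects its own entry in the roll table in Y6, then add the
	// looked-up per-class offset to each input byte.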
	VPCMPEQB     Y8, Y0, Y3
	VPADDB       Y3, Y2, Y2
	VPSHUFB      Y2, Y6, Y2
	VPADDB       Y0, Y2, Y0
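	// Pack: each pair of 6-bit values becomes a 12-bit word
	// (first*0x40 + second), then each pair of words becomes a 24-bit
	// dword (first*0x1000 + second).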
	VPMADDUBSW   b64_dec_madd1<>+0(SB), Y0, Y0
	VPMADDWD     b64_dec_madd2<>+0(SB), Y0, Y0
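	// Gather the 3 payload bytes of every dword into contiguous output.
	// The upper lane's leading triple crosses the lane boundary, so it is
	// extracted, repositioned by b64_dec_shuf_lo, and blended into dword
	// 3; the top 8 bytes are zeroed via Y7.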
	VEXTRACTI128 $0x01, Y0, X1
	VPSHUFB      b64_dec_shuf_lo<>+0(SB), X1, X1
	VPSHUFB      b64_dec_shuf<>+0(SB), Y0, Y0
	VPBLENDD     $0x08, Y1, Y0, Y1
	VPBLENDD     $0xc0, Y7, Y1, Y1
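	// Store 32 bytes, of which only the low 24 are decoded output; the
	// next iteration (or the caller's tail handling) overwrites the rest.
	// Advance dst by 24 and src by 32, and keep looping while at least
	// 0x2d (45) source bytes remain.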
	VMOVDQU      Y1, (AX)(CX*1)
	ADDQ         $0x18, CX
	ADDQ         $0x20, BX
	SUBQ         $0x20, DI
	CMPQ         DI, $0x2d
	JB           done
	JMP          loop

done:
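	// Return the number of dst bytes written and src bytes consumed.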
	MOVQ CX, ret+56(FP)
	MOVQ BX, ret1+64(FP)
	VZEROUPPER
	RET

// func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
TEXT ·decodeAVX2URI(SB), NOSPLIT, $0-72
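	// Same pipeline as decodeAVX2, with one extra step: '_' (0x5f) is
	// rewritten to '/' (0x2f) right after each load, so the rest of the
	// tables can treat the input uniformly. Y0 holds the '/' replacement
	// and Y1 the '_' comparand; Y10 repeats 0x2f as the nibble mask. The
	// caller supplies a lookup table suited to the URI-safe alphabet
	// through lut.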
	MOVB         $0x2f, AL
	PINSRB       $0x00, AX, X0
	VPBROADCASTB X0, Y0
	MOVB         $0x5f, AL
	PINSRB       $0x00, AX, X1
	VPBROADCASTB X1, Y1
	MOVQ         dst_base+0(FP), AX
	MOVQ         src_base+24(FP), DX
	MOVQ         lut+48(FP), SI
	MOVQ         src_len+32(FP), DI
	MOVB         $0x2f, CL
	PINSRB       $0x00, CX, X10
	VPBROADCASTB X10, Y10
	XORQ         CX, CX
	XORQ         BX, BX
	VPXOR        Y9, Y9, Y9
	VPERMQ       $0x44, (SI), Y8
	VPERMQ       $0x44, 16(SI), Y6
	VMOVDQA      b64_dec_lut_hi<>+0(SB), Y7

loop:
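	// Load 32 characters and replace '_' with '/' wherever the comparison
	// mask is set; from here on the loop matches decodeAVX2 above.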
	VMOVDQU      (DX)(BX*1), Y2
	VPCMPEQB     Y2, Y1, Y4
	VPBLENDVB    Y4, Y0, Y2, Y2
	VPSRLD       $0x04, Y2, Y4
	VPAND        Y10, Y2, Y5
	VPSHUFB      Y5, Y6, Y5
	VPAND        Y10, Y4, Y4
	VPSHUFB      Y4, Y7, Y11
	VPTEST       Y11, Y5
	JNE          done
	VPCMPEQB     Y10, Y2, Y5
	VPADDB       Y5, Y4, Y4
	VPSHUFB      Y4, Y8, Y4
	VPADDB       Y2, Y4, Y2
	VPMADDUBSW   b64_dec_madd1<>+0(SB), Y2, Y2
	VPMADDWD     b64_dec_madd2<>+0(SB), Y2, Y2
	VEXTRACTI128 $0x01, Y2, X3
	VPSHUFB      b64_dec_shuf_lo<>+0(SB), X3, X3
	VPSHUFB      b64_dec_shuf<>+0(SB), Y2, Y2
	VPBLENDD     $0x08, Y3, Y2, Y3
	VPBLENDD     $0xc0, Y9, Y3, Y3
	VMOVDQU      Y3, (AX)(CX*1)
	ADDQ         $0x18, CX
	ADDQ         $0x20, BX
	SUBQ         $0x20, DI
	CMPQ         DI, $0x2d
	JB           done
	JMP          loop

done:
	MOVQ CX, ret+56(FP)
	MOVQ BX, ret1+64(FP)
	VZEROUPPER
	RET