Source File
crc32_amd64.go
Belonging Package
hash/crc32
// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a// description of the interface that each architecture-specific file// implements.package crc32import ()// This file contains the code to call the SSE 4.2 version of the Castagnoli// and IEEE CRC.// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE 4.2 CRC32// instruction.////go:noescapefunc ( uint32, []byte) uint32// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE 4.2 CRC32// instruction.////go:noescapefunc (, , uint32,, , []byte,uint32,) ( uint32, uint32, uint32)// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ// instruction as well as SSE 4.1.////go:noescapefunc ( uint32, []byte) uint32const castagnoliK1 = 168const castagnoliK2 = 1344type sse42Table [4]Tablevar castagnoliSSE42TableK1 *sse42Tablevar castagnoliSSE42TableK2 *sse42Tablefunc () bool {return cpu.X86.HasSSE42}func () {if !cpu.X86.HasSSE42 {panic("arch-specific Castagnoli not available")}castagnoliSSE42TableK1 = new(sse42Table)castagnoliSSE42TableK2 = new(sse42Table)// See description in updateCastagnoli.// t[0][i] = CRC(i000, O)// t[1][i] = CRC(0i00, O)// t[2][i] = CRC(00i0, O)// t[3][i] = CRC(000i, O)// where O is a sequence of K zeros.var [castagnoliK2]bytefor := 0; < 4; ++ {for := 0; < 256; ++ {:= uint32() << uint32(*8)castagnoliSSE42TableK1[][] = castagnoliSSE42(, [:castagnoliK1])castagnoliSSE42TableK2[][] = castagnoliSSE42(, [:])}}}// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the// table given) with the given initial crc value. This corresponds to// CRC(crc, O) in the description in updateCastagnoli.func ( *sse42Table, uint32) uint32 {return [3][>>24] ^[2][(>>16)&0xFF] ^[1][(>>8)&0xFF] ^[0][&0xFF]}func ( uint32, []byte) uint32 {if !cpu.X86.HasSSE42 {panic("not available")}// This method is inspired from the algorithm in Intel's white paper:// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"// The same strategy of splitting the buffer in three is used but the// combining calculation is different; the complete derivation is explained// below.//// -- The basic idea --//// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a// time. In recent Intel architectures the instruction takes 3 cycles;// however the processor can pipeline up to three instructions if they// don't depend on each other.//// Roughly this means that we can process three buffers in about the same// time we can process one buffer.//// The idea is then to split the buffer in three, CRC the three pieces// separately and then combine the results.//// Combining the results requires precomputed tables, so we must choose a// fixed buffer length to optimize. The longer the length, the faster; but// only buffers longer than this length will use the optimization. We choose// two cutoffs and compute tables for both:// - one around 512: 168*3=504// - one around 4KB: 1344*3=4032//// -- The nitty gritty --//// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with// initial non-inverted CRC I). This function has the following properties:// (a) CRC(I, AB) = CRC(CRC(I, A), B)// (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)//// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of// K bytes each, where K is a fixed constant. Let O be the sequence of K zero// bytes.//// CRC(I, ABC) = CRC(I, ABO xor C)// = CRC(I, ABO) xor CRC(0, C)// = CRC(CRC(I, AB), O) xor CRC(0, C)// = CRC(CRC(I, AO xor B), O) xor CRC(0, C)// = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)// = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)//// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),// and CRC(0, C) efficiently. We just need to find a way to quickly compute// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these// values; since we can't have a 32-bit table, we break it up into four// 8-bit tables://// CRC(uvwx, O) = CRC(u000, O) xor// CRC(0v00, O) xor// CRC(00w0, O) xor// CRC(000x, O)//// We can compute tables corresponding to the four terms for all 8-bit// values.= ^// If a buffer is long enough to use the optimization, process the first few// bytes to align the buffer to an 8 byte boundary (if necessary).if len() >= castagnoliK1*3 {:= int(uintptr(unsafe.Pointer(&[0])) & 7)if != 0 {= 8 -= castagnoliSSE42(, [:])= [:]}}// Process 3*K2 at a time.for len() >= castagnoliK2*3 {// Compute CRC(I, A), CRC(0, B), and CRC(0, C)., , := castagnoliSSE42Triple(, 0, 0,, [castagnoliK2:], [castagnoliK2*2:],castagnoliK2/24)// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B):= castagnoliShift(castagnoliSSE42TableK2, ) ^// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)= castagnoliShift(castagnoliSSE42TableK2, ) ^= [castagnoliK2*3:]}// Process 3*K1 at a time.for len() >= castagnoliK1*3 {// Compute CRC(I, A), CRC(0, B), and CRC(0, C)., , := castagnoliSSE42Triple(, 0, 0,, [castagnoliK1:], [castagnoliK1*2:],castagnoliK1/24)// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B):= castagnoliShift(castagnoliSSE42TableK1, ) ^// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)= castagnoliShift(castagnoliSSE42TableK1, ) ^= [castagnoliK1*3:]}// Use the simple implementation for what's left.= castagnoliSSE42(, )return ^}func () bool {return cpu.X86.HasPCLMULQDQ && cpu.X86.HasSSE41}var archIeeeTable8 *slicing8Tablefunc () {if !cpu.X86.HasPCLMULQDQ || !cpu.X86.HasSSE41 {panic("not available")}// We still use slicing-by-8 for small buffers.archIeeeTable8 = slicingMakeTable(IEEE)}func ( uint32, []byte) uint32 {if !cpu.X86.HasPCLMULQDQ || !cpu.X86.HasSSE41 {panic("not available")}if len() >= 64 {:= len() & 15:= len() -= ^ieeeCLMUL(^, [:])= [:]}if len() == 0 {return}return slicingUpdate(, archIeeeTable8, )}
![]() |
The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds. |