// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf8 implements functions and constants to support text encoded in
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
// See https://en.wikipedia.org/wiki/UTF-8
package utf8// The conditions RuneError==unicode.ReplacementChar and// MaxRune==unicode.MaxRune are verified in the tests.// Defining them locally avoids this package depending on package unicode.// Numbers fundamental to the encoding.const (RuneError = '\uFFFD'// the "error" Rune or "Unicode replacement character"RuneSelf = 0x80// characters below RuneSelf are represented as themselves in a single byte.MaxRune = '\U0010FFFF'// Maximum valid Unicode code point.UTFMax = 4// maximum number of bytes of a UTF-8 encoded Unicode character.)// Code points in the surrogate range are not valid for UTF-8.const (surrogateMin = 0xD800surrogateMax = 0xDFFF)const (t1 = 0b00000000tx = 0b10000000t2 = 0b11000000t3 = 0b11100000t4 = 0b11110000t5 = 0b11111000maskx = 0b00111111mask2 = 0b00011111mask3 = 0b00001111mask4 = 0b00000111rune1Max = 1<<7 - 1rune2Max = 1<<11 - 1rune3Max = 1<<16 - 1// The default lowest and highest continuation byte.locb = 0b10000000hicb = 0b10111111// These names of these constants are chosen to give nice alignment in the // table below. The first nibble is an index into acceptRanges or F for // special one-byte cases. 
The second nibble is the Rune length or the // Status for the special one-byte case.xx = 0xF1// invalid: size 1as = 0xF0// ASCII: size 1s1 = 0x02// accept 0, size 2s2 = 0x13// accept 1, size 3s3 = 0x03// accept 0, size 3s4 = 0x23// accept 2, size 3s5 = 0x34// accept 3, size 4s6 = 0x04// accept 0, size 4s7 = 0x44// accept 4, size 4)// first is information about the first byte in a UTF-8 sequence.varfirst = [256]uint8{// 1 2 3 4 5 6 7 8 9 A B C D E Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6Fas, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F// 1 2 3 4 5 6 7 8 9 A B C D E Fxx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8Fxx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9Fxx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAFxx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBFxx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCFs1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDFs2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEFs5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF}// acceptRange gives the range of valid values for the second byte in a UTF-8// sequence.typeacceptRangestruct {louint8// lowest value for second byte.hiuint8// highest value for second byte.}// acceptRanges has size 16 to avoid bounds checks in the code that uses it.varacceptRanges = 
[16]acceptRange{0: {locb, hicb},1: {0xA0, hicb},2: {locb, 0x9F},3: {0x90, hicb},4: {locb, 0x8F},}// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.func ( []byte) bool { := len()if == 0 {returnfalse } := first[[0]]if >= int(&7) {returntrue// ASCII, invalid or valid. }// Must be short or invalid. := acceptRanges[>>4]if > 1 && ([1] < .lo || .hi < [1]) {returntrue } elseif > 2 && ([2] < locb || hicb < [2]) {returntrue }returnfalse}// FullRuneInString is like FullRune but its input is a string.func ( string) bool { := len()if == 0 {returnfalse } := first[[0]]if >= int(&7) {returntrue// ASCII, invalid, or valid. }// Must be short or invalid. := acceptRanges[>>4]if > 1 && ([1] < .lo || .hi < [1]) {returntrue } elseif > 2 && ([2] < locb || hicb < [2]) {returntrue }returnfalse}// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and// its width in bytes. If p is empty it returns (RuneError, 0). Otherwise, if// the encoding is invalid, it returns (RuneError, 1). Both are impossible// results for correct, non-empty UTF-8.//// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is// out of range, or is not the shortest possible UTF-8 encoding for the// value. No other validation is performed.func ( []byte) ( rune, int) { := len()if < 1 {returnRuneError, 0 } := [0] := first[]if >= as {// The following code simulates an additional check for x == xx and // handling the ASCII and invalid cases accordingly. This mask-and-or // approach prevents an additional branch. 
:= rune() << 31 >> 31// Create 0x0000 or 0xFFFF.returnrune([0])&^ | RuneError&, 1 } := int( & 7) := acceptRanges[>>4]if < {returnRuneError, 1 } := [1]if < .lo || .hi < {returnRuneError, 1 }if <= 2 { // <= instead of == to help the compiler eliminate some bounds checksreturnrune(&mask2)<<6 | rune(&maskx), 2 } := [2]if < locb || hicb < {returnRuneError, 1 }if <= 3 {returnrune(&mask3)<<12 | rune(&maskx)<<6 | rune(&maskx), 3 } := [3]if < locb || hicb < {returnRuneError, 1 }returnrune(&mask4)<<18 | rune(&maskx)<<12 | rune(&maskx)<<6 | rune(&maskx), 4}// DecodeRuneInString is like DecodeRune but its input is a string. If s is// empty it returns (RuneError, 0). Otherwise, if the encoding is invalid, it// returns (RuneError, 1). Both are impossible results for correct, non-empty// UTF-8.//// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is// out of range, or is not the shortest possible UTF-8 encoding for the// value. No other validation is performed.func ( string) ( rune, int) { := len()if < 1 {returnRuneError, 0 } := [0] := first[]if >= as {// The following code simulates an additional check for x == xx and // handling the ASCII and invalid cases accordingly. This mask-and-or // approach prevents an additional branch. := rune() << 31 >> 31// Create 0x0000 or 0xFFFF.returnrune([0])&^ | RuneError&, 1 } := int( & 7) := acceptRanges[>>4]if < {returnRuneError, 1 } := [1]if < .lo || .hi < {returnRuneError, 1 }if <= 2 { // <= instead of == to help the compiler eliminate some bounds checksreturnrune(&mask2)<<6 | rune(&maskx), 2 } := [2]if < locb || hicb < {returnRuneError, 1 }if <= 3 {returnrune(&mask3)<<12 | rune(&maskx)<<6 | rune(&maskx), 3 } := [3]if < locb || hicb < {returnRuneError, 1 }returnrune(&mask4)<<18 | rune(&maskx)<<12 | rune(&maskx)<<6 | rune(&maskx), 4}// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and// its width in bytes. If p is empty it returns (RuneError, 0). 
Otherwise, if// the encoding is invalid, it returns (RuneError, 1). Both are impossible// results for correct, non-empty UTF-8.//// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is// out of range, or is not the shortest possible UTF-8 encoding for the// value. No other validation is performed.func ( []byte) ( rune, int) { := len()if == 0 {returnRuneError, 0 } := - 1 = rune([])if < RuneSelf {return , 1 }// guard against O(n^2) behavior when traversing // backwards through strings with long sequences of // invalid UTF-8. := - UTFMaxif < 0 { = 0 }for --; >= ; -- {ifRuneStart([]) {break } }if < 0 { = 0 } , = DecodeRune([:])if + != {returnRuneError, 1 }return , }// DecodeLastRuneInString is like DecodeLastRune but its input is a string. If// s is empty it returns (RuneError, 0). Otherwise, if the encoding is invalid,// it returns (RuneError, 1). Both are impossible results for correct,// non-empty UTF-8.//// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is// out of range, or is not the shortest possible UTF-8 encoding for the// value. No other validation is performed.func ( string) ( rune, int) { := len()if == 0 {returnRuneError, 0 } := - 1 = rune([])if < RuneSelf {return , 1 }// guard against O(n^2) behavior when traversing // backwards through strings with long sequences of // invalid UTF-8. 
:= - UTFMaxif < 0 { = 0 }for --; >= ; -- {ifRuneStart([]) {break } }if < 0 { = 0 } , = DecodeRuneInString([:])if + != {returnRuneError, 1 }return , }// RuneLen returns the number of bytes required to encode the rune.// It returns -1 if the rune is not a valid value to encode in UTF-8.func ( rune) int {switch {case < 0:return -1case <= rune1Max:return1case <= rune2Max:return2casesurrogateMin <= && <= surrogateMax:return -1case <= rune3Max:return3case <= MaxRune:return4 }return -1}// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.// If the rune is out of range, it writes the encoding of RuneError.// It returns the number of bytes written.func ( []byte, rune) int {// Negative values are erroneous. Making it unsigned addresses the problem.switch := uint32(); {case <= rune1Max: [0] = byte()return1case <= rune2Max: _ = [1] // eliminate bounds checks [0] = t2 | byte(>>6) [1] = tx | byte()&maskxreturn2case > MaxRune, surrogateMin <= && <= surrogateMax: = RuneErrorfallthroughcase <= rune3Max: _ = [2] // eliminate bounds checks [0] = t3 | byte(>>12) [1] = tx | byte(>>6)&maskx [2] = tx | byte()&maskxreturn3default: _ = [3] // eliminate bounds checks [0] = t4 | byte(>>18) [1] = tx | byte(>>12)&maskx [2] = tx | byte(>>6)&maskx [3] = tx | byte()&maskxreturn4 }}// AppendRune appends the UTF-8 encoding of r to the end of p and// returns the extended buffer. If the rune is out of range,// it appends the encoding of RuneError.func ( []byte, rune) []byte {// This function is inlineable for fast handling of ASCII.ifuint32() <= rune1Max {returnappend(, byte()) }returnappendRuneNonASCII(, )}func ( []byte, rune) []byte {// Negative values are erroneous. 
Making it unsigned addresses the problem.switch := uint32(); {case <= rune2Max:returnappend(, t2|byte(>>6), tx|byte()&maskx)case > MaxRune, surrogateMin <= && <= surrogateMax: = RuneErrorfallthroughcase <= rune3Max:returnappend(, t3|byte(>>12), tx|byte(>>6)&maskx, tx|byte()&maskx)default:returnappend(, t4|byte(>>18), tx|byte(>>12)&maskx, tx|byte(>>6)&maskx, tx|byte()&maskx) }}// RuneCount returns the number of runes in p. Erroneous and short// encodings are treated as single runes of width 1 byte.func ( []byte) int { := len()varintfor := 0; < ; { ++ := []if < RuneSelf {// ASCII fast path ++continue } := first[]if == xx { ++ // invalid.continue } := int( & 7)if + > { ++ // Short or invalid.continue } := acceptRanges[>>4]if := [+1]; < .lo || .hi < { = 1 } elseif == 2 { } elseif := [+2]; < locb || hicb < { = 1 } elseif == 3 { } elseif := [+3]; < locb || hicb < { = 1 } += }return}// RuneCountInString is like RuneCount but its input is a string.func ( string) ( int) { := len()for := 0; < ; ++ { := []if < RuneSelf {// ASCII fast path ++continue } := first[]if == xx { ++ // invalid.continue } := int( & 7)if + > { ++ // Short or invalid.continue } := acceptRanges[>>4]if := [+1]; < .lo || .hi < { = 1 } elseif == 2 { } elseif := [+2]; < locb || hicb < { = 1 } elseif == 3 { } elseif := [+3]; < locb || hicb < { = 1 } += }return}// RuneStart reports whether the byte could be the first byte of an encoded,// possibly invalid rune. Second and subsequent bytes always have the top two// bits set to 10.func ( byte) bool { return &0xC0 != 0x80 }// Valid reports whether p consists entirely of valid UTF-8-encoded runes.func ( []byte) bool {// This optimization avoids the need to recompute the capacity // when generating code for p[8:], bringing it to parity with // ValidString, which was 20% faster on long ASCII strings. = [:len():len()]// Fast path. 
Check for and skip 8 bytes of ASCII characters per iteration.forlen() >= 8 {// Combining two 32 bit loads allows the same code to be used // for 32 and 64 bit platforms. // The compiler can generate a 32bit load for first32 and second32 // on many platforms. See test/codegen/memcombine.go. := uint32([0]) | uint32([1])<<8 | uint32([2])<<16 | uint32([3])<<24 := uint32([4]) | uint32([5])<<8 | uint32([6])<<16 | uint32([7])<<24if (|)&0x80808080 != 0 {// Found a non ASCII byte (>= RuneSelf).break } = [8:] } := len()for := 0; < ; { := []if < RuneSelf { ++continue } := first[]if == xx {returnfalse// Illegal starter byte. } := int( & 7)if + > {returnfalse// Short or invalid. } := acceptRanges[>>4]if := [+1]; < .lo || .hi < {returnfalse } elseif == 2 { } elseif := [+2]; < locb || hicb < {returnfalse } elseif == 3 { } elseif := [+3]; < locb || hicb < {returnfalse } += }returntrue}// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.func ( string) bool {// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.forlen() >= 8 {// Combining two 32 bit loads allows the same code to be used // for 32 and 64 bit platforms. // The compiler can generate a 32bit load for first32 and second32 // on many platforms. See test/codegen/memcombine.go. := uint32([0]) | uint32([1])<<8 | uint32([2])<<16 | uint32([3])<<24 := uint32([4]) | uint32([5])<<8 | uint32([6])<<16 | uint32([7])<<24if (|)&0x80808080 != 0 {// Found a non ASCII byte (>= RuneSelf).break } = [8:] } := len()for := 0; < ; { := []if < RuneSelf { ++continue } := first[]if == xx {returnfalse// Illegal starter byte. } := int( & 7)if + > {returnfalse// Short or invalid. 
} := acceptRanges[>>4]if := [+1]; < .lo || .hi < {returnfalse } elseif == 2 { } elseif := [+2]; < locb || hicb < {returnfalse } elseif == 3 { } elseif := [+3]; < locb || hicb < {returnfalse } += }returntrue}// ValidRune reports whether r can be legally encoded as UTF-8.// Code points that are out of range or a surrogate half are illegal.func ( rune) bool {switch {case0 <= && < surrogateMin:returntruecasesurrogateMax < && <= MaxRune:returntrue }returnfalse}
// The pages are generated with Golds v0.6.7. (GOOS=linux GOARCH=amd64)
// Golds is a Go 101 project developed by Tapir Liu.
// PR and bug reports are welcome and can be submitted to the issue list.
// Please follow @Go100and1 (reachable from the left QR code) to get the latest news of Golds.