synchronization

This commit is contained in:
2025-08-25 16:04:00 +08:00
commit 33f9b3ce46
1951 changed files with 854396 additions and 0 deletions

View File

@@ -0,0 +1,371 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab
import (
"fmt"
"unicode"
)
// Level identifies the collation comparison level.
// The primary level corresponds to the basic sorting of text.
// The secondary level corresponds to accents and related linguistic elements.
// The tertiary level corresponds to casing and related concepts.
// The quaternary level is derived from the other levels by the
// various algorithms for handling variable elements.
type Level int
const (
Primary Level = iota
Secondary
Tertiary
Quaternary
Identity
NumLevels
)
const (
defaultSecondary = 0x20
defaultTertiary = 0x2
maxTertiary = 0x1F
MaxQuaternary = 0x1FFFFF // 21 bits.
)
// Elem is a representation of a collation element. This API provides ways to encode
// and decode Elems. Implementations of collation tables may use values greater
// or equal to PrivateUse for their own purposes. However, these should never be
// returned by AppendNext.
type Elem uint32
const (
maxCE Elem = 0xAFFFFFFF
PrivateUse = minContract
minContract = 0xC0000000
maxContract = 0xDFFFFFFF
minExpand = 0xE0000000
maxExpand = 0xEFFFFFFF
minDecomp = 0xF0000000
)
type ceType int
const (
ceNormal ceType = iota // ceNormal includes implicits (ce == 0)
ceContractionIndex // rune can be a start of a contraction
ceExpansionIndex // rune expands into a sequence of collation elements
ceDecompose // rune expands using NFKC decomposition
)
func (ce Elem) ctype() ceType {
if ce <= maxCE {
return ceNormal
}
if ce <= maxContract {
return ceContractionIndex
} else {
if ce <= maxExpand {
return ceExpansionIndex
}
return ceDecompose
}
panic("should not reach here")
return ceType(-1)
}
// For normal collation elements, we assume that a collation element either has
// a primary or non-default secondary value, not both.
// Collation elements with a primary value are of the form
// 01pppppp pppppppp ppppppp0 ssssssss
// - p* is primary collation value
// - s* is the secondary collation value
// 00pppppp pppppppp ppppppps sssttttt, where
// - p* is primary collation value
// - s* offset of secondary from default value.
// - t* is the tertiary collation value
// 100ttttt cccccccc pppppppp pppppppp
// - t* is the tertiar collation value
// - c* is the canonical combining class
// - p* is the primary collation value
// Collation elements with a secondary value are of the form
// 1010cccc ccccssss ssssssss tttttttt, where
// - c* is the canonical combining class
// - s* is the secondary collation value
// - t* is the tertiary collation value
// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
// - q* quaternary value
const (
ceTypeMask = 0xC0000000
ceTypeMaskExt = 0xE0000000
ceIgnoreMask = 0xF00FFFFF
ceType1 = 0x40000000
ceType2 = 0x00000000
ceType3or4 = 0x80000000
ceType4 = 0xA0000000
ceTypeQ = 0xC0000000
Ignore = ceType4
firstNonPrimary = 0x80000000
lastSpecialPrimary = 0xA0000000
secondaryMask = 0x80000000
hasTertiaryMask = 0x40000000
primaryValueMask = 0x3FFFFE00
maxPrimaryBits = 21
compactPrimaryBits = 16
maxSecondaryBits = 12
maxTertiaryBits = 8
maxCCCBits = 8
maxSecondaryCompactBits = 8
maxSecondaryDiffBits = 4
maxTertiaryCompactBits = 5
primaryShift = 9
compactSecondaryShift = 5
minCompactSecondary = defaultSecondary - 4
)
func makeImplicitCE(primary int) Elem {
return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
}
// MakeElem returns an Elem for the given values. It will return an error
// if the given combination of values is invalid.
func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
}
if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
}
if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
}
ce := Elem(0)
if primary != 0 {
if ccc != 0 {
if primary >= 1<<compactPrimaryBits {
return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
}
if secondary != defaultSecondary {
return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
}
ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
ce |= Elem(ccc) << compactPrimaryBits
ce |= Elem(primary)
ce |= ceType3or4
} else if tertiary == defaultTertiary {
if secondary >= 1<<maxSecondaryCompactBits {
return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
}
ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
ce |= ceType1
} else {
d := secondary - defaultSecondary + maxSecondaryDiffBits
if d >= 1<<maxSecondaryDiffBits || d < 0 {
return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
}
if tertiary >= 1<<maxTertiaryCompactBits {
return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
}
ce = Elem(primary<<maxSecondaryDiffBits + d)
ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
}
} else {
ce = Elem(secondary<<maxTertiaryBits + tertiary)
ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
ce |= ceType4
}
return ce, nil
}
// MakeQuaternary returns an Elem with the given quaternary value.
func MakeQuaternary(v int) Elem {
return ceTypeQ | Elem(v<<primaryShift)
}
// Mask sets weights for any level smaller than l to 0.
// The resulting Elem can be used to test for equality with
// other Elems to which the same mask has been applied.
func (ce Elem) Mask(l Level) uint32 {
return 0
}
// CCC returns the canonical combining class associated with the underlying character,
// if applicable, or 0 otherwise.
func (ce Elem) CCC() uint8 {
if ce&ceType3or4 != 0 {
if ce&ceType4 == ceType3or4 {
return uint8(ce >> 16)
}
return uint8(ce >> 20)
}
return 0
}
// Primary returns the primary collation weight for ce.
func (ce Elem) Primary() int {
if ce >= firstNonPrimary {
if ce > lastSpecialPrimary {
return 0
}
return int(uint16(ce))
}
return int(ce&primaryValueMask) >> primaryShift
}
// Secondary returns the secondary collation weight for ce.
func (ce Elem) Secondary() int {
switch ce & ceTypeMask {
case ceType1:
return int(uint8(ce))
case ceType2:
return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
case ceType3or4:
if ce < ceType4 {
return defaultSecondary
}
return int(ce>>8) & 0xFFF
case ceTypeQ:
return 0
}
panic("should not reach here")
}
// Tertiary returns the tertiary collation weight for ce.
func (ce Elem) Tertiary() uint8 {
if ce&hasTertiaryMask == 0 {
if ce&ceType3or4 == 0 {
return uint8(ce & 0x1F)
}
if ce&ceType4 == ceType4 {
return uint8(ce)
}
return uint8(ce>>24) & 0x1F // type 2
} else if ce&ceTypeMask == ceType1 {
return defaultTertiary
}
// ce is a quaternary value.
return 0
}
func (ce Elem) updateTertiary(t uint8) Elem {
if ce&ceTypeMask == ceType1 {
// convert to type 4
nce := ce & primaryValueMask
nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
ce = nce
} else if ce&ceTypeMaskExt == ceType3or4 {
ce &= ^Elem(maxTertiary << 24)
return ce | (Elem(t) << 24)
} else {
// type 2 or 4
ce &= ^Elem(maxTertiary)
}
return ce | Elem(t)
}
// Quaternary returns the quaternary value if explicitly specified,
// 0 if ce == Ignore, or MaxQuaternary otherwise.
// Quaternary values are used only for shifted variants.
func (ce Elem) Quaternary() int {
if ce&ceTypeMask == ceTypeQ {
return int(ce&primaryValueMask) >> primaryShift
} else if ce&ceIgnoreMask == Ignore {
return 0
}
return MaxQuaternary
}
// Weight returns the collation weight for the given level.
func (ce Elem) Weight(l Level) int {
switch l {
case Primary:
return ce.Primary()
case Secondary:
return ce.Secondary()
case Tertiary:
return int(ce.Tertiary())
case Quaternary:
return ce.Quaternary()
}
return 0 // return 0 (ignore) for undefined levels.
}
// For contractions, collation elements are of the form
// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
// - n* is the size of the first node in the contraction trie.
// - i* is the index of the first node in the contraction trie.
// - b* is the offset into the contraction collation element table.
// See contract.go for details on the contraction trie.
const (
maxNBits = 4
maxTrieIndexBits = 12
maxContractOffsetBits = 13
)
func splitContractIndex(ce Elem) (index, n, offset int) {
n = int(ce & (1<<maxNBits - 1))
ce >>= maxNBits
index = int(ce & (1<<maxTrieIndexBits - 1))
ce >>= maxTrieIndexBits
offset = int(ce & (1<<maxContractOffsetBits - 1))
return
}
// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
// where b* is the index into the expansion sequence table.
const maxExpandIndexBits = 16
func splitExpandIndex(ce Elem) (index int) {
return int(uint16(ce))
}
// Some runes can be expanded using NFKD decomposition. Instead of storing the full
// sequence of collation elements, we decompose the rune and lookup the collation
// elements for each rune in the decomposition and modify the tertiary weights.
// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
// - v* is the replacement tertiary weight for the first rune,
// - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
func splitDecompose(ce Elem) (t1, t2 uint8) {
return uint8(ce), uint8(ce >> 8)
}
const (
// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
minUnified rune = 0x4E00
maxUnified = 0x9FFF
minCompatibility = 0xF900
maxCompatibility = 0xFAFF
minRare = 0x3400
maxRare = 0x4DBF
)
const (
commonUnifiedOffset = 0x10000
rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
otherOffset = 0x50000 // largest rune in rare is U+2FA1D
illegalOffset = otherOffset + int(unicode.MaxRune)
maxPrimary = illegalOffset + 1
)
// implicitPrimary returns the primary weight for the a rune
// for which there is no entry for the rune in the collation table.
// We take a different approach from the one specified in
// http://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
if unicode.Is(unicode.Ideographic, r) {
if r >= minUnified && r <= maxUnified {
// The most common case for CJK.
return int(r) + commonUnifiedOffset
}
if r >= minCompatibility && r <= maxCompatibility {
// This will typically not hit. The DUCET explicitly specifies mappings
// for all characters that do not decompose.
return int(r) + commonUnifiedOffset
}
return int(r) + rareUnifiedOffset
}
return int(r) + otherOffset
}

View File

@@ -0,0 +1,105 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package colltab contains functionality related to collation tables.
// It is only to be used by the collate and search packages.
package colltab // import "golang.org/x/text/internal/colltab"
import (
"sort"
"golang.org/x/text/language"
)
// MatchLang finds the index of t in tags, using a matching algorithm used for
// collation and search. tags[0] must be language.Und, the remaining tags should
// be sorted alphabetically.
//
// Language matching for collation and search is different from the matching
// defined by language.Matcher: the (inferred) base language must be an exact
// match for the relevant fields. For example, "gsw" should not match "de".
// Also the parent relation is different, as a parent may have a different
// script. So usually the parent of zh-Hant is und, whereas for MatchLang it is
// zh.
func MatchLang(t language.Tag, tags []language.Tag) int {
// Canonicalize the values, including collapsing macro languages.
t, _ = language.All.Canonicalize(t)
base, conf := t.Base()
// Estimate the base language, but only use high-confidence values.
if conf < language.High {
// The root locale supports "search" and "standard". We assume that any
// implementation will only use one of both.
return 0
}
// Maximize base and script and normalize the tag.
if _, s, r := t.Raw(); (r != language.Region{}) {
p, _ := language.Raw.Compose(base, s, r)
// Taking the parent forces the script to be maximized.
p = p.Parent()
// Add back region and extensions.
t, _ = language.Raw.Compose(p, r, t.Extensions())
} else {
// Set the maximized base language.
t, _ = language.Raw.Compose(base, s, t.Extensions())
}
// Find start index of the language tag.
start := 1 + sort.Search(len(tags)-1, func(i int) bool {
b, _, _ := tags[i+1].Raw()
return base.String() <= b.String()
})
if start < len(tags) {
if b, _, _ := tags[start].Raw(); b != base {
return 0
}
}
// Besides the base language, script and region, only the collation type and
// the custom variant defined in the 'u' extension are used to distinguish a
// locale.
// Strip all variants and extensions and add back the custom variant.
tdef, _ := language.Raw.Compose(t.Raw())
tdef, _ = tdef.SetTypeForKey("va", t.TypeForKey("va"))
// First search for a specialized collation type, if present.
try := []language.Tag{tdef}
if co := t.TypeForKey("co"); co != "" {
tco, _ := tdef.SetTypeForKey("co", co)
try = []language.Tag{tco, tdef}
}
for _, tx := range try {
for ; tx != language.Und; tx = parent(tx) {
for i, t := range tags[start:] {
if b, _, _ := t.Raw(); b != base {
break
}
if tx == t {
return start + i
}
}
}
}
return 0
}
// parent computes the structural parent. This means inheritance may change
// script. So, unlike the CLDR parent, parent(zh-Hant) == zh.
func parent(t language.Tag) language.Tag {
if t.TypeForKey("va") != "" {
t, _ = t.SetTypeForKey("va", "")
return t
}
result := language.Und
if b, s, r := t.Raw(); (r != language.Region{}) {
result, _ = language.Raw.Compose(b, s, t.Extensions())
} else if (s != language.Script{}) {
result, _ = language.Raw.Compose(b, t.Extensions())
} else if (b != language.Base{}) {
result, _ = language.Raw.Compose(t.Extensions())
}
return result
}

View File

@@ -0,0 +1,145 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab
import "unicode/utf8"
// For a description of ContractTrieSet, see text/collate/build/contract.go.
type ContractTrieSet []struct{ L, H, N, I uint8 }
// ctScanner is used to match a trie to an input sequence.
// A contraction may match a non-contiguous sequence of bytes in an input string.
// For example, if there is a contraction for <a, combining_ring>, it should match
// the sequence <a, combining_cedilla, combining_ring>, as combining_cedilla does
// not block combining_ring.
// ctScanner does not automatically skip over non-blocking non-starters, but rather
// retains the state of the last match and leaves it up to the user to continue
// the match at the appropriate points.
type ctScanner struct {
states ContractTrieSet
s []byte
n int
index int
pindex int
done bool
}
type ctScannerString struct {
states ContractTrieSet
s string
n int
index int
pindex int
done bool
}
func (t ContractTrieSet) scanner(index, n int, b []byte) ctScanner {
return ctScanner{s: b, states: t[index:], n: n}
}
func (t ContractTrieSet) scannerString(index, n int, str string) ctScannerString {
return ctScannerString{s: str, states: t[index:], n: n}
}
// result returns the offset i and bytes consumed p so far. If no suffix
// matched, i and p will be 0.
func (s *ctScanner) result() (i, p int) {
return s.index, s.pindex
}
func (s *ctScannerString) result() (i, p int) {
return s.index, s.pindex
}
const (
final = 0
noIndex = 0xFF
)
// scan matches the longest suffix at the current location in the input
// and returns the number of bytes consumed.
func (s *ctScanner) scan(p int) int {
pr := p // the p at the rune start
str := s.s
states, n := s.states, s.n
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
// TODO: a significant number of contractions are of a form that
// cannot match discontiguous UTF-8 in a normalized string. We could let
// a negative value of e.n mean that we can set s.done = true and avoid
// the need for additional matches.
if c >= e.L {
if e.L == c {
p++
if e.I != noIndex {
s.index = int(e.I)
s.pindex = p
}
if e.N != final {
i, states, n = 0, states[int(e.H)+n:], int(e.N)
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
} else {
s.done = true
return p
}
continue
} else if e.N == final && c <= e.H {
p++
s.done = true
s.index = int(c-e.L) + int(e.I)
s.pindex = p
return p
}
}
i++
}
return pr
}
// scan is a verbatim copy of ctScanner.scan.
func (s *ctScannerString) scan(p int) int {
pr := p // the p at the rune start
str := s.s
states, n := s.states, s.n
for i := 0; i < n && p < len(str); {
e := states[i]
c := str[p]
// TODO: a significant number of contractions are of a form that
// cannot match discontiguous UTF-8 in a normalized string. We could let
// a negative value of e.n mean that we can set s.done = true and avoid
// the need for additional matches.
if c >= e.L {
if e.L == c {
p++
if e.I != noIndex {
s.index = int(e.I)
s.pindex = p
}
if e.N != final {
i, states, n = 0, states[int(e.H)+n:], int(e.N)
if p >= len(str) || utf8.RuneStart(str[p]) {
s.states, s.n, pr = states, n, p
}
} else {
s.done = true
return p
}
continue
} else if e.N == final && c <= e.H {
p++
s.done = true
s.index = int(c-e.L) + int(e.I)
s.pindex = p
return p
}
}
i++
}
return pr
}

View File

@@ -0,0 +1,178 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab
// An Iter incrementally converts chunks of the input text to collation
// elements, while ensuring that the collation elements are in normalized order
// (that is, they are in the order as if the input text were normalized first).
type Iter struct {
Weighter Weighter
Elems []Elem
// N is the number of elements in Elems that will not be reordered on
// subsequent iterations, N <= len(Elems).
N int
bytes []byte
str string
// Because the Elems buffer may contain collation elements that are needed
// for look-ahead, we need two positions in the text (bytes or str): one for
// the end position in the text for the current iteration and one for the
// start of the next call to appendNext.
pEnd int // end position in text corresponding to N.
pNext int // pEnd <= pNext.
}
// Reset sets the position in the current input text to p and discards any
// results obtained so far.
func (i *Iter) Reset(p int) {
i.Elems = i.Elems[:0]
i.N = 0
i.pEnd = p
i.pNext = p
}
// Len returns the length of the input text.
func (i *Iter) Len() int {
if i.bytes != nil {
return len(i.bytes)
}
return len(i.str)
}
// Discard removes the collation elements up to N.
func (i *Iter) Discard() {
// TODO: change this such that only modifiers following starters will have
// to be copied.
i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])]
i.N = 0
}
// End returns the end position of the input text for which Next has returned
// results.
func (i *Iter) End() int {
return i.pEnd
}
// SetInput resets i to input s.
func (i *Iter) SetInput(s []byte) {
i.bytes = s
i.str = ""
i.Reset(0)
}
// SetInputString resets i to input s.
func (i *Iter) SetInputString(s string) {
i.str = s
i.bytes = nil
i.Reset(0)
}
func (i *Iter) done() bool {
return i.pNext >= len(i.str) && i.pNext >= len(i.bytes)
}
func (i *Iter) appendNext() bool {
if i.done() {
return false
}
var sz int
if i.bytes == nil {
i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:])
} else {
i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:])
}
if sz == 0 {
sz = 1
}
i.pNext += sz
return true
}
// Next appends Elems to the internal array. On each iteration, it will either
// add starters or modifiers. In the majority of cases, an Elem with a primary
// value > 0 will have a CCC of 0. The CCC values of collation elements are also
// used to detect if the input string was not normalized and to adjust the
// result accordingly.
func (i *Iter) Next() bool {
if i.N == len(i.Elems) && !i.appendNext() {
return false
}
// Check if the current segment starts with a starter.
prevCCC := i.Elems[len(i.Elems)-1].CCC()
if prevCCC == 0 {
i.N = len(i.Elems)
i.pEnd = i.pNext
return true
} else if i.Elems[i.N].CCC() == 0 {
// set i.N to only cover part of i.Elems for which prevCCC == 0 and
// use rest for the next call to next.
for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ {
}
i.pEnd = i.pNext
return true
}
// The current (partial) segment starts with modifiers. We need to collect
// all successive modifiers to ensure that they are normalized.
for {
p := len(i.Elems)
i.pEnd = i.pNext
if !i.appendNext() {
break
}
if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters {
// Leave the starter for the next iteration. This ensures that we
// do not return sequences of collation elements that cross two
// segments.
//
// TODO: handle large number of combining characters by fully
// normalizing the input segment before iteration. This ensures
// results are consistent across the text repo.
i.N = p
return true
} else if ccc < prevCCC {
i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC.
} else {
prevCCC = ccc
}
}
done := len(i.Elems) != i.N
i.N = len(i.Elems)
return done
}
// nextNoNorm is the same as next, but does not "normalize" the collation
// elements.
func (i *Iter) nextNoNorm() bool {
// TODO: remove this function. Using this instead of next does not seem
// to improve performance in any significant way. We retain this until
// later for evaluation purposes.
if i.done() {
return false
}
i.appendNext()
i.N = len(i.Elems)
return true
}
const maxCombiningCharacters = 30
// doNorm reorders the collation elements in i.Elems.
// It assumes that blocks of collation elements added with appendNext
// either start and end with the same CCC or start with CCC == 0.
// This allows for a single insertion point for the entire block.
// The correctness of this assumption is verified in builder.go.
func (i *Iter) doNorm(p int, ccc uint8) {
n := len(i.Elems)
k := p
for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- {
}
i.Elems = append(i.Elems, i.Elems[p:k]...)
copy(i.Elems[p:], i.Elems[k:])
i.Elems = i.Elems[:n]
}

View File

@@ -0,0 +1,236 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab
import (
"unicode"
"unicode/utf8"
)
// NewNumericWeighter wraps w to replace individual digits to sort based on their
// numeric value.
//
// Weighter w must have a free primary weight after the primary weight for 9.
// If this is not the case, numeric value will sort at the same primary level
// as the first primary sorting after 9.
func NewNumericWeighter(w Weighter) Weighter {
getElem := func(s string) Elem {
elems, _ := w.AppendNextString(nil, s)
return elems[0]
}
nine := getElem("9")
// Numbers should order before zero, but the DUCET has no room for this.
// TODO: move before zero once we use fractional collation elements.
ns, _ := MakeElem(nine.Primary()+1, nine.Secondary(), int(nine.Tertiary()), 0)
return &numericWeighter{
Weighter: w,
// We assume that w sorts digits of different kinds in order of numeric
// value and that the tertiary weight order is preserved.
//
// TODO: evaluate whether it is worth basing the ranges on the Elem
// encoding itself once the move to fractional weights is complete.
zero: getElem("0"),
zeroSpecialLo: getElem(""), // U+FF10 FULLWIDTH DIGIT ZERO
zeroSpecialHi: getElem("₀"), // U+2080 SUBSCRIPT ZERO
nine: nine,
nineSpecialHi: getElem("₉"), // U+2089 SUBSCRIPT NINE
numberStart: ns,
}
}
// A numericWeighter translates a stream of digits into a stream of weights
// representing the numeric value.
type numericWeighter struct {
Weighter
// The Elems below all demarcate boundaries of specific ranges. With the
// current element encoding digits are in two ranges: normal (default
// tertiary value) and special. For most languages, digits have collation
// elements in the normal range.
//
// Note: the range tests are very specific for the element encoding used by
// this implementation. The tests in collate_test.go are designed to fail
// if this code is not updated when an encoding has changed.
zero Elem // normal digit zero
zeroSpecialLo Elem // special digit zero, low tertiary value
zeroSpecialHi Elem // special digit zero, high tertiary value
nine Elem // normal digit nine
nineSpecialHi Elem // special digit nine
numberStart Elem
}
// AppendNext calls the namesake of the underlying weigher, but replaces single
// digits with weights representing their value.
func (nw *numericWeighter) AppendNext(buf []Elem, s []byte) (ce []Elem, n int) {
ce, n = nw.Weighter.AppendNext(buf, s)
nc := numberConverter{
elems: buf,
w: nw,
b: s,
}
isZero, ok := nc.checkNextDigit(ce)
if !ok {
return ce, n
}
// ce might have been grown already, so take it instead of buf.
nc.init(ce, len(buf), isZero)
for n < len(s) {
ce, sz := nw.Weighter.AppendNext(nc.elems, s[n:])
nc.b = s
n += sz
if !nc.update(ce) {
break
}
}
return nc.result(), n
}
// AppendNextString calls the namesake of the underlying weigher, but replaces
// single digits with weights representing their value.
func (nw *numericWeighter) AppendNextString(buf []Elem, s string) (ce []Elem, n int) {
ce, n = nw.Weighter.AppendNextString(buf, s)
nc := numberConverter{
elems: buf,
w: nw,
s: s,
}
isZero, ok := nc.checkNextDigit(ce)
if !ok {
return ce, n
}
nc.init(ce, len(buf), isZero)
for n < len(s) {
ce, sz := nw.Weighter.AppendNextString(nc.elems, s[n:])
nc.s = s
n += sz
if !nc.update(ce) {
break
}
}
return nc.result(), n
}
type numberConverter struct {
w *numericWeighter
elems []Elem
nDigits int
lenIndex int
s string // set if the input was of type string
b []byte // set if the input was of type []byte
}
// init completes initialization of a numberConverter and prepares it for adding
// more digits. elems is assumed to have a digit starting at oldLen.
func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
// Insert a marker indicating the start of a number and and a placeholder
// for the number of digits.
if isZero {
elems = append(elems[:oldLen], nc.w.numberStart, 0)
} else {
elems = append(elems, 0, 0)
copy(elems[oldLen+2:], elems[oldLen:])
elems[oldLen] = nc.w.numberStart
elems[oldLen+1] = 0
nc.nDigits = 1
}
nc.elems = elems
nc.lenIndex = oldLen + 1
}
// checkNextDigit reports whether bufNew adds a single digit relative to the old
// buffer. If it does, it also reports whether this digit is zero.
func (nc *numberConverter) checkNextDigit(bufNew []Elem) (isZero, ok bool) {
if len(nc.elems) >= len(bufNew) {
return false, false
}
e := bufNew[len(nc.elems)]
if e < nc.w.zeroSpecialLo || nc.w.nine < e {
// Not a number.
return false, false
}
if e < nc.w.zero {
if e > nc.w.nineSpecialHi {
// Not a number.
return false, false
}
if !nc.isDigit() {
return false, false
}
isZero = e <= nc.w.zeroSpecialHi
} else {
// This is the common case if we encounter a digit.
isZero = e == nc.w.zero
}
// Test the remaining added collation elements have a zero primary value.
if n := len(bufNew) - len(nc.elems); n > 1 {
for i := len(nc.elems) + 1; i < len(bufNew); i++ {
if bufNew[i].Primary() != 0 {
return false, false
}
}
// In some rare cases, collation elements will encode runes in
// unicode.No as a digit. For example Ethiopic digits (U+1369 - U+1371)
// are not in Nd. Also some digits that clearly belong in unicode.No,
// like U+0C78 TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR, have
// collation elements indistinguishable from normal digits.
// Unfortunately, this means we need to make this check for nearly all
// non-Latin digits.
//
// TODO: check the performance impact and find something better if it is
// an issue.
if !nc.isDigit() {
return false, false
}
}
return isZero, true
}
func (nc *numberConverter) isDigit() bool {
if nc.b != nil {
r, _ := utf8.DecodeRune(nc.b)
return unicode.In(r, unicode.Nd)
}
r, _ := utf8.DecodeRuneInString(nc.s)
return unicode.In(r, unicode.Nd)
}
// We currently support a maximum of about 2M digits (the number of primary
// values). Such numbers will compare correctly against small numbers, but their
// comparison against other large numbers is undefined.
//
// TODO: define a proper fallback, such as comparing large numbers textually or
// actually allowing numbers of unlimited length.
//
// TODO: cap this to a lower number (like 100) and maybe allow a larger number
// in an option?
const maxDigits = 1<<maxPrimaryBits - 1
func (nc *numberConverter) update(elems []Elem) bool {
isZero, ok := nc.checkNextDigit(elems)
if nc.nDigits == 0 && isZero {
return true
}
nc.elems = elems
if !ok {
return false
}
nc.nDigits++
return nc.nDigits < maxDigits
}
// result fills in the length element for the digit sequence and returns the
// completed collation elements.
func (nc *numberConverter) result() []Elem {
e, _ := MakeElem(nc.nDigits, defaultSecondary, defaultTertiary, 0)
nc.elems[nc.lenIndex] = e
return nc.elems
}

View File

@@ -0,0 +1,275 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab
import (
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
// Table holds all collation data for a given collation ordering.
type Table struct {
Index Trie // main trie
// expansion info
ExpandElem []uint32
// contraction info
ContractTries ContractTrieSet
ContractElem []uint32
MaxContractLen int
VariableTop uint32
}
func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
return t.appendNext(w, source{bytes: b})
}
func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
return t.appendNext(w, source{str: s})
}
func (t *Table) Start(p int, b []byte) int {
// TODO: implement
panic("not implemented")
}
func (t *Table) StartString(p int, s string) int {
// TODO: implement
panic("not implemented")
}
func (t *Table) Domain() []string {
// TODO: implement
panic("not implemented")
}
func (t *Table) Top() uint32 {
return t.VariableTop
}
type source struct {
str string
bytes []byte
}
func (src *source) lookup(t *Table) (ce Elem, sz int) {
if src.bytes == nil {
return t.Index.lookupString(src.str)
}
return t.Index.lookup(src.bytes)
}
func (src *source) tail(sz int) {
if src.bytes == nil {
src.str = src.str[sz:]
} else {
src.bytes = src.bytes[sz:]
}
}
func (src *source) nfd(buf []byte, end int) []byte {
if src.bytes == nil {
return norm.NFD.AppendString(buf[:0], src.str[:end])
}
return norm.NFD.Append(buf[:0], src.bytes[:end]...)
}
func (src *source) rune() (r rune, sz int) {
if src.bytes == nil {
return utf8.DecodeRuneInString(src.str)
}
return utf8.DecodeRune(src.bytes)
}
func (src *source) properties(f norm.Form) norm.Properties {
if src.bytes == nil {
return f.PropertiesString(src.str)
}
return f.Properties(src.bytes)
}
// appendNext appends the weights corresponding to the next rune or
// contraction in s. If a contraction is matched to a discontinuous
// sequence of runes, the weights for the interstitial runes are
// appended as well. It returns a new slice that includes the appended
// weights and the number of bytes consumed from s.
func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
ce, sz := src.lookup(t)
tp := ce.ctype()
if tp == ceNormal {
if ce == 0 {
r, _ := src.rune()
const (
hangulSize = 3
firstHangul = 0xAC00
lastHangul = 0xD7A3
)
if r >= firstHangul && r <= lastHangul {
// TODO: performance can be considerably improved here.
n = sz
var buf [16]byte // Used for decomposing Hangul.
for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
ce, sz = t.Index.lookup(b)
w = append(w, ce)
}
return w, n
}
ce = makeImplicitCE(implicitPrimary(r))
}
w = append(w, ce)
} else if tp == ceExpansionIndex {
w = t.appendExpansion(w, ce)
} else if tp == ceContractionIndex {
n := 0
src.tail(sz)
if src.bytes == nil {
w, n = t.matchContractionString(w, ce, src.str)
} else {
w, n = t.matchContraction(w, ce, src.bytes)
}
sz += n
} else if tp == ceDecompose {
// Decompose using NFKD and replace tertiary weights.
t1, t2 := splitDecompose(ce)
i := len(w)
nfkd := src.properties(norm.NFKD).Decomposition()
for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
w, p = t.appendNext(w, source{bytes: nfkd})
}
w[i] = w[i].updateTertiary(t1)
if i++; i < len(w) {
w[i] = w[i].updateTertiary(t2)
for i++; i < len(w); i++ {
w[i] = w[i].updateTertiary(maxTertiary)
}
}
}
return w, sz
}
func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
i := splitExpandIndex(ce)
n := int(t.ExpandElem[i])
i++
for _, ce := range t.ExpandElem[i : i+n] {
w = append(w, Elem(ce))
}
return w
}
func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
index, n, offset := splitContractIndex(ce)
scan := t.ContractTries.scanner(index, n, suffix)
buf := [norm.MaxSegmentSize]byte{}
bufp := 0
p := scan.scan(0)
if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
// By now we should have filtered most cases.
p0 := p
bufn := 0
rune := norm.NFD.Properties(suffix[p:])
p += rune.Size()
if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
scan.s = suffix[:p+end]
}
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFD.Properties(suffix[p:])
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break
}
prevCC = rune.TrailCCC()
if pp := scan.scan(p); pp != p {
// Copy the interstitial runes for later processing.
bufn += copy(buf[bufn:], suffix[p0:p])
if scan.pindex == pp {
bufp = bufn
}
p, p0 = pp, pp
} else {
p += rune.Size()
}
}
}
}
// Append weights for the matched contraction, which may be an expansion.
i, n := scan.result()
ce = Elem(t.ContractElem[i+offset])
if ce.ctype() == ceNormal {
w = append(w, ce)
} else {
w = t.appendExpansion(w, ce)
}
// Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}
// TODO: unify the two implementations. This is best done after first simplifying
// the algorithm taking into account the inclusion of both NFC and NFD forms
// in the table.
func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
index, n, offset := splitContractIndex(ce)
scan := t.ContractTries.scannerString(index, n, suffix)
buf := [norm.MaxSegmentSize]byte{}
bufp := 0
p := scan.scan(0)
if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
// By now we should have filtered most cases.
p0 := p
bufn := 0
rune := norm.NFD.PropertiesString(suffix[p:])
p += rune.Size()
if rune.LeadCCC() != 0 {
prevCC := rune.TrailCCC()
// A gap may only occur in the last normalization segment.
// This also ensures that len(scan.s) < norm.MaxSegmentSize.
if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
scan.s = suffix[:p+end]
}
for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
rune = norm.NFD.PropertiesString(suffix[p:])
if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
break
}
prevCC = rune.TrailCCC()
if pp := scan.scan(p); pp != p {
// Copy the interstitial runes for later processing.
bufn += copy(buf[bufn:], suffix[p0:p])
if scan.pindex == pp {
bufp = bufn
}
p, p0 = pp, pp
} else {
p += rune.Size()
}
}
}
}
// Append weights for the matched contraction, which may be an expansion.
i, n := scan.result()
ce = Elem(t.ContractElem[i+offset])
if ce.ctype() == ceNormal {
w = append(w, ce)
} else {
w = t.appendExpansion(w, ce)
}
// Append weights for the runes in the segment not part of the contraction.
for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
w, p = t.appendNext(w, source{bytes: b})
}
return w, n
}

View File

@@ -0,0 +1,159 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The trie in this file is used to associate the first full character in an
// UTF-8 string to a collation element. All but the last byte in a UTF-8 byte
// sequence are used to lookup offsets in the index table to be used for the
// next byte. The last byte is used to index into a table of collation elements.
// For a full description, see go.text/collate/build/trie.go.
package colltab
const blockSize = 64
type Trie struct {
Index0 []uint16 // index for first byte (0xC0-0xFF)
Values0 []uint32 // index for first byte (0x00-0x7F)
Index []uint16
Values []uint32
}
const (
t1 = 0x00 // 0000 0000
tx = 0x80 // 1000 0000
t2 = 0xC0 // 1100 0000
t3 = 0xE0 // 1110 0000
t4 = 0xF0 // 1111 0000
t5 = 0xF8 // 1111 1000
t6 = 0xFC // 1111 1100
te = 0xFE // 1111 1110
)
func (t *Trie) lookupValue(n uint16, b byte) Elem {
return Elem(t.Values[int(n)<<6+int(b)])
}
// lookup returns the trie value for the first UTF-8 encoding in s and
// the width in bytes of this encoding. The size will be 0 if s does not
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
func (t *Trie) lookup(s []byte) (v Elem, sz int) {
c0 := s[0]
switch {
case c0 < tx:
return Elem(t.Values0[c0]), 1
case c0 < t2:
return 0, 1
case c0 < t3:
if len(s) < 2 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
return t.lookupValue(i, c1), 2
case c0 < t4:
if len(s) < 3 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.Index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
return t.lookupValue(i, c2), 3
case c0 < t5:
if len(s) < 4 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.Index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = int(i)<<6 + int(c2)
i = t.Index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
return 0, 3
}
return t.lookupValue(i, c3), 4
}
// Illegal rune
return 0, 1
}
// The body of lookupString is a verbatim copy of that of lookup.
func (t *Trie) lookupString(s string) (v Elem, sz int) {
c0 := s[0]
switch {
case c0 < tx:
return Elem(t.Values0[c0]), 1
case c0 < t2:
return 0, 1
case c0 < t3:
if len(s) < 2 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
return t.lookupValue(i, c1), 2
case c0 < t4:
if len(s) < 3 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.Index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
return t.lookupValue(i, c2), 3
case c0 < t5:
if len(s) < 4 {
return 0, 0
}
i := t.Index0[c0]
c1 := s[1]
if c1 < tx || t2 <= c1 {
return 0, 1
}
o := int(i)<<6 + int(c1)
i = t.Index[o]
c2 := s[2]
if c2 < tx || t2 <= c2 {
return 0, 2
}
o = int(i)<<6 + int(c2)
i = t.Index[o]
c3 := s[3]
if c3 < tx || t2 <= c3 {
return 0, 3
}
return t.lookupValue(i, c3), 4
}
// Illegal rune
return 0, 1
}

View File

@@ -0,0 +1,31 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package colltab // import "golang.org/x/text/internal/colltab"
// A Weighter can be used as a source for Collator and Searcher.
type Weighter interface {
// Start finds the start of the segment that includes position p.
Start(p int, b []byte) int
// StartString finds the start of the segment that includes position p.
StartString(p int, s string) int
// AppendNext appends Elems to buf corresponding to the longest match
// of a single character or contraction from the start of s.
// It returns the new buf and the number of bytes consumed.
AppendNext(buf []Elem, s []byte) (ce []Elem, n int)
// AppendNextString appends Elems to buf corresponding to the longest match
// of a single character or contraction from the start of s.
// It returns the new buf and the number of bytes consumed.
AppendNextString(buf []Elem, s string) (ce []Elem, n int)
// Domain returns a slice of all single characters and contractions for which
// collation elements are defined in this table.
Domain() []string
// Top returns the highest variable primary value.
Top() uint32
}

View File

@@ -0,0 +1,369 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gen
import (
"bytes"
"encoding/gob"
"fmt"
"hash"
"hash/fnv"
"io"
"log"
"os"
"reflect"
"strings"
"unicode"
"unicode/utf8"
)
// This file contains utilities for generating code.
// TODO: other write methods like:
// - slices, maps, types, etc.
// CodeWriter is a utility for writing structured code. It computes the content
// hash and size of written content. It ensures there are newlines between
// written code blocks.
type CodeWriter struct {
buf bytes.Buffer
Size int
Hash hash.Hash32 // content hash
gob *gob.Encoder
// For comments we skip the usual one-line separator if they are followed by
// a code block.
skipSep bool
}
func (w *CodeWriter) Write(p []byte) (n int, err error) {
return w.buf.Write(p)
}
// NewCodeWriter returns a new CodeWriter.
func NewCodeWriter() *CodeWriter {
h := fnv.New32()
return &CodeWriter{Hash: h, gob: gob.NewEncoder(h)}
}
// WriteGoFile appends the buffer with the total size of all created structures
// and writes it as a Go file to the the given file with the given package name.
func (w *CodeWriter) WriteGoFile(filename, pkg string) {
f, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer f.Close()
if _, err = w.WriteGo(f, pkg, ""); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteVersionedGoFile appends the buffer with the total size of all created
// structures and writes it as a Go file to the the given file with the given
// package name and build tags for the current Unicode version,
func (w *CodeWriter) WriteVersionedGoFile(filename, pkg string) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
}
f, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer f.Close()
if _, err = w.WriteGo(f, pkg, tags); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteGo appends the buffer with the total size of all created structures and
// writes it as a Go file to the the given writer with the given package name.
func (w *CodeWriter) WriteGo(out io.Writer, pkg, tags string) (n int, err error) {
sz := w.Size
w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
defer w.buf.Reset()
return WriteGo(out, pkg, tags, w.buf.Bytes())
}
func (w *CodeWriter) printf(f string, x ...interface{}) {
fmt.Fprintf(w, f, x...)
}
func (w *CodeWriter) insertSep() {
if w.skipSep {
w.skipSep = false
return
}
// Use at least two newlines to ensure a blank space between the previous
// block. WriteGoFile will remove extraneous newlines.
w.printf("\n\n")
}
// WriteComment writes a comment block. All line starts are prefixed with "//".
// Initial empty lines are gobbled. The indentation for the first line is
// stripped from consecutive lines.
func (w *CodeWriter) WriteComment(comment string, args ...interface{}) {
s := fmt.Sprintf(comment, args...)
s = strings.Trim(s, "\n")
// Use at least two newlines to ensure a blank space between the previous
// block. WriteGoFile will remove extraneous newlines.
w.printf("\n\n// ")
w.skipSep = true
// strip first indent level.
sep := "\n"
for ; len(s) > 0 && (s[0] == '\t' || s[0] == ' '); s = s[1:] {
sep += s[:1]
}
strings.NewReplacer(sep, "\n// ", "\n", "\n// ").WriteString(w, s)
w.printf("\n")
}
func (w *CodeWriter) writeSizeInfo(size int) {
w.printf("// Size: %d bytes\n", size)
}
// WriteConst writes a constant of the given name and value.
func (w *CodeWriter) WriteConst(name string, x interface{}) {
w.insertSep()
v := reflect.ValueOf(x)
switch v.Type().Kind() {
case reflect.String:
w.printf("const %s %s = ", name, typeName(x))
w.WriteString(v.String())
w.printf("\n")
default:
w.printf("const %s = %#v\n", name, x)
}
}
// WriteVar writes a variable of the given name and value.
func (w *CodeWriter) WriteVar(name string, x interface{}) {
w.insertSep()
v := reflect.ValueOf(x)
oldSize := w.Size
sz := int(v.Type().Size())
w.Size += sz
switch v.Type().Kind() {
case reflect.String:
w.printf("var %s %s = ", name, typeName(x))
w.WriteString(v.String())
case reflect.Struct:
w.gob.Encode(x)
fallthrough
case reflect.Slice, reflect.Array:
w.printf("var %s = ", name)
w.writeValue(v)
w.writeSizeInfo(w.Size - oldSize)
default:
w.printf("var %s %s = ", name, typeName(x))
w.gob.Encode(x)
w.writeValue(v)
w.writeSizeInfo(w.Size - oldSize)
}
w.printf("\n")
}
func (w *CodeWriter) writeValue(v reflect.Value) {
x := v.Interface()
switch v.Kind() {
case reflect.String:
w.WriteString(v.String())
case reflect.Array:
// Don't double count: callers of WriteArray count on the size being
// added, so we need to discount it here.
w.Size -= int(v.Type().Size())
w.writeSlice(x, true)
case reflect.Slice:
w.writeSlice(x, false)
case reflect.Struct:
w.printf("%s{\n", typeName(v.Interface()))
t := v.Type()
for i := 0; i < v.NumField(); i++ {
w.printf("%s: ", t.Field(i).Name)
w.writeValue(v.Field(i))
w.printf(",\n")
}
w.printf("}")
default:
w.printf("%#v", x)
}
}
// WriteString writes a string literal.
func (w *CodeWriter) WriteString(s string) {
s = strings.Replace(s, `\`, `\\`, -1)
io.WriteString(w.Hash, s) // content hash
w.Size += len(s)
const maxInline = 40
if len(s) <= maxInline {
w.printf("%q", s)
return
}
// We will render the string as a multi-line string.
const maxWidth = 80 - 4 - len(`"`) - len(`" +`)
// When starting on its own line, go fmt indents line 2+ an extra level.
n, max := maxWidth, maxWidth-4
// As per https://golang.org/issue/18078, the compiler has trouble
// compiling the concatenation of many strings, s0 + s1 + s2 + ... + sN,
// for large N. We insert redundant, explicit parentheses to work around
// that, lowering the N at any given step: (s0 + s1 + ... + s63) + (s64 +
// ... + s127) + etc + (etc + ... + sN).
explicitParens, extraComment := len(s) > 128*1024, ""
if explicitParens {
w.printf(`(`)
extraComment = "; the redundant, explicit parens are for https://golang.org/issue/18078"
}
// Print "" +\n, if a string does not start on its own line.
b := w.buf.Bytes()
if p := len(bytes.TrimRight(b, " \t")); p > 0 && b[p-1] != '\n' {
w.printf("\"\" + // Size: %d bytes%s\n", len(s), extraComment)
n, max = maxWidth, maxWidth
}
w.printf(`"`)
for sz, p, nLines := 0, 0, 0; p < len(s); {
var r rune
r, sz = utf8.DecodeRuneInString(s[p:])
out := s[p : p+sz]
chars := 1
if !unicode.IsPrint(r) || r == utf8.RuneError || r == '"' {
switch sz {
case 1:
out = fmt.Sprintf("\\x%02x", s[p])
case 2, 3:
out = fmt.Sprintf("\\u%04x", r)
case 4:
out = fmt.Sprintf("\\U%08x", r)
}
chars = len(out)
}
if n -= chars; n < 0 {
nLines++
if explicitParens && nLines&63 == 63 {
w.printf("\") + (\"")
}
w.printf("\" +\n\"")
n = max - len(out)
}
w.printf("%s", out)
p += sz
}
w.printf(`"`)
if explicitParens {
w.printf(`)`)
}
}
// WriteSlice writes a slice value.
func (w *CodeWriter) WriteSlice(x interface{}) {
w.writeSlice(x, false)
}
// WriteArray writes an array value.
func (w *CodeWriter) WriteArray(x interface{}) {
w.writeSlice(x, true)
}
func (w *CodeWriter) writeSlice(x interface{}, isArray bool) {
v := reflect.ValueOf(x)
w.gob.Encode(v.Len())
w.Size += v.Len() * int(v.Type().Elem().Size())
name := typeName(x)
if isArray {
name = fmt.Sprintf("[%d]%s", v.Len(), name[strings.Index(name, "]")+1:])
}
if isArray {
w.printf("%s{\n", name)
} else {
w.printf("%s{ // %d elements\n", name, v.Len())
}
switch kind := v.Type().Elem().Kind(); kind {
case reflect.String:
for _, s := range x.([]string) {
w.WriteString(s)
w.printf(",\n")
}
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64,
reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
// nLine and nBlock are the number of elements per line and block.
nLine, nBlock, format := 8, 64, "%d,"
switch kind {
case reflect.Uint8:
format = "%#02x,"
case reflect.Uint16:
format = "%#04x,"
case reflect.Uint32:
nLine, nBlock, format = 4, 32, "%#08x,"
case reflect.Uint, reflect.Uint64:
nLine, nBlock, format = 4, 32, "%#016x,"
case reflect.Int8:
nLine = 16
}
n := nLine
for i := 0; i < v.Len(); i++ {
if i%nBlock == 0 && v.Len() > nBlock {
w.printf("// Entry %X - %X\n", i, i+nBlock-1)
}
x := v.Index(i).Interface()
w.gob.Encode(x)
w.printf(format, x)
if n--; n == 0 {
n = nLine
w.printf("\n")
}
}
w.printf("\n")
case reflect.Struct:
zero := reflect.Zero(v.Type().Elem()).Interface()
for i := 0; i < v.Len(); i++ {
x := v.Index(i).Interface()
w.gob.EncodeValue(v)
if !reflect.DeepEqual(zero, x) {
line := fmt.Sprintf("%#v,\n", x)
line = line[strings.IndexByte(line, '{'):]
w.printf("%d: ", i)
w.printf(line)
}
}
case reflect.Array:
for i := 0; i < v.Len(); i++ {
w.printf("%d: %#v,\n", i, v.Index(i).Interface())
}
default:
panic("gen: slice elem type not supported")
}
w.printf("}")
}
// WriteType writes a definition of the type of the given value and returns the
// type name.
func (w *CodeWriter) WriteType(x interface{}) string {
t := reflect.TypeOf(x)
w.printf("type %s struct {\n", t.Name())
for i := 0; i < t.NumField(); i++ {
w.printf("\t%s %s\n", t.Field(i).Name, t.Field(i).Type)
}
w.printf("}\n")
return t.Name()
}
// typeName returns the name of the go type of x.
func typeName(x interface{}) string {
t := reflect.ValueOf(x).Type()
return strings.Replace(fmt.Sprint(t), "main.", "", 1)
}

View File

@@ -0,0 +1,333 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package gen contains common code for the various code generation tools in the
// text repository. Its usage ensures consistency between tools.
//
// This package defines command line flags that are common to most generation
// tools. The flags allow for specifying specific Unicode and CLDR versions
// in the public Unicode data repository (http://www.unicode.org/Public).
//
// A local Unicode data mirror can be set through the flag -local or the
// environment variable UNICODE_DIR. The former takes precedence. The local
// directory should follow the same structure as the public repository.
//
// IANA data can also optionally be mirrored by putting it in the iana directory
// rooted at the top of the local mirror. Beware, though, that IANA data is not
// versioned. So it is up to the developer to use the right version.
package gen // import "golang.org/x/text/internal/gen"
import (
"bytes"
"flag"
"fmt"
"go/build"
"go/format"
"io"
"io/ioutil"
"log"
"net/http"
"os"
"path"
"path/filepath"
"strings"
"sync"
"unicode"
"golang.org/x/text/unicode/cldr"
)
var (
url = flag.String("url",
"http://www.unicode.org/Public",
"URL of Unicode database directory")
iana = flag.String("iana",
"http://www.iana.org",
"URL of the IANA repository")
unicodeVersion = flag.String("unicode",
getEnv("UNICODE_VERSION", unicode.Version),
"unicode version to use")
cldrVersion = flag.String("cldr",
getEnv("CLDR_VERSION", cldr.Version),
"cldr version to use")
)
func getEnv(name, def string) string {
if v := os.Getenv(name); v != "" {
return v
}
return def
}
// Init performs common initialization for a gen command. It parses the flags
// and sets up the standard logging parameters.
func Init() {
log.SetPrefix("")
log.SetFlags(log.Lshortfile)
flag.Parse()
}
const header = `// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
`
// UnicodeVersion reports the requested Unicode version.
func UnicodeVersion() string {
return *unicodeVersion
}
// CLDRVersion reports the requested CLDR version.
func CLDRVersion() string {
return *cldrVersion
}
var tags = []struct{ version, buildTags string }{
{"10.0.0", "go1.10"},
{"", "!go1.10"},
}
// buildTags reports the build tags used for the current Unicode version.
func buildTags() string {
v := UnicodeVersion()
for _, x := range tags {
// We should do a numeric comparison, but including the collate package
// would create an import cycle. We approximate it by assuming that
// longer version strings are later.
if len(x.version) <= len(v) {
return x.buildTags
}
if len(x.version) == len(v) && x.version <= v {
return x.buildTags
}
}
return tags[0].buildTags
}
// IsLocal reports whether data files are available locally.
func IsLocal() bool {
dir, err := localReadmeFile()
if err != nil {
return false
}
if _, err = os.Stat(dir); err != nil {
return false
}
return true
}
// OpenUCDFile opens the requested UCD file. The file is specified relative to
// the public Unicode root directory. It will call log.Fatal if there are any
// errors.
func OpenUCDFile(file string) io.ReadCloser {
return openUnicode(path.Join(*unicodeVersion, "ucd", file))
}
// OpenCLDRCoreZip opens the CLDR core zip file. It will call log.Fatal if there
// are any errors.
func OpenCLDRCoreZip() io.ReadCloser {
return OpenUnicodeFile("cldr", *cldrVersion, "core.zip")
}
// OpenUnicodeFile opens the requested file of the requested category from the
// root of the Unicode data archive. The file is specified relative to the
// public Unicode root directory. If version is "", it will use the default
// Unicode version. It will call log.Fatal if there are any errors.
func OpenUnicodeFile(category, version, file string) io.ReadCloser {
if version == "" {
version = UnicodeVersion()
}
return openUnicode(path.Join(category, version, file))
}
// OpenIANAFile opens the requested IANA file. The file is specified relative
// to the IANA root, which is typically either http://www.iana.org or the
// iana directory in the local mirror. It will call log.Fatal if there are any
// errors.
func OpenIANAFile(path string) io.ReadCloser {
return Open(*iana, "iana", path)
}
var (
dirMutex sync.Mutex
localDir string
)
const permissions = 0755
func localReadmeFile() (string, error) {
p, err := build.Import("golang.org/x/text", "", build.FindOnly)
if err != nil {
return "", fmt.Errorf("Could not locate package: %v", err)
}
return filepath.Join(p.Dir, "DATA", "README"), nil
}
func getLocalDir() string {
dirMutex.Lock()
defer dirMutex.Unlock()
readme, err := localReadmeFile()
if err != nil {
log.Fatal(err)
}
dir := filepath.Dir(readme)
if _, err := os.Stat(readme); err != nil {
if err := os.MkdirAll(dir, permissions); err != nil {
log.Fatalf("Could not create directory: %v", err)
}
ioutil.WriteFile(readme, []byte(readmeTxt), permissions)
}
return dir
}
const readmeTxt = `Generated by golang.org/x/text/internal/gen. DO NOT EDIT.
This directory contains downloaded files used to generate the various tables
in the golang.org/x/text subrepo.
Note that the language subtag repo (iana/assignments/language-subtag-registry)
and all other times in the iana subdirectory are not versioned and will need
to be periodically manually updated. The easiest way to do this is to remove
the entire iana directory. This is mostly of concern when updating the language
package.
`
// Open opens subdir/path if a local directory is specified and the file exists,
// where subdir is a directory relative to the local root, or fetches it from
// urlRoot/path otherwise. It will call log.Fatal if there are any errors.
func Open(urlRoot, subdir, path string) io.ReadCloser {
file := filepath.Join(getLocalDir(), subdir, filepath.FromSlash(path))
return open(file, urlRoot, path)
}
func openUnicode(path string) io.ReadCloser {
file := filepath.Join(getLocalDir(), filepath.FromSlash(path))
return open(file, *url, path)
}
// TODO: automatically periodically update non-versioned files.
func open(file, urlRoot, path string) io.ReadCloser {
if f, err := os.Open(file); err == nil {
return f
}
r := get(urlRoot, path)
defer r.Close()
b, err := ioutil.ReadAll(r)
if err != nil {
log.Fatalf("Could not download file: %v", err)
}
os.MkdirAll(filepath.Dir(file), permissions)
if err := ioutil.WriteFile(file, b, permissions); err != nil {
log.Fatalf("Could not create file: %v", err)
}
return ioutil.NopCloser(bytes.NewReader(b))
}
func get(root, path string) io.ReadCloser {
url := root + "/" + path
fmt.Printf("Fetching %s...", url)
defer fmt.Println(" done.")
resp, err := http.Get(url)
if err != nil {
log.Fatalf("HTTP GET: %v", err)
}
if resp.StatusCode != 200 {
log.Fatalf("Bad GET status for %q: %q", url, resp.Status)
}
return resp.Body
}
// TODO: use Write*Version in all applicable packages.
// WriteUnicodeVersion writes a constant for the Unicode version from which the
// tables are generated.
func WriteUnicodeVersion(w io.Writer) {
fmt.Fprintf(w, "// UnicodeVersion is the Unicode version from which the tables in this package are derived.\n")
fmt.Fprintf(w, "const UnicodeVersion = %q\n\n", UnicodeVersion())
}
// WriteCLDRVersion writes a constant for the CLDR version from which the
// tables are generated.
func WriteCLDRVersion(w io.Writer) {
fmt.Fprintf(w, "// CLDRVersion is the CLDR version from which the tables in this package are derived.\n")
fmt.Fprintf(w, "const CLDRVersion = %q\n\n", CLDRVersion())
}
// WriteGoFile prepends a standard file comment and package statement to the
// given bytes, applies gofmt, and writes them to a file with the given name.
// It will call log.Fatal if there are any errors.
func WriteGoFile(filename, pkg string, b []byte) {
w, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer w.Close()
if _, err = WriteGo(w, pkg, "", b); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
func insertVersion(filename, version string) string {
suffix := ".go"
if strings.HasSuffix(filename, "_test.go") {
suffix = "_test.go"
}
return fmt.Sprint(filename[:len(filename)-len(suffix)], version, suffix)
}
// WriteVersionedGoFile prepends a standard file comment, adds build tags to
// version the file for the current Unicode version, and package statement to
// the given bytes, applies gofmt, and writes them to a file with the given
// name. It will call log.Fatal if there are any errors.
func WriteVersionedGoFile(filename, pkg string, b []byte) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
}
w, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer w.Close()
if _, err = WriteGo(w, pkg, tags, b); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteGo prepends a standard file comment and package statement to the given
// bytes, applies gofmt, and writes them to w.
func WriteGo(w io.Writer, pkg, tags string, b []byte) (n int, err error) {
src := []byte(header)
if tags != "" {
src = append(src, fmt.Sprintf("// +build %s\n\n", tags)...)
}
src = append(src, fmt.Sprintf("package %s\n\n", pkg)...)
src = append(src, b...)
formatted, err := format.Source(src)
if err != nil {
// Print the generated code even in case of an error so that the
// returned error can be meaningfully interpreted.
n, _ = w.Write(src)
return n, err
}
return w.Write(formatted)
}
// Repackage rewrites a Go file from belonging to package main to belonging to
// the given package.
func Repackage(inFile, outFile, pkg string) {
src, err := ioutil.ReadFile(inFile)
if err != nil {
log.Fatalf("reading %s: %v", inFile, err)
}
const toDelete = "package main\n\n"
i := bytes.Index(src, []byte(toDelete))
if i < 0 {
log.Fatalf("Could not find %q in %s.", toDelete, inFile)
}
w := &bytes.Buffer{}
w.Write(src[i+len(toDelete):])
WriteGoFile(outFile, pkg, w.Bytes())
}

View File

@@ -0,0 +1,100 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package tag contains functionality handling tags and related data.
package tag // import "golang.org/x/text/internal/tag"
import "sort"
// An Index converts tags to a compact numeric value.
//
// All elements are of size 4. Tags may be up to 4 bytes long. Excess bytes can
// be used to store additional information about the tag.
type Index string
// Elem returns the element data at the given index.
func (s Index) Elem(x int) string {
return string(s[x*4 : x*4+4])
}
// Index reports the index of the given key or -1 if it could not be found.
// Only the first len(key) bytes from the start of the 4-byte entries will be
// considered for the search and the first match in Index will be returned.
func (s Index) Index(key []byte) int {
n := len(key)
// search the index of the first entry with an equal or higher value than
// key in s.
index := sort.Search(len(s)/4, func(i int) bool {
return cmp(s[i*4:i*4+n], key) != -1
})
i := index * 4
if cmp(s[i:i+len(key)], key) != 0 {
return -1
}
return index
}
// Next finds the next occurrence of key after index x, which must have been
// obtained from a call to Index using the same key. It returns x+1 or -1.
func (s Index) Next(key []byte, x int) int {
if x++; x*4 < len(s) && cmp(s[x*4:x*4+len(key)], key) == 0 {
return x
}
return -1
}
// cmp returns an integer comparing a and b lexicographically.
func cmp(a Index, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
for i, c := range b[:n] {
switch {
case a[i] > c:
return 1
case a[i] < c:
return -1
}
}
switch {
case len(a) < len(b):
return -1
case len(a) > len(b):
return 1
}
return 0
}
// Compare returns an integer comparing a and b lexicographically.
func Compare(a string, b []byte) int {
return cmp(Index(a), b)
}
// FixCase reformats b to the same pattern of cases as form.
// If returns false if string b is malformed.
func FixCase(form string, b []byte) bool {
if len(form) != len(b) {
return false
}
for i, c := range b {
if form[i] <= 'Z' {
if c >= 'a' {
c -= 'z' - 'Z'
}
if c < 'A' || 'Z' < c {
return false
}
} else {
if c <= 'Z' {
c += 'z' - 'Z'
}
if c < 'a' || 'z' < c {
return false
}
}
b[i] = c
}
return true
}

View File

@@ -0,0 +1,58 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package triegen
// This file defines Compacter and its implementations.
import "io"
// A Compacter generates an alternative, more space-efficient way to store a
// trie value block. A trie value block holds all possible values for the last
// byte of a UTF-8 encoded rune. Excluding ASCII characters, a trie value block
// always has 64 values, as a UTF-8 encoding ends with a byte in [0x80, 0xC0).
type Compacter interface {
// Size returns whether the Compacter could encode the given block as well
// as its size in case it can. len(v) is always 64.
Size(v []uint64) (sz int, ok bool)
// Store stores the block using the Compacter's compression method.
// It returns a handle with which the block can be retrieved.
// len(v) is always 64.
Store(v []uint64) uint32
// Print writes the data structures associated to the given store to w.
Print(w io.Writer) error
// Handler returns the name of a function that gets called during trie
// lookup for blocks generated by the Compacter. The function should be of
// the form func (n uint32, b byte) uint64, where n is the index returned by
// the Compacter's Store method and b is the last byte of the UTF-8
// encoding, where 0x80 <= b < 0xC0, for which to do the lookup in the
// block.
Handler() string
}
// simpleCompacter is the default Compacter used by builder. It implements a
// normal trie block.
type simpleCompacter builder
func (b *simpleCompacter) Size([]uint64) (sz int, ok bool) {
return blockSize * b.ValueSize, true
}
func (b *simpleCompacter) Store(v []uint64) uint32 {
h := uint32(len(b.ValueBlocks) - blockOffset)
b.ValueBlocks = append(b.ValueBlocks, v)
return h
}
func (b *simpleCompacter) Print(io.Writer) error {
// Structures are printed in print.go.
return nil
}
func (b *simpleCompacter) Handler() string {
panic("Handler should be special-cased for this Compacter")
}

View File

@@ -0,0 +1,251 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package triegen
import (
"bytes"
"fmt"
"io"
"strings"
"text/template"
)
// print writes all the data structures as well as the code necessary to use the
// trie to w.
func (b *builder) print(w io.Writer) error {
b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize
b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize
b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize
b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize
b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize
// If we only have one root trie, all starter blocks are at position 0 and
// we can access the arrays directly.
if len(b.Trie) == 1 {
// At this point we cannot refer to the generated tables directly.
b.ASCIIBlock = b.Name + "Values"
b.StarterBlock = b.Name + "Index"
} else {
// Otherwise we need to have explicit starter indexes in the trie
// structure.
b.ASCIIBlock = "t.ascii"
b.StarterBlock = "t.utf8Start"
}
b.SourceType = "[]byte"
if err := lookupGen.Execute(w, b); err != nil {
return err
}
b.SourceType = "string"
if err := lookupGen.Execute(w, b); err != nil {
return err
}
if err := trieGen.Execute(w, b); err != nil {
return err
}
for _, c := range b.Compactions {
if err := c.c.Print(w); err != nil {
return err
}
}
return nil
}
func printValues(n int, values []uint64) string {
w := &bytes.Buffer{}
boff := n * blockSize
fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff)
var newline bool
for i, v := range values {
if i%6 == 0 {
newline = true
}
if v != 0 {
if newline {
fmt.Fprintf(w, "\n")
newline = false
}
fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v)
}
}
return w.String()
}
func printIndex(b *builder, nr int, n *node) string {
w := &bytes.Buffer{}
boff := nr * blockSize
fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff)
var newline bool
for i, c := range n.children {
if i%8 == 0 {
newline = true
}
if c != nil {
v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index)
if v != 0 {
if newline {
fmt.Fprintf(w, "\n")
newline = false
}
fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v)
}
}
}
return w.String()
}
var (
trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{
"printValues": printValues,
"printIndex": printIndex,
"title": strings.Title,
"dec": func(x int) int { return x - 1 },
"psize": func(n int) string {
return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024)
},
}).Parse(trieTemplate))
lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate))
)
// TODO: consider the return type of lookup. It could be uint64, even if the
// internal value type is smaller. We will have to verify this with the
// performance of unicode/norm, which is very sensitive to such changes.
const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}}
// {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}.
type {{.Name}}Trie struct { {{if $multi}}
ascii []{{.ValueType}} // index for ASCII bytes
utf8Start []{{.IndexType}} // index for UTF-8 bytes >= 0xC0
{{end}}}
func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}}
h := {{.Name}}TrieHandles[i]
return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] }
}
type {{.Name}}TrieHandle struct {
ascii, multi {{.IndexType}}
}
// {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes
var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{
{{range .Trie}} { {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}}
{{end}}}{{else}}
return &{{.Name}}Trie{}
}
{{end}}
// lookupValue determines the type of block n and looks up the value for b.
func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} {
switch { {{range $i, $c := .Compactions}}
{{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}}
n -= {{$c.Offset}}{{end}}
return {{print $b.ValueType}}({{$c.Handler}}){{end}}
}
}
// {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes
// The third block is the zero block.
var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} {
{{range $i, $v := .ValueBlocks}}{{printValues $i $v}}
{{end}}}
// {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes
// Block 0 is the zero block.
var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} {
{{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}}
{{end}}}
`
// TODO: consider allowing zero-length strings after evaluating performance with
// unicode/norm.
const lookupTemplate = `
// lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and
// the width in bytes of this encoding. The size will be 0 if s does not
// hold enough bytes to complete the encoding. len(s) must be greater than 0.
func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) {
c0 := s[0]
switch {
case c0 < 0x80: // is ASCII
return {{.ASCIIBlock}}[c0], 1
case c0 < 0xC2:
return 0, 1 // Illegal UTF-8: not a starter, not ASCII.
case c0 < 0xE0: // 2-byte UTF-8
if len(s) < 2 {
return 0, 0
}
i := {{.StarterBlock}}[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return 0, 1 // Illegal UTF-8: not a continuation byte.
}
return t.lookupValue(uint32(i), c1), 2
case c0 < 0xF0: // 3-byte UTF-8
if len(s) < 3 {
return 0, 0
}
i := {{.StarterBlock}}[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return 0, 1 // Illegal UTF-8: not a continuation byte.
}
o := uint32(i)<<6 + uint32(c1)
i = {{.Name}}Index[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return 0, 2 // Illegal UTF-8: not a continuation byte.
}
return t.lookupValue(uint32(i), c2), 3
case c0 < 0xF8: // 4-byte UTF-8
if len(s) < 4 {
return 0, 0
}
i := {{.StarterBlock}}[c0]
c1 := s[1]
if c1 < 0x80 || 0xC0 <= c1 {
return 0, 1 // Illegal UTF-8: not a continuation byte.
}
o := uint32(i)<<6 + uint32(c1)
i = {{.Name}}Index[o]
c2 := s[2]
if c2 < 0x80 || 0xC0 <= c2 {
return 0, 2 // Illegal UTF-8: not a continuation byte.
}
o = uint32(i)<<6 + uint32(c2)
i = {{.Name}}Index[o]
c3 := s[3]
if c3 < 0x80 || 0xC0 <= c3 {
return 0, 3 // Illegal UTF-8: not a continuation byte.
}
return t.lookupValue(uint32(i), c3), 4
}
// Illegal rune
return 0, 1
}
// lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s.
// s must start with a full and valid UTF-8 encoded rune.
func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} {
c0 := s[0]
if c0 < 0x80 { // is ASCII
return {{.ASCIIBlock}}[c0]
}
i := {{.StarterBlock}}[c0]
if c0 < 0xE0 { // 2-byte UTF-8
return t.lookupValue(uint32(i), s[1])
}
i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])]
if c0 < 0xF0 { // 3-byte UTF-8
return t.lookupValue(uint32(i), s[2])
}
i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])]
if c0 < 0xF8 { // 4-byte UTF-8
return t.lookupValue(uint32(i), s[3])
}
return 0
}
`

View File

@@ -0,0 +1,494 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package triegen implements a code generator for a trie for associating
// unsigned integer values with UTF-8 encoded runes.
//
// Many of the go.text packages use tries for storing per-rune information. A
// trie is especially useful if many of the runes have the same value. If this
// is the case, many blocks can be expected to be shared allowing for
// information on many runes to be stored in little space.
//
// As most of the lookups are done directly on []byte slices, the tries use the
// UTF-8 bytes directly for the lookup. This saves a conversion from UTF-8 to
// runes and contributes a little bit to better performance. It also naturally
// provides a fast path for ASCII.
//
// Space is also an issue. There are many code points defined in Unicode and as
// a result tables can get quite large. So every byte counts. The triegen
// package automatically chooses the smallest integer values to represent the
// tables. Compacters allow further compression of the trie by allowing for
// alternative representations of individual trie blocks.
//
// triegen allows generating multiple tries as a single structure. This is
// useful when, for example, one wants to generate tries for several languages
// that have a lot of values in common. Some existing libraries for
// internationalization store all per-language data as a dynamically loadable
// chunk. The go.text packages are designed with the assumption that the user
// typically wants to compile in support for all supported languages, in line
// with the approach common to Go to create a single standalone binary. The
// multi-root trie approach can give significant storage savings in this
// scenario.
//
// triegen generates both tables and code. The code is optimized to use the
// automatically chosen data types. The following code is generated for a Trie
// or multiple Tries named "foo":
// - type fooTrie
// The trie type.
//
// - func newFooTrie(x int) *fooTrie
// Trie constructor, where x is the index of the trie passed to Gen.
//
// - func (t *fooTrie) lookup(s []byte) (v uintX, sz int)
// The lookup method, where uintX is automatically chosen.
//
// - func lookupString, lookupUnsafe and lookupStringUnsafe
// Variants of the above.
//
// - var fooValues and fooIndex and any tables generated by Compacters.
// The core trie data.
//
// - var fooTrieHandles
// Indexes of starter blocks in case of multiple trie roots.
//
// It is recommended that users test the generated trie by checking the returned
// value for every rune. Such exhaustive tests are possible as the the number of
// runes in Unicode is limited.
package triegen // import "golang.org/x/text/internal/triegen"
// TODO: Arguably, the internally optimized data types would not have to be
// exposed in the generated API. We could also investigate not generating the
// code, but using it through a package. We would have to investigate the impact
// on performance of making such change, though. For packages like unicode/norm,
// small changes like this could tank performance.
import (
"encoding/binary"
"fmt"
"hash/crc64"
"io"
"log"
"unicode/utf8"
)
// builder builds a set of tries for associating values with runes. The set of
// tries can share common index and value blocks.
type builder struct {
Name string
// ValueType is the type of the trie values looked up.
ValueType string
// ValueSize is the byte size of the ValueType.
ValueSize int
// IndexType is the type of trie index values used for all UTF-8 bytes of
// a rune except the last one.
IndexType string
// IndexSize is the byte size of the IndexType.
IndexSize int
// SourceType is used when generating the lookup functions. If the user
// requests StringSupport, all lookup functions will be generated for
// string input as well.
SourceType string
Trie []*Trie
IndexBlocks []*node
ValueBlocks [][]uint64
Compactions []compaction
Checksum uint64
ASCIIBlock string
StarterBlock string
indexBlockIdx map[uint64]int
valueBlockIdx map[uint64]nodeIndex
asciiBlockIdx map[uint64]int
// Stats are used to fill out the template.
Stats struct {
NValueEntries int
NValueBytes int
NIndexEntries int
NIndexBytes int
NHandleBytes int
}
err error
}
// A nodeIndex encodes the index of a node, which is defined by the compaction
// which stores it and an index within the compaction. For internal nodes, the
// compaction is always 0.
type nodeIndex struct {
compaction int
index int
}
// compaction keeps track of stats used for the compaction.
type compaction struct {
c Compacter
blocks []*node
maxHandle uint32
totalSize int
// Used by template-based generator and thus exported.
Cutoff uint32
Offset uint32
Handler string
}
func (b *builder) setError(err error) {
if b.err == nil {
b.err = err
}
}
// An Option can be passed to Gen.
type Option func(b *builder) error
// Compact configures the trie generator to use the given Compacter.
func Compact(c Compacter) Option {
return func(b *builder) error {
b.Compactions = append(b.Compactions, compaction{
c: c,
Handler: c.Handler() + "(n, b)"})
return nil
}
}
// Gen writes Go code for a shared trie lookup structure to w for the given
// Tries. The generated trie type will be called nameTrie. newNameTrie(x) will
// return the *nameTrie for tries[x]. A value can be looked up by using one of
// the various lookup methods defined on nameTrie. It returns the table size of
// the generated trie.
func Gen(w io.Writer, name string, tries []*Trie, opts ...Option) (sz int, err error) {
// The index contains two dummy blocks, followed by the zero block. The zero
// block is at offset 0x80, so that the offset for the zero block for
// continuation bytes is 0.
b := &builder{
Name: name,
Trie: tries,
IndexBlocks: []*node{{}, {}, {}},
Compactions: []compaction{{
Handler: name + "Values[n<<6+uint32(b)]",
}},
// The 0 key in indexBlockIdx and valueBlockIdx is the hash of the zero
// block.
indexBlockIdx: map[uint64]int{0: 0},
valueBlockIdx: map[uint64]nodeIndex{0: {}},
asciiBlockIdx: map[uint64]int{},
}
b.Compactions[0].c = (*simpleCompacter)(b)
for _, f := range opts {
if err := f(b); err != nil {
return 0, err
}
}
b.build()
if b.err != nil {
return 0, b.err
}
if err = b.print(w); err != nil {
return 0, err
}
return b.Size(), nil
}
// A Trie represents a single root node of a trie. A builder may build several
// overlapping tries at once.
type Trie struct {
root *node
hiddenTrie
}
// hiddenTrie contains values we want to be visible to the template generator,
// but hidden from the API documentation.
type hiddenTrie struct {
Name string
Checksum uint64
ASCIIIndex int
StarterIndex int
}
// NewTrie returns a new trie root.
func NewTrie(name string) *Trie {
return &Trie{
&node{
children: make([]*node, blockSize),
values: make([]uint64, utf8.RuneSelf),
},
hiddenTrie{Name: name},
}
}
// Gen is a convenience wrapper around the Gen func passing t as the only trie
// and uses the name passed to NewTrie. It returns the size of the generated
// tables.
func (t *Trie) Gen(w io.Writer, opts ...Option) (sz int, err error) {
return Gen(w, t.Name, []*Trie{t}, opts...)
}
// node is a node of the intermediate trie structure.
type node struct {
// children holds this node's children. It is always of length 64.
// A child node may be nil.
children []*node
// values contains the values of this node. If it is non-nil, this node is
// either a root or leaf node:
// For root nodes, len(values) == 128 and it maps the bytes in [0x00, 0x7F].
// For leaf nodes, len(values) == 64 and it maps the bytes in [0x80, 0xBF].
values []uint64
index nodeIndex
}
// Insert associates value with the given rune. Insert will panic if a non-zero
// value is passed for an invalid rune.
func (t *Trie) Insert(r rune, value uint64) {
if value == 0 {
return
}
s := string(r)
if []rune(s)[0] != r && value != 0 {
// Note: The UCD tables will always assign what amounts to a zero value
// to a surrogate. Allowing a zero value for an illegal rune allows
// users to iterate over [0..MaxRune] without having to explicitly
// exclude surrogates, which would be tedious.
panic(fmt.Sprintf("triegen: non-zero value for invalid rune %U", r))
}
if len(s) == 1 {
// It is a root node value (ASCII).
t.root.values[s[0]] = value
return
}
n := t.root
for ; len(s) > 1; s = s[1:] {
if n.children == nil {
n.children = make([]*node, blockSize)
}
p := s[0] % blockSize
c := n.children[p]
if c == nil {
c = &node{}
n.children[p] = c
}
if len(s) > 2 && c.values != nil {
log.Fatalf("triegen: insert(%U): found internal node with values", r)
}
n = c
}
if n.values == nil {
n.values = make([]uint64, blockSize)
}
if n.children != nil {
log.Fatalf("triegen: insert(%U): found leaf node that also has child nodes", r)
}
n.values[s[0]-0x80] = value
}
// Size returns the number of bytes the generated trie will take to store. It
// needs to be exported as it is used in the templates.
func (b *builder) Size() int {
// Index blocks.
sz := len(b.IndexBlocks) * blockSize * b.IndexSize
// Skip the first compaction, which represents the normal value blocks, as
// its totalSize does not account for the ASCII blocks, which are managed
// separately.
sz += len(b.ValueBlocks) * blockSize * b.ValueSize
for _, c := range b.Compactions[1:] {
sz += c.totalSize
}
// TODO: this computation does not account for the fixed overhead of a using
// a compaction, either code or data. As for data, though, the typical
// overhead of data is in the order of bytes (2 bytes for cases). Further,
// the savings of using a compaction should anyway be substantial for it to
// be worth it.
// For multi-root tries, we also need to account for the handles.
if len(b.Trie) > 1 {
sz += 2 * b.IndexSize * len(b.Trie)
}
return sz
}
func (b *builder) build() {
// Compute the sizes of the values.
var vmax uint64
for _, t := range b.Trie {
vmax = maxValue(t.root, vmax)
}
b.ValueType, b.ValueSize = getIntType(vmax)
// Compute all block allocations.
// TODO: first compute the ASCII blocks for all tries and then the other
// nodes. ASCII blocks are more restricted in placement, as they require two
// blocks to be placed consecutively. Processing them first may improve
// sharing (at least one zero block can be expected to be saved.)
for _, t := range b.Trie {
b.Checksum += b.buildTrie(t)
}
// Compute the offsets for all the Compacters.
offset := uint32(0)
for i := range b.Compactions {
c := &b.Compactions[i]
c.Offset = offset
offset += c.maxHandle + 1
c.Cutoff = offset
}
// Compute the sizes of indexes.
// TODO: different byte positions could have different sizes. So far we have
// not found a case where this is beneficial.
imax := uint64(b.Compactions[len(b.Compactions)-1].Cutoff)
for _, ib := range b.IndexBlocks {
if x := uint64(ib.index.index); x > imax {
imax = x
}
}
b.IndexType, b.IndexSize = getIntType(imax)
}
func maxValue(n *node, max uint64) uint64 {
if n == nil {
return max
}
for _, c := range n.children {
max = maxValue(c, max)
}
for _, v := range n.values {
if max < v {
max = v
}
}
return max
}
func getIntType(v uint64) (string, int) {
switch {
case v < 1<<8:
return "uint8", 1
case v < 1<<16:
return "uint16", 2
case v < 1<<32:
return "uint32", 4
}
return "uint64", 8
}
const (
blockSize = 64
// Subtract two blocks to offset 0x80, the first continuation byte.
blockOffset = 2
// Subtract three blocks to offset 0xC0, the first non-ASCII starter.
rootBlockOffset = 3
)
var crcTable = crc64.MakeTable(crc64.ISO)
func (b *builder) buildTrie(t *Trie) uint64 {
n := t.root
// Get the ASCII offset. For the first trie, the ASCII block will be at
// position 0.
hasher := crc64.New(crcTable)
binary.Write(hasher, binary.BigEndian, n.values)
hash := hasher.Sum64()
v, ok := b.asciiBlockIdx[hash]
if !ok {
v = len(b.ValueBlocks)
b.asciiBlockIdx[hash] = v
b.ValueBlocks = append(b.ValueBlocks, n.values[:blockSize], n.values[blockSize:])
if v == 0 {
// Add the zero block at position 2 so that it will be assigned a
// zero reference in the lookup blocks.
// TODO: always do this? This would allow us to remove a check from
// the trie lookup, but at the expense of extra space. Analyze
// performance for unicode/norm.
b.ValueBlocks = append(b.ValueBlocks, make([]uint64, blockSize))
}
}
t.ASCIIIndex = v
// Compute remaining offsets.
t.Checksum = b.computeOffsets(n, true)
// We already subtracted the normal blockOffset from the index. Subtract the
// difference for starter bytes.
t.StarterIndex = n.index.index - (rootBlockOffset - blockOffset)
return t.Checksum
}
func (b *builder) computeOffsets(n *node, root bool) uint64 {
// For the first trie, the root lookup block will be at position 3, which is
// the offset for UTF-8 non-ASCII starter bytes.
first := len(b.IndexBlocks) == rootBlockOffset
if first {
b.IndexBlocks = append(b.IndexBlocks, n)
}
// We special-case the cases where all values recursively are 0. This allows
// for the use of a zero block to which all such values can be directed.
hash := uint64(0)
if n.children != nil || n.values != nil {
hasher := crc64.New(crcTable)
for _, c := range n.children {
var v uint64
if c != nil {
v = b.computeOffsets(c, false)
}
binary.Write(hasher, binary.BigEndian, v)
}
binary.Write(hasher, binary.BigEndian, n.values)
hash = hasher.Sum64()
}
if first {
b.indexBlockIdx[hash] = rootBlockOffset - blockOffset
}
// Compacters don't apply to internal nodes.
if n.children != nil {
v, ok := b.indexBlockIdx[hash]
if !ok {
v = len(b.IndexBlocks) - blockOffset
b.IndexBlocks = append(b.IndexBlocks, n)
b.indexBlockIdx[hash] = v
}
n.index = nodeIndex{0, v}
} else {
h, ok := b.valueBlockIdx[hash]
if !ok {
bestI, bestSize := 0, blockSize*b.ValueSize
for i, c := range b.Compactions[1:] {
if sz, ok := c.c.Size(n.values); ok && bestSize > sz {
bestI, bestSize = i+1, sz
}
}
c := &b.Compactions[bestI]
c.totalSize += bestSize
v := c.c.Store(n.values)
if c.maxHandle < v {
c.maxHandle = v
}
h = nodeIndex{bestI, int(v)}
b.valueBlockIdx[hash] = h
}
n.index = h
}
return hash
}

View File

@@ -0,0 +1,371 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package ucd provides a parser for Unicode Character Database files, the
// format of which is defined in http://www.unicode.org/reports/tr44/. See
// http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
//
// It currently does not support substitutions of missing fields.
package ucd // import "golang.org/x/text/internal/ucd"
import (
"bufio"
"errors"
"fmt"
"io"
"log"
"regexp"
"strconv"
"strings"
)
// UnicodeData.txt fields.
const (
CodePoint = iota
Name
GeneralCategory
CanonicalCombiningClass
BidiClass
DecompMapping
DecimalValue
DigitValue
NumericValue
BidiMirrored
Unicode1Name
ISOComment
SimpleUppercaseMapping
SimpleLowercaseMapping
SimpleTitlecaseMapping
)
// Parse calls f for each entry in the given reader of a UCD file. It will close
// the reader upon return. It will call log.Fatal if any error occurred.
//
// This implements the most common usage pattern of using Parser.
func Parse(r io.ReadCloser, f func(p *Parser)) {
defer r.Close()
p := New(r)
for p.Next() {
f(p)
}
if err := p.Err(); err != nil {
r.Close() // os.Exit will cause defers not to be called.
log.Fatal(err)
}
}
// An Option is used to configure a Parser.
type Option func(p *Parser)
func keepRanges(p *Parser) {
p.keepRanges = true
}
var (
// KeepRanges prevents the expansion of ranges. The raw ranges can be
// obtained by calling Range(0) on the parser.
KeepRanges Option = keepRanges
)
// The Part option register a handler for lines starting with a '@'. The text
// after a '@' is available as the first field. Comments are handled as usual.
func Part(f func(p *Parser)) Option {
return func(p *Parser) {
p.partHandler = f
}
}
// The CommentHandler option passes comments that are on a line by itself to
// a given handler.
func CommentHandler(f func(s string)) Option {
return func(p *Parser) {
p.commentHandler = f
}
}
// A Parser parses Unicode Character Database (UCD) files.
type Parser struct {
scanner *bufio.Scanner
keepRanges bool // Don't expand rune ranges in field 0.
err error
comment string
field []string
// parsedRange is needed in case Range(0) is called more than once for one
// field. In some cases this requires scanning ahead.
line int
parsedRange bool
rangeStart, rangeEnd rune
partHandler func(p *Parser)
commentHandler func(s string)
}
func (p *Parser) setError(err error, msg string) {
if p.err == nil && err != nil {
if msg == "" {
p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err)
} else {
p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err)
}
}
}
func (p *Parser) getField(i int) string {
if i >= len(p.field) {
return ""
}
return p.field[i]
}
// Err returns a non-nil error if any error occurred during parsing.
func (p *Parser) Err() error {
return p.err
}
// New returns a Parser for the given Reader.
func New(r io.Reader, o ...Option) *Parser {
p := &Parser{
scanner: bufio.NewScanner(r),
}
for _, f := range o {
f(p)
}
return p
}
// Next parses the next line in the file. It returns true if a line was parsed
// and false if it reached the end of the file.
func (p *Parser) Next() bool {
if !p.keepRanges && p.rangeStart < p.rangeEnd {
p.rangeStart++
return true
}
p.comment = ""
p.field = p.field[:0]
p.parsedRange = false
for p.scanner.Scan() && p.err == nil {
p.line++
s := p.scanner.Text()
if s == "" {
continue
}
if s[0] == '#' {
if p.commentHandler != nil {
p.commentHandler(strings.TrimSpace(s[1:]))
}
continue
}
// Parse line
if i := strings.IndexByte(s, '#'); i != -1 {
p.comment = strings.TrimSpace(s[i+1:])
s = s[:i]
}
if s[0] == '@' {
if p.partHandler != nil {
p.field = append(p.field, strings.TrimSpace(s[1:]))
p.partHandler(p)
p.field = p.field[:0]
}
p.comment = ""
continue
}
for {
i := strings.IndexByte(s, ';')
if i == -1 {
p.field = append(p.field, strings.TrimSpace(s))
break
}
p.field = append(p.field, strings.TrimSpace(s[:i]))
s = s[i+1:]
}
if !p.keepRanges {
p.rangeStart, p.rangeEnd = p.getRange(0)
}
return true
}
p.setError(p.scanner.Err(), "scanner failed")
return false
}
func parseRune(b string) (rune, error) {
if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
b = b[2:]
}
x, err := strconv.ParseUint(b, 16, 32)
return rune(x), err
}
func (p *Parser) parseRune(s string) rune {
x, err := parseRune(s)
p.setError(err, "failed to parse rune")
return x
}
// Rune parses and returns field i as a rune.
func (p *Parser) Rune(i int) rune {
if i > 0 || p.keepRanges {
return p.parseRune(p.getField(i))
}
return p.rangeStart
}
// Runes interprets and returns field i as a sequence of runes.
func (p *Parser) Runes(i int) (runes []rune) {
add := func(s string) {
if s = strings.TrimSpace(s); len(s) > 0 {
runes = append(runes, p.parseRune(s))
}
}
for b := p.getField(i); ; {
i := strings.IndexByte(b, ' ')
if i == -1 {
add(b)
break
}
add(b[:i])
b = b[i+1:]
}
return
}
var (
errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
// reRange matches one line of a legacy rune range.
reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
)
// Range parses and returns field i as a rune range. A range is inclusive at
// both ends. If the field only has one rune, first and last will be identical.
// It supports the legacy format for ranges used in UnicodeData.txt.
func (p *Parser) Range(i int) (first, last rune) {
if !p.keepRanges {
return p.rangeStart, p.rangeStart
}
return p.getRange(i)
}
func (p *Parser) getRange(i int) (first, last rune) {
b := p.getField(i)
if k := strings.Index(b, ".."); k != -1 {
return p.parseRune(b[:k]), p.parseRune(b[k+2:])
}
// The first field may not be a rune, in which case we may ignore any error
// and set the range as 0..0.
x, err := parseRune(b)
if err != nil {
// Disable range parsing henceforth. This ensures that an error will be
// returned if the user subsequently will try to parse this field as
// a Rune.
p.keepRanges = true
}
// Special case for UnicodeData that was retained for backwards compatibility.
if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") {
if p.parsedRange {
return p.rangeStart, p.rangeEnd
}
mf := reRange.FindStringSubmatch(p.scanner.Text())
p.line++
if mf == nil || !p.scanner.Scan() {
p.setError(errIncorrectLegacyRange, "")
return x, x
}
// Using Bytes would be more efficient here, but Text is a lot easier
// and this is not a frequent case.
ml := reRange.FindStringSubmatch(p.scanner.Text())
if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
p.setError(errIncorrectLegacyRange, "")
return x, x
}
p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])])
p.parsedRange = true
return p.rangeStart, p.rangeEnd
}
return x, x
}
// bools recognizes all valid UCD boolean values.
var bools = map[string]bool{
"": false,
"N": false,
"No": false,
"F": false,
"False": false,
"Y": true,
"Yes": true,
"T": true,
"True": true,
}
// Bool parses and returns field i as a boolean value.
func (p *Parser) Bool(i int) bool {
f := p.getField(i)
for s, v := range bools {
if f == s {
return v
}
}
p.setError(strconv.ErrSyntax, "error parsing bool")
return false
}
// Int parses and returns field i as an integer value.
func (p *Parser) Int(i int) int {
x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
p.setError(err, "error parsing int")
return int(x)
}
// Uint parses and returns field i as an unsigned integer value.
func (p *Parser) Uint(i int) uint {
x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
p.setError(err, "error parsing uint")
return uint(x)
}
// Float parses and returns field i as a decimal value.
func (p *Parser) Float(i int) float64 {
x, err := strconv.ParseFloat(string(p.getField(i)), 64)
p.setError(err, "error parsing float")
return x
}
// String parses and returns field i as a string value.
func (p *Parser) String(i int) string {
return string(p.getField(i))
}
// Strings parses and returns field i as a space-separated list of strings.
func (p *Parser) Strings(i int) []string {
ss := strings.Split(string(p.getField(i)), " ")
for i, s := range ss {
ss[i] = strings.TrimSpace(s)
}
return ss
}
// Comment returns the comments for the current line.
func (p *Parser) Comment() string {
return string(p.comment)
}
var errUndefinedEnum = errors.New("ucd: undefined enum value")
// Enum interprets and returns field i as a value that must be one of the values
// in enum.
func (p *Parser) Enum(i int, enum ...string) string {
f := p.getField(i)
for _, s := range enum {
if f == s {
return s
}
}
p.setError(errUndefinedEnum, "error parsing enum")
return ""
}