Skip to content

Commit

Permalink
decode: Reduce allocations, memory usage & table size (#6)
Browse files Browse the repository at this point in the history
Remove the original runes from the table and compute them from their position instead, filling the gaps with highly compressible dummy ranges. This results in a 210950-byte table.go instead of 401301 bytes.

Benchmark improvements:

benchmark                   old ns/op     new ns/op     delta
BenchmarkUnidecode-12       436           434           -0.34%
BenchmarkDecodeTable-12     7930175       5941300       -25.08%

benchmark                   old allocs     new allocs     delta
BenchmarkUnidecode-12       6              6              +0.00%
BenchmarkDecodeTable-12     89332          44491          -50.20%

benchmark                   old bytes     new bytes     delta
BenchmarkUnidecode-12       120           120           +0.00%
BenchmarkDecodeTable-12     1071070       826012        -22.88%
  • Loading branch information
stefanb authored Nov 8, 2021
1 parent 3795a1d commit e606663
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
21 changes: 12 additions & 9 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@ package unidecode

import (
"compress/zlib"
"encoding/binary"
"io"
"strings"
)

const (
// dummyLenght is the length-byte sentinel (0xff) for a table entry that
// carries no transliteration. The table generator emits one such byte for
// every code point in a gap between entries, and the decoder skips these
// entries while still advancing its character counter.
// NOTE(review): the identifier is a misspelling of "Length"; renaming it
// requires updating every reference in the decoder as well.
dummyLenght = byte(0xff)
)

var (
// transliterations has one slot per 16-bit value (65536 entries), each
// holding a rune slice.
// NOTE(review): presumably indexed by code point and filled by
// decodeTransliterations — confirm against the full decoder.
transliterations [65536][]rune
// transCount is the number of slots in transliterations, typed as a rune.
// NOTE(review): presumably used to bounds-check runes before indexing the
// table — confirm against callers.
transCount = rune(len(transliterations))
// getUint16 is a shorthand for binary.LittleEndian.Uint16.
getUint16 = binary.LittleEndian.Uint16
)

func decodeTransliterations() {
Expand All @@ -19,20 +21,21 @@ func decodeTransliterations() {
panic(err)
}
defer r.Close()
tmp1 := make([]byte, 2)
tmp2 := tmp1[:1]
b := make([]byte, 0, 13) // 13 = longest transliteration, adjust if needed
lenB := b[:1]
chr := uint16(0xffff) // char counter, rely on overflow on first pass
for {
if _, err := io.ReadAtLeast(r, tmp1, 2); err != nil {
chr++
if _, err := io.ReadFull(r, lenB); err != nil {
if err == io.EOF {
break
}
panic(err)
}
chr := getUint16(tmp1)
if _, err := io.ReadAtLeast(r, tmp2, 1); err != nil {
panic(err)
if lenB[0] == dummyLenght {
continue
}
b := make([]byte, int(tmp2[0]))
b = b[:lenB[0]] // resize, preserving allocation
if _, err := io.ReadFull(r, b); err != nil {
panic(err)
}
Expand Down
23 changes: 16 additions & 7 deletions make_table.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//go:build none
// +build none

package main
Expand All @@ -19,6 +20,7 @@ func main() {
panic(err)
}
var buf bytes.Buffer
previousVal := int64(-1)
for _, line := range strings.Split(string(data), "\n") {
if strings.HasPrefix(line, "/*") || line == "" {
continue
Expand All @@ -31,19 +33,26 @@ func main() {
if err != nil {
panic(err)
}

if previousVal+1 != val {
rangechars := 0
for i := previousVal + 1; i <= val-1; i++ {
if err := binary.Write(&buf, binary.LittleEndian, uint8(0xff)); err != nil {
panic(err)
}
rangechars++
}
fmt.Printf("Filled dummy range: 0x%04x - 0x%04x (%4d chars)\n", previousVal+1, val-1, rangechars)
}

s, err := strconv.Unquote(line[sep+2:])
if err != nil {
panic(err)
}
if s == "" {
continue
}
if err := binary.Write(&buf, binary.LittleEndian, uint16(val)); err != nil {
panic(err)
}
if err := binary.Write(&buf, binary.LittleEndian, uint8(len(s))); err != nil {
panic(err)
}
previousVal = val
buf.WriteString(s)
}
var cbuf bytes.Buffer
Expand All @@ -60,7 +69,7 @@ func main() {
buf.Reset()
buf.WriteString("package unidecode\n")
buf.WriteString("// AUTOGENERATED - DO NOT EDIT!\n\n")
fmt.Fprintf(&buf, "var tableData = %q;\n", cbuf.String())
fmt.Fprintf(&buf, "const tableData = %q;\n", cbuf.String())
dst, err := format.Source(buf.Bytes())
if err != nil {
panic(err)
Expand Down
2 changes: 1 addition & 1 deletion table.go

Large diffs are not rendered by default.

0 comments on commit e606663

Please sign in to comment.