Add tools/src/lut: A lookup table with compaction

Used by the intrinsic definition generator.

Bug: tint:832
Change-Id: I000b1ebe067117e533b3edd8a3fac67a3cbc14c9
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/52601
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
Reviewed-by: David Neto <dneto@google.com>
This commit is contained in:
Ben Clayton 2021-06-01 15:41:11 +00:00 committed by Tint LUCI CQ
parent b487c71e59
commit e8177dade6
2 changed files with 311 additions and 0 deletions

245
tools/src/lut/lut.go Normal file
View File

@ -0,0 +1,245 @@
// Copyright 2021 The Tint Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package lut provides a look up table, which compresses indexed data
package lut
import (
"sort"
"dawn.googlesource.com/tint/tools/src/list"
)
// LUT is a look up table.
// The table holds a number of items that are stored in a linear list.
type LUT interface {
// Add adds a sequence of items to the table.
// items can be a single element, a slice of element, or a List of element.
// Returns a pointer to the offset of the first item in the table's list.
// The sequence of items stored at [offset, offset+N), where N is the
// number of items added will remain equal, even after calling Compact().
Add(items interface{}) *int
// Compact reorders the table items so that the table storage is compacted
// by shuffling data around and de-duplicating sequences of common data.
// Each originally added sequence is preserved in the resulting table, with
// the same contiguous ordering, but with a potentially different offset.
// Heuristics are used to shorten the table length, by exploiting common
// subsequences, and removing duplicate sequences.
// Note that shortest common superstring is NP-hard, so heuristics are used.
// Compact updates pointers returned by Add().
Compact()
}
// New returns a new look up table
func New(storage list.List) LUT {
return &lut{storage: storage}
}
// A sequence represents a span of entries in the table
type sequence struct {
offset *int // Pointer to the start index of the sequence
count int // Length of the sequence
}
// lut implements LUT
type lut struct {
storage list.List // The List that backs this LUT
sequences []sequence // The entries in the LUT
}
func (t *lut) Add(items interface{}) *int {
offset := t.storage.Count()
t.storage.Append(items)
count := t.storage.Count() - offset
offsetPtr := &offset
t.sequences = append(t.sequences, sequence{offsetPtr, count})
return offsetPtr
}
func (t lut) Compact() {
// Generate int32 identifiers for each unique item in the table.
// We use these to compare items instead of comparing the real data as this
// function is comparison-heavy, and integer compares are cheap.
srcIDs := t.itemIDs()
dstIDs := make([]int32, len(srcIDs))
// Make a copy the data held in the table, use the copy as the source, and
// t.storage as the destination.
srcData := list.Copy(t.storage)
dstData := t.storage
// Sort all the sequences by length, with the largest first.
// This helps 'seed' the compacted form with the largest items first.
// This can improve the compaction as small sequences can pack into larger,
// placed items.
sort.Slice(t.sequences, func(i, j int) bool {
return t.sequences[i].count > t.sequences[j].count
})
// unplaced is the list of sequences that have not yet been placed.
// All sequences are initially unplaced.
unplaced := make([]sequence, len(t.sequences))
copy(unplaced, t.sequences)
// placed is the list of sequences that have been placed.
// Nothing is initially placed.
placed := make([]sequence, 0, len(t.sequences))
// remove removes the sequence in unplaced with the index i.
remove := func(i int) {
placed = append(placed, unplaced[i])
if i > 0 {
if i < len(unplaced)-1 {
copy(unplaced[i:], unplaced[i+1:])
}
unplaced = unplaced[:len(unplaced)-1]
} else {
unplaced = unplaced[1:]
}
}
// cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count].
cp := func(dstOffset, srcOffset, count int) {
dstData.CopyFrom(srcData, dstOffset, srcOffset, count)
copy(
dstIDs[dstOffset:dstOffset+count],
srcIDs[srcOffset:srcOffset+count],
)
}
// match describes a sequence that can be placed.
type match struct {
dst int // destination offset
src sequence // source sequence
len int // number of items that matched
idx int // sequence index
}
// number of items that have been placed.
newSize := 0
// While there's sequences to place...
for len(unplaced) > 0 {
// Place the next largest, unplaced sequence at the end of the new list
cp(newSize, *unplaced[0].offset, unplaced[0].count)
*unplaced[0].offset = newSize
newSize += unplaced[0].count
remove(0)
for {
// Look for the sequence with the longest match against the
// currently placed data. Any mismatches with currently placed data
// will nullify the match. The head or tail of this sequence may
// extend the currently placed data.
best := match{}
// For each unplaced sequence...
for i := 0; i < len(unplaced); i++ {
seq := unplaced[i]
if best.len >= seq.count {
// The best match is already at least as long as this
// sequence and sequences are sorted by size, so best cannot
// be beaten. Stop searching.
break
}
// Perform a full sweep from left to right, scoring the match...
for shift := -seq.count + 1; shift < newSize; shift++ {
dstS := max(shift, 0)
dstE := min(shift+seq.count, newSize)
count := dstE - dstS
srcS := *seq.offset - min(shift, 0)
srcE := srcS + count
if best.len < count {
if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) {
best = match{shift, seq, count, i}
}
}
}
}
if best.src.offset == nil {
// Nothing matched. Not even one element.
// Resort to placing the next largest sequence at the end.
break
}
if best.dst < 0 {
// Best match wants to place the sequence to the left of the
// current output. We have to shuffle everything...
n := -best.dst
dstData.Copy(n, 0, newSize)
copy(dstIDs[n:n+newSize], dstIDs)
newSize += n
best.dst = 0
for _, p := range placed {
*p.offset += n
}
}
// Place the best matching sequence.
cp(best.dst, *best.src.offset, best.src.count)
newSize = max(newSize, best.dst+best.src.count)
*best.src.offset = best.dst
remove(best.idx)
}
}
// Shrink the output buffer to the new size.
dstData.Resize(newSize)
// All done.
}
// Generate a set of identifiers for all the unique items in storage
func (t lut) itemIDs() []int32 {
storageSize := t.storage.Count()
keys := make([]int32, storageSize)
dataToKey := map[interface{}]int32{}
for i := 0; i < storageSize; i++ {
data := t.storage.Get(i)
key, found := dataToKey[data]
if !found {
key = int32(len(dataToKey))
dataToKey[data] = key
}
keys[i] = key
}
return keys
}
func max(a, b int) int {
if a < b {
return b
}
return a
}
func min(a, b int) int {
if a > b {
return b
}
return a
}
func equal(a, b []int32) bool {
for i, v := range a {
if b[i] != v {
return false
}
}
return true
}

66
tools/src/lut/lut_test.go Normal file
View File

@ -0,0 +1,66 @@
// Copyright 2021 The Tint Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lut_test
import (
"testing"
"dawn.googlesource.com/tint/tools/src/list"
"dawn.googlesource.com/tint/tools/src/lut"
)
func TestCompactWithFragments(t *testing.T) {
runes := []rune{}
lut := lut.New(list.Wrap(&runes))
indices := []*int{
lut.Add([]rune("the life in your")),
lut.Add([]rune("in your life that count")),
lut.Add([]rune("In the end,")),
lut.Add([]rune("the life in")),
lut.Add([]rune("count. It's the")),
lut.Add([]rune("years")),
lut.Add([]rune("in your years.")),
lut.Add([]rune("it's not the years in")),
lut.Add([]rune("not the years")),
lut.Add([]rune("not the years in your")),
lut.Add([]rune("end, it's")),
}
lut.Compact()
expect := "In the end, it's not the years in your life that count. It's the life in your years."
got := string(runes)
if got != expect {
t.Errorf("Compact result was not as expected\nExpected: '%v'\nGot: '%v'", expect, got)
}
expectedIndices := []int{
61, // the life in your
31, // in your life that count
0, // In the end,
61, // the life in
49, // count. It's the
25, // years
70, // in your years.
12, // it's not the years in
17, // not the years
17, // not the years in your
7, // end, it's
}
for i, p := range indices {
if expected, got := expectedIndices[i], *p; expected != got {
t.Errorf("Index %v was not expected. Expected %v, got %v", i, expected, got)
}
}
}