diff --git a/tools/src/lut/lut.go b/tools/src/lut/lut.go new file mode 100644 index 0000000000..d811a1e236 --- /dev/null +++ b/tools/src/lut/lut.go @@ -0,0 +1,245 @@ +// Copyright 2021 The Tint Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lut provides a look up table, which compresses indexed data +package lut + +import ( + "sort" + + "dawn.googlesource.com/tint/tools/src/list" +) + +// LUT is a look up table. +// The table holds a number of items that are stored in a linear list. +type LUT interface { + // Add adds a sequence of items to the table. + // items can be a single element, a slice of element, or a List of element. + // Returns a pointer to the offset of the first item in the table's list. + // The sequence of items stored at [offset, offset+N), where N is the + // number of items added will remain equal, even after calling Compact(). + Add(items interface{}) *int + // Compact reorders the table items so that the table storage is compacted + // by shuffling data around and de-duplicating sequences of common data. + // Each originally added sequence is preserved in the resulting table, with + // the same contiguous ordering, but with a potentially different offset. + // Heuristics are used to shorten the table length, by exploiting common + // subsequences, and removing duplicate sequences. + // Note that shortest common superstring is NP-hard, so heuristics are used. + // Compact updates pointers returned by Add(). + Compact() +} + +// New returns a new look up table +func New(storage list.List) LUT { + return &lut{storage: storage} +} + +// A sequence represents a span of entries in the table +type sequence struct { + offset *int // Pointer to the start index of the sequence + count int // Length of the sequence +} + +// lut implements LUT +type lut struct { + storage list.List // The List that backs this LUT + sequences []sequence // The entries in the LUT +} + +func (t *lut) Add(items interface{}) *int { + offset := t.storage.Count() + t.storage.Append(items) + count := t.storage.Count() - offset + offsetPtr := &offset + t.sequences = append(t.sequences, sequence{offsetPtr, count}) + return offsetPtr +} + +func (t lut) Compact() { + // Generate int32 identifiers for each unique item in the table. + // We use these to compare items instead of comparing the real data as this + // function is comparison-heavy, and integer compares are cheap. + srcIDs := t.itemIDs() + dstIDs := make([]int32, len(srcIDs)) + + // Make a copy the data held in the table, use the copy as the source, and + // t.storage as the destination. + srcData := list.Copy(t.storage) + dstData := t.storage + + // Sort all the sequences by length, with the largest first. + // This helps 'seed' the compacted form with the largest items first. + // This can improve the compaction as small sequences can pack into larger, + // placed items. + sort.Slice(t.sequences, func(i, j int) bool { + return t.sequences[i].count > t.sequences[j].count + }) + + // unplaced is the list of sequences that have not yet been placed. + // All sequences are initially unplaced. + unplaced := make([]sequence, len(t.sequences)) + copy(unplaced, t.sequences) + + // placed is the list of sequences that have been placed. + // Nothing is initially placed. + placed := make([]sequence, 0, len(t.sequences)) + + // remove removes the sequence in unplaced with the index i. + remove := func(i int) { + placed = append(placed, unplaced[i]) + if i > 0 { + if i < len(unplaced)-1 { + copy(unplaced[i:], unplaced[i+1:]) + } + unplaced = unplaced[:len(unplaced)-1] + } else { + unplaced = unplaced[1:] + } + } + + // cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count]. + cp := func(dstOffset, srcOffset, count int) { + dstData.CopyFrom(srcData, dstOffset, srcOffset, count) + copy( + dstIDs[dstOffset:dstOffset+count], + srcIDs[srcOffset:srcOffset+count], + ) + } + + // match describes a sequence that can be placed. + type match struct { + dst int // destination offset + src sequence // source sequence + len int // number of items that matched + idx int // sequence index + } + + // number of items that have been placed. + newSize := 0 + + // While there's sequences to place... + for len(unplaced) > 0 { + // Place the next largest, unplaced sequence at the end of the new list + cp(newSize, *unplaced[0].offset, unplaced[0].count) + *unplaced[0].offset = newSize + newSize += unplaced[0].count + remove(0) + + for { + // Look for the sequence with the longest match against the + // currently placed data. Any mismatches with currently placed data + // will nullify the match. The head or tail of this sequence may + // extend the currently placed data. + best := match{} + + // For each unplaced sequence... + for i := 0; i < len(unplaced); i++ { + seq := unplaced[i] + + if best.len >= seq.count { + // The best match is already at least as long as this + // sequence and sequences are sorted by size, so best cannot + // be beaten. Stop searching. + break + } + + // Perform a full sweep from left to right, scoring the match... + for shift := -seq.count + 1; shift < newSize; shift++ { + dstS := max(shift, 0) + dstE := min(shift+seq.count, newSize) + count := dstE - dstS + srcS := *seq.offset - min(shift, 0) + srcE := srcS + count + + if best.len < count { + if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) { + best = match{shift, seq, count, i} + } + } + } + } + + if best.src.offset == nil { + // Nothing matched. Not even one element. + // Resort to placing the next largest sequence at the end. + break + } + + if best.dst < 0 { + // Best match wants to place the sequence to the left of the + // current output. We have to shuffle everything... + n := -best.dst + dstData.Copy(n, 0, newSize) + copy(dstIDs[n:n+newSize], dstIDs) + newSize += n + best.dst = 0 + for _, p := range placed { + *p.offset += n + } + } + + // Place the best matching sequence. + cp(best.dst, *best.src.offset, best.src.count) + newSize = max(newSize, best.dst+best.src.count) + *best.src.offset = best.dst + remove(best.idx) + } + } + + // Shrink the output buffer to the new size. + dstData.Resize(newSize) + + // All done. +} + +// Generate a set of identifiers for all the unique items in storage +func (t lut) itemIDs() []int32 { + storageSize := t.storage.Count() + keys := make([]int32, storageSize) + dataToKey := map[interface{}]int32{} + for i := 0; i < storageSize; i++ { + data := t.storage.Get(i) + key, found := dataToKey[data] + if !found { + key = int32(len(dataToKey)) + dataToKey[data] = key + } + keys[i] = key + } + return keys +} + +func max(a, b int) int { + if a < b { + return b + } + return a +} + +func min(a, b int) int { + if a > b { + return b + } + return a +} + +func equal(a, b []int32) bool { + for i, v := range a { + if b[i] != v { + return false + } + } + return true +} diff --git a/tools/src/lut/lut_test.go b/tools/src/lut/lut_test.go new file mode 100644 index 0000000000..fa74522d38 --- /dev/null +++ b/tools/src/lut/lut_test.go @@ -0,0 +1,66 @@ +// Copyright 2021 The Tint Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lut_test + +import ( + "testing" + + "dawn.googlesource.com/tint/tools/src/list" + "dawn.googlesource.com/tint/tools/src/lut" +) + +func TestCompactWithFragments(t *testing.T) { + runes := []rune{} + lut := lut.New(list.Wrap(&runes)) + indices := []*int{ + lut.Add([]rune("the life in your")), + lut.Add([]rune("in your life that count")), + lut.Add([]rune("In the end,")), + lut.Add([]rune("the life in")), + lut.Add([]rune("count. It's the")), + lut.Add([]rune("years")), + lut.Add([]rune("in your years.")), + lut.Add([]rune("it's not the years in")), + lut.Add([]rune("not the years")), + lut.Add([]rune("not the years in your")), + lut.Add([]rune("end, it's")), + } + + lut.Compact() + + expect := "In the end, it's not the years in your life that count. It's the life in your years." + got := string(runes) + if got != expect { + t.Errorf("Compact result was not as expected\nExpected: '%v'\nGot: '%v'", expect, got) + } + expectedIndices := []int{ + 61, // the life in your + 31, // in your life that count + 0, // In the end, + 61, // the life in + 49, // count. It's the + 25, // years + 70, // in your years. + 12, // it's not the years in + 17, // not the years + 17, // not the years in your + 7, // end, it's + } + for i, p := range indices { + if expected, got := expectedIndices[i], *p; expected != got { + t.Errorf("Index %v was not expected. Expected %v, got %v", i, expected, got) + } + } +}