246 lines
7.0 KiB
Go
246 lines
7.0 KiB
Go
// Copyright 2021 The Tint Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Package lut provides a look up table, which compresses indexed data
|
|
package lut
|
|
|
|
import (
|
|
"sort"
|
|
|
|
"dawn.googlesource.com/dawn/tools/src/list"
|
|
)
|
|
|
|
// LUT is a look up table.
|
|
// The table holds a number of items that are stored in a linear list.
|
|
type LUT interface {
|
|
// Add adds a sequence of items to the table.
|
|
// items can be a single element, a slice of element, or a List of element.
|
|
// Returns a pointer to the offset of the first item in the table's list.
|
|
// The sequence of items stored at [offset, offset+N), where N is the
|
|
// number of items added will remain equal, even after calling Compact().
|
|
Add(items interface{}) *int
|
|
// Compact reorders the table items so that the table storage is compacted
|
|
// by shuffling data around and de-duplicating sequences of common data.
|
|
// Each originally added sequence is preserved in the resulting table, with
|
|
// the same contiguous ordering, but with a potentially different offset.
|
|
// Heuristics are used to shorten the table length, by exploiting common
|
|
// subsequences, and removing duplicate sequences.
|
|
// Note that shortest common superstring is NP-hard, so heuristics are used.
|
|
// Compact updates pointers returned by Add().
|
|
Compact()
|
|
}
|
|
|
|
// New returns a new look up table
|
|
func New(storage list.List) LUT {
|
|
return &lut{storage: storage}
|
|
}
|
|
|
|
// A sequence represents a span of entries in the table
|
|
type sequence struct {
|
|
offset *int // Pointer to the start index of the sequence
|
|
count int // Length of the sequence
|
|
}
|
|
|
|
// lut implements LUT
|
|
type lut struct {
|
|
storage list.List // The List that backs this LUT
|
|
sequences []sequence // The entries in the LUT
|
|
}
|
|
|
|
func (t *lut) Add(items interface{}) *int {
|
|
offset := t.storage.Count()
|
|
t.storage.Append(items)
|
|
count := t.storage.Count() - offset
|
|
offsetPtr := &offset
|
|
t.sequences = append(t.sequences, sequence{offsetPtr, count})
|
|
return offsetPtr
|
|
}
|
|
|
|
func (t lut) Compact() {
|
|
// Generate int32 identifiers for each unique item in the table.
|
|
// We use these to compare items instead of comparing the real data as this
|
|
// function is comparison-heavy, and integer compares are cheap.
|
|
srcIDs := t.itemIDs()
|
|
dstIDs := make([]int32, len(srcIDs))
|
|
|
|
// Make a copy the data held in the table, use the copy as the source, and
|
|
// t.storage as the destination.
|
|
srcData := list.Copy(t.storage)
|
|
dstData := t.storage
|
|
|
|
// Sort all the sequences by length, with the largest first.
|
|
// This helps 'seed' the compacted form with the largest items first.
|
|
// This can improve the compaction as small sequences can pack into larger,
|
|
// placed items.
|
|
sort.Slice(t.sequences, func(i, j int) bool {
|
|
return t.sequences[i].count > t.sequences[j].count
|
|
})
|
|
|
|
// unplaced is the list of sequences that have not yet been placed.
|
|
// All sequences are initially unplaced.
|
|
unplaced := make([]sequence, len(t.sequences))
|
|
copy(unplaced, t.sequences)
|
|
|
|
// placed is the list of sequences that have been placed.
|
|
// Nothing is initially placed.
|
|
placed := make([]sequence, 0, len(t.sequences))
|
|
|
|
// remove removes the sequence in unplaced with the index i.
|
|
remove := func(i int) {
|
|
placed = append(placed, unplaced[i])
|
|
if i > 0 {
|
|
if i < len(unplaced)-1 {
|
|
copy(unplaced[i:], unplaced[i+1:])
|
|
}
|
|
unplaced = unplaced[:len(unplaced)-1]
|
|
} else {
|
|
unplaced = unplaced[1:]
|
|
}
|
|
}
|
|
|
|
// cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count].
|
|
cp := func(dstOffset, srcOffset, count int) {
|
|
dstData.CopyFrom(srcData, dstOffset, srcOffset, count)
|
|
copy(
|
|
dstIDs[dstOffset:dstOffset+count],
|
|
srcIDs[srcOffset:srcOffset+count],
|
|
)
|
|
}
|
|
|
|
// match describes a sequence that can be placed.
|
|
type match struct {
|
|
dst int // destination offset
|
|
src sequence // source sequence
|
|
len int // number of items that matched
|
|
idx int // sequence index
|
|
}
|
|
|
|
// number of items that have been placed.
|
|
newSize := 0
|
|
|
|
// While there's sequences to place...
|
|
for len(unplaced) > 0 {
|
|
// Place the next largest, unplaced sequence at the end of the new list
|
|
cp(newSize, *unplaced[0].offset, unplaced[0].count)
|
|
*unplaced[0].offset = newSize
|
|
newSize += unplaced[0].count
|
|
remove(0)
|
|
|
|
for {
|
|
// Look for the sequence with the longest match against the
|
|
// currently placed data. Any mismatches with currently placed data
|
|
// will nullify the match. The head or tail of this sequence may
|
|
// extend the currently placed data.
|
|
best := match{}
|
|
|
|
// For each unplaced sequence...
|
|
for i := 0; i < len(unplaced); i++ {
|
|
seq := unplaced[i]
|
|
|
|
if best.len >= seq.count {
|
|
// The best match is already at least as long as this
|
|
// sequence and sequences are sorted by size, so best cannot
|
|
// be beaten. Stop searching.
|
|
break
|
|
}
|
|
|
|
// Perform a full sweep from left to right, scoring the match...
|
|
for shift := -seq.count + 1; shift < newSize; shift++ {
|
|
dstS := max(shift, 0)
|
|
dstE := min(shift+seq.count, newSize)
|
|
count := dstE - dstS
|
|
srcS := *seq.offset - min(shift, 0)
|
|
srcE := srcS + count
|
|
|
|
if best.len < count {
|
|
if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) {
|
|
best = match{shift, seq, count, i}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if best.src.offset == nil {
|
|
// Nothing matched. Not even one element.
|
|
// Resort to placing the next largest sequence at the end.
|
|
break
|
|
}
|
|
|
|
if best.dst < 0 {
|
|
// Best match wants to place the sequence to the left of the
|
|
// current output. We have to shuffle everything...
|
|
n := -best.dst
|
|
dstData.Copy(n, 0, newSize)
|
|
copy(dstIDs[n:n+newSize], dstIDs)
|
|
newSize += n
|
|
best.dst = 0
|
|
for _, p := range placed {
|
|
*p.offset += n
|
|
}
|
|
}
|
|
|
|
// Place the best matching sequence.
|
|
cp(best.dst, *best.src.offset, best.src.count)
|
|
newSize = max(newSize, best.dst+best.src.count)
|
|
*best.src.offset = best.dst
|
|
remove(best.idx)
|
|
}
|
|
}
|
|
|
|
// Shrink the output buffer to the new size.
|
|
dstData.Resize(newSize)
|
|
|
|
// All done.
|
|
}
|
|
|
|
// Generate a set of identifiers for all the unique items in storage
|
|
func (t lut) itemIDs() []int32 {
|
|
storageSize := t.storage.Count()
|
|
keys := make([]int32, storageSize)
|
|
dataToKey := map[interface{}]int32{}
|
|
for i := 0; i < storageSize; i++ {
|
|
data := t.storage.Get(i)
|
|
key, found := dataToKey[data]
|
|
if !found {
|
|
key = int32(len(dataToKey))
|
|
dataToKey[data] = key
|
|
}
|
|
keys[i] = key
|
|
}
|
|
return keys
|
|
}
|
|
|
|
func max(a, b int) int {
|
|
if a < b {
|
|
return b
|
|
}
|
|
return a
|
|
}
|
|
|
|
func min(a, b int) int {
|
|
if a > b {
|
|
return b
|
|
}
|
|
return a
|
|
}
|
|
|
|
func equal(a, b []int32) bool {
|
|
for i, v := range a {
|
|
if b[i] != v {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|