// Copyright 2022 The Dawn Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cov

import (
	"log"
	"sort"
	"sync"
)

// Optimize optimizes the Tree by de-duplicating common spans into a tree of SpanGroups.
//
// Breaking down tests into group hierarchies provides a natural way to
// structure coverage data, as tests of the same suite, file or test are likely
// to have similar coverage spans.
//
// For each source file in the codebase, we create a tree of SpanGroups, where the
// leaves are the test cases.
//
// For example, given the following Paths:
//
//	a.b.d.h
//	a.b.d.i.n
//	a.b.d.i.o
//	a.b.e.j
//	a.b.e.k.p
//	a.b.e.k.q
//	a.c.f
//	a.c.g.l.r
//	a.c.g.m
//
// We would construct the following tree:
//
//	             a
//	      ╭──────┴──────╮
//	      b             c
//	  ╭───┴───╮     ╭───┴───╮
//	  d       e     f       g
//	╭─┴─╮   ╭─┴─╮         ╭─┴─╮
//	h   i   j   k         l   m
//	   ╭┴╮     ╭┴╮        │
//	   n o     p q        r
//
// Each leaf node in this tree (`h`, `n`, `o`, `j`, `p`, `q`, `f`, `r`, `m`)
// represents a test case, and the non-leaf nodes (`a`, `b`, `c`, `d`, `e`, `g`,
// `i`, `k`, `l`) are suites, files or tests.
//
// To begin, we create a test tree structure, and associate the full list of test
// coverage spans with every leaf node (test case) in this tree.
//
// This data structure hasn't given us any compression benefits yet, but we can
// now do a few tricks to dramatically reduce the number of spans needed to
// describe the graph:
//
//	~ Optimization 1: Common span promotion ~
//
// The first compression scheme is to promote common spans up the tree when they
// are common for all children. This will reduce the number of spans needed to be
// encoded in the final file.
//
// For example, if the test group `a` has 4 children that all share the same span
// `X`:
//
//	         a
//	   ╭───┬─┴─┬───╮
//	   b   c   d   e
//	[X,Y] [X] [X] [X,Z]
//
// Then span `X` can be promoted up to `a`:
//
//	      [X]
//	       a
//	 ╭───┬─┴─┬───╮
//	 b   c   d   e
//	[Y] []   [] [Z]
//
//	~ Optimization 2: Span XOR promotion ~
//
// This idea can be extended further, by not requiring all the children to share
// the same span before promotion. If *most* child nodes share the same span, we
// can still promote the span, but this time we *remove* the span from the
// children *if they had it*, and *add* the span to children *if they didn't
// have it*.
//
// For example, if the test group `a` has 4 children with 3 that share the span
// `X`:
//
//	         a
//	   ╭───┬─┴─┬───╮
//	   b   c   d   e
//	[X,Y] [X]  [] [X,Z]
//
// Then span `X` can be promoted up to `a` by flipping the presence of `X` on the
// child nodes:
//
//	      [X]
//	       a
//	 ╭───┬─┴─┬───╮
//	 b   c   d   e
//	[Y] []  [X] [Z]
//
// This process repeats up the tree.
//
// With this optimization applied, we now need to traverse the tree from root to
// leaf in order to know whether a given span is in use for the leaf node (test case):
//
// * If the span is encountered an *odd* number of times during traversal, then
// the span is *covered*.
// * If the span is encountered an *even* number of times during traversal, then
// the span is *not covered*.
//
// See tools/src/cov/coverage_test.go for more examples of this optimization.
//
//	~ Optimization 3: Common span grouping ~
//
// With real world data, we encounter groups of spans that are commonly found
// together. To further reduce coverage data, the whole graph is scanned for
// common span patterns, which are then indexed by each tree node.
// The XOR'ing of spans as described above is performed as if the spans were not
// grouped.
//
//	~ Optimization 4: Lookup tables ~
//
// All spans, span-groups and strings are stored in de-duplicated tables, and are
// indexed wherever possible.
func (t *Tree) Optimize() {
	log.Printf("Optimizing coverage tree...")

	// Every file is processed on its own goroutine; Optimize only returns
	// once all of them have finished.
	var pending sync.WaitGroup
	for _, file := range t.files {
		file := file // per-iteration copy captured by the goroutine below
		pending.Add(1)
		go func() {
			defer pending.Done()

			var opt optimizer

			// First run the span inversion pass over each top-level coverage
			// node of the file, paired with the matching child of the test
			// root...
			for rootIdx, coverage := range file.tcm {
				opt.invertForCommon(coverage, &t.testRoot.children[rootIdx])
			}

			// ...then replace the resulting span sets with span groups.
			opt.createGroups(file)
		}()
	}
	pending.Wait()
}

// optimizer is a field-less receiver type for the optimization passes
// (invertForCommon and createGroups) that Tree.Optimize runs on each file.
type optimizer struct{}

// createGroups replaces the SpanSets stored on the TestCoverage nodes of f
// with references to shared, indexable SpanGroups.
//
// The pass works in four steps:
//
//  1. Every distinct SpanSet (as identified by its String() form) that holds
//     at least minSpansInGroup spans is collected. If there are none, f is
//     left untouched.
//  2. The collected sets are ordered from most spans to fewest, with ties
//     broken by comparing their List() forms so that the order is stable.
//     The position in this order becomes the SpanGroupID.
//  3. Each set is checked against the sets that follow it. The first one that
//     is strictly smaller and for which containsAll reports true becomes the
//     group's Extend, and the group's own Spans become the result of
//     removeAll with that set.
//  4. f.spangroups is replaced with the new groups, and every TestCoverage
//     node whose SpanSet was collected has its Spans cleared and its Group
//     pointed at the corresponding ID.
func (o *optimizer) createGroups(f *treeFile) {
	const minSpansInGroup = 2

	type setKey string

	// Step 1: gather one representative of each distinct, large-enough set.
	unique := map[setKey]SpanSet{}
	f.tcm.traverse(func(tc *TestCoverage) {
		if len(tc.Spans) < minSpansInGroup {
			return
		}
		k := setKey(tc.Spans.String())
		if _, seen := unique[k]; seen {
			return
		}
		unique[k] = tc.Spans
	})
	if len(unique) == 0 {
		return // Nothing worth grouping.
	}

	type candidate struct {
		key   setKey
		spans SpanSet // the complete, unfolded set
		group SpanGroup
		id    SpanGroupID
	}
	candidates := make([]*candidate, 0, len(unique))
	for k, spans := range unique {
		candidates = append(candidates, &candidate{
			key:   k,
			spans: spans,
			group: SpanGroup{Spans: spans},
		})
	}

	// Step 2: largest sets first. Map iteration order is random, so equally
	// sized sets are ordered by content to keep the output stable.
	sort.Slice(candidates, func(i, j int) bool {
		lhs, rhs := candidates[i].spans, candidates[j].spans
		if len(lhs) != len(rhs) {
			return len(lhs) > len(rhs)
		}
		return lhs.List().Compare(rhs.List()) == -1
	})
	for idx, c := range candidates {
		c.id = SpanGroupID(idx)
	}

	// Step 3: try to express each set in terms of a smaller one that comes
	// later in the list. The last candidate has nothing after it, so it is
	// skipped. This is O(n^2) in the number of candidates.
	for i, big := range candidates[:len(candidates)-1] {
		for _, small := range candidates[i+1:] {
			if len(big.spans) <= len(small.spans) || !big.spans.containsAll(small.spans) {
				continue
			}
			// Copy the ID so that Extend gets a variable of its own rather
			// than pointing into another candidate.
			base := small.id
			big.group.Spans = big.spans.removeAll(small.spans)
			big.group.Extend = &base
			break // The first match wins; move on to the next candidate.
		}
	}

	// Step 4: publish the groups on the file, and index the candidates by
	// key so that the uses can be rewritten below.
	byKey := make(map[setKey]*candidate, len(candidates))
	f.spangroups = make(map[SpanGroupID]SpanGroup, len(candidates))
	for _, c := range candidates {
		byKey[c.key] = c
		f.spangroups[c.id] = c.group
	}

	f.tcm.traverse(func(tc *TestCoverage) {
		c, found := byKey[setKey(tc.Spans.String())]
		if !found {
			return
		}
		tc.Spans = nil
		// All nodes that use the same group share this pointer.
		tc.Group = &c.id
	})
}

// invertForCommon promotes spans that are held by the majority of a node's
// children up to the node itself.
//
// tc is the coverage node being processed and t is the matching node of the
// test tree. The children of tc are processed first, each on its own
// goroutine, so promotion happens bottom-up. Then, for every span that appears
// on more than half of t's children, the span is inverted on tc and on the
// coverage child of every index in t.indices. Children that report
// deletable() after the inversion are removed from tc.Children.
func (o *optimizer) invertForCommon(tc *TestCoverage, t *Test) {
	// Recurse into all children concurrently and wait for them to finish
	// before looking at their spans.
	var pending sync.WaitGroup
	for idx, sub := range tc.Children {
		idx, sub := idx, sub // per-iteration copies captured by the goroutine
		pending.Add(1)
		go func() {
			defer pending.Done()
			o.invertForCommon(sub, &t.children[idx])
		}()
	}
	pending.Wait()

	// Tally how many children currently hold each span.
	counts := map[SpanID]int{}
	for _, sub := range tc.Children {
		for span := range sub.Spans {
			counts[span]++
		}
	}

	// The threshold is measured against the number of children in the test
	// tree (t.children), not the number of coverage children (tc.Children).
	half := len(t.children) / 2
	for span, n := range counts {
		if n <= half {
			continue
		}
		tc.Spans = tc.Spans.invert(span)
		// Flip the span on every child index known to the test tree node.
		// NOTE(review): index() presumably creates a missing coverage child so
		// that the span can be added to it; confirm against its definition.
		for _, idx := range t.indices {
			sub := tc.Children.index(idx)
			sub.Spans = sub.Spans.invert(span)
			if sub.deletable() {
				delete(tc.Children, idx)
			}
		}
	}
}