From c95e05784d4c66accdb0a66407d3019db032f326 Mon Sep 17 00:00:00 2001 From: Ben Clayton Date: Mon, 31 May 2021 09:48:48 +0000 Subject: [PATCH] Add cmd/intrinsic-gen lexer and tokens Ground work for the new intrinsic definition parser. Bug: tint:832 Change-Id: I341ae11e36ef7af96ce7d01609a96e2c02425e87 Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/52500 Kokoro: Kokoro Reviewed-by: David Neto --- tools/src/cmd/intrinsic-gen/lexer/lexer.go | 200 ++++++++++++++++++ .../src/cmd/intrinsic-gen/lexer/lexer_test.go | 144 +++++++++++++ tools/src/cmd/intrinsic-gen/tok/tok.go | 119 +++++++++++ 3 files changed, 463 insertions(+) create mode 100644 tools/src/cmd/intrinsic-gen/lexer/lexer.go create mode 100644 tools/src/cmd/intrinsic-gen/lexer/lexer_test.go create mode 100644 tools/src/cmd/intrinsic-gen/tok/tok.go diff --git a/tools/src/cmd/intrinsic-gen/lexer/lexer.go b/tools/src/cmd/intrinsic-gen/lexer/lexer.go new file mode 100644 index 0000000000..424dabb238 --- /dev/null +++ b/tools/src/cmd/intrinsic-gen/lexer/lexer.go @@ -0,0 +1,200 @@ +// Copyright 2021 The Tint Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lexer provides a basic lexer for the Tint intrinsic definition +// language +package lexer + +import ( + "fmt" + "unicode" + + "dawn.googlesource.com/tint/tools/src/cmd/intrinsic-gen/tok" +) + +// Lex produces a list of tokens for the given source code +func Lex(src []rune, filepath string) ([]tok.Token, error) { + l := lexer{ + tok.Location{Line: 1, Column: 1, Rune: 0, Filepath: filepath}, + src, + []tok.Token{}, + } + if err := l.lex(); err != nil { + return nil, err + } + return l.tokens, nil +} + +type lexer struct { + loc tok.Location + runes []rune + tokens []tok.Token +} + +// lex() lexes the source, populating l.tokens +func (l *lexer) lex() error { + for { + switch l.peek(0) { + case 0: + return nil + case ' ', '\t': + l.next() + case '\n': + l.next() + case '<': + l.tok(1, tok.Lt) + case '>': + l.tok(1, tok.Gt) + case '(': + l.tok(1, tok.Lparen) + case ')': + l.tok(1, tok.Rparen) + case '{': + l.tok(1, tok.Lbrace) + case '}': + l.tok(1, tok.Rbrace) + case ':': + l.tok(1, tok.Colon) + case ',': + l.tok(1, tok.Comma) + case '|': + l.tok(1, tok.Or) + case '"': + start := l.loc + l.next() // Skip opening quote + n := l.count(toFirst('\n', '"')) + if l.peek(n) != '"' { + return fmt.Errorf("%v unterminated string", start) + } + l.tok(n, tok.String) + l.next() // Skip closing quote + default: + switch { + case l.peek(1) == '/': + l.skip(l.count(toFirst('\n'))) + l.next() // Consume newline + case l.match("[[", tok.Ldeco): + case l.match("]]", tok.Rdeco): + case l.match("->", tok.Arrow): + case l.match("fn", tok.Function): + case l.match("enum", tok.Enum): + case l.match("type", tok.Type): + case l.match("match", tok.Match): + case unicode.IsLetter(l.peek(0)): + l.tok(l.count(alphaNumericOrUnderscore), tok.Identifier) + case unicode.IsNumber(l.peek(0)): + l.tok(l.count(unicode.IsNumber), tok.Integer) + default: + return fmt.Errorf("%v: unexpected '%v'", l.loc, string(l.runes[0])) + } + } + } +} + +// next() consumes and returns the next rune in the source, or 0 if reached EOF +func (l *lexer) next() rune { + if len(l.runes) > 0 { + r := l.runes[0] + l.runes = l.runes[1:] + l.loc.Rune++ + if r == '\n' { + l.loc.Line++ + l.loc.Column = 1 + } else { + l.loc.Column++ + } + return r + } + return 0 +} + +// skip() consumes the next `n` runes in the source +func (l *lexer) skip(n int) { + for i := 0; i < n; i++ { + l.next() + } +} + +// peek() returns the rune `i` runes ahead of the current position +func (l *lexer) peek(i int) rune { + if i >= len(l.runes) { + return 0 + } + return l.runes[i] +} + +// predicate is a function that can be passed to count() +type predicate func(r rune) bool + +// count() returns the number of sequential runes from the current position that +// match the predicate `p` +func (l *lexer) count(p predicate) int { + for i := 0; i < len(l.runes); i++ { + if !p(l.peek(i)) { + return i + } + } + return len(l.runes) +} + +// tok() appends a new token of kind `k` using the next `n` runes. +// The next `n` runes are consumed by tok(). +func (l *lexer) tok(n int, k tok.Kind) { + start := l.loc + runes := l.runes[:n] + l.skip(n) + end := l.loc + + src := tok.Source{S: start, E: end} + l.tokens = append(l.tokens, tok.Token{Kind: k, Source: src, Runes: runes}) +} + +// match() checks whether the next runes are equal to `s`. If they are, then +// these runes are used to append a new token of kind `k`, and match() returns +// true. If the next runes are not equal to `s` then false is returned, and no +// runes are consumed. +func (l *lexer) match(s string, kind tok.Kind) bool { + runes := []rune(s) + if len(l.runes) < len(runes) { + return false + } + for i, r := range runes { + if l.runes[i] != r { + return false + } + } + l.tok(len(runes), kind) + return true +} + +// toFirst() returns a predicate that returns true if the rune is not in `runes` +// toFirst() is intended to be used with count(), so `count(toFirst('x'))` will +// count up to, but not including the number of consecutive runes that are not +// 'x'. +func toFirst(runes ...rune) predicate { + return func(r rune) bool { + for _, t := range runes { + if t == r { + return false + } + } + return true + } +} + +// alphaNumericOrUnderscore() returns true if the rune `r` is a number, letter +// or underscore. +func alphaNumericOrUnderscore(r rune) bool { + return r == '_' || unicode.IsLetter(r) || unicode.IsNumber(r) +} diff --git a/tools/src/cmd/intrinsic-gen/lexer/lexer_test.go b/tools/src/cmd/intrinsic-gen/lexer/lexer_test.go new file mode 100644 index 0000000000..da354c45db --- /dev/null +++ b/tools/src/cmd/intrinsic-gen/lexer/lexer_test.go @@ -0,0 +1,144 @@ +// Copyright 2021 The Tint Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lexer_test + +import ( + "fmt" + "testing" + + "dawn.googlesource.com/tint/tools/src/cmd/intrinsic-gen/lexer" + "dawn.googlesource.com/tint/tools/src/cmd/intrinsic-gen/tok" +) + +func TestLexTokens(t *testing.T) { + type test struct { + src string + expect tok.Token + } + + filepath := "test.txt" + loc := func(l, c, r int) tok.Location { + return tok.Location{Line: l, Column: c, Rune: r, Filepath: filepath} + } + + for _, test := range []test{ + {"ident", tok.Token{Kind: tok.Identifier, Runes: []rune("ident"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 6, 5), + }}}, + {"ident_123", tok.Token{Kind: tok.Identifier, Runes: []rune("ident_123"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 10, 9), + }}}, + {"123456789", tok.Token{Kind: tok.Integer, Runes: []rune("123456789"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 10, 9), + }}}, + {"match", tok.Token{Kind: tok.Match, Runes: []rune("match"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 6, 5), + }}}, + {"fn", tok.Token{Kind: tok.Function, Runes: []rune("fn"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 3, 2), + }}}, + {"type", tok.Token{Kind: tok.Type, Runes: []rune("type"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 5, 4), + }}}, + {"enum", tok.Token{Kind: tok.Enum, Runes: []rune("enum"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 5, 4), + }}}, + {":", tok.Token{Kind: tok.Colon, Runes: []rune(":"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {",", tok.Token{Kind: tok.Comma, Runes: []rune(","), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"<", tok.Token{Kind: tok.Lt, Runes: []rune("<"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {">", tok.Token{Kind: tok.Gt, Runes: []rune(">"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"{", tok.Token{Kind: tok.Lbrace, Runes: []rune("{"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"}", tok.Token{Kind: tok.Rbrace, Runes: []rune("}"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"[[", tok.Token{Kind: tok.Ldeco, Runes: []rune("[["), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 3, 2), + }}}, + {"]]", tok.Token{Kind: tok.Rdeco, Runes: []rune("]]"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 3, 2), + }}}, + {"(", tok.Token{Kind: tok.Lparen, Runes: []rune("("), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {")", tok.Token{Kind: tok.Rparen, Runes: []rune(")"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"|", tok.Token{Kind: tok.Or, Runes: []rune("|"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {"->", tok.Token{Kind: tok.Arrow, Runes: []rune("->"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 3, 2), + }}}, + {"x // y ", tok.Token{Kind: tok.Identifier, Runes: []rune("x"), Source: tok.Source{ + S: loc(1, 1, 0), E: loc(1, 2, 1), + }}}, + {`"abc"`, tok.Token{Kind: tok.String, Runes: []rune("abc"), Source: tok.Source{ + S: loc(1, 2, 1), E: loc(1, 5, 4), + }}}, + {` + // + ident + + `, tok.Token{Kind: tok.Identifier, Runes: []rune("ident"), Source: tok.Source{ + S: loc(3, 4, 10), E: loc(3, 9, 15), + }}}, + } { + got, err := lexer.Lex([]rune(test.src), filepath) + name := fmt.Sprintf(`Lex("%v")`, test.src) + switch { + case err != nil: + t.Errorf("%v returned error: %v", name, err) + case len(got) != 1: + t.Errorf("%v returned %d tokens: %v", name, len(got), got) + case got[0].Kind != test.expect.Kind: + t.Errorf(`%v returned unexpected token kind: got "%+v", expected "%+v"`, name, got[0], test.expect) + case string(got[0].Runes) != string(test.expect.Runes): + t.Errorf(`%v returned unexpected token runes: got "%+v", expected "%+v"`, name, string(got[0].Runes), string(test.expect.Runes)) + case got[0].Source != test.expect.Source: + t.Errorf(`%v returned unexpected token source: got %+v, expected %+v`, name, got[0].Source, test.expect.Source) + } + } +} + +func TestErrors(t *testing.T) { + type test struct { + src string + expect string + } + + for _, test := range []test{ + {" \"abc", "test.txt:1:2 unterminated string"}, + {" \"abc\n", "test.txt:1:2 unterminated string"}, + {"*", "test.txt:1:1: unexpected '*'"}, + } { + got, err := lexer.Lex([]rune(test.src), "test.txt") + if gotErr := err.Error(); test.expect != gotErr { + t.Errorf(`Lex() returned error "%+v", expected error "%+v"`, gotErr, test.expect) + } + if got != nil { + t.Errorf("Lex() returned non-nil for error") + } + } +} diff --git a/tools/src/cmd/intrinsic-gen/tok/tok.go b/tools/src/cmd/intrinsic-gen/tok/tok.go new file mode 100644 index 0000000000..c15a2359b9 --- /dev/null +++ b/tools/src/cmd/intrinsic-gen/tok/tok.go @@ -0,0 +1,119 @@ +// Copyright 2021 The Tint Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tok defines tokens that are produced by the Tint intrinsic definition +// lexer +package tok + +import "fmt" + +// Kind is an enumerator of token kinds +type Kind string + +// Token enumerator types +const ( + InvalidToken Kind = "" + Identifier Kind = "ident" + Integer Kind = "integer" + String Kind = "string" + Match Kind = "match" + Function Kind = "fn" + Type Kind = "type" + Enum Kind = "enum" + Colon Kind = ":" + Comma Kind = "," + Lt Kind = "<" + Gt Kind = ">" + Lbrace Kind = "{" + Rbrace Kind = "}" + Ldeco Kind = "[[" + Rdeco Kind = "]]" + Lparen Kind = "(" + Rparen Kind = ")" + Or Kind = "|" + Arrow Kind = "->" +) + +// Invalid represents an invalid token +var Invalid = Token{Kind: InvalidToken} + +// Location describes a rune location in the source code +type Location struct { + // 1-based line index + Line int + // 1-based column index + Column int + // 0-based rune index + Rune int + // Optional file path + Filepath string +} + +// Format implements the fmt.Formatter interface +func (l Location) Format(w fmt.State, verb rune) { + if w.Flag('+') { + if l.Filepath != "" { + fmt.Fprintf(w, "%v:%v:%v[%v]", l.Filepath, l.Line, l.Column, l.Rune) + } else { + fmt.Fprintf(w, "%v:%v[%v]", l.Line, l.Column, l.Rune) + } + } else { + if l.Filepath != "" { + fmt.Fprintf(w, "%v:%v:%v", l.Filepath, l.Line, l.Column) + } else { + fmt.Fprintf(w, "%v:%v", l.Line, l.Column) + } + } +} + +// Source describes a start and end range in the source code +type Source struct { + S, E Location +} + +// IsValid returns true if the source is valid +func (s Source) IsValid() bool { + return s.S.Line != 0 && s.S.Column != 0 && s.E.Line != 0 && s.E.Column != 0 +} + +// Format implements the fmt.Formatter interface +func (s Source) Format(w fmt.State, verb rune) { + if w.Flag('+') { + fmt.Fprint(w, "[") + s.S.Format(w, verb) + fmt.Fprint(w, " - ") + s.E.Format(w, verb) + fmt.Fprint(w, "]") + } else { + s.S.Format(w, verb) + } +} + +// Token describes a parsed token +type Token struct { + Kind Kind + Runes []rune + Source Source +} + +// Format implements the fmt.Formatter interface +func (t Token) Format(w fmt.State, verb rune) { + fmt.Fprint(w, "[") + t.Source.Format(w, verb) + fmt.Fprint(w, " ") + fmt.Fprint(w, t.Kind) + fmt.Fprint(w, " ") + fmt.Fprint(w, string(t.Runes)) + fmt.Fprint(w, "]") +}