From a00aea29f46fd3c9ab295990169815774aafa388 Mon Sep 17 00:00:00 2001
From: Brandon Dyck
Date: Mon, 2 Sep 2024 11:04:00 -0600
Subject: [PATCH] Rename SpotReader, remove cruft, and move helpers

---
 cursor/cursor.go     |  95 ++++++++++++++++++++++++++++++++++++
 cursor/helper.go     |  38 +++++++++++++++
 gigaparsec.go        |  96 ++++++++++++++++++++++++++++++++++++
 spotreader/reader.go | 113 -------------------------------------------
 4 files changed, 229 insertions(+), 113 deletions(-)
 create mode 100644 cursor/cursor.go
 create mode 100644 cursor/helper.go
 create mode 100644 gigaparsec.go
 delete mode 100644 spotreader/reader.go

diff --git a/cursor/cursor.go b/cursor/cursor.go
new file mode 100644
index 0000000..c998088
--- /dev/null
+++ b/cursor/cursor.go
@@ -0,0 +1,95 @@
+package cursor
+
+import (
+	"io"
+)
+
+// Cursor reads data from a specific spot in a data source.
+type Cursor[Datum any] interface {
+	// I almost parameterized Cursor by its implementation (i.e. the Curiously
+	// Recurring Template Pattern), but then each parser would need that
+	// parameter. That might work well in a language with much stronger type
+	// inference, but not in Go. The upside would have been that for each
+	// implementation Impl, Impl.Read could have returned an unboxed Impl,
+	// which would have slightly simplified testing and maybe slightly reduced
+	// allocs.
+
+	// Read fills dst with data from this Cursor's position in the underlying
+	// source. It returns the number of data it read and a new Cursor for the
+	// position at which the read ended, or an error if the read failed.
+	// All calls to a given Cursor will return data from the same position.
+	// If n < len(dst), Read will return an error explaining why it read fewer
+	// data than requested. If Read tried to read past the end of the source,
+	// err will be io.EOF.
+	Read(dst []Datum) (n uint64, next Cursor[Datum], err error)
+
+	// Pos returns the Cursor's position within the source.
+	Pos() uint64
+}
+
+// SliceCursor is a Cursor backed by an in-memory slice.
+type SliceCursor[Datum any] struct {
+	data   []Datum
+	offset uint64
+}
+
+func NewSlice[Datum any]([]Datum) SliceCursor[Datum] { panic("not implemented") }
+
+func (sc SliceCursor[Datum]) Read(dst []Datum) (n uint64, next Cursor[Datum], err error) {
+	copied := copy(dst, sc.data[sc.offset:])
+	if copied < len(dst) {
+		err = io.EOF
+	}
+	n = uint64(copied)
+	sc.offset += n
+	return n, sc, err
+}
+
+func (sc SliceCursor[Datum]) Pos() uint64 {
+	return sc.offset
+}
+
+// ReaderAtCursor is a Cursor[byte] backed by an io.ReaderAt.
+type ReaderAtCursor struct {
+	r   io.ReaderAt
+	pos uint64
+}
+
+func NewReaderAt(r io.ReaderAt) ReaderAtCursor {
+	return ReaderAtCursor{r: r}
+}
+
+func (rac ReaderAtCursor) Read(dst []byte) (uint64, Cursor[byte], error) {
+	n, err := rac.r.ReadAt(dst, int64(rac.pos))
+	if n > 0 {
+		rac.pos += uint64(n)
+	}
+	return uint64(n), rac, err
+}
+
+func (rac ReaderAtCursor) Pos() uint64 {
+	return rac.pos
+}
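+
+// ExampleSliceCursor is an editor's illustration, not part of the original
+// change. It sketches the Cursor contract described above: Read never mutates
+// its receiver, so holding on to an old cursor value is all the backtracking
+// a caller needs. The SliceCursor is built as a struct literal only because
+// NewSlice is still a stub.
+func ExampleSliceCursor() {
+	start := SliceCursor[byte]{data: []byte("parse me")}
+	buf := make([]byte, 5)
+	_, next, _ := start.Read(buf) // copies "parse" into buf; start is unchanged
+	_, _, _ = next.Read(buf)      // copies " me" and returns io.EOF
+	_, _, _ = start.Read(buf)     // backtracking: copies "parse" again
+}
+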
+// StringCursor is identical to SliceCursor[byte], but uses a string as its
+// data source. The advantage is that creating a StringCursor does not require
+// copying the source string into a []byte.
+type StringCursor struct {
+	source string
+	offset uint64
+}
+
+func NewString(s string) StringCursor {
+	return StringCursor{source: s}
+}
+
+func (sc StringCursor) Read(dst []byte) (n uint64, next Cursor[byte], err error) {
+	copied := copy(dst, sc.source[sc.offset:])
+	if copied < len(dst) {
+		err = io.EOF
+	}
+	n = uint64(copied)
+	sc.offset += n
+	return n, sc, err
+}
+
+func (sc StringCursor) Pos() uint64 {
+	return sc.offset
+}
diff --git a/cursor/helper.go b/cursor/helper.go
new file mode 100644
index 0000000..a5eaa72
--- /dev/null
+++ b/cursor/helper.go
@@ -0,0 +1,38 @@
+package cursor
+
+import "io"
+
+// BufferedReaderAt uses a buffer to supplement an io.Reader
+// with limited backward seeking.
+type BufferedReaderAt struct{}
+
+func NewBufferedReaderAt(r io.Reader, minBuffer uint64) *BufferedReaderAt
+
+// ReadAt reads bytes from the underlying reader. If the offset is after the
+// end of the buffer, ReadAt will first read and ignore bytes from the
+// underlying reader until it reaches the offset. If the offset is before the
+// start of the buffer, ReadAt will return an error.
+//
+// If your parser needs unlimited lookahead, you should probably just read the
+// whole input into a slice and use a SliceCursor.
+func (b *BufferedReaderAt) ReadAt(dst []byte, offset int64) (int, error)
+
+// RuneReader adapts a Cursor[byte] to io.Reader, for eventual compatibility
+// with the regexp package (which consumes an io.RuneReader).
+type RuneReader struct {
+	cursor Cursor[byte]
+}
+
+func NewRuneReader(c Cursor[byte]) *RuneReader {
+	return &RuneReader{cursor: c}
+}
+
+func (r *RuneReader) Read(dst []byte) (int, error) {
+	n, c, err := r.cursor.Read(dst)
+	r.cursor = c
+	return int(n), err
+}
+
+// Cursor returns the wrapped Cursor at its current position, so the caller
+// can continue parsing from wherever reading stopped.
+func (r *RuneReader) Cursor() Cursor[byte] {
+	return r.cursor
+}
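+
+// ExampleRuneReader is an editor's illustration, not part of the original
+// change. It assumes only the exported API above and shows that reading
+// through a RuneReader advances the wrapped Cursor, whose position can be
+// recovered afterwards via Cursor().
+func ExampleRuneReader() {
+	r := NewRuneReader(NewString("hello world"))
+	buf := make([]byte, 5)
+	_, _ = r.Read(buf)   // copies "hello" into buf
+	_ = r.Cursor().Pos() // 5: the wrapped cursor has moved past the bytes read
+}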
diff --git a/gigaparsec.go b/gigaparsec.go
new file mode 100644
index 0000000..e2a43c1
--- /dev/null
+++ b/gigaparsec.go
@@ -0,0 +1,96 @@
+package gigaparsec
+
+import (
+	"errors"
+	"fmt"
+)
+
+// TODO Everything. See https://smunix.github.io/dev.stephendiehl.com/fun/002_parsers.html.
+
+var ErrNoParse = errors.New("no parse")
+
+type Result[T any] struct {
+	Value T
+	State
+	Message
+}
+
+type Message struct {
+	Pos
+	Msg      string
+	Expected []string
+}
+
+type ParseError struct {
+	Message
+}
+
+func (pe ParseError) Error() string {
+	return fmt.Sprintf("parse error: %d: %s", pe.Message.Pos, pe.Message.Msg)
+}
+
+type Pos uint64
+
+type State struct {
+	Pos
+	Input []byte
+}
+
+// Parser consumes input from a State and produces a reply. The consumed flag
+// reports whether the parser consumed any input, whether or not it succeeded.
+type Parser[T any] func(State) (consumed bool, reply Result[T], err error)
+
+func Return[T any](value T) Parser[T] {
+	return func(state State) (bool, Result[T], error) {
+		return false, Result[T]{Value: value, State: state}, nil
+	}
+}
+
+func Satisfy(pred func(byte) bool) Parser[byte] {
+	return func(state State) (bool, Result[byte], error) {
+		if len(state.Input) == 0 {
+			return false, Result[byte]{}, ErrNoParse
+		}
+		b := state.Input[0]
+		if pred(b) {
+			next := State{Pos: state.Pos + 1, Input: state.Input[1:]}
+			return true, Result[byte]{Value: b, State: next}, nil
+		}
+		return false, Result[byte]{}, ErrNoParse
+	}
+}
+
+func Bind[A, B any](p Parser[A], f func(A) Parser[B]) Parser[B] {
+	return func(input State) (bool, Result[B], error) {
+		consumed, replyA, err := p(input)
+		if err != nil {
+			// A failure after consuming input stays consumed, so Choose will
+			// not backtrack past it; wrap with Try to allow that.
+			return consumed, Result[B]{}, err
+		}
+		consumed2, replyB, err := f(replyA.Value)(replyA.State)
+		return consumed || consumed2, replyB, err
+	}
+}
+
+func Choose[A any](p, q Parser[A]) Parser[A] {
+	return func(input State) (bool, Result[A], error) {
+		consumedP, replyP, errP := p(input)
+		if consumedP {
+			return consumedP, replyP, errP
+		}
+		if errP != nil {
+			return q(input)
+		}
+		consumedQ, replyQ, errQ := q(input)
+		if consumedQ {
+			return consumedQ, replyQ, errQ
+		}
+		return consumedP, replyP, errP
+	}
+}
+
+func Try[A any](p Parser[A]) Parser[A] {
+	return func(input State) (bool, Result[A], error) {
+		consumed, reply, err := p(input)
+		if err != nil {
+			return false, Result[A]{}, err
+		}
+		return consumed, reply, err
+	}
+}
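+
+// ExampleBind is an editor's sketch, not part of the original change. It
+// assumes only the combinators defined above and shows how they compose: Bind
+// sequences parsers, Return injects a pure value, and Choose falls back to
+// its second parser when the first fails without consuming input.
+func ExampleBind() {
+	digit := Satisfy(func(b byte) bool { return '0' <= b && b <= '9' })
+	letter := Satisfy(func(b byte) bool { return 'a' <= b && b <= 'z' })
+	// Parse a letter followed by a digit, keeping both.
+	pair := Bind(letter, func(l byte) Parser[[2]byte] {
+		return Bind(digit, func(d byte) Parser[[2]byte] {
+			return Return([2]byte{l, d})
+		})
+	})
+	// Accept either the letter-digit pair or a lone digit.
+	lone := Bind(digit, func(d byte) Parser[[2]byte] { return Return([2]byte{d}) })
+	_, reply, err := Choose(pair, lone)(State{Input: []byte("a7rest")})
+	_ = reply.Value // [2]byte{'a', '7'}
+	_ = err         // nil; reply.State.Pos is 2 and reply.State.Input holds "rest"
+}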
diff --git a/spotreader/reader.go b/spotreader/reader.go
deleted file mode 100644
index 6cab634..0000000
--- a/spotreader/reader.go
+++ /dev/null
@@ -1,113 +0,0 @@
-package spotreader
-
-import (
-	"io"
-	"iter"
-)
-
-type ReaderSource struct {
-	io.ReadSeeker
-}
-
-// BufferedReadSeeker uses a buffer to supplement an io.Reader
-// with limited backward seeking.
-type BufferedReadSeeker struct{}
-
-func NewBufferedReadSeeker(r io.Reader, minBuffer uint64) *BufferedReadSeeker
-
-// Read reads bytes from the underlying reader. If the current offset is after
-// the end of the buffer, Read will first read and ignore bytes from the
-// underlying reader until it reaches the offset. If the current offset is
-// before the start of the buffer, Read will return an error.
-//
-// If your parser needs unlimited lookahead, you should probably
-// just read the whole input into a slice and use BytesSpotReader.
-func (b *BufferedReadSeeker) Read([]byte) (int, error)
-
-func (b *BufferedReadSeeker) Seek(offset int64, whence int) (int64, error)
-
-// SpotReader reads data from a specific spot in a stream.
-type SpotReader[Datum any] interface {
-	// Read returns n data from this SpotReader's position in the underlying
-	// stream. It returns the data and a new SpotReader for the position at which
-	// the read ended, or an error if the read failed.
-	// All calls to a given SpotReader will return data from the same position.
-	Read(n uint64) ([]Datum, SpotReader[Datum], error)
-
-	// Pos returns the SpotReader's position within the stream.
-	Pos() int64
-}
-
-// TODO Consider parameterizing SpotReader by its implementation so that Read
-// doesn't have to box the next SpotReader:
-type UnboxedSpotReader[Datum any, Impl any] interface {
-	Read(n uint64) ([]Datum, Impl, error)
-	Pos() int64
-}
-
-// FakeSpotReader is an example of an UnboxedSpotReader.
-// This style would only be worth using after pretty solid benchmarking.
-// If this doesn't lower allocs, then I could also try parameterizing
-// parsers by type constrained by UnboxedSpotReader, but that would make
-// the user write a lot of hideous type signatures.
-type FakeSpotReader[Datum any] struct{}
-
-func (f FakeSpotReader[Datum]) Read(uint64) ([]Datum, SpotReader[Datum], error)
-func (f FakeSpotReader[Datum]) Pos() int64
-
-func ExampleFakeSpotReader() {
-	var sr1 SpotReader[int] = FakeSpotReader[int]{}
-	var sr2 SpotReader[int]
-	_, sr2, _ = sr1.Read(0)
-	sr2.Pos()
-}
-
-// SeqSpotReader as backed by a sequence of values of some type.
-// It is intended for use with a concurrent lexing pass.
-// TODO Since this will probably be handling tokens one at a time,
-// consider using a circular buffer.
-type SeqSpotReader[Datum any] struct{}
-
-func NewSeq[Datum any](seq iter.Seq[Datum], buflen uint) SeqSpotReader[Datum] {
-	panic("not implemented")
-}
-
-type SliceSpotReader struct{}
-
-func NewSlice[Datum any]([]Datum) SliceSpotReader { panic("not implemented") }
-
-type BytesSpotReader struct{}
-
-func NewBytes([]byte) BytesSpotReader
-
-type ReadSeekerSpotReader struct{}
-
-func NewReadSeeker(io.ReadSeeker) ReadSeekerSpotReader
-
-type StringSpotReader struct{}
-
-func NewString(s string) StringSpotReader
-
-// RuneReader is an io.RuneReader backed by a SpotReader, for compatibility
-// with the regexp package.
-type RuneReader struct{}
-
-func NewRuneReader(s SpotReader[byte]) *RuneReader
-
-func (s *RuneReader) Read([]byte) (int, error)
-
-/*
-I don't know how to structure this yet, and I'll need some experimentation to
-decide. The idea is that there will be a readseeker that lives outside the
-parser calls, and there will be an immutable reader that refers to it and gets
-passed through them as part of the parser state. That immutable reader will
-also hold an offset from the start of the input, so when it reads, it will
-first seek to that point in the ReadSeeker. Thus a given reader can only read
-at a particular point in the input. It will return a new reader with an offset
-equal to the first readers offset plus the length of the read.
-
-For using SpotReader with an io.Reader source that is not an io.ReadSeeker,
-BufferedReadSeeker allows limited backward seeking. This will not work with
-unlimited lookahead/backtracking; its Seek method will return an error if
-the desired offset is before the start of the buffer.
-*/