Rename SpotReader, remove cruft, and move helpers
parent 4be0e425ba
commit a00aea29f4
95  cursor/cursor.go  Normal file
@@ -0,0 +1,95 @@
package cursor

import (
    "io"
)

// Cursor reads data from a specific spot in a data source.
type Cursor[Datum any] interface {
    // I almost parameterized Cursor by its implementation (i.e. the Curiously
    // Recurring Template Pattern), but then each parser would need that parameter.
    // That might work well in a language with much stronger type inference, but
    // not in Go. The upside would have been that for each implementation Impl,
    // Impl.Read could have returned an unboxed Impl, which would have slightly
    // simplified testing and maybe slightly reduced allocs.

    // Read fills dst with data from this Cursor's position in the underlying
    // source. It returns the number of data it read and a new Cursor for
    // the position at which the read ended, or an error if the read failed.
    // All calls to a given Cursor will return data from the same position.
    // If n < len(dst), Read will return an error explaining why it read fewer
    // data than requested. If Read tried to read past the end of the source,
    // err will be io.EOF.
    Read(dst []Datum) (n uint64, next Cursor[Datum], err error)

    // Pos returns the Cursor's position within the source.
    Pos() uint64
}

type SliceCursor[Datum any] struct {
    data   []Datum
    offset uint64
}

func NewSlice[Datum any](data []Datum) SliceCursor[Datum] { return SliceCursor[Datum]{data: data} }

func (sc SliceCursor[Datum]) Read(dst []Datum) (n uint64, next Cursor[Datum], err error) {
    copied := copy(dst, sc.data[sc.offset:])
    if copied < len(dst) {
        err = io.EOF
    }
    n = uint64(copied)
    sc.offset += n
    return n, sc, err
}

func (sc SliceCursor[Datum]) Pos() uint64 {
    return sc.offset
}

type ReaderAtCursor struct {
    r   io.ReaderAt
    pos uint64
}

func NewReaderAt(r io.ReaderAt) ReaderAtCursor {
    return ReaderAtCursor{r: r}
}

func (rac ReaderAtCursor) Read(dst []byte) (uint64, Cursor[byte], error) {
    n, err := rac.r.ReadAt(dst, int64(rac.pos))
    if n > 0 {
        rac.pos += uint64(n)
    }
    return uint64(n), rac, err
}

func (rac ReaderAtCursor) Pos() uint64 {
    return rac.pos
}

// StringCursor is identical to SliceCursor[byte], but uses a string as its data source.
// The advantage is that creating a StringCursor does not require copying the source
// string into a []byte.
type StringCursor struct {
    source string
    offset uint64
}

func NewString(s string) StringCursor {
    return StringCursor{source: s}
}

func (sc StringCursor) Read(dst []byte) (n uint64, next Cursor[byte], err error) {
    copied := copy(dst, sc.source[sc.offset:])
    if copied < len(dst) {
        err = io.EOF
    }
    n = uint64(copied)
    sc.offset += n
    return n, sc, err
}

func (sc StringCursor) Pos() uint64 {
    return sc.offset
}
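The Read contract above can be exercised directly: a short read reports io.EOF, and a given Cursor value never moves. The following is a sketch, not part of the commit, assuming it lives in a test file alongside cursor.go:

package cursor

import (
    "fmt"
    "io"
)

func ExampleStringCursor() {
    c := NewString("abc")

    // Ask for more data than the source has: the copy is short and err is io.EOF.
    buf := make([]byte, 5)
    n, next, err := c.Read(buf)
    fmt.Println(n, string(buf[:n]), next.Pos(), err == io.EOF)

    // c itself is unmoved; every read from it starts at position 0.
    n, _, _ = c.Read(buf[:2])
    fmt.Println(n, string(buf[:n]), c.Pos())

    // Output:
    // 3 abc 3 true
    // 2 ab 0
}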
38  cursor/helper.go  Normal file
@@ -0,0 +1,38 @@
package cursor

import "io"

// BufferedReaderAt uses a buffer to supplement an io.Reader
// with limited backward seeking.
type BufferedReaderAt struct{}

func NewBufferedReaderAt(r io.Reader, minBuffer uint64) *BufferedReaderAt { panic("not implemented") }

// ReadAt reads bytes from the underlying reader. If the offset is after
// the end of the buffer, ReadAt will first read and ignore bytes from the
// underlying reader until it reaches the offset. If the offset is
// before the start of the buffer, ReadAt will return an error.
//
// If your parser needs unlimited lookahead, you should probably
// just read the whole input into a slice and use SliceCursor.
func (b *BufferedReaderAt) ReadAt(dst []byte, offset int64) (int, error) { panic("not implemented") }

// RuneReader adapts a Cursor[byte] to an io.Reader, for compatibility
// with the regexp package (wrap it in a bufio.Reader to get an io.RuneReader).
type RuneReader struct {
    cursor Cursor[byte]
}

func NewRuneReader(c Cursor[byte]) *RuneReader {
    return &RuneReader{cursor: c}
}

func (r *RuneReader) Read(dst []byte) (int, error) {
    n, c, err := r.cursor.Read(dst)
    r.cursor = c
    return int(n), err
}

func (r *RuneReader) Cursor() Cursor[byte] {
    return r.cursor
}
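RuneReader only exposes io.Reader's Read, while regexp.MatchReader wants an io.RuneReader; wrapping it in a bufio.Reader supplies the missing ReadRune. A sketch of that bridge (the matchAt helper is illustrative, not part of the commit):

package cursor

import (
    "bufio"
    "regexp"
)

// matchAt reports whether pattern matches the input starting at c's position.
// The caller's c is left where it was: RuneReader advances its own copy of the Cursor.
func matchAt(pattern string, c Cursor[byte]) (bool, error) {
    return regexp.MatchReader(pattern, bufio.NewReader(NewRuneReader(c)))
}

For example, matchAt(`^[a-z]+`, NewString("gigaparsec")) would report a match without disturbing the caller's cursor.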
96  gigaparsec.go  Normal file
@@ -0,0 +1,96 @@
package gigaparsec

import (
    "errors"
    "fmt"
)

// TODO Everything. See https://smunix.github.io/dev.stephendiehl.com/fun/002_parsers.html.

var ErrNoParse = errors.New("no parse")

type Result[T any] struct {
    Value T
    State
    Message
}

type Message struct {
    Pos
    Msg      string
    Expected []string
}

type ParseError struct {
    Message
}

func (pe ParseError) Error() string {
    return fmt.Sprintf("parse error: %d: %s", pe.Message.Pos, pe.Message.Msg)
}

type Pos uint64

type State struct {
    Pos
    Input []byte
}

type Parser[T any] func(State) (consumed bool, reply Result[T], err error)

func Return[T any](value T) Parser[T] {
    return func(state State) (bool, Result[T], error) {
        return false, Result[T]{Value: value, State: state}, nil
    }
}

func Satisfy(pred func(byte) bool) Parser[byte] {
    return func(state State) (bool, Result[byte], error) {
        if len(state.Input) == 0 {
            return false, Result[byte]{}, ErrNoParse
        }
        b := state.Input[0]
        if pred(b) {
            return true, Result[byte]{Value: b, State: State{Pos: state.Pos + 1, Input: state.Input[1:]}}, nil
        }
        return false, Result[byte]{}, ErrNoParse
    }
}

func Bind[A, B any](p Parser[A], f func(A) Parser[B]) Parser[B] {
    return func(input State) (bool, Result[B], error) {
        consumed, replyA, err := p(input)
        if err != nil {
            return consumed, Result[B]{}, err
        }
        consumed2, replyB, err := f(replyA.Value)(replyA.State)
        return consumed || consumed2, replyB, err
    }
}

func Choose[A any](p, q Parser[A]) Parser[A] {
    return func(input State) (bool, Result[A], error) {
        consumedP, replyP, errP := p(input)
        if consumedP {
            return consumedP, replyP, errP
        }
        if errP != nil {
            return q(input)
        }
        consumedQ, replyQ, errQ := q(input)
        if consumedQ {
            return consumedQ, replyQ, errQ
        }
        return consumedP, replyP, errP
    }
}

func Try[A any](p Parser[A]) Parser[A] {
    return func(input State) (bool, Result[A], error) {
        consumed, reply, err := p(input)
        if err != nil {
            return false, Result[A]{}, err
        }
        return consumed, reply, err
    }
}
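To show how these combinators compose, here is a small hypothetical parser built only from Satisfy, Bind, and Return; the function name and the input below are illustrative, not part of the commit:

package gigaparsec

// letterThenDigit parses one ASCII lowercase letter followed by one ASCII digit
// and returns them as a two-byte array.
func letterThenDigit() Parser[[2]byte] {
    letter := Satisfy(func(b byte) bool { return b >= 'a' && b <= 'z' })
    digit := Satisfy(func(b byte) bool { return b >= '0' && b <= '9' })
    return Bind(letter, func(l byte) Parser[[2]byte] {
        return Bind(digit, func(d byte) Parser[[2]byte] {
            return Return([2]byte{l, d})
        })
    })
}

Running letterThenDigit()(State{Input: []byte("a1z")}) yields consumed == true, reply.Value == [2]byte{'a', '1'}, and reply.State.Input == []byte("z"); wrapping the parser in Try would let Choose fall back to an alternative even after the letter has been consumed.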
@@ -1,113 +0,0 @@
package spotreader

import (
    "io"
    "iter"
)

type ReaderSource struct {
    io.ReadSeeker
}

// BufferedReadSeeker uses a buffer to supplement an io.Reader
// with limited backward seeking.
type BufferedReadSeeker struct{}

func NewBufferedReadSeeker(r io.Reader, minBuffer uint64) *BufferedReadSeeker

// Read reads bytes from the underlying reader. If the current offset is after
// the end of the buffer, Read will first read and ignore bytes from the
// underlying reader until it reaches the offset. If the current offset is
// before the start of the buffer, Read will return an error.
//
// If your parser needs unlimited lookahead, you should probably
// just read the whole input into a slice and use BytesSpotReader.
func (b *BufferedReadSeeker) Read([]byte) (int, error)

func (b *BufferedReadSeeker) Seek(offset int64, whence int) (int64, error)

// SpotReader reads data from a specific spot in a stream.
type SpotReader[Datum any] interface {
    // Read returns n data from this SpotReader's position in the underlying
    // stream. It returns the data and a new SpotReader for the position at which
    // the read ended, or an error if the read failed.
    // All calls to a given SpotReader will return data from the same position.
    Read(n uint64) ([]Datum, SpotReader[Datum], error)

    // Pos returns the SpotReader's position within the stream.
    Pos() int64
}

// TODO Consider parameterizing SpotReader by its implementation so that Read
// doesn't have to box the next SpotReader:
type UnboxedSpotReader[Datum any, Impl any] interface {
    Read(n uint64) ([]Datum, Impl, error)
    Pos() int64
}

// FakeSpotReader is an example of an UnboxedSpotReader.
// This style would only be worth using after pretty solid benchmarking.
// If this doesn't lower allocs, then I could also try parameterizing
// parsers by type constrained by UnboxedSpotReader, but that would make
// the user write a lot of hideous type signatures.
type FakeSpotReader[Datum any] struct{}

func (f FakeSpotReader[Datum]) Read(uint64) ([]Datum, SpotReader[Datum], error)
func (f FakeSpotReader[Datum]) Pos() int64

func ExampleFakeSpotReader() {
    var sr1 SpotReader[int] = FakeSpotReader[int]{}
    var sr2 SpotReader[int]
    _, sr2, _ = sr1.Read(0)
    sr2.Pos()
}

// SeqSpotReader is backed by a sequence of values of some type.
// It is intended for use with a concurrent lexing pass.
// TODO Since this will probably be handling tokens one at a time,
// consider using a circular buffer.
type SeqSpotReader[Datum any] struct{}

func NewSeq[Datum any](seq iter.Seq[Datum], buflen uint) SeqSpotReader[Datum] {
    panic("not implemented")
}

type SliceSpotReader struct{}

func NewSlice[Datum any]([]Datum) SliceSpotReader { panic("not implemented") }

type BytesSpotReader struct{}

func NewBytes([]byte) BytesSpotReader

type ReadSeekerSpotReader struct{}

func NewReadSeeker(io.ReadSeeker) ReadSeekerSpotReader

type StringSpotReader struct{}

func NewString(s string) StringSpotReader

// RuneReader is an io.RuneReader backed by a SpotReader, for compatibility
// with the regexp package.
type RuneReader struct{}

func NewRuneReader(s SpotReader[byte]) *RuneReader

func (s *RuneReader) Read([]byte) (int, error)

/*
I don't know how to structure this yet, and I'll need some experimentation to
decide. The idea is that there will be a readseeker that lives outside the
parser calls, and there will be an immutable reader that refers to it and gets
passed through them as part of the parser state. That immutable reader will
also hold an offset from the start of the input, so when it reads, it will
first seek to that point in the ReadSeeker. Thus a given reader can only read
at a particular point in the input. It will return a new reader with an offset
equal to the first reader's offset plus the length of the read.

For using SpotReader with an io.Reader source that is not an io.ReadSeeker,
BufferedReadSeeker allows limited backward seeking. This will not work with
unlimited lookahead/backtracking; its Seek method will return an error if
the desired offset is before the start of the buffer.
*/
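The notes in the removed file describe the design the new cursor package now follows: the source lives outside the parser calls, and an immutable cursor holding an offset into it is threaded through parser state. A minimal sketch of that flow with ReaderAtCursor (the demo function and the bytes.Reader source are illustrative only, not part of the commit):

package cursor

import (
    "bytes"
    "fmt"
)

func demoSharedSource() {
    // One io.ReaderAt shared by every cursor; it is never seeked in place.
    src := bytes.NewReader([]byte("hello, world"))

    c0 := NewReaderAt(src) // pinned to offset 0
    buf := make([]byte, 5)

    // Reading returns a fresh cursor at offset 5; c0 still points at 0.
    _, c1, _ := c0.Read(buf)
    fmt.Println(string(buf), c0.Pos(), c1.Pos()) // hello 0 5
}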