Rename SpotReader, remove cruft, and move helpers

This commit is contained in:
Brandon Dyck 2024-09-02 11:04:00 -06:00
parent 4be0e425ba
commit a00aea29f4
4 changed files with 229 additions and 113 deletions

95
cursor/cursor.go Normal file
View File

@ -0,0 +1,95 @@
package cursor
import (
"io"
)
// Cursor reads data from a specific spot in a data source.
//
// Design note: I almost parameterized Cursor by its implementation (i.e. the
// Curiously Recurring Template Pattern), but then each parser would need that
// parameter. That might work well in a language with much stronger type
// inference, but not in Go. The upside would have been that for each
// implementation Impl, Impl.Read could have returned an unboxed Impl, which
// would have slightly simplified testing and maybe slightly reduced allocs.
type Cursor[Datum any] interface {
	// Read fills dst with data from this Cursor's position in the underlying
	// source. It returns the number of data it read and a new Cursor for
	// the position at which the read ended, or an error if the read failed.
	// All calls to a given Cursor will return data from the same position.
	// If n < len(dst), Read will return an error explaining why it read fewer
	// data than requested. If Read tried to read past the end of the source,
	// err will be io.EOF.
	Read(dst []Datum) (n uint64, next Cursor[Datum], err error)
	// Pos returns the Cursor's position within the source.
	Pos() uint64
}
// SliceCursor is a Cursor backed by an in-memory slice.
type SliceCursor[Datum any] struct {
	data   []Datum
	offset uint64
}

// NewSlice returns a SliceCursor positioned at the start of data.
// The cursor aliases data; it does not copy it.
// (The original was a panic("not implemented") stub.)
func NewSlice[Datum any](data []Datum) SliceCursor[Datum] {
	return SliceCursor[Datum]{data: data}
}

// Read copies data from the cursor's position into dst. It returns the
// number of data copied and a cursor positioned just past them. If fewer
// than len(dst) data remained in the slice, err is io.EOF.
func (sc SliceCursor[Datum]) Read(dst []Datum) (n uint64, next Cursor[Datum], err error) {
	copied := copy(dst, sc.data[sc.offset:])
	if copied < len(dst) {
		err = io.EOF
	}
	n = uint64(copied)
	// sc is a value receiver, so advancing the offset here only affects the
	// cursor we return; the caller's cursor keeps its position.
	sc.offset += n
	return n, sc, err
}

// Pos returns the cursor's offset from the start of the slice.
func (sc SliceCursor[Datum]) Pos() uint64 {
	return sc.offset
}
// ReaderAtCursor is a Cursor over the bytes of an io.ReaderAt.
type ReaderAtCursor struct {
	r   io.ReaderAt
	pos uint64
}

// NewReaderAt returns a ReaderAtCursor positioned at offset 0 of r.
func NewReaderAt(r io.ReaderAt) ReaderAtCursor {
	return ReaderAtCursor{r: r}
}

// Read fills dst from the cursor's offset and returns the count of bytes
// read together with a cursor positioned after them, plus any error
// reported by the underlying ReadAt (io.EOF at end of input).
func (c ReaderAtCursor) Read(dst []byte) (uint64, Cursor[byte], error) {
	count, err := c.r.ReadAt(dst, int64(c.pos))
	if count > 0 {
		// The receiver is a copy, so this only moves the returned cursor.
		c.pos += uint64(count)
	}
	return uint64(count), c, err
}

// Pos reports the cursor's byte offset within the source.
func (c ReaderAtCursor) Pos() uint64 {
	return c.pos
}
// StringCursor is identical to SliceCursor[byte], but uses a string as its data source.
// The advantage is that creating a StringCursor does not require copying the source
// string into a []byte.
// StringCursor is identical to SliceCursor[byte], but uses a string as its data source.
// The advantage is that creating a StringCursor does not require copying the source
// string into a []byte.
type StringCursor struct {
	source string
	offset uint64
}

// NewString returns a StringCursor positioned at the start of s.
func NewString(s string) StringCursor {
	return StringCursor{source: s}
}

// Read copies bytes of the source string into dst. It returns how many
// bytes were copied and a cursor placed just past them; err is io.EOF when
// fewer than len(dst) bytes remained.
func (sc StringCursor) Read(dst []byte) (n uint64, next Cursor[byte], err error) {
	remaining := sc.source[sc.offset:]
	copied := uint64(copy(dst, remaining))
	if int(copied) != len(dst) {
		err = io.EOF
	}
	// Build the advanced cursor explicitly; the receiver stays untouched.
	advanced := StringCursor{source: sc.source, offset: sc.offset + copied}
	return copied, advanced, err
}

// Pos returns the cursor's byte offset within the source string.
func (sc StringCursor) Pos() uint64 {
	return sc.offset
}

38
cursor/helper.go Normal file
View File

@ -0,0 +1,38 @@
package cursor
import (
	"io"
	"unicode/utf8"
)
// BufferedReaderAt uses a buffer to supplement an io.Reader
// with limited backward seeking.
//
// NOTE(review): this is an unimplemented stub — the struct has no fields
// and the functions below are declared without bodies, so this will not
// build as ordinary Go until they are filled in.
type BufferedReaderAt struct{}

// NewBufferedReaderAt wraps r in a BufferedReaderAt that keeps at least
// minBuffer bytes available for re-reading. (Stub: no body yet.)
func NewBufferedReaderAt(r io.Reader, minBuffer uint64) *BufferedReaderAt

// ReadAt reads bytes from the underlying reader. If the offset is after
// the end of the buffer, ReadAt will first read and ignore bytes from the
// underlying reader until it reaches the offset. If the offset is
// before the start of the buffer, ReadAt will return an error.
//
// If your parser needs unlimited lookahead, you should probably
// just read the whole input into a slice and use SliceCursor[byte].
// (The original comment said "BytesCursor", which does not exist.)
func (b *BufferedReaderAt) ReadAt(dst []byte, offset int64) (int, error)
// RuneReader adapts a Cursor[byte] to io.Reader and io.RuneReader, for
// compatibility with the regexp package. regexp.MatchReader and friends
// require io.RuneReader — i.e. a ReadRune method, which the original type
// lacked despite its documentation.
type RuneReader struct {
	cursor Cursor[byte]
}

// NewRuneReader returns a RuneReader that reads from c.
func NewRuneReader(c Cursor[byte]) *RuneReader {
	return &RuneReader{cursor: c}
}

// Read implements io.Reader, advancing the underlying cursor.
func (r *RuneReader) Read(dst []byte) (int, error) {
	n, c, err := r.cursor.Read(dst)
	r.cursor = c
	return int(n), err
}

// ReadRune implements io.RuneReader. It decodes the next UTF-8 rune from
// the cursor, consuming only that rune's bytes. An invalid encoding is
// returned as utf8.RuneError with size 1, matching bytes.Reader and
// strings.Reader.
func (r *RuneReader) ReadRune() (rune, int, error) {
	var buf [utf8.UTFMax]byte
	filled := 0
	for filled < len(buf) {
		// Read one byte at a time so we never consume past the rune.
		n, c, err := r.cursor.Read(buf[filled : filled+1])
		if n == 0 {
			if filled == 0 {
				// Nothing left at all: surface the cursor's error
				// (io.EOF at end of input).
				return 0, 0, err
			}
			// Input ended mid-rune; decode the partial bytes below
			// (yields utf8.RuneError).
			break
		}
		r.cursor = c
		filled += int(n)
		if utf8.FullRune(buf[:filled]) {
			break
		}
	}
	ch, size := utf8.DecodeRune(buf[:filled])
	return ch, size, nil
}

// Cursor returns the reader's current position as a Cursor, so parsing
// can resume where reading stopped.
func (r *RuneReader) Cursor() Cursor[byte] {
	return r.cursor
}

96
gigaparsec.go Normal file
View File

@ -0,0 +1,96 @@
package gigaparsec
import (
"errors"
"fmt"
)
// TODO Everything. See https://smunix.github.io/dev.stephendiehl.com/fun/002_parsers.html.
// ErrNoParse is the sentinel error returned when a parser fails to match
// its input. Compare with errors.Is.
var ErrNoParse = errors.New("no parse")

// Result carries a parser's reply: the parsed Value, the State remaining
// after the parse, and a diagnostic Message.
//
// NOTE(review): Result embeds both State and Message, and each carries a
// Pos, so the promoted selector Result.Pos is ambiguous; callers must use
// Result.State.Pos or Result.Message.Pos explicitly.
type Result[T any] struct {
	Value T
	State
	Message
}

// Message is a diagnostic tied to a position in the input, with the list
// of inputs that were expected there.
type Message struct {
	Pos
	Msg      string
	Expected []string
}

// ParseError wraps a Message as an error value.
type ParseError struct {
	Message
}

// Error implements the error interface, formatting the position and message.
func (pe ParseError) Error() string {
	return fmt.Sprintf("parse error: %d: %s", pe.Message.Pos, pe.Message.Msg)
}

// Pos is a byte offset into the input.
type Pos uint64

// State is the parser state threaded through a parse: the current position
// and the remaining input.
type State struct {
	Pos
	Input []byte
}

// Parser consumes input from a State and reports whether any input was
// consumed, the reply (value, new state, message), and an error on failure.
type Parser[T any] func(State) (consumed bool, reply Result[T], err error)
// Return creates a parser that always succeeds with value, consuming no
// input and leaving the state unchanged.
func Return[T any](value T) Parser[T] {
	return func(s State) (bool, Result[T], error) {
		reply := Result[T]{Value: value, State: s}
		return false, reply, nil
	}
}
// Satisfy creates a parser that consumes a single byte when pred accepts
// it, and fails with ErrNoParse otherwise (including at end of input).
func Satisfy(pred func(byte) bool) Parser[byte] {
	return func(state State) (bool, Result[byte], error) {
		if len(state.Input) == 0 {
			return false, Result[byte]{}, ErrNoParse
		}
		b := state.Input[0]
		if !pred(b) {
			return false, Result[byte]{}, ErrNoParse
		}
		// Advance past the matched byte. The original assigned
		// state.Input[1:] (a []byte) directly to the State field, which
		// does not compile, and never advanced Pos.
		next := State{Pos: state.Pos + 1, Input: state.Input[1:]}
		return true, Result[byte]{Value: b, State: next}, nil
	}
}
// Bind sequences two parsers: it runs p, feeds its value to f, and runs
// the resulting parser on the state p left behind. The consumed flag is
// true when either parser consumed input.
func Bind[A, B any](p Parser[A], f func(A) Parser[B]) Parser[B] {
	return func(input State) (bool, Result[B], error) {
		consumed, replyA, err := p(input)
		if err != nil {
			// Propagate whether p consumed input: Choose and Try depend on
			// an accurate consumed flag to decide when backtracking is
			// legal. (The original hard-coded false here and also used
			// replyA.Rest, a field that does not exist — it is State.)
			return consumed, Result[B]{}, err
		}
		consumed2, replyB, err := f(replyA.Value)(replyA.State)
		return consumed || consumed2, replyB, err
	}
}
// Choose tries p and commits to its reply if p consumed input or
// succeeded; q is attempted only when p failed without consuming
// anything. This is the standard parsec backtracking rule for choice.
func Choose[A any](p, q Parser[A]) Parser[A] {
	return func(input State) (bool, Result[A], error) {
		consumedP, replyP, errP := p(input)
		if consumedP || errP == nil {
			// A consumed reply (success or failure) and an empty success
			// both commit to p. The original fell through and ran q even
			// after p succeeded without consuming, letting a consuming q
			// override p's success.
			return consumedP, replyP, errP
		}
		// p failed without consuming input: q gets a fresh try.
		return q(input)
	}
}
// Try runs p but reports any failure as non-consuming, so an enclosing
// Choose can still backtrack and attempt an alternative.
func Try[A any](p Parser[A]) Parser[A] {
	return func(input State) (bool, Result[A], error) {
		consumed, reply, err := p(input)
		if err == nil {
			return consumed, reply, nil
		}
		// Failure: pretend no input was consumed and drop the reply.
		return false, Result[A]{}, err
	}
}

View File

@ -1,113 +0,0 @@
package spotreader
import (
"io"
"iter"
)
type ReaderSource struct {
io.ReadSeeker
}
// BufferedReadSeeker uses a buffer to supplement an io.Reader
// with limited backward seeking.
type BufferedReadSeeker struct{}
func NewBufferedReadSeeker(r io.Reader, minBuffer uint64) *BufferedReadSeeker
// Read reads bytes from the underlying reader. If the current offset is after
// the end of the buffer, Read will first read and ignore bytes from the
// underlying reader until it reaches the offset. If the current offset is
// before the start of the buffer, Read will return an error.
//
// If your parser needs unlimited lookahead, you should probably
// just read the whole input into a slice and use BytesSpotReader.
func (b *BufferedReadSeeker) Read([]byte) (int, error)
func (b *BufferedReadSeeker) Seek(offset int64, whence int) (int64, error)
// SpotReader reads data from a specific spot in a stream.
type SpotReader[Datum any] interface {
// Read returns n data from this SpotReader's position in the underlying
// stream. It returns the data and a new SpotReader for the position at which
// the read ended, or an error if the read failed.
// All calls to a given SpotReader will return data from the same position.
Read(n uint64) ([]Datum, SpotReader[Datum], error)
// Pos returns the SpotReader's position within the stream.
Pos() int64
}
// TODO Consider parameterizing SpotReader by its implementation so that Read
// doesn't have to box the next SpotReader:
type UnboxedSpotReader[Datum any, Impl any] interface {
Read(n uint64) ([]Datum, Impl, error)
Pos() int64
}
// FakeSpotReader is an example of an UnboxedSpotReader.
// This style would only be worth using after pretty solid benchmarking.
// If this doesn't lower allocs, then I could also try parameterizing
// parsers by type constrained by UnboxedSpotReader, but that would make
// the user write a lot of hideous type signatures.
type FakeSpotReader[Datum any] struct{}
func (f FakeSpotReader[Datum]) Read(uint64) ([]Datum, SpotReader[Datum], error)
func (f FakeSpotReader[Datum]) Pos() int64
func ExampleFakeSpotReader() {
var sr1 SpotReader[int] = FakeSpotReader[int]{}
var sr2 SpotReader[int]
_, sr2, _ = sr1.Read(0)
sr2.Pos()
}
// SeqSpotReader is backed by a sequence of values of some type.
// It is intended for use with a concurrent lexing pass.
// TODO Since this will probably be handling tokens one at a time,
// consider using a circular buffer.
type SeqSpotReader[Datum any] struct{}
func NewSeq[Datum any](seq iter.Seq[Datum], buflen uint) SeqSpotReader[Datum] {
panic("not implemented")
}
type SliceSpotReader struct{}
func NewSlice[Datum any]([]Datum) SliceSpotReader { panic("not implemented") }
type BytesSpotReader struct{}
func NewBytes([]byte) BytesSpotReader
type ReadSeekerSpotReader struct{}
func NewReadSeeker(io.ReadSeeker) ReadSeekerSpotReader
type StringSpotReader struct{}
func NewString(s string) StringSpotReader
// RuneReader is an io.RuneReader backed by a SpotReader, for compatibility
// with the regexp package.
type RuneReader struct{}
func NewRuneReader(s SpotReader[byte]) *RuneReader
func (s *RuneReader) Read([]byte) (int, error)
/*
I don't know how to structure this yet, and I'll need some experimentation to
decide. The idea is that there will be a readseeker that lives outside the
parser calls, and there will be an immutable reader that refers to it and gets
passed through them as part of the parser state. That immutable reader will
also hold an offset from the start of the input, so when it reads, it will
first seek to that point in the ReadSeeker. Thus a given reader can only read
at a particular point in the input. It will return a new reader with an offset
equal to the first reader's offset plus the length of the read.
For using SpotReader with an io.Reader source that is not an io.ReadSeeker,
BufferedReadSeeker allows limited backward seeking. This will not work with
unlimited lookahead/backtracking; its Seek method will return an error if
the desired offset is before the start of the buffer.
*/