Rename SpotReader, remove cruft, and move helpers
This commit is contained in:
parent
4be0e425ba
commit
a00aea29f4
95
cursor/cursor.go
Normal file
95
cursor/cursor.go
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
package cursor
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Cursor reads data from one fixed position in a data source.
//
// Design note: Cursor was almost parameterized by its own implementation
// (the Curiously Recurring Template Pattern), so that for each implementation
// Impl, Impl.Read could return an unboxed Impl. That would have slightly
// simplified testing and maybe slightly reduced allocs, but every parser
// would then need the extra type parameter. That might work well in a
// language with much stronger type inference, but not in Go.
type Cursor[Datum any] interface {
	// Pos returns the Cursor's position within the source.
	Pos() uint64

	// Read fills dst with data starting at this Cursor's position in the
	// underlying source. It returns the number of data it read and a new
	// Cursor for the position at which the read ended, or an error if the
	// read failed. Every call on a given Cursor returns data from the same
	// position.
	//
	// If n < len(dst), Read returns an error explaining why it read fewer
	// data than requested. If Read tried to read past the end of the source,
	// err will be io.EOF.
	Read(dst []Datum) (n uint64, next Cursor[Datum], err error)
}
|
||||||
|
|
||||||
|
type SliceCursor[Datum any] struct {
|
||||||
|
data []Datum
|
||||||
|
offset uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewSlice[Datum any]([]Datum) SliceCursor[Datum] { panic("not implemented") }
|
||||||
|
|
||||||
|
func (sc SliceCursor[Datum]) Read(dst []Datum) (n uint64, next Cursor[Datum], err error) {
|
||||||
|
copied := copy(dst, sc.data[sc.offset:])
|
||||||
|
if copied < len(dst) {
|
||||||
|
err = io.EOF
|
||||||
|
}
|
||||||
|
n = uint64(copied)
|
||||||
|
sc.offset += n
|
||||||
|
return n, sc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sc SliceCursor[Datum]) Pos() uint64 {
|
||||||
|
return sc.offset
|
||||||
|
}
|
||||||
|
|
||||||
|
type ReaderAtCursor struct {
|
||||||
|
r io.ReaderAt
|
||||||
|
pos uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewReaderAt(r io.ReaderAt) ReaderAtCursor {
|
||||||
|
return ReaderAtCursor{r: r}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rac ReaderAtCursor) Read(dst []byte) (uint64, Cursor[byte], error) {
|
||||||
|
n, err := rac.r.ReadAt(dst, int64(rac.pos))
|
||||||
|
if n > 0 {
|
||||||
|
rac.pos += uint64(n)
|
||||||
|
}
|
||||||
|
return uint64(n), rac, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rac ReaderAtCursor) Pos() uint64 {
|
||||||
|
return rac.pos
|
||||||
|
}
|
||||||
|
|
||||||
|
// StringCursor is identical to SliceCursor[byte], but uses a string as its data source.
|
||||||
|
// The advantage is that creating a StringCursor does not require copying the source
|
||||||
|
// string into a []byte.
|
||||||
|
type StringCursor struct {
|
||||||
|
source string
|
||||||
|
offset uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewString(s string) StringCursor {
|
||||||
|
return StringCursor{source: s}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sc StringCursor) Read(dst []byte) (n uint64, next Cursor[byte], err error) {
|
||||||
|
copied := copy(dst, sc.source[sc.offset:])
|
||||||
|
if copied < len(dst) {
|
||||||
|
err = io.EOF
|
||||||
|
}
|
||||||
|
n = uint64(copied)
|
||||||
|
sc.offset += n
|
||||||
|
return n, sc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sc StringCursor) Pos() uint64 {
|
||||||
|
return sc.offset
|
||||||
|
}
|
38
cursor/helper.go
Normal file
38
cursor/helper.go
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
package cursor
|
||||||
|
|
||||||
|
import "io"
|
||||||
|
|
||||||
|
// BufferedReaderAt uses a buffer to supplement an io.Reader
// with limited backward seeking.
//
// It is a placeholder: the type has no fields yet and its functions panic
// with "not implemented".
type BufferedReaderAt struct{}

// NewBufferedReaderAt is meant to wrap r in a BufferedReaderAt.
// It is not implemented yet and always panics.
//
// NOTE(review): minBuffer is presumably the minimum number of bytes kept
// available for backward seeking — confirm when implementing.
func NewBufferedReaderAt(r io.Reader, minBuffer uint64) *BufferedReaderAt {
	panic("not implemented")
}

// ReadAt reads bytes from the underlying reader. If the offset is after
// the end of the buffer, ReadAt will first read and ignore bytes from the
// underlying reader until it reaches the offset. If the offset is
// before the start of the buffer, ReadAt will return an error.
//
// If your parser needs unlimited lookahead, you should probably
// just read the whole input into a slice and use a SliceCursor[byte]
// (see NewSlice).
//
// The behavior above is the intended contract; ReadAt is not implemented
// yet and always panics.
func (b *BufferedReaderAt) ReadAt(dst []byte, offset int64) (int, error) {
	panic("not implemented")
}
|
||||||
|
|
||||||
|
// RuneReader is an io.RuneReader backed by a Cursor, for compatibility
|
||||||
|
// with the regexp package.
|
||||||
|
type RuneReader struct {
|
||||||
|
cursor Cursor[byte]
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRuneReader(c Cursor[byte]) *RuneReader {
|
||||||
|
return &RuneReader{cursor: c}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *RuneReader) Read(dst []byte) (int, error) {
|
||||||
|
n, c, err := r.cursor.Read(dst)
|
||||||
|
r.cursor = c
|
||||||
|
return int(n), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *RuneReader) Cursor() Cursor[byte] {
|
||||||
|
return r.cursor
|
||||||
|
}
|
96
gigaparsec.go
Normal file
96
gigaparsec.go
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
package gigaparsec
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TODO Everything. See https://smunix.github.io/dev.stephendiehl.com/fun/002_parsers.html.
|
||||||
|
|
||||||
|
// ErrNoParse is the error Satisfy returns when the input is empty or the
// next byte does not satisfy the predicate.
var ErrNoParse = errors.New("no parse")
|
||||||
|
|
||||||
|
type (
	// Pos is a position in the input.
	// NOTE(review): presumably a byte offset from the start — confirm.
	Pos uint64

	// State is what a parser runs against: a position and the input bytes.
	State struct {
		Pos
		Input []byte
	}

	// Message carries diagnostic information tied to a position.
	Message struct {
		Pos
		Msg      string
		Expected []string
	}

	// Result is what a parser produces: a value, a State, and a Message.
	//
	// State and Message both embed Pos, so the promoted selector is
	// ambiguous: use result.State.Pos or result.Message.Pos explicitly.
	Result[T any] struct {
		Value T
		State
		Message
	}

	// ParseError is a Message that can be used as an error.
	ParseError struct {
		Message
	}

	// Parser is a function from a State to a Result[T]. It also reports
	// whether it consumed input and any error it hit.
	Parser[T any] func(State) (consumed bool, reply Result[T], err error)
)

// Error formats the error as "parse error: <position>: <message>".
func (pe ParseError) Error() string {
	msg := pe.Message
	return fmt.Sprintf("parse error: %d: %s", msg.Pos, msg.Msg)
}
|
||||||
|
|
||||||
|
func Return[T any](value T) Parser[T] {
|
||||||
|
return func(state State) (bool, Result[T], error) {
|
||||||
|
return false, Result[T]{Value: value, State: state}, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Satisfy(pred func(byte) bool) Parser[byte] {
|
||||||
|
return func(state State) (bool, Result[byte], error) {
|
||||||
|
if len(state.Input) == 0 {
|
||||||
|
return false, Result[byte]{}, ErrNoParse
|
||||||
|
}
|
||||||
|
b := state.Input[0]
|
||||||
|
if pred(b) {
|
||||||
|
return true, Result[byte]{Value: b, State: state.Input[1:]}, nil
|
||||||
|
}
|
||||||
|
return false, Result[byte]{}, ErrNoParse
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Bind[A, B any](p Parser[A], f func(A) Parser[B]) Parser[B] {
|
||||||
|
return func(input State) (bool, Result[B], error) {
|
||||||
|
consumed, replyA, err := p(input)
|
||||||
|
if err != nil {
|
||||||
|
return false, Result[B]{}, err
|
||||||
|
}
|
||||||
|
consumed2, replyB, err := f(replyA.Value)(replyA.Rest)
|
||||||
|
return consumed || consumed2, replyB, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Choose[A any](p, q Parser[A]) Parser[A] {
|
||||||
|
return func(input State) (bool, Result[A], error) {
|
||||||
|
consumedP, replyP, errP := p(input)
|
||||||
|
if consumedP {
|
||||||
|
return consumedP, replyP, errP
|
||||||
|
}
|
||||||
|
if errP != nil {
|
||||||
|
return q(input)
|
||||||
|
}
|
||||||
|
consumedQ, replyQ, errQ := q(input)
|
||||||
|
if consumedQ {
|
||||||
|
return consumedQ, replyQ, errQ
|
||||||
|
}
|
||||||
|
return consumedP, replyP, errP
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Try[A any](p Parser[A]) Parser[A] {
|
||||||
|
return func(input State) (bool, Result[A], error) {
|
||||||
|
consumed, reply, err := p(input)
|
||||||
|
if err != nil {
|
||||||
|
return false, Result[A]{}, err
|
||||||
|
}
|
||||||
|
return consumed, reply, err
|
||||||
|
}
|
||||||
|
}
|
@ -1,113 +0,0 @@
|
|||||||
package spotreader
|
|
||||||
|
|
||||||
import (
|
|
||||||
"io"
|
|
||||||
"iter"
|
|
||||||
)
|
|
||||||
|
|
||||||
type ReaderSource struct {
|
|
||||||
io.ReadSeeker
|
|
||||||
}
|
|
||||||
|
|
||||||
// BufferedReadSeeker uses a buffer to supplement an io.Reader
|
|
||||||
// with limited backward seeking.
|
|
||||||
type BufferedReadSeeker struct{}
|
|
||||||
|
|
||||||
func NewBufferedReadSeeker(r io.Reader, minBuffer uint64) *BufferedReadSeeker
|
|
||||||
|
|
||||||
// Read reads bytes from the underlying reader. If the current offset is after
|
|
||||||
// the end of the buffer, Read will first read and ignore bytes from the
|
|
||||||
// underlying reader until it reaches the offset. If the current offset is
|
|
||||||
// before the start of the buffer, Read will return an error.
|
|
||||||
//
|
|
||||||
// If your parser needs unlimited lookahead, you should probably
|
|
||||||
// just read the whole input into a slice and use BytesSpotReader.
|
|
||||||
func (b *BufferedReadSeeker) Read([]byte) (int, error)
|
|
||||||
|
|
||||||
func (b *BufferedReadSeeker) Seek(offset int64, whence int) (int64, error)
|
|
||||||
|
|
||||||
// SpotReader reads data from a specific spot in a stream.
|
|
||||||
type SpotReader[Datum any] interface {
|
|
||||||
// Read returns n data from this SpotReader's position in the underlying
|
|
||||||
// stream. It returns the data and a new SpotReader for the position at which
|
|
||||||
// the read ended, or an error if the read failed.
|
|
||||||
// All calls to a given SpotReader will return data from the same position.
|
|
||||||
Read(n uint64) ([]Datum, SpotReader[Datum], error)
|
|
||||||
|
|
||||||
// Pos returns the SpotReader's position within the stream.
|
|
||||||
Pos() int64
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO Consider parameterizing SpotReader by its implementation so that Read
|
|
||||||
// doesn't have to box the next SpotReader:
|
|
||||||
type UnboxedSpotReader[Datum any, Impl any] interface {
|
|
||||||
Read(n uint64) ([]Datum, Impl, error)
|
|
||||||
Pos() int64
|
|
||||||
}
|
|
||||||
|
|
||||||
// FakeSpotReader is an example of an UnboxedSpotReader.
|
|
||||||
// This style would only be worth using after pretty solid benchmarking.
|
|
||||||
// If this doesn't lower allocs, then I could also try parameterizing
|
|
||||||
// parsers by type constrained by UnboxedSpotReader, but that would make
|
|
||||||
// the user write a lot of hideous type signatures.
|
|
||||||
type FakeSpotReader[Datum any] struct{}
|
|
||||||
|
|
||||||
func (f FakeSpotReader[Datum]) Read(uint64) ([]Datum, SpotReader[Datum], error)
|
|
||||||
func (f FakeSpotReader[Datum]) Pos() int64
|
|
||||||
|
|
||||||
func ExampleFakeSpotReader() {
|
|
||||||
var sr1 SpotReader[int] = FakeSpotReader[int]{}
|
|
||||||
var sr2 SpotReader[int]
|
|
||||||
_, sr2, _ = sr1.Read(0)
|
|
||||||
sr2.Pos()
|
|
||||||
}
|
|
||||||
|
|
||||||
// SeqSpotReader is backed by a sequence of values of some type.
|
|
||||||
// It is intended for use with a concurrent lexing pass.
|
|
||||||
// TODO Since this will probably be handling tokens one at a time,
|
|
||||||
// consider using a circular buffer.
|
|
||||||
type SeqSpotReader[Datum any] struct{}
|
|
||||||
|
|
||||||
func NewSeq[Datum any](seq iter.Seq[Datum], buflen uint) SeqSpotReader[Datum] {
|
|
||||||
panic("not implemented")
|
|
||||||
}
|
|
||||||
|
|
||||||
type SliceSpotReader struct{}
|
|
||||||
|
|
||||||
func NewSlice[Datum any]([]Datum) SliceSpotReader { panic("not implemented") }
|
|
||||||
|
|
||||||
type BytesSpotReader struct{}
|
|
||||||
|
|
||||||
func NewBytes([]byte) BytesSpotReader
|
|
||||||
|
|
||||||
type ReadSeekerSpotReader struct{}
|
|
||||||
|
|
||||||
func NewReadSeeker(io.ReadSeeker) ReadSeekerSpotReader
|
|
||||||
|
|
||||||
type StringSpotReader struct{}
|
|
||||||
|
|
||||||
func NewString(s string) StringSpotReader
|
|
||||||
|
|
||||||
// RuneReader is an io.RuneReader backed by a SpotReader, for compatibility
|
|
||||||
// with the regexp package.
|
|
||||||
type RuneReader struct{}
|
|
||||||
|
|
||||||
func NewRuneReader(s SpotReader[byte]) *RuneReader
|
|
||||||
|
|
||||||
func (s *RuneReader) Read([]byte) (int, error)
|
|
||||||
|
|
||||||
/*
|
|
||||||
I don't know how to structure this yet, and I'll need some experimentation to
|
|
||||||
decide. The idea is that there will be a readseeker that lives outside the
|
|
||||||
parser calls, and there will be an immutable reader that refers to it and gets
|
|
||||||
passed through them as part of the parser state. That immutable reader will
|
|
||||||
also hold an offset from the start of the input, so when it reads, it will
|
|
||||||
first seek to that point in the ReadSeeker. Thus a given reader can only read
|
|
||||||
at a particular point in the input. It will return a new reader with an offset
|
|
||||||
equal to the first reader's offset plus the length of the read.
|
|
||||||
|
|
||||||
For using SpotReader with an io.Reader source that is not an io.ReadSeeker,
|
|
||||||
BufferedReadSeeker allows limited backward seeking. This will not work with
|
|
||||||
unlimited lookahead/backtracking; its Seek method will return an error if
|
|
||||||
the desired offset is before the start of the buffer.
|
|
||||||
*/
|
|
Loading…
Reference in New Issue
Block a user