From 4be0e425babba70cbcdd16fc54c2110270a1cc32 Mon Sep 17 00:00:00 2001 From: Brandon Dyck Date: Sat, 31 Aug 2024 15:13:27 -0600 Subject: [PATCH] Added sketch and notes on reader design --- go.mod | 3 ++ spotreader/reader.go | 113 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 go.mod create mode 100644 spotreader/reader.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..9e5d34e --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module git.codemonkeysoftware.net/b/gigaparsec + +go 1.23 diff --git a/spotreader/reader.go b/spotreader/reader.go new file mode 100644 index 0000000..6cab634 --- /dev/null +++ b/spotreader/reader.go @@ -0,0 +1,113 @@ +package spotreader + +import ( + "io" + "iter" +) + +type ReaderSource struct { + io.ReadSeeker +} + +// BufferedReadSeeker uses a buffer to supplement an io.Reader +// with limited backward seeking. +type BufferedReadSeeker struct{} + +func NewBufferedReadSeeker(r io.Reader, minBuffer uint64) *BufferedReadSeeker + +// Read reads bytes from the underlying reader. If the current offset is after +// the end of the buffer, Read will first read and ignore bytes from the +// underlying reader until it reaches the offset. If the current offset is +// before the start of the buffer, Read will return an error. +// +// If your parser needs unlimited lookahead, you should probably +// just read the whole input into a slice and use BytesSpotReader. +func (b *BufferedReadSeeker) Read([]byte) (int, error) + +func (b *BufferedReadSeeker) Seek(offset int64, whence int) (int64, error) + +// SpotReader reads data from a specific spot in a stream. +type SpotReader[Datum any] interface { + // Read returns n data from this SpotReader's position in the underlying + // stream. It returns the data and a new SpotReader for the position at which + // the read ended, or an error if the read failed. + // All calls to a given SpotReader will return data from the same position. 
+ Read(n uint64) ([]Datum, SpotReader[Datum], error) + + // Pos returns the SpotReader's position within the stream. + Pos() int64 +} + +// TODO Consider parameterizing SpotReader by its implementation so that Read +// doesn't have to box the next SpotReader: +type UnboxedSpotReader[Datum any, Impl any] interface { + Read(n uint64) ([]Datum, Impl, error) + Pos() int64 +} + +// FakeSpotReader is an example of an UnboxedSpotReader. +// This style would only be worth using after pretty solid benchmarking. +// If this doesn't lower allocs, then I could also try parameterizing +// parsers by type constrained by UnboxedSpotReader, but that would make +// the user write a lot of hideous type signatures. +type FakeSpotReader[Datum any] struct{} + +func (f FakeSpotReader[Datum]) Read(uint64) ([]Datum, SpotReader[Datum], error) +func (f FakeSpotReader[Datum]) Pos() int64 + +func ExampleFakeSpotReader() { + var sr1 SpotReader[int] = FakeSpotReader[int]{} + var sr2 SpotReader[int] + _, sr2, _ = sr1.Read(0) + sr2.Pos() +} + +// SeqSpotReader is backed by a sequence of values of some type. +// It is intended for use with a concurrent lexing pass. +// TODO Since this will probably be handling tokens one at a time, +// consider using a circular buffer. +type SeqSpotReader[Datum any] struct{} + +func NewSeq[Datum any](seq iter.Seq[Datum], buflen uint) SeqSpotReader[Datum] { + panic("not implemented") +} + +type SliceSpotReader struct{} + +func NewSlice[Datum any]([]Datum) SliceSpotReader { panic("not implemented") } + +type BytesSpotReader struct{} + +func NewBytes([]byte) BytesSpotReader + +type ReadSeekerSpotReader struct{} + +func NewReadSeeker(io.ReadSeeker) ReadSeekerSpotReader + +type StringSpotReader struct{} + +func NewString(s string) StringSpotReader + +// RuneReader is an io.RuneReader backed by a SpotReader, for compatibility +// with the regexp package. TODO: io.RuneReader requires ReadRune() (rune, int, error); the sketched method set declares Read([]byte) (int, error) instead, so confirm the intended method. 
+type RuneReader struct{} + +func NewRuneReader(s SpotReader[byte]) *RuneReader + +func (s *RuneReader) Read([]byte) (int, error) + +/* +I don't know how to structure this yet, and I'll need some experimentation to +decide. The idea is that there will be a ReadSeeker that lives outside the +parser calls, and there will be an immutable reader that refers to it and gets +passed through them as part of the parser state. That immutable reader will +also hold an offset from the start of the input, so when it reads, it will +first seek to that point in the ReadSeeker. Thus a given reader can only read +at a particular point in the input. It will return a new reader with an offset +equal to the first reader's offset plus the length of the read. + +For using SpotReader with an io.Reader source that is not an io.ReadSeeker, +BufferedReadSeeker allows limited backward seeking. This will not work with +unlimited lookahead/backtracking; its Seek method will return an error if +the desired offset is before the start of the buffer. +*/