gigaparsec/bytes/regexp.go

74 lines
2.2 KiB
Go
Raw Normal View History

2024-09-10 22:46:31 +00:00
package bytes
import (
2024-09-10 22:52:08 +00:00
"errors"
2024-09-10 22:46:31 +00:00
"fmt"
2024-09-10 22:52:08 +00:00
"io"
2024-09-10 22:46:31 +00:00
"regexp"
"strings"
2024-09-10 22:52:08 +00:00
"unicode/utf8"
2024-09-10 22:46:31 +00:00
"git.codemonkeysoftware.net/b/gigaparsec"
"git.codemonkeysoftware.net/b/gigaparsec/cursor"
)
2024-09-10 22:52:08 +00:00
// RuneReader adapts a Cursor over bytes to the io.RuneReader interface, for
// compatibility with the regexp package.
type RuneReader struct {
// cursor is the position from which the next ReadRune call reads.
cursor cursor.Cursor[byte]
}
// NewRuneReader returns a RuneReader whose reads begin at c.
func NewRuneReader(c cursor.Cursor[byte]) *RuneReader {
	rr := new(RuneReader)
	rr.cursor = c
	return rr
}
// ReadRune decodes the UTF-8 encoded rune at the reader's current position and
// advances the reader past it, implementing io.RuneReader.
//
// Up to utf8.UTFMax bytes are requested from the cursor. A short read that
// ends in io.EOF still yields a rune as long as at least one byte was read;
// io.EOF is returned only once no bytes remain. io.RuneReader sets err only
// when no character is available, and consumers such as the regexp package
// drop the rune whenever err is non-nil, so reporting io.EOF alongside a rune
// would hide the final bytes of the input from them.
//
// Bytes that are not valid UTF-8 decode to utf8.RuneError with size 1. Any
// read failure other than io.EOF is returned wrapped, with a zero rune and
// size.
func (rr *RuneReader) ReadRune() (r rune, size int, err error) {
	var buf [utf8.UTFMax]byte
	n, next, err := rr.cursor.Read(buf[:])
	if err != nil && !errors.Is(err, io.EOF) {
		rr.cursor = next
		return 0, 0, fmt.Errorf("ReadRune: %w", err)
	}
	if n == 0 {
		// Nothing was read, so there is no rune to report. Treat a silent
		// empty read as end of input too; otherwise callers would receive
		// zero-size runes forever.
		if err == nil {
			err = io.EOF
		}
		return 0, 0, err
	}
	r, size = utf8.DecodeRune(buf[:n])
	// Advance by the decoded rune only; the read may have fetched bytes that
	// belong to the following runes.
	rr.cursor = rr.cursor.At(rr.cursor.Pos() + uint64(size))
	return r, size, nil
}
// Cursor returns the cursor currently held by the reader. ReadRune replaces
// it as runes are consumed.
func (rr *RuneReader) Cursor() cursor.Cursor[byte] {
return rr.cursor
}
2024-09-10 22:46:31 +00:00
func Regexp(str string) gigaparsec.Parser[byte, []byte] {
if !strings.HasPrefix(str, "^") && !strings.HasPrefix(str, `\A`) {
2024-09-10 22:46:31 +00:00
str = "^" + str
}
re := regexp.MustCompile(str)
expected := fmt.Sprintf("match `%s`", str)
return func(input gigaparsec.State[byte]) (gigaparsec.Result[byte, []byte], error) {
2024-09-10 22:52:08 +00:00
r := NewRuneReader(input.Cursor())
2024-09-10 22:46:31 +00:00
idx := re.FindReaderIndex(r)
// TODO Check error from r; this requires an Error() method on cursor.RuneReader.
if idx == nil {
return gigaparsec.Fail[byte, []byte](false, gigaparsec.Message{
2024-09-10 22:46:31 +00:00
Pos: input.Pos(),
Expected: []string{expected},
// TODO Not having a Got is unsatisfactory, but how do I extract useful information?
// Maybe just read a fixed number of bytes or to the end, whichever comes first?
// I could add extra methods to cursor.RuneReader to figure out how much it had read.
}), nil
2024-09-10 22:46:31 +00:00
}
// Alas, this is a little wasteful because a Regexp can only return indices
// when searching a RuneReader.
dst := make([]byte, idx[1]-idx[0])
n, _, err := input.Cursor().Read(dst)
if err != nil {
// If we can't access those same bytes again, something is wrong.
return gigaparsec.Result[byte, []byte]{}, fmt.Errorf("Regex: unexpected error: %w", err)
2024-09-10 22:46:31 +00:00
}
next := input.At(input.Pos() + n)
return gigaparsec.Succeed(true, dst, next, gigaparsec.MessageOK(input.Pos())), nil
2024-09-10 22:46:31 +00:00
}
}