1046 lines
26 KiB
Lua
1046 lines
26 KiB
Lua
-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
|
|
--
|
|
-- Provides UTF-8 aware string functions implemented in pure lua:
|
|
-- * utf8len(s)
|
|
-- * utf8sub(s, i, j)
|
|
-- * utf8reverse(s)
|
|
-- * utf8char(unicode)
|
|
-- * utf8unicode(s, i, j)
|
|
-- * utf8gensub(s, sub_len)
|
|
-- * utf8find(str, regex, init, plain)
|
|
-- * utf8match(str, regex, init)
|
|
-- * utf8gmatch(str, regex, all)
|
|
-- * utf8gsub(str, regex, repl, limit)
|
|
--
|
|
-- The case-conversion helpers utf8upper(s) / utf8lower(s), which relied on the
-- lower<->upper case mappings from utf8data.lua, are commented out in this
-- file; the module's `upper` and `lower` entries are the byte-oriented
-- string.upper and string.lower.
|
|
--
|
|
-- All functions behave as their non UTF-8 aware counterparts with the exception
|
|
-- that UTF-8 characters are used instead of bytes for all units.
|
|
|
|
--[[
|
|
Copyright (c) 2006-2007, Kyle Smith
|
|
All rights reserved.
|
|
|
|
Contributors:
|
|
Alimov Stepan
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of the author nor the names of its contributors may be
|
|
used to endorse or promote products derived from this software without
|
|
specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
--]]
|
|
|
|
-- ABNF from RFC 3629
|
|
--
|
|
-- UTF8-octets = *( UTF8-char )
|
|
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
|
|
-- UTF8-1 = %x00-7F
|
|
-- UTF8-2 = %xC2-DF UTF8-tail
|
|
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
|
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
|
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
|
-- %xF4 %x80-8F 2( UTF8-tail )
|
|
-- UTF8-tail = %x80-BF
|
|
--
|
|
|
|
local byte = string.byte
|
|
local char = string.char
|
|
local dump = string.dump
|
|
local find = string.find
|
|
local format = string.format
|
|
local len = string.len
|
|
local lower = string.lower
|
|
local rep = string.rep
|
|
local sub = string.sub
|
|
local upper = string.upper
|
|
|
|
-- Returns the number of bytes (1-4) used by the UTF-8 character that starts at
-- byte i of s. Doubles as a validator: the sequence is checked against the
-- RFC 3629 grammar quoted above.
--
-- s: string to inspect
-- i: byte index of the character's first byte (defaults to 1)
--
-- Raises an error when an argument has the wrong type, when i does not address
-- a byte of s, when the string ends before the sequence is complete
-- ("UTF-8 string terminated early"), or when the sequence is not well-formed
-- ("Invalid UTF-8 character").
local function utf8charbytes (s, i)
	-- argument defaults
	i = i or 1

	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
	end
	if type(i) ~= "number" then
		error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
	end

	local c = byte(s, i)
	if not c then
		-- string.byte returns nothing for an index outside the string; report
		-- that instead of failing on a nil comparison below
		error("bad argument #2 to 'utf8charbytes' (position out of range)")
	end

	-- UTF8-1 = %x00-7F: every ASCII byte, NUL included, is a whole character
	if c <= 127 then
		return 1
	end

	-- The lead byte fixes the sequence length and the range allowed for the
	-- second byte (narrower for E0, ED, F0 and F4).
	local needed, lo, hi
	if c >= 194 and c <= 223 then
		-- UTF8-2
		needed, lo, hi = 2, 128, 191
	elseif c >= 224 and c <= 239 then
		-- UTF8-3
		needed = 3
		lo = (c == 224) and 160 or 128
		hi = (c == 237) and 159 or 191
	elseif c >= 240 and c <= 244 then
		-- UTF8-4
		needed = 4
		lo = (c == 240) and 144 or 128
		hi = (c == 244) and 143 or 191
	else
		-- stray continuation byte (80-BF) or a byte never valid in UTF-8
		error("Invalid UTF-8 character")
	end

	local c2 = byte(s, i + 1)
	local c3 = needed >= 3 and byte(s, i + 2) or nil
	local c4 = needed == 4 and byte(s, i + 3) or nil

	-- truncation is reported before any content check
	if not c2 or (needed >= 3 and not c3) or (needed == 4 and not c4) then
		error("UTF-8 string terminated early")
	end

	-- validate byte 2
	if c2 < lo or c2 > hi then
		error("Invalid UTF-8 character")
	end
	-- validate byte 3 (UTF8-tail)
	if c3 and (c3 < 128 or c3 > 191) then
		error("Invalid UTF-8 character")
	end
	-- validate byte 4 (UTF8-tail)
	if c4 and (c4 < 128 or c4 > 191) then
		error("Invalid UTF-8 character")
	end

	return needed
end
|
|
|
|
-- Returns the number of UTF-8 characters in s.
--
-- Raises "bad argument #1 to 'utf8len' ..." when s is not a string. Errors
-- raised by utf8charbytes for malformed input propagate to the caller.
local function utf8len (s)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
	end

	local pos = 1
	local bytes = len(s)
	local length = 0

	-- hop from lead byte to lead byte, counting one character per hop
	while pos <= bytes do
		length = length + 1
		pos = pos + utf8charbytes(s, pos)
	end

	return length
end
|
|
|
|
-- UTF-8 aware counterpart of string.sub: i and j count characters, not bytes.
--
-- s: source string
-- i: index of the first character; a negative value counts from the end
-- j: index of the last character (defaults to -1, the last one)
--
-- Returns the selected substring, or "" when the range is empty.
local function utf8sub (s, i, j)
	j = j or -1

	local byteCount = len(s)

	-- the total character count is only needed to resolve negative indices
	local first, last = i, j
	if not (i >= 0 and j >= 0) then
		local total = utf8len(s)
		if not (i >= 0) then first = total + i + 1 end
		if not (j >= 0) then last = total + j + 1 end
	end

	-- can't have start before end!
	if first > last then
		return ""
	end

	-- byte offsets handed to string.sub
	local fromByte, toByte = 1, byteCount

	local seen = 0
	local cursor = 1
	while cursor <= byteCount do
		seen = seen + 1
		if seen == first then
			fromByte = cursor
		end

		cursor = cursor + utf8charbytes(s, cursor)

		if seen == last then
			toByte = cursor - 1
			break
		end
	end

	-- start beyond the last character / end before the first one
	if first > seen then fromByte = byteCount + 1 end
	if last < 1 then toByte = 0 end

	return sub(s, fromByte, toByte)
end
|
|
|
|
--[[
|
|
-- replace UTF-8 characters based on a mapping table
|
|
local function utf8replace (s, mapping)
|
|
-- argument checking
|
|
if type(s) ~= "string" then
|
|
error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
|
|
end
|
|
if type(mapping) ~= "table" then
|
|
error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
|
|
end
|
|
|
|
local pos = 1
|
|
local bytes = len(s)
|
|
local charbytes
|
|
local newstr = ""
|
|
|
|
while pos <= bytes do
|
|
charbytes = utf8charbytes(s, pos)
|
|
local c = sub(s, pos, pos + charbytes - 1)
|
|
|
|
newstr = newstr .. (mapping[c] or c)
|
|
|
|
pos = pos + charbytes
|
|
end
|
|
|
|
return newstr
|
|
end
|
|
|
|
|
|
-- identical to string.upper except it knows about unicode simple case conversions
|
|
local function utf8upper (s)
|
|
return utf8replace(s, utf8_lc_uc)
|
|
end
|
|
|
|
-- identical to string.lower except it knows about unicode simple case conversions
|
|
local function utf8lower (s)
|
|
return utf8replace(s, utf8_uc_lc)
|
|
end
|
|
]]
|
|
|
|
-- UTF-8 aware counterpart of string.reverse: reverses the order of the
-- characters of s while keeping the bytes of each character in order.
--
-- Raises "bad argument #1 to 'utf8reverse' ..." when s is not a string. Errors
-- raised by utf8charbytes for malformed input propagate to the caller.
local function utf8reverse (s)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
	end

	local pos = len(s)
	local pieces, count = {}, 0

	while pos > 0 do
		local c = byte(s, pos)

		-- Step back over continuation bytes (80-BF) to reach the lead byte.
		-- Stopping at 1 keeps byte() from being asked for index 0, so a string
		-- that starts with a continuation byte is rejected by utf8charbytes
		-- below rather than failing on a nil comparison here.
		while pos > 1 and c >= 128 and c <= 191 do
			pos = pos - 1
			c = byte(s, pos)
		end

		local charbytes = utf8charbytes(s, pos)

		count = count + 1
		pieces[count] = sub(s, pos, pos + charbytes - 1)

		pos = pos - 1
	end

	-- a single concat instead of growing a string one character at a time
	return table.concat(pieces)
end
|
|
|
|
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
--
-- Encodes one code point as a UTF-8 string of one to four bytes.
--
-- unicode: numeric code point
--
-- Raises 'Unicode cannot be greater than U+10FFFF!' for values above U+10FFFF.
local function utf8char(unicode)
	local floor = math.floor

	if unicode <= 0x7F then
		-- single byte
		return char(unicode)
	elseif unicode <= 0x7FF then
		-- two bytes: 110xxxxx 10xxxxxx
		return char(
			0xC0 + floor(unicode / 0x40),
			0x80 + (unicode % 0x40))
	elseif unicode <= 0xFFFF then
		-- three bytes: 1110xxxx 10xxxxxx 10xxxxxx
		return char(
			0xE0 + floor(unicode / 0x1000),
			0x80 + (floor(unicode / 0x40) % 0x40),
			0x80 + (unicode % 0x40))
	elseif unicode <= 0x10FFFF then
		-- four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		-- peel off six bits at a time, lowest group first
		local rest = unicode
		local tail3 = 0x80 + (rest % 0x40)
		rest = floor(rest / 0x40)
		local tail2 = 0x80 + (rest % 0x40)
		rest = floor(rest / 0x40)
		local tail1 = 0x80 + (rest % 0x40)
		rest = floor(rest / 0x40)
		return char(0xF0 + rest, tail1, tail2, tail3)
	end

	error 'Unicode cannot be greater than U+10FFFF!'
end
|
|
|
|
-- place values of the 6-bit groups carried by UTF-8 continuation bytes
local shift_6 = 0x40       -- 2^6
local shift_12 = 0x1000    -- 2^12
local shift_18 = 0x40000   -- 2^18

-- Lua 5.1 / LuaJIT provide the global unpack, later versions table.unpack
local unpack = table.unpack or unpack

-- UTF-8 aware counterpart of string.byte: returns the code points of
-- characters i..j of str as separate return values.
--
-- str:      source string
-- i:        index of the first character (defaults to 1); negative values
--           count from the end of the string
-- j:        index of the last character (defaults to i); negative values
--           count from the end of the string
-- byte_pos: optional byte offset at which character i starts. When given, the
--           scan for character i is skipped and i/j only determine how many
--           characters are decoded.
--
-- Characters beyond the end of the string are ignored, so fewer than
-- j - i + 1 values (possibly none) may be returned. Errors raised by
-- utf8charbytes for malformed input propagate to the caller.
local utf8unicode
utf8unicode = function(str, i, j, byte_pos)
	i = i or 1
	j = j or i

	local totalBytes = #str

	if not byte_pos then
		-- resolve indices counted from the end
		if i < 0 or j < 0 then
			local chars = utf8len(str)
			if i < 0 then i = chars + i + 1 end
			if j < 0 then j = chars + j + 1 end
		end
		if i < 1 then i = 1 end
		if i > j then return end

		-- walk to the first byte of character i
		byte_pos = 1
		for _ = 2, i do
			if byte_pos > totalBytes then return end
			byte_pos = byte_pos + utf8charbytes(str, byte_pos)
		end
	elseif i > j then
		return
	end

	local codes, count = {}, 0

	for _ = i, j do
		if byte_pos > totalBytes then break end

		local bytes = utf8charbytes(str, byte_pos)
		local b0, b1, b2, b3 = byte(str, byte_pos, byte_pos + bytes - 1)
		local unicode

		-- strip the length marker from the lead byte and the 10xxxxxx marker
		-- from each continuation byte, then combine the payload bits
		if bytes == 1 then
			unicode = b0
		elseif bytes == 2 then
			unicode = (b0 - 0xC0) * shift_6 + (b1 - 0x80)
		elseif bytes == 3 then
			unicode = (b0 - 0xE0) * shift_12 + (b1 - 0x80) * shift_6 + (b2 - 0x80)
		else
			unicode = (b0 - 0xF0) * shift_18 + (b1 - 0x80) * shift_12
				+ (b2 - 0x80) * shift_6 + (b3 - 0x80)
		end

		count = count + 1
		codes[count] = unicode
		byte_pos = byte_pos + bytes
	end

	return unpack(codes, 1, count)
end
|
|
|
|
-- Returns an iterator over consecutive substrings of str, each sub_len
-- characters long (default 1).
--
-- Each call to the iterator returns the substring together with the byte
-- indices of its first and last byte. It returns nothing once fewer than
-- sub_len characters remain. The iterator accepts an optional `skip` argument:
-- a number of bytes to jump ahead before the next substring is read.
local function utf8gensub(str, sub_len)
	sub_len = sub_len or 1

	local totalBytes = #str
	local cursor = 1

	return function(skip)
		if skip then cursor = cursor + skip end

		local first = cursor
		local taken = 0

		while true do
			-- ran out of input before a full group was collected
			if cursor > totalBytes then return end

			taken = taken + 1
			cursor = cursor + utf8charbytes(str, cursor)

			if taken == sub_len then break end
		end

		local last = cursor - 1
		return sub(str, first, last), first, last
	end
end
|
|
|
|
-- Binary search for item in the ascending array sortedTable.
--
-- Returns true and the index of the match, or false when item is absent.
-- NOTE: when comp is supplied the narrowing loop is skipped entirely, so only
-- the first and last elements are compared with item; comp itself is never
-- called.
local function binsearch(sortedTable, item, comp)
	local lo, hi = 1, #sortedTable

	if not comp then
		-- shrink [lo, hi] until the two bounds are adjacent
		while hi - lo > 1 do
			local mid = math.floor((lo + hi) / 2)
			if sortedTable[mid] > item then
				hi = mid
			else
				lo = mid
			end
		end
	end

	if sortedTable[lo] == item then
		return true, lo
	end
	if sortedTable[hi] == item then
		return true, hi
	end
	return false
end
|
|
-- Code points covered by the %-classes understood inside patterns.
-- `ranges` holds inclusive {first, last} pairs, `codes` single code points.
local charClasses = {
	-- %a: letters (ONLY ASCII)
	a = { ranges = { {65, 90}, {97, 122} } },
	-- %c: control characters
	c = { ranges = { {0, 31} }, codes = { 127 } },
	-- %d: digits
	d = { ranges = { {48, 57} } },
	-- %g: printable characters except space
	g = { ranges = {
		{1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759},
		{5761, 8191}, {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287},
	} },
	-- %l: lowercase letters (ONLY ASCII)
	l = { ranges = { {97, 122} } },
	-- %p: punctuation characters (ONLY ASCII)
	p = { ranges = { {33, 47}, {58, 64}, {91, 96}, {123, 126} } },
	-- %s: space characters
	s = {
		ranges = { {9, 13}, {8192, 8202} },
		codes = { 32, 133, 160, 5760, 8232, 8233, 8239, 8287, 12288 },
	},
	-- %u: uppercase letters (ONLY ASCII)
	u = { ranges = { {65, 90} } },
	-- %w: alphanumeric characters (ONLY ASCII)
	w = { ranges = { {48, 57}, {65, 90}, {97, 122} } },
	-- %x: hexadecimal digits
	x = { ranges = { {48, 57}, {65, 70}, {97, 102} } },
}

-- Builds a predicate that tells whether a code point belongs to a character
-- set.
--
-- class: the set description. It is read character by character up to the
--        first unescaped ']' (or the end of the string) and may contain
--        literal characters, "a-z" ranges, %-classes / %-escapes and a leading
--        '^' that negates the set.
-- plain: when truthy every character of class is taken literally.
--
-- Returns the predicate function(charCode) and the byte index, within class,
-- of the last character that was read (the caller uses it to skip the set in
-- the enclosing pattern). A negated set never accepts -1, the value the
-- matcher passes when it is past the end of the subject string.
local function classMatchGenerator(class, plain)
	local codes = {}         -- single code points, sorted before use
	local ranges = {}        -- inclusive {first, last} pairs
	local ignore = false     -- previous character was the escape '%'
	local range = false      -- a '-' is waiting for its upper bound
	local firstletter = true
	local unmatch = false    -- set was negated with a leading '^'
	local skip

	-- adds a literal character, closing a pending "a-b" range if there is one
	local function addLiteral(c)
		local code = utf8unicode(c)
		if not range then
			table.insert(codes, code)
		else
			table.remove(codes) -- removing '-'
			table.insert(ranges, {table.remove(codes), code})
			range = false
		end
	end

	for c, _, be in utf8gensub(class) do
		skip = be
		if plain then
			addLiteral(c)
			ignore = false
		elseif ignore then
			local def = charClasses[c]
			if def then
				for _, r in ipairs(def.ranges) do
					table.insert(ranges, r)
				end
				if def.codes then
					for _, code in ipairs(def.codes) do
						table.insert(codes, code)
					end
				end
			else
				-- escaped literal such as %] or %-
				addLiteral(c)
			end
			ignore = false
		elseif c == "%" then
			ignore = true
		elseif c == "-" then
			table.insert(codes, utf8unicode(c))
			-- '-' only forms a range when a literal precedes it; otherwise
			-- (leading '-', or right after a %-class) it stands for itself
			if #codes > 1 then
				range = true
			end
		elseif c == "^" then
			if firstletter then
				unmatch = true
			else
				-- as in Lua patterns, '^' is a literal unless it comes first
				addLiteral(c)
			end
		elseif c == ']' then
			break
		else
			addLiteral(c)
		end

		firstletter = false
	end

	table.sort(codes)

	local function inRanges(charCode)
		for _, r in ipairs(ranges) do
			if r[1] <= charCode and charCode <= r[2] then
				return true
			end
		end
		return false
	end

	if not unmatch then
		return function(charCode)
			return binsearch(codes, charCode) or inRanges(charCode)
		end, skip
	else
		return function(charCode)
			return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
		end, skip
	end
end
|
|
|
|
--[[
|
|
-- utf8sub with extra argument, and extra result value
|
|
local function utf8subWithBytes (s, i, j, sb)
|
|
-- argument defaults
|
|
j = j or -1
|
|
|
|
local pos = sb or 1
|
|
local bytes = len(s)
|
|
local length = 0
|
|
|
|
-- only set l if i or j is negative
|
|
local l = (i >= 0 and j >= 0) or utf8len(s)
|
|
local startChar = (i >= 0) and i or l + i + 1
|
|
local endChar = (j >= 0) and j or l + j + 1
|
|
|
|
-- can't have start before end!
|
|
if startChar > endChar then
|
|
return ""
|
|
end
|
|
|
|
-- byte offsets to pass to string.sub
|
|
local startByte,endByte = 1,bytes
|
|
|
|
while pos <= bytes do
|
|
length = length + 1
|
|
|
|
if length == startChar then
|
|
startByte = pos
|
|
end
|
|
|
|
pos = pos + utf8charbytes(s, pos)
|
|
|
|
if length == endChar then
|
|
endByte = pos - 1
|
|
break
|
|
end
|
|
end
|
|
|
|
if startChar > length then startByte = bytes+1 end
|
|
if endChar < 1 then endByte = 0 end
|
|
|
|
return sub(s, startByte, endByte), endByte + 1
|
|
end
|
|
]]
|
|
|
|
-- Matchers built by matcherGenerator, indexed by the pattern string they were
-- built from: `cache` receives matchers for pattern searches, `cachePlain`
-- those for plain searches. Both tables are weak (__mode = 'kv'), so an entry
-- does not keep its matcher alive.
local cache = setmetatable({}, { __mode = 'kv' })
local cachePlain = setmetatable({}, { __mode = 'kv' })
|
|
-- Compiles regex into a matcher object and stores it in `cache` (or in
-- `cachePlain` when plain is truthy) once it has been built successfully.
--
-- regex: the pattern
-- plain: when truthy every character of regex is matched literally
--
-- The matcher is a small state machine:
--   * matcher.functions is a list of steps. Step 1 marks the start of a match
--     attempt and the last step ends the search; the steps in between come
--     from the pattern.
--   * matcher.func is the index of the current step and matcher.str the index
--     (in characters) of the current position in the subject string.
--   * matcher:process(str, start) runs the steps, handing each one the code
--     point at matcher.str (or -1 once past the end), and returns the first
--     and last character index of the match followed by the captures, or
--     nothing when there is no match.
--   * matcher.reset is the current "step failed" handler. The fullResetOn*
--     methods replace it with a handler that jumps to a remembered
--     step/position and then reinstates the previous handler, so the handlers
--     behave like a stack of fallback points.
--
-- Raises an error for patterns it cannot compile (a quantifier with nothing
-- to repeat, a misplaced '^' or '$', unbalanced capture parentheses).
local function matcherGenerator(regex, plain)
	-- Lua 5.1 / LuaJIT provide the global unpack, later versions table.unpack
	local unpack = table.unpack or unpack

	local matcher = {
		functions = {},
		captures = {}
	}

	-- Step wrappers: each turns a predicate on a code point into a step.

	-- exactly one occurrence
	local function simple(func)
		return function(cC)
			if func(cC) then
				matcher:nextFunc()
				matcher:nextStr()
			else
				matcher:reset()
			end
		end
	end
	-- '*': stay on this step while the predicate holds; every accepted
	-- character first records a fallback to the following step
	local function star(func)
		return function(cC)
			if func(cC) then
				matcher:fullResetOnNextFunc()
				matcher:nextStr()
			else
				matcher:nextFunc()
			end
		end
	end
	-- '-': move on without consuming, recording a fallback that retries this
	-- step one character further on
	local function minus(func)
		return function(cC)
			if func(cC) then
				matcher:fullResetOnNextStr()
			end
			matcher:nextFunc()
		end
	end
	-- '?': consume one character if the predicate holds, recording a fallback
	-- that continues without it
	local function question(func)
		return function(cC)
			if func(cC) then
				matcher:fullResetOnNextFunc()
				matcher:nextStr()
			end
			matcher:nextFunc()
		end
	end

	-- back-reference %1..%9: the text at the current position must equal the
	-- text recorded for capture `id`
	local function capture(id)
		return function(_)
			local l = matcher.captures[id][2] - matcher.captures[id][1]
			local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
			local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
			if captured == check then
				for _ = 0, l do
					matcher:nextStr()
				end
				matcher:nextFunc()
			else
				matcher:reset()
			end
		end
	end
	-- '(': remember where capture `id` begins
	local function captureStart(id)
		return function(_)
			matcher.captures[id][1] = matcher.str
			matcher:nextFunc()
		end
	end
	-- ')': remember where capture `id` ends
	local function captureStop(id)
		return function(_)
			matcher.captures[id][2] = matcher.str - 1
			matcher:nextFunc()
		end
	end

	-- %bxy: str holds the text that follows "%b". Returns the step and the
	-- number of bytes taken up by the two delimiter characters.
	local function balancer(str)
		local sum = 0
		local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
		local skip = len(bc) + len(ec)
		bc, ec = utf8unicode(bc), utf8unicode(ec)
		return function(cC)
			if cC == ec and sum > 0 then
				sum = sum - 1
				if sum == 0 then
					matcher:nextFunc()
				end
				matcher:nextStr()
			elseif cC == bc then
				sum = sum + 1
				matcher:nextStr()
			else
				if sum == 0 or cC == -1 then
					sum = 0
					matcher:reset()
				else
					matcher:nextStr()
				end
			end
		end, skip
	end

	-- Step 1: start of a match attempt. Records a fallback that retries from
	-- the next character and gives up when the pattern is anchored with '^'
	-- and the position has moved past the start, or when the end of the
	-- string has been reached.
	matcher.functions[1] = function(_)
		matcher:fullResetOnNextStr()
		matcher.seqStart = matcher.str
		matcher:nextFunc()
		if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
			matcher.stop = true
			matcher.seqStart = nil
		end
	end

	-- predicate of the most recent atom; held back until it is known whether
	-- a quantifier follows
	local lastFunc
	local ignore = false -- previous character was the escape '%'
	local skip = nil     -- bytes the pattern iterator must jump over next
	local cs = {}        -- stack of ids of captures that are still open

	-- emits the pending atom, if any, as a plain single-occurrence step
	local function flush()
		if lastFunc then
			table.insert(matcher.functions, simple(lastFunc))
			lastFunc = nil
		end
	end

	local quantifiers = { ['*'] = star, ['-'] = minus, ['?'] = question }

	local gen = utf8gensub(regex)
	local function it()
		return gen(skip)
	end

	for c, bs, be in it do
		skip = nil
		if plain then
			table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
		elseif ignore then
			if find('123456789', c, 1, true) then
				flush()
				table.insert(matcher.functions, capture(tonumber(c)))
			elseif c == 'b' then
				flush()
				local b
				b, skip = balancer(sub(regex, be + 1, be + 9))
				table.insert(matcher.functions, b)
			else
				-- emit the pending atom before replacing it; otherwise the
				-- atom in front of a %-class (the "a" of "a%d") would be lost
				flush()
				lastFunc = classMatchGenerator('%' .. c)
			end
			ignore = false
		elseif c == '*' or c == '+' or c == '-' or c == '?' then
			if not lastFunc then
				error('invalid regex after ' .. sub(regex, 1, bs))
			end
			if c == '+' then
				-- one mandatory occurrence followed by '*'
				table.insert(matcher.functions, simple(lastFunc))
				table.insert(matcher.functions, star(lastFunc))
			else
				table.insert(matcher.functions, quantifiers[c](lastFunc))
			end
			lastFunc = nil
		elseif c == '^' then
			if bs == 1 then
				matcher.fromStart = true
			else
				error('invalid regex after ' .. sub(regex, 1, bs))
			end
		elseif c == '$' then
			if be == len(regex) then
				matcher.toEnd = true
			else
				error('invalid regex after ' .. sub(regex, 1, bs))
			end
		elseif c == '[' then
			flush()
			-- skip tells the iterator how many bytes the set occupied
			lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
		elseif c == '(' then
			flush()
			table.insert(matcher.captures, {})
			table.insert(cs, #matcher.captures)
			table.insert(matcher.functions, captureStart(cs[#cs]))
			-- "()" is a position capture
			if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
		elseif c == ')' then
			flush()
			local cap = table.remove(cs)
			if not cap then
				error('invalid capture: "(" missing')
			end
			table.insert(matcher.functions, captureStop(cap))
		elseif c == '.' then
			flush()
			-- any character, but not the end-of-string marker
			lastFunc = function(cC) return cC ~= -1 end
		elseif c == '%' then
			ignore = true
		else
			flush()
			lastFunc = classMatchGenerator(c)
		end
	end

	if #cs > 0 then
		error('invalid capture: ")" missing')
	end
	flush()

	-- Last step: the whole pattern matched. With a trailing '$' the match
	-- only counts when the position is at the end of the string.
	table.insert(matcher.functions, function()
		if matcher.toEnd and matcher.str ~= matcher.stringLen then
			matcher:reset()
		else
			matcher.stop = true
		end
	end)

	matcher.nextFunc = function(self)
		self.func = self.func + 1
	end
	matcher.nextStr = function(self)
		self.str = self.str + 1
	end
	-- fallback that restores only the string position
	matcher.strReset = function(self)
		local oldReset = self.reset
		local str = self.str
		self.reset = function(s)
			s.str = str
			s.reset = oldReset
		end
	end
	-- fallback that resumes at the next step from the current position
	matcher.fullResetOnNextFunc = function(self)
		local oldReset = self.reset
		local func = self.func +1
		local str = self.str
		self.reset = function(s)
			s.func = func
			s.str = str
			s.reset = oldReset
		end
	end
	-- fallback that resumes at the current step from the next position
	matcher.fullResetOnNextStr = function(self)
		local oldReset = self.reset
		local str = self.str + 1
		local func = self.func
		self.reset = function(s)
			s.func = func
			s.str = str
			s.reset = oldReset
		end
	end

	-- Runs the matcher over str, starting at character index start (default
	-- 1; a negative value counts from the end). Returns the first and last
	-- character index of the match followed by the captures, or nothing.
	matcher.process = function(self, str, start)
		self.func = 1
		start = start or 1
		self.startStr = (start >= 0) and start or utf8len(str) + start + 1
		self.seqStart = self.startStr
		self.str = self.startStr
		self.stringLen = utf8len(str) + 1
		self.string = str
		self.stop = false

		-- bottom of the fallback stack: start over from step 1
		self.reset = function(s)
			s.func = 1
		end

		local ch
		while not self.stop do
			if self.str < self.stringLen then
				ch = utf8sub(str, self.str,self.str)
				self.functions[self.func](utf8unicode(ch))
			else
				-- past the end of the string
				self.functions[self.func](-1)
			end
		end

		if self.seqStart then
			local captures = {}
			-- ipairs keeps the captures in pattern order
			for _, pair in ipairs(self.captures) do
				if pair.empty then
					table.insert(captures, pair[1])
				else
					table.insert(captures, utf8sub(str, pair[1], pair[2]))
				end
			end
			return self.seqStart, self.str - 1, unpack(captures)
		end
	end

	-- Cache only once the pattern has compiled, so a pattern that raised an
	-- error above never leaves a half-built matcher behind.
	if not plain then
		cache[regex] = matcher
	else
		cachePlain[regex] = matcher
	end

	return matcher
end
|
|
|
|
-- string.find
--
-- Searches str for regex, starting at character index init. Returns the
-- values produced by the matcher's process method: the first and last
-- character index of the match followed by any captures, or nothing when
-- there is no match.
--
-- plain: when truthy, regex is searched for literally.
local function utf8find(str, regex, init, plain)
	-- Pattern and plain matchers are cached separately. Reading the cache
	-- that corresponds to the requested mode keeps a plain search from
	-- reusing a pattern matcher built for the same string.
	local store = plain and cachePlain or cache
	local matcher = store[regex] or matcherGenerator(regex, plain)
	return matcher:process(str, init)
end
|
|
|
|
-- string.match
--
-- Looks for regex in str starting at character index init (default 1).
-- Returns the captures when the search produced any, otherwise the matched
-- text; returns nothing when there is no match.
local function utf8match(str, regex, init)
	local result = {utf8find(str, regex, init or 1)}
	local first, last = result[1], result[2]

	if not first then
		return
	end

	-- everything after the two indices is a capture
	if result[3] then
		return unpack(result, 3)
	end

	return utf8sub(str, first, last)
end
|
|
|
|
-- string.gmatch
--
-- Returns an iterator that yields the successive matches of regex in str.
-- Each call returns the captures of the next match, or the matched text when
-- there are none. When `all` is truthy the first and last character index of
-- the match are returned in front of the captures.
local function utf8gmatch(str, regex, all)
	-- a leading '^' gets escaped, so it is matched as an ordinary character
	if utf8sub(regex, 1, 1) == '^' then
		regex = '%' .. regex
	end

	-- index of the first value of a find result that is handed to the caller
	local firstResult = all and 1 or 3
	local nextStart = 1

	return function()
		local found = {utf8find(str, regex, nextStart)}
		if not found[1] then
			return
		end

		-- continue after the end of this match on the next call
		nextStart = found[2] + 1

		if found[firstResult] then
			return unpack(found, firstResult)
		end
		return utf8sub(str, found[1], found[2])
	end
end
|
|
|
|
-- Computes the replacement for a single match, following the rules of
-- string.gsub.
--
-- repl: string, table or function describing the replacement
-- args: the match; args[0] is the whole matched text and args[1..n] are the
--       captures
--
-- * string:   '%' followed by a digit d inserts args[d] (%0 is the whole
--             match, and %1 is the whole match too when there are no
--             captures); '%' followed by any other character inserts that
--             character.
-- * table:    looked up with the first capture (or the whole match).
-- * function: called with the captures (or the whole match).
-- As with string.gsub, a table lookup or function call that yields nil or
-- false leaves the match unchanged. Any other type of repl yields ''.
--
-- Raises an error when a string repl refers to a capture that does not exist.
local function replace(repl, args)
	local kind = type(repl)

	if kind == 'string' then
		local pieces, count = {}, 0
		local escaped = false -- previous character was '%'

		for c in utf8gensub(repl) do
			local piece
			if escaped then
				escaped = false
				local num = tonumber(c)
				if num then
					piece = args[num]
					if piece == nil and num == 1 then
						piece = args[0]
					end
					if piece == nil then
						error("invalid capture index %" .. c .. " in replacement string")
					end
				else
					piece = c
				end
			elseif c == '%' then
				escaped = true
			else
				piece = c
			end

			if piece ~= nil then
				count = count + 1
				pieces[count] = piece
			end
		end

		-- a single concat instead of growing a string one character at a time
		return table.concat(pieces)
	end

	local ret
	if kind == 'table' then
		ret = repl[args[1] or args[0]]
	elseif kind == 'function' then
		-- Lua 5.1 / LuaJIT provide the global unpack, later versions table.unpack
		local unpackArgs = table.unpack or unpack
		if #args > 0 then
			ret = repl(unpackArgs(args, 1))
		else
			ret = repl(args[0])
		end
	else
		return ''
	end

	-- nil / false means "no replacement": keep the original match
	if not ret then
		return args[0]
	end
	return ret
end
|
|
-- string.gsub
--
-- Returns a copy of str in which the matches of regex have been replaced as
-- described by repl (string, table or function, see replace), together with
-- the number of replacements made.
--
-- limit: maximum number of replacements (default -1: replace every match)
local function utf8gsub(str, regex, repl, limit)
	limit = limit or -1

	local output = ''
	local cursor = 1 -- first character not yet copied to the output
	local count = 0

	-- with all = true every hit is {first, last, captures...}
	local nextMatch = utf8gmatch(str, regex, true)
	local hit = {nextMatch()}

	while #hit > 0 and limit ~= count do
		local first, last = hit[1], hit[2]
		local args = {[0] = utf8sub(str, first, last), unpack(hit, 3)}

		-- text between the previous match and this one, then the replacement
		local untouched = utf8sub(str, cursor, first - 1)
		output = output .. untouched .. replace(repl, args)

		cursor = last + 1
		count = count + 1
		hit = {nextMatch()}
	end

	-- whatever follows the last replaced match is copied unchanged
	return output .. utf8sub(str, cursor), count
end
|
|
|
|
-- Public interface of the module.
local utf8 = {
	-- UTF-8 aware functions defined in this file
	len = utf8len,
	sub = utf8sub,
	reverse = utf8reverse,
	char = utf8char,
	unicode = utf8unicode,
	gensub = utf8gensub,
	byte = utf8unicode, -- same function as `unicode`
	find = utf8find,
	match = utf8match,
	gmatch = utf8gmatch,
	gsub = utf8gsub,

	-- taken unchanged from the string library (byte-oriented)
	dump = dump,
	format = format,
	lower = lower,
	upper = upper,
	rep = rep,
}

return utf8
|