Module:User:Erutuon/UTF-8
Jump to navigation
Jump to search
- The following documentation is located at Module:User:Erutuon/UTF-8/documentation. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This module provides a function that builds a Lua byte pattern matching any UTF-8-encoded character in the range between two characters, inclusive. It may be useful for making transliteration modules more efficient. Then again, the resulting pattern is cryptic, and anyone could vandalize it or simply misuse it by accident, making the module fail.
- Ა-ჿ (
0x1C90-0x10FF
):[\225-\224][\128-\191][\128-\191]
- Lua error at line 64: The first character to makeUTF8Pattern (U+10140) should have a lower codepoint than the second (U+3FF).
- ㋿-〿 (
0x32FF-0x303F
):\226[\128-\191][\128-\191]
-- Table of functions returned by this module.
local export = {}
-- Used by highlight() below to render byte patterns and hex ranges.
local m_debug = require("Module:debug")
-- Pattern for a single UTF-8 continuation byte (0x80-0xBF).
local continuationByte = "[\128-\191]"
-- The line below is an ordinary line comment (three hyphens), so the two
-- definitions that follow are active. Removing one hyphen would turn it into
-- a block comment ending at the closing marker and disable both definitions.
---[[
-- One UTF-8 character: an ASCII byte or a leading byte, followed by any number
-- of continuation bytes. The class starts at \1 rather than \0: this excludes
-- the null byte, which is supposed to be allowed in Lua strings but causes
-- patterns to fail.
local UTF8Char = "[\1-\127\194-\244]" .. continuationByte .. "*"
-- One non-ASCII UTF-8 character: a leading byte followed by one or more
-- continuation bytes. Not referenced in the visible part of this module.
local nonASCII = "[\194-\244]" .. continuationByte .. "+"
--]]
-- string.format template: a literal backslash followed by a decimal number,
-- i.e. the Lua decimal escape for a byte.
local escapePatt = "\\%d"
-- string.format template: "0x" followed by uppercase hexadecimal digits.
local hexPatt = "0x%X"
local floor = math.floor
-- Passes text to Module:debug's highlight function with the inline option set
-- and returns whatever that function returns.
local function highlight(text)
	local options = { inline = true }
	return m_debug.highlight(text, options)
end
-- Formats a number with the hexPatt template ("0x" plus uppercase hex digits).
local function hex(number)
	return string.format(hexPatt, number)
end
-- Formats a byte value with the escapePatt template: a backslash followed by
-- the value in decimal, e.g. 225 becomes the four characters \225.
local function byteEscape(number)
	return string.format(escapePatt, number)
end
-- Returns a copy of str in which every byte of value 128 or above is replaced
-- by its decimal escape (see byteEscape); bytes below 128 are kept as they are.
local function escapeBytes(str)
	local bytes = { string.byte(str, 1, -1) }
	local pieces = {}
	for index = 1, #bytes do
		local value = bytes[index]
		if value >= 128 then
			pieces[index] = byteEscape(value)
		else
			pieces[index] = string.char(value)
		end
	end
	return table.concat(pieces)
end
-- Based on the helpful byte chart at [[w:UTF-8#Codepage layout]].
--
-- Returns two values for a non-ASCII codepoint: the value of the leading byte
-- of its UTF-8 encoding, and the number of continuation bytes that follow it.
--
-- The leading byte is a fixed prefix plus the top bits of the codepoint:
--   2 bytes: 110xxxxx (192) + codepoint / 2^6
--   3 bytes: 1110xxxx (224) + codepoint / 2^12
--   4 bytes: 11110xxx (240) + codepoint / 2^18
-- The whole codepoint is divided, not its offset from the start of the range:
-- subtracting the range start first is only harmless when that start is a
-- multiple of the divisor, which holds for 0x80 but not for 0x800 or 0x10000.
--
-- Raises an error for ASCII codepoints (below 0x80) and for codepoints above
-- U+10FFFF.
local function getLeadingContinuation(codepoint)
	if codepoint < 0x80 then
		error("ASCII does not have leading bytes.")
	elseif codepoint < 0x800 then
		return 192 + floor(codepoint / 0x40), 1
	elseif codepoint < 0x10000 then
		return 224 + floor(codepoint / 0x1000), 2
	elseif codepoint < 0x110000 then
		-- Upper bound is U+10FFFF inclusive, the last Unicode codepoint.
		return 240 + floor(codepoint / 0x40000), 3
	else
		error(("Codepoint U+%X is outside valid range."):format(codepoint))
	end
end
-- Builds a byte pattern for the UTF-8 characters between lower and higher
-- (inclusive) and returns it inside a human-readable string of the form
--   <lower>-<higher> (<hex range>): <pattern>
-- where the hex range and the pattern are passed through highlight().
--
-- The pattern consists of the leading byte (or a class spanning the two
-- leading bytes) followed by generic continuation-byte classes, so it only
-- constrains the leading byte and the number of continuation bytes; it can
-- match characters outside the requested range that share those leading bytes.
-- If the two characters have different encoded lengths, "+" is appended so
-- that longer sequences are accepted as well.
--
-- Parameters:
--   lower, higher: strings; the first codepoint of each is used.
-- Raises an error if either character is ASCII or out of range (from
-- getLeadingContinuation), or if lower has a greater codepoint than higher.
function export.makeUTF8Pattern(lower, higher)
	local codepoint1, codepoint2 = mw.ustring.codepoint(lower), mw.ustring.codepoint(higher)

	local leading1, continuationCount1 = getLeadingContinuation(codepoint1)
	local leading2, continuationCount2 = getLeadingContinuation(codepoint2)

	-- Compare the codepoints themselves. Comparing only the continuation
	-- counts lets a reversed range of equal encoded length through, which can
	-- yield an inverted byte class such as [\225-\224].
	if codepoint1 > codepoint2 then
		error(string.format("The first character to makeUTF8Pattern (U+%X) should have a lower codepoint than the second (U+%X).", codepoint1, codepoint2))
	end

	local continuationSequence = string.rep(continuationByte, continuationCount1)
	if continuationCount1 < continuationCount2 then
		continuationSequence = continuationSequence .. "+"
	end

	-- A single escaped byte when both ends share a leading byte, otherwise a
	-- class covering the span of leading bytes.
	local leading
	if leading1 == leading2 then
		leading = byteEscape(leading1)
	else
		leading = "[" .. byteEscape(leading1) .. "-" .. byteEscape(leading2) .. "]"
	end

	return lower .. "-" .. higher ..
		" (" .. highlight(hex(codepoint1) .. "-" .. hex(codepoint2)) .. "): " ..
		highlight(leading .. escapeBytes(continuationSequence))
end
-- Finds the extremes of a set of characters. Hyphens are removed from the
-- input, the remainder is split into UTF-8 characters with UTF8Char, and the
-- characters that sort lowest and highest under string comparison are
-- returned, in that order. Both results are nil when no character is found.
function export.makeRange(characters)
	local withoutHyphens = string.gsub(characters, "%-", "")
	local lowest = withoutHyphens:match(UTF8Char)
	local highest = lowest
	for candidate in withoutHyphens:gmatch(UTF8Char) do
		if candidate < lowest then
			lowest = candidate
		elseif candidate > highest then
			highest = candidate
		end
	end
	return lowest, highest
end
-- Looks up the script object for scCode in Module:scripts, takes the string
-- returned by its getCharacters method, reduces it to its lowest and highest
-- characters with makeRange, and returns the makeUTF8Pattern result for them.
function export.charPatternForScript(scCode)
	local scriptObject = require("Module:scripts").getByCode(scCode)
	local lower, higher = export.makeRange(scriptObject:getCharacters())
	return export.makeUTF8Pattern(lower, higher)
end
-- Entry point taking a frame object: uses the first positional argument as
-- the script code, falling back to "polytonic" when it is absent.
function export.show(frame)
	local scCode = frame.args[1]
	if not scCode then
		scCode = "polytonic"
	end
	return export.charPatternForScript(scCode)
end
return export