Module:User:Erutuon/UTF-8

From Wiktionary, the free dictionary
Jump to navigation Jump to search

A function for making a Lua byte pattern that matches any UTF-8 character in the inclusive range between two characters. It may be useful for making transliteration modules more efficient. Then again, the resulting pattern is cryptic, and someone could vandalize it or simply misuse it by accident, making the module fail.

  • Ა-ჿ (0x1C90-0x10FF): [\225-\224][\128-\191][\128-\191]
  • Lua error at line 64: The first character to makeUTF8Pattern (U+10140) should have a lower codepoint than the second (U+3FF).
  • ㋿-〿 (0x32FF-0x303F): \226[\128-\191][\128-\191]

local export = {}

local m_debug = require("Module:debug")

-- Byte-level Lua pattern fragments and format strings shared by the functions below.
-- Matches a single byte in the range 128-191 (a UTF-8 continuation byte).
local continuationByte = "[\128-\191]"
---[[
-- The three dashes on the line above make it an ordinary line comment, so the
-- two definitions below are active; with only two dashes everything up to the
-- closing brackets on the `--]]` line would become a block comment.
-- UTF8Char: one byte in 1-127 or 194-244 followed by any number of
-- continuation bytes. The class starts at \1, so it excludes the null byte,
-- which is supposed to be able to be included in Lua strings, but causes
-- patterns to fail.
local UTF8Char = "[\1-\127\194-\244]" .. continuationByte .. "*"
-- nonASCII: one byte in 194-244 followed by at least one continuation byte.
-- NOTE(review): not referenced anywhere in the visible part of this module.
local nonASCII = "[\194-\244]" .. continuationByte .. "+"
--]]
-- Format string producing a backslash followed by a decimal number.
local escapePatt = "\\%d"
-- Format string producing "0x" followed by an uppercase hexadecimal number.
local hexPatt = "0x%X"

-- Local alias for math.floor.
local floor = math.floor

-- Passes `text` to Module:debug's highlight function with the `inline` option
-- set, and returns whatever that function returns.
local function highlight(text)
	local options = { inline = true }
	return m_debug.highlight(text, options)
end

-- Formats `number` with the module-level hexPatt format string
-- ("0x" followed by uppercase hexadecimal digits).
local function hex(number)
	return string.format(hexPatt, number)
end

-- Formats `number` with the module-level escapePatt format string
-- (a backslash followed by the decimal value).
local function byteEscape(number)
	return string.format(escapePatt, number)
end

-- Returns a copy of `str` in which every byte with a value of 128 or above is
-- replaced by its decimal escape (as produced by byteEscape), while bytes
-- below 128 are copied through unchanged.
local function escapeBytes(str)
	local out = {}
	-- Read one byte per iteration. Asking string.byte for the whole string at
	-- once returns every byte as a separate value, which raises
	-- "string slice too long" once the string is long enough.
	for i = 1, string.len(str) do
		local byte = string.byte(str, i)
		if byte < 128 then
			out[i] = string.char(byte)
		else
			out[i] = byteEscape(byte)
		end
	end
	return table.concat(out)
end

-- Based on the helpful byte chart at [[w:UTF-8#Codepage layout]].
-- Returns two values for a non-ASCII codepoint: the value of the leading byte
-- of its UTF-8 encoding, and the number of continuation bytes that follow it.
-- Raises an error for codepoints below U+0080 (ASCII has no leading byte) and
-- for codepoints above U+10FFFF. Surrogate codepoints are not rejected.
local function getLeadingContinuation(codepoint)
	if codepoint < 0x80 then
		error("ASCII does not have leading bytes.")
	elseif codepoint < 0x800 then
		-- Two-byte form: the leading byte is 0xC0 plus the bits above the low 6.
		return 0xC0 + math.floor(codepoint / 0x40), 1
	elseif codepoint < 0x10000 then
		-- Three-byte form: 0xE0 plus the bits above the low 12. The whole
		-- codepoint is divided; subtracting the range start first would give
		-- the wrong byte for every codepoint from U+1000 upwards.
		return 0xE0 + math.floor(codepoint / 0x1000), 2
	elseif codepoint < 0x110000 then
		-- Four-byte form: 0xF0 plus the bits above the low 18. The limit is
		-- 0x110000 because U+10FFFF is the last valid codepoint.
		return 0xF0 + math.floor(codepoint / 0x40000), 3
	else
		error(("Codepoint U+%X is outside valid range."):format(codepoint))
	end
end

-- Builds a display string describing a byte pattern for the range of
-- characters from `lower` to `higher`.
--
-- The result has the form "<lower>-<higher> (<hex range>): <pattern>", with
-- the hex range and the pattern passed through highlight(). The pattern is a
-- leading byte (or a class spanning the two leading bytes) followed by
-- continuation-byte classes; because only the leading bytes are used, it can
-- match characters outside the exact codepoint range.
--
-- Raises an error if the first codepoint is higher than the second, and
-- propagates the errors raised by getLeadingContinuation (for example for
-- ASCII input).
function export.makeUTF8Pattern(lower, higher)
	local codepoint1, codepoint2 = mw.ustring.codepoint(lower), mw.ustring.codepoint(higher)
	local leading1, continuationCount1 = getLeadingContinuation(codepoint1)
	local leading2, continuationCount2 = getLeadingContinuation(codepoint2)
	-- Compare the codepoints themselves: comparing only the continuation
	-- counts lets a reversed range of equal encoded length through and
	-- produces an inverted byte class.
	if codepoint1 > codepoint2 then
		error(string.format("The first character to makeUTF8Pattern (U+%X) should have a lower codepoint than the second (U+%X).", codepoint1, codepoint2))
	end
	local continuationSequence = string.rep(continuationByte, continuationCount1)
	if continuationCount1 < continuationCount2 then
		-- The upper character has a longer encoding, so allow the last
		-- continuation byte to repeat.
		continuationSequence = continuationSequence .. "+"
	end
	local leading
	if leading1 == leading2 then
		leading = byteEscape(leading1)
	else
		leading = "[" .. byteEscape(leading1) .. "-" .. byteEscape(leading2) .. "]"
	end
	return lower .. "-" .. higher ..
		" (" .. highlight(hex(codepoint1) .. "-" .. hex(codepoint2)) .. "): " ..
		highlight(leading .. escapeBytes(continuationSequence))
end

-- Scans `characters` (with every hyphen removed) for sequences matching
-- UTF8Char and returns two values: the sequence that compares lowest and the
-- one that compares highest under Lua's string comparison. Both values are
-- nil when nothing matches.
function export.makeRange(characters)
	local stripped = string.gsub(characters, "%-", "")
	local lowest, highest
	for sequence in string.gmatch(stripped, UTF8Char) do
		if lowest == nil then
			-- The first match seeds both ends of the range.
			lowest, highest = sequence, sequence
		elseif sequence < lowest then
			lowest = sequence
		elseif sequence > highest then
			highest = sequence
		end
	end
	return lowest, highest
end

-- Looks up the script object for `scCode` through Module:scripts, takes the
-- string returned by its getCharacters method, reduces it to a lower and a
-- higher character with makeRange, and returns the result of makeUTF8Pattern
-- for that pair.
function export.charPatternForScript(scCode)
	local m_scripts = require("Module:scripts")
	local script = m_scripts.getByCode(scCode)
	local characters = script:getCharacters()
	local lower, higher = export.makeRange(characters)
	return export.makeUTF8Pattern(lower, higher)
end

-- Entry point taking a frame object: uses the first positional argument as a
-- script code, falling back to "polytonic" when it is absent, and returns the
-- result of charPatternForScript for that code.
function export.show(frame)
	local scCode = frame.args[1]
	if not scCode then
		scCode = "polytonic"
	end
	return export.charPatternForScript(scCode)
end

return export