Module:User:Erutuon/script recognition

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

This module generated the codepoint-to-script lookup table in Module:Unicode data/scripts.

ABCD一丨丶丿乙亅: Latn, Latn, Latn, Latn, Hani, Hani, Hani, Hani, Hani, Hani


-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}

-- Short aliases for the Unicode-aware helpers in mw.ustring.
local getCodepoint = mw.ustring.codepoint
local U = mw.ustring.char

-- Full text of the current page title, read once when the module loads.
-- log() matches against it to decide whether to emit messages.
local title = mw.title.getCurrentTitle().fullText

--[[
	Builds an argument validator bound to a function name and an expected
	type. The returned function takes (argIndex, arg) and forwards them,
	together with the bound values, to libraryUtil.checkType.
]]
local function check(funcName, expectType)
	local function validate(position, value)
		-- libraryUtil is loaded on use, not when the validator is created.
		local libraryUtil = require("libraryUtil")
		libraryUtil.checkType(funcName, position, value, expectType)
	end
	return validate
end

--[[
	Renders a value with Module:debug's highlight_dump.
	val is the value to dump and tsort is passed through unchanged as the
	third argument; the "modified" option is always switched on.
]]
local function dump(val, tsort)
	local debugModule = require("Module:debug")
	local options = { modified = true }
	return debugModule.highlight_dump(val, nil, tsort, options)
end

--[[
	Returns true if the table t has at least one key, false otherwise.
	The result of next() is compared with nil rather than tested for
	truthiness, so a table whose first key happens to be `false` is still
	reported as non-empty.
]]
local function hasContents(t)
	return next(t) ~= nil
end

--[[
	Writes message to the Scribunto debug log, but only when the current
	page title ends in "testcases/documentation".
]]
local function log(message)
	if not title:match("testcases/documentation$") then
		return
	end
	mw.log(message)
end
	

--[[
	Returns true if lower <= value <= upper (both bounds inclusive).
	All three arguments are type-checked as numbers first, in argument
	order, through the validator built by check().
]]
local function isInRange(value, lower, upper)
	local requireNumber = check("isInRange", "number")
	requireNumber(1, value)
	requireNumber(2, lower)
	requireNumber(3, upper)
	
	return lower <= value and value <= upper
end

--[[
	Tests whether a character belongs to a character lookup table of the
	shape produced by export.makeCharacterLookup.

	characterLookup: sequence of { first, last } codepoint ranges, with
		optional fields `values` (set of individual codepoints) and
		`smallest`/`largest` (overall bounds).
	character: a one-character string or a codepoint number.

	Returns true or false. Raises an error if character is a string that is
	not exactly one character long, or is neither a string nor a number.
]]
local function lookupCharacter(characterLookup, character)
	local kind = type(character)
	local codepoint
	if kind == "number" then
		codepoint = character
	elseif kind == "string" then
		local length = mw.ustring.len(character)
		if length ~= 1 then
			error("Character " .. character .. " has length " .. length .. ". It is supposed to be a single character.")
		end
		codepoint = getCodepoint(character)
	else
		error("Character is the wrong type: " .. kind .. ".")
	end
	
	-- Quick rejection when the overall bounds are recorded.
	local smallest = characterLookup.smallest
	if smallest and not isInRange(codepoint, smallest, characterLookup.largest) then
		return false
	end
	
	-- Individually listed codepoints.
	local values = characterLookup.values
	if values and values[codepoint] then
		return true
	end
	
	-- Finally, scan the ranges in order.
	for _, range in ipairs(characterLookup) do
		if isInRange(codepoint, range[1], range[2]) then
			return true
		end
	end
	
	return false
end

--[[
	Calls func once for every character of str, in order, passing the
	character as a one-character string. Characters are counted and
	extracted with mw.ustring, so multi-byte characters are kept whole.
	Does nothing if func is not a function.
]]
local function forEachChar(str, func)
	if type(func) ~= "function" then
		return
	end
	
	for i = 1, mw.ustring.len(str) do
		-- Declared local so that iterating does not create a global "char".
		local char = mw.ustring.sub(str, i, i)
		func(char)
	end
end

--[[
	Converts a character-class-style pattern (for example "A-Za-zé") into a
	lookup table.

	pattern: string of "X-Y" ranges and individual characters.

	Returns a table containing:
	  * a sequence of { firstCodepoint, lastCodepoint } ranges, sorted by
	    first codepoint;
	  * `values`: a set of the individually listed codepoints (omitted when
	    there are none);
	  * `smallest` and `largest`: the lowest and highest codepoints
	    mentioned anywhere in the pattern (omitted when the first range
	    already spans exactly those bounds).

	The order of the two endpoints of a range is not validated.
]]
function export.makeCharacterLookup(pattern)
	local characterLookup = {}
	local values = {}
	local allValues = {}
	
	-- Pull every "X-Y" range out of the pattern, leaving only the
	-- individually listed characters in workingString.
	local workingString = mw.ustring.gsub(
		pattern,
		"([^-])%-([^-])",
		function(item1, item2)
			local codepoint1, codepoint2 = getCodepoint(item1), getCodepoint(item2)
			table.insert(characterLookup, { codepoint1, codepoint2 })
			allValues[codepoint1] = true
			allValues[codepoint2] = true
			return ""
		end
	)
	
	-- Record the remaining individual characters. The callback returns
	-- nothing, so gsub is used here purely to iterate.
	if workingString ~= "" then
		mw.ustring.gsub(
			workingString,
			".",
			function(char)
				local codepoint = getCodepoint(char)
				values[codepoint] = true
				allValues[codepoint] = true
			end
		)
	end
	
	--[[
		Place the tables of ranges in the Unicode order (the patterns
		should already be in that order, but just to be safe).
	]]
	table.sort(
		characterLookup,
		function(item1, item2)
			return item1[1] < item2[1]
		end
	)
	
	local allValuesKeys = require("Module:table").numKeys(allValues)
	local smallest, largest = allValuesKeys[1], allValuesKeys[#allValuesKeys]
	
	-- Don't create an empty values table.
	if hasContents(values) then
		characterLookup.values = values
	end
	
	--[[
		Don't record the smallest and largest values if the first range
		already spans exactly those bounds. A pattern made only of
		individual characters has no first range, so the bounds are always
		recorded in that case instead of indexing a missing entry.
	]]
	local firstRange = characterLookup[1]
	if not (firstRange and smallest == firstRange[1] and largest == firstRange[2]) then
		characterLookup.smallest, characterLookup.largest = smallest, largest
	end
	
	return characterLookup
end

--[[
	Builds a character lookup table for every script in Module:scripts/data
	that has a `characters` pattern.

	Returns a table mapping script code to the lookup produced by
	export.makeCharacterLookup. Scripts that share an identical pattern
	share the same lookup table object.
]]
function export.makeAllScriptsCharacterLookup()
	local lookupsByCode = {}
	-- Maps a pattern to the most recent script code seen with that pattern.
	local codeByPattern = {}
	
	local scriptsData = mw.loadData("Module:scripts/data")
	for code, data in pairs(scriptsData) do
		local pattern = data.characters
		if pattern then
			local earlierCode = codeByPattern[pattern]
			if not earlierCode then
				lookupsByCode[code] = export.makeCharacterLookup(pattern)
			else
				-- Reuse the table already generated for this pattern.
				lookupsByCode[code] = lookupsByCode[earlierCode]
			end
			codeByPattern[pattern] = code
		end
	end
	
	return lookupsByCode
end

--[[
	Swaps the two parts of a hyphenated code around its first hyphen:
	fa-Arab → Arab-fa. Codes without a hyphen are returned unchanged.
	The gsub call is parenthesized so that only the resulting string is
	returned, not the substitution count as well.
]]
local function switchLangSc(scriptCode)
	return (scriptCode:gsub("^([^-]+)%-(.+)$", "%2-%1"))
end

-- To ensure that Grek and Latn appear first.
-- This also makes Grek and Latn take precedence when generating
-- the codepoint-to-script lookup table.
-- Keys are script codes as they are passed to modifyAdHocCode; values are
-- the stand-in codes it returns for them, which are only used as sort keys.
local scriptCodeReplacements = {
	polytonic = "Grek2",
	Latinx = "Latnx",
	Latf = "Latnf",
}

--[[
	Normalizes a script code for use as a sort key.
	  * Codes listed in scriptCodeReplacements are replaced by their entry.
	  * Codes that contain neither a four-letter capitalized sequence
	    (like "Latn") nor a "xxx-Xxxx" sequence get a "~" prefix.
	  * All other codes are returned unchanged.
	Always returns exactly one value; the gsub call is parenthesized so its
	substitution count is not returned as well.
]]
local function modifyAdHocCode(code)
	local replacement = scriptCodeReplacements[code]
	if replacement then
		return replacement
	end
	
	local looksStandard = code:match("[A-Z][a-z][a-z][a-z]")
		or code:match("[a-z][a-z][a-z]%-[A-Z][a-z][a-z][a-z]")
	if looksStandard then
		return code
	end
	
	return (code:gsub("^(.+)$", "~%1"))
end
	
--[[
	Comparison function for keys that may be numbers or script-code strings.
	  * A number sorts before a string.
	  * Two strings are compared case-insensitively after being passed
	    through modifyAdHocCode and then switchLangSc.
	  * Anything else is compared directly with <.
]]
local function keySort(key1, key2)
	local kind1, kind2 = type(key1), type(key2)
	
	if kind1 == "string" then
		if kind2 == "number" then
			return false
		end
		
		local adjusted1 = modifyAdHocCode(key1)
		local adjusted2 = modifyAdHocCode(key2)
		local switched1 = switchLangSc(adjusted1)
		local switched2 = switchLangSc(adjusted2)
		local lower1 = mw.ustring.lower(switched1)
		local lower2 = mw.ustring.lower(switched2)
		return lower1 < lower2
	end
	
	if kind1 == "number" and kind2 == "string" then
		return true
	end
	
	return key1 < key2
end

-- Formats a number as uppercase hexadecimal with a "0x" prefix, e.g. 255 → "0xFF".
local function hex(number)
	local template = "0x%X"
	return template:format(number)
end

--[[
	Splits the inclusive range lower..upper into pieces that each lie
	within one width-aligned block.

	Returns a table keyed by block number (math.floor(codepoint / width)),
	where each value is { pieceStart, pieceEnd }. When `testing` is truthy
	the two bounds are stored as hex strings instead of numbers.
	If lower or upper is missing, logs the arguments and returns nil.
]]
local function divideRange(lower, upper, width, testing)
	if not lower or not upper then
		mw.log("divideRange failed:", lower, upper, width, testing)
		return nil
	end
	
	local pieces = {}
	local firstBlock = math.floor(lower / width)
	local base = firstBlock * width
	
	local step = 0
	while true do
		local blockStart = base + step * width
		local blockEnd = blockStart + width - 1
		
		-- Clamp the block to the requested range.
		local pieceStart = (blockStart < lower) and lower or blockStart
		local pieceEnd = (blockEnd > upper) and upper or blockEnd
		
		if testing then
			pieceStart, pieceEnd = hex(pieceStart), hex(pieceEnd)
		end
		
		pieces[firstBlock + step] = { pieceStart, pieceEnd }
		
		step = step + 1
		if base + step * width > upper then
			break
		end
	end
	
	return pieces
end

--[[
	Demonstration entry point: divides the fixed range 0x2A700-0x2B73F into
	0x1000-wide blocks (in testing mode, so the bounds are hex strings) and
	returns the two bounds followed by a dump of the result.
	The frame argument is not used.
]]
function export.showDividedRange(frame)
	local lower, higher, width = 0x2A700, 0x2B73F, 0x1000
	local dividedRange = divideRange(lower, higher, width, true)
	local bounds = hex(lower) .. ", " .. hex(higher)
	return bounds .. dump(dividedRange)
end

-- Scripts that consist entirely of characters from another script.
-- Used as a set keyed by script code: makeCodepointToScriptLookup skips
-- every script whose code has an entry here.
local scriptBlacklist = {
	["Latf"]		= true;
	["Hans"]		= true;
	["Hant"]		= true;
	["Kore"]		= true;
	["Jpan"]		= true;
	["fa-Arab"] 	= true;
	["kk-Arab"] 	= true;
	["ks-Arab"] 	= true;
	["ku-Arab"]		= true;
	["ms-Arab"]		= true;
	["mzn-Arab"]	= true;
	["ota-Arab"]	= true;
	["pa-Arab"]		= true;
	["ps-Arab"]		= true;
	["sd-Arab"]		= true;
	["tt-Arab"]		= true;
	["ug-Arab"]		= true;
	["ur-Arab"]		= true;
	["nv-Latn"]		= true;
	["pjt-Latn"]	= true;
	["Zyyy"]		= true;
}

--[[
	Comparison function for range entries of the form
	{ first, last, scriptCode }. Orders by the numeric value of `first`
	(which may be a number or a numeric string); ties are broken by
	comparing the script codes with keySort.
]]
local function sortRange(range1, range2)
	local start1 = tonumber(range1[1])
	local start2 = tonumber(range2[1])
	if start1 ~= start2 then
		return start1 < start2
	end
	return keySort(range1[3], range2[3])
end

--[[
	Generates the codepoint-to-script lookup table from the per-script
	character lookups.

	testing: passed to divideRange; when truthy, range bounds are stored as
		hex strings.

	Returns a table with:
	  * `individual`: map from codepoint to script code for characters
	    listed individually in a script's pattern;
	  * numeric keys (block number, i.e. codepoint / 0x1000 rounded down),
	    each holding a sorted list of { first, last, scriptCode } ranges
	    plus a `size` field with the number of ranges.

	Scripts are visited in keySort order and the first script to claim a
	range or codepoint keeps it; later duplicates are only logged. Scripts
	in scriptBlacklist are skipped.
]]
local function makeCodepointToScriptLookup(testing)
	local output = { individual = {} }
	-- Maps "first-last" to the script code that claimed that exact range.
	local rangeStrings = {}
	
	local allScriptsCharacterLookup = export.makeAllScriptsCharacterLookup()
	for scriptCode, lookup in require("Module:table").sortedPairs(allScriptsCharacterLookup, keySort) do
		if not scriptBlacklist[scriptCode] then
			for _, value in ipairs(lookup) do
				if type(value) == "table" then
					local newRanges = divideRange(value[1], value[2], 0x1000, testing)
					if newRanges then
						for position, newRange in pairs(newRanges) do
							-- The separator keeps the key unambiguous: without
							-- it, 1..234 and 12..34 would both become "1234".
							local rangeString = newRange[1] .. "-" .. newRange[2]
							if rangeStrings[rangeString] then
								mw.log("The range " .. newRange[1] .. "-" .. newRange[2] ..
									" is already recorded as belonging to the script code " .. 
									rangeStrings[rangeString] .. ".")
							else
								rangeStrings[rangeString] = scriptCode
								
								if not output[position] then
									output[position] = {}
								end
								
								table.insert(output[position], { newRange[1], newRange[2], scriptCode })
							end
						end
					end
				end
			end
			
			if lookup.values then
				for codepoint in pairs(lookup.values) do
					if output.individual[codepoint] then
						mw.log("The codepoint " .. hex(codepoint) ..
								" is already recorded as belonging to the script code " ..
								output.individual[codepoint] .. ".")
					else
						output.individual[codepoint] = scriptCode
					end
				end
			end
		end
	end
	
	-- Sort and measure only the per-block range lists. `individual` maps
	-- codepoints to script-code strings and must not be passed to sortRange.
	for position, ranges in pairs(output) do
		if type(position) == "number" then
			table.sort(ranges, sortRange)
			ranges.size = #ranges
		end
	end
	
	return output
end

--[[
	Binary search: more efficient for the longer lists of codepoint ranges than
	for the shorter ones.

	ranges: list of { first, last, ... } entries sorted by `first`, or nil
		when there is no list to search.
	value: the codepoint to look for.

	Returns the entry whose first..last span contains value, or nil if
	there is none. Assumes the entries do not overlap.
]]
local function binarySearch(ranges, value)
	-- No list at all (e.g. nothing recorded for this block): no match.
	if not ranges then
		return nil
	end
	
	local iStart = 1
	-- Can't use # because table is loaded by mw.loadData.
	-- NOTE(review): makeCodepointToScriptLookup stores a `size` field on each
	-- list; confirm that Module:table's size() does not count that key.
	local iEnd = require("Module:table").size(ranges)
	
	-- An empty list leaves iStart > iEnd, so the loop is skipped.
	while iStart <= iEnd do
		local iMid = math.floor((iStart + iEnd) / 2)
		local range = ranges[iMid]
		
		if isInRange(value, range[1], range[2]) then
			return range
		elseif value < range[1] then
			-- Keep searching in the lower half.
			iEnd = iMid - 1
		else
			-- Keep searching in the upper half.
			iStart = iMid + 1
		end
	end
	
	return nil
end

--[[
	Linear search through a list of { first, last, result } entries that is
	sorted by `first`. Returns the third element of the first entry whose
	first..last span contains number. Stops early with nil as soon as an
	entry starts above number; returns nothing if the list is exhausted.
]]
local function lookupInOrder(number, ranges)
	for _, range in ipairs(ranges) do
		local lower = range[1]
		if isInRange(number, lower, range[2]) then
			return range[3]
		elseif number < lower then
			return nil
		end
	end
end

-- Ranges that have already matched a character, kept sorted with sortRange
-- so that later characters in the same range can be resolved from here.
local rangesCache = {}

--[[
	Returns the script code for a character, or nil if none is found.
	Lookup order:
	  1. the `individual` codepoint map of the data module;
	  2. ranges cached from earlier calls;
	  3. the data module's `blocks` list, searched by block number
	     (codepoint / 0x1000, rounded down);
	  4. a binary search of the range list stored under that block number.
	A range found in step 4 is added to the cache.
]]
function export.charToScript(char)
	local lookup = mw.loadData("Module:User:Erutuon/script recognition/data") -- makeCodepointToScriptLookup()
	local codepoint = mw.ustring.codepoint(char)
	
	local individualMatch = lookup.individual[codepoint]
	if individualMatch then
		return individualMatch
	end
	
	local cachedScript = lookupInOrder(codepoint, rangesCache)
	if cachedScript then
		return cachedScript
	end
	
	local index = math.floor(codepoint / 0x1000)
	
	local blockScript = lookupInOrder(index, lookup.blocks)
	if blockScript then
		return blockScript
	end
	
	local range = binarySearch(lookup[index], codepoint)
	if not range then
		return nil
	end
	
	rangesCache[#rangesCache + 1] = range
	table.sort(rangesCache, sortRange)
	return range[3]
end

--[[
	Template entry point: lists the script detected for each character of
	the first argument (or of a built-in sample string when no argument is
	given).

	Returns "<input>: <script>, <script>, ...", with "nil" shown for
	characters that have no script. The data module is loaded by
	export.charToScript, so it is not loaded separately here.
]]
function export.show(frame)
	local str = frame.args[1] or "ABCD一丨丶丿乙亅"
	
	local result = {}
	forEachChar(
		str,
		function(char)
			result[#result + 1] = tostring(export.charToScript(char))
		end
	)
	
	return str .. ": " .. table.concat(result, ", ")
end

--[[
	Template entry point: generates the codepoint-to-script lookup table
	(not in testing mode) and returns a dump of it. The frame argument is
	not used.
]]
function export.showLookup(frame)
	local generatedLookup = makeCodepointToScriptLookup()
	return dump(generatedLookup)
end

return export