Module:User:Theknightwho/lua-uca/lua-uca-collator

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.


-- object for Unicode string collation
local tailoring_lib = require "Module:User:Theknightwho/lua-uca/lua-uca-tailoring"
local reordering_table = mw.loadData "Module:User:Theknightwho/lua-uca/lua-uca-reordering-table"

-- Prototype table for collator objects. It is its own __index, so the
-- functions defined on it below are reachable from any table that uses it
-- as a metatable (collator.new creates such tables).
local collator = {}
collator.__index = collator

-- Return a deep copy of `tbl`: nested tables are copied recursively, every
-- other value is copied as-is. Metatables are not carried over and cyclic
-- tables are not supported (the recursion would never terminate).
local function copy_table(tbl)
	local clone = {}
	for key, value in pairs(tbl) do
		-- copy_table always returns a table (truthy), so and/or is safe here
		clone[key] = type(value) == "table" and copy_table(value) or value
	end
	return clone
end

-- expose the deep-copy helper as a field of the module table
collator.copy_table = copy_table

-- Create a new collator object.
-- `codes` is the table mapping codepoints to collation data; it is stored
-- as-is and never written to by this constructor.
-- Returns a table whose metatable is `collator`.
function collator.new(codes)
	local self = setmetatable({}, collator)
	-- tree with mappings from codepoints to collation elements
	self.codes = codes
	-- Overrides consulted before `codes` when weights are read.
	-- This must be an ordinary (strong) table: the entries stored here by
	-- update_codes are referenced from nowhere else, so a weak-valued table
	-- would let the garbage collector silently discard them.
	self.updated_codes = {}
	-- cached sort keys
	self.stringcache = {}
	-- per-level factors applied to tailoring offsets in collator:tailor
	self.tailoring_multiplier = {1, 1, 1, 1}
	return self
end

-- Build a fallback weight for the codepoint at `pos`.
-- The result is a list holding one collation element whose first level is
-- the codepoint value itself and whose remaining two levels are zero.
-- Returns that list and the position of the following codepoint.
function collator:get_implicit_weight(codepoints, pos)
	local element = {codepoints[pos], 0, 0}
	return {element}, pos + 1
end

-- Look up the weights for the codepoint(s) starting at `pos`.
-- Contractions are tried first: the lookup descends through the entry of the
-- starting codepoint, following the subsequent codepoints for as long as a
-- child entry exists, and returns the deepest entry reached.
-- Returns `weights, next_pos`; `weights` is nil when nothing was found.
function collator:read_weight(codepoints, pos)
	-- Follow the codepoints after `parent_pos` down the tree under `parent`.
	-- Returns the deepest matching child and the position it was matched at,
	-- or nil when the next codepoint has no entry (or the input has ended).
	local function read_children(parent, parent_pos)
		local child_pos = parent_pos + 1
		local child_codepoint = codepoints[child_pos]
		-- if we go out of the codepoint array
		if not child_codepoint then return nil end
		local child = type(parent) == "table" and parent[child_codepoint] or nil
		if not child then return nil end
		-- prefer a longer match when one exists
		local deeper, deeper_pos = read_children(child, child_pos)
		if deeper then return deeper, deeper_pos end
		return child, child_pos
	end
	local current_codepoint = codepoints[pos]
	-- tailored entries take precedence over the base table
	local codes = self.updated_codes[current_codepoint] or self.codes[current_codepoint]
	if not codes then return nil, pos + 1 end
	-- first try to read contractions
	local weights, new_pos = read_children(codes, pos)
	if weights then return weights, new_pos + 1 end
	-- no contraction: fall back to the entry in the base table
	-- NOTE(review): this fallback reads self.codes only, not
	-- self.updated_codes -- confirm that this is intended.
	if not self.codes[current_codepoint] then return nil, pos + 1 end
	return self.codes[current_codepoint], pos + 1
end

-- get weights for the next characters
-- Fetch the weights for the character(s) starting at `pos`.
-- Falls back to implicit weights when read_weight finds nothing.
-- Returns the weights and the next position to read from; the position is
-- nil once it would point past the end of `codepoints`.
function collator:get_weights(codepoints, pos)
	local found, following = self:read_weight(codepoints, pos)
	if not found then
		-- codepoint is not in the database
		found, following = self:get_implicit_weight(codepoints, pos)
	end
	if following > #codepoints then
		-- nothing left to read
		return found, nil
	end
	return found, following
end

-- Append the values found in `weights` to the per-level lists in `levels`.
-- `weights` may be a list of collation elements (each a list of numbers, one
-- per level), a flat list of numbers (the list index is used as the level),
-- or a single number (added to level 1). Zeros and non-numbers are skipped.
-- Returns `levels`, which is modified in place.
function collator:update_levels(levels, weights)
	-- append `value` to the list for `level_index`, creating it on demand
	local function push(level_index, value)
		if value ~= 0 and type(value) == "number" then -- ignore zero elements
			local bucket = levels[level_index] or {}
			table.insert(bucket, value)
			levels[level_index] = bucket
		end
	end

	local kind = type(weights)
	if kind == "table" then
		for position, entry in ipairs(weights) do
			if type(entry) == "table" then
				-- a collation element: one value per level
				for level_index, value in ipairs(entry) do
					push(level_index, value)
				end
			else
				push(position, entry)
			end
		end
	elseif kind == "number" then
		push(1, weights)
	end
	return levels
end

-- make sort key from codepoints array
-- Build a sort key from an array of codepoints.
-- The weights of all characters are collected level by level; the key is the
-- concatenation of the levels, each one followed by a 0.
function collator:make_sort_key(codepoints)
	local levels = {}
	local pos = 1
	local weights
	repeat
		weights, pos = self:get_weights(codepoints, pos)
		levels = self:update_levels(levels, weights)
		-- get_weights reports nil once the end of the array is reached
	until not pos

	local sort_key = {}
	for _, level in ipairs(levels) do
		for _, weight in ipairs(level) do
			table.insert(sort_key, weight)
		end
		-- zero separates levels in the sort key
		table.insert(sort_key, 0)
	end
	return sort_key
end

-- Compare two sort keys (arrays of numbers).
-- Returns true when `a` sorts strictly before `b`.
function collator:compare(a, b)
	local len_a, len_b = #a, #b
	local shared = len_a < len_b and len_a or len_b
	for idx = 1, shared do
		local left, right = a[idx], b[idx]
		if left ~= right then
			return left < right
		end
	end
	-- The common prefix is identical, so the shorter key sorts first.
	-- Keys of equal length yield false, which table.sort requires for equal
	-- items (otherwise it reports "invalid order function for sorting").
	return len_a < len_b
end

-- NOTE(review): this table is not referenced anywhere in the visible part of
-- the module; confirm whether it is still needed.
local codepoints_cache = {}

-- Compare two strings using their sort keys.
-- Each string is NFD-normalised, split into codepoints and turned into a
-- sort key; keys are memoised in self.stringcache, indexed by the original
-- string. Returns true when `a` sorts strictly before `b`.
function collator:compare_strings(a,b)
	local ustring = mw.ustring
	local to_codepoints, ulen, normalize = ustring.codepoint, ustring.len, ustring.toNFD
	local known = self.stringcache
	-- build the sort key for one string
	local function build_key(str)
		local decomposed = normalize(str)
		return self:make_sort_key({to_codepoints(decomposed, 1, ulen(decomposed))})
	end
	local key_a = known[a] or build_key(a)
	local key_b = known[b] or build_key(b)
	known[a], known[b] = key_a, key_b
	return self:compare(key_a, key_b)
end

-- update collation codes
-- Store `elements` in self.updated_codes under the codepoint path `key`.
-- `key` is an array of codepoints. Every step of the path has an entry table
-- in which slot 1 holds the elements stored for the path ending there and
-- slot 2 holds the subtree for longer paths.
function collator:update_codes(key, elements)
	local depth = #key
	-- insert the remainder of the path, starting at index `i`, under `node`
	local function insert_at(node, i)
		node = node or {}
		local codepoint = key[i]
		local entry = node[codepoint] or {}
		if i == depth then
			-- end of the path: this entry receives the elements
			entry[1] = elements
		elseif i < depth then
			-- more codepoints follow: descend into (or create) the subtree
			entry[2] = insert_at(entry[2], i + 1)
		end
		node[codepoint] = entry
		return node
	end
	-- the root table is modified in place
	insert_at(self.updated_codes, 1)
end


--- change sorting ordering
--- change sorting ordering
-- Derives new collation elements for `target` from those of `base`.
-- `base` and `target` are arrays of codepoints. `tailoring_table` holds one
-- offset per collation level; every offset is scaled by the matching entry
-- of self.tailoring_multiplier and added to the base weight of that level.
-- The result is registered for `target` through update_codes.
function collator:tailor(base, target, tailoring_table)
	-- get the value of the base character
	local value = self:get_weights(base, 1)
	local multiplier = self.tailoring_multiplier
	local new_value = {}
	-- create a new collation element
	for k, v in ipairs(value) do
		local subtable = {}
		for x, y in ipairs(v) do
			-- Parenthesised so that a missing multiplier defaults to 1; without
			-- the parentheses `*` binds tighter than `or`, the default never
			-- applies and a nil multiplier raises an arithmetic error.
			subtable[x] = y + (tailoring_table[x] or 0) * (multiplier[x] or 1)
		end
		new_value[k] = subtable
	end
	-- when tailoring sets an equivalent character, it needs to be ignored in collator:weight_to_codepoints
	-- NOTE(review): equivalence is detected by the offsets summing to zero,
	-- which would also match offsets that cancel out (e.g. {1, -1}) --
	-- confirm that negative offsets are never passed.
	local is_equivalent = 0
	for _, x in ipairs(tailoring_table) do is_equivalent = is_equivalent + x end
	if is_equivalent == 0 then new_value.equal = true end
	self:update_codes(target, new_value)
end

-- reorder scripts
-- pass table with script names to reorder
-- Reorder scripts.
-- `tbl` is a list of script names. Each name is passed, in list order, to
-- tailoring_lib.reorder together with a private copy of the reordering table;
-- the resulting table is then handed to tailoring_lib.reorder_collator along
-- with this collator.
function collator:reorder(tbl)
	-- work on a copy so that the loaded reordering table stays untouched
	local working = copy_table(reordering_table)
	for _, script_name in ipairs(tbl) do
		tailoring_lib.reorder(script_name, working)
	end
	-- apply reordering to the collator object
	tailoring_lib.reorder_collator(self, working)
end

-- expand characters to another characters
-- Make `base` collate like the sequence of characters in `target`.
-- Both arguments are arrays of codepoints. The collation elements of every
-- character in `target` are concatenated and registered for `base` through
-- update_codes.
function collator:equal(base, target)
	local new_weight = {}
	-- `value` is declared local so that it no longer leaks into the globals
	local value
	local pos = 1
	while true do
		value, pos = self:get_weights(target, pos)
		for _, v in ipairs(value) do
			new_weight[#new_weight + 1] = v
		end
		-- get_weights reports nil once the end of `target` is reached
		if not pos then break end
	end
	self:update_codes(base, new_weight)
end

return collator