Module:User:Theknightwho/lua-uca/lua-uca-collator
Jump to navigation
Jump to search
- This module lacks a documentation subpage. You may create it.
- Useful links: root page • root page’s subpages • links • transclusions • testcases • user page • user talk page • userspace
This is a private module sandbox of Theknightwho, for their own experimentation. Items in this module may be added and removed at Theknightwho's discretion; do not rely on this module's stability.
-- object for Unicode string collation
-- helper module whose reorder / reorder_collator functions are used by collator:reorder
local tailoring_lib = require "Module:User:Theknightwho/lua-uca/lua-uca-tailoring"
-- reordering data, loaded through mw.loadData; collator:reorder works on a copy of it
local reordering_table = mw.loadData "Module:User:Theknightwho/lua-uca/lua-uca-reordering-table"
-- module table; collator.new also installs it as the metatable of every object,
-- so method lookups on an object fall through to this table via __index
local collator = {}
collator.__index = collator
--- Recursively duplicate a table.
-- Values that are tables are duplicated in turn; every other value (and every
-- key) is carried over unchanged. Metatables are not copied, and a table that
-- contains itself would recurse without end.
-- @param tbl table to duplicate
-- @return a new table holding the same keys
local function copy_table(tbl)
  local duplicate = {}
  for key, value in pairs(tbl) do
    local is_nested = type(value) == "table"
    -- copy_table always returns a table (truthy), so and/or is safe here
    duplicate[key] = is_nested and copy_table(value) or value
  end
  return duplicate
end
-- expose the helper on the module table so it is reachable as collator.copy_table
collator.copy_table = copy_table
--- Create a collator object.
-- @param codes table indexed by codepoint that holds the collation data;
--   it is stored by reference, not copied
-- @return a new object whose metatable is `collator`
function collator.new(codes)
  local self = setmetatable({}, collator)
  -- tree with mappings from codepoints to collation elements
  self.codes = codes
  -- per-object overrides written by collator:update_codes.
  -- This must be an ordinary (strong) table: the entries stored in it are
  -- tables that nothing else references, so marking it weak ("kv") allowed
  -- the garbage collector to silently discard tailorings.
  self.updated_codes = {}
  -- cached sort keys
  self.stringcache = {}
  -- one multiplier per collation level, applied by collator:tailor
  self.tailoring_multiplier = {1, 1, 1, 1}
  return self
end
--- Build a fallback weight for the codepoint at `pos`.
-- The codepoint value itself becomes the first entry of a single collation
-- element; the remaining two entries are zero.
-- @param codepoints array of codepoints
-- @param pos index of the codepoint to use
-- @return array holding the one element {codepoint, 0, 0}
-- @return pos + 1
function collator:get_implicit_weight(codepoints, pos)
  local element = {codepoints[pos], 0, 0}
  return {element}, pos + 1
end
--- Look up the stored collation data for the codepoint(s) starting at `pos`.
-- Contractions (runs of codepoints stored as nested child tables) are tried
-- first and the longest match wins; otherwise the entry that self.codes holds
-- for the single codepoint is returned.
-- @param codepoints array of codepoints
-- @param pos index of the first codepoint to read
-- @return the stored entry, or nil when nothing is stored for the codepoint
-- @return index of the next unread codepoint
function collator:read_weight(codepoints, pos)
  -- Follow child tables for the codepoints after index `from`.
  -- Returns the deepest matching child and the index of the last codepoint it
  -- consumed, or nil when the next codepoint has no child entry.
  local function read_children(parent, from)
    local newpos = from + 1
    -- declared local: this used to leak into the global environment
    local newcodepoint = codepoints[newpos]
    -- if we go out of the codepoint array
    if not newcodepoint then return nil end
    local child = type(parent) == "table" and parent[newcodepoint] or nil
    if child then
      -- prefer a longer match; fall back to this child otherwise
      local nextchild, nextpos = read_children(child, newpos)
      if nextchild then return nextchild, nextpos end
      return child, newpos
    end
    return nil
  end
  local current_codepoint = codepoints[pos]
  -- entries written by update_codes take precedence for the contraction search
  local codes = self.updated_codes[current_codepoint] or self.codes[current_codepoint]
  if not codes then return nil, pos + 1 end
  -- first try to read contractions
  -- (new_pos is now local as well; it used to be an accidental global)
  local weights, new_pos = read_children(codes, pos)
  if weights then return weights, new_pos + 1 end
  -- if no contraction, weights are in the value field
  -- NOTE(review): only self.codes is consulted here, so a single-codepoint
  -- entry that exists solely in self.updated_codes yields nil - confirm that
  -- this is intended.
  if not self.codes[current_codepoint] then return nil, pos + 1 end
  return self.codes[current_codepoint], pos + 1
end
--- Get the weights for the next character(s) at `pos`.
-- Uses the stored data when read_weight finds some, and the implicit
-- (codepoint based) weight otherwise.
-- @param codepoints array of codepoints
-- @param pos index to read from
-- @return weights for the character(s) read
-- @return index of the next codepoint, or nil once the array is exhausted
function collator:get_weights(codepoints, pos)
  local found, following = self:read_weight(codepoints, pos)
  if not found then
    -- nothing in the database for this codepoint
    found, following = self:get_implicit_weight(codepoints, pos)
  end
  -- signal the end of input instead of pointing past the array
  if following > #codepoints then
    return found, nil
  end
  return found, following
end
--- Distribute weights over the per-level arrays in `levels`.
-- `weights` may be a single number (goes to level 1) or an array. Inside an
-- array, a table entry is treated as a collation element whose n-th number
-- goes to level n, while a bare number entry goes to the level given by its
-- own position in the array. Zeros and non-numbers are skipped.
-- @param levels table of arrays indexed by level; modified in place
-- @param weights number or array as described above
-- @return the same `levels` table
function collator:update_levels(levels, weights)
  -- append a non-zero number to the array of the given level
  local function push(level, value)
    if type(value) ~= "number" or value == 0 then return end
    local bucket = levels[level] or {}
    table.insert(bucket, value)
    levels[level] = bucket
  end
  local kind = type(weights)
  if kind == "number" then
    push(1, weights)
  elseif kind == "table" then
    for i, entry in ipairs(weights) do
      if type(entry) == "table" then
        -- a full collation element: one number per level
        for j, part in ipairs(entry) do
          push(j, part)
        end
      else
        push(i, entry)
      end
    end
  end
  return levels
end
--- Make a sort key from an array of codepoints.
-- The key is a flat array: all numbers collected for level 1, a zero, all
-- numbers for level 2, a zero, and so on.
-- @param codepoints array of codepoints
-- @return array of numbers
function collator:make_sort_key(codepoints)
  local levels = {}
  local pos = 1
  -- get_weights hands back nil as position once the input is used up
  repeat
    local weights
    weights, pos = self:get_weights(codepoints, pos)
    levels = self:update_levels(levels, weights)
  until not pos
  local sort_key = {}
  for _, elements in ipairs(levels) do
    for _, element in ipairs(elements) do
      sort_key[#sort_key + 1] = element
    end
    -- zero separates levels in the sort key
    sort_key[#sort_key + 1] = 0
  end
  return sort_key
end
--- Compare two sort keys; usable as a "less than" function for table.sort.
-- @param a sort key (array of numbers)
-- @param b sort key (array of numbers)
-- @return true when `a` orders before `b`, false otherwise
function collator:compare(a, b)
  local len_a, len_b = #a, #b
  local shared = math.min(len_a, len_b)
  local i = 1
  while i <= shared do
    local x, y = a[i], b[i]
    -- the first differing position decides
    if x ~= y then return x < y end
    i = i + 1
  end
  -- one key is a prefix of the other: the shorter one goes first.
  -- For identical keys this is false, which table.sort requires
  -- (otherwise it reports "invalid order function for sorting").
  return len_a < len_b
end
-- NOTE(review): nothing in the code shown here reads or writes this table
local codepoints_cache = {}
--- Compare two strings by their sort keys.
-- Sort keys are built from the NFD form of each string and remembered in
-- self.stringcache, keyed by the original string.
-- @param a first string
-- @param b second string
-- @return true when `a` orders before `b`
function collator:compare_strings(a,b)
  local ustring = mw.ustring
  local cache = self.stringcache
  -- normalise, split into codepoints and turn those into a sort key
  local function build_key(text)
    local decomposed = ustring.toNFD(text)
    local points = {ustring.codepoint(decomposed, 1, ustring.len(decomposed))}
    return self:make_sort_key(points)
  end
  local key_a = cache[a]
  if not key_a then key_a = build_key(a) end
  local key_b = cache[b]
  if not key_b then key_b = build_key(b) end
  -- store only after both keys were produced
  cache[a] = key_a
  cache[b] = key_b
  return self:compare(key_a, key_b)
end
--- Store collation elements for a codepoint sequence in self.updated_codes.
-- Each codepoint of `key` selects an entry table; slot 1 of the entry for the
-- last codepoint receives `elements`, and slot 2 of every earlier entry holds
-- the subtree for the codepoints that follow.
-- @param key array of codepoints
-- @param elements value to store for the complete sequence
function collator:update_codes(key, elements)
  local last = #key
  -- add the codepoint at `depth` to `node` (created when missing) and return it
  local function insert_at(node, depth)
    node = node or {}
    local codepoint = key[depth]
    local entry = node[codepoint] or {}
    if depth == last then
      entry[1] = elements
    elseif depth < last then
      entry[2] = insert_at(entry[2], depth + 1)
    end
    node[codepoint] = entry
    return node
  end
  -- the tree is modified in place, so the return value is not needed
  insert_at(self.updated_codes, 1)
end
--- Change the sort position of `target` relative to `base`.
-- Takes the weights of the first character of `base`, adds the offsets from
-- `tailoring_table` (scaled per level by self.tailoring_multiplier) and stores
-- the result for `target` through update_codes.
-- @param base array of codepoints; only the weights found at index 1 are used
-- @param target array of codepoints that receives the new collation elements
-- @param tailoring_table array of numeric offsets, one per level; missing
--   entries count as 0
function collator:tailor(base, target, tailoring_table)
  -- get the value of the base character
  local value = self:get_weights(base, 1)
  local multiplier = self.tailoring_multiplier
  local new_value = {}
  -- create a new collation element
  for k, v in ipairs(value) do
    local subtable = {}
    for x, y in ipairs(v) do
      -- Both fallbacks need their own parentheses: `*` binds tighter than
      -- `or`, so the former `offset * multiplier[x] or 1` raised an error for
      -- a level without a multiplier instead of defaulting to 1.
      subtable[x] = y + (tailoring_table[x] or 0) * (multiplier[x] or 1)
    end
    new_value[k] = subtable
  end
  -- when tailoring sets an equivalent character (all offsets add up to zero),
  -- the new value is flagged so that other code can recognise and ignore it
  local is_equivalent = 0
  for _, x in ipairs(tailoring_table) do is_equivalent = is_equivalent + x end
  if is_equivalent == 0 then new_value.equal = true end
  self:update_codes(target, new_value)
end
--- Reorder scripts.
-- Every name in `tbl` is handed to tailoring_lib.reorder together with a
-- private copy of the reordering table; the finished copy is then applied to
-- this object with tailoring_lib.reorder_collator.
-- @param tbl array of script names, processed in order
function collator:reorder(tbl)
  -- work on a copy so the loaded reordering data stays untouched
  local working_copy = copy_table(reordering_table)
  for _, script_name in ipairs(tbl) do
    tailoring_lib.reorder(script_name, working_copy)
  end
  -- apply reordering to the collator object
  tailoring_lib.reorder_collator(self, working_copy)
end
--- Make `base` sort like the character sequence `target`.
-- Collects the weights of every character in `target` into one array and
-- stores that array for `base` through update_codes.
-- @param base array of codepoints that receives the combined weights
-- @param target array of codepoints whose weights are collected
function collator:equal(base, target)
  local new_weight = {}
  local pos = 1
  repeat
    -- declared local: the loop used to assign an undeclared `value`
    -- (only `values` was declared), which leaked into the global environment
    local value
    value, pos = self:get_weights(target, pos)
    for _, v in ipairs(value) do
      new_weight[#new_weight + 1] = v
    end
  until not pos
  self:update_codes(base, new_weight)
end
-- hand the module table to whoever requires this module
return collator