Module:User:Erutuon/split language data modules

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Template:documentation

-- Table holding the functions that this module returns to its callers.
local export = {}

-- Deep-copies a value without preserving shared references.
--
-- This is a variant of deepcopy from [[Module:table]]: a table that is
-- reachable through several paths is duplicated once per path, so the
-- result never contains the same table twice. Table keys are copied as
-- well as values. No metatable is set on the copies.
--
-- A table that contains itself (directly or indirectly) cannot be copied
-- this way; an error is raised as soon as the recursion is detected rather
-- than recursing until the stack overflows.
--
-- @param orig  any value; non-table values are returned unchanged
-- @return      the copy of orig
local function deepcopy(orig)
	-- Set of tables on the path from the root to the value currently being
	-- copied. Meeting one of them again means the structure is recursive.
	local in_progress = {}

	local function copy_value(value)
		if type(value) ~= 'table' then
			return value -- number, string, boolean, etc
		end
		if in_progress[value] then
			error("deepcopy: cannot copy a recursive table")
		end
		in_progress[value] = true
		local copy = {}
		for k, v in pairs(value) do
			copy[copy_value(k)] = copy_value(v)
		end
		-- Unmark on the way out so that a table which is merely shared
		-- (not recursive) is still copied when met again on another path.
		in_progress[value] = nil
		return copy
	end

	return copy_value(orig)
end

-- Decodes JSON text holding several language data tables, keyed by
-- language code, and returns them with numeric keys restored.
--
-- According to the original author, mw.text.jsonEncode turns integer keys
-- into strings whenever the table also has string keys, so a language
-- table that went through a JSON object comes back with "1", "2", ...
-- instead of 1, 2, ... Such tables are rebuilt here with every key that
-- tonumber accepts converted to a number. Only the top-level keys of each
-- language table are converted; nested tables are left as decoded.
--
-- @param json  JSON string to decode
-- @return      table mapping language code to language data table
function export.load_json_language_data(json)
	local result = {}
	for lang_code, lang_data in pairs(mw.text.jsonDecode(json)) do
		-- Key 1 (canonical name) exists in all original language data, so
		-- a string key "1" marks a table decoded from a JSON object.
		if not lang_data["1"] then
			result[lang_code] = lang_data
		else
			local renumbered = {}
			for key, value in pairs(lang_data) do
				renumbered[tonumber(key) or key] = value
			end
			result[lang_code] = renumbered
		end
	end
	return result
end

-- Groups every language in Module:languages/data/all under the key that
-- key_maker returns for it, and returns the grouping encoded as JSON.
--
-- @param key_maker  function(code, data) returning the group key
-- @return           JSON string of { [key] = { [code] = data copy } }
function export.split(key_maker)
	local all_data = require "Module:languages/data/all"
	local groups = {}
	for code, data in pairs(all_data) do
		local group_key = key_maker(code, data)
		if groups[group_key] == nil then
			groups[group_key] = {}
		end
		-- The data is copied before encoding because, per the original
		-- author, mw.text.jsonEncode refuses the raw language data over
		-- "circular references" -- probably the scripts fields that point
		-- at the same shared tables ({"Latn"}, {"Cyrl"}, {"Arab"}).
		--
		-- Language tables that are plain sequences (canonical name,
		-- Wikidata item and family only) come out as JSON arrays; all
		-- others come out as objects. Number-indexed values may therefore
		-- sit under number or string keys, and whoever reads the JSON
		-- should normalize keys with
		-- tonumber(key) or key
		-- or an equivalent.
		groups[group_key][code] = deepcopy(data)
	end
	return mw.text.jsonEncode(groups)
end

-- Splits the language data into groups keyed by the first two characters
-- of each language code and returns the result of export.split.
-- The frame argument is accepted but not used.
function export.split_by_two_letter_prefix(frame)
	local function two_letter_prefix(code)
		return code:sub(1, 2)
	end
	return export.split(two_letter_prefix)
end

-- Returns the JSON produced by split_by_two_letter_prefix, preceded by a
-- line reporting its length (as given by the # operator on the string)
-- and a blank line. The frame argument is accepted but not used.
function export.show(frame)
	local json = export.split_by_two_letter_prefix()
	return table.concat { "length: ", #json, "\n\n", json }
end

-- Hand the table of exported functions to whoever loads this module.
return export