Module:family tree/etymology languages

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Creates a version of Module:etymology languages/data in which there is just one code per language. When several codes share the same language data, only the one that looks most like a language code (the "language-codiest" one) is kept. For instance, Austrian German can be accessed in etymology templates like {{cog}} by three codes (Austrian German, AG., de-AT); de-AT is chosen as its only language code because it looks the most like a language code.

The following criteria are used successively to weed out candidates for language-codiest code:

  1. The code must consist of letters and hyphens.
  2. The code must not contain an uppercase letter followed by a lowercase letter.
  3. The code must be as short as possible.

This list shows those etymology languages that have multiple codes, together with the code chosen by this module and the other codes:


local language_codes = require "Module:languages/code to canonical name"

-- Bonus used to break ties between codes of the same format: yields 1 when
-- the leading run of lowercase letters in `code` is a key of the
-- code-to-canonical-name table, and 0 otherwise.
-- This is what makes nrf-grn and nrf-jer win over roa-grn and roa-jer
-- (Guernsey and Jersey): the bonus adds 1 to the weighting.
local function isLangCode(code)
	local prefix = code:match("^%l+")
	if language_codes[prefix] then
		return 1
	end
	return 0
end

-- Code formats in order of preference, most preferred first:
-- xx, xxx, xx-xxx, xxx-xxx, xx-xxx-xxx, xxx-xxx-xxx, xx-XX, xxx-XX,
-- xx-XX-xxx, xxx-XX-xxx.
-- `base` is the score for a code of that format; formats flagged with
-- `language_bonus` additionally receive the result of isLangCode, so that
-- language codes are preferred over family codes of the same format.
local code_format_weights = {
	{ pattern = "^%l%l$",                    base = 14 },
	{ pattern = "^%l%l%l$",                  base = 13 },
	{ pattern = "^%l%l%-%l%l%l$",            base = 12 },
	{ pattern = "^%l%l%l%-%l%l%l$",          base = 10, language_bonus = true },
	{ pattern = "^%l%l%-%l%l%l%-%l%l%l$",    base = 9 },
	{ pattern = "^%l%l%l%-%l%l%l%-%l%l%l$",  base = 7, language_bonus = true },
	{ pattern = "^%l%l%-%u%u$",              base = 6 },
	{ pattern = "^%l%l%l%-%u%u$",            base = 4, language_bonus = true },
	{ pattern = "^%l%l%-%u%u%-%l%l%l$",      base = 3 },
	{ pattern = "^%l%l%l%-%u%u%-%l%l%l$",    base = 1, language_bonus = true },
}

-- Picks whichever of the two codes looks more like a language code.
-- The decision is made by, in order:
--   1. the higher format score (codes matching no known format score 0);
--   2. the shorter code;
--   3. alphabetical order (the code that sorts first wins).
-- Returns one of its two arguments unchanged.
local function determine_preferred_etymology_language_code(code1, code2)
	local function weighting(code)
		for _, rule in ipairs(code_format_weights) do
			if code:find(rule.pattern) then
				if rule.language_bonus then
					return rule.base + isLangCode(code)
				end
				return rule.base
			end
		end
		-- Not shaped like any recognised code format.
		return 0
	end

	local score1, score2 = weighting(code1), weighting(code2)
	if score1 ~= score2 then
		return score1 > score2 and code1 or code2
	end

	local length1, length2 = #code1, #code2
	if length1 ~= length2 then
		return length1 < length2 and code1 or code2
	end

	-- Same score and same length: fall back to alphabetical order.
	if code1 > code2 then
		return code2
	end
	return code1
end

-- Reduces a table to a single value.
-- `func` is called as func(key, value, accumulator) once for every pair that
-- pairs(t) yields, and whatever it returns becomes the accumulator for the
-- next call. `accum` is the starting accumulator; it is returned as-is when
-- `t` has no entries. Pairs are visited in the order pairs() produces them,
-- which is unspecified.
local function fold(t, accum, func)
	local result = accum
	for key, value in pairs(t) do
		result = func(key, value, result)
	end
	return result
end

-- Returns a new table in which the keys and values of `t` are swapped.
-- `t` itself is left untouched. If several keys of `t` map to the same value,
-- only one of them survives in the result (whichever pairs() visits last).
local function invert(t)
	local swapped = {}
	for key, value in pairs(t) do
		swapped[value] = key
	end
	return swapped
end

local etymology_language_data = require "Module:etymology languages/data"

-- Reducer for fold(): `data_to_code` maps each distinct data table to the
-- code currently preferred for it. As a side effect every data table gets a
-- `codes` array listing all codes that refer to it.
local function record_code(code, data, data_to_code)
	local current_code = data_to_code[data]
	if current_code then
		-- This data table was already seen under another code: keep the better one.
		data_to_code[data] = determine_preferred_etymology_language_code(current_code, code)
		table.insert(data.codes, code)
	else
		-- First code encountered for this data table.
		data_to_code[data] = code
		data.codes = { code }
	end
	return data_to_code
end

-- Collapse the data module to data -> preferred code, then flip that into the
-- exported preferred code -> data table.
return invert(fold(etymology_language_data, {}, record_code))