Module:User:Erutuon/lang stuff

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Generates the tables of information about Wiktionary languages in User:Erutuon/language stuff, User:Erutuon/languages with no scripts, and User:Erutuon/otherNames.


-- Table of functions made available by this module; returned at the end of
-- the file.
local export = {}

-- Key in a language's data table under which its scripts are stored. The value
-- found there is consumed as either a comma-separated string or an array of
-- script codes.
local script_key = 4

-- Contents of [[Module:scripts/data]], loaded the first time a script link is
-- requested.
local all_scripts

-- Builds a piped wikilink to the category of the script with the given code,
-- displaying the code as the link text.
--
-- The category is the script's name (first item of its data table) followed by
-- " script", unless the name already ends in "scripts", "code" or "semaphore"
-- (compared case-insensitively), in which case the name is used unchanged.
--
-- Raises an error if there is no data for script_code.
local function _link_script (script_code)
	all_scripts = all_scripts or require "Module:scripts/data"
	local script_data = all_scripts[script_code]
	if not script_data then
		error("No script with code " .. tostring(script_code) .. ".")
	end
	
	local name = script_data[1]
	-- A name that does not end in a letter (for instance one ending in a
	-- closing parenthesis) has no final word; treat it as an empty word
	-- instead of indexing nil.
	local last_word = (name:match "%a+$" or ""):lower()
	if last_word == "scripts" or last_word == "code" or last_word == "semaphore" then
		return "[[:Category:" .. name .. "|" .. script_code .. "]]"
	else
		return "[[:Category:" .. name .. " script|" .. script_code .. "]]"
	end
end

-- Links already built by _link_script, keyed by script code.
local cache = {}

-- Cached front end to _link_script: each script code is converted to a
-- category link at most once, and the stored link is returned afterwards.
local function link_script (script_code)
	local cached = cache[script_code]
	if cached then
		return cached
	end
	
	cached = _link_script(script_code)
	cache[script_code] = cached
	return cached
end

-- Converts a list of items into a [[Module:array]] object. A table is passed
-- straight to the array constructor; anything else is scanned as a string and
-- every run of characters other than whitespace and commas becomes one item.
-- The function is wrapped in memoize from [[Module:fun]].
local array_from_comma_list_or_array = require "Module:fun".memoize(function (list)
	if type(list) == "table" then
		return require "Module:array"(list)
	end
	
	local items = require "Module:array"()
	for item in string.gmatch(list, "[^%s,]+") do
		items:insert(item)
	end
	return items
end)

-- Wraps a level-2 heading with the given title in a <div> styled to have zero
-- height and hidden overflow.
local function ToC_item(title)
	local opening_tag = '<div style="overflow: hidden; height: 0; margin: 0; padding: 0;">'
	local heading = '==' .. title .. '=='
	return opening_tag .. '\n' .. heading .. '\n</div>'
end

-- Generates a sortable wikitable of the language families whose canonical name
-- is also the canonical name of a language, showing the name and code of each.
-- The caption states how many such language/family pairs were found.
--
-- frame is accepted like in the other exported functions but is not used.
-- Returns the wikitext as a string.
function export.lang_and_fam_name(frame)
	local language_name_to_code = require "Module:languages/canonical names"
	local family_data = require "Module:families/data"
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	local is_combining = require "Module:Unicode data".is_combining
	local fun = require "Module:fun"
	
	-- Links to the category of a language, or of a family if family is truthy.
	local function link_name(name, family)
		if family then
			local catname = name:find "[Ll]anguages$" and name or name .. " languages"
			return "[[:Category:" .. catname .. "|" .. catname .. "]]"
		else
			return "[[:Category:" .. (name:find "[Ll]anguage$" and name or name .. " language") .. "|" .. name .. "]]"
		end
	end
	
	-- Sort value: the name in NFD with combining characters removed.
	local get_sort_value = fun.memoize(function (canonical_name)
		-- The parentheses discard gsub's second result (the match count) so
		-- that exactly one value is returned.
		return (mw.ustring.toNFD(canonical_name):gsub(
			"[\194-\244][\128-\191]+",
			function (nonASCII_char)
				if is_combining(mw.ustring.codepoint(nonASCII_char)) then
					return ""
				end
			end))
	end)
	
	-- Count only the families that pass the filter; counting every family
	-- visited would make the caption report the total number of families.
	local count = 0
	local families_that_share_name_with_language = Map:new(family_data)
		:filter(
			function (data)
				local shares_name = language_name_to_code[data[1]] ~= nil
				if shares_name then
					count = count + 1
				end
				return shares_name
			end)
	
	return ToC_item("Languages and language families with the same name")
		.. '\n{| class="wikitable sortable"\n|+ ' .. count
		.. ' pairs of languages and language families have the same canonical name\n! language !! code !! family !! code\n'
		.. families_that_share_name_with_language
			-- Convert to array and add language family code as "code" field in
			-- data table.
			:to_array("code")
			:sort(
				function (family1, family2)
					return get_sort_value(family1[1]) < get_sort_value(family2[1])
				end)
			:map(
				function (data)
					local canonicalName = data[1]
					return ("|-\n| %s || <code>%s</code> || %s || <code>%s</code>\n")
						:format(link_name(canonicalName), language_name_to_code[canonicalName],
							link_name(canonicalName, true), data.code)
				end)
			:concat()
		.. '|}'
end

-- Generates a sortable wikitable of the languages that have at least a given
-- number of scripts, with links to the category of each script.
--
-- The minimum is taken from number if supplied, otherwise from the first
-- argument of frame; an error is raised if neither yields a number.
-- The codes "und" and "mul" are excluded from both the rows and the count in
-- the caption.
-- Returns the wikitext as a string.
function export.number_of_scripts(frame, number)
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	local minimum_number_of_scripts = number or tonumber(frame.args[1])
		or error("Supply a number in parameter 1.")
	
	-- Exclude "und" and "mul" before counting, so that the number in the
	-- caption matches the number of rows in the table.
	local languages = Map:new(require "Module:languages/data/all")
			:filter(
				function (data, code)
					if code == "und" or code == "mul" then
						return false
					end
					return data[script_key] and #array_from_comma_list_or_array(data[script_key]) >= minimum_number_of_scripts
				end)
	
	local count = languages:size()

	return ToC_item("Number of scripts")
		.. '\n{| class="wikitable sortable"\n|+ ' .. count .. ' languages use ' .. minimum_number_of_scripts
		.. ' or more scripts\n! canonical name !! code !! script<br>count !! style="width: 8em;" | scripts\n'
		.. languages
			:map(
				function (data, code)
					local canonical_name = data[1]
					local scripts = array_from_comma_list_or_array(data[script_key])
					-- Same rule as the other functions in this file: only a
					-- name ending in "language" is used unchanged.
					local category_name = canonical_name:find("[Ll]anguage$") and canonical_name
						or canonical_name .. " language"
					return ('|-\n| [[:Category:%s|%s]] || <code style="white-space: nowrap;">%s</code> || %d || %s\n')
						:format(
							category_name,
							canonical_name,
							code,
							#scripts,
							scripts:map(link_script):concat(", "))
				end)
			:sorted_concat()
		.. "|}"
end

-- Tallies the codes in [[Module:languages/data/all]] by the data module that
-- their shape corresponds to (two-letter, three-letter by first letter, or
-- exceptional), plus an overall tally of three-letter codes, and returns the
-- tallies as a sortable wikitable.
function export.census(frame)
	local language_data = require "Module:languages/data/all"
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	
	local tally = require "Module:count":new()
	
	local function bump(key)
		tally[key] = tally[key] + 1
	end
	
	for code in pairs(language_data) do
		local length = #code
		if length == 2 then
			bump(2)
		elseif length == 3 then
			-- Three-letter codes are counted twice: once overall and once
			-- under the module for their first letter.
			bump(3)
			bump(3 .. code:sub(1, 1))
		else
			bump("exceptional")
		end
	end
	
	-- Turns one tally into a table row. The key is the number 2 or 3, or a
	-- string ("3" followed by a letter, or "exceptional").
	local function make_row(number_of_codes, module_key)
		if module_key == 3 then
			return ('|-\n| data-sort-value="%d" | three-letter codes || %d\n')
				:format(module_key, number_of_codes)
		end
		
		local module
		if module_key == 2 then
			module = "data/2"
		elseif module_key:sub(1, 1) == "3" then
			module = "data/3/" .. module_key:sub(2, 2)
		else
			module = "data/exceptional"
		end
		
		return ('|-\n| data-sort-value="%s" | [[Module:languages/%s]] || %d\n')
			:format(tostring(module_key), module, number_of_codes)
	end
	
	local rows = Map:new(tally)
		:map(make_row)
		:sorted_concat()
	
	return ToC_item('Languages in each module')
		.. '\n{| class="wikitable sortable"\n|+ '
		.. 'Total number of codes in each language data module\n! module !! count\n'
		.. rows
		.. '|}'
end

-- Groups the codes in [[Module:languages/data/exceptional]] by format, where
-- the format of a code is the code with every character other than a hyphen
-- replaced by "a". Returns a sortable wikitable giving, for each format, the
-- number of codes, with the codes themselves in the cell's title attribute.
function export.exceptional_code_formats(frame)
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	local exceptional_languages = Map:new(require "Module:languages/data/exceptional")
	
	-- Appends value to the list stored under key, creating the list first if
	-- there is none yet.
	local function add(self, key, value)
		local list = self[key]
		if not list then
			list = {}
			self[key] = list
		end
		table.insert(list, value)
	end
	
	-- Methods are looked up in a Map that also carries the add function.
	local codes_by_format = setmetatable({}, { __index = Map:new{ add = add } })
	
	for code in exceptional_languages:sorted_pairs() do
		local format = code:gsub("[^-]", "a")
		codes_by_format:add(format, code)
	end
	
	-- Formats are compared after each run of non-hyphen characters has been
	-- replaced by its length, e.g. "aaa-aaa" becomes "3-3".
	local function compare(format1, format2)
		local value1 = format1:gsub("[^-]+", string.len)
		local value2 = format2:gsub("[^-]+", string.len)
		return value1 < value2
	end
	
	local row_format = '|-\n| <code>%s</code> || title="%s" | %d\n'
	local function make_row(codes, format)
		local code_map = Map:new(codes)
		return row_format:format(format, code_map:sorted_concat(", "), #code_map)
	end
	
	local rows = codes_by_format
		:map(make_row)
		:sorted_concat("", compare)
	
	return ToC_item('Exceptional code formats')
		.. '\n{| class="wikitable sortable"\n|+ '
		.. 'Code formats in [[Module:languages/data/exceptional]]\n! format !! count\n'
		.. rows
		.. '|}'
end

-- Groups the languages in [[Module:languages/data/all]] (other than "und" and
-- "mul") by their list of scripts and returns a sortable wikitable with one
-- row per distinct list: the linked script codes, the number of scripts, and
-- the number of languages, with up to 80 language names in the title
-- attribute of the last cell.
function export.script_combinations(frame)
	local Array = require "Module:array"
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	local language_data = require "Module:languages/data/all"
	
	-- Appends lang_code to the list stored under key, creating the list first
	-- if there is none yet.
	local function add(self, key, lang_code)
		local list = self[key]
		if not list then
			list = {}
			self[key] = list
		end
		table.insert(list, lang_code)
	end
	
	local script_combinations = setmetatable({}, { __index = Map:new{ add = add } })
	
	for code, data in pairs(language_data) do
		if code ~= "und" and code ~= "mul" then
			local script_list = data[script_key]
			-- Languages without a script list are grouped under "None".
			local combination = "None"
			if script_list ~= nil then
				combination = array_from_comma_list_or_array(script_list):concat ", "
			end
			script_combinations:add(combination, code)
		end
	end
	
	local count = script_combinations:size()
	
	local number_of_languages_in_tooltip = 80
	local function display_language(language_code)
		return language_data[language_code][1] .. " (" .. language_code .. ")"
	end
	
	local function make_row(languages, script_list)
		-- Replace each script code with its category link; the number of
		-- replacements made is the number of scripts in the list.
		local linked_scripts, script_count = script_list:gsub("[^, ]+", link_script)
		local language_count = #languages
		
		local language_list
		if languages[2] then
			language_list = Array(languages)
				:sort()
				:slice(1, number_of_languages_in_tooltip)
				:map(display_language)
				:concat ", "
		else
			language_list = display_language(languages[1])
		end
		
		-- Signal that the tooltip does not list every language.
		if languages[number_of_languages_in_tooltip + 1] then
			language_list = language_list .. ", ..."
		end
		
		return ('|-\n| %s || %d || title="%s" | %d\n')
			:format(linked_scripts, script_count, language_list,
				language_count)
	end
	
	local function compare_case_insensitively(script_list1, script_list2)
		return script_list1:lower() < script_list2:lower()
	end
	
	return ToC_item('Script combinations')
.. [[

{| class="wikitable sortable"
|+ ]] .. count .. [[ script combinations (sorted alphabetically) and the number of languages that use them
! style="width: 8em;" | script list !! script<br>count !! languages
]]
		.. script_combinations
			:map(make_row)
			:sorted_concat("", compare_case_insensitively)
		.. '|}'
end

-- Counts, over all languages in [[Module:languages/data/all]], how many
-- language data tables contain each key, and returns the counts as a sortable
-- wikitable. A row for the total number of languages comes first, followed by
-- the keys in order of decreasing count.
function export.count_data_items(frame)
	local counts = require "Module:count":new()
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	
	for _, data in pairs(require "Module:languages/data/all") do
		for k in pairs(data) do
			counts[k] = counts[k] + 1
		end
		counts.total = counts.total + 1
	end
	
	-- Descriptions of the numbered data items 1, 2 and 3.
	local info = { "canonical name", "Wikidata item", "family" }
	
	return ToC_item('Data item census')
.. [[

{| class="wikitable sortable"
|+ Number of languages with each data item in their table
! data item !! count
|-
]]
		.. Map:new(counts)
			:map(
				function (count, data_key)
					if data_key == "total" then
						return ("| total<br>languages || %d"):format(count)
					elseif info[data_key] then
						return ("| <code>%s</code> (%s) || %d"):format(data_key, info[data_key], count)
					else
						return ("| <code>%s</code> || %d"):format(data_key, count)
					end
				end)
			:sorted_concat(
				"\n|-\n",
				function (data_key1, data_key2)
					-- Ensure "total languages" shows at the top. The comparison
					-- must be a strict ordering: "total" sorts before every
					-- other key, but never before itself, and no key sorts
					-- before "total".
					if data_key1 == "total" then
						return data_key2 ~= "total"
					elseif data_key2 == "total" then
						return false
					else
						return counts[data_key1] > counts[data_key2]
					end
				end)
	.. "\n|}"
end

-- Returns a sortable wikitable of the languages in
-- [[Module:languages/data/all]] that have no script list, giving for each the
-- code, a Wikipedia link labelled with the language name, and a link to the
-- data module that corresponds to the shape of the code.
function export.no_scripts(frame)
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	
	local function has_no_scripts(data)
		return data[script_key] == nil
	end
	
	local function make_row(data, code)
		local name = data[1]
		
		-- Wikipedia article title: the explicit one if present, else the
		-- enwiki sitelink of the Wikidata item, else a title derived from
		-- the language name.
		local article = data.wikipedia_article
		if not article and data.wikidata_item then
			article = mw.wikibase.sitelink(data.wikidata_item, 'enwiki')
		end
		if not article then
			article = name:find("[Ll]anguage") and name or name .. " language"
		end
		
		local module
		if #code == 3 then
			module = "data/3/" .. code:sub(1, 1)
		elseif #code == 2 then
			module = "data/2"
		else
			module = "data/exceptional"
		end
		
		return ('| %s || [[w:%s|%s]] || [[Module:languages/%s|%s]]')
			:format(code, article, name, module, module)
	end
	
	return ToC_item('Languages with no scripts') .. 
[[

{| class="wikitable sortable"
|+ Languages with no scripts
! code !! name !! module
|-
]] .. Map:new(require "Module:languages/data/all")
		:filter(has_no_scripts)
		:map(make_row)
		:sorted_concat("\n|-\n")
	.. "\n|}"
end

-- Returns a sortable wikitable of entry name replacements. Only languages
-- whose entry_name field is a table are considered, and a row is produced for
-- each value in that table that has a "from" field. The row shows the
-- language, the script that findBestScript picks for the concatenated "from"
-- items, and the "from" and "to" lists.
function export.entry_name_replacements(frame)
	local Array = require "Module:array"
	local Map = require "Module:User:Erutuon/lang_stuff/map"
	local add_dotted_circle = require "Module:Unicode data".add_dotted_circle
	
	-- Wraps str in a <span> whose class is the script code.
	local function script_tag(script_code, str)
		return '<span class="' .. script_code .. '">' .. str .. '</span>'
	end
	
	-- Displays a "from" or "to" list: nothing if it is missing or empty, the
	-- sole item if there is only one, otherwise the comma-separated items
	-- passed through add_dotted_circle.
	local function show_from_or_to(from_or_to, script_code)
		if not (from_or_to and from_or_to[1]) then
			return ""
		end
		
		local text
		if from_or_to[2] then
			text = add_dotted_circle(Array(from_or_to):concat ", ") or from_or_to[1]
		else
			text = from_or_to[1]
		end
		return script_tag(script_code, text)
	end
	
	local function has_replacement_table(data)
		return type(data.entry_name) == "table"
	end
	
	local row_format = '|-\n| %s (<code>%s</code>) || %s || %s<br>&darr;<br>%s'
	
	local function make_rows(data, code)
		local rows = Array()
		for _, replacements in pairs(data.entry_name) do
			if replacements.from then
				local best_script = require "Module:languages".getByCode(code)
					:findBestScript(Array(replacements.from):concat())
				local script_code = best_script:getCode()
				
				rows:insert(row_format:format(
					data[1], code,
					link_script(best_script:getCode()),
					show_from_or_to(replacements.from, script_code),
					show_from_or_to(replacements.to, script_code)))
			end
		end
		return rows:concat("\n")
	end
	
	local header = 'Languages with entry name replacements'
	
	-- Insert the header text into the caption line of the table.
	local table_start = ([[

{| class="wikitable sortable"
|+ header
! language !! script !! replacements
|-
]]):gsub('header', header)
	
	return ToC_item(header) .. table_start
		.. Map:new(require "Module:languages/data/all")
			:filter(has_replacement_table)
			:map(make_rows)
			:sorted_concat "\n|-\n"
		.. "\n|}"
end

-- Returns a sortable wikitable that lists, for each Wikimedia language code
-- found in the wikimedia_codes field of the language data, the Wiktionary
-- languages that name it, with links to their categories.
function export.wikimedia_languages(frame)
	local fun = require "Module:fun"
	
	-- Category name of a language: the canonical name, with " language"
	-- appended unless the name already ends in "language". This is the rule
	-- used by the other functions in this file.
	local function get_category_name(canonical_name)
		return canonical_name:find("[Ll]anguage$") and canonical_name
			or canonical_name .. " language"
	end
	
	-- Maps each Wikimedia code to a list of Wiktionary codes; a missing list
	-- is created on first access.
	local languages_with_Wikimedia_code = setmetatable({},
		{
			__index = function (self, key)
				local val = {}
				self[key] = val
				return val
			end,
		})
	
	local language_data = require "Module:languages/data/all"
	
	for code, data in pairs(language_data) do
		if data.wikimedia_codes then
			-- wikimedia_codes is read as a string of codes separated by commas
			-- or whitespace.
			for wikimedia_code in data.wikimedia_codes:gmatch "[^%s,]+" do
				table.insert(languages_with_Wikimedia_code[wikimedia_code],
					code)
			end
		end
	end
	
	for _, codes in pairs(languages_with_Wikimedia_code) do
		if codes[2] then
			table.sort(codes)
		end
	end
	
	return ToC_item("Wiktionary languages by Wikimedia language")
		.. [[

{| class="wikitable sortable"
|+ Languages by their Wikimedia language
! Wikimedia language !! Wiktionary language
]] .. table.concat(
		fun.mapIter(
			function (Wiktionary_codes, Wikimedia_code)
				return ("|-\n| [https://%s.wiktionary.org %s] || %s"):format(
					Wikimedia_code,
					Wikimedia_code,
					table.concat(
						fun.map(
							function (code)
								local canonical_name = language_data[code][1]
								return ("%s ([[:Category:%s|%s]])"):format(
									code,
									get_category_name(canonical_name),
									canonical_name)
							end,
							Wiktionary_codes),
						", "))
			end,
			require "Module:table".sortedPairs(languages_with_Wikimedia_code)),
		"\n")
		.. "\n|}"
end

-- Returns a sortable wikitable of the names (canonical names and items of
-- otherNames) that belong to more than one language, each followed by links
-- to the languages concerned. A language is highlighted with <mark> when the
-- name is its canonical name.
function export.ambiguous_names(frame)
	local Map = require "Module:User:Erutuon/lang stuff/map"
	local Array = require "Module:array"
	local language_data = require "Module:languages/data/all"
	local language_objects = require "Module:languages/cache"
	
	-- Maps each name to an array of language codes; a missing array is created
	-- on first access.
	local name_to_codes = {}
	setmetatable(name_to_codes, {
		__index = function (self, key)
			local val = Array()
			self[key] = val
			return val
		end
	})
	
	-- Records that the language with the given code has the given name. All
	-- names of one language are added consecutively, so looking at the last
	-- code recorded is enough to avoid listing a language twice under a name
	-- that it repeats (for instance in both its canonical name and otherNames).
	local function add_code(name, code)
		local codes = name_to_codes[name]
		if codes[#codes] ~= code then
			codes:insert(code)
		end
	end
	
	for code, data in pairs(language_data) do
		add_code(data[1], code)
		if data.otherNames then
			for _, name in ipairs(data.otherNames) do
				add_code(name, code)
			end
		end
	end
	
	return ToC_item("Languages with ambiguous canonical or non-canonical names")
		.. [[

{| class="wikitable sortable"
|+ Canonical or non-canonical names that correspond to more than one language
! name !! languages]]
		.. Map:new(name_to_codes)
			:filter(function (languages) return #languages > 1 end)
			:map(
				function (lang_codes, name)
					local languages = Array(lang_codes)
						:map(function (lang_code) return language_objects[lang_code] end)
						:sort(function (lang1, lang2)
								return lang1:getCanonicalName() < lang2:getCanonicalName()
							end)
						:map(function (lang)
								return (lang:getCanonicalName() == name
									and "<mark>[[:Category:%s|%s]]</mark> (<code>%s</code>)"
									or "[[:Category:%s|%s]] (<code>%s</code>)")
									:format(lang:getCategoryName(), lang:getCanonicalName(), lang:getCode())
							end)
						:concat(", ")
					return ("\n|-\n| %s || %s"):format(name, languages)
				end)
			-- Same method name as every other use of the map module in this
			-- file.
			:sorted_concat()
		.. "\n|}"
end

-- Returns a bulleted list of the languages whose canonical name begins with
-- the canonical name of another language followed by a space. Each item links
-- to the language's category and to the categories of the languages whose
-- names are such prefixes, longest prefix first.
function export.languages_with_prefix_of_another_language(frame)
	local Array = require "Module:array"
	local all_languages = require "Module:languages/data/all"
	local language_name_to_code = require "Module:languages/canonical names"
	
	local function get_category_name(canonical_name)
		return canonical_name:find("[Ll]anguage$") and canonical_name
			or canonical_name .. " language"
	end
	
	local function make_category_link(canonical_name)
		return ("[[:Category:%s|%s]]")
			:format(get_category_name(canonical_name), canonical_name)
	end
	
	return require "Module:User:Erutuon/lang stuff/map":new(all_languages)
		:map(function (data, code)
				local name = data[1]
				local words = mw.text.split(name, " ", true)
				local prefixes
				-- Try every proper prefix made of whole words, longest first.
				for i = #words - 1, 1, -1 do
					local prefix = table.concat(words, " ", 1, i)
					if language_name_to_code[prefix] then
						prefixes = prefixes or {}
						table.insert(prefixes, prefix)
					end
				end
				-- Return a fresh record instead of adding a field to the
				-- language data: that table is shared with every other user of
				-- the data module, and a field written into it would persist
				-- and accumulate over repeated calls.
				return { name = name, prefixes = prefixes }
			end)
		:filter(function (entry, code)
				return entry.prefixes ~= nil
			end)
		:map(function (entry, code)
				return ("* %s (<code>%s</code>): %s")
					:format(make_category_link(entry.name), code,
						Array(entry.prefixes)
							:map(make_category_link)
							:concat(", "))
			end)
		:sorted_concat("\n")
end

-- Returns a sortable wikitable of the languages whose data table has an
-- otherNames field, showing the name, the code, the comma-separated items of
-- otherNames, and a link to the data module reported by
-- getDataModuleName for the code.
function export.languages_with_otherNames_field(frame)
	local Array = require "Module:array"
	local all_languages = require "Module:languages/data/all"
	local get_data_module = require "Module:languages".getDataModuleName
	
	-- Links to the category of the language with the given canonical name.
	local function make_category_link(canonical_name)
		local category_name = canonical_name
		if not canonical_name:find("[Ll]anguage$") then
			category_name = canonical_name .. " language"
		end
		return ("[[:Category:%s|%s]]"):format(category_name, canonical_name)
	end
	
	-- Links to the data module of a code, displaying the module name without
	-- "languages/".
	local function module_link(code)
		local module = get_data_module(code)
		local label = module:gsub("languages/", "")
		return "[[Module:" .. module .. "|" .. label .. "]]"
	end
	
	local function has_other_names(data)
		return data.otherNames ~= nil
	end
	
	local row_format = "|-\n| %s || <code>%s</code> || %s || %s\n"
	local function make_row(data, code)
		return row_format:format(
			make_category_link(data[1]),
			code,
			Array(data.otherNames):concat(", "),
			module_link(code))
	end
	
	return [[
{| class="wikitable sortable"
|+ Languages with <code>otherNames</code> field in their language data
! name !! code !! otherNames !! module
]] .. require "Module:User:Erutuon/lang stuff/map":new(all_languages)
		:filter(has_other_names)
		:map(make_row)
		:sorted_concat()
	.. "|}"
end

-- Returns a sortable wikitable of transliteration modules with unexpected
-- names. A module name is expected if it is the language code followed by
-- "-translit", if it is "translit-redirect", or if it ends in "-translit" and
-- contains the code of one of the language's scripts; every other module name
-- that is a string gets a row.
function export.languages_with_odd_translit_modules(frame)
	local Array = require "Module:array"
	local all_languages = require "Module:languages/data/all"
	
	-- Links to the category of the language with the given canonical name.
	local function make_category_link(canonical_name)
		local category_name = canonical_name
		if not canonical_name:find("[Ll]anguage$") then
			category_name = canonical_name .. " language"
		end
		return ("[[:Category:%s|%s]]"):format(category_name, canonical_name)
	end
	
	local function has_translit(data, code)
		return data.translit ~= nil
	end
	
	local row_format = "|-\n| %s || <code>%s</code> || %s || [[Module:%s]]\n"
	
	local function make_rows(data, code)
		-- Treat a single module name like a table with one unkeyed module.
		local translit_modules = data.translit
		if type(translit_modules) ~= "table" then
			translit_modules = { translit_modules }
		end
		
		local function is_expected(translit)
			if translit == code .. "-translit" or translit == "translit-redirect" then
				return true
			end
			local scripts = data[script_key]
			return scripts and array_from_comma_list_or_array(scripts)
				:some(function (language_script)
						-- Hyphens in the script code are escaped for the pattern.
						local escaped = language_script:gsub("%-", "%%-")
						return translit:find("^.*" .. escaped .. ".*%-translit$")
					end)
		end
		
		local rows
		for script_code, translit in pairs(translit_modules) do
			if type(translit) == "string" and not is_expected(translit) then
				-- Only a string key names a script to link to.
				local script_link = type(script_code) == "string"
					and require "Module:scripts".getByCode(script_code):makeCategoryLink()
					or ""
				rows = rows or Array()
				rows:insert(row_format:format(
					make_category_link(data[1]), code, script_link, translit))
			end
		end
		return rows and rows:concat "\n" or ""
	end
	
	local caption = "Languages with odd transliteration modules (not beginning with language or script code)"
	
	return ToC_item(caption) .. [[

{| class="wikitable sortable"
|+ ]] .. caption .. [[

! name !! code !! script !! transliteration<br>module
]] .. require "Module:User:Erutuon/lang stuff/map":new(all_languages)
		:filter(has_translit)
		:map(make_rows)
		:sorted_concat()
	.. "|}"
end

-- Returns a sortable wikitable that shows each distinct combination of keys
-- found in the language data tables and the number of languages whose table
-- has exactly that combination, most frequent first.
--
-- The data is gathered from the two-letter module, the three-letter modules
-- for "a" to "z" and the exceptional module.
function export.data_item_combinations(frame)
	local Array = require "Module:array"
	local Map = require "Module:User:Erutuon/lang stuff/map"
	local all_languages = {}
	
	-- Copies every language from one data module into all_languages.
	local function transfer(module_subpage)
		for code, data in pairs(require("Module:languages/" .. module_subpage)) do
			all_languages[code] = data
		end
	end
	
	transfer "data/2"
	for b = ("a"):byte(), ("z"):byte() do
		transfer("data/3/" .. string.char(b))
	end
	transfer "data/exceptional"
	
	-- Gather the set of all keys used by any language, list them, and invert
	-- the list. NOTE(review): invert presumably yields a key -> position
	-- lookup; confirm against Module:array.
	local data_key_order = Array.keys(Map:new(all_languages)
		:values()
		:fold(
			function(set, data)
				for k in pairs(data) do
					set[k] = true
				end
				return set
			end,
			{})):invert()
	
	-- Builds a sort value for a list of keys: one character per key, chosen
	-- by the key's position in data_key_order ("a" for the first, and so on).
	local function make_sortkey(data_key_list)
		local sortkey = Array()
		-- ipairs rather than pairs: the characters must be appended in list
		-- order, and pairs does not specify its traversal order.
		for _, key in ipairs(data_key_list) do
			local order = assert(data_key_order[key])
			local char = string.char(("a"):byte() + order - 1)
			sortkey:insert(char)
		end
		return sortkey:concat()
	end
	
	-- The list of keys of each language.
	local data_keys = Map:new(all_languages)
		:map(
			function (data)
				return Array.keys(data)
			end)
		:values()
	
	-- One record per distinct combination, identified by the keys joined with
	-- ", ", holding the list of keys and the number of languages that have it.
	local data_key_counts = data_keys
		:fold(
			function (counts, data_items)
				local key = data_items:concat ", "
				counts[key] = (counts[key] or {})
				counts[key].count = (counts[key].count or 0) + 1
				counts[key].items = data_items
				return counts
			end,
			Map:new())
		:values()
	
	data_key_counts = data_key_counts:sort(
		function(counts1, counts2)
			return counts1.count > counts2.count
		end)
	
	local caption = "Count of each combination of data items"
	
	return ToC_item(caption) .. [[

{| class="wikitable sortable"
|+ ]] .. caption .. [[

! combination<br>of data keys !! number of languages
]] .. data_key_counts
		:map(
			function(count)
				return ('|-\n| data-sort-value="%s" | %s || %d\n'):format(make_sortkey(count.items), count.items:concat ", ", count.count)
			end)
		:concat()
	.. "|}"
end

-- Runs the exported functions named in the numbered arguments of frame, in
-- order, and returns their results joined by newlines.
--
-- number_of_scripts is called with a minimum of 3. A function that raises an
-- error contributes nothing to the output; the error and a traceback are
-- reported with mw.addWarning instead. Naming a function that does not exist
-- raises an error.
function export.show(frame)
	local out = {}
	for _, function_name in ipairs(frame.args) do
		local func = export[function_name] or error("No exported function " .. function_name)
		local ok, result = xpcall(function()
			if function_name == "number_of_scripts" then
				return func(frame, 3)
			else
				return func(frame)
			end
		end, function (err)
			mw.addWarning("Error running export." .. function_name .. ":\n" .. tostring(err) .. "\n" .. debug.traceback())
		end)
		-- Append rather than store at the argument's index: a failed function
		-- would otherwise leave a nil hole in the list, which table.concat
		-- cannot handle reliably.
		if ok and result ~= nil then
			out[#out + 1] = result
		end
	end
	
	return table.concat(out, "\n")
end

-- Hand the table of exported functions to whoever loads this module.
return export