Module:data consistency check

Definition from Wiktionary, the free dictionary
Jump to navigation Jump to search

This module checks the validity and internal consistency of the language, language family, and script data used on Wiktionary: the modules in Category:Language data modules as well as Module:scripts/data.

Checks performed[edit]

For multiple data modules:

  • Codes for languages, families and etymology-only languages must be unique and cannot clash with one another.
  • Canonical names for languages, families, and etymology-only languages must not be found in the list of other names.
  • Each name in the list of other names must appear only once.
  • otherNames, if present, must be an array.

Codes in Module:languages data must:

  • Be defined in the correct submodule according to whether the code is two-letter, three-letter or exceptional.
  • Have canonicalName, which must not be the same as the canonical name of another language.
  • If scripts is given, then it must be an array, and each string in the array must be a valid script code.
  • If family is given, it must be a valid family code.
  • If type is given, it must be one of the recognised values (regular, reconstructed, appendix-constructed).
  • If entry_name is given, it must contain two arrays (from and to).
  • If sort_key is given, it must be either a string or a table containing two arrays (from and to).
  • If entry_name or sort_key is given as a table, the from array must be at least as long as the to array.
  • If standardChars is given, it must form a valid Lua string pattern when placed between square brackets with ^ before it ("[^...]"). (It should match all characters regularly used in the language, but that cannot be tested.)
  • Have no data keys besides these: "canonicalName", "entry_name", "sort_key", "otherNames", "type", "scripts", "family", "ancestors", "wikimedia_codes", "wikipedia_article", "standardChars", "translit_module", "override_translit", "link_tr", "wikidata_item".

Checks not performed:

  • If translit_module is present, it should be the name of a module, and this module should contain a tr function that takes a pagename (and optionally a language code and script code) as arguments.
  • If sort_key is a string, it should be the name of a module, and this module should contain a makeSortKey function that takes a pagename (and optionally a language code and script code) as arguments.

These are not checked here, because module errors will quickly crop up in entries if these conditions are not met, assuming that Module:utilities attempts to generate a sortkey for a category pertaining to the language in question, or full_link attempts to use the transliteration module.

Module:languages/code to canonical name and Module:languages/canonical names must contain all the codes and canonical names found in the data submodules of Module:languages, and no more.

Codes in Module:etymology languages data must:

  • Have canonicalName.
  • Have parent, which must be a valid language, family or etymology-only language code.
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "ancestors", "wikipedia_article", "wikidata_item".

Codes in Module:families data must:

  • Have canonicalName, which must not be the same as the canonical name of another family.
  • If family is given, it must be a valid family code.
  • Have at least one language or subfamily belonging to it.
  • Have no data keys besides these: "canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item".

Codes in Module:scripts data must:

  • Have canonicalName.
  • Have at least one language that lists it as one of its scripts.
  • Have a characters pattern for script autodetection, and this must form a valid Lua string pattern when placed between square brackets ("[...]"). (It should match all characters in the script, but that cannot be tested.)
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "systems", "wikipedia_article", "characters", "direction".

Output[edit]

Discrepancies detected:

Module:families/data

Module:languages/canonical names

  • The canonical name Proto-Sanglechi-Ishkashimi (ira-sgi-pro) is missing.
  • The canonical name Proto-Bodo-Garo (tbq-bdg-pro) is missing.

Module:languages/code to canonical name

  • The code ira-sgi-pro (Proto-Sanglechi-Ishkashimi) is missing.
  • The code ira-wnj (Wanji) is missing.
  • The code tbq-bdg-pro (Proto-Bodo-Garo) is missing.

Module:languages/datax

  • Wanji language (ira-wnj) has a canonical name that is not unique, it is also used by the code wbi.

Module:scripts/data


local export = {}

local m_table = require("Module:table")
local list_to_set = m_table.listToSet

-- Discrepancy messages, grouped by the name of the data module they concern.
-- Reading a module name that has no list yet creates an empty list, stores it
-- under that name and returns it.
local messages = setmetatable({}, {
	__index = function(tbl, modname)
		local list = {}
		tbl[modname] = list
		return list
	end,
})

-- Records one discrepancy for the module `modname`. The remaining arguments
-- are handed to string.format to build the message text.
local function discrepancy(modname, ...)
	local list = messages[modname]
	list[#list + 1] = string.format(...)
end

-- Maps each code seen so far to the name of the data module that defined it.
local all_codes = {}

-- Map canonical names to the code that first used them, one table per kind.
local language_names, family_names, script_names = {}, {}, {}

-- Sets of family and script codes that are referenced by at least one entry.
local nonempty_fams, nonempty_scrs = {}, {}
	
-- Builds a wikilink to the category of the language called `name`.
-- Returns "???" when no name is given. " language" is appended to the name
-- unless it already ends in "language" or "Language".
local function link(name)
	if not name then
		return "???"
	end

	local title = name
	if not name:find("[Ll]anguage$") then
		title = name .. " language"
	end
	return "[[:Category:" .. title .. "|" .. title .. "]]"
end
	
-- Builds a wikilink to the category of the script called `name`.
-- Returns "???" when no name is given. Names ending in "code" or "semaphore"
-- link to a category of the same name with its first letter capitalised;
-- every other name gets " script" appended.
local function link_script(name)
	if not name then
		return "???"
	end

	local is_code_like = name:find("[Cc]ode$") or name:find("[Ss]emaphore$")
	if is_code_like then
		-- Assigning to a single local keeps only gsub's first result.
		local category = name:gsub("^%l", string.upper)
		return "[[:Category:" .. category .. "|" .. name .. "]]"
	end

	local title = name .. " script"
	return "[[:Category:" .. title .. "|" .. title .. "]]"
end

local m_fun = require("Module:fun")
local map = m_fun.map

-- Reports the unrecognised keys found in one data table.
--   modname       name of the data module the table comes from
--   code          code of the language, family or script
--   data          the data table itself
--   invalid_keys  non-empty array of the offending keys
--   is_script     true to link the name as a script rather than a language
local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local plural = invalid_keys[2] ~= nil
	local formatted_keys = map(
		function(key)
			-- tostring keeps non-string keys (numbers, booleans) from
			-- breaking the concatenation.
			return '<code>' .. tostring(key) .. '</code>'
		end,
		invalid_keys)
	local make_link = is_script and link_script or link
	-- Language data tables keep the canonical name at index 1 rather than
	-- under canonicalName, so fall back to it.
	local name = data.canonicalName or data[1]
	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		plural and "s" or "",
		table.concat(formatted_keys, ", "),
		make_link(name),
		code,
		plural and "are" or "is")
end

-- Returns a checker function (modname, code, data) that reports every key of
-- `data` which is not listed in `valid_keys`. `is_script` is passed on to
-- invalid_keys_message.
local function check_data_keys(valid_keys, is_script)
	local allowed = list_to_set(valid_keys)

	return function (modname, code, data)
		local unknown = {}
		for key in pairs(data) do
			if not allowed[key] then
				unknown[#unknown + 1] = key
			end
		end
		-- Only report when at least one unknown key was collected.
		if unknown[1] ~= nil then
			invalid_keys_message(modname, code, data, unknown, is_script)
		end
	end
end

-- Checks the list of other names of one entry: the canonical name must not
-- appear in it, and no name may appear more than once.
--   modname         name of the data module being checked
--   code            code of the entry
--   canonical_name  canonical name of the entry (may be nil when missing)
--   other_names     array of other names (nil is treated as empty)
local function check_other_names(modname, code, canonical_name, other_names)
	-- An entry without a canonical name is reported by the caller; it must
	-- not make the message formatting below fail on a nil argument.
	local shown_name = canonical_name or "???"
	local seen = {}
	for _, other_name in ipairs(other_names or {}) do
		if other_name == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>otherNames</code>.",
				shown_name, code)
		end
		if seen[other_name] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>otherNames</code> for %s (<code>%s</code>)",
				tostring(other_name), shown_name, code)
		end
		seen[other_name] = true
	end
end

local get_codepoint = mw.ustring.codepoint

-- Checks a character-set pattern taken from a data table.
--   pattern        the pattern; it is tested wrapped in square brackets
--   modname        name of the data module being checked
--   code           code of the language or script
--   data           data table the pattern was taken from
--   standardChars  true for a language's standardChars pattern, false for a
--                  script's characters pattern
-- Reports a discrepancy when the pattern is not a string, when a range
-- "a-b" does not have its first codepoint below its second, or when
-- mw.ustring.find rejects "[" .. pattern .. "]".
local function validate_pattern(pattern, modname, code, data, standardChars)
	-- Language data tables keep the canonical name at index 1.
	local name = data.canonicalName or data[1]
	-- standardChars belongs to a language; the other pattern to a script.
	local make_link = standardChars and link or link_script

	if type(pattern) ~= "string" then
		discrepancy(modname, '"%s", the %spattern for %s (<code>%s</code>), is not a string.',
			tostring(pattern), standardChars and 'standard character ' or '', make_link(name), code)
		-- The remaining checks need a string.
		return
	end

	local ranges
	for lower, higher in mw.ustring.gmatch(pattern, "(.)%-(.)") do
		if get_codepoint(lower) >= get_codepoint(higher) then
			ranges = ranges or {}
			table.insert(ranges, { lower, higher })
		end
	end
	if ranges and ranges[1] then
		local plural = ranges[2] and "s" or ""
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern ' ..
			'for %scharacter detection: <code>"%s"</code>. The first codepoint%s ' ..
			'in the range%s %s %s must be less than the second.',
			make_link(name), code, standardChars and 'standard ' or '', pattern, plural, plural,
			table.concat(
				map(
					function(range)
						return range[1] .. "-" .. range[2] .. string.format(" (U+%X, U+%X)",
							get_codepoint(range[1]), get_codepoint(range[2]))
					end,
					ranges),
				", "),
			ranges[2] and "are" or "is")
	end

	-- pcall turns a pattern syntax error into a reported discrepancy.
	if not pcall(mw.ustring.find, "", "[" .. pattern .. "]") then
		discrepancy(modname, '%s (<code>%s</code>) specifies an invalid pattern for ' ..
			(standardChars and 'standard ' or '') .. 'character detection: <code>"%s"</code>',
			make_link(name), code, pattern)
	end
end

-- Modification of isArray in [[Module:table]].
-- Returns the first index i, with 1 <= i <= number of entries in `t`, for
-- which t[i] is nil; returns nothing when there is no such index.
local function find_gap(t)
	-- First count the entries of the table...
	local count = 0
	for _ in pairs(t) do
		count = count + 1
	end
	-- ...then look for the first missing index within that range.
	for index = 1, count do
		if t[index] == nil then
			return index
		end
	end
end

-- Checks that data[array_name] is an array without gaps and reports a
-- discrepancy for the module `modname` otherwise.
local function check_array(modname, code, data, array_name)
	-- Language data tables keep the canonical name at index 1; "???" keeps
	-- the message formatting from failing when no name is present at all.
	local name = data.canonicalName or data[1] or "???"
	local array = data[array_name]

	if type(array) ~= "table" then
		-- pairs() would raise an error on a non-table value.
		discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be an array.",
			array_name, name, code)
		return
	end

	local gap = find_gap(array)
	if gap then
		discrepancy(modname, "The %s array in the data table for %s (<code>%s</code>) has a gap at index %d.",
			array_name, name, code, gap)
	end
end

local repl_keys = { "from", "to" }

-- Checks the entry_name or sort_key field of a language data table.
--   modname            name of the data module being checked
--   code               language code
--   data               language data table
--   replacements_name  "entry_name" or "sort_key"
-- A string value is accepted for sort_key only. A table value must hold the
-- arrays `from` and `to`, neither may have gaps, and `to` must not be longer
-- than `from`.
local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	-- Language data tables keep the canonical name at index 1.
	local name = data.canonicalName or data[1] or "???"
	local replacements = data[replacements_name]

	if type(replacements) ~= "table" then
		if type(replacements) ~= "string" or replacements_name ~= "sort_key" then
			discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.",
				replacements_name, name, code)
		end
		return
	end

	for _, key in ipairs(repl_keys) do
		local array = replacements[key]
		if type(array) ~= "table" then
			-- Report the missing array instead of letting find_gap fail.
			discrepancy(modname, "The %s table for %s (<code>%s</code>) has no <code>%s</code> array.",
				replacements_name, name, code, key)
		else
			local gap = find_gap(array)
			if gap then
				discrepancy(modname, "The %s array in the %s table for %s (<code>%s</code>) has a gap at index %d.",
					key, replacements_name, name, code, gap)
			end
		end
	end

	local from, to = replacements.from, replacements.to
	if type(from) == "table" and type(to) == "table"
			and m_table.length(to) > m_table.length(from) then
		discrepancy(modname,
			"The <code>from</code> array in the %s table for %s (<code>%s</code>) must be at least as long as the <code>to</code> array.",
			replacements_name, name, code)
	end
end

-- Checks every language data module (two-letter, three-letter and exceptional
-- codes) and then cross-checks [[Module:languages/code to canonical name]]
-- and [[Module:languages/canonical names]] against the data that was found.
-- Language data tables hold the canonical name, Wikidata item and family code
-- at indices 1, 2 and 3. Fills all_codes, language_names, nonempty_fams and
-- nonempty_scrs as a side effect.
local function check_languages()
	local m_family_data = mw.loadData('Module:families/data')
	local m_script_data = mw.loadData('Module:scripts/data')
	local m_language_codes = mw.loadData('Module:languages/code to canonical name')
	local m_language_canonical_names = mw.loadData('Module:languages/canonical names')
	
	local check_language_data_keys = check_data_keys{
		1, 2, 3, -- canonical name, wikidata item, family
		"entry_name", "sort_key", "otherNames", "type", "scripts", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit_module", "override_translit", "link_tr",
	}
	
	-- Checks the data table of a single language.
	local function check_language(modname, code, data)
		local canonical_name, wikidata_item, lang_type = data[1], data[2], data.type
		-- Used where the name is formatted without going through link(), so
		-- that a missing name is reported rather than raising an error.
		local shown_name = canonical_name or "???"
		
		check_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_language_codes[code] then
				discrepancy("languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, shown_name)
			end
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(canonical_name), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end
		
		if wikidata_item then
			-- A non-string item has no :match method; report it as invalid too.
			if type(wikidata_item) ~= "string" or not wikidata_item:match '^Q%d+$' then
				discrepancy(modname,
					"%s (<code>%s</code>) has a Wikidata item with an invalid form: <code>%s</code>.",
					shown_name, code, tostring(wikidata_item))
			end
		end
		
		if data.otherNames then
			check_other_names(modname, code, canonical_name, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if lang_type and (lang_type ~= "regular") and (lang_type ~= "reconstructed") and (lang_type ~= "appendix-constructed") then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(canonical_name), code, tostring(lang_type))
		end
		
		if data.scripts then
			check_array(modname, code, data, "scripts")
			if not data.scripts[1] then
				discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(canonical_name), code)
			else
				for _, sccode in ipairs(data.scripts) do
					if not m_script_data[sccode] then
						discrepancy(modname,
							"%s (<code>%s</code>) lists an invalid script code <code>%s</code>.",
							link(canonical_name), code, sccode)
					end
		
					-- Remember that this script is used by some language.
					nonempty_scrs[sccode] = true
				end
			end
		end
		
		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid family code <code>%s</code>.",
					link(canonical_name), code, family)
			end
			
			-- Remember that this family has at least one member.
			nonempty_fams[family] = true
		end
		
		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end

		if data.standardChars then
			validate_pattern(data.standardChars, modname, code, data, true)
		end
	end
	
	-- Check two-letter codes
	local modname = "languages/data2"
	local data2 = mw.loadData("Module:" .. modname)
	
	for code, data in pairs(data2) do
		if not code:find("^[a-z][a-z]$") then
			discrepancy(modname, '%s (<code>%s</code>) does not have a two-letter code.', link(data[1]), code)
		end
		
		check_language(modname, code, data)
	end
	
	-- Check three-letter codes
	for i = string.byte('a'), string.byte('z') do
		local letter = string.char(i)
		local modname = "languages/data3/" .. letter
		local data3 = mw.loadData("Module:" .. modname)
		local code_pattern = "^" .. letter .. "[a-z][a-z]$"
		
		for code, data in pairs(data3) do
			if not code:find(code_pattern) then
				discrepancy(modname,
					'%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".',
					link(data[1]), code, letter)
			end
			
			check_language(modname, code, data)
		end
	end
	
	-- Check exceptional codes
	modname = "languages/datax"
	local datax = mw.loadData("Module:" .. modname)
	
	for code, data in pairs(datax) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(modname, '%s (<code>%s</code>) has a two- or three-letter code.', link(data[1]), code)
		end
		
		check_language(modname, code, data)
	end
	
	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	local function check_code_and_name(modname, code, canonical_name)
		if not all_codes[code] then
			if not language_names[canonical_name] then
				discrepancy(modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in a submodule of [[Module:languages]].",
					code, canonical_name)
			else
				discrepancy(modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, language_names[canonical_name])
			end
		elseif not language_names[canonical_name] then
			-- all_codes[code] is the name of the module that defines the code.
			local data_table = mw.loadData("Module:" .. all_codes[code])[code]
			discrepancy(modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, data_table[1] or data_table.canonicalName or "???")
		end
	end
	
	for code, canonical_name in pairs(m_language_codes) do
		check_code_and_name("languages/code to canonical name", code, canonical_name)
	end
	
	for canonical_name, code in pairs(m_language_canonical_names) do
		check_code_and_name("languages/canonical names", code, canonical_name)
	end
end

-- Checks [[Module:etymology languages/data]]: allowed keys, unique codes,
-- presence of a canonical name, the list of other names, validity of the
-- parent code, and absence of cycles in the chain of parents.
-- Marks every parent code in nonempty_fams as a side effect.
local function check_etym_languages()
	local modname = "etymology languages/data"
	local m_etym_language_data = require("Module:" .. modname) -- no mw.loadData
	local m_language_data = mw.loadData("Module:languages/alldata")
	local m_family_data = mw.loadData('Module:families/data')
	
	local check_etymology_language_data_keys = check_data_keys{
		"canonicalName", "otherNames", "parent",
		"wikipedia_article", "wikidata_item"
	}
	
	-- Plain-text variant of the file-level link(): no wikilink is produced.
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguage$") then
			return name
		else
			return name .. " language"
		end
	end
	
	for code, data in pairs(m_etym_language_data) do
		local canonical_name, parent = data.canonicalName, data.parent
		check_etymology_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			--[=[
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(data.names[1]), code, language_names[data.names[1]])
			--]=]
		else
			language_names[canonical_name] = code
		end
		
		if data.otherNames then
			check_other_names(modname, code, canonical_name, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if parent then
			if not m_language_data[parent] and not m_family_data[parent] and not m_etym_language_data[parent] then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
					link(canonical_name), code, parent)
			end
			
			nonempty_fams[parent] = true
		else
			discrepancy(modname,
				"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
				link(canonical_name), code)
		end
	end

	-- Cycle detection: follow the parent chain of every entry, remembering the
	-- data tables visited on this walk in `stack`. Tables already cleared by
	-- an earlier walk are recorded in `checked` and are not followed again.
	local checked = {}
	for code, data in pairs(m_etym_language_data) do
		local stack = {}

		while data do
			if checked[data] then
				break
			end
			if stack[data] then
				-- Reaching the same table twice on one walk means a cycle.
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName), code,
					link(m_etym_language_data[data.parent].canonicalName), data.parent
				)
				break
			end
			stack[data] = true
			-- The walk ends when the parent is not an etymology-only language.
			code, data = data.parent, data.parent and m_etym_language_data[data.parent]
		end
		
		for visited in pairs(stack) do
			checked[visited] = true
		end
	end
end

-- Checks [[Module:families/data]]: allowed keys, unique codes, unique
-- canonical names, the list of other names, validity of the parent family
-- code, that every family has at least one member recorded in nonempty_fams,
-- and absence of cycles in the chain of parent families.
local function check_families()
	local modname = "families/data"
	local m_family_data = mw.loadData("Module:" .. modname)
	
	local check_family_data_keys = check_data_keys{
		"canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item"
	}

	-- Links to the family's category, which is named "<name> languages".
	local function link(name)
		if not name then
			return "???"
		elseif name:find("[Ll]anguages$") then
			return "[[:Category:" .. name .. "|" .. name .. " family]]"
		else
			return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
		end
	end
	
	for code, data in pairs(m_family_data) do
		check_family_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, all_codes[code])
		else
			all_codes[code] = modname
		end
		
		if not data.canonicalName then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[data.canonicalName] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.",
				link(data.canonicalName), code, family_names[data.canonicalName])
		else
			family_names[data.canonicalName] = code
		end
		
		if data.otherNames then
			check_other_names(modname, code, data.canonicalName, data.otherNames)
			check_array(modname, code, data, "otherNames")
		end
		
		if data.family then
			if not m_family_data[data.family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has an invalid parent family code <code>%s</code>.",
					link(data.canonicalName), code, data.family)
			end
			
			nonempty_fams[data.family] = true
		end
	end
	
	for code, data in pairs(m_family_data) do
		if not nonempty_fams[code] then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data.canonicalName), code)
		end
	end

	-- Cycle detection: follow the `family` key upwards from every family,
	-- remembering the codes visited on this walk in `stack`. Codes cleared by
	-- an earlier walk are in `checked`; 'qfa-not' is treated as already
	-- checked, so a walk stops when it reaches that code.
	local checked = { ['qfa-not'] = true }
	for code, data in pairs(m_family_data) do
		local stack = {}

		while data do
			if checked[code] then
				break
			end
			if stack[code] then
				-- Reaching the same code twice on one walk means a cycle.
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data.canonicalName), code,
					link(m_family_data[data.family].canonicalName), data.family
				)
				break
			end
			stack[code] = true
			-- The walk ends when there is no parent or the parent is unknown.
			code, data = data.family, data.family and m_family_data[data.family]
		end
		
		for visited in pairs(stack) do
			checked[visited] = true
		end
	end
end

-- Checks [[Module:scripts/data]]: allowed keys, presence of a canonical name,
-- the list of other names, use by at least one language (via nonempty_scrs)
-- and validity of the characters pattern. Four-character codes are also
-- looked up in [[Module:scripts/code to canonical name]] and
-- [[Module:scripts/by name]].
local function check_scripts()
	local modname = "scripts/data"
	local m_script_data = mw.loadData("Module:" .. modname)
	local m_script_codes = mw.loadData('Module:scripts/code to canonical name')
	local m_script_canonical_names = mw.loadData('Module:scripts/by name')
	
	local allowed_keys = {
		"canonicalName", "otherNames", "parent", "systems", "wikipedia_article",
		"characters", "direction", "character_category",
	}
	local check_script_data_keys = check_data_keys(allowed_keys, true)
	
	for code, data in pairs(m_script_data) do
		local canonical_name = data.canonicalName
		-- Only four-character codes are looked up in the two lookup modules.
		local is_four_char = #code == 4
		
		if is_four_char and not m_script_codes[code] then
			discrepancy('scripts/code to canonical name', '<code>%s</code> (%s) is missing', code, canonical_name)
		end
		
		check_script_data_keys(modname, code, data)
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not script_names[canonical_name] then
			-- A canonical name that is already taken is not reported here;
			-- only the first code using a name is recorded.
			if is_four_char and not m_script_canonical_names[canonical_name] then
				discrepancy('scripts/by name', '%s (<code>%s</code>) is missing', canonical_name, code)
			end
			script_names[canonical_name] = code
		end
		
		local other_names = data.otherNames
		if other_names then
			check_other_names(modname, code, canonical_name, other_names)
			check_array(modname, code, data, "otherNames")
		end
		
		-- A missing characters pattern is only mentioned for unused scripts.
		if not nonempty_scrs[code] then
			local suffix = ""
			if not data.characters then
				suffix = " and has no characters listed for auto-detection"
			end
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				link_script(canonical_name), code, suffix)
		end

		local characters = data.characters
		if characters then
			validate_pattern(characters, modname, code, data, false)
		end
	end
end

-- Entry point: runs every check and returns the report as wikitext.
-- `frame` is not used. The result is a success line when no discrepancy was
-- recorded; otherwise a warning line followed by one section per module,
-- each listing that module's messages.
function export.perform(frame)
	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_fams and nonempty_scrs tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()
	
	-- Extracts the first <code>...</code> value from a message, memoized.
	local find_code = m_fun.memoize(function(message)
		return string.match(message, "<code>([^<]+)</code>")
	end)
	
	-- Orders messages by their first code when both have one, and by the
	-- whole message text otherwise.
	local function comp(message1, message2)
		local code1, code2 = find_code(message1), find_code(message2)
		if not (code1 and code2) then
			return message1 < message2
		end
		return code1 < code2
	end
	
	local function as_list_item(msg)
		return "\n* " .. msg
	end
	
	-- Build one section per module, in sorted order of module name.
	local sections = {}
	for modname, msglist in m_table.sortedPairs(messages) do
		table.sort(msglist, comp)
		local heading = '===[[Module:' .. modname .. ']]==='
		sections[#sections + 1] = heading .. table.concat(map(as_list_item, msglist))
	end
	
	-- Are there any messages?
	if sections[1] == nil then
		return '<b class="success">Glory to Arstotzka.</b>'
	end
	
	table.insert(sections, 1, '<b class="warning">Discrepancies detected:</b>')
	return table.concat(sections, '\n')
end

return export