Module:User:Benwing2/category tree/poscatboiler/data/languages

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Benwing2, for their own experimentation. Items in this module may be added and removed at Benwing2's discretion; do not rely on this module's stability.


local raw_categories = {}
local raw_handlers = {}

local m_languages = require("Module:languages")
local m_sc_getByCode = require("Module:scripts").getByCode
local m_table = require("Module:table")

local Hang = m_sc_getByCode("Hang")
local Hani = m_sc_getByCode("Hani")
local Hira = m_sc_getByCode("Hira")
local Hrkt = m_sc_getByCode("Hrkt")
local Kana = m_sc_getByCode("Kana")

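-- This handles language categories of the form e.g. [[:Category:French language]] and
-- [[:Category:British Sign Language]]; regional variant categories of the form
-- e.g. [[:Category:Regional French]]; dialect categories such as [[:Category:New Zealand English]];
-- and categories such as [[:Category:English-based creole or pidgin languages]] and
-- [[:Category:English-based constructed languages]].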


-----------------------------------------------------------------------------
--                                                                         --
--                              RAW CATEGORIES                             --
--                                                                         --
-----------------------------------------------------------------------------


-- Category that groups the per-language "Regional <language>" categories; the
-- "Regional ..." handler in this module lists "Regionalisms" as one of its parents.
raw_categories["Regionalisms"] = {
	description = "Categories that group terms in regional varieties of various languages.",
	additional = "{{{umbrella_msg}}}",
	parents = {
		"Fundamental",
	},
}

-- Top-level category for the per-language categories; get_parents() in this module
-- lists "All languages" as the first parent of every language category.
raw_categories["All languages"] = {
	-- Sister-project links plus a thumbnail map displayed at the top of the category page.
	intro = "{{sisterlinks|Category:Languages}}\n[[File:Languages world map-transparent background.svg|thumb|right|250px|Rough world map of language families]]",
	description = "This category contains the categories for every language on Wiktionary.",
	additional = "Not all languages that Wiktionary recognises may have a category here yet. There are many that have " ..
	"not yet received any attention from editors, mainly because not all Wiktionary users know about every single " ..
	"language. See [[Wiktionary:List of languages]] for a full list.",
	parents = {
		"Fundamental",
	},
}

-- Category for languages flagged as extinct; get_parents() in this module adds it as a
-- parent when the `extinct` argument is set.
raw_categories["All extinct languages"] = {
	description = "This category contains the categories for every [[extinct language]] on Wiktionary.",
	additional = "Do not confuse this category with [[:Category:Extinct languages]] which is for the names of extinct languages.",
	parents = {
		"All languages",
	},
}



-----------------------------------------------------------------------------
--                                                                         --
--                                RAW HANDLERS                             --
--                                                                         --
-----------------------------------------------------------------------------


-- Build a wikilink to an object's category page, displayed as the object's canonical name.
-- `object` must provide the methods getCategoryName() and getCanonicalName().
local function makeCategoryLink(object)
	local target = object:getCategoryName()
	local label = object:getCanonicalName()
	return ("[[:Category:%s|%s]]"):format(target, label)
end

-- Uppercase the first character of `text` by delegating to the ucfirst method of the
-- wiki's content-language object.
local function ucfirst(text)
	local content_lang = mw.getContentLanguage()
	return content_lang:ucfirst(text)
end

-- Build the floating box of links shown on a language category page: the Wiktionary
-- edition(s) in that language, the Wikipedia article, the Wikimedia Commons category,
-- the "considerations" pages (About page, reference templates, bibliography) and the
-- English entry for the language name.
--
-- Parameters:
--   lang      - language object; the methods getCanonicalName(), getWikimediaLanguages(),
--               getCategoryName() and getWikipediaArticle() are called on it.
--   setwiki   - optional override for the Wikipedia article name; "-" displays "None."
--   setwikt   - optional; "-" suppresses the Wiktionary edition links. Any non-nil value
--               is recorded via the tracking calls below.
--   setsister - optional override for the Commons category name (its first letter is
--               uppercased); "-" displays "None."
--   entryname - optional override for the term linked in the "entry" row; defaults to
--               the canonical name.
-- Returns the box as a single string of wikitext/HTML.
local function linkbox(lang, setwiki, setwikt, setsister, entryname)
	local wiktionarylinks = "''None.''"
	
	local canonicalName = lang:getCanonicalName()
	-- Normalize a missing list to an empty one, so that both the edition loop and the
	-- pluralization check below are safe when no Wikimedia languages are available.
	local wikimediaLanguages = lang:getWikimediaLanguages() or {}
	local nameWithLanguage = lang:getCategoryName("nocap")
	local categoryName = lang:getCategoryName()
	local wikipediaArticle = setwiki or lang:getWikipediaArticle()
	setsister = setsister and ucfirst(setsister) or nil
	
	if setwikt then
		require "Module:debug".track "langcatboiler/setwikt"
		if setwikt == "-" then
			require "Module:debug".track "langcatboiler/setwikt/hyphen"
		end
	end
	
	if setwikt ~= "-" and wikimediaLanguages[1] then
		wiktionarylinks = {}
		
		for _, wikimedialang in ipairs(wikimediaLanguages) do
			-- Show the Wikimedia language's name in parentheses only when it differs from
			-- the name of the language the category is about.
			table.insert(wiktionarylinks,
				(wikimedialang:getCanonicalName() ~= canonicalName and "(''" .. wikimedialang:getCanonicalName() .. "'') " or "") ..
				"'''[[:" .. wikimedialang:getCode() .. ":|" .. wikimedialang:getCode() .. ".wiktionary.org]]'''")
		end
		
		wiktionarylinks = table.concat(wiktionarylinks, "<br/>")
	end
	
	-- "edition" vs. "editions" in the first row heading.
	local plural = wikimediaLanguages[2] and "s" or ""
	
	return table.concat{
[=[<div style="clear: right; border: solid #aaa 1px; margin: 1 1 1 1; background: #f9f9f9; width: 270px; padding: 5px; margin: 5px; text-align: left; float: right">
<div style="text-align: center; margin-bottom: 10px; margin-top: 5px">''']=], nameWithLanguage, [=['''</div>

{| style="font-size: 90%; background: #f9f9f9;"
|-
| style="vertical-align: middle; height: 35px; width: 35px;" | [[File:Wiktionary-logo-v2.svg|35px|none|Wiktionary]]
|| '']=], nameWithLanguage, [=[ edition]=], plural, [=[ of Wiktionary''
|-
| colspan="2" style="padding-left: 10px; border-bottom: 1px solid lightgray;" | ]=], wiktionarylinks, [=[

|-
| style="vertical-align: middle; height: 35px" | [[File:Wikipedia-logo.png|35px|none|Wikipedia]]
|| ''Wikipedia article about ]=], nameWithLanguage, [=[''
|-
| colspan="2" style="padding-left: 10px; border-bottom: 1px solid lightgray;" | ]=], (setwiki == "-" and "''None.''" or "'''[[w:" .. wikipediaArticle .. "|" .. wikipediaArticle .. "]]'''"), [=[

|-
| style="vertical-align: middle; height: 35px" | [[File:Wikimedia-logo.svg|35px|none|Wikimedia Commons]]
|| ''Links related to ]=], nameWithLanguage, [=[ in sister projects at Wikimedia Commons''
|-
| colspan="2" style="padding-left: 10px; border-bottom: 1px solid lightgray;" | ]=], (setsister == "-" and "''None.''" or "'''[[commons:Category:" .. (setsister or categoryName) .. "|" .. (setsister or categoryName) .. "]]'''"), [=[

|-
| style="vertical-align: middle; height: 35px" | [[File:Crystal kfind.png|35px|none|Considerations]]
|| ]=], nameWithLanguage, [=[ considerations
|-
| colspan="2" style="padding-left: 10px; border-bottom: 1px solid lightgray;" | '''[[Wiktionary:About ]=], canonicalName, [=[]]'''<br>'''[[:Category:]=], canonicalName, [=[ reference templates|Reference templates]] ({{PAGESINCAT:]=], canonicalName, [=[ reference templates}})'''<br>'''[[Appendix:]=], canonicalName, [=[ bibliography|Bibliography]]'''
|-
| style="vertical-align: middle; height: 35px" | [[File:Open book nae 02.svg|35px|none|Entry]]
|| ]=], nameWithLanguage, [=[ entry
|-
| colspan="2" style="padding-left: 10px;" | ''']=], require("Module:links").full_link({lang = m_languages.getByCode("en"), term = entryname or canonicalName}), [=['''
|}
</div>]=]
}
end

-- Return an external-style link, wrapped in a "plainlinks" span, that opens the edit
-- view of the page `title` and is displayed as `text`.
local function edit_link(title, text)
	local url = tostring(mw.uri.fullUrl(title, { action = "edit" }))
	return ('<span class="plainlinks">[%s %s]</span>'):format(url, text)
end

-- Build the HTML information table for a language: an "Edit language data" link,
-- canonical name, other names, aliases, varieties, language code, family, ancestors,
-- scripts, the modules configured for the language (form generation, transliteration,
-- display text, entry name, sort key) and a Wikidata link.
--
-- `lang` is a language object. Besides the methods called below, the `_rawData` field
-- is read directly near the end of the function.
-- Returns the table as a single HTML string.
--
-- Should perhaps use wiki syntax.
local function infobox(lang)
	local ret = {}
	
	-- The opening tag is left unclosed here so that data-* attributes can be appended;
	-- it is closed by the '>\n' inserted after the attribute section.
	table.insert(ret, '<table class="wikitable language-category-info"')
	
	if type(lang.getRawData) == "function" then
		local raw_data = lang:getRawData()
		if raw_data then
			-- Descriptive names for the numeric keys 1 to 3 of the raw data.
			local replacements = {
				[1] = "canonical-name",
				[2] = "wikidata-item",
				[3] = "family",
			}
			-- gsub callback: joins the two captured letters with a hyphen, lowercased.
			local function replacer(letter1, letter2)
				return letter1:lower() .. "-" .. letter2:lower()
			end
			-- For each key in the language data modules, returns a descriptive
			-- kebab-case version (containing ASCII lowercase words separated
			-- by hyphens).
			local function kebab_case(key)
				key = replacements[key] or key
				key = key:gsub("(%l)(%u)", replacer):gsub("(%l)_(%l)", replacer)
				return key
			end
			-- JSON-encodes a value and escapes the characters '"', '<' and '>' so that the
			-- result can be placed inside a double-quoted attribute.
			local function html_attribute_encode(str)
				str = mw.text.jsonEncode(str)
					:gsub('"', "&quot;")
					-- & in attributes is automatically escaped.
					-- :gsub("&", "&amp;")
					:gsub("<", "&lt;")
					:gsub(">", "&gt;")
				return str
			end
			-- Best effort: the result of pcall is ignored, so an error raised while
			-- generating the attributes is discarded. Attributes inserted before the
			-- error remain in `ret`.
			pcall(function ()
				table.insert(ret, ' data-code="' .. lang:getCode() .. '"')
				-- NOTE(review): getRawData() is called a second time here instead of
				-- reusing `raw_data`.
				for k, v in m_table.sortedPairs(lang:getRawData()) do
					table.insert(ret, " data-" .. kebab_case(k)
						.. '="'
						.. html_attribute_encode(v)
						.. '"')
				end
			end)
		end
	end
	table.insert(ret, '>\n')
	-- Header row linking to the edit view of the data module for this language code.
	table.insert(ret, '<tr class="language-category-data">\n<th colspan="2">'
		.. edit_link("Module:" .. m_languages.getDataModuleName(lang:getCode()),
			"Edit language data")
		.. "</th>\n</tr>\n")
	table.insert(ret, "<tr>\n<th>Canonical name</th><td>" .. lang:getCanonicalName() .. "</td>\n</tr>\n")

	-- The "Other names" row is only emitted when there is at least one name.
	local otherNames = lang:getOtherNames(true)
	if otherNames then
		local names = {}
		
		for _, name in ipairs(otherNames) do
			table.insert(names, "<li>" .. name .. "</li>")
		end
		
		if #names > 0 then
			table.insert(ret, "<tr>\n<th>Other names</th><td><ul>" .. table.concat(names, "\n") .. "</ul></td>\n</tr>\n")
		end
	end
	
	-- Same treatment for aliases.
	local aliases = lang:getAliases()
	if aliases then
		local names = {}
		
		for _, name in ipairs(aliases) do
			table.insert(names, "<li>" .. name .. "</li>")
		end
		
		if #names > 0 then
			table.insert(ret, "<tr>\n<th>Aliases</th><td><ul>" .. table.concat(names, "\n") .. "</ul></td>\n</tr>\n")
		end
	end

	-- Each variety is either a string or a list. For a list, the first element is
	-- rendered as a heading and the remaining elements as a nested list beneath it.
	local varieties = lang:getVarieties()
	if varieties then
		local names = {}
		
		for _, name in ipairs(varieties) do
			if type(name) == "string" then
				table.insert(names, "<li>" .. name .. "</li>")
			else
				assert(type(name) == "table")
				local first_var
				local subvars = {}
				for i, var in ipairs(name) do
					if i == 1 then
						first_var = var
					else
						table.insert(subvars, "<li>" .. var .. "</li>")
					end
				end
				if #subvars > 0 then
					table.insert(names, "<li><dl><dt>" .. first_var .. "</dt>\n<dd><ul>" .. table.concat(subvars, "\n") .. "</ul></dd></dl></li>")
				elseif first_var then
					table.insert(names, "<li>" .. first_var .. "</li>")
				end
			end
		end
		
		if #names > 0 then
			table.insert(ret, "<tr>\n<th>Varieties</th><td><ul>" .. table.concat(names, "\n") .. "</ul></td>\n</tr>\n")
		end
	end

	table.insert(ret, "<tr>\n<th>[[Wiktionary:Languages|Language code]]</th><td><code>" .. lang:getCode() .. "</code></td>\n</tr>\n")
	table.insert(ret, "<tr>\n<th>[[Wiktionary:Families|Language family]]</th>\n")
	
	local fam = lang:getFamily()
	local famCode = fam and fam:getCode()
	
	-- A few family codes get a fixed link and label; any other family is linked via
	-- its own category.
	if not fam then
		table.insert(ret, "<td>unclassified</td>")
	elseif famCode == "qfa-iso" then
		table.insert(ret, "<td>[[:Category:Language isolates|language isolate]]</td>")
	elseif famCode == "qfa-mix" then
		table.insert(ret, "<td>[[:Category:Mixed languages|mixed language]]</td>")
	elseif famCode == "sgn" then
		table.insert(ret, "<td>[[:Category:Sign languages|sign language]]</td>")
	elseif famCode == "crp" then
		table.insert(ret, "<td>[[:Category:Creole or pidgin languages|creole or pidgin]]</td>")
	elseif famCode == "art" then
		table.insert(ret, "<td>[[:Category:Constructed languages|constructed language]]</td>")
	else
		table.insert(ret, "<td>" .. makeCategoryLink(fam) .. "</td>")
	end
	
	table.insert(ret, "\n</tr>\n<tr>\n<th>Ancestors</th>\n")
	
	-- With two or more direct ancestors, they are shown as a flat list. Otherwise the
	-- ancestor chain (if non-empty) is shown as progressively nested lists.
	local ancestors, ancestorChain = lang:getAncestors(), lang:getAncestorChain()
	if ancestors[2] then
		local ancestorList = {}
		
		for i, anc in ipairs(ancestors) do
			ancestorList[i] = "<li>" .. makeCategoryLink(anc) .. "</li>"
		end
		
		table.insert(ret, "<td><ul>\n" .. table.concat(ancestorList, "\n") .. "</ul></td>\n")
	elseif ancestorChain[1] then
		table.insert(ret, "<td><ul>\n")
		
		local chain = {}
		
		for i, anc in ipairs(ancestorChain) do
			chain[i] = "<li>" .. makeCategoryLink(anc) .. "</li>"
		end
		
		-- Each separator opens one more nesting level.
		table.insert(ret, table.concat(chain, "\n<ul>\n"))
		
		-- One closing tag per chain item: the initial <ul> above plus one per separator.
		for _, _ in ipairs(chain) do
			table.insert(ret, "</ul>")
		end
		
		table.insert(ret, "</td>\n")
	else
		table.insert(ret, "<td>unknown</td>\n")
	end
	
	table.insert(ret, "</tr>\n")
	
	local scripts = lang:getScripts()
	
	if scripts[1] then
		local script_text = {}
		
		-- One line per script: the category link followed by the script code, which
		-- links to a search of CSS pages mentioning that code.
		local function makeScriptLine(sc)
			local code = sc:getCode()
			local url = tostring(mw.uri.fullUrl('Special:Search', {
				search = 'contentmodel:css insource:"' .. code
					.. '" insource:/\\.' .. code .. '/',
				ns8 = '1'
			}))
			return makeCategoryLink(sc)
				.. ' (<span class="plainlinks" title="Search for stylesheets referencing this script">[' .. url .. ' <code>' .. code .. '</code>]</span>)'
		end
		
		-- Appends the Hrkt entry with Hira and Kana nested beneath it.
		local function add_Hrkt(text)
			table.insert(text, "<li>" .. makeScriptLine(Hrkt))
			table.insert(text, "<ul>")
			table.insert(text, "<li>" .. makeScriptLine(Hira) .. "</li>")
			table.insert(text, "<li>" .. makeScriptLine(Kana) .. "</li>")
			table.insert(text, "</ul>")
			table.insert(text, "</li>")
		end
		
		for _, sc in ipairs(scripts) do
			local text = {}
			local code = sc:getCode()
			
			if code == "Hrkt" then
				add_Hrkt(text)
			else
				table.insert(text, "<li>" .. makeScriptLine(sc))
				-- Jpan and Kore get their component scripts listed in a nested list.
				if code == "Jpan" then
					table.insert(text, "<ul>")
					table.insert(text, "<li>" .. makeScriptLine(Hani) .. "</li>")
					add_Hrkt(text)
					table.insert(text, "</ul>")
				elseif code == "Kore" then
					table.insert(text, "<ul>")
					table.insert(text, "<li>" .. makeScriptLine(Hang) .. "</li>")
					table.insert(text, "<li>" .. makeScriptLine(Hani) .. "</li>")
					table.insert(text, "</ul>")
				end
				table.insert(text, "</li>")
			end
			
			table.insert(script_text, table.concat(text, "\n"))
		end
		
		table.insert(ret, "<tr>\n<th>[[Wiktionary:Scripts|Scripts]]</th>\n<td><ul>\n" .. table.concat(script_text, "\n") .. "</ul></td>\n</tr>\n")
	else
		table.insert(ret, "<tr>\n<th>[[Wiktionary:Scripts|Scripts]]</th>\n<td>not specified</td>\n</tr>\n")
	end
	
	-- Adds a row with heading `heading` that links the module(s) named by `raw_data`.
	-- `raw_data` may be nil (no row), a module name string, or a table keyed by script
	-- code. For a table, the row is only added when at least one listed script has a
	-- module name, or when the language's single script has one.
	local function add_module_info(raw_data, heading)
		if raw_data then
			local scripts = lang:getScriptCodes()
			-- `n` counts the listed scripts that have no module name; `add` records
			-- whether the row should be emitted.
			local module_info, n, add = {}, 0, false
			if type(raw_data) == "string" then
				table.insert(module_info,
					("[[Module:%s]]"):format(raw_data))
				add = true
			elseif type(raw_data) == "table" and m_table.size(scripts) == 1 and type(raw_data[scripts[1]]) == "string" then
				table.insert(module_info,
					("[[Module:%s]]"):format(raw_data[scripts[1]]))
				add = true
			elseif type(raw_data) == "table" then
				table.insert(module_info, "<ul>")
				for script, data in m_table.sortedPairs(raw_data) do
					local script_info
					-- Keys for which m_sc_getByCode returns a false value are skipped.
					if m_sc_getByCode(script) then
						if type(data) == "string" then
							script_info = ("[[Module:%s]]</li>"):format(data)
						else
							n = n + 1
							script_info = "(none)\n"
						end
						table.insert(module_info, ("<li><code>%s</code>: %s"):format(script, script_info))
					end
				end
				table.insert(module_info, "</ul>")
				-- module_info holds "<ul>", one item per listed script, and "</ul>", so
				-- size - 2 is the number of listed scripts. Add the row only if there is
				-- at least one and not all of them are "(none)".
				if m_table.size(module_info) > 2 and n < (m_table.size(module_info) - 2) then add = true end
			end
			
			if add then
				table.insert(ret, [=[
<tr>
<th>]=] .. heading .. [=[</th>
<td>]=] .. table.concat(module_info) .. [=[</td>
</tr>
]=])
			end
		end
	end
	
	-- NOTE(review): these read the `_rawData` field directly rather than going through
	-- getRawData(), which is only called conditionally near the top of the function.
	add_module_info(lang._rawData.generate_forms, "Form-generating<br>module")
	add_module_info(lang._rawData.translit, "[[Wiktionary:Transliteration and romanization|Transliteration<br>module]]")
	add_module_info(lang._rawData.display_text, "Display text<br>module")
	add_module_info(lang._rawData.entry_name, "Entry name<br>module")
	add_module_info(lang._rawData.sort_key, "[[sortkey|Sortkey]]<br>module")
	
	-- The Wikidata row needs both an item for the language and the mw.wikibase library.
	local wikidataItem = lang:getWikidataItem()
	if lang:getWikidataItem() and mw.wikibase then
		local URL = mw.wikibase.getEntityUrl(wikidataItem)
		local link
		if URL then
			link = '[' .. URL .. ' ' .. wikidataItem .. ']'
		else
			link = '<span class="error">Invalid Wikidata item: <code>' .. wikidataItem .. '</code></span>'
		end
		table.insert(ret, "<tr><th>Wikidata</th><td>" .. link .. "</td></tr>")
	end
	
	table.insert(ret, "</table>")
	
	return table.concat(ret)
end

-- Wrap `content` in NavFrame/NavHead/NavContent div markup. The head shows `title`,
-- or the literal text '{{{title}}}' when no title is given.
local function NavFrame(content, title)
	local head = title or '{{{title}}}'
	local template = '<div class="NavFrame"><div class="NavHead">%s</div>'
		.. '<div class="NavContent" style="text-align: left;">%s</div></div>'
	return template:format(head, content)
end


-- Generate the three text sections of a language category page.
--
-- Parameters:
--   lang      - language object for the category.
--   countries - list of country names; the value "UNKNOWN" is ignored. A leading "the "
--               is kept outside of the generated link.
--   extinct   - true if the language should be described as extinct.
--   setwiki, setwikt, setsister, entryname - passed through unchanged to linkbox().
-- Returns three values: description, intro, additional. For the language with code
-- "und", only the description is returned (intro and additional are nil).
local function get_description_intro_additional(lang, countries, extinct, setwiki, setwikt, setsister, entryname)
	local nameWithLanguage = lang:getCategoryName("nocap")
	if lang:getCode() == "und" then
		local description =
			"This is the main category of the '''" .. nameWithLanguage .. "''', represented in Wiktionary by the [[Wiktionary:Languages|code]] '''" .. lang:getCode() .. "'''. " ..
			"This language contains terms in historical writing, whose meaning has not yet been determined by scholars."
		return description, nil, nil
	end
	
	local canonicalName = lang:getCanonicalName()
	
	local intro = linkbox(lang, setwiki, setwikt, setsister, entryname)

	-- Names ending in " Language" (e.g. "British Sign Language") read better without
	-- a preceding "the".
	local the_prefix
	if canonicalName:find(" Language$") then
		the_prefix = ""
	else
		the_prefix = "the "
	end
	local description = "This is the main category of " .. the_prefix .. "'''" .. nameWithLanguage .. "'''."

	local country_links = {}
	for _, country in ipairs(countries) do
		if country ~= "UNKNOWN" then
			local country_without_the = country:match("^the (.*)$")
			if country_without_the then
				table.insert(country_links, "the [[" .. country_without_the .. "]]")
			else
				table.insert(country_links, "[[" .. country .. "]]")
			end
		end
	end
	local country_desc
	if #country_links > 0 then
		local country_link_text = m_table.serialCommaJoin(country_links)
		if extinct then
			country_desc = "It is an [[extinct language]] that was formerly spoken in " .. country_link_text .. ".\n\n"
		else
			country_desc = "It is spoken in " .. country_link_text .. ".\n\n"
		end
	elseif extinct then
		-- End with a paragraph break like the branches above; otherwise this sentence
		-- runs straight into "Information about ...".
		country_desc = "It is an [[extinct language]].\n\n"
	else
		country_desc = ""
	end

	local add = country_desc .. "Information about " .. canonicalName .. ":\n\n" .. infobox(lang)
	
	if lang:hasType("reconstructed") then
		add = add .. "\n\n" ..
			ucfirst(canonicalName) .. " is a reconstructed language. Its words and roots are not directly attested in any written works, but have been reconstructed through the ''comparative method'', " ..
			"which finds regular similarities between languages that cannot be explained by coincidence or word-borrowing, and extrapolates ancient forms from these similarities.\n\n" ..
			"According to our [[Wiktionary:Criteria for inclusion|criteria for inclusion]], terms in " .. canonicalName ..
			" should '''not''' be present in entries in the main namespace, but may be added to the Reconstruction: namespace."
	elseif lang:hasType("appendix-constructed") then
		add = add .. "\n\n" ..
			ucfirst(canonicalName) .. " is a constructed language that is only in sporadic use. " ..
			"According to our [[Wiktionary:Criteria for inclusion|criteria for inclusion]], terms in " .. canonicalName ..
			" should '''not''' be present in entries in the main namespace, but may be added to the Appendix: namespace. " ..
			"All terms in this language may be available at [[Appendix:" .. ucfirst(canonicalName) .. "]]."
	end
	
	-- mw.title.new() returns nil when the title is invalid, so guard before reading
	-- the `exists` field.
	local about = mw.title.new("Wiktionary:About " .. canonicalName)
	
	if about and about.exists then
		add = add .. "\n\n" ..
			"Please see '''[[Wiktionary:About " .. canonicalName .. "]]''' for information and special considerations for creating " .. nameWithLanguage .. " entries."
	end
	
	-- Generating the family tree is best effort: an error is logged rather than
	-- propagated, and the rest of the text is still returned.
	local ok, tree_of_descendants = pcall(
		require("Module:family tree").print_children,
		lang:getCode(), {
			protolanguage_under_family = true,
			must_have_descendants = true
		})
	
	if ok then
		if tree_of_descendants then
			add = add .. NavFrame(
				tree_of_descendants,
				"Family tree")
		else
			add = add .. "\n\n" .. ucfirst(lang:getCanonicalName())
				.. " has no descendants or varieties listed in Wiktionary's language data modules."
		end
	else
		mw.log("error while generating tree: " .. tostring(tree_of_descendants))
	end

	return description, intro, add
end


-- Compute the list of parent categories for a language category.
--
-- Parameters:
--   lang      - language object.
--   countries - list of country names; the value "UNKNOWN" is ignored.
--   extinct   - if true, the language is also placed in "All extinct languages".
-- Returns a list of {name = ..., sort = ...} tables. The first entry is always
-- "All languages"; every other entry names a full "Category:..." page.
local function get_parents(lang, countries, extinct)
	local sortkey = lang:getCanonicalName()
	local parents = {{name = "All languages", sort = sortkey}}

	-- Append "Category:<catname>", sorted by the canonical name unless `sort` is given.
	local function add_cat(catname, sort)
		table.insert(parents, {name = "Category:" .. catname, sort = sort or sortkey})
	end

	-- Append one "<Ancestor>-based <suffix>" category per ancestor.
	-- Avoid Haitian Creole being categorised in [[:Category:Haitian Creole-based creole or pidgin languages]],
	-- as one of its ancestors is an etymology-only variety of it: for such an ancestor, use that
	-- ancestor's ancestors instead.
	local function add_based_cats(suffix)
		for _, anc in ipairs(lang:getAncestors()) do
			if anc:getFullCode() == lang:getCode() then
				for _, anc_extra in ipairs(anc:getAncestors()) do
					add_cat(ucfirst(anc_extra:getFullName()) .. "-based " .. suffix)
				end
			else
				add_cat(ucfirst(anc:getFullName()) .. "-based " .. suffix)
			end
		end
	end

	-- Family codes that map to exactly one fixed category.
	-- FIXME: Some of the following categories should be added to this module.
	local fixed_family_cats = {
		["qfa-iso"] = "Language isolates",
		["qfa-mix"] = "Mixed languages",
		["sgn"] = "All sign languages",
	}

	local family = lang:getFamily()
	local family_code = family and family:getCode()

	if not family then
		add_cat("Unclassified languages")
	elseif fixed_family_cats[family_code] then
		add_cat(fixed_family_cats[family_code])
	elseif family_code == "crp" then
		add_cat("Creole or pidgin languages")
		add_based_cats("creole or pidgin languages")
	elseif family_code == "art" then
		if lang:hasType("appendix-constructed") then
			add_cat("Appendix-only constructed languages")
		else
			add_cat("Constructed languages")
		end
		add_based_cats("constructed languages")
	else
		add_cat(family:getCategoryName())
		if lang:hasType("reconstructed") then
			-- Sort proto-languages by their name without the leading "Proto-".
			add_cat("Reconstructed languages", (mw.ustring.gsub(sortkey, "^Proto%-", "")))
		end
	end

	local function add_script_cat(sc)
		add_cat(sc:getCategoryName() .. " languages")
	end

	local function add_kana_cats()
		add_script_cat(Hrkt)
		add_script_cat(Hira)
		add_script_cat(Kana)
	end

	for _, sc in ipairs(lang:getScripts()) do
		local code = sc:getCode()
		if code == "Hrkt" then
			add_kana_cats()
		else
			add_script_cat(sc)
			-- Jpan and Kore also get the categories of their component scripts.
			if code == "Jpan" then
				add_script_cat(Hani)
				add_kana_cats()
			elseif code == "Kore" then
				add_script_cat(Hang)
				add_script_cat(Hani)
			end
		end
	end

	if lang:hasTranslit() then
		add_cat("Languages with automatic transliteration")
	end

	local country_count = 0
	for _, country in ipairs(countries) do
		if country ~= "UNKNOWN" then
			add_cat("Languages of " .. country)
			country_count = country_count + 1
		end
	end

	if extinct then
		add_cat("All extinct languages")
	end

	if country_count == 0 then
		add_cat("Languages not sorted into a country category")
	end

	return parents
end


-- Return the list of extra child-category specs for a language category. The `lang`
-- argument is not used; the specs contain {{{langname}}} and {{{langcode}}} placeholders.
local function get_children(lang)
	-- FIXME: We should work on the children mechanism so it isn't necessary to manually specify these.
	local labels = {
		"appendices", "entry maintenance", "lemmas", "names", "phrases",
		"rhymes", "symbols", "templates", "terms by etymology", "terms by usage",
	}

	local children = {}
	for i = 1, #labels do
		children[i] = {name = labels[i], is_label = true}
	end

	local extras = {
		{name = "terms derived from {{{langname}}}", is_label = true, lang = false},
		{module = "topic cat", args = {code = "{{{langcode}}}", label = "all topics"}, sort = "all topics"},
		{name = "Regional {{{langname}}}"},
		{name = "Requests concerning {{{langname}}}"},
		{name = "Category:Rhymes:{{{langname}}}", description = "Lists of {{{langname}}} words by their rhymes."},
		{name = "Category:User {{{langcode}}}", description = "Wiktionary users categorized by fluency levels in {{{langname}}}."},
	}
	for _, child in ipairs(extras) do
		children[#children + 1] = child
	end

	return children
end


-- Handler for language categories such as [[:Category:French language]] and
-- [[:Category:British Sign Language]]. Returns nil when the category name does not
-- resolve to a language; otherwise returns the category spec table and `true`.
local function handle_language_category(data)
	local category = data.category

	-- "X language" is looked up under the name "X"; a category ending in " Language"
	-- is looked up under its whole name.
	local lookup_name = category:match("^(.*) language$")
	if not lookup_name and category:find(" Language$") then
		lookup_name = category
	end
	local lang = lookup_name and m_languages.getByCanonicalName(lookup_name)
	if not lang then
		return nil
	end

	local args = require("Module:parameters").process(data.args, {
		[1] = {list = true},
		["setwiki"] = {},
		["setwikt"] = {},
		["setsister"] = {},
		["entryname"] = {},
		["extinct"] = {type = "boolean"},
	})
	local countries = args[1]
	local from_inside = data.called_from_inside

	-- If called from inside, don't require any arguments, as they can't be known
	-- in general and aren't needed just to generate the first parent (used for
	-- breadcrumbs).
	if #countries == 0 and not from_inside then
		-- At least one country must be specified unless the language is constructed (e.g. Esperanto) or reconstructed (e.g. Proto-Indo-European).
		local fam = lang:getFamily()
		local exempt = lang:hasType("reconstructed") or (fam and fam:getCode() == "art")
		if not exempt then
			error("At least one country (param 1=) must be specified for language '" .. lang:getCanonicalName() .. "' (code '" .. lang:getCode() .. "'). " ..
				"Use the value UNKNOWN if the language's location is truly unknown.")
		end
	end

	-- If called from inside the category tree system, it's called when generating
	-- parents or children, and we don't need to generate the description or additional
	-- text (which is very expensive in terms of memory because it calls [[Module:family tree]],
	-- which calls [[Module:languages/data/all]]).
	local description, intro, additional = "", "", ""
	if not from_inside then
		description, intro, additional = get_description_intro_additional(
			lang, countries, args.extinct, args.setwiki, args.setwikt, args.setsister, args.entryname
		)
	end

	local spec = {
		description = description,
		lang = lang:getCode(),
		intro = intro,
		additional = additional,
		breadcrumb = lang:getCanonicalName(),
		parents = get_parents(lang, countries, args.extinct),
		extra_children = get_children(lang),
		umbrella = false,
		can_be_empty = true,
	}
	return spec, true
end

table.insert(raw_handlers, handle_language_category)


-- Handler for categories such as [[:Category:Regional French]] and
-- [[:Category:Regional Ancient Greek]]. Returns nothing unless the text after
-- "Regional " is the canonical name of a language.
local function handle_regional_category(data)
	local langname = data.category:match("^Regional (.*)$")
	if not langname then
		return
	end

	local lang = require("Module:languages").getByCanonicalName(langname)
	if not lang then
		return
	end

	return {
		lang = lang:getCode(),
		description = "Categories containing terms in regional varieties of " .. lang:makeCategoryLink() .. ".",
		additional = "This category sometimes also directly contains terms that are uncategorized regionalisms: such terms should be recategorized by the particular regional variety they belong to, or categorized as dialectal.",
		parents = {
			"{{{langcat}}}",
			{name = "Regionalisms", sort = langname},
		},
		breadcrumb = "Regional",
	}
end

table.insert(raw_handlers, handle_regional_category)


-- "If not empty": trims leading/trailing whitespace from `arg` and returns the result,
-- or nil if `arg` is missing or nothing is left after trimming.
local function ine(arg)
	local trimmed = arg and mw.text.trim(arg)
	if not trimmed or trimmed == "" then
		return nil
	end
	return trimmed
end


-- Modeled after splitLabelLang() in [[Module:auto cat]]. Splits `pagename` into a region prefix and the longest
-- language name (full or etymology-only) that ends the title. The longest match matters because of cases like
-- 'English' vs 'Middle English' and 'Chinese Pidgin English': [[:Category:Late Middle English]] should split as
-- 'Late' and 'Middle English', not as 'Late Middle' and 'English'.
--
-- Returns the language object and the region text before it. The region is nil when the whole title is a
-- language name. If no language is found, no language object is returned and the region is nil.
local function split_region_lang(pagename)
	local lookup = require("Module:languages").getByCanonicalName
	local words = mw.text.split(pagename, " ")
	local lang

	-- Start with the entire title, then drop one word from the left per iteration.
	for start = 1, #words do
		lang = lookup(table.concat(words, " ", start), nil, "allow etym")
		if lang then
			local region = start > 1 and table.concat(words, " ", 1, start - 1) or nil
			return lang, region
		end
	end

	return lang, nil
end


-- Handle dialect categories such as [[:Category:New Zealand English]], [[:Category:Late Middle English]],
-- [[:Category:Arbëresh Albanian]] or arbitrarily-named categories like [[:Category:Provençal]]. We currently require
-- that dialect=1 is specified to the call to {{auto cat}} to avoid overfiring.
--
-- Returns nil when the handler does not apply; otherwise returns the category spec table and `true`.
-- Raises an error when lang= is not given and no language can be parsed from the category name, or when the
-- region can neither be inferred nor is given in 1=.
table.insert(raw_handlers, function(data)
	local raw_args

	if data.called_from_inside then
		-- If called from inside we won't have any params available and want to handle basic categories for
		-- etymology-only languages so e.g. [[:Category:Arbëresh Albanian]] can have [[:Category:Tosk Albanian]] as its
		-- parent.
		local lang, breadcrumb = split_region_lang(data.category)
		if lang then
			-- Scrape the category page itself for the arguments of its {{auto cat}} call.
			local cat_page = mw.title.new("Category:" .. data.category)
			if cat_page then
				local contents = cat_page:getContent()
				if contents then
					for name, args, _, _ in require("Module:templateparser").findTemplates(contents) do
						if name == "auto cat" or name == "autocat" then
							raw_args = args
							break
						end
					end
				end
			end
			if not raw_args then
				-- FIXME: If we can't parse the scraped {{auto cat}} spec, maybe we should fail rather than return
				-- default values like this.
				return {
					-- FIXME, allow etymological codes here
					lang = lang:getFullCode(),
					description = "Foo",
					parents = {"Regional " .. lang:getFullName()},
					breadcrumb = breadcrumb or lang:getCanonicalName(),
					umbrella = false,
					can_be_empty = true,
				}, true
			end
		else
			return nil
		end
	end

	if not data.called_from_inside and not ine(data.args.dialect) then
		return nil
	end

	local params = {
		[1] = {},
		dialect = {type = "boolean"},
		lang = {},
		verb = {},
		prep = {},
		def = {},
		-- FIXME: Not implemented. When is this useful?
		nodef = {type = "boolean"},
		nolink = {type = "boolean"},
		parentcat = {},
		othercat = {list = true},
		wp = {},
		wplang = {},
		breadcrumb = {},
		pagename = {}, -- for testing or demonstration
	}
	if not data.called_from_inside then
		raw_args = data.args
	end

	local args = require("Module:parameters").process(raw_args, params)

	local lang, breadcrumb, regiondesc, langname
	local pagename = args.pagename or data.category
	if not args.lang then
		lang, breadcrumb = split_region_lang(pagename)
		-- Check for a failed parse BEFORE calling any method on `lang`; otherwise the descriptive error
		-- below can never be reached and an "attempt to index a nil value" error is raised instead.
		if not lang then
			error(("lang= not given and unable to parse language from category '%s'"):format(pagename))
		end
		langname = lang:getCanonicalName()
		regiondesc = args[1] or breadcrumb
		-- If the langname and pagename are the same (happens only with etym-only languages), the parent category is set below
		-- to the full parent, so the breadcrumb should show the language name.
		breadcrumb = breadcrumb or langname
	else
		lang = m_languages.getByCode(args.lang, "lang", "allow etym")
		langname = lang:getCanonicalName()
		if pagename == langname then
			breadcrumb = langname
			-- regiondesc should stay nil
		else
			-- The region is whatever precedes " <langname>" at the end of the page name (nil if the page
			-- name does not end that way).
			breadcrumb = pagename:match("^(.*) " .. require("Module:pattern utilities").pattern_escape(langname) .. "$")
			regiondesc = breadcrumb
		end
	end
	if args[1] then
		regiondesc = args[1]
	elseif not regiondesc then
		error(("1= (region) not given and unable to infer region from category '%s' given language name '%s'"):
			format(pagename, langname))
	end
	breadcrumb = args.breadcrumb or breadcrumb or require("Module:links").remove_links(regiondesc)

	-- Optional Wikipedia box: wp=1 links the same-named article, any other value names the article.
	local intro
	if args.wp then
		if args.wplang then
			intro = ("{{wp|%s|lang=%s}}"):format(args.wp, args.wplang)
		elseif args.wp == "1" then
			intro = "{{wp}}"
		else
			intro = ("{{wp|%s}}"):format(args.wp)
		end
	end

	local additional
	local parents = {}
	local langname_for_desc = langname
	local etymcodes = {}
	local function make_code(code)
		return ("<code>%s</code>"):format(code)
	end
	if lang:hasType("etymology-only") then
		if langname == pagename then
			-- The category is named exactly after the etymology-only language, so describe it in terms
			-- of the full parent language.
			langname_for_desc = lang:getFullName()
		end
		local langcode = lang:getCode()
		table.insert(etymcodes, make_code(langcode))
		-- Find all alias codes for the etymology-only language.
		-- FIXME: There should be a better/easier way of doing this.
		local ety_code_to_name = mw.loadData("Module:etymology languages/code to canonical name")
		for code, canon_name in pairs(ety_code_to_name) do
			if canon_name == langname and code ~= langcode then
				table.insert(etymcodes, make_code(code))
			end
		end
		additional = ("[[Module:etymology_languages/data|Etymology-only language]] code: %s"):format(
			m_table.serialCommaJoin(etymcodes, {conj = "or"}))
	end
	local default_parent = "Regional " .. lang:getFullName()

	local description = args.def or ("Terms or senses in %s as %s %s %s."):format(
		langname_for_desc, args.verb or "spoken", args.prep or "in",
		args.nolink and regiondesc or ("{{l|en|%s}}"):format(regiondesc)
	)

	default_parent = args.parentcat or default_parent
	table.insert(parents, default_parent)
	for _, cat in ipairs(args.othercat) do
		table.insert(parents, cat)
	end

	return {
		-- FIXME, allow etymological codes here
		lang = lang:getFullCode(),
		intro = intro,
		description = description,
		additional = additional,
		parents = parents,
		breadcrumb = {name = breadcrumb, nocap = true},
		umbrella = false,
		can_be_empty = true,
	}, true
end)


-- Handler for categories such as [[:Category:English-based creole or pidgin languages]].
-- Returns nothing unless the text before "-based creole or pidgin languages" is the
-- canonical name of a language.
local function handle_creole_based_category(data)
	local langname = data.category:match("(.*)%-based creole or pidgin languages$")
	if not langname then
		return
	end

	local lang = require("Module:languages").getByCanonicalName(langname)
	if not lang then
		return
	end

	return {
		lang = lang:getCode(),
		description = "Languages which developed as a [[creole]] or [[pidgin]] from " .. lang:makeCategoryLink() .. ".",
		parents = {{name = "Creole or pidgin languages", sort = "*" .. langname}},
		breadcrumb = lang:getCanonicalName() .. "-based",
	}
end

table.insert(raw_handlers, handle_creole_based_category)


-- Handler for categories such as [[:Category:English-based constructed languages]].
-- Returns nothing unless the text before "-based constructed languages" is the
-- canonical name of a language.
local function handle_constructed_based_category(data)
	local langname = data.category:match("(.*)%-based constructed languages$")
	if not langname then
		return
	end

	local lang = require("Module:languages").getByCanonicalName(langname)
	if not lang then
		return
	end

	return {
		lang = lang:getCode(),
		description = "Constructed languages which are based on " .. lang:makeCategoryLink() .. ".",
		parents = {{name = "Constructed languages", sort = "*" .. langname}},
		breadcrumb = lang:getCanonicalName() .. "-based",
	}
end

table.insert(raw_handlers, handle_constructed_based_category)


-- Module export: the fixed category specs and the list of handler functions defined above.
return {RAW_CATEGORIES = raw_categories, RAW_HANDLERS = raw_handlers}