Module:category tree/poscatboiler/data/language varieties

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This data submodule defines part of Wiktionary's category structure.

For an introduction to the poscatboiler system and a description of how to add or modify categories, see Module:category tree/poscatboiler/data/documentation.


local export = {}

local raw_categories = {}
local raw_handlers = {}

local m_languages = require("Module:languages")
local m_table = require("Module:table")
local parse_utilities_module = "Module:parse utilities"
local string_utilities_module = "Module:string utilities"
local labels_module = "Module:labels"
local labels_utilities_module = "Module:labels/utilities"
local rsplit = mw.text.split

local function track(page)
	-- [[Special:WhatLinksHere/Wiktionary:Tracking/poscatboiler/languages/PAGE]]
	return require("Module:debug/track")("poscatboiler/language-varieties/" .. page)
end

local function pattern_escape(pattern)
	return require(string_utilities_module).pattern_escape(pattern)
end

-- This module handles lect/variety categories of all sorts, e.g. regional lect categories such as
-- [[:Category:American English]] and [[:Category:Provençal]]; temporal lect categories such as
-- [[:Category:Early Modern English]]; sociolect categories such as [[:Category:Polari]]; and umbrella categories of the
-- form e.g. [[:Category:Varieties of English]] and [[:Category:Regional French]].

-- FIXME: Eliminate the word "dialect" here and in the {{auto cat}} parameter in favor of "lect" or "variety".

--[=[
FIXME:

1. Support multiple parents. [DONE]
2. Support cat: in parents to indicate a category. [DONE]
3. When linking a description without embedded links, use the equivalent of {{wtorw}} to auto-link to Wikipedia. [DONE]
4. Support the = true. [DONE]
]=]

-----------------------------------------------------------------------------
--                                                                         --
--                              RAW CATEGORIES                             --
--                                                                         --
-----------------------------------------------------------------------------


raw_categories["Language varieties"] = {
	description = "Categories that group terms in varieties of various languages (regional, temporal, sociolectal, etc.).",
	additional = "{{{umbrella_meta_msg}}}",
	parents = {
		"Fundamental",
	},
}

raw_categories["Regionalisms"] = {
	description = "Categories that group terms in regional varieties of various languages.",
	additional = "{{{umbrella_meta_msg}}}",
	parents = {
		"Fundamental",
		"Language varieties",
	},
}


-----------------------------------------------------------------------------
--                                                                         --
--                                RAW HANDLERS                             --
--                                                                         --
-----------------------------------------------------------------------------


local function split_on_comma(term)
	if term:find(",%s") then
		return require(parse_utilities_module).split_on_comma(term)
	else
		return rsplit(term, ",")
	end
end

local function ucfirst(text)
	return mw.getContentLanguage():ucfirst(text)
end

local function lcfirst(text)
	return mw.getContentLanguage():lcfirst(text)
end

local function page_exists(page)
	local title = mw.title.new(page)
	return title and title.exists
end


-- Handle categories such as [[:Category:Varieties of French]] and [[:Category:Varieties of Ancient Greek]].
table.insert(raw_handlers, function(data)
	local langname = data.category:match("^Varieties of (.*)$")
	if langname then
		local lang = require("Module:languages").getByCanonicalName(langname)
		if lang then
			return {
				lang = lang:getCode(),
				description = "Categories containing terms in varieties of " .. lang:makeCategoryLink() .. " (regional, temporal, sociolectal, etc.).",
				parents = {
					"{{{langcat}}}",
					{name = "Language varieties", sort = langname},
				},
				breadcrumb = "Varieties",
			}
		end
	end
end)


-- Handle categories such as [[:Category:Regional French]] and [[:Category:Regional Ancient Greek]].
table.insert(raw_handlers, function(data)
	local langname = data.category:match("^Regional (.*)$")
	if langname then
		local lang = require("Module:languages").getByCanonicalName(langname)
		if lang then
			return {
				lang = lang:getCode(),
				description = "Categories containing terms in regional varieties of " .. lang:makeCategoryLink() .. ".",
				additional = "This category sometimes also directly contains terms that are uncategorized regionalisms: such terms should be recategorized by the particular regional variety they belong to, or categorized as dialectal.",
				parents = {
					"Varieties of {{{langname}}}",
					{name = "Regionalisms", sort = langname},
				},
				breadcrumb = "Regional",
			}
		end
	end
end)


-- Fancy version of ine() (if-not-empty). Converts empty string to nil, but also strips leading/trailing space.
local function ine(arg)
	if not arg then return nil end
	arg = mw.text.trim(arg)
	if arg == "" then return nil end
	return arg
end


-- Get the full language to use e.g. in the settings.
local function get_returnable_lang(lang)
	if lang:hasType("family") then
		return nil
	else
		return lang:getFull()
	end
end


-- Get the full language code to return in the settings.
local function get_returnable_lang_code(lang)
	if lang:hasType("family") then
		return "und"
	else
		return lang:getFullCode()
	end
end


local memoizing_dialect_handler


local function category_to_lang_name(category)
	local getByCanonicalName = require("Module:languages").getByCanonicalName
	local lang
	lang = getByCanonicalName(category, nil, "allow etym", "allow family")
	if not lang then
		-- Some languages have lowercase-initial names e.g. 'the BMAC substrate', but the category begins with an
		-- uppercase letter.
		lang = getByCanonicalName(lcfirst(category), nil, "allow etym", "allow family")
	end
	return lang
end


-- Given a category (without the "Category:" prefix), look up the page defining the category, find the call to
-- {{auto cat}} (if any), and return a table of its arguments. If the category page doesn't exist or doesn't have
-- an {{auto cat}} invocation, return nil.
local function scrape_category_for_auto_cat_args(cat)
	local cat_page = mw.title.new("Category:" .. cat)
	if cat_page then
		local contents = cat_page:getContent()
		if contents then
			for name, args in require("Module:template parser").findTemplates(contents) do
				-- The template parser automatically handles redirects and canonicalizes them, so uses of {{autocat}}
				-- will also be found.
				if name == "auto cat" then
					return args
				end
			end
		end
	end
	return nil
end


-- Try to figure out if this variety is extinct or reconstructed, if type= not given.
local function determine_lect_type(category, lang, default_parent_cat)
	if category:find("^Proto%-") or lang:getCanonicalName():find("^Proto%-") or lang:hasType("reconstructed") then
		-- Is it reconstructed?
		return "reconstructed"
	end
	if lang:getCode():find("^qsb%-") then
		-- Substrate.
		return "unattested"
	end
	if lang:hasType("full") then
		-- If a full language, scrape the {{auto cat}} call and check for extinct=1.
		local parent_args = scrape_category_for_auto_cat_args(lang:getCategoryName())
		if parent_args and ine(parent_args.extinct) and require("Module:yesno")(parent_args.extinct, false) then
			return "extinct"
		end
	end
	-- Otherwise, call the dialect handler recursively for the parent category. This is correct e.g. for
	-- things like subvarieties of Classical Persian, where the lang itself (Persian) isn't extinct but the
	-- parent category refers to an extinct variety. If the dialect handler fails to return a type, it's because
	-- the parent category doesn't exist or isn't defined using {{auto cat}}, and doesn't have a language as a
	-- suffix. In that case, if we're dealing with an etymology-only language, check the parent language. Finally,
	-- fall back to returning "extant" if all else fails.
	local parent_type
	if default_parent_cat then
		export.register_likely_dialect_parent_cat(default_parent_cat)
		_, parent_type = memoizing_dialect_handler(default_parent_cat, nil, true)
	end
	if parent_type then
		return parent_type
	end
	local parent_lang = lang:getParent()
	if parent_lang then
		return determine_lect_type(category, parent_lang, nil)
	end
	return "extant"
end


-- Try to figure out the region (used as the default breadcrumb and region description) from the language. If the
-- language name is an etymology-only language, try to derive a region based on a parent etymology-only or full
-- language. For example, if the pagename is '[[:Category:British English]]', the language is 'en-GB' (British English)
-- and the same as the pagename, but we'd like to return a region 'British'. This is also called in cases where the
-- language is explicitly given but we need to infer the region from the parent language; e.g.
-- [[:Category:Lucerne Alemmanic German]] is a type of High Alemannic German but we want to infer 'Lucerne' based on
-- the parent 'Alemannic German'. If this doesn't work and the language name has a space in it, we try using
-- progressively smaller suffixes of the language. For example, for [[:Category:Walser German]]', the language is
-- 'wae' (Walser German), but the parent is 'Highest Alemannic German', whose parent is 'Alemannic German' (a full
-- language), and just "German" is nowhere in the parent-child relationships but found as a suffix in the parent
-- language. Another such case is with [[:Category:Ionic Greek]], whose parent is 'Ancient Greek'.
local function infer_region_from_lang(pagename, lang)
	local langname = lang:getCanonicalName()
	local lang_to_check = lang
	if ucfirst(langname) == pagename then
		lang_to_check = lang_to_check:getParent()
	end
	-- First check against the language name and progressively smaller suffixes; then repeat for any parents (of
	-- etymology languages). If the language name is the same as the page name, we need to start with the parent;
	-- otherwise we will always match against a suffix, but that's not what we want.
	while lang_to_check do
		local suffix = lang_to_check:getCanonicalName()
		while true do
			region = pagename:match("^(.*) " .. pattern_escape(suffix) .. "$")
			if region then
				return region
			end
			suffix = suffix:match("^.- (.*)$")
			if not suffix then
				break
			end
		end
		lang_to_check = lang_to_check:getParent()
	end

	return nil
end


-- Modeled after splitLabelLang() in [[Module:auto cat]]. Try to split off a maximally long language (full or
-- etymology-only) on the right, and return the resulting language object and the region preceding it. We need to
-- check the maximally long language because of cases like 'English' vs 'Middle English' and 'Chinese Pidgin English';
-- [[:Category:Late Middle English]] should split as 'Late' and 'Middle English', not as 'Late Middle' and 'English'.
local function split_region_lang(pagename)
	local lang
	local region

	-- Try the entire title as a language; if not, chop off a word on the left and repeat.
	local words = mw.text.split(pagename, " ")
	for i = 1, #words do
		lang = category_to_lang_name(table.concat(words, " ", i, #words))
		if lang then
			if i == 1 then
				region = nil
			else
				region = table.concat(words, " ", 1, i - 1)
			end
			break
		end
	end

	if not region and lang then
		-- The pagename is the same as a language name. Try to infer the region from the parent. See comment at
		-- function.
		region = infer_region_from_lang(pagename, lang)
	end

	return lang, region
end


-- Return the default parent cat for the given language and category. If the language and category are the same, we're
-- dealing with the overall cat for an etymology-only language, so use the category of the parent language; otherwise
-- we're dealing with a subcategory of a regular or etymology-only language (e.g. [[:Category:Issime Walser]], a
-- subcategory of [[:Category:Walser German]]), so use the language's category itself. If the resulting language is an
-- etymology-only language or a family, the parent category is that language or family's category, which for
-- etymology-only languages is named the same as the etymology-only language, and for families is named
-- "FAMILY languages"; otherwise, use "Regional LANG" as the category unless `noreg` is given, in which case we use
-- "Varieties of LANG".
local function get_default_parent_cat_from_category(category, lang, noreg)
	if lang:getCode():find("^qsb%-") then
		-- substrate
		return "Substrate languages"
	end
	local lang_for_cat
	if ucfirst(lang:getCanonicalName()) == category then
		lang_for_cat = lang:getParent()
		if not lang_for_cat then
			error(("Category '%s' has a name the same as a full language; you probably need to explicitly specify a different language using |lang="):format(category))
		end
	else
		lang_for_cat = lang
	end
	if lang_for_cat:hasType("etymology-only") or lang_for_cat:hasType("family") then
		return lang_for_cat:getCategoryName()
	elseif noreg then
		return "Varieties of " .. lang_for_cat:getCanonicalName()
	else
		return "Regional " .. lang_for_cat:getCanonicalName()
	end
end


-- Find the labels that categorize into `category`. Only categories specified using the `regional_categories` and
-- `plain_categories` fields will be returned. `lang` is the language object to use when looking up categories specified
-- using the `regional_categories` field, which append the language onto the specified category prefix. If `lang` is a
-- family or is omitted, no categories specified using `regional_categories` will be returned. Lang-specific modules for
-- all languages will be checked for matching labels that specify `category` as their category using `plain_categories`;
-- this helps e.g. with varieties of Chinese, whose labels are found in [[Module:labels/data/lang/zh]]. The return value
-- is a table in the same format as returned by `find_labels_for_category` in [[Module:labels/utilities]].
--
-- FIXME: It should be possible to check for categories specified using `regional_categories` even when `lang` is nil.
local function find_labels_for_category(category, lang)
	local regional_cat_labels, plain_cat_labels
	local full_lang
	local m_labels_utilities = require(labels_utilities_module)
	if lang and lang:hasType("language") then
		full_lang = lang:getFull()
		local regional_component = category:match("^(.-) " .. pattern_escape(full_lang:getCanonicalName()) .. "$")
		if regional_component then
			regional_cat_labels = m_labels_utilities.find_labels_for_category(regional_component,
				"regional", full_lang)
		end
	end
	plain_cat_labels = m_labels_utilities.find_labels_for_category(category, "plain", full_lang, "check all langs")

	local all_labels
	if regional_cat_labels and plain_cat_labels then
		all_labels = regional_cat_labels
		for k, v in pairs(plain_cat_labels) do
			all_labels[k] = v
		end
	else
		all_labels = regional_cat_labels or plain_cat_labels
	end

	return all_labels
end


-- Find the labels for category `category` and language object `lang` (which can be nil or a family, but in that case,
-- no labels on a category specified using `regional_categories`; FIXME: it should be possible to implement this). Then
-- filter them down to those that are specified using a lang-specific module and sort them for use in checking
-- properties such as parent and description. We filter down to only lang-specific labels because those specified in a
-- general module (especially [[Module:labels/data/regional]]) won't be able to have proper descriptions and especially
-- parents, which tend to be language-specific. The sort order prioritizes labels that match the category exactly
-- (either through the canonical version or any alias); this is followed by labels that are a prefix of the category
-- (again, either through the canonical version or any alias), so that labels whose categories are specified using
-- `regional_categories` are prioritized. Any other labels are sorted last, so that e.g. if both the label "Alberta" and
-- "Canada" (with alias "Canadian") for lang=en categorize into [[:Category:Canadian English]], we prefer the label
-- "Canada". For cases where e.g. both labels match the category as prefixes, ties are broken by prioritizing the labels
-- found in the lang-specific module whose language matches `lang`.
--
-- Returns two items. The first is a table of all labels categorizing into `category` (subject to the provisos described
-- in `find_labels_for_category()`), in the same format as returned by `find_labels_for_category` in
-- [[Module:labels/utilities]]. (Specifically, the values are objects containing all relevant information on a given
-- label, and the keys are less important.) The second is a list of label objects after filtering and sorting, in the
-- same format as the values in the `all_labels` table. The first return value will be nil if no labels could be found
-- categorizing into `category`, and the second return value will be nil if no labels remain after filtering.
local function get_sorted_labels(category, lang)
	local all_labels = find_labels_for_category(category, lang)
	if not all_labels then
		return nil
	end

	local m_labels = require(labels_module)
	local lang_specific_pattern = "^" .. pattern_escape(m_labels.lang_specific_data_modules_prefix)
	local sorted_labels = {}
	for _, labelobj in pairs(all_labels) do
		if labelobj.module:find(lang_specific_pattern) then
			table.insert(sorted_labels, labelobj)
		end
	end

	local function sort_labelobj(a, b)
		local function matches_exactly(labelobj)
			if labelobj.canonical == category then
				return true
			end
			for _, alias in ipairs(labelobj.aliases) do
				if alias == category then
					return true
				end
			end
			return false
		end

		local function matches_as_prefix(labelobj)
			if category:find("^" .. pattern_escape(labelobj.canonical) .. " ") then
				return true
			end
			for _, alias in ipairs(labelobj.aliases) do
				if category:find("^" .. pattern_escape(alias) .. " ") then
					return true
				end
			end
			return false
		end

		local function tiebreak()
			local a_matches_lang = lang and a.lang:getFullCode() == lang:getFullCode()
			local b_matches_lang = lang and b.lang:getFullCode() == lang:getFullCode()
			if a_matches_lang and not b_matches_lang then
				return true
			elseif b_matches_lang and not a_matches_lang then
				return false
			else
				return a.canonical < b.canonical
			end
		end

		local a_matches_exactly = matches_exactly(a)
		local b_matches_exactly = matches_exactly(b)
		if a_matches_exactly and not b_matches_exactly then
			return true
		elseif b_matches_exactly and not a_matches_exactly then
			return false
		elseif a_matches_exactly and b_matches_exactly then
			return tiebreak()
		end

		local a_matches_as_prefix = matches_as_prefix(a)
		local b_matches_as_prefix = matches_as_prefix(b)
		if a_matches_as_prefix and not b_matches_as_prefix then
			return true
		elseif b_matches_as_prefix and not a_matches_as_prefix then
			return false
		else
			return tiebreak()
		end
	end

	table.sort(sorted_labels, sort_labelobj)
	if #sorted_labels > 0 then
		return all_labels, sorted_labels
	else
		return all_labels, nil
	end
end


-- Find the categories (only of type `regional_categories` and `plain_categories`) that label `label` categorizes into.
-- Return value is nil if the label couldn't be located at all, otherwise a list of categories (which may be empty).
local function get_categories_for_label(label, lang)
	local m_labels = require(labels_module)
	local labret = m_labels.get_label_info { label = label, lang = lang }
	if not labret.recognized then
		return nil
	end
	local categories = m_labels.fetch_categories(labret.canonical or label, labret.data, lang, nil, nil,
		{["plain_categories"] = true})
	local reg_cats = m_labels.fetch_categories(labret.canonical or label, labret.data, lang, nil, nil,
		{["regional_categories"] = true})
	if #reg_cats > 0 then
		for _, cat in ipairs(reg_cats) do
			table.insert(categories, cat)
		end
	end
	return categories
end


-- Given the sorted labels that categorize into `category`, return the parent categories for the first label that specifies
-- any parents. `default` is the default parent category, usually "Regional LANG" or (if noreg=1 is specified) "Varieties of LANG";
-- it is used if the parent is explicitly given as `true` or "+" (or one of these values occurs among others), or if a parent label
-- was given but didn't categorize into any regional or plain categories, or if no labels with parents could be found. If
-- `all_cats` is specified, all categories associated with all specified parent labels (if more than one is present) are returned;
-- otherwise, only the categories for the first parent label are returned.
--
-- Returns two values: the list of parent categories and the label object from which the categories were derived (or nil if no
-- label object could be found with a `parent` field, in which case the return value of the list of categories is a simple-element
-- list consisting of `default`). The format of the parent category list is such that the list can directly be specified as the
-- value of the `parents` field returned by the raw handler. This means that usually the individual list elements are strings
-- (referring to raw poscat labels), but they may be strings prefixed by "Category:" (for arbitrary categories), or objects of the
-- form {name = "CATEGORY", lang = "LANGCODE", is_label = true} for poscat language labels.
local function get_parents_from_sorted_labels(sorted_labels, category, all_cats)
	for _, labobj in ipairs(sorted_labels) do
		local parent = labobj.labdata.parent
		if parent == true then
			parent = {parent}
		elseif parent and type(parent) == "string" then
			parent = split_on_comma(parent)
		end
		local function get_parent_cats(par)
			if par == true or par == "+" then
				return {"+"}
			end
			if par:find("^cat:") then
				return {"Category:" .. par:gsub("^cat:", "")}
			end
			if par:find("^Category:") then
				return {par}
			end
			if par:find("^rawposcat:") then
				return {(par:gsub("^rawposcat:", ""))}
			end
			if par:find("^poscat:") then
				local langcode, label = par:match("^poscat:([^:]+):(.*)$")
				if not langcode then
					error(("Parent poscatboiler language label '%s' for label '%s' for category '%s' (defined in module [[%s]]) needs to be of the form 'poscat:LANGCODE:LABEL'"):format(
						par, labobj.canonical, category, labobj.module))
				end
				return {{name = label, lang = langcode, is_label = true}}
			end
			local this_cats = get_categories_for_label(par, labobj.lang)
			if not this_cats then
				error(("Parent label '%s' for label '%s' for category '%s' (defined in module [[%s]]) couldn't be located"):format(
					par, labobj.canonical, category, labobj.module))
			end
			return this_cats
		end
		if parent then
			if type(parent) ~= "table" then
				error(("Internal error: Expected a string, boolean `true` or list for the value of the parent field for label '%s' for category '%s' (defined in module [[%s]]), but saw type '%s': %s"):format(
					labobj.canonical, category, labobj.module, type(parent), mw.dumpObject(parent)))
			end
			local cats
			if all_cats then
				cats = {}
				for _, par in ipairs(parent) do
					local this_cats = get_parent_cats(par)
					for _, this_cat in ipairs(this_cats) do
						m_table.insertIfNot(cats, this_cat)
					end
				end
			else
				cats = get_parent_cats(parent[1])
			end

			if #cats > 0 then
				return cats, labobj
			end
			-- FIXME: If the parent doesn't specify any categories, should we try the next parent or fall back
			-- to the parent determined through get_default_parent_cat_from_category() (which is what we currently
			-- do)?
			return {"+"}, labobj
		end
	end
	return {"+"}, nil
end

local likely_dialect_parent_cat = {}

-- Register that `cat` is likely to be a dialect cat, so we try to handle it as such in the dialect handler when
-- we are called on that category. This avoids the need to have manual allow-lists of nonstandardly-named parent
-- dialect categories to handle, such as [[:Category:Assyrian]], [[:Category:Ripuarian Franconian]] ("Franconian" is
-- not a language) and [[:Category:Limburgan-Ripuarian transitional dialects]].
function export.register_likely_dialect_parent_cat(cat)
	if type(cat) == "string" and not cat:find("^Category:") then
		likely_dialect_parent_cat[cat] = true
	end
end

-- Handle dialect categories such as [[:Category:New Zealand English]], [[:Category:Late Middle English]],
-- [[:Category:Arbëresh Albanian]], [[:Category:Provençal]] or arbitrarily-named categories like
-- [[:Category:Issime Walser]]. We currently require that dialect=1 is specified to the call to {{auto cat}} to avoid
-- overfiring. However, if called from inside, we are processing the breadcrumb for the parent (or conceivably the
-- child) of a dialect category, and won't have any params set, so we can't rely on dialect=1. In that case, only fire
-- if the category is or ends in the name of a full or etymology-only language, and scrape the category's call to
-- {{auto cat}} to get the appropriate params. This means that nonstandardly-named categories like
-- [[:Category:Issime Walser]] can't be parents of other dialect categories. To work around this, either we have to
-- relax the code below to operate on all raw categories (not necessarily a good idea), or we rename the
-- nonstandardly-named categories (e.g. in the case above, to [[:Category:Issime Walser German]], since Walser German
-- is a recognized etymology-only language).
--
-- NOTE: We are able to handle categories for etymology-only families (currently only [[:Category:Middle Iranian]] and
-- [[:Category:Old Iranian]]) and for etymology-only substrate languages (e.g. [[:Category:The BMAC substrate]]).
-- There is some special "family" code for the former.
local function dialect_handler(category, raw_args, called_from_inside)
	if called_from_inside then
		-- Avoid infinite loops from wrongly processing non-lect categories. We have a check around line 344 below
		-- for categories whose {{auto cat}} doesn't say dialect=1, but we still need the following in case of
		-- non-existent categories we're being asked to process (e.g. [[:Category:User bcc]] ->
		-- [[:Category:Southern Balochi]] (nonexistent) -> [[:Category:Regional Baluchi]] (nonexistent), which
		-- causes an infinite loop without the check below.
		if category:find("^Regional ") or category:find("^Varieties of ") or category:find("^Rhymes:") then
			return nil
		end

		-- If called from inside we won't have any params available. See comment above about this. We scrape the
		-- category page's call to {{auto cat}} to get the appropriate params, and if that fails, we currently fall back
		-- to defaults based on the label(s) that categorize(s) into the category or the name of the category. Since the
		-- call from inside is only to get the parent category and breadcrumb, these defaults actually work in most
		-- cases but not all; e.g. in the chain [[:Category:Regional Yoruba]] -> [[:Category:Central Yoruba]] ->
		-- [[:Category:Ekiti Yoruba]] -> [[:Category:Akurẹ Yoruba]], if we are forced to use default values, we will
		-- produce the right parent for [[:Category:Central Yoruba]] but not for [[:Category:Ekiti Yoruba]], where the
		-- default parent would be [[:Category:Regional Yoruba]] instead of the correct [[:Category:Central Yoruba]].
		local lang, breadcrumb = split_region_lang(category)
		if lang or likely_dialect_parent_cat[category] then
			raw_args = scrape_category_for_auto_cat_args(category)
			if raw_args and not ine(raw_args.dialect) then
				-- We are scraping something like [[:Category:American Sign Language]] that ends in a valid language but is not
				-- a dialect.
				return nil
			end
			if not raw_args then
				-- If we can't parse the scraped {{auto cat}} spec, return default values. This helps e.g. in converting
				-- from the old {{dialectboiler}} template and generally when adding new varieties.
				local parents, label_with_parent

				local function getprop(prop)
					return -- ine(raw_args[prop]) or
						label_with_parent and label_with_parent.labdata[prop]
				end

				local all_labels, sorted_labels = get_sorted_labels(category, lang)
				if sorted_labels then
					parents, label_with_parent = get_parents_from_sorted_labels(sorted_labels, category)
					if not lang and label_with_parent then
						lang = label_with_parent.lang
					end
				else
					parents = {"+"}
				end

				if not lang then
					-- We were instructed to scrape by virtue of `dialect_parent_cats_to_scrape`, but couldn't scrape
					-- anything.
					return nil
				end

				local default_parent_cat_from_category = get_default_parent_cat_from_category(category, lang,
					getprop("noreg"))
				for i, parent in ipairs(parents) do
					if parent == "+" then
						parents[i] = default_parent_cat_from_category
					end
				end
				local first_parent_cat = parents[1]
				if type(first_parent_cat) ~= "string" or first_parent_cat:find("^Category:") then
					-- Only keep `first_parent_cat` if it refers to a raw poscat label (which is probably a dialect
					-- handler label).
					first_parent_cat = nil
				end

				track("dialect")
				export.register_likely_dialect_parent_cat(parents[1])

				-- NOTE: When called from inside, the description doesn't matter; nor do any parents other than the
				-- first. This is because called_from_inside is only set when computing the breadcrumb trail, which
				-- only needs the language, first parent and breadcrumb.
				return {
					-- FIXME, allow etymological codes here
					lang = get_returnable_lang_code(lang),
					description = "Foo",
					parents = parents,
					breadcrumb = breadcrumb or lang:getCanonicalName(),
					umbrella = false,
					can_be_empty = true,
				}, determine_lect_type(category, lang, first_parent_cat)
			end
		else
			return nil
		end
	end

	if not called_from_inside and not ine(raw_args.dialect) then
		return nil
	end

	-------------------- 1. Process parameters. -------------------

	local params = {
		[1] = {},
		dialect = {type = "boolean"},
		lang = {},
		verb = {},
		prep = {},
		the = {type = "boolean"},
		def = {},
		fulldef = {},
		addl = {},
		nolink = {type = "boolean"},
		noreg = {type = "boolean"}, -- don't make the default parent be "Regional LANG"; instead, "Varieties of LANG"
		type = {}, -- "extinct", "extant", "reconstructed", "unattested", "constructed"
		cat = {},
		othercat = {}, -- comma-separated
		country = {}, -- comma-separated
		wp = {},
		wikidata = {},
		breadcrumb = {},
		pagename = {}, -- for testing or demonstration
	}

	local args = require("Module:parameters").process(raw_args, params)

	local allowed_type_values = {"extinct", "extant", "reconstructed", "unattested", "constructed"}
	if args.type and not m_table.contains(allowed_type_values, args.type) then
		error(("Unrecognized value '%s' for type=; should be one of %s"):format(
			args.type, table.concat(allowed_type_values, ", ")))
	end

	-------------------- 2. Initialize breadcrumb, regiondesc and language from category. -------------------

	-- They may be overridden later.

	local lang, breadcrumb, regiondesc, langname
	local region
	category = args.pagename or category
	if not args.lang then
		lang, breadcrumb = split_region_lang(category)
		if lang then
			langname = lang:getCanonicalName()
		end
		-- The lang and/or breadcrumb may be nil at this point (e.g. we're processing a category like
		-- [[:Category:Singlish]] or [[:Category:Polari]] that doesn't have a language in it). We don't throw an error
		-- yet because we may be able to fetch the lang, regiondesc and breadcrumb from a label that categorizes into
		-- the category.
		regiondesc = breadcrumb
	else
		lang = m_languages.getByCode(args.lang, "lang", "allow etym")
		langname = lang:getCanonicalName()
		if category == ucfirst(langname) then
			-- breadcrumb and regiondesc should stay nil; breadcrumb will get `category` as a default, and the lack of
			-- regiondesc will cause an error to be thrown unless the user gave it explicitly or specified def=.
		else
			breadcrumb = category:match("^(.*) " .. pattern_escape(langname) .. "$")
			if not breadcrumb then
				-- Try to infer the region from the parent. See comment at function.
				breadcrumb = infer_region_from_lang(category, lang)
			end
			regiondesc = breadcrumb
		end
	end

	-------------------- 3. Determine labels categorizing into this category. -------------------

	local all_labels, sorted_labels = get_sorted_labels(category, lang)

	-------------------- 4. Determine parent categories and initialize additional properties. -------------------

	-- The first label with a parent is used to fetch additional properties, such as region= and addl=.

	local parents
	local first_parent_cat = args.cat
	local label_with_parent

	local function getprop(prop)
		return args[prop] or label_with_parent and label_with_parent.labdata[prop]
	end

	if first_parent_cat then
		parents = {first_parent_cat}
		if not lang then
			error(("lang= not given and unable to parse language from category '%s' (didn't check labels categorizing into the category because cat= explicitly given)"):format(category))
		end
	else
		if sorted_labels then
			parents, label_with_parent = get_parents_from_sorted_labels(sorted_labels, category, "all cats")
			if not lang and label_with_parent then
				lang = label_with_parent.lang
				langname = lang:getCanonicalName()
			end
		else
			parents = {"+"}
		end
		if not lang then
			error(("lang= not given, unable to parse language from category '%s' and can't find a label categorizing into the category"):format(category))
		end
		local default_parent_cat_from_category = get_default_parent_cat_from_category(category, lang, getprop("noreg"))
		for i, parent in ipairs(parents) do
			if parent == "+" then
				parents[i] = default_parent_cat_from_category
			end
		end
		first_parent_cat = parents[1]
	end
	if type(first_parent_cat) ~= "string" or first_parent_cat:find("^Category:") then
		-- Only keep `first_parent_cat` if it refers to a raw poscat label (which is probably a dialect handler label).
		-- WARNING: Code below using `first_parent_cat` must handle nil.
		first_parent_cat = nil
	end

	local othercat = getprop("othercat")
	if othercat and type(othercat) == "string" then
		othercat = split_on_comma(othercat)
	end
	if othercat then
		for _, cat in ipairs(othercat) do
			if not cat:find("^Category:") then
				cat = "Category:" .. cat
			end
			table.insert(parents, cat)
		end
	end

	local countries = getprop("country")
	if countries and type(countries) == "string" then
		countries = split_on_comma(countries)
	end

	-- If no breadcrumb, this often happens when the langname and category are the same (happens only with etym-only
	-- languages), and the parent category is set below to the full parent, so the breadcrumb should show the
	-- language name (or equivalently, the category). If the langname and category are different, we should fall back to
	-- the category. E.g. for Singlish, lang=en is specified and we can't infer a breadcrumb because the dialect name
	-- doesn't end in "English"; in this case we want the breadcrumb to show "Singlish".
	breadcrumb = getprop("breadcrumb") or breadcrumb or category

	local the_prefix

	if args[1] then
		regiondesc = args[1]
		the_prefix = ""
	else
		local regionprop = getprop("region")
		if regionprop then
			regiondesc = regionprop
			the_prefix = ""
		elseif label_with_parent then
			-- It's not clear which of the following two are better. The second one uses the actual label display form,
			-- which might be argued to be better, except that it will often be linked to a Wikipedia article about the
			-- dialect rather than the place. The first one just uses the canonical label directly (which will later be
			-- linked to itself if unlinked). A third possibility is to use `label_with_parent.display` if present,
			-- otherwise `label_with_parent.canonical`.
			regiondesc = label_with_parent.canonical
			if label_with_parent.display and regiondesc ~= label_with_parent.display then
				track("display-different-from-canonical")
			end
			-- regiondesc = require(labels_module).get_displayed_label(label_with_parent.canonical, label_with_parent.labdata, lang)
		end
	end
	the_prefix = the_prefix or getprop("the") and "the " or ""

	countries = countries or {regiondesc and the_prefix .. regiondesc or nil}
	for _, country in ipairs(countries) do
		if not country:find("[<=]") then
			country = require("Module:links").remove_links(country)
			local cat = "Category:Languages of " .. country
			if page_exists(cat) then
				table.insert(parents, cat)
			end
		end
	end

	-------------------- 5. Refine the language to an etymology-only child if possible. -------------------
	
	-- Now that we've determined the parent, we look up the parent hierarchy until we find a category naming an
	-- etymology-only language. If we find one and it's a child of the language we've determined, use it.

	local ancestral_cat = first_parent_cat

	local refined_lang
	while true do
		refined_lang = category_to_lang_name(ancestral_cat)
		if refined_lang then
			break
		end
		export.register_likely_dialect_parent_cat(ancestral_cat)
		local settings, _ = memoizing_dialect_handler(ancestral_cat, nil, true)
		if not settings then
			break
		end
		ancestral_cat = settings.parents[1]
	end

	if refined_lang and refined_lang:hasParent(lang) then
		lang = refined_lang
		langname = lang:getCanonicalName()
	end

	-------------------- 6. Initialize `additional` with user-specified text and info about labels. -------------------

	local additional = getprop("addl")

	local function append_addl(addl_text)
		if not addl_text then
			return
		end
		if additional then
			additional = additional .. "\n\n" .. addl_text
		else
			additional = addl_text
		end
	end

	if all_labels then
		local m_labels_utilities = require(labels_utilities_module)
		append_addl(m_labels_utilities.format_labels_categorizing(all_labels, nil,
			get_returnable_lang(lang)))
	end

	-------------------- 7. Augment `additional` with information about etymology-only codes. -------------------

	local langname_for_desc
	local etymcodes = {}
	local function make_code(code)
		return ("<code>%s</code>"):format(code)
	end
	if lang:hasType("etymology-only") and ucfirst(langname) == category then
		langname_for_desc = lang:getParentName()
		local langcode = lang:getCode()
		table.insert(etymcodes, make_code(langcode))
		-- Find all alias codes for the etymology-only language.
		-- FIXME: There should be a better/easier way of doing this.
		local ety_code_to_name = mw.loadData("Module:etymology languages/code to canonical name")
		for code, canon_name in pairs(ety_code_to_name) do
			if canon_name == langname and code ~= langcode then
				table.insert(etymcodes, make_code(code))
			end
		end
		local addl_etym_codes = ("[[Module:etymology_languages/data|Etymology-only language]] code: %s."):format(
			m_table.serialCommaJoin(etymcodes, {conj = "or"}))
		append_addl(addl_etym_codes)
	else
		langname_for_desc = langname
	end

	-------------------- 8. Try to figure out if this variety is extinct or reconstructed. -------------------

	local lect_type = getprop("type")
	if not lect_type then
		lect_type = determine_lect_type(category, lang, first_parent_cat)
	end
	local function prefix_addl(addl_text)
		if additional then
			additional = addl_text .. "\n\n" .. additional
		else
			additional = addl_text
		end
	end
	if lect_type == "extinct" then
		prefix_addl("This language variety is [[extinct language|extinct]].")
		table.insert(parents, "Category:All extinct languages")
	elseif lect_type == "reconstructed" then
		prefix_addl("This language variety is [[reconstructed language|reconstructed]].")
		table.insert(parents, "Category:Reconstructed languages")
	elseif lect_type == "unattested" then
		prefix_addl("This language variety is {{w|unattested language|unattested}}.")
		table.insert(parents, "Category:Unattested languages")
	elseif lect_type == "constructed" then
		prefix_addl("This language variety is [[constructed language|constructed]].")
		table.insert(parents, "Category:Constructed languages")
	end

	-------------------- 9. Compute `description`. -------------------

	local description

	local fulldef = getprop("fulldef")
	if fulldef then
		description = fulldef .. "."
	end

	if not description then
		local def = getprop("def")
		if def then
			description = ("Terms or senses in %s."):format(def)
		end
	end

	if not description then
		if not regiondesc then
			-- We need regiondesc for the description unless def= or fulldef= is given, which overrides the part that needs it.
			error(("1= (region) not given and unable to infer region from category '%s' given language name '%s'"):
				format(category, langname))
		end

		local lang_en = m_languages.getByCode("en", true)

		local linked_regiondesc = regiondesc
		-- Don't try to link if HTML, = sign, template call or embedded link found in text. Embedded links will
		-- automatically be converted to English links by JavaScript.
		local function linkable(text)
			return not text:find("[<={}%[%]|]")
		end
		if linked_regiondesc:find("<country>") then
			if not countries then
				error(("Can't specify <country> in region description '%s' when country= not given"):format(linked_regiondesc))
			end
			-- Link the countries individually before calling serialCommaJoin(), which inserts HTML.
			local linked_countries = {}
			for _, country in ipairs(countries) do
				if linkable(country) then
					country = require("Module:links").full_link { lang = lang_en, term = country }
				end
				table.insert(linked_countries, country)
			end
			linked_countries = m_table.serialCommaJoin(linked_countries)
			linked_regiondesc = linked_regiondesc:gsub("<country>",
				require(string_utilities_module).replacement_escape(linked_countries))
		elseif not getprop("nolink") and linkable(linked_regiondesc) then
			-- Even if nolink not given, don't try to link if HTML or = sign found in linked_regiondesc, otherwise
			-- we're likely to get an error.
			if page_exists(linked_regiondesc) then
				-- Only construct a Wiktionary link if the page exists; otherwise construct a Wikipedia link.
				linked_regiondesc = require("Module:links").full_link { lang = lang_en, term = linked_regiondesc }
			else
				linked_regiondesc = ("[[w:%s|%s]]"):format(linked_regiondesc, linked_regiondesc)
			end
		end
		linked_regiondesc = the_prefix .. linked_regiondesc
		local verb = getprop("verb") or "spoken"
		local prep = getprop("prep")

		if not langname_for_desc then
			error(category)
		end
		description = ("Terms or senses in %s as %s%s %s."):format(
			langname_for_desc, verb, prep == "-" and "" or " " .. (prep or "in"), linked_regiondesc)
	end

	-------------------- 10. Compute the Wikipedia articles that go into `topright`. -------------------

	local topright_parts = {}
	-- Insert Wikipedia article `article` for Wikimedia language `wmcode` into `topright_parts`, avoiding duplication.
	local function insert_wikipedia_article(wmcode, article)
		m_table.insertIfNot(topright_parts, ("{{wp%s%s}}"):format(
			wmcode == "en" and "" or "|lang=" .. wmcode,
			article == category and "" or "|" .. article
		))
	end

	local function insert_wikipedia_articles_for_wikipedia_specs(specs, default)
		for _, article in ipairs(specs) do
			local foreign_wiki
			if article == true then
				article = default
			else
				if article:find(":[^ ]") then
					local actual_article
					foreign_wiki, actual_article = article:match("^([a-z][a-z][a-z-]*):([^ ].*)$")
					if actual_article then
						article = actual_article
					end
				end
				if article == "+" then
					article = default
				elseif article == "-" then
					article = nil
				else
					article = require("Module:yesno")(article, article)
					if article == true then
						article = default
					end
				end
			end
			if article then
				insert_wikipedia_article(foreign_wiki or "en", article)
			end
		end
	end

	local function insert_wikipedia_articles_for_wikidata_specs(specs, lang)
		if not mw.wikibase then
			error(("Unable to retrieve data from Wikidata ID's '%s'; `mw.wikibase` not defined"):format(args.wikidata))
		end
		local wikipedia_langs = require(labels_module).get_langs_to_extract_wikipedia_articles_from_wikidata(lang)
		local ids_without_wmcodes = {}
		local ids_with_wmcodes = {}
		for _, id in ipairs(specs) do
			if id:find(":") then
				table.insert(ids_with_wmcodes, id)
			else
				table.insert(ids_without_wmcodes, id)
			end
		end
		for _, wmcode in ipairs(wikipedia_langs) do
			for _, id in ipairs(ids_without_wmcodes) do
				local article = mw.wikibase.sitelink(id, wmcode .. "wiki")
				if article then
					insert_wikipedia_article(wmcode, article)
				end
			end
		end
		for _, id in ipairs(ids_with_wmcodes) do
			local wmcode, wikidata_id = id:match("^(.-):(.*)$")
			local article = mw.wikibase.sitelink(wikidata_id, wmcode .. "wiki")
			if article then
				insert_wikipedia_article(wmcode, article)
			end
		end
	end

	if args.wp or args.wikidata then
		if args.wp then
			insert_wikipedia_articles_for_wikipedia_specs(split_on_comma(args.wp), category)
		end
		if args.wikidata then
			insert_wikipedia_articles_for_wikidata_specs(rsplit(args.wikidata, "%s*,%s*"), lang)
		end
	elseif pagename == ucfirst(langname) then
		local topright_parts = {}
		local wikipedia_langs = require(labels_module).get_langs_to_extract_wikipedia_articles_from_wikidata(lang)
		for _, wmcode in ipairs(wikipedia_langs) do
			local article = lang:getWikipediaArticle("no category fallback", wmcode .. "wiki")
			if article then
				insert_wikipedia_article(wmcode, article)
			end
		end
	end
	if #topright_parts == 0 and sorted_labels then
		for _, labobj in pairs(all_labels) do
			local wp_specs = labobj.labdata.Wikipedia
			if wp_specs then
				if type(wp_specs) ~= "table" then
					wp_specs = {wp_specs}
				end
				insert_wikipedia_articles_for_wikipedia_specs(wp_specs, labobj.canonical)
			end
			local wikidata_specs = labobj.labdata.Wikidata
			if wikidata_specs then
				if type(wikidata_specs) ~= "table" then
					wikidata_specs = {wikidata_specs}
				end
				insert_wikipedia_articles_for_wikidata_specs(wikidata_specs, labobj.lang)
			end
		end
	end

	local topright
	if #topright_parts > 0 then
		topright = table.concat(topright_parts)
	end

	-------------------- 11. Return the combined structure of all information. -------------------

	track("dialect")
	export.register_likely_dialect_parent_cat(parents[1])

	return {
		-- FIXME, allow etymological codes here
		lang = get_returnable_lang_code(lang),
		topright = topright,
		description = description,
		additional = additional,
		parents = parents,
		breadcrumb = {name = breadcrumb, nocap = true},
		umbrella = false,
		can_be_empty = true,
	}, lect_type
end


local memoized_responses = {}

memoizing_dialect_handler = function(category, raw_args, called_from_inside)
	mw.log(category)
	local retval = memoized_responses[category]
	if not retval then
		retval = {dialect_handler(category, raw_args, called_from_inside)}
		memoized_responses[category] = retval
	end
	local obj, lect_type = retval[1], retval[2]
	return obj, lect_type
end

-- Actual handler for dialect categories. See dialect_handler() above.
table.insert(raw_handlers, function(data)
	local settings, _ = memoizing_dialect_handler(data.category, data.args, data.called_from_inside)
	return settings, not not settings
end)


return {RAW_CATEGORIES = raw_categories, RAW_HANDLERS = raw_handlers, export = export}