Module:headword utilities

From Wiktionary, the free dictionary
Jump to navigation Jump to search

local export = {}

local table_module = "Module:table"
local string_utilities_module = "Module:string utilities"
local parse_utilities_module = "Module:parse utilities"

local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsplit = mw.text.split
local rsubn = mw.ustring.gsub

-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
	local retval = rsubn(term, foo, bar)
	return retval
end


-- Default function to split a word on apostrophes. Don't split apostrophes at the beginning or end of a word (e.g.
-- [['ndrangheta]] or [[po']]). Handle multiple apostrophes correctly, e.g. [[l'altr'ieri]] -> [[l']][altr']][[ieri]].
function export.default_split_apostrophe(word)
	local begapo, inner_word, endapo = word:match("^('*)(.-)('*)$")
	local apostrophe_parts = rsplit(word, "'")
	local linked_apostrophe_parts = {}
	local apostrophes_at_beginning = ""
	local i = 1
	-- Apostrophes at beginning get attached to the first word after (which will always exist but may
	-- be blank if the word consists only of apostrophes).
	while i < #apostrophe_parts do -- <, not <=, in case the word consists only of apostrophes
		local apostrophe_part = apostrophe_parts[i]
		i = i + 1
		if apostrophe_part == "" then
			apostrophes_at_beginning = apostrophes_at_beginning .. "'"
		else
			break
		end
	end
	apostrophe_parts[i] = apostrophes_at_beginning .. apostrophe_parts[i]
	-- Now, do the remaining parts. A blank part indicates more than one apostrophe in a row; we join
	-- all of them to the preceding word.
	while i <= #apostrophe_parts do
		local apostrophe_part = apostrophe_parts[i]
		if apostrophe_part == "" then
			linked_apostrophe_parts[#linked_apostrophe_parts] =
				linked_apostrophe_parts[#linked_apostrophe_parts] .. "'"
		elseif i == #apostrophe_parts then
			table.insert(linked_apostrophe_parts, apostrophe_part)
		else
			table.insert(linked_apostrophe_parts, apostrophe_part .. "'")
		end
		i = i + 1
	end
	for i, tolink in ipairs(linked_apostrophe_parts) do
		linked_apostrophe_parts[i] = "[[" .. tolink .. "]]"
	end
	return table.concat(linked_apostrophe_parts)
end


--[=[
Auto-add links to a word that should not have spaces but may have hyphens and/or apostrophes. We split off final
punctuation, then split on hyphens if `data.split_hyphen` is given, and also split on apostrophes if
`data.split_apostrophe` is given. We only split on hyphens if they are in the middle of the word, not at the beginning
or end (hyphens at the beginning or end indicate suffixes or prefixes, respectively). `include_hyphen_prefixes`, if
given, is a set of prefixes (not including the final hyphen) where we should include the final hyphen in the prefix.
Hence, e.g. if "anti" is in the set, a Portuguese word like [[anti-herói]] "anti-hero" will be split [[anti-]][[herói]]
(whereas a word like [[código-fonte]] "source code" will be split as [[código]]-[[fonte]]).

If `data.split_apostrophe` is specified, we split on apostrophes unless `data.no_split_apostrophe_words` is given and
the word is in the specified set, such as French [[c'est]] and [[quelqu'un]]. If `data.split_apostrophe` is true, the
default algorithm applies, which splits on all apostrophes except those at the beginning and end of a word (as in
Italian [['ndrangheta]] or [[po']]), and includes the apostrophe in the link to its left (so we auto-split French
[[l'eau]] as [[l']][[eau]] and [[l'altr'ieri]] as [[l']][altr']][[ieri]]). If `data.split_apostrophe` is specified
but not `true`, it should be a function of one argument that does custom apostrophe-splitting. The argument is the word
to split, and the return value should be the split and linked word.
]=]
local function add_single_word_links(space_word, data)
	local space_word_no_punct, punct = rmatch(space_word, "^(.*)([,;:?!])$")
	space_word_no_punct = space_word_no_punct or space_word
	punct = punct or ""
	local words
	if space_word_no_punct:find("^%-") or space_word_no_punct:find("%-$") then
		-- don't split prefixes and suffixes
		words = {space_word_no_punct}
	elseif type(data.split_hyphen_when_space) == "function" then
		words = data.split_hyphen_when_space(space_word_no_punct)
		if type(words) == "string" then
			return words .. punct
		end
	end
	if not words then
		if data.split_hyphen_when_space then
			words = rsplit(space_word_no_punct, "%-")
		else
			words = {space_word_no_punct}
		end
	end
	local linked_words = {}
	for j, word in ipairs(words) do
		if j < #words and data.include_hyphen_prefixes and data.include_hyphen_prefixes[word] then
			word = "[[" .. word .. "-]]"
		else
			-- Don't split on apostrophes if the word is in `no_split_apostrophe_words`.
			if (not data.no_split_apostrophe_words or not data.no_split_apostrophe_words[word]) and
				data.split_apostrophe and word:find("'") then
				if data.split_apostrophe == true then
					word = export.default_split_apostrophe(word)
				else -- custom apostrophe splitter/linker
					word = data.split_apostrophe(word)
				end
			else
				word = "[[" .. word .. "]]"
			end
			if j < #words then
				word = word .. "-"
			end
		end
		table.insert(linked_words, word)
	end
	return table.concat(linked_words) .. punct
end

-- Auto-add links to a multiword term. Links are not added to single-word terms. We split on spaces, and also on hyphens
-- if `split_hyphen` is given or the word has no spaces. In addition, we split on apostrophes, including the apostrophe
-- in the link to its left (so we auto-split "de l'eau" "[[de]] [[l']][[eau]]"). We don't always split on hyphens
-- because of cases like "boire du petit-lait" where "petit-lait" should be linked as a whole, but provide the option to
-- do it for cases like "croyez-le ou non". If there's no space, however, then it makes sense to split on hyphens by
-- default (e.g. for "avant-avant-hier"). Cases where only some of the hyphens should be split can always be handled by
-- explicitly specifying the head (e.g. "Nord-Pas-de-Calais" given as head=[[Nord]]-[[Pas-de-Calais]]).
--
-- `no_split_apostrophe_words` and `include_hyphen_prefixes` allow for special-case handling of particular words and
-- are as described in the comment above add_single_word_links().
function export.add_links_to_multiword_term(term, data)
	if rfind(term, "[%[%]]") then
		return term
	end
	if not rfind(term, " ") then
		data = require(table_module).shallowcopy(data)
		data.split_hyphen_when_space = true
	end
	local words = rsplit(term, " ")
	local linked_words = {}
	for _, word in ipairs(words) do
		table.insert(linked_words, add_single_word_links(word, data))
	end
	local retval = table.concat(linked_words, " ")
	-- If we ended up with a single link consisting of the entire term,
	-- remove the link.
	local unlinked_retval = rmatch(retval, "^%[%[([^%[%]]*)%]%]$")
	return unlinked_retval or retval
end


-- Badly named older entry point. FIXME: Obsolete me!
function export.add_lemma_links(lemma, split_hyphen_when_space)
	return export.add_links_to_multiword_term(lemma, {split_hyphen_when_space = split_hyphen_when_space})
end


-- Ensure that brackets display literally in error messages. Replacing with equivalent HTML escapes doesn't work
-- because they are displayed literally; but inserting a Unicode word-joiner symbol works.
local function escape_wikicode(term)
	return require(parse_utilities_module).escape_wikicode(term)
end


-- Given a `linked_term` that is the output of add_links_to_multiword_term(), apply modifications as given in
-- `modifier_spec` to change the link destination of subterms (normally single-word non-lemma forms; sometimes
-- collections of adjacent words). This is usually used to link non-lemma forms to their corresponding lemma, but can
-- also be used to replace a span of adjacent separately-linked words to a single multiword lemma. The format of
-- `modifier_spec` is one or more semicolon-separated subterm specs, where each such spec is of the form
-- SUBTERM:DEST, where SUBTERM is one or more words in the `linked_term` but without brackets in them, and DEST is the
-- corresponding link destination to link the subterm to. Any occurrence of ~ in DEST is replaced with SUBTERM.
-- Alternatively, a single modifier spec can be of the form BEGIN[FROM:TO], which is equivalent to writing
-- BEGINFROM:BEGINTO (see example below).
--
-- For example, given the source phrase [[il bue che dice cornuto all'asino]] "the pot calling the kettle black"
-- (literally "the ox that calls the donkey horned/cuckolded"), the result of calling add_links_to_multiword_term()
-- is [[il]] [[bue]] [[che]] [[dice]] [[cornuto]] [[all']][[asino]]. With a modifier_spec of 'dice:dire', the result
-- is [[il]] [[bue]] [[che]] [[dire|dice]] [[cornuto]] [[all']][[asino]]. Here, based on the modifier spec, the
-- non-lemma form [[dice]] is replaced with the two-part link [[dire|dice]].
--
-- Another example: given the source phrase [[chi semina vento raccoglie tempesta]] "sow the wind, reap the whirlwind"
-- (literally (he) who sows wind gathers [the] tempest"). The result of calling add_links_to_multiword_term() is
-- [[chi]] [[semina]] [[vento]] [[raccoglie]] [[tempesta]], and with a modifier_spec of 'semina:~re; raccoglie:~re',
-- the result is [[chi]] [[seminare|semina]] [[vento]] [[raccogliere|raccoglie]] [[tempesta]]. Here we use the ~
-- notation to stand for the non-lemma form in the destination link.
--
-- A more complex example is [[se non hai altri moccoli puoi andare a letto al buio]], which becomes
-- [[se]] [[non]] [[hai]] [[altri]] [[moccoli]] [[puoi]] [[andare]] [[a]] [[letto]] [[al]] [[buio]] after calling
-- add_links_to_multiword_term(). With the following modifier_spec:
-- 'hai:avere; altr[i:o]; moccol[i:o]; puoi: potere; andare a letto:~; al buio:~', the result of applying the spec is
-- [[se]] [[non]] [[avere|hai]] [[altro|altri]] [[moccolo|moccoli]] [[potere|puoi]] [[andare a letto]] [[al buio]].
-- Here, we rely on the alternative notation mentioned above for e.g. 'altr[i:o]', which is equivalent to 'altri:altro',
-- and link multiword subterms using e.g. 'andare a letto:~'. (The code knows how to handle multiword subexpressions
-- properly, and if the link text and destination are the same, only a single-part link is formed.)
function export.apply_link_modifiers(linked_term, modifier_spec)
	local split_modspecs = rsplit(modifier_spec, "%s*;%s*")
	for j, modspec in ipairs(split_modspecs) do
		local subterm, dest, otherlang
		local begin_from, begin_to, rest, end_from, end_to = modspec:match("^%[(.-):(.*)%]([^:]*)%[(.-):(.*)%]$")
		if begin_from then
			subterm = begin_from .. rest .. end_from
			dest = begin_to .. rest .. end_to
		end
		if not subterm then
			rest, end_from, end_to = modspec:match("^([^:]*)%[(.-):(.*)%]$")
			if rest then
				subterm = rest .. end_from
				dest = rest .. end_to
			end
		end
		if not subterm then
			begin_from, begin_to, rest = modspec:match("^%[(.-):(.*)%]([^:]*)$")
			if begin_from then
				subterm = begin_from .. rest
				dest = begin_to .. rest
			end
		end
		if not subterm then
			subterm, dest = modspec:match("^(.-)%s*:%s*(.*)$")
			if subterm and subterm ~= "^" and subterm ~= "$" then
				local langdest
				-- Parse off an initial language code (e.g. 'en:Higgs', 'la:minūtia' or 'grc:σκατός'). Also handle
				-- Wikipedia prefixes ('w:Abatemarco' or 'w:it:Colle Val d'Elsa').
				otherlang, langdest = dest:match("^([A-Za-z0-9._-]+):([^ ].*)$")
				if otherlang == "w" then
					local foreign_wikipedia, foreign_term = langdest:match("^([A-Za-z0-9._-]+):([^ ].*)$")
					if foreign_wikipedia then
						otherlang = otherlang .. ":" .. foreign_wikipedia
						langdest = foreign_term
					end
					dest = ("%s:%s"):format(otherlang, langdest)
					otherlang = nil
				elseif otherlang then
					otherlang = require("Module:languages").getByCode(otherlang, true, "allow etym")
					dest = langdest
				end
			end
		end
		if not subterm then
			error(("Single modifier spec %s should be of the form SUBTERM:DEST where SUBTERM is one or more words in a multiword "
					.. "term and DEST is the destination to link the subterm to (possibly prefixed by a language code); or of "
					.. "the form BEGIN[FROM:TO], which is equivalent to BEGINFROM:BEGINTO; or similarly [FROM:TO]END, which is "
					.. "equivalent to FROMEND:TOEND"):
				format(modspec))
		end
		if subterm == "^" then
			linked_term = dest:gsub("_", " ") .. linked_term
		elseif subterm == "$" then
			linked_term = linked_term .. dest:gsub("_", " ")
		else
			if subterm:find("%[") then
				error(("Subterm '%s' in modifier spec '%s' cannot have brackets in it"):format(
					escape_wikicode(subterm), escape_wikicode(modspec)))
			end
			local strutil = require(string_utilities_module)
			local escaped_subterm = strutil.pattern_escape(subterm)
			local subterm_re = "%[%[" .. escaped_subterm:gsub("(%%?[ '%-])", "%%]*%1%%[*") .. "%]%]"
			local expanded_dest
			if dest:find("~") then
				expanded_dest = dest:gsub("~", strutil.replacement_escape(subterm))
			else
				expanded_dest = dest
			end
			if otherlang then
				expanded_dest = expanded_dest .. "#" .. otherlang:getCanonicalName()
			end

			local subterm_replacement
			if expanded_dest:find("%[") then
				-- Use the destination directly if it has brackets in it (e.g. to put brackets around parts of a word).
				subterm_replacement = expanded_dest
			elseif expanded_dest == subterm then
				subterm_replacement = "[[" .. subterm .. "]]"
			else
				subterm_replacement = "[[" .. expanded_dest .. "|" .. subterm .. "]]"
			end

			local replaced_linked_term = rsub(linked_term, subterm_re, strutil.replacement_escape(subterm_replacement))
			if replaced_linked_term == linked_term then
				error(("Subterm '%s' could not be located in %slinked expression %s, or replacement same as subterm"):format(
					subterm, j > 1 and "intermediate " or "", escape_wikicode(linked_term)))
			else
				linked_term = replaced_linked_term
			end
		end
	end

	return linked_term
end


return export