-- Module:gl-common
--
-- From Wiktionary, the free dictionary
-- Jump to navigation Jump to search

-- Table of everything this module exports; returned at the end of the file.
local export = {}

-- Name of the module that provides handle_multiword(); it is require()d on demand inside
-- call_handle_multiword() rather than at load time.
local romut_module = "Module:romance utilities"

-- Short local aliases for the MediaWiki (`mw`) library functions used throughout this module.
local u = mw.ustring.char
local rsplit = mw.text.split
local rsubn = mw.ustring.gsub
local rfind = mw.ustring.find
local rmatch = mw.ustring.match

-- Placeholder characters used internally by syllabify(): TEMPC1/TEMPC2 stand in for i/u behaving as consonants,
-- TEMPV1 stands in for y behaving as a vowel, and DIV is wrapped around the word to mark its edges.
local TEMPC1, TEMPC2, TEMPV1, DIV = u(0xFFF1), u(0xFFF2), u(0xFFF3), u(0xFFF4)

local unaccented_vowel = "aeiouüAEIOUÜ"
local accented_vowel = "áéíóúýÁÉÍÓÚÝ"
local vowel = unaccented_vowel .. accented_vowel

-- Character classes, expressed as pattern fragments.
local V = ("[%s]"):format(vowel) -- any vowel, accented or not
local AV = ("[%s]"):format(accented_vowel) -- accented vowel
local NAV = ("[^%s]"):format(accented_vowel) -- any character other than an accented vowel
local W = "[iyuw]" -- glide
local C = ("[^%s.]"):format(vowel) -- any character other than a vowel or the syllable divider "."

-- The character classes are also made available to users of this module.
export.V = V
export.AV = AV
export.NAV = NAV
export.W = W
export.C = C
-- Map from each accented vowel to its unaccented counterpart (lowercase and uppercase).
local remove_accent = {
	["á"] = "a", ["Á"] = "A",
	["é"] = "e", ["É"] = "E",
	["í"] = "i", ["Í"] = "I",
	["ó"] = "o", ["Ó"] = "O",
	["ú"] = "u", ["Ú"] = "U",
	["ý"] = "y", ["Ý"] = "Y",
}

-- Map from each unaccented vowel to its accented counterpart. The mapping above is one-to-one, so the inverse is
-- obtained simply by swapping keys and values.
local add_accent = {}
for accented, plain in pairs(remove_accent) do
	add_accent[plain] = accented
end

export.remove_accent = remove_accent
export.add_accent = add_accent

-- Same as rsubn(), but only the substituted string is returned; the substitution count is dropped.
local function rsub(term, foo, bar)
	-- The surrounding parentheses truncate rsubn()'s multiple return values to the first one.
	return (rsubn(term, foo, bar))
end

export.rsub = rsub

-- Same as rsubn(), but the second return value is a boolean telling whether at least one substitution
-- was made, instead of the raw substitution count.
local function rsubb(term, foo, bar)
	local result, count = rsubn(term, foo, bar)
	local made_substitution = count > 0
	return result, made_substitution
end

export.rsubb = rsubb

-- Keep applying rsub() with the same pattern and replacement until the string stops changing,
-- then return the stable result.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

export.rsub_repeatedly = rsub_repeatedly

-- Strip the accent from an accented vowel that is followed only by non-vowel characters up to the end of `stem`.
-- A stem without such a vowel is returned unchanged.
function export.remove_final_accent(stem)
	local pattern = "(" .. AV .. ")(" .. C .. "*)$"
	local function strip(v, c)
		return (remove_accent[v] or v) .. c
	end
	return rsub(stem, pattern, strip)
end

-- Accent the character of `stem` that is followed only by non-vowel characters up to the end, provided it is not
-- already an accented vowel and `add_accent` has an entry for it; otherwise the character is left as it is.
function export.add_final_accent(stem)
	local pattern = "(" .. NAV .. ")(" .. C .. "*)$"
	local function accentuate(v, c)
		return (add_accent[v] or v) .. c
	end
	return rsub(stem, pattern, accentuate)
end

-- Patterns for Galician prepositions, alone or contracted with an article. Each entry includes the trailing
-- separator (a space, or the apostrophe in the case of "d'"). The list is handed to handle_multiword() in
-- [[Module:romance utilities]] by call_handle_multiword(); presumably it is used there to locate the part of a
-- multiword term that should be inflected -- confirm against that module.
local prepositions = {
	-- a + optional article
	"a ",
	"ás? ",
	"aos? ",
	-- con + optional article
	"con ",
	"coa?s? ",
	-- de + optional article
	"de ",
	"d[oa]s? ",
	"d'",
	-- en + optional article
	"en ",
	"n[oa]s? ",
	-- por + optional article
	"por ",
	"pol[oa]s? ",
	-- para + optional article
	"para ",
	"pr[óá]s? ",
	-- others
	"at[aé] ",
	"com[oa] ",
	"entre ",
	"sen ",
	"so ",
	"sobre ",
}

-- Delegate to handle_multiword() in [[Module:romance utilities]], passing along the Galician preposition list.
-- Returns nil when handle_multiword() returns nothing, and otherwise the single element of the list it returned.
-- Throws an internal error, mentioning `fun_name`, if that list does not contain exactly one element.
local function call_handle_multiword(term, special, make_fun, fun_name)
	local results = require(romut_module).handle_multiword(term, special, make_fun, prepositions)
	if not results then
		return nil
	end
	if #results == 1 then
		return results[1]
	end
	error("Internal error: Should have one return value for " .. fun_name .. ": " .. table.concat(results, ","))
end

-- Build a helper bound to `word`. The helper takes a pattern and a replacement, and returns the substituted
-- string if the pattern matched at least once in `word`, or nil if nothing was replaced. This lets rules be
-- chained with `or`, the first matching rule winning.
local function make_try(word)
	local function attempt(from, to)
		local replaced, count = rsubn(word, from, to)
		if count == 0 then
			return nil
		end
		return replaced
	end
	return attempt
end

-- Generate the default plural of `term`.
--
-- `term` is the singular form. `special` is passed through untouched to handle_multiword() in
-- [[Module:romance utilities]]; when that function produces a result (i.e. it dealt with `term` as a multiword
-- expression, calling back into make_plural for the relevant part), that result is returned directly.
-- Otherwise the rules below are tried in order and the first one whose pattern matches determines the plural.
-- Always returns a string.
function export.make_plural(term, special)
	local retval = call_handle_multiword(term, special, export.make_plural, "make_plural")
	if retval then
		return retval
	end

	local try = make_try(term)

	-- Based on https://www.lingua.gal/c/document_library/get_file?file_path=/portal-lingua/celga/celga-1/material-alumno/Manual_Aula_de_Galego_1_resumo_gramatical.pdf
	return try("r$", "res") or
		try("z$", "ces") or
		try("(" .. V .. "be)l$", "%1is") or -- vowel + -bel
		try("(" .. AV .. ".*" .. V .. "l)$", "%1es") or -- non-final stress + -l e.g. [[túnel]] -> 'túneles'
		try("^(" .. C .. "*" .. V .. C .. "*l)$", "%1es") or -- monosyllable ending in -l e.g. [[sol]] -> 'soles'
		try("il$", "ís") or -- final stressed -il e.g. [[civil]] -> 'civís'
		try("(" .. V .. ")l$", "%1is") or -- any other vowel + -l e.g. [[papel]] -> 'papeis'
		try("(" .. V .. "[íú])s$", "%1ses") or -- vowel + stressed í/ú + -s e.g. [[país]] -> 'países'
		try("(" .. AV .. ")s$", -- other final accented vowel + -s e.g. [[autobús]] -> 'autobuses'
			function(av) return remove_accent[av] .. "ses" end) or
		-- The glide is mandatory here: a plain vowel + -s must fall through to the next two rules, otherwise
		-- words stressed on the penult or antepenult such as [[lapis]] would wrongly receive -es.
		try("(" .. V .. "[iu]s)$", "%1es") or -- diphthong + final -s e.g. [[deus]] -> 'deuses'
		try("^(" .. C .. "*" .. V .. "s)$", "%1es") or -- monosyllable + final -s e.g. [[fros]] -> 'froses', [[gas]] -> 'gases'
		try("([sx])$", "%1") or -- other final -s or -x (stressed on penult or antepenult or ending in cluster), e.g.
								-- [[mércores]], [[lapis]], [[lux]], [[unisex]], [[luns]]
		term .. "s" -- ending in vowel, -n or other consonant e.g. [[cadeira]], [[marroquí]], [[xersei]], [[limón]],
					-- [[club]], [[clip]], [[robot]], [[álbum]]
end

-- Generate the default feminine of the masculine form `term`.
--
-- `is_noun` enables the rule that only applies to nouns (-z -> -za). `special` is passed through untouched to
-- handle_multiword() in [[Module:romance utilities]]; when that function produces a result, it is returned
-- directly. Otherwise the rules below are tried in order and the first one whose pattern matches determines the
-- feminine; if none matches, `term` is returned unchanged.
function export.make_feminine(term, is_noun, special)
	local function feminize_part(part)
		return export.make_feminine(part, is_noun)
	end
	local multiword_result = call_handle_multiword(term, special, feminize_part, "make_feminine")
	if multiword_result then
		return multiword_result
	end

	-- Ordered list of rules: { pattern, replacement [, noun_only] }.
	-- Based on https://www.lingua.gal/c/document_library/get_file?file_path=/portal-lingua/celga/celga-1/material-alumno/Manual_Aula_de_Galego_1_resumo_gramatical.pdf
	local rules = {
		{"o$", "a"},
		{"º$", "ª"}, -- ordinal indicator
		{"^(" .. C .. "*)u$", "%1úa"}, -- [[nu]] -> núa, [[cru]] -> crúa
		{"eu$", "ía"}, -- [[sandeu]] -> sandía, [[xudeu]] -> xudía
		-- many nouns and adjectives in -án:
		-- [[afgán]], [[alazán]], [[aldeán]], [[alemán]], [[ancián]], [[aresán]], [[arnoián]], [[arousán]], [[artesán]],
		-- [[arzuán]], [[barregán]], [[bergantiñán]], [[bosquimán]], [[buxán]], [[caldelán]], [[camariñán]],
		-- [[capitán]], [[carnotán]], [[castelán]], [[catalán]], [[cidadán]], [[cirurxián]], [[coimbrán]], [[comarcán]],
		-- [[compostelán]], [[concidadán]], [[cortesán]], [[cotián]], [[cristián]], [[curmán]], [[desirmán]],
		-- [[ermitán]], [[ferrolán]], [[fisterrán]], [[gardián]], [[insán]], [[irmán]], [[louzán]], [[malpicán]],
		-- [[malsán]], [[mariñán]], [[marrán]], [[muradán]], [[musulmán]], [[muxián]], [[neurocirurxián]], [[nugallán]],
		-- [[otomán]], [[ourensán]], [[pagán]], [[paleocristián]], [[ponteareán]], [[pontecaldelán]], [[redondelán]],
		-- [[ribeirán]], [[rufián]], [[sacristán]], [[salnesán]], [[sancristán]], [[sultán]], [[tecelán]], [[temperán]],
		-- [[temporán]], [[truán]], [[turcomán]], [[ullán]], [[vilagarcián]], [[vilán]]
		--
		-- but not (instead in -ana):
		-- [[baleigán]], [[barbuzán]], [[barrigán]], [[barullán]], [[bergallán]], [[bocalán]], [[brután]], [[buleirán]],
		-- [[burrán]], [[burricán]], [[cabezán]], [[cachamoulán]], [[cachán]], [[cacholán]], [[cagán]], [[canelán]],
		-- [[cangallán]], [[carallán]], [[carcamán]], [[carneirán]], [[carroulán]], [[chalán]], [[charlatán]],
		-- [[cornán]], [[cornelán]], [[farfallán]], [[folán]], [[folgazán]], [[galbán]], [[guedellán]], [[lacazán]],
		-- [[langrán]], [[larpán]], [[leilán]], [[lerchán]], [[lombán]], [[lorán]], [[lordán]], [[loubán]],
		-- [[mentirán]], [[mourán]], [[orellán]], [[paduán]], [[pailán]], [[palafustrán]], [[papán]], [[parvallán]],
		-- [[paspán]], [[pastrán]], [[pelandrán]], [[pertegán]], [[pillabán]], [[porcallán]], [[ruán]],
		-- [[tangueleirán]], [[testalán]], [[testán]], [[toleirán]], [[vergallán]], [[zalapastrán]], [[zampallán]]
		{"án$", "á"},
		-- nouns in -z e.g. [[rapaz]]; but not [[feliz]], [[capaz]], [[perspicaz]], etc.
		-- only such adjective is [[andaluz]] -> andaluza, [[rapaz]] -> rapaza
		{"z$", "za", true},
		-- [[bailarín]], [[benxamín]], [[danzarín]], [[galopín]], [[lampantín]], [[mandarín]], [[palanquín]];
		-- but not [[afín]], [[pimpín]], [[ruín]]
		{"ín$", "ina"},
		-- [[abusón]], [[chorón]], [[felón]], etc.
		--
		-- but not (instead in -oa): [[anglosaxón]], [[baixosaxón]], [[beirón]], [[borgoñón]], [[bretón]], [[campión]],
		-- [[eslavón]], [[francón]], [[frisón]], [[gascón]], [[grisón]], [[ladrón]] (also fem. ladra), [[letón]],
		-- [[nipón]], [[patagón]], [[saxón]], [[teutón]], [[valón]], [[vascón]]
		--
		-- but not (invariable in singular): [[grelón]], [[maricón]], [[marón]], [[marrón]], [[roulón]], [[salmón]],
		-- [[xiprón]]
		{"ón$", "ona"},
		-- [[francés]], [[portugués]], [[fregués]], [[vigués]] etc.; but not [[cortés]], [[descortés]]
		{"és$", "esa"},
		-- adjectives in:
		-- * [[-ador]], [[-edor]] ([[amortecedor]], [[compilador]], etc.), [[-idor]] ([[inhibidor]], etc.)
		-- * -tor ([[condutor]], [[construtor]], [[colector]], etc.)
		-- * -sor ([[agresor]], [[censor]], [[divisor]], etc.)
		-- but not:
		-- * [[anterior]]/[[posterior]]/[[inferior]]/[[júnior]]/[[maior]]/[[peor]]/[[mellor]]/etc.
		-- * [[bicolor]]/[[multicolor]]/etc.
		{"([dts]or)$", "%1a"},
	}

	local try = make_try(term)
	for _, rule in ipairs(rules) do
		local pattern, replacement, noun_only = rule[1], rule[2], rule[3]
		-- Noun-only rules are skipped entirely unless `is_noun` is set.
		if is_noun or not noun_only then
			local feminine = try(pattern, replacement)
			if feminine then
				return feminine
			end
		end
	end
	-- No rule applied: the feminine is identical to the masculine.
	return term
end

-- Generate the default masculine of the feminine form `term`.
--
-- `special` is passed through untouched to handle_multiword() in [[Module:romance utilities]]; when that function
-- produces a result, it is returned directly. Otherwise the rules below are tried in order; if none matches,
-- `term` is returned unchanged.
function export.make_masculine(term, special)
	local multiword_result = call_handle_multiword(term, special, export.make_masculine, "make_masculine")
	if multiword_result then
		return multiword_result
	end

	local try = make_try(term)

	-- -dora, -tora, -sora -> -dor, -tor, -sor
	local masculine = try("([dts])ora$", "%1or")
	if masculine then
		return masculine
	end
	-- final -a -> -o
	masculine = try("a$", "o")
	if masculine then
		return masculine
	end
	-- ordinal indicator
	masculine = try("ª$", "º")
	if masculine then
		return masculine
	end
	return term
end

-- Syllabify a word. This is copied and modified from [[Module:es-common]] and attempts to implement a full
-- syllabification algorithm, based on the corresponding code in [[Module:es-pronunc]]. This is more than is needed for
-- the purpose of this module, which doesn't care so much about syllable boundaries, but won't hurt.
--
-- Takes the word as a string and returns a list of its syllables (the result of splitting the internally
-- divided word on ".").
function export.syllabify(word)
	-- Wrap the word in edge markers. DIV is neither a vowel nor ".", so it matches the consonant class C and
	-- lets the patterns below treat the word edges like consonants.
	word = DIV .. word .. DIV
	-- gu/qu + front vowel; make sure we treat the u as a consonant; a following i should not be treated as a consonant
	-- (may make no difference for Galician; necessary in Spanish for [[alguien]])
	word = rsub(word, "([gq])u([eiéí])", "%1" .. TEMPC2 .. "%2")
	local vowel_to_glide = { ["i"] = TEMPC1, ["u"] = TEMPC2 }
	-- i and u between vowels should behave like consonants ([[paranoia]], [[baiano]]); Spanish also has [[abreuense]],
	-- [[alauita]], [[Malaui]], etc. not in Galician
	word = rsub_repeatedly(word, "(" .. V .. ")([iu])(" .. V .. ")",
		function(v1, iu, v2) return v1 .. vowel_to_glide[iu] .. v2 end
	)
	-- y between consonants or after a consonant at the end of the word should behave like a vowel
	-- ([[ankylosaurio]], [[lycra]], [[hippy]], [[cherry]], etc.); the word-final case is covered because the
	-- trailing DIV matches C
	word = rsub_repeatedly(word, "(" .. C .. ")y(" .. C .. ")",
		function(c1, c2) return c1 .. TEMPV1 .. c2 end
	)

	-- Insert syllable dividers ("."). First: vowel | consonant (+ optional glide) + vowel.
	word = rsub_repeatedly(word, "(" .. V .. ")(" .. C .. W .. "?" .. V .. ")", "%1.%2")
	-- Two consonants between vowels are split down the middle: vowel + consonant | consonant + vowel.
	word = rsub_repeatedly(word, "(" .. V .. C .. ")(" .. C .. V .. ")", "%1.%2")
	-- With three or more consonants, the last two go with the following vowel.
	word = rsub_repeatedly(word, "(" .. V .. C .. "+)(" .. C .. C .. V .. ")", "%1.%2")
	-- One of p/b/c/k/t/d/g followed by l or r is kept together: move the divider in front of the pair.
	word = rsub(word, "([pbcktdg])%.([lr])", ".%1%2")
	-- An s between two consonants goes with the preceding consonant: "C.sC" becomes "Cs.C".
	word = rsub_repeatedly(word, "(" .. C .. ")%.s(" .. C .. ")", "%1s.%2")
	-- Any aeo, or stressed iu, should be syllabically divided from a following aeo or stressed iu.
	word = rsub_repeatedly(word, "([aeoáéíóúý])([aeoáéíóúý])", "%1.%2")
	-- Two adjacent i's (either possibly accented) are divided, and likewise two adjacent u's.
	word = rsub_repeatedly(word, "([ií])([ií])", "%1.%2")
	word = rsub_repeatedly(word, "([uú])([uú])", "%1.%2")
	-- Drop the edge markers and turn the placeholder characters back into the letters they stood for.
	word = rsub(word, "([" .. DIV .. TEMPC1 .. TEMPC2 .. TEMPV1 .. "])", {
		[DIV] = "",
		[TEMPC1] = "i",
		[TEMPC2] = "u",
		[TEMPV1] = "y",
	})
	return rsplit(word, "%.")
end


-- Given the list of syllables of a word, return the index of the stressed syllable. If more than one syllable
-- carries a written accent, the last such syllable is the one reported.
function export.stressed_syllable(syllables)
	local count = #syllables
	-- A written accent settles the question; scan backwards so that the last accented syllable is found first.
	local index = count
	while index >= 1 do
		if rfind(syllables[index], AV) then
			return index
		end
		index = index - 1
	end
	-- A word of a single syllable is stressed on that syllable.
	if count == 1 then
		return 1
	end
	-- Without a written accent, a word ending in a non-diphthong vowel, optionally followed by n/s/ns, is
	-- stressed on the second-to-last syllable.
	local final = syllables[count]
	local ends_in_plain_vowel = rfind(final, V .. "n?s?$") and not rfind(final, V .. "[iu]n?s?$")
	if ends_in_plain_vowel then
		return count - 1
	end
	-- Everything else is stressed on the last syllable.
	return count
end


-- Put a written accent on the appropriate vowel of `syllable`, unless the syllable already has an accented vowel,
-- in which case it is returned as is.
function export.add_accent_to_syllable(syllable)
	if rfind(syllable, AV) then
		return syllable
	end
	-- In a diphthong or triphthong the accent goes on a/e/o (the first one, should there be several, which the
	-- standard syllabification algorithm should never produce); only when there is none does it go on the first
	-- i or u, as in the diphthongs ui and iu.
	local target = rfind(syllable, "[aeo]") and "[aeo]" or "[iu]"
	local function accentuate(prev, v)
		return prev .. add_accent[v]
	end
	return rsub(syllable, "^(.-)(" .. target .. ")", accentuate)
end


-- Return `syllable` with every accented vowel replaced by its unaccented counterpart.
function export.remove_accent_from_syllable(syllable)
	-- The parentheses keep only the substituted string, dropping the substitution count.
	return (rsubn(syllable, AV, remove_accent))
end


-- Decide whether a written accent is required on syllable number `sylno` for that syllable to carry the stress,
-- given the list of syllables of the word. Whatever accent the word currently has, on any syllable, is
-- disregarded. Returns true or false.
function export.accent_needed(syllables, sylno)
	local target = syllables[sylno]
	-- The diphthongs iu and ui normally take the stress on their first vowel, so an accent sitting on the second
	-- vowel cannot be dispensed with.
	if rfind(target, "iú") or rfind(target, "[uü]í") then
		return true
	end

	-- Work out where the stress would fall if the word had no written accent at all; if that is some other
	-- syllable, the accent is required.
	local plain = {}
	for i, syl in ipairs(syllables) do
		plain[i] = export.remove_accent_from_syllable(syl)
	end
	if export.stressed_syllable(plain) ~= sylno then
		return true
	end

	-- From here on the default stress coincides with `sylno`, but only under the syllabification given in
	-- `syllables`. Dropping the accent mark may change the syllabification itself: países is `pa.i.ses` but
	-- without the accent it would be `pai.ses`; río is `ri.o` but without the accent it would be the single
	-- syllable `rio`. So check whether the stress falls on an i or u that, unaccented, would merge into a
	-- diphthong with a neighboring vowel and hand the stress over to that vowel. The preceding and following
	-- cases are not symmetric, because the diphthongs iu and ui are stressed on their first vowel by default.
	local stressed = export.add_accent_to_syllable(plain[sylno])
	if sylno > 1 then
		local before = plain[sylno - 1]
		if rfind(before, "[aeou]$") and rfind(stressed, "^í") then
			return true
		end
		if rfind(before, "[aeio]$") and rfind(stressed, "^ú") then
			return true
		end
	end
	if sylno < #syllables and rfind(stressed, "[íú]$") and rfind(plain[sylno + 1], "^[aeo]") then
		return true
	end
	return false
end


-- FIXME: Next two copied from [[Module:es-common]]. Move to a utilities module.

-- Wrap the words of `form` in wikilinks. An empty or single-space form, or one that already contains "[[",
-- is returned untouched. If `multiword_only` is set, a form that is not multiword is left unlinked.
function export.add_links(form, multiword_only)
	if form == "" or form == " " or form:find("%[%[") then
		return form
	end
	if rfind(form, "[%s%p]") then --optimization to avoid loading [[Module:headword]] on single-word forms
		local m_headword = require("Module:headword")
		if m_headword.head_is_multiword(form) then
			form = m_headword.add_multiword_links(form)
		end
	end
	-- Nothing more to do if only multiword linking was requested or the step above already produced links.
	if multiword_only or form:find("%[%[") then
		return form
	end
	return "[[" .. form .. "]]"
end


-- If the whole of `form` is a single pair of double brackets with no other brackets inside, return the text
-- between the brackets; otherwise return `form` unchanged.
function export.strip_redundant_links(form)
	local inner = rmatch(form, "^%[%[([^%[%]]*)%]%]$")
	if inner then
		return inner
	end
	return form
end

return export