Jump to content

Module:Cyrs-translit

From Wiktionary, the free dictionary

This module will transliterate text in the Old Cyrillic script. It is used to transliterate Old Church Slavonic, Old East Slavic, Bulgar, Old Novgorodian, and Church Slavonic. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:Cyrs-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
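Returns nil if the script code is not Cyrs (when sc is omitted, the script is first detected from the text). Raises an error if breathing marks remain in the text after processing.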

local export = {}

local numbers = mw.loadData("Module:Cyrs-translit/numbers")

local ugsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
local U = mw.ustring.char
local umatch = mw.ustring.match
local usub = mw.ustring.sub
local ulower = mw.ustring.lower

-- Combining marks used by the transliteration, listed in code point order.
local grave = U(0x0300)          -- combining grave accent
local acute = U(0x0301)          -- combining acute accent
local circumflex = U(0x0302)     -- combining circumflex accent
local kamora = U(0x0311)         -- combining inverted breve
local titlo = U(0x0483)          -- combining Cyrillic titlo
local palatalization = U(0x0484) -- combining Cyrillic palatalization
local dasia = U(0x0485)          -- combining Cyrillic dasia pneumata (rough breathing)
local psili = U(0x0486)          -- combining Cyrillic psili pneumata (smooth breathing)
local vzmet = U(0xA66F)          -- combining Cyrillic vzmet

-- Both breathing marks, smooth then rough, ready to be dropped into a
-- character class.
local breathing = psili .. dasia
-- Pattern fragment matching any run (possibly empty) of accents and breathing
-- marks.
local accent = table.concat({"[", acute, grave, circumflex, breathing, "]*"})
-- Latin vowel letters that can appear in the transliterated output.
local vowels = "aAæÆeEiIoOœŒꝏꝎuUyY"
-- Character class matching one Latin vowel or the prime used for the
-- palatalization mark.
local vowel_or_soft = ("[%sʹ]"):format(vowels)

-- Default transliteration table: maps one Old Cyrillic character (or combining
-- mark) to its Latin output. export.tr uses this table directly when the
-- language has no entry in lang_letters; the language-specific tables fall
-- back to it through their `__index` metatable. Each row lists the uppercase
-- mapping followed by the lowercase one.
local common_letters = {
	["А"] = 'A', ["а"] = 'a',
	["Б"] = 'B', ["б"] = 'b',
	["В"] = 'V', ["в"] = 'v',
	["Г"] = 'G', ["г"] = 'g',
	["Д"] = 'D', ["д"] = 'd',
	["Е"] = 'E', ["е"] = 'e',
	["Ж"] = 'Ž', ["ж"] = 'ž',
	["Ѕ"] = 'Dz', ["ѕ"] = 'dz',
	["З"] = 'Z', ["з"] = 'z',
	["И"] = 'I', ["и"] = 'i',
	["І"] = 'I', ["і"] = 'i', -- Contrastive with "И".
	["Й"] = 'J', ["й"] = 'j',
	["Ꙉ"] = 'Ǵ', ["ꙉ"] = 'ǵ',
	["К"] = 'K', ["к"] = 'k',
	["Л"] = 'L', ["л"] = 'l',
	["М"] = 'M', ["м"] = 'm',
	["Н"] = 'N', ["н"] = 'n',
	["О"] = 'O', ["о"] = 'o',
	["Ꚙ"] = 'Ꝏ', ["ꚙ"] = 'ꝏ',
	["П"] = 'P', ["п"] = 'p',
	["Р"] = 'R', ["р"] = 'r',
	["С"] = 'S', ["с"] = 's',
	["Т"] = 'T', ["т"] = 't',
	["У"] = 'U', ["у"] = 'u',
	["Ꙋ"] = 'U', ["ꙋ"] = 'u',
	["Ф"] = 'F', ["ф"] = 'f',
	["Х"] = 'X', ["х"] = 'x',
	["Ѡ"] = 'O', ["ѡ"] = 'o', -- Contrastive with "О".
	["Ѿ"] = 'Ot', ["ѿ"] = 'ot', -- Becomes "otŭ" as appropriate.
	["Ѽ"] = 'Ô', ["ѽ"] = 'ô',
	["Ц"] = 'C', ["ц"] = 'c',
	["Ꙡ"] = 'Ć', ["ꙡ"] = 'ć', -- From a merger of "Ц" and "Ч" in Old Novgorodian.
	["Ч"] = 'Č', ["ч"] = 'č',
	["Џ"] = 'Dž', ["џ"] = 'dž',
	["Ш"] = 'Š', ["ш"] = 'š',
	["Щ"] = 'Št', ["щ"] = 'št',
	["Ъ"] = 'Ŭ', ["ъ"] = 'ŭ',
	["Ꙑ"] = 'Y', ["ꙑ"] = 'y',
	["Ь"] = 'Ĭ', ["ь"] = 'ĭ',
	["Ѣ"] = 'Ě', ["ѣ"] = 'ě',
	["Ꙓ"] = 'Jě', ["ꙓ"] = 'jě',
	["Ю"] = 'Ju', ["ю"] = 'ju',
	["Ꙗ"] = 'Ja', ["ꙗ"] = 'ja',
	["Ѥ"] = 'Je', ["ѥ"] = 'je',
	["Ѧ"] = 'Ę', ["ѧ"] = 'ę',
	["Ѫ"] = 'Ǫ', ["ѫ"] = 'ǫ',
	["Ѩ"] = 'Ję', ["ѩ"] = 'ję',
	["Ѭ"] = 'Jǫ', ["ѭ"] = 'jǫ',
	["Ꙛ"] = 'Œ', ["ꙛ"] = 'œ', -- Becomes "œ̨".
	["Ѯ"] = 'Ks', ["ѯ"] = 'ks',
	["Ѱ"] = 'Ps', ["ѱ"] = 'ps',
	["Ѳ"] = 'Θ', ["ѳ"] = 'θ',
	["Ѵ"] = 'Ü', ["ѵ"] = 'ü',
	["Ѷ"] = 'Ü', ["ѷ"] = 'ü', -- Contrastive with "Ѵ".
	["Ҁ"] = 'Q', ["ҁ"] = 'q',
	[psili] = '', -- The smooth breathing mark is dropped from the output.
	[kamora] = circumflex, -- The kamora is output as a combining circumflex.
}

-- Maps variant letterforms to the canonical Cyrillic letter (optionally
-- followed by the palatalization mark) that the transliteration tables are
-- keyed on. export.tr applies this table to every character before any other
-- rule runs. Entries with a single mapping have only a lowercase form listed.
local variants = {
	["ᲀ"] = 'в',
	["Ґ"] = 'Г', ["ґ"] = 'г',
	-- ["Ђ"] = 'Ꙉ', ["ђ"] = 'ꙉ',
	["ᲁ"] = 'д',
	["Ꙣ"] = 'Д' .. palatalization, ["ꙣ"] = 'д' .. palatalization,
	["Є"] = 'Е', ["є"] = 'е',
	["Э"] = 'Е', ["э"] = 'е',
	["Ꙃ"] = 'Ѕ', ["ꙃ"] = 'ѕ',
	["Ꙅ"] = 'Ѕ', ["ꙅ"] = 'ѕ',
	["Ꙁ"] = 'З', ["ꙁ"] = 'з',
	["Ӥ"] = 'И', ["ӥ"] = 'и',
	["Ї"] = 'І', ["ї"] = 'і',
	["Ꙇ"] = 'І', ["ꙇ"] = 'і',
	-- ["Ћ"] = 'Ꙉ', ["ћ"] = 'ꙉ',
	["Ꙥ"] = 'Л' .. palatalization, ["ꙥ"] = 'л' .. palatalization,
	["Ꙧ"] = 'М' .. palatalization, ["ꙧ"] = 'м' .. palatalization,
	["Ҥ"] = 'Н' .. palatalization, ["ҥ"] = 'н' .. palatalization,
	["Ѻ"] = 'О', ["ѻ"] = 'о',
	["Ꙩ"] = 'О', ["ꙩ"] = 'о',
	["Ꙫ"] = 'О', ["ꙫ"] = 'о',
	["Ꚛ"] = 'О', ["ꚛ"] = 'о',
	["Ꙭ"] = 'Ꚙ', ["ꙭ"] = 'ꚙ',
	["ꙮ"] = 'о',
	["ᲂ"] = 'о',
	["ᲃ"] = 'с',
	["ᲄ"] = 'т',
	["ᲅ"] = 'т',
	["Ѹ"] = 'Ꙋ', ["ѹ"] = 'ꙋ', ["ᲈ"] = 'ꙋ',
	["Ꙍ"] = 'Ѡ', ["ꙍ"] = 'ѡ',
	["Ы"] = 'Ꙑ', ["ы"] = 'ꙑ',
	["ᲆ"] = 'ъ',
	["ᲇ"] = 'ѣ',
	["Я"] = 'Ꙗ', ["я"] = 'ꙗ',
	["Ꙕ"] = 'Ю', ["ꙕ"] = 'ю',
	["Ꙙ"] = 'Ѧ', ["ꙙ"] = 'ѧ',
	["Ꙝ"] = 'Ѩ', ["ꙝ"] = 'ѩ',
}

-- Substitutions applied as the very last step of export.tr, after the main
-- letter table. They cover output that is not a single precomposed character
-- but has to behave like one while the text is being processed, plus marks
-- that are only converted at the end.
local final_substitutions = {
	-- Placeholder ligatures receive their combining mark here.
	["Œ"] = "Œ̨",
	["œ"] = "œ̨",
	-- Titlo and vzmet are both written as a colon.
	[titlo] = ":",
	[vzmet] = ":",
}

-- Word-initial iotation: each key is replaced by its iotated counterpart when
-- it is the first letter of a word.
local common_iotated_initial = {
	["Ѣ"] = "Ꙓ",
	["ѣ"] = "ꙓ",
}

-- Iotation after a vowel or a palatalization mark: each key is replaced by
-- its iotated counterpart in that position.
local common_iotated_after_vowel_or_soft = {
	["Е"] = "Ѥ",
	["е"] = "ѥ",
	["Ѣ"] = "Ꙓ",
	["ѣ"] = "ꙓ",
	["Ѧ"] = "Ѩ",
	["ѧ"] = "ѩ",
}

-- Per-language tables, keyed by language code. A language with no entry in
-- one of the first three falls back to the matching common_* table in
-- export.tr; a language with no entry in uo_is_u skips the "уо" -> "у" rule.
local lang_letters = {}
local lang_iotated_initial = {}
local lang_iotated_after_vowel_or_soft = {}
local uo_is_u = {}

-- Old East Slavic
do
	local orv_letters = {
		["Щ"] = "Šč",
		["щ"] = "šč",
	}
	lang_letters["orv"] = setmetatable(orv_letters, {__index = common_letters})

	local orv_initial = {
		["Е"] = "Ѥ",
		["е"] = "ѥ",
		["Ѧ"] = "Ѩ",
		["ѧ"] = "ѩ",
	}
	lang_iotated_initial["orv"] = setmetatable(orv_initial, {__index = common_iotated_initial})
end

-- Old Novgorodian
do
	local ono_letters = {
		["Ц"] = "Cʹ",
		["ц"] = "cʹ",
		["Ч"] = "Cʹ",
		["ч"] = "cʹ",
		["Щ"] = "Sʹcʹ",
		["щ"] = "sʹcʹ",
	}
	lang_letters["zle-ono"] = setmetatable(ono_letters, {__index = common_letters})

	-- Shares the Old East Slavic word-initial iotation table.
	lang_iotated_initial["zle-ono"] = lang_iotated_initial["orv"]
	uo_is_u["zle-ono"] = true
end

-- Old Pskovian: every table here extends the Old Novgorodian behaviour.
do
	local ops_letters = {
		["Ж"] = "Zʹ",
		["ж"] = "zʹ",
		["Ѕ"] = "Dzʹ",
		["ѕ"] = "dzʹ",
		["З"] = "Zʹ",
		["з"] = "zʹ",
		["С"] = "Sʹ",
		["с"] = "sʹ",
		["Џ"] = "Dzʹ",
		["џ"] = "dzʹ",
		["Ш"] = "Sʹ",
		["ш"] = "sʹ",
		["Щ"] = "Sʹk",
		["щ"] = "sʹk",
		["Ѣ"] = "Æ",
		["ѣ"] = "æ",
		["Ꙓ"] = "Jæ",
		["ꙓ"] = "jæ",
	}
	lang_letters["zle-ops"] = setmetatable(ops_letters, {__index = lang_letters["zle-ono"]})

	local ops_initial = {
		["Ѫ"] = "Ѭ",
		["ѫ"] = "ѭ",
	}
	lang_iotated_initial["zle-ops"] = setmetatable(ops_initial, {__index = lang_iotated_initial["zle-ono"]})

	local ops_after_vowel_or_soft = {
		["Ѫ"] = "Ѭ",
		["ѫ"] = "ѭ",
	}
	lang_iotated_after_vowel_or_soft["zle-ops"] = setmetatable(ops_after_vowel_or_soft, {__index = common_iotated_after_vowel_or_soft})

	uo_is_u["zle-ops"] = true
end

-- gsub callback for an izhitsa that follows a vowel in a diphthong.
-- `prev` is the preceding vowel together with any accents and is kept as is;
-- `v` is the izhitsa, which is swapped for "В" when it is the capital "Ѵ" and
-- for "в" otherwise.
local function handle_v(prev, v)
	local consonant
	if v == "Ѵ" then
		consonant = "В"
	else
		consonant = "в"
	end
	return prev .. consonant
end

-- gsub callback that collapses the digraph "оу" into a single "у".
-- `o` is the leading "о", whose case decides the case of the result; `ac` is
-- the run of accents that followed it, which is re-attached to the result.
local function handle_ou(o, ac)
	local is_lowercase = ulower(o) == o
	if is_lowercase then
		return "у" .. ac
	end
	return "У" .. ac
end

-- Returns true when `ch` is empty or is one of the letters in `vowels`.
-- The lookup is a plain-text search, so `ch` is never interpreted as a Lua
-- pattern. The empty string counts as a match so that a missing second base
-- character defers the decision to the first one.
local function is_vowel_or_empty(ch)
	return ch == "" or vowels:find(ch, 1, true) ~= nil
end

-- gsub callback that renders a rough breathing mark as "h".
-- `base1` is the first alphanumeric character before the mark, `base2` the
-- optional second one (may be ""), and `diacritics` whatever non-alphanumeric,
-- non-space characters sat between them and the breathing mark.
-- Returns the replacement text, with the breathing mark itself removed:
--   * `base2` is a non-vowel: "h" is appended after the diacritics;
--   * `base2` is a vowel and `base1` is not: "h" is inserted between them,
--     taking over the case of `base2`, which is then lowercased;
--   * otherwise (`base1` is a vowel): "h" is prefixed, in the case of `base1`.
local function handle_rough_breathing(base1, base2, diacritics)
	local base2_lower = ulower(base2)
	if not is_vowel_or_empty(base2_lower) then
		return base1 .. base2 .. diacritics .. "h"
	end
	local base1_lower = ulower(base1)
	if not is_vowel_or_empty(base1_lower) then
		return base1 .. (base2_lower == base2 and "h" or "H") .. base2_lower .. diacritics
	end
	return (base1_lower == base1 and "h" or "H") .. base1 .. base2 .. diacritics
end

-- Transliterates the single character at code point index `pos` of `text`
-- with the `letters` table and returns the result in NFD. A `pos` below 1
-- is treated as "no character" and yields the empty string.
local function translit_char_at(text, pos, letters)
	local ch = pos < 1 and "" or usub(text, pos, pos)
	-- Assigning to a single local keeps only gsub's first result, so the
	-- match count is not forwarded to toNFD as a stray second argument.
	local translit = ch:gsub(".[\128-\191]*", letters)
	return toNFD(translit)
end

-- Transliterates Old Cyrillic text into Latin script.
--   text: the text to transliterate.
--   lang: language code; selects the language-specific letter and iotation
--         tables, falling back to the common ones when there are none.
--   sc:   script code; when omitted it is detected from `text` for `lang`.
-- Returns the transliteration in NFC, or nil when the script is not "Cyrs".
-- Raises an error if any breathing mark is left in the text after the rough
-- breathing rule has been applied.
-- The order of the steps below matters: each rule works on the output of the
-- previous ones.
function export.tr(text, lang, sc)
	if not sc then
		sc = require("Module:languages").getByCode(lang, nil, true):findBestScript(text):getCode()
	end
	if sc ~= "Cyrs" then
		return nil
	end
	
	-- Kept untouched for the error message below.
	local input = text
	
	-- Decompose any acute and grave accents: everything is decomposed, then
	-- every run that contains neither accent is recomposed.
	text = ugsub(toNFD(text), "[^" .. acute .. grave .. "]+", toNFC)
	
	-- Canonicalize any variants. The pattern matches one UTF-8 encoded
	-- character (a leading byte plus its continuation bytes).
	text = text:gsub(".[\128-\191]*", variants)

	-- Transliterate the palatalization mark as prime.
	text = text:gsub(palatalization, "ʹ")
	
	-- Treat "Ѵ" as the consonant "В" (transliterated "V") in diphthongs that
	-- correspond to Ancient Greek "αυ", "ευ" and "ηυ" (equivalent to "аѵ", "еѵ"
	-- and "иѵ").  Note that "ιυ" ("іѵ") is not a diphthong, and "ου" ("оѵ") is
	-- a long vowel. However, this doesn't apply to "Ѷ", as the diacritic means
	-- it must be treated as a vowel.
	text = ugsub(text, "([аАеЕиИꙗꙖѥѤ]" .. accent .. ")([ѵѴ])", handle_v)
	
	local letters = lang_letters[lang] or common_letters
	
	-- Convert "ѿ" to "ѡт" if followed by a non-iotated vowel (including those
	-- which iotate only after vowels) or a palatalization mark, and "ѡтъ" in
	-- all other cases.
	-- Inside the callbacks below, `text` is still the string being scanned
	-- (the assignment happens only after ugsub returns), so the captured
	-- positions index into it correctly.
	text = ugsub(text, "([ѿѾ])(" .. accent .. ")()", function(ot, ac, loc)
		ot = (ot == "Ѿ" and "Ѡ" or "ѡ") .. ac .. "т"
		local nxt = translit_char_at(text, loc, letters)
		if not umatch(nxt, "^" .. vowel_or_soft) then
			ot = ot .. "ъ"
		end
		return ot
	end)
	
	-- Handle any vowels which are iotated at the start of words.
	local iotated_initial = lang_iotated_initial[lang] or common_iotated_initial
	-- Not possible to input iotated_initial directly, as mw.ustring.gsub
	-- doesn't respect metamethods...
	text = ugsub(text, "%f[%w].", function(m)
		return iotated_initial[m]
	end)
	
	-- Handle any vowels which are iotated after another vowel or a
	-- palatalization mark.
	local iotated_after_vowel_or_soft = lang_iotated_after_vowel_or_soft[lang] or common_iotated_after_vowel_or_soft
	text = ugsub(text, "()(" .. accent .. ")(.)", function(loc, ac, letter)
		local iotated = iotated_after_vowel_or_soft[letter]
		if iotated then
			-- Look at the transliteration of the character just before the
			-- match; at the start of the text there is none.
			local prev = translit_char_at(text, loc - 1, letters)
			if umatch(prev, vowel_or_soft .. "%W*$") then
				return ac .. iotated
			end
		end
	end)
	
	-- Treat "ъі" as "ꙑ", and make "ъ" tense ("ŷ") before "и" or an iotated
	-- vowel.
	text = ugsub(text, "([Ъъ])(" .. accent .. ")()([иИіІ]?)", function(yer, ac, loc, i)
		local nxt = translit_char_at(text, loc, letters):match("^[iIjJ]")
		if nxt ~= nil then
			return (yer == "Ъ" and "Ꙑ" or "ꙑ") .. ((i == "і" or i == "І") and ac or circumflex .. ac .. i)
		end
	end)
	
	-- In some languages, treat "уо" ("uo") as "у" ("u").
	if uo_is_u[lang] then
		-- Not "ꚙ", which is an orthographically doubled "о".
		text = ugsub(text, "([уУѵѴѷѶ]" .. accent .. ")[оО]", "%1")
	end
	
	-- Treat "оу" ("ou") as "у" ("u") (but not "ꚙ").
	text = ugsub(text, "([оО])(" .. accent .. ")[уУѵѴѷѶ]", handle_ou)
	
	-- Substitute any numbers.
	for key, repl in pairs(numbers) do
		text = ugsub(text, key, repl)
	end

	-- Main substitution.
	text = text:gsub(".[\128-\191]*", letters)
	
	-- Handle any rough breathing marks.
	-- FIXME: this can't handle various edge cases.
	text = ugsub(toNFD(text), "(%w)(%w?)([^%w%s]*)[" .. dasia .. "]", handle_rough_breathing)
	
	-- Any breathing mark still present could not be attached to a letter.
	if umatch(text, "[" .. breathing .. "]") then
		error("Invalid breathing marks in input " .. mw.dumpObject(input))
	end
	
	-- Final substitutions.
	text = text:gsub(".[\128-\191]*", final_substitutions)

	return toNFC(text)
end

return export