-- Module:aii-translit/sandbox
--
-- Source: Wiktionary, the free dictionary.

-- Table of functions this module hands back to its caller.
local export = {}

-- Short aliases for the mw.ustring helpers used throughout this file.
local U, rsub, rmatch = mw.ustring.char, mw.ustring.gsub, mw.ustring.match

-- Vowel marks, built from their code points.
local hbasa, rwaha = U(0x73C), U(0x73F)
local zlama_angular, zlama_horizontal = U(0x739), U(0x738)
local pthaha, zqapha = U(0x732), U(0x735)

-- All six vowel marks joined into one string, and a pattern capturing any single one of them.
local diacritic_vowels = table.concat({ hbasa, rwaha, zlama_angular, zlama_horizontal, pthaha, zqapha })
local diacritic_vowels_capture = "([" .. diacritic_vowels .. "])"

local talqana_above, combining_diaeresis = U(0x747), U(0x308)

-- The consonants that can stand for vowels (matres lectionis) are declared from code points rather than
-- typed as literals: when right-to-left and left-to-right characters share a line, an IDE and wiktionary
-- can display them in different orders, e.g. "ܘ" .. "ܐ" on wiktionary may show up as "ܐ" .. "ܘ" in an IDE.
local alaph, waw, yudh = U(0x710), U(0x718), U(0x71D)

local combining_tilde_below, combining_tilde_above = U(0x330), U(0x303)
local combining_macron_below, combining_macron = U(0x331), U(0x304)
local qushshaya, rukkakha = U(0x741), U(0x742)
local combining_breve_below = U(0x32E)

local combining_dot_below, combining_dot_above = U(0x323), U(0x307)


-- Latin snippets that some substitutions emit and later substitutions match against.
local TR_THIRD_PERSON_FEM_SUFFIX = "ōh"
local TR_WAW_PLUS_RVASA, TR_WAW_PLUS_RVASA_SHORT = "ū", "u"

-- Punctuation lookup handed to rsub as a replacement table: each key is rewritten to its value.
local tt_transpose_punc = {
	-- left/right single/double quotes swap sides
	["“"] = "”",
	["”"] = "“",
	["‘"] = "’",
	["’"] = "‘",
	["؟"] = "?", -- question mark
	["«"] = "“", -- quotation mark
	["»"] = "”", -- quotation mark
	["،"] = ",", -- comma
	["؛"] = ";", -- semicolon
}

-- Every key of the table above joined into one string. pairs() gives no ordering guarantee, which is
-- acceptable because the string is only ever placed inside a character class.
local tt_transpose_punc_keys
do
	local marks = {}
	for mark in pairs(tt_transpose_punc) do
		marks[#marks + 1] = mark
	end
	tt_transpose_punc_keys = table.concat(marks)
end

-- { pattern, replacement } pairs that tr() feeds to rsub, in order, right after padding the text with '#'.
local fix
do
	-- character class holding every punctuation mark that gets fenced off with '#'
	local punctuation_class = "[" .. tt_transpose_punc_keys .. "()!.:\"']"

	fix = {
		-- a vowel mark followed by qushshaya is reordered to qushshaya followed by the vowel mark
		{ diacritic_vowels_capture .. qushshaya, qushshaya .. "%1" },

		-- under the hood mw uses NFC, which preempts the following two substitutions...
		-- feel free to uncomment if there's a test case which can be added that requires them to be uncommented
		-- https://www.mediawiki.org/wiki/Unicode_normalization_considerations
		-- { diacritic_vowels_capture .. rukkakha, rukkakha .. "%1"},
		-- { diacritic_vowels_capture .. combining_tilde_below, combining_tilde_below .. "%1" },

		-- partition punctuation marks so "starts with" and "ends with" substitutions work
		{ "(" .. punctuation_class .. ")", "#%1#" },
	}
end

-- First-pass lookup: one Syriac consonant letter -> its Latin transliteration.
-- Handed to rsub as a replacement table, so letters missing from it are left untouched.
local tt = {
	["ܦ"] = "p",
	["ܒ"] = "b",
	["ܬ"] = "t",
	["ܛ"] = "ṭ",
	["ܕ"] = "d",
	["ܟ"] = "k",
	["ܓ"] = "g",
	["ܩ"] = "q",
	["ܣ"] = "s",
	["ܨ"] = "ṣ",
	["ܙ"] = "z",
	["ܫ"] = "š",
	["ܚ"] = "ḥ",
	["ܥ"] = "ˁ",
	["ܗ"] = "h",
	["ܡ"] = "m",
	["ܢ"] = "n",
	["ܪ"] = "r",
	["ܠ"] = "l",
}

-- local tt_keys = ''
-- for key, _ in pairs(tt) do tt_keys = tt_keys .. key end

-- All Latin values of tt joined into one string (order unspecified; it is only used to build character classes).
local tt_values
do
	local latin = {}
	for _, letter in pairs(tt) do
		latin[#latin + 1] = letter
	end
	tt_values = table.concat(latin)
end

-- Character-class captures that tr() matches immediately before combining_macron_below (mhagjana)
-- and combining_macron (marhetana) respectively.
local mhagjana_capture, marhetana_capture
do
	-- rsub returns (string, count); the surrounding parentheses keep only the string
	local mhagjana_latin = (rsub('ܗܠܡܢܥܪ', ".", tt))
	local marhetana_latin = (rsub('ܦܒܬܛܕܟܓܩܣܨܙܫܚ', ".", tt))

	mhagjana_capture = "([" .. mhagjana_latin .. alaph .. yudh .. waw .. "])"
	marhetana_capture = "([" .. marhetana_latin .. "])"
end

-- Letters that can be written as one-letter prefixes:
-- https://r12a.github.io/scripts/syrc/aii.html#single_letter_words
local bdul = 'ܒܕܘܠ'
-- captures one such letter
local bdul_capture = "([" .. bdul .. "])"
-- captures two such letters in a row (two separate captures)
local bdul_capture2 = bdul_capture .. bdul_capture

-- local alphabet = ''
-- for letter, _ in pairs(tt) do alphabet = alphabet .. letter end
-- alphabet = alphabet .. yudh .. waw .. alaph
-- local alphabet_capture = '([' .. alphabet .. '])'

-- Second-pass lookup handed to rsub as a replacement table near the end of tr():
-- whatever vowel marks and glide letters are still in Syriac script at that point become Latin.
local tt_next = {
	-- vowel marks
	[pthaha] = "a", [zqapha] = "ā",
	[zlama_horizontal] = "i", [zlama_angular] = "ē",

	-- glides read as consonants
	[yudh] = "y", [waw] = "w",
}

-- unvoweled, original values of matres lectionis (consonants representing vowels)
local glides = alaph .. yudh .. waw

-- Latin consonants: every value of tt plus the extra letters listed here.
-- NOTE(review): the list appears to contain plain "c", while tr() emits "č" and "j" for the
-- tilde-marked letters; confirm whether "č"/"j" are meant to be members of this class.
local consonants_minus_glides = tt_values .. "cžfḇṯḏḵḡ"

-- Latin vowels, grouped by the glide that tr() inserts after them when another vowel follows.
local vowels_w = TR_WAW_PLUS_RVASA_SHORT .. TR_WAW_PLUS_RVASA .. "ō"
local vowels_y = "eiēī"
local vowels = vowels_y .. vowels_w .. "aā"

local consonants_minus_glides_cg, consonants_capture, consonants_and_vowels_capture
do
	-- wrap a set of characters into a single-character capture
	local function capture_any(chars)
		return "([" .. chars .. "])"
	end

	local consonants = glides .. consonants_minus_glides

	consonants_minus_glides_cg = capture_any(consonants_minus_glides)
	consonants_capture = capture_any(consonants)
	-- local consonants_capture_minus_alaph = "([" .. yudh .. waw .. consonants_minus_glides .. "])"
	consonants_and_vowels_capture = capture_any(consonants .. vowels)
end

-- Suffix strings (still in Syriac script) that tr() tries, one at a time, after every stem listed in
-- genitive_endings_special_cases. Each is concatenated into a pattern of the form  stem (ending) #.
local genitive_endings = {"ܵܐ", "ܝܼ", "ܘܼܟ݂", "ܵܟ݂ܝ", "ܹܗ", "ܵܗ̇", "ܘܼܗܝ" , "ܘܿܗ̇" , "ܘܼܗ̇" , "ܲܢ", "ܵܘܟ݂ܘܿܢ", "ܗܘܿܢ", "ܵܝܗܝ" , "ܹ̈ܐ" , "ܹܐ" , "ܲܝ" , "ܲܝ̈" , "ܲܬ݂" , "ܵܬ݂̈" , "ܵܬ݂" , "ܝܼ̈", "ܘܼ̈ܟ݂", "ܵܟ݂ܝ̈", "ܘܼ̈ܗܝ", "̈ܘܿܗ̇", "ܘܼ̈ܗ̇" , "ܲܢ̈", "ܵܘ̈ܟ݂ܘܿܢ", "ܗ̈ܘܿܢ" , "ܵܝ̈ܗܝ"}
-- Per "No alternation (the | operator)" in https://www.mediawiki.org/wiki/Extension:Scribunto/Lua_reference_manual#Patterns
-- Lua patterns cannot express "this, or that, or other" as (this|that|other).
-- This means we can't simply check whether a word ends in any of the endings above with one capture group like
-- local genitive_endings_capture = "(" .. table.concat(genitive_endings,"|") .. ")"
-- which is why tr() loops over the endings instead.
--
-- Keys are Syriac stems; values are the Latin text that replaces the stem when it is directly followed
-- by one of genitive_endings at the end of a word. The ending itself is kept for later rules to transliterate.
local genitive_endings_special_cases = {
	["ܒܵܬ"] = "bātt",
	["ܚܘܼܒ"] = "ḥubb",
	["ܓܘܼܕ"] = "gudd",	["ܓܘܼܕܵܢ"] = "guddān",
	["ܦܘܼܡ"] = "pumm",
	["ܪܘܼܙ"] = "ruzz",
	["ܐܸܣܦܘܼܓ"] = "ispugg",
	["ܐܸܛܪܘܼܓ"] = "iṭrugg",	["ܐܸܛܪ̈ܘܼܓ"] = "iṭrugg",
	["ܫܹܢ݇ܬ"] = "šētt",
}

-- Hard-coded { pattern, replacement } pairs that tr() feeds to rsub, in list order, before the general rules run.
-- Due to runtime performance concerns, we should strive to limit special cases to only the most common
-- terms for which the transliterator is way off.
local special_cases = {
	-- { aii_text, aii_translit_output }
	--
	-- the # symbol pads the start and end of a word, consider the following examples
	-- #float#    only float matches
	-- #float     words starting with float like float or floats match
	-- float#     words ending with float like float or afloat match
	-- float      words containing float like float, floats, afloat and refloats match
	{"#ܒܗ" .. combining_dot_above .. "ܝ#", "#b-ay#"},
	{"ܗ" .. combining_dot_above .. "ܝ#", "aya#"},
	{"ܗ" .. combining_dot_above .. "ܘ#", "awa#"},
	{"ܡ" .. combining_dot_above .. "ܢ#", "man#"},
	{"ܡ" .. combining_dot_below .. "ܢ#", "min#"},
	-- commenting this out for now until test cases can be added for it
	-- {waw .. hbasa .. "ܟ݂#", "ōḵ#"}, {waw .. hbasa .. combining_diaeresis ..  "ܟ݂#", "ōḵ#"},
	{"ܟ̰ܵܐܝ", "čāy"},
	{"ܒܵܐܝ", "bāy"},
	{"ܐܲܦ̮ܘܿܟܵܕ", "avōkād"},
	{"ܝܼܫܘܿܥ#", "īšōˁ#"}, -- starts with vowel but not silent alaph
	-- "to be" without initial khwasa, ì
	{"#ܝܘܸܢ#", "#ìwen#"}, {"#ܝܘܵܢ#", "#ìwān#"},
	{"#ܝܘܲܚ#", "#ìwaḥ#"}, {"#ܝܘܸܬ#", "#ìwet#"},
	{"#ܝܘܵܬܝ#", "#ìwāt#"}, {"#ܝܬܘܿܢ#", "#ìtōn#"},
	{"#ܝܠܹܗ#", "#ìlēh#"}, {"#ܝܠܵܗ̇#", "#ìlāh#"},
	{"#ܝܢܵܐ#", "#ìnā#"}, {"#ܝܗ݇ܘܵܐ#", "#ìwā#"},
	{"#ܝܗ݇ܘܵܬ݇#", "#ìwā#"}, {"#ܝܗ݇ܘܵܘ#", "#ìwā#"},
	-- "to be" with initial khwasa, ī
	-- https://en.wiktionary.org/wiki/Template:aii-conj-verb/hawe
	{"ܝܼܘܸܢ#", "īwen#"}, {"ܝܼܘܵܢ", "īwān"},
	{"ܝܼܘܸܬ#", "īwet#"}, {"ܝܼܘܵܬܝ#", "īwāt#"},
	{"ܝܼܠܹܗ#", "īlēh#"}, {"ܝܼܠܵܗ̇#", "īlāh#"},
	{"ܝܼܘܲܚ#", "īwaḥ#"}, {"ܝܼܬܘܿܢ#", "ītōn#"}, {"ܝܼܢܵܐ#", "īnā#"},
	-- "to be" blends
	{"ܝܼܗ݇ܘܵܐ#", "īwā#"}, {"ܝܼܗ݇ܘܵܘ#", "īwā#"},
	-- "to be" imperative forms
	-- following substitutions starting with '#w' are to pre-empt 'w-' prefixing rule
	{"#ܗ݇ܘܝܼ", "#wī"}, {"#ܗ݇ܘܹܝܡܘܼܢ#", "#wēmūn#"},
	-- "to be" past participles
	{"#ܗ݇ܘܵܐ#", "#wā#"}, {"#ܗ݇ܘܵܘ#", "#wā#"}, {"#ܗ݇ܘܹܐ#", "#wē#"},
	-- "of" (fixed expressions)
	{"#ܕܝܵܡܵܐ#", "#d-yāmā#"}, {"#ܕܠܵܐ#", "#d-lā#"},
	{"#ܕܕܸܒ݂ܫܵܐ#", "#d-diḇšā#"}, {"#ܕܲܕܡܵܐ#", "#da-dmā#"},
	-- "to" (fixed expressions)
	{"ܠܒܲܕܲܪ#" , "l-baddar#"}, {"ܠܓܵܘܵܐ#", "l-gāwā#"},
	{"#ܠܡܵܐ#" , "#lemā#"},
	-- "per" (fixed expressions with time)
	{"ܒܕܲܩܝܼܩܵܐ#", "b-daqqīqā#"}, {"ܒܪܦܵܦܵܐ#", "b-rpāpā#"},
	{"ܒܫܵܥܬ݂ܵܐ#", "b-šāˁṯa#"}, {"ܒܫܵܥܲܬ݂#", "b-šāˁaṯ#"},
	{"ܒܝܵܘܡܵܐ#", "b-yāwmā#"}, {"ܒܝܘܿܡ#", "b-yōm#"},
	{"ܒܫܵܒ݂ܘܿܥܵܐ#", "b-šāḇōˁā#"}, {"ܒܡܸܬ݂ܚܵܐ#", "b-miṯḥā#"},
	{"ܒܝܲܪܚܵܐ#", "b-yarḥā#"}, {"ܒܝܼܪܲܚ#", "b-yraḥ#"},
	{"ܒܫܹܢ݇ܬܵܐ#", "b-šēttā#"}, {"ܒܫܹܢ݇ܬ#", "b-šēt#"},
	{"ܒܕܵܘܪܵܐ#", "b-dāwrā#"}, {"ܒܠܲܝܠܹܐ#", "b-laylē#"},
	{"ܒܨܲܦܪܵܐ#", "b-ṣaprā#"}, {"ܒܪܲܡܫܵܐ#", "b-ramšā#"},
	-- adverbs with clitics (fixed expressions)
	{"ܠܩܘܼܪܒܵܐ#", "l-qurbā#"}, {"ܒܩܘܼܪܒܵܐ#", "b-qurbā#"}, {"ܒܚܲܪܬܵܐ#", "b-ḥartā#"},
	{"ܒܟܠ#", "b-kul#"}, {"ܕܗܵܘܝܵܐ#", "d-hāwyā#"}, {"ܕܟܹܐ#", "d-kē#"},
	-- The next two rules attach "d-" to the word that follows the adverb. By the time they run, tr() has
	-- rewritten every space as "# #" and the rules above have already turned the adverb into Latin, so the
	-- patterns are written against that intermediate form ("%-" is a literal hyphen in a pattern).
	-- Written with the raw Syriac adverb and a bare space they could never match.
	{"l%-qurbā# #ܕ", "l-qurbā# #d-"},
	{"b%-miṯḥā# #ܕ", "b-miṯḥā# #d-"},
	-- "all", "each", "every"
	{"ܟܠ#", "kul#"}, {"ܟܠܵܢ#", "kullān#"},
	{"ܟܠܘܼܟ݂#", "kullōḵ#"}, {"ܟܠܵܟ݂ܝ#", "kullāḵ#"},
	{"ܟܠܹܗ#", "kullēh#"}, {"ܟܠܵܗ̇#", "kullāh#"},
	{"ܟܠܘܼܗܝ#", "kullūh#"}, {"ܟܠܘܿܗ̇#", "kullōh#"},
	{"ܟܠܲܢ#", "kullan#"}, {"ܟܠܵܘܟ݂ܘܿܢ#", "kullāwḵōn#"},
	{"ܟܠܵܝܗܝ#", "kullāyh#"}, {"ܟܠܗܘܿܢ#", "kullhōn#"},
	{"ܟܠܵܢܵܐܝܼܬ#", "kullānāˀīt#"}, {"ܟܠܵܢܵܐܝܼܬ݂#", "kullānāˀīṯ#"},
	{"ܟܠܵܢܵܝ", "kullānāy"}, {"ܟܘܿܠܵܝ", "kollāy"},
	{"ܟܠܚܲܕ݇#", "kulḥa#"}, {"ܟܠܚܕ݂ܵܐ#", "kulḥḏā#"},
	{"ܟܠܫܲܢ݇ܬ#", "kulšat#"},
	-- "classical because"
	{"ܡܸܛܠ#", "miṭṭul#"}, {"ܡܸܛܠܬ", "miṭṭult"},
	{"ܡܸܛܠܵܐܝܼܬ݂#", "miṭṭullāˀīṯ#"},
	-- "houses"
	{"ܒܵܬܹ̈ܐ#", "bāttē#"},
	{"ܒܵܬܲܝ̈#", "bāttay#"},
	{"ܒܵܬܲܢ̈#", "bāttān#"},
	{"ܒܵܬܘܼ̈ܟ݂#", "bāttōḵ#"},
	{"ܒܵܬ̈ܗܘܿܢ#", "bātthōn#"},
	-- masc sing construct state rvasa
	{"ܓܘܼܕ#", "gud#"},	{"ܦܘܼܡ#", "pum#"},
	{"ܐܸܛܪܘܼܓ#", "iṭrug#"},	{"ܐܸܣܦܘܼܓ#", "ispug#"},
	-- countries and nationalities
	{"ܒܸܠܓܝܵܐ#" , "belgyā#"}, {"ܒܸܠܓܝܼܩܵܝ", "belgīqāy"},
	{"ܣܹܝܫܸܠܝܼܣ#" , "sēšellīs#"}, {"ܣܹܝܫܸܠ#", "sēšel#"}, {"ܣܹܝܫܸܠܵܝ", "sēšellāy"},
	-- popular slang terms
	{"ܝܲܐܠܵܗ#", "yallāh#"}, {"ܘܲܐܠܵܗ#", "wallāh#"},
	-- feminine imperative forms
	{"ܙܹܠ݇ܝ#", "zē#"}, {"ܬܵܐܝ#", "tā#"},
}

--- Transliterate Syriac-script text into Latin script.
--
-- The text is first padded with '#' at every word edge so rules can anchor on word starts and ends,
-- then rewritten by a fixed sequence of substitutions. The order of the substitutions matters:
-- later rules match the Latin letters and leftover Syriac marks produced by earlier ones.
--
-- @param text string to transliterate
-- @param lang not read by this function
-- @param sc   not read by this function
-- @return the transliteration, or nil when the result contains no hyphen, Latin vowel or space
--         (in which case a "lacking diacritics" tracking hit is recorded)
function export.tr(text, lang, sc)

	-- pad word edges with '#'; the " | " separator is padded first, then every remaining space
	text = rsub(text, " | ", "# | #")
	text = "##" .. rsub(text, " ", "# #") .. "##"
	-- drop the elongation character (presumably tatweel/kashida; confirm)
	text = rsub(text, "ـ", "")
	for _, sub in ipairs(fix) do text = rsub(text, unpack(sub)) end

	-- Special cases
	for _, sub in ipairs(special_cases) do text = rsub(text, unpack(sub)) end

	-- for every special case, apply substitutions for every masc genitive ending
	for aii_prefix, aii_prefix_corrected_tr in pairs(genitive_endings_special_cases) do
		-- Plain byte search first: when the stem does not occur at all, none of the stem+ending
		-- patterns below can match, so the whole inner loop of gsub calls is skipped.
		if string.find(text, aii_prefix, 1, true) then
			for _, masc_genitive_ending in ipairs(genitive_endings) do
				text = rsub(text, aii_prefix .. '(' .. masc_genitive_ending .. ')' .. '#',  aii_prefix_corrected_tr .. '%1#' )
			end
		end
	end

	-- Ignore siyameh
	text = rsub(text, combining_diaeresis, "")

	-- letters modified by a tilde below / above
	text = rsub(text, "ܟ" .. combining_tilde_below, "č")
	text = rsub(text, "ܓ" .. combining_tilde_below, "j")
	text = rsub(text, "ܫ" .. combining_tilde_below, "ž")

	text = rsub(text, "ܙ" .. combining_tilde_above, "ž")
	text = rsub(text, "ܟ" .. combining_tilde_above, "č")
	text = rsub(text, "ܫ" .. combining_tilde_above, "ž")

	text = rsub(text, "ܦ" .. combining_breve_below, "f")

	-- letter + qushshaya
	text = rsub(text, "ܦ" .. qushshaya, "p") -- needs a test case
	text = rsub(text, "ܒ" .. qushshaya, "b")
	text = rsub(text, "ܬ" .. qushshaya, "t")
	text = rsub(text, "ܕ" .. qushshaya, "d")
	text = rsub(text, "ܟ" .. qushshaya, "k")
	text = rsub(text, "ܓ" .. qushshaya, "g")

	-- letter + rukkakha
	text = rsub(text, "ܒ" .. rukkakha, "ḇ")
	text = rsub(text, "ܬ" .. rukkakha, "ṯ")
	text = rsub(text, "ܕ" .. rukkakha, "ḏ")
	text = rsub(text, "ܟ" .. rukkakha, "ḵ")
	text = rsub(text, "ܓ" .. rukkakha, "ḡ")

	-- this covers b-, d-, w-, l- prefixing for words starting with an alaph
	-- https://r12a.github.io/scripts/syrc/aii.html#standalone
	-- and ALL special_cases starting with initial_translit_char
	local initial_translit_char = 'aī' -- accounts for substituted special cases starting with vowel sound
	local initial_char_capture = "([" .. alaph .. initial_translit_char .. "])"
	-- the two-prefix form must be tried before the one-prefix form
	text = rsub(text, "#" .. bdul_capture2 ..  initial_char_capture, "#%1-%2-%3")
	text = rsub(text, "#" .. bdul_capture ..  initial_char_capture, "#%1-%2")

	-- word-final waw + hbasa + he-with-dot-above is handled before the generic waw + hbasa rule below
	text = rsub(text, waw .. hbasa .. "ܗ" .. combining_dot_above .. "#", TR_THIRD_PERSON_FEM_SUFFIX .. "#")

	-- glide letter + vowel mark combinations that read as a single long vowel
	text = rsub(text, yudh .. hbasa, "ī")
	text = rsub(text, waw .. rwaha, "ō")
	text = rsub(text, waw .. hbasa, TR_WAW_PLUS_RVASA)

	-- table lookups: punctuation first, then plain consonants
	text = rsub(text, ".", tt_transpose_punc)
	text = rsub(text, ".", tt)
	text = rsub(text, "#" .. alaph .. "#", "#ˀ#") -- needs a test case

	-- a macron below / above the second of two consonants inserts an "e" before / after it
	text = rsub(text, consonants_capture .. mhagjana_capture .. combining_macron_below .. consonants_capture, "%1e%2%3")
	text = rsub(text, consonants_capture .. marhetana_capture .. combining_macron .. consonants_capture, "%1%2e%3")

	-- a consonant carrying talqana is dropped together with the mark
	text = rsub(text, consonants_capture .. talqana_above, "")

	-- shorten waw + rvasa
	text = rsub(text, TR_WAW_PLUS_RVASA .. consonants_minus_glides_cg .. consonants_minus_glides_cg, TR_WAW_PLUS_RVASA_SHORT .. "%1%2")

	-- doubling consonants
	text = rsub(text, "([" .. zlama_horizontal .. pthaha .. "])" .. consonants_capture .. diacritic_vowels_capture, "%1%2%2%3")
	text = rsub(text, "([" .. zlama_horizontal .. pthaha .. "])" .. consonants_capture .. TR_THIRD_PERSON_FEM_SUFFIX , "%1%2%2" .. TR_THIRD_PERSON_FEM_SUFFIX)
	--

	text = rsub(text, combining_dot_above, "")

	-- yudh between two consonants reads as a vowel
	text = rsub(text, consonants_capture .. zlama_angular .. yudh .. consonants_capture, "%1ē%2")
	text = rsub(text, consonants_capture .. yudh .. consonants_capture, "%1i%2")

	-- word-final yudh directly after a consonant is dropped
	text = rsub(text, consonants_minus_glides_cg .. yudh .. "#", "%1#")

	text = rsub(text, alaph .. pthaha .. waw .. "#", "aw#") -- needs a test case (impossible combination of characters?)
	text = rsub(text, alaph .. pthaha .. yudh .. "#", "ay#") -- needs a test case

	text = rsub(text, "#" .. alaph .. zlama_angular .. yudh, "#ē")  -- needs a test case
	text = rsub(text, "#" .. alaph .. yudh, "#ī") -- needs a test case

	-- word-initial yudh directly before a consonant is dropped
	text = rsub(text, "#" .. yudh .. consonants_capture, "#%1")

	-- alaph: word-final after a vowel mark, bare word-final, word-initial, then everywhere else
	text = rsub(text, pthaha .. alaph .. "#", "a#") -- needs a test case
	text = rsub(text, zlama_angular .. alaph .. "#", "ē#")
	text = rsub(text, zqapha .. alaph .. "#", "ā#") -- needs a test case
	text = rsub(text, alaph .. "#", "ā#") -- needs a test case
	text = rsub(text, "#" .. alaph, "#")
	text = rsub(text, alaph, "ˀ")

	-- word-initial waw followed by a consonant or Latin vowel is split off as the prefix "w-"
	text = rsub(text, "#" .. waw .. consonants_and_vowels_capture, "#w-%1")

	-- remaining glides and vowel marks
	text = rsub(text, ".", tt_next)

	-- drop ˀ between a long vowel and a consonant
	text = rsub(text, "([ēīā])" .. "ˀ" .. consonants_capture, "%1%2")

	-- separate adjacent vowels with the matching glide
	text = rsub(text, "([" .. vowels_w .. "])([" .. vowels .. "])", "%1w%2") -- needs a test case
	text = rsub(text, "([" .. vowels_y .. "])([" .. vowels .. "])", "%1y%2")

	text = rsub(text, "ˁˁ", "ˁ") -- needs a test case
	text = rsub(text, "ˀˀ", "ˀ") -- needs a test case
	text = rsub(text, "-ˀ", "-")

	-- these letters are never left doubled
	text = rsub(text, "ḇḇ", "ḇ")
	text = rsub(text, "ḡḡ", "ḡ")
	text = rsub(text, "ḏḏ", "ḏ")
	text = rsub(text, "ḵḵ", "ḵ")
	text = rsub(text, "p̄p̄", "p̄")
	text = rsub(text, "ṯṯ", "ṯ")

	-- substitutions like this are preferred to be closer to the top, but it was hard to figure out how to do that
	-- shorten waw + rvasa a different way from before per verbal noun of the d stem
	local waw_cg = "([" .. consonants_minus_glides .. "wy])"
	-- the '#' word-boundary markers matched by the pattern are written back, so that any
	-- boundary-anchored rule added after this one still sees them
	text = rsub(text, '#' .. waw_cg .. TR_WAW_PLUS_RVASA .. waw_cg .. 'ā' .. waw_cg .. 'ā#', '#%1' .. TR_WAW_PLUS_RVASA_SHORT .. '%2%2ā%3ā#')

	-- local bdul_capture = '([bdwl])'
	-- text = rsub(text, "#" .. bdul_capture .. "([" .. vowels .. "])", "#%1-%2")

	-- all padding goes away once every rule has run
	text = rsub(text, "#", "")

	-- no hyphen, Latin vowel or space anywhere in the result: treat the input as lacking diacritics
	if not rmatch(text, "([-" .. vowels .. " ])") then
		require("Module:debug").track("aii-translit/lacking diacritics")
		return nil
	end

	return text
end

	
-- Hand the module table (whose tr function is defined above) back to whoever loads this module.
return export