Module:cmn-pron-Nanjing

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module does romanisation conversion, IPA conversion, etc. for Nanjing Mandarin. See {{zh-pron}}.


-- Nanjing Mandarin

local export = {}

-- Initials: internal (encoded) romanisation letter -> IPA.
-- Z, C and S are the one-letter codes that digraph_encode assigns to zh, ch and sh.
local initials = {
	["b"] = "p",
	["p"] = "pʰ",
	["m"] = "m",
	["f"] = "f",
	["d"] = "t",
	["t"] = "tʰ",
	["l"] = "l",
	["g"] = "k",
	["k"] = "kʰ",
	["h"] = "x",
	["j"] = "t͡ɕ",
	["q"] = "t͡ɕʰ",
	["x"] = "ɕ",
	["Z"] = "t͡ʂ",
	["C"] = "t͡ʂʰ",
	["S"] = "ʂ",
	["r"] = "ʐ",
	["z"] = "t͡s",
	["c"] = "t͡sʰ",
	["s"] = "s",
	-- a syllable with no initial consonant
	[""] = "",
}

-- Finals: internal (encoded) romanisation -> IPA.
-- Keys use the one-letter codes U (= ü) and N (= ng); see digraph_decode.
-- E is the code py_normalize substitutes for a toneless "e" after an initial.
local finals = {
	-- open finals
	ii = "z̩", iU = "ʐ̩", i = "i", u = "u", U = "y",
	a = "a", ia = "ia", ua = "ua",
	o = "o",
	e = "e", E = "ə", ie = "ie", Ue = "ye",
	ai = "ɛ", iai = "iɛ", uai = "uɛ",
	ei = "əi", ui = "uəi",
	ao = "ɔ", iao = "iɔ",
	ou = "əɯ", iu = "iəɯ",
	-- finals written with -n
	an = "ã", ian = "iã", uan = "uã",
	ien = "iẽ", Uen = "yẽ",
	-- `in` is a Lua keyword, so the key has to be written in bracket form
	en = "ə̃", ["in"] = "ĩ", un = "uə̃", Un = "yĩ",
	on = "oŋ", ion = "ioŋ",
	-- finals written with -q (all map to a final glottal stop)
	iUq = "ʐ̩ʔ", iq = "iʔ", uq = "uʔ", Uq = "yʔ",
	aq = "aʔ", iaq = "iaʔ", uaq = "uaʔ",
	eq = "əʔ", ieq = "ieʔ", ueq = "ueʔ", Ueq = "yeʔ",
	oq = "oʔ", ioq = "ioʔ",
	-- finals written with -r
	er = "ər", ir = "iər", ur = "uər", Ur = "yər",
	ar = "ar", iar = "iar", uar = "uar",
	-- `or` is a Lua keyword, so the key has to be written in bracket form
	["or"] = "or", ior = "ior",
	ier = "ier",
	air = "ɛr", iair = "iɛr", uair = "uɛr",
	aor = "ɔr", iaor = "iɔr",
	anr = "ãr", ianr = "iãr", uanr = "uãr",
	-- NOTE(review): this entry uses "ɵ̃" whereas `en` is "ə̃" and `er` is "ər";
	-- possibly a typo for "ə̃r" -- confirm against the phonological source.
	enr = "ɵ̃r",
	-- syllabic nasals
	m = "m̩", n = "n̩", N = "ŋ̍",
}

-- Tone values used in the IPA output (digits are superscripted later).
-- One-digit keys are tone numbers. Two-digit keys are "tone of this syllable"
-- followed by "tone of the next syllable"; py_to_ipa appends their value after
-- "⁻" as the sandhi value.
local tones = {
	-- citation tones
	["1"] = "31", -- dark level (yinping, T1)
	["2"] = "24", -- light level (yangping, T2)
	["3"] = "11", -- rising (shang, T3)
	["4"] = "44", -- departing (qu, T4)
	["5"] = "5", -- entering (ru, T5)
	["0"] = "", -- toneless (T0)

	-- sandhi values, keyed by tone .. following tone
	["11"] = "33",
	["25"] = "11",
	["20"] = "11",
	["31"] = "12",
	["33"] = "12",
	["45"] = "42",
	["55"] = "3",
}

-- Replace every tone digit 1-5 in `text` with the matching superscript digit.
-- Other characters (including 0 and 6-9) are left untouched.
-- Returns the converted string only (the gsub match count is dropped).
local function tone_superscript(text)
	local superscript_of = {
		["1"] = "¹",
		["2"] = "²",
		["3"] = "³",
		["4"] = "⁴",
		["5"] = "⁵",
	}
	local converted = text:gsub("[1-5]", superscript_of)
	return converted
end

-- Tone number shown in the "phonetic" respelling when sandhi applies.
-- Key: tone of the syllable followed by the tone of the next syllable.
-- Value: the tone number the syllable is respelled with (see py_process).
local tone_sandhi_num = {
	["11"] = "4",
	["25"] = "3",
	["20"] = "3",
	["31"] = "2",
	["33"] = "2",
	["45"] = "1",
}

-- Internal one-character codes.
-- digraph_encode maps letter digraphs and the UTF-8 byte pairs of the combining
-- tone marks (as they appear after NFD decomposition) to single characters.
local digraph_encode = {
	["ng"] = "N",
	["zh"] = "Z",
	["ch"] = "C",
	["sh"] = "S",
	["\204\128"] = "1", -- U+0300 combining grave accent
	["\204\129"] = "2", -- U+0301 combining acute accent
	["\204\140"] = "3", -- U+030C combining caron
	["\204\132"] = "4", -- U+0304 combining macron
	["\204\138"] = "5", -- U+030A combining ring above
}

-- digraph_decode is the reverse mapping used for output. It additionally maps
-- U to ü, 0 to nothing, and the markers 6 / 7 to an opening / closing
-- highlighted <span>.
local digraph_decode = {
	["N"] = "ng",
	["Z"] = "zh",
	["C"] = "ch",
	["S"] = "sh",
	["U"] = "ü",
	["0"] = "",
	["1"] = "\204\128", -- U+0300 combining grave accent
	["2"] = "\204\129", -- U+0301 combining acute accent
	["3"] = "\204\140", -- U+030C combining caron
	["4"] = "\204\132", -- U+0304 combining macron
	["5"] = "\204\138", -- U+030A combining ring above
	["6"] = '<span style="background-color:#F5DEB3">',
	["7"] = "</span>",
}
-- Convert user-facing romanisation into the internal encoding:
--   * the text is NFD-decomposed so tone marks become separate combining bytes;
--   * each ASCII capital becomes "^" plus its lower-case letter;
--   * u + combining diaeresis (U+0308) becomes U;
--   * digraphs and tone marks are replaced through digraph_encode
--     (pairs without an entry are left as they are);
--   * "n<tone>g" becomes "N<tone>".
local function encode(text)
	local decomposed = mw.ustring.toNFD(text)
	local lowered = decomposed:gsub("[A-Z]", function(capital)
		return "^" .. capital:lower()
	end)
	local with_U = lowered:gsub("u\204\136", "U")
	local coded = with_U:gsub("[bpnzcs\204][vfgh\128\129\132\138\140]", digraph_encode)
	local result = coded:gsub("n([1-5])g", "N%1")
	return result
end
-- Make encoded text readable for error messages: only the letter codes
-- N, Z, C, S, U are expanded; tone digits and markers are left as they are.
local function decode_error(text)
	local readable = text:gsub("[NZCSU]", digraph_decode)
	return readable
end
-- Convert internally encoded text back to display form:
--   * "N<tone>" becomes "n<tone>g" so the tone mark lands on the n;
--   * codes N Z C S U and the characters 1-7 are expanded via digraph_decode;
--   * "^x" becomes the capital letter X;
--   * the result is NFC-normalised.
local function decode(text)
	local expanded = text:gsub("N([0-5])", "n%1g")
	expanded = expanded:gsub("[NZCSU1-7]", digraph_decode)
	expanded = expanded:gsub("%^([a-z])", string.upper)
	return mw.ustring.toNFC(expanded)
end

-- Check that `text` consists only of valid encoded syllables,
-- e.g. ^lan2jin1 ^beq5hua4.
-- Each syllable may be preceded by a space or "/", then by "^" or ">", and
-- must end in a tone digit 0-5. Every well-formed syllable is deleted; anything
-- left over is reported.
-- Raises an error naming the leftover text. The leftover is passed through
-- decode_error so that internal codes (Z, C, S, N, U) are shown as the letters
-- the editor actually typed, consistent with the other error messages.
local function check_syllable_format(text)
	local leftover = text:gsub("[ /]?[%^>]?[bpmfdtlgkhjqxZCSrzcsyw]?[aeiouUmnN][aeiou]*[nq]?r?[0-5]","")
	if leftover ~= "" then
		error("Nanjing: Invalid syllable(s): " .. decode_error(leftover))
	end
end

-- Intended inverse of py_divide_syllables (which round-trips its own result
-- through this function): turn encoded numbered syllables such as
-- "^lan2jin1" back into the diacritic spelling.
--   1. In every syllable the tone digit is moved to directly after the
--      (optional medial i/u/U plus) main vowel; tone 0 is dropped. Finals that
--      do not start with a vowel (m, n, N) get the tone at the end. Each
--      syllable is prefixed with an apostrophe.
--   2. The apostrophe is removed again before a syllable that starts with a
--      consonant followed by a vowel.
--   3. An apostrophe at the start of the text or after a space is removed.
--   4. The result is passed to decode().
local function py_join_syllables(text)
	local function place_tone(initial, final, tone)
		local mark = tone ~= "0" and tone or ""
		local nucleus, coda = final:match("^([iuU]?[aeiouU])(%l*)$")
		if nucleus then
			return "'" .. initial .. nucleus .. mark .. coda
		end
		return "'" .. initial .. final .. mark
	end

	local joined = text:gsub("([bpmfdtlgkhjqxZCSrzcsyw]?)([aeiouUmnN][aeiou]*[nq]?r?)([0-5])", place_tone)
	joined = joined:gsub("'([bpmfdtlgkhjqxZCSrzcsyw][aeiouU])", "%1")
	joined = joined:gsub("%f[^ %z]'", "")
	return decode(joined)
end

-- Split diacritic romanisation into encoded numbered syllables,
-- e.g. Lánjìn Be̊qhuā --> ^lan2jin1 ^beq5hua4
--   1. encode() the text.
--   2. An N standing between a vowel/tone digit and a vowel is re-read as
--      n + apostrophe + g (coda n followed by initial g).
--   3. An apostrophe is placed before every consonant + vowel sequence.
--   4. In each syllable the tone digit is moved to the end ("0" if there is
--      none) and the apostrophe is dropped.
-- The result is validated with check_syllable_format and must convert back to
-- exactly the input through py_join_syllables; otherwise an error is raised
-- telling the editor what the input should be.
local function py_divide_syllables(text)
	local function tone_to_end(head, tone, tail)
		if tone == "" then
			tone = "0"
		end
		return head .. tail .. tone
	end

	local res = encode(text)
	res = res:gsub("([aeiouU1-5])N%f[aeiouU]", "%1n'g")
	res = res:gsub("'?([bpmfdtlgkhjqxZCSrzcsyw][aeiouU])", "'%1")
	res = res:gsub("'?([bpmfdtlgkhjqxZCSrzcsyw]?[iuU]?[aeiouUmnN])([1-5]?)([aeiou]*[nq]?r?)", tone_to_end)

	check_syllable_format(res)

	local rejoined = py_join_syllables(res)
	if rejoined ~= text then
		error("Nanjing: input should be "..rejoined)
	end
	return res
end

-- Render encoded text in numbered form: tone digits 0-5 are wrapped in
-- <sup>...</sup>, then the letter codes and the 6/7 highlight markers are
-- expanded through digraph_decode. The digits are wrapped first, so digits
-- inside the expanded <span> markup are not affected.
local function py_numbered(text)
	local with_sup = text:gsub("[0-5]", "<sup>%0</sup>")
	local rendered = with_sup:gsub("[NZCSU67]", digraph_decode)
	return rendered
end

-- Canonise initial+final spellings so that they follow the pinyin-style
-- writing rules, e.g. jü -> ju. Operates on encoded text (U = ü).
-- "Syllable-initial" below means: at a letter that is not preceded by a letter.
local function py_canonize(text)
	local initial_u_forms = { u = "w", ui = "wei", un = "wen" }
	local initial_i_forms = { i = "y", iu = "you", iU = "rii" }

	-- ü is written u after j, q, x
	local out = text:gsub("([jqx])U", "%1u")
	-- syllable-initial u / ui / un become w / wei / wen
	out = out:gsub("%f[%l%u]u[in]?", initial_u_forms)
	-- a bare w directly before q, r or the tone digit is spelled wu
	out = out:gsub("%f[%l%u]w%f[qr0-5]", "wu")
	-- syllable-initial i / iu / iU become y / you / rii
	out = out:gsub("%f[%l%u]i[uU]?", initial_i_forms)
	-- a bare y directly before n, q, r or the tone digit is spelled yi
	out = out:gsub("%f[%l%u]y%f[nqr0-5]", "yi")
	-- remaining iU is spelled ii
	out = out:gsub("iU", "ii")
	-- syllable-initial ü is spelled yu
	out = out:gsub("%f[%l%u]U", "yu")
	-- a single i after Z/C/S/r is expanded to ii, so that an input such as
	-- "zhi" no longer matches its canonical form and is rejected by the caller
	out = out:gsub("([ZCSr])i%f[qr0-5]", "%1ii")
	-- E is only an internal code for e
	out = out:gsub("E", "e")
	return out
end

-- Normalise encoded text to plain initial+final form, e.g. ju -> jü (jU).
-- The result is fed back through py_canonize; if that does not reproduce the
-- input exactly, the input was not in canonical spelling and an error is
-- raised that shows the expected spelling.
local function py_normalize(text)
	local w_forms = { wu = "u", wei = "ui", wen = "un" }
	local y_forms = { yi = "i", yu = "U", you = "iu" }

	-- u after j, q, x stands for ü
	local res = text:gsub("([jqx])u", "%1U")
	-- wu / wei / wen are the written forms of u / ui / un
	res = res:gsub("w[ue][in]?", w_forms)
	-- any other w is the medial u
	res = res:gsub("w", "u")
	-- syllable-initial yi / yu / you are the written forms of i / ü / iu
	res = res:gsub("%f[%l%u]y[iuo]u?", y_forms)
	-- any other syllable-initial y is the medial i
	res = res:gsub("%f[%l%u]y", "i")
	-- ii after Z/C/S/r is the final iU
	res = res:gsub("([ZCSr])ii", "%1iU")
	-- riU directly before q or the tone digit is the bare final iU
	res = res:gsub("riU%f[q0-5]", "iU")
	-- a toneless e after an initial uses the separate final code E
	res = res:gsub("([bpmfdtlgkhjqxZCSrzcs])e0", "%1E0")

	local canonical = py_canonize(res)
	if canonical ~= text then
		error("Nanjing: "..decode_error(text).." should be "..decode_error(canonical))
	end
	return res
end

-- Convert space-separated, normalised, encoded syllables to an IPA string
-- enclosed in slashes.
-- Each syllable is: optional initial, final, tone digit, and optionally the
-- tone digit of the following syllable. When that second digit is present and
-- the pair has an entry in `tones`, the sandhi value is appended after "⁻".
-- Raises an error for a malformed syllable, an unknown initial or an unknown
-- final (checked in that order).
local function py_to_ipa(text)
	local function syllable_to_ipa(syllable)
		local initial, final, tone, next_tone =
			syllable:match("^([bpmfdtlgkhjqxZCSrzcs]?)([aeiouUEmnN][aeiouU]*[nq]?r?)([0-5])([0-5]?)$")
		if not initial then
			error("Nanjing: Invalid syllable: " .. decode_error(syllable))
		end

		local onset = initials[initial]
		if not onset then
			error("Nanjing: Invalid initial: " .. decode_error(initial))
		end

		local rime = finals[final]
		if not rime then
			error ("Nanjing: Invalid final: " .. decode_error(final))
		end

		local ipa = onset .. rime .. tones[tone]
		if next_tone ~= "" then
			local sandhi = tones[tone .. next_tone]
			if sandhi then
				ipa = ipa .. "⁻" .. sandhi
			end
		end
		return ipa
	end

	local converted = text:gsub("[^ ]+", syllable_to_ipa)
	return "/" .. converted .. "/"
end

-- Process one or more "/"-separated readings written in diacritic romanisation.
-- Returns four strings (the readings in each are joined as noted):
--   1. display text   (diacritic form, ">x" alternatives shown as superscript "→x"; joined with " / ")
--   2. hidden text    (diacritic form, plus " [Phonetic: ...]" when the phonetic form differs; joined with " / ")
--   3. numbered text  (tone numbers in <sup>, plus " [Phonetic: ...]" likewise; joined with " / ")
--   4. IPA            (tone digits superscripted; joined with ", ")
-- Errors raised by the helper functions on invalid input propagate to the caller.
function export.py_process(text)
	local conv_display = {}
	local conv_hidden = {}
	local conv_numbered = {}
	local conv_ipa = {}
	local i = 0
	-- split on a literal "/" (third argument = plain match)
	for reading in mw.text.gsplit(text,"/",true) do
		i = i + 1
		-- encoded numbered syllables, e.g. ^lan2jin1
		reading = py_divide_syllables(reading)
		-- display form: a ">syllable" is shown as superscript "→syllable"
		conv_display[i] = py_join_syllables(reading:gsub(">([a-zZCSUN]+[0-5])","<sup>→%1</sup>"))
		-- for "A>B": `original` keeps A, `phonetic` keeps B wrapped in the
		-- 6...7 markers (expanded to a highlighted <span> by digraph_decode)
		local original = reading:gsub("([a-zZCSUN]+[0-5])>[a-zZCSUN]+[0-5]","%1")
		local phonetic = reading:gsub("[a-zZCSUN]+[0-5]>([a-zZCSUN]+[0-5])","6%17")
		-- drop the capitalisation markers
		phonetic = phonetic:gsub("%^","")
		-- for every syllable that directly follows a tone digit (no space in
		-- between), copy its tone digit in front of it, so the preceding
		-- syllable carries two digits: lan2jin1 -> lan21jin1
		reading = phonetic:gsub("%f[^0-5](7?6?[a-zZCSUN]+)([0-5])","%2%1%2")
		-- two-digit syllables: if the tone pair has a sandhi number, respell the
		-- syllable with it and highlight it; otherwise keep the first digit only
		phonetic = reading:gsub("([a-zZCSUN]+)([0-5])([0-5])",function(a,b,c)
			local d = tone_sandhi_num[b..c]
			return d and ('6'..a..d..'7') or (a..b)
		end)
		-- a syllable ending in r with tone 3 is respelled with tone 2, highlighted
		phonetic = phonetic:gsub("([a-zZCS][a-zU]+)r3","6%1r27")
		-- collapse runs of identical markers produced by the steps above
		phonetic = phonetic:gsub("6+","6"):gsub("7+","7")
		-- same r3 -> r2 change for the IPA input; a digit following "r2" is removed
		reading = reading:gsub("([a-zZCS][a-zU]+)r3","%1r2"):gsub("r2[0-5]","r2")
		-- numbered forms: one space after every syllable, none at the end
		local original_num = original:gsub("([0-5]) ?","%1 "):gsub(" $",""):gsub("%^","")
		local phonetic_num = phonetic:gsub("([0-5]7?) ?","%1 "):gsub(" $","")
		-- a "6" marker means at least one syllable is highlighted in the phonetic form
		if phonetic:find("6") then
			conv_hidden[i] = py_join_syllables(original) .. " [Phonetic: " .. py_join_syllables(phonetic) .. "]"
			conv_numbered[i] = py_numbered(original_num) .. " [Phonetic: " .. py_numbered(phonetic_num) .. "]"
		else
			conv_hidden[i] = py_join_syllables(original)
			conv_numbered[i] = py_numbered(original_num)
		end
		-- IPA input: strip the markers, separate syllables (with their one or
		-- two tone digits) by single spaces, then normalise and convert
		reading = reading:gsub("[67]",""):gsub("([0-5][0-5]?) ?","%1 "):gsub(" $","")
		reading = py_normalize(reading)
		conv_ipa[i] = py_to_ipa(reading)
	end
	return table.concat(conv_display, " / "),
		table.concat(conv_hidden, " / "),
		table.concat(conv_numbered, " / "),
		tone_superscript(table.concat(conv_ipa, ", "))
end

return export