-- Module:hak-pron/sandbox
--
-- From Wiktionary, the free dictionary.

local export = {}
local m_string_utils = require("Module:string utilities")

local gsub = m_string_utils.gsub
local sub = m_string_utils.sub
local match = m_string_utils.match
local find = m_string_utils.find
local len = m_string_utils.len
local lower = m_string_utils.lower
local toNFD = mw.ustring.toNFD

-- Static lookup data for export.hrs_to_ipa. None of it depends on the call
-- arguments, so it is built once when the module loads rather than per call.

-- Romanized initial -> IPA initial.
local hrs_initial_conv = {
	["b"] = "p", ["p"] = "pʰ", ["m"] = "m", ["f"] = "f", ["v"] = "v", ["bb"] = "b",
	["d"] = "t", ["t"] = "tʰ", ["n"] = "n", ["l"] = "l",
	["g"] = "k", ["k"] = "kʰ", ["ng"] = "ŋ", ["h"] = "h",
	["z"] = "t͡s", ["c"] = "t͡sʰ", ["s"] = "s",
	["j"] = "t͡s", ["q"] = "t͡sʰ", ["x"] = "s",
	["zh"] = "t͡ʃ", ["ch"] = "t͡ʃʰ", ["sh"] = "ʃ", ["rh"] = "ʒ",
	[""] = "",
}

-- Romanized final -> IPA final.
local hrs_final_conv = {
	["ii"] = "ɨ",
	["i"] = "i", ["e"] = "e", ["a"] = "a", ["o"] = "o", ["u"] = "u",
	["ie"] = "ie", ["eu"] = "eu", ["ieu"] = "ieu",
	["ia"] = "ia", ["ua"] = "ua",
	["ai"] = "ai", ["iai"] = "iai", ["uai"] = "uai",
	["au"] = "au", ["iau"] = "iau",
	["io"] = "io", ["oi"] = "oi", ["ioi"] = "ioi",
	["iu"] = "iu", ["ui"] = "ui", ["iui"] = "iui",
	["ue"] = "ue",
	["iim"] = "ɨm", ["im"] = "im",
	["em"] = "em", ["iem"] = "iem",
	["am"] = "am", ["iam"] = "iam",
	["iin"] = "ɨn", ["in"] = "in",
	["en"] = "en", ["ien"] = "ien", ["uen"] = "uen",
	["an"] = "an", ["ian"] = "ian", ["uan"] = "uan",
	["on"] = "on", ["ion"] = "ion",
	["un"] = "un", ["iun"] = "iun",
	["ang"] = "aŋ", ["iang"] = "iaŋ", ["uang"] = "uaŋ",
	["ong"] = "oŋ", ["iong"] = "ioŋ",
	["ung"] = "uŋ", ["iung"] = "iuŋ",
	["er"] = "ə",
	["iib"] = "ɨp", ["ib"] = "ip",
	["eb"] = "ep", ["ieb"] = "iep",
	["ab"] = "ap", ["iab"] = "iap",
	["iid"] = "ɨt", ["id"] = "it",
	["ed"] = "et", ["ied"] = "iet", ["ued"] = "uet",
	["ad"] = "at", ["iad"] = "iat", ["uad"] = "uat",
	["od"] = "ot", ["iod"] = "iot",
	["ud"] = "ut", ["iud"] = "iut",
	["ag"] = "ak", ["iag"] = "iak", ["uag"] = "uak",
	["og"] = "ok", ["iog"] = "iok",
	["ug"] = "uk", ["iug"] = "iuk",
	["m"] = "m̩", ["n"] = "n̩", ["ng"] = "ŋ̍",
}

-- Strings that are both a valid initial and a syllabic-nasal final. When one
-- of them makes up the whole syllable it has to be read as the final.
local hrs_syllabic_nasal = {
	["m"] = true, ["n"] = true, ["ng"] = true,
}

-- Per-dialect map from tone key to tone value. The key is the tone mark,
-- prefixed with "d" when the IPA final ends in p/t/k. Only "Hailu" is defined.
local hrs_tone_values = {
	["Hailu"] = {
		["ˋ"] = "53",
		[""] = "55",
		["ˊ"] = "24",
		["ˇ"] = "11",
		["˖"] = "33",
		["d"] = "5",
		["dˋ"] = "2",
	},
}

-- Tone digit / hyphen -> superscript character used in the output.
local hrs_superscript = {
	["1"] = "¹", ["2"] = "²", ["3"] = "³", ["4"] = "⁴", ["5"] = "⁵", ["-"] = "⁻",
}

-- Raises an error if `text` contains one of a few look-alike characters that
-- are commonly typed instead of the expected tone marks; the message shows
-- the corrected spelling. Does nothing when `text` is nil.
local function hrs_check_invalid(text)
	if not text then
		return nil
	end
	local common_errors = "[´`+⁺^]"
	local error_correction = {
		["´"] = "ˊ",
		["`"] = "ˋ",
		["+"] = "˖",
		["⁺"] = "˖",
		["^"] = "ˆ",
	}
	local correct = gsub(text, common_errors, error_correction)
	if text ~= correct then
		error("Invalid Hakka Romanization \"" .. text .. "\": please change it to \"" .. correct .. "\"")
	end
end

-- Returns the tone value for an IPA final and a tone mark in `dialect`, or ""
-- when the combination is not in the table. `dialect` must be a key of
-- hrs_tone_values (the entry point validates this before calling).
local function hrs_get_tone(final, tone_mark, dialect)
	-- Finals ending in a stop have their own entries, keyed with a "d" prefix.
	local key = (find(final, "[ptk]$") and "d" or "") .. tone_mark
	return hrs_tone_values[dialect][key] or ""
end

-- Returns the sandhi tone value of syllable `i` out of `syl_count`, or "" when
-- the tone does not change. For Hailu, every syllable except the last changes
-- "24" to "33" and "5" to "2".
local function hrs_get_sandhi(syl_count, i, tone, dialect)
	if dialect == "Hailu" and i < syl_count then
		if tone == "24" then
			return "33"
		elseif tone == "5" then
			return "2"
		end
	end
	return ""
end

-- Converts space-separated Hakka Romanization syllables to IPA.
--
-- Parameters:
--   text    (string) romanized syllables separated by single spaces; commas
--           are ignored and empty syllables are skipped.
--   dialect (string) dialect name; must be a key of hrs_tone_values
--           (currently only "Hailu").
--
-- Returns a single string: the IPA syllables joined by spaces, each followed
-- by its tone value in superscript and, when sandhi applies, "⁻" plus the
-- sandhi tone value. Initials or finals missing from the conversion tables
-- contribute an empty string.
--
-- Raises an error when `text` contains a commonly mistyped tone mark, or when
-- `dialect` has no tone table.
function export.hrs_to_ipa(text, dialect)
	--check for common errors in input
	hrs_check_invalid(text)

	-- Fail with a readable message instead of "attempt to index a nil value".
	if not hrs_tone_values[dialect] then
		error("Unsupported dialect \"" .. tostring(dialect) .. "\" for Hakka Romanization to IPA conversion")
	end

	-- Strip commas and drop empty syllables (empty text, doubled spaces, a
	-- lone comma) so they neither render as a bare tone nor skew the syllable
	-- count used for sandhi.
	local syllables = {}
	for _, raw in ipairs(mw.text.split(text, " ")) do
		local cleaned = gsub(raw, ",", "")
		if cleaned ~= "" then
			syllables[#syllables + 1] = cleaned
		end
	end

	local ipa = {}
	for i, syllable in ipairs(syllables) do
		--find initial, final, tone
		local initial = match(syllable, "^([bpmfvdtnlgkhzcsjqxr][ghb]?)") or ""
		local tone_mark = match(syllable, "([ˊˇˋ˖])$") or ""
		local final = sub(syllable, len(initial) + 1, -1 - len(tone_mark))

		-- A bare "m", "n" or "ng" is consumed entirely by the initial pattern;
		-- with nothing left for the final it is really a syllabic nasal.
		if final == "" and hrs_syllabic_nasal[initial] then
			initial, final = "", initial
		end

		--convert initial, final, tone
		local ipa_initial = hrs_initial_conv[initial] or ""
		local ipa_final = hrs_final_conv[final] or ""
		local tone = hrs_get_tone(ipa_final, tone_mark, dialect)
		local sandhi = hrs_get_sandhi(#syllables, i, tone, dialect)

		local tone_text = tone
		if sandhi ~= "" then
			tone_text = tone .. "-" .. sandhi
		end

		-- Parentheses keep only gsub's first result (the string).
		ipa[i] = ipa_initial .. ipa_final .. (gsub(tone_text, "[12345%-]", hrs_superscript))
	end

	-- Commas were already removed per syllable, so no further clean-up is
	-- needed, and exactly one value is returned.
	return table.concat(ipa, " ")
end

return export