Module:wuu-pron/sandbox
Jump to navigation
Jump to search
Use Wugniu input with prefixes; e.g.
Some characters undergo sandhi as a tone different from their underlying tone, e.g. in Shanghainese 勿 is light checked (tone 8), but it undergoes sandhi like light departing (tone 6). Use
Currently supported locations are Shanghainese (
Internally, the Wiktionary Romanisation is immediately converted to Wugniu. The initial glottal stops in WR are ignored completely and not displayed in the output.
Some error checking functionality may be reduced.
All other symbols are the same as before.
IPA /y/ (no frication):
IPA /z̩ʷ/ (with frication)
"ye" inconsistencies:
狗
𧟰伊
- The following documentation is located at Module:wuu-pron/sandbox/documentation. [edit]
- Useful links: root page • root page’s subpages • links • transclusions • testcases • sandbox of (diff)
Sandbox of Module:wuu-pron.
Usage[edit]
- Old input
|w=3zaan he
gives: (this usage is not recommended)
|w=sh:6zaon he
gives:6veq8
where this is needed. |w=sh:6veq8 khu i
gives:sh
) and Suzhounese (sz
)&
and +
no longer display in the output (but they are still needed in the input in order for the sandhi chains to display correctly)
Links[edit]
Module:wuu-pron/data - finals and tones
Module:wuu-pron/sandbox/Hangzhou - Jinhuanese
User:kc_kennylau/wuu-pron-sandbox-all-lects - Big testcase for all lects
Test cases[edit]
Feel free to add more :)
Errors[edit]
Lua error at line 110: Invalid final: "ah"
Lua error at line 184: Invalid syllable: 1a 'm. Wugniu expected, but another romanisation is supplied.
minidict or sth[edit]
Main module tests[edit]
local export = {}
local data = mw.loadData("Module:wuu-pron/data")
--[=[ for future direct wugniu to ipa
TODO:
- do IPA for glottalised nasal initials (currently the glottal stop is dropped)
- figure out display with many lects: hokkien-style collapsing?
- FIND DATA FOR 3+ SYLLABLE SANDHI AND RPS!!! (we can settle for trisyllabics right)
- modules for jinhua, taizhou, wenzhou
]=]--
-- Maps each supported lect prefix (the part typed before ":" in the input)
-- to the display name of that lect. export.make rejects any prefix that has
-- no entry here.
-- The trailing comment on each row is a hand-maintained support matrix; its
-- columns are named in the header comment on the first line
-- (聲韻調 = initials, finals and tones).
-- NOTE(review): LPS/RPS presumably stand for left-/right-prominent sandhi - confirm.
local loc_names = { -- Basic 聲韻調 Disyll LPS Trisyll LPS Quad+ LPS RPS Logic
['sh'] = 'Shanghai', -- + + + + + + LIVE
['sj'] = 'Songjiang',-- + + + - - + LIVE
['cm'] = 'Chongming',-- + + + - + + LIVE
['sz'] = 'Suzhou', -- + + + + + + LIVE
--[[['ks'] = 'Kunshan', + + - - - - Wugniu "新派" ]]
['cz'] = 'Changzhou',-- + + + - - ? LIVE
['jx'] = 'Jiaxing', -- + + + - - + LIVE
['tx'] = 'Tongxiang',-- + + + - - + LIVE
['hn'] = 'Haining', -- + + + - + + LIVE Xiashi locality
['hy'] = 'Haiyan', -- + + + - - + LIVE
['hz'] = 'Hangzhou', -- + + + + + + LIVE
['sx'] = 'Shaoxing', -- + + + - + + LIVE
['nb'] = 'Ningbo', -- + + + - + + LIVE
}
-- Wikipedia article title for a lect, keyed by lect prefix.
-- A lect without an entry here falls back to "<loc_name> dialect"
-- (see export.name_boilerplate).
local wiki_names = {
sh = 'Shanghainese'
}
-- Set of lect prefixes for which export.make also emits a MiniDict
-- romanisation line.
local minidict = {
['sh'] = true,
['cm'] = true,
['sz'] = true,
['cz'] = true,
['jx'] = true,
['hz'] = true,
['sx'] = true,
['nb'] = true
}
-- Fixed display order of the lects; export.make walks this list so the
-- output order does not depend on the order of the prefixes in the input.
local order = {'sh', 'sj', 'cm', 'sz', 'cz', 'jx', 'tx', 'hn', 'hy', 'hz', 'sx', 'nb'}
-- Default mapping from a Wugniu initial to its IPA transcription, shared by
-- all lects. The empty string key stands for a syllable without an initial.
-- Per-lect exceptions live in ipa_initial_override, which get_initial
-- consults before this table.
local ipa_initial = {
["p"] = "p", ["ph"] = "pʰ", ["b"] = "b", ["m"] = "m", ["f"] = "f", ["v"] = "v",
["t"] = "t", ["th"] = "tʰ", ["d"] = "d", ["n"] = "n", ["l"] = "l",
["ts"] = "t͡s", ["tsh"] = "t͡sʰ", ["s"] = "s", ["z"] = "z", ["c"] = "t͡ɕ", ["ch"] = "t͡ɕʰ",
["dz"] = "d͡z", ["j"] = "d͡ʑ", ["gn"] = "n̠ʲ", ["sh"] = "ɕ", ["zh"] = "ʑ",
["k"] = "k", ["kh"] = "kʰ", ["g"] = "ɡ", ["ng"] = "ŋ", ["h"] = "h", ["gh"] = "ɦ",
[""] = "",
}
-- Per-lect IPA overrides for initials, keyed first by lect prefix and then by
-- Wugniu initial.
-- Every lect needs a (possibly empty) table here because get_initial indexes
-- ipa_initial_override[loc] unconditionally.
-- Note that get_initial returns the override value as-is, so a "?" entry is
-- emitted into the IPA output as a literal "?" rather than raising an error.
local ipa_initial_override = {
--this always takes priority over the table above
--additional unique initials can also be defined here
--a question mark means the initial does not exist
--there must be empty tables for all locations, even if there is nothing there
['sh'] = {
["dz"] = "?"
},
['sj'] = {
["p"] = "ɓ", ["t"] = "ɗ",
["f"] = "ɸ", ["v"] = "β",
["ch"] = "cʰ", ["c"] = "c", ["j"] = "ɟ", ["sh"] = "ç",
["zh"] = "?", ["dz"] = "?"
},
['cm'] = {
["v"] = "fv", ["z"] = "sz", ["zh"] = "ɕʑ", ["gh"] = "hɦ",
},
['sz'] = {
["dz"] = "?", ["zh"] = "?"
},
--[[['ks'] = {
["zh"] = "?", ["h"] = "x"
},]]
['cz'] = {
},
['jx'] = {
['dz'] = "?"
},
['tx'] = {
},
['hn'] = {
},
['hy'] = {
["zh"] = "?"
},
['hz'] = {
["zh"] = "?"
},
['sx'] = {
},
['nb'] = {
}
}
--- Look up the IPA transcription of a Wugniu initial for one lect.
-- The lect-specific override table is consulted first, then the shared
-- default table.
-- @param initial Wugniu initial (may be the empty string)
-- @param loc lect prefix; must have an entry in ipa_initial_override
-- @return the IPA string for the initial
-- Raises an error when neither table knows the initial.
local function get_initial(initial, loc)
	local ipa = ipa_initial_override[loc][initial]
	if not ipa then
		ipa = ipa_initial[initial]
	end
	if not ipa then
		error('Invalid initial: "' .. initial .. '"')
	end
	return ipa
end
--- Look up the IPA transcription of a Wugniu final for one lect.
-- @param final Wugniu final
-- @param loc lect prefix; indexes data.ipa_final
-- @return the IPA string for the final
-- Raises an error when the lect's table has no entry for the final.
local function get_final(final, loc)
	local ipa = data.ipa_final[loc][final]
	if ipa then
		return ipa
	end
	error('Invalid final: "' .. final .. '"')
end
-- IPA for syllables that consist solely of a syllabic nasal. The table is
-- passed to export.wugniu_to_ipa as `syllabics`, and export.ipa_syl_conv
-- looks a whole syllable up in it before trying to split off an initial.
local ipa_syllabic = {
["m"] = "m̩", ["n"] = "n̩", ["ng"] = "ŋ̍",
}
--- Raise the most specific error available for a tone notation that could
-- not be resolved. This function never returns normally.
-- @param word_length number of syllables in the sandhi chain
-- @param loc lect prefix
-- @param text the chain as typed (only used in the error message)
-- @param tone first tone specifier
-- @param tone2 second tone specifier ('' when absent)
-- @param tone3 third tone specifier ('' when absent)
local function diagnose_tones(word_length, loc, text, tone, tone2, tone3)
	-- longest chain supported per lect
	local max_syllables = {sh=5,sj=3,cm=3,sz=4,cz=3,jx=3,tx=3,hn=3,hy=3,hz=5,sx=3,nb=3}
	-- largest number of tones that has to be spelled out per lect
	local max_tones = {sj=3,ks=2,cz=3,jx=3,tx=3,hn=3,hy=2}

	local syl_cap = max_syllables[loc]
	if syl_cap and word_length > syl_cap then
		error(string.format("Maximum %d syllables supported for %s.", syl_cap, loc))
	end

	local tone_cap = max_tones[loc]
	if tone_cap then
		local expected = math.min(tone_cap, word_length)
		-- the first tone is always present; count the optional ones
		local received = 1
		if tone2 ~= '' then
			received = received + 1
		end
		if tone3 ~= '' then
			received = received + 1
		end
		if received ~= expected then
			error(string.format('Expected %d tones, but received %d: "%s:%s".', expected, received, loc, text))
		end
	elseif loc == 'sz' or loc == 'sx' then
		-- sz: tone is 7 or 8, but second tone not provided
		error("For " .. loc .. ", second tone must be specified.")
	end

	-- nothing more specific applies
	error(string.format('Incorrect tone notation "%s" for %s. See [[WT:AZH/Wu]].', tone..tone2..tone3, loc))
end
--- Replace the digits 1-5 in `text` with their Unicode superscript forms.
-- All other characters are left untouched.
-- @param text string containing tone-contour digits
-- @return exactly one value: the converted string
local function tone_superscript(text)
	local superscripts = {['1']='¹',['2']='²',['3']='³',['4']='⁴',['5']='⁵'}
	-- The parentheses drop gsub's second result (the substitution count) so
	-- it cannot leak into callers that forward all return values.
	return (text:gsub('[1-5]', superscripts))
end
--- Resolve the tone contours of one sandhi chain.
-- The lookup key into data.tone_contours[loc] is built from the syllable
-- count followed by up to three tone specifiers read from the start of the
-- first three syllables; the most specific key that exists wins.
-- @param text one sandhi chain, syllables separated by single spaces
-- @param loc lect prefix
-- @return the contour string with digits 1-5 turned into superscripts
-- Raises an error when the chain cannot be parsed or no contour is found
-- (diagnose_tones picks the message in the latter case).
local function get_tone(text, loc)
	-- strip every non-space run: what is left is one space per syllable boundary
	local word_length = text:gsub("[^ ]+", ""):len() + 1
	local tone, tone2, tone3 = text:match("^(.%u*)%w+ ?(%d?%u?)%w* ?(%d?%u?)")
	if not tone then
		-- e.g. an empty chunk; without this guard the code below fails with
		-- an opaque "attempt to concatenate/index a nil value"
		error(('Invalid syllable in "%s".'):format(text))
	end
	if loc == "jx" and tone == "3" then
		-- jx splits tone 3 in two depending on the shape of the first initial
		tone = text:find("^3[ptkc]s?h") and "3B" or "3A"
	elseif loc == "cm" then
		local result = nil
		if tone:find("[MP]") then -- Verb + Motion / Verb + Pronoun
			if word_length ~= 2 then error("cm: Unsupported word length.") end
			result = data.tone_contours[loc][tone] or error("cm: Wrong motion/pronoun format.")
		elseif tone:find("R",1,true) then -- Reduplication
			local main_tone, redup_type, word, sub_tone = text:match("^(%d)R([VCN])(%l+) (%d)%3$")
			if not main_tone then
				-- malformed reduplication; previously surfaced as a nil
				-- concatenation error further down
				error("cm: Wrong reduplication format.")
			end
			main_tone, sub_tone = tonumber(main_tone), tonumber(sub_tone)
			-- for type N, an even tone on a word starting with (g)m/n/l is
			-- expected to be repeated as the tone one lower
			local conv_tone = (redup_type == "N" and main_tone%2==0 and word:find("^g?[mnl]") and main_tone-1) or main_tone
			if sub_tone ~= conv_tone then error("cm: Wrong reduplication format.") end
			result = data.tone_contours[loc]["R"..redup_type..main_tone]
		end
		if result then
			return tone_superscript(result)
		end
	elseif loc == "sx" and tone:find("^%dA$") then
		return tone_superscript(data.tone_contours[loc][tone])
	end
	-- try the most specific key first, then fall back to fewer tones
	local result = data.tone_contours[loc][word_length..tone..tone2..tone3]
		or data.tone_contours[loc][word_length..tone..tone2]
		or data.tone_contours[loc][word_length..tone]
	return result and tone_superscript(result) or diagnose_tones(word_length, loc, text, tone, tone2, tone3)
end
--- Pick the contour used for the last syllable of a chain that is followed
-- by another chain joined with "+" (see the "RPS" step in
-- export.wugniu_to_ipa).
-- @param word_length number of syllables in the chain
-- @param tone first character of the chain (its tone number)
-- @param loc lect prefix
-- @return the contour with digits 1-5 turned into superscripts
local function RPS_tone_determ(word_length, tone, loc)
	local contours = data.tone_contours[loc]
	local result
	if word_length ~= 1 then
		result = contours["multiple"]
	else
		-- a tone-specific entry wins over the generic one
		result = contours[tone .. "s"]
		if not result then
			result = contours['s']
		end
	end
	return tone_superscript(result)
end
--- Sanity-check that `text` is written in Wugniu.
-- Raises an error on the first violation found; returns nil otherwise.
-- @param text romanised input for one group of lects
-- @param locs the raw, comma-separated lect prefix string of that group
local function rom_check(text, locs)
	local function has(pattern)
		return text:find(pattern) ~= nil
	end
	local only_cm = locs == 'cm'

	-- apostrophes, q/x initials, "ny", "hh" and a trailing "h" belong to
	-- other romanisations
	if has("%f[%l']['qx]") or has('ny') or has('hh') or has("h$") then
		error('Invalid syllable: ' .. text ..'. Wugniu expected, but another romanisation is supplied.')
	end
	if not only_cm and has('ghi') then
		error('Invalid initial "ghi". Use "yi" instead.')
	end
	if not only_cm and has('ghu') then
		error('Invalid initial "ghu". Use "wu" instead.')
	end
	-- a bare "y" is not a syllable
	if has('%f[%l]y%f[%L]') then
		error('Invalid syllable "y"')
	end
	if has('gn[aeou]') then
		error('Palatalization expected. Insert an "i" after the "gn".')
	end
	if has('uw') then
		error(('Invalid syllable in "%s".'):format(text))
	end
	if locs:find('cm') then
		if has('ueu') or has('uon') or has('ui') then
			error('cm: Mutation-only final found.')
		end
	end
	return nil
end
--- Convert one toneless Wugniu syllable to IPA.
-- @param text the syllable (lowercase letters only)
-- @param loc lect prefix
-- @param initials function(initial, loc) returning the IPA of an initial
-- @param finals function(final, loc) returning the IPA of a final
-- @param syllabics table mapping whole syllabic syllables to IPA
-- @param i 1-based position of the syllable within its chain
-- @param main_tone first character of the chain (its tone number)
-- @param tone contour assigned to this syllable
-- @return the IPA string, without tone
function export.ipa_syl_conv(text, loc, initials, finals, syllabics, i, main_tone, tone)
	-- split into initial + final; a syllabic syllable has no initial at all
	local onset, rime = text:match("^([td]?[pbmfvtdnlszcjghk][hng]?)(.+)$")
	local syllabic = syllabics[text]
	if loc == 'sx' and text == 'gn' then
		syllabic = "ɲ̩"
	end
	if syllabic or not onset then
		onset, rime = '', text
	end

	if loc ~= 'cm' then
		return initials(onset, loc) .. (syllabic or finals(rime, loc))
	end

	-- cm only: mutation
	local mutated_initial = false
	if i > 1 and onset == "z" then
		mutated_initial = "z"
	end
	local preglottal = ""
	if tone ~= "0" and (mutated_initial or onset:find("^g?[mnl]") or onset == "") then
		if i > 1 or main_tone:find("^[1357]$") then
			preglottal = "ʔ"
		else
			preglottal = "ɦ"
		end
	end
	local onset_ipa = mutated_initial or initials(onset, loc)
	local rime_ipa = syllabic or finals(rime, loc)
	return preglottal .. onset_ipa .. rime_ipa
end
--- Convert Wugniu input for one lect into an IPA string.
-- The input is split on "," (alternative readings), then "&" (components),
-- then "+" (independent sandhi chains), then " " (syllables).
-- @param original_text Wugniu text with tone digits
-- @param loc lect prefix
-- @param initials function(initial, loc) returning the IPA of an initial
-- @param finals function(final, loc) returning the IPA of a final
-- @param syllabics table mapping whole syllabic syllables to IPA
-- @param tones function(chain, loc) returning the space-separated contours
-- @return the readings joined with "/, /" (no outer slashes)
function export.wugniu_to_ipa(original_text, loc, initials, finals, syllabics, tones)
	-- move a tone written after a non-initial syllable in front of it
	original_text = original_text:gsub(" (%l+)(%d%u?)", ' %2%1')
	-- spell out y-/w- syllables; cm maps them to plain vowels instead of gh-
	if loc == 'cm' then
		original_text = original_text:gsub("%f[%l]yi?","i"):gsub("%f[%l]wu?","u")
	else
		original_text = original_text:gsub("%f[%l]yi?","ghi"):gsub("%f[%l]wu?","ghu")
	end
	local reading = mw.text.split(original_text, ",", true)
	for reading_index = 1, #reading do
		local components = mw.text.split(reading[reading_index], "&", true)
		for component_index = 1, #components do
			local indep_words = mw.text.split(components[component_index], "+", true)
			for indep_index = 1, #indep_words do
				local text = indep_words[indep_index]
				local tone_number = text:sub(1, 1)
				local tone = tones(text, loc)
				-- keep only lowercase letters and the separating spaces
				text = text:gsub("[^ %l]+", "")
				local syllable = mw.text.split(text, " ", true)
				local syl_tone = mw.text.split(tone, " ", true)
				for i = 1, #syllable do
					--RPS: last syllable of a chain that is followed by another chain
					if i == #syllable and indep_words[indep_index + 1] and tone ~= "³³" then
						syl_tone[i] = RPS_tone_determ(#syllable, tone_number, loc)
					end
					-- a contour of "0" means the tone is not displayed
					syllable[i] = (syllable[i] ~= "" and export.ipa_syl_conv(syllable[i], loc, initials, finals, syllabics, i, tone_number, syl_tone[i]) or "")
						.. (syl_tone[i] == "0" and "" or syl_tone[i])
				end
				indep_words[indep_index] = table.concat(syllable, " ")
			end
			components[component_index] = table.concat(indep_words, " ")
		end
		reading[reading_index] = table.concat(components, " ")
	end
	return table.concat(reading, "/, /")
end
--- Convert Wiktionary Romanisation to Wugniu.
-- @param text the romanised string, or a frame whose first argument holds it
-- @return exactly one value: the Wugniu string
function export.wikt_to_wugniu(text)
	if type(text) == "table" then text = text.args[1] end
	-- Assigning to a single local keeps only the string and drops the
	-- substitution count of the final gsub. Scribunto concatenates every
	-- value an invoked function returns, so returning the chain directly
	-- would append that count to the output.
	local wugniu = text
		--initials
		:gsub("'+", {["'"]=""})
		:gsub("%f[%l][jqxnh][jxyh]?", {j="c", jj="j", q="ch", x="sh", xx="zh", ny="gn", hh="gh"})
		--vowels
		:gsub("un", "uen")
		:gsub("yoe", "ioe")
		:gsub("y", "iu")
		:gsub("aan", "aon")
		:gsub("%f[er]r", "y")
		--syllabics
		:gsub("g?h?mm", "m")
		:gsub("g?h?ngg", "ng")
		--tones
		:gsub("[2-5]", {['2']='5', ['3']='6', ['4']='7', ['5']='8'})
		--gh rules
		:gsub("ghi", "yi")
		:gsub("yi%f[aeou]", "y")
		:gsub("ghu", "wu")
		:gsub("wu%f[aeo]", "w")
	return wugniu
end
--- Convert Wugniu to Wiktionary Romanisation and format it for display.
-- @param text the Wugniu string, or a frame whose first argument holds it
-- @return whatever export.wugniu_format returns for the converted text
local function wugniu_to_wikt(text)
	if type(text) == "table" then
		text = text.args[1]
	end
	local s = text
	--initials
	s = s:gsub("%f[%l][cjszg][nh]?", {c="j", ch="q", j="jj", sh="x", zh="xx", gn="ny", gh="hh"})
	s = s:gsub("%f[%l]yi?", "hhi")
	s = s:gsub("wu?", "hhu")
	--vowels
	s = s:gsub("y%f[%L]", "r")
	s = s:gsub("uen", "un")
	s = s:gsub("ioe", "yoe")
	s = s:gsub("iu", "y")
	s = s:gsub("aon", "aan")
	--syllabics
	s = s:gsub("%f[%l][mn]g?%f[%L]", {m="mm",n="nn",ng="ngg"})
	--initial hh and '
	s = s:gsub("([157])([mnl])", "%1'%2")
	s = s:gsub("([68])([mn][mng]g?)%f[%L]", "%1hh%2")
	--tones (the last substitution is forwarded directly, as before)
	return export.wugniu_format(s:gsub("[5-8]", {['5']='2', ['6']='3', ['7']='4', ['8']='5'}))
end
--- Turn raw Wugniu input into display markup: a tone written before a
-- syllable becomes a superscript, a tone written after it a subscript, and
-- the separators are rewritten.
-- 1a a 1a 1a3 a1 -> ^1a-a-a_1-^1a_3-a_1
-- 1a3-3a5 -> ^1a_3-^3a_5
-- @param text raw Wugniu string
-- @param loc unused
-- @return the formatted string, followed by the count of the last gsub
function export.wugniu_format(text, loc)
	local separators = {["-"]="", [" "]="-", ["&"]=" ", ["+"]=" ", [","]="; "}

	local function mark_tones(dash, tone1, main, tone2)
		-- after a dash a lone tone is written in front but belongs behind
		if dash == '-' and tone2 == '' then
			tone1, tone2 = tone2, tone1
		end
		local before, after = tone1, tone2
		if before ~= '' then
			before = '<sup>' .. before .. '</sup>'
		end
		if after ~= '' then
			after = '<sub>' .. after .. '</sub>'
		end
		return dash .. before .. main .. after
	end

	local joined = text:gsub("[%- &+,]", separators)
	return joined:gsub("(%-?)(%d?%u?)('?%l+)(%d?%u?)", mark_tones)
end
--- Format Wiktionary Romanisation for display.
-- Thin wrapper: the text is passed to export.wugniu_format unchanged (no
-- lect argument) and all of its return values are forwarded.
-- @param text Wiktionary Romanisation string
local function wikt_format(text)
return export.wugniu_format(text)
end
--- Format a string for MiniDict display: tone digits move behind their
-- syllable as superscripts and are then replaced by tone-category characters.
-- 1A3 3B5 3C D3 E -> A^3 B^5 C^3 D^3 E
-- 1A B -> A^1 B
-- @param text romanised string with tone digits
-- @return the formatted string, followed by the count of the last gsub
local function minidict_format(text)
	local tone_category = {
		["1"]="平",["2"]="平",
		["3"]="上",["4"]="上",
		["5"]="去",["6"]="去",
		["7"]="入",["8"]="入",
	}
	-- separators
	local s = text:gsub("-", "")
	s = s:gsub("[&+]", " ")
	s = s:gsub(",", "; ")
	-- a trailing tone wins over a leading one; otherwise use the leading tone
	s = s:gsub("[1-8]?(%l+)([1-8])", '%1<sup>%2</sup>')
	s = s:gsub("([1-8])(%l+)", '%2<sup>%1</sup>')
	-- apostrophe before m/n/l/r syllables carrying tone 1, 3, 5 or 7
	s = s:gsub("%f[%l]([mnlr]%l*)(<sup>[1357]</sup>)", "'%1%2")
	return s:gsub("[1-8]", tone_category)
end
--- Convert Wugniu to the MiniDict romanisation of one lect, formatted for
-- display.
-- @param text the Wugniu string, or a frame whose first argument holds it
-- @param loc lect prefix; selects the lect-specific respellings
-- @return exactly one value: the formatted MiniDict string
function export.wugniu_to_minidict(text, loc)
	if type(text) == "table" then text = text.args[1] end
	-- expand every y/w; the redundant vowel is removed again at the end
	text = text:gsub('[yw]', {y = 'yi', w = 'wu'})
	if loc == 'sx' then
		text = text:gsub("[ei]+[nq]",{een="en",en="eon",iq="ieq"})
	elseif loc == 'hz' then -- are we dealing with mergers?
		text = text:gsub("[aeiu]+q?%f[%L]",{eu="ei",ieu="iu",aq="eq",iaq="ieq",iq="ieq",uaq="ueq"})
	elseif loc == 'sz' or loc == 'cz' then
		text = text:gsub("%f[%l]yie%f[%L]", "yiie") -- ye > yie
	elseif loc == 'sh' then
		text = text:gsub("ie%f[%L]", "iae")
	elseif loc == 'cm' then
		-- drop the mutation annotation added by preprocess_wugniu
		text = text:gsub("<sup>→%l+</sup>", "")
	end
	--finals & syllabic
	local respelled = text
		:gsub("iu([nq])", "iui%1")
		:gsub("gher", "r")
		:gsub("er", "r")
		:gsub("q", "h")
	--initials
	--Glottal stops? text = text:gsub("", "'")
	local result = minidict_format(respelled)
		:gsub("gn", "ny")
		:gsub("nyi%f[aeou]", "ny")
		:gsub('yi([aeiou])', 'y%1')
		:gsub('wu([aeiou])', 'w%1')
	-- `result` holds only the string: the count of the last gsub is dropped
	-- so it cannot be appended to the output when the function is invoked
	-- from wikitext (Scribunto concatenates all return values).
	return result
end
-- various boilerplates
--- Build an italicised Wikipedia link for a lect.
-- @param name display name of the lect
-- @param wiki optional article title; defaults to "<name> dialect"
-- @return wikitext for the link
function export.name_boilerplate(name, wiki)
	local article = wiki or name..' dialect'
	local link = '[[w:' .. article .. '|' .. name.. ']]'
	return '<i>' .. link .. '</i>'
end
--- Wrap text in a span that selects the Consolas / monospace font.
-- @param text content of the span
-- @return HTML string
function export.consolas(text)
	local open_tag = '<span style="font-family: Consolas, monospace;">'
	local close_tag = '</span>'
	return open_tag .. text .. close_tag
end
--- Build the "Wugniu" output line (third-level list item).
-- @param text already formatted Wugniu romanisation
-- @return wikitext starting with a newline
function export.wugniu_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|Wugniu]]</i></small>: '
	return label .. export.consolas(text)
end
--- Build the "MiniDict" output line (third-level list item).
-- @param text already formatted MiniDict romanisation
-- @return wikitext starting with a newline
function export.minidict_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|MiniDict]]</i></small>: '
	return label .. export.consolas(text)
end
--- Build the "Wiktionary Romanisation" output line (third-level list item).
-- @param text already formatted Wiktionary Romanisation
-- @return wikitext starting with a newline
function export.wikt_boilerplate(text)
	local label = '\n*** <small><i>[[Wiktionary:About Chinese/Wu|Wiktionary Romanisation]]</i></small>: '
	return label .. export.consolas(text)
end
--- Build the IPA output line (third-level list item) for one lect.
-- @param text IPA string without the outer slashes
-- @param name display name of the lect
-- @param wiki optional Wikipedia article title for the lect
-- @return wikitext starting with a newline
function export.IPA_boilerplate(text, name, wiki)
	-- keep each run of text around a slash on one line
	local nowrapped = text:gsub("(/?[^ /,]*/[^ /,]*/?)", '<span style="white-space: nowrap;">%1</span>')
	local lect_link = export.name_boilerplate(name, wiki)
	local label = '\n*** <small>Sinological [[Wiktionary:International Phonetic Alphabet|IPA]]'
		.. ' (' .. lect_link .. ')</small>: '
	return label .. '<span class="IPA">/' .. nowrapped .. '/</span>'
end
--- Backwards compatibility for the old usage "|w=<text>": the input is
-- Wiktionary Romanisation without a lect prefix and is rendered as Shanghai.
-- @param text Wiktionary Romanisation
-- @param mode unused
-- @return show, hide: the always-visible line and the collapsible details
function export.legacy(text, mode)
	require("Module:debug").track("wuu-pron/legacy")
	local wugniu_text = export.wikt_to_wugniu(text)
	local formatted = export.wugniu_format(wugniu_text)

	local show = '\n**<small>(<i>[[w:Taihu Wu|Northern]]</i>)</small>: ' .. export.consolas(formatted)

	-- pieces of the collapsible part, built in display order
	local header = '\n**<small>(' .. export.name_boilerplate('Shanghai') .. ')</small>: '
	local wugniu_line = export.wugniu_boilerplate(formatted)
	local minidict_line = export.minidict_boilerplate(export.wugniu_to_minidict(wugniu_text, 'sh'))
	local wikt_line = export.wikt_boilerplate(wikt_format(text))
	local ipa = export.wugniu_to_ipa(wugniu_text, 'sh', get_initial, get_final, ipa_syllabic, get_tone)
	local ipa_line = export.IPA_boilerplate(ipa, 'Shanghai')

	local hide = header .. wugniu_line .. minidict_line .. wikt_line .. ipa_line
	return show, hide
end
--- Lect-specific rewriting of the input before IPA conversion.
-- hz inserts a "w" after certain sibilant + u sequences, cm keeps only the
-- mutated form of "x<y>" annotations, and sx rewrites "+"-joined two-word
-- chains. Other lects get the text back unchanged.
-- @param text raw Wugniu input of one group
-- @param loc lect prefix
-- @return the rewritten text (for hz, sx and cm also gsub's count)
local function preprocess_IPA(text, loc)
	if loc == 'hz' then
		return text:gsub("%f[%l]([td]?[szcj]h?i?u)%f[aeonq]", "%1w")
	end
	if loc == 'cm' then
		return text:gsub("%f[%l]%l+<(%l*)>","%1")
	end
	if loc ~= 'sx' then
		return text
	end

	-- sx: every "a+b" chain must be exactly <tone>[A|P]<word>+<tone><word>
	local function rewrite_chain(chain)
		local tone1,mode,word1,tone2,word2 = chain:match("^(%d)([AP]?)(%l+)%+(%d)(%l+)$")
		if not tone1 then
			error("sx: Wrong chain format.")
		end
		if mode == 'A' then
			return tone1..'A'..word1..'&'..tone2..word2
		end
		-- no explicit mode is marked as "O"
		local marker = mode
		if marker == '' then
			marker = 'O'
		end
		return tone1..word1..' '..tone2..marker..word2
	end

	local chained = text:gsub("[^,&]+%+[^,&]+", rewrite_chain)
	return chained:gsub("#(%d)","%1N")
end
--- Strip lect-specific annotations from the input before it is displayed as
-- a romanisation.
-- @param text raw Wugniu input of one group
-- @param loc lect prefix
-- @return the cleaned text (for jx, cm and sx also gsub's count)
local function preprocess_wugniu(text, loc)
	if loc == 'sx' then
		return text:gsub("[#CAP]","")
	elseif loc == 'jx' then
		-- the 3A/3B split is not shown
		return text:gsub("3[AB]","3")
	elseif loc ~= 'cm' then
		return text
	end
	-- cm: remove the sandhi-type letters, then turn "x<y>z" into "xz" with
	-- the mutated form "yz" shown as a superscript
	local stripped = text:gsub("[CMPR][VCN]?","")
	return stripped:gsub("%f[%l](%l*)<(%l*)>(%l*)(%d?)","%1%3%4<sup>→%2%3</sup>")
end
--- For cm, annotate non-initial syllables with their mutated initial using
-- the "x<y>" notation; for every input, reject "<" unless cm is the only
-- lect of the group.
-- @param text raw Wugniu input of one group
-- @param locs the raw, comma-separated lect prefix string of that group
-- @return the (possibly annotated) text
local function preprocess_mutation(text, locs)
	if locs:find('cm') then
		local mutation_of = {v="u",zh="",gh=""}

		local function annotate(tone, initial, final)
			local replacement = mutation_of[initial]
			if replacement == "u" and final:find("^u") then
				-- avoid doubling the u
				replacement = ""
			elseif initial == "d" and final:find("^i") then
				replacement = "l"
			end
			-- annotate only when there is a mutation and the syllable has
			-- not been annotated by hand already
			if replacement and not final:find("<") then
				return " "..tone..initial.."<"..replacement..">"..final
			end
			return " "..tone..initial..final
		end

		text = text:gsub(" (%d?C?)([vzgd]h?)([%w<>]+)", annotate)
	end
	if locs ~= "cm" and text:find("<") then
		error("cm: Mutation is incompatible with collapsing.")
	end
	return text
end
--- Main entry point: build the Wu pronunciation section from the input.
-- The input consists of ";"-separated groups, each of the form
-- "<loc>[,<loc>...]:<wugniu text>". Input without any ":" is handed to
-- export.legacy.
-- @param text the raw input string
-- @return show, hide: the always-visible summary line and the detailed lines
-- Raises an error for a missing or unknown prefix, or when a group contains
-- more than one ":".
function export.make(text)
text = text:gsub("勿", "6veq8") -- for backwards compatibility
if not text:match(':') then -- for backwards compatibility
return export.legacy(text)
end
local show = ""
local hide = ""
-- formatted Wugniu of every group, joined with " / " for the summary line
local roms = {}
text = mw.text.split(text, ';', true)
for i = 1,#text,1 do
-- s[1] = comma-separated lect prefixes, s[2] = romanisation
local s = mw.text.split(text[i], ':', true)
if #s ~= 2 or #s[1] == 0 then
error("Wugniu: prefix is required or too many prefixes")
end
local locs, t = mw.text.split(s[1], ',', true), s[2]
-- t keeps the lect annotations and feeds the IPA conversion
t = preprocess_mutation(t, s[1])
-- set of the lects requested in this group
local list = {}
-- format_text has the annotations of every listed lect stripped and feeds
-- the romanisation lines
local format_text = t
for _, loc in ipairs(locs) do
if loc_names[loc] then
list[loc] = true
else
error('Wugniu: prefix "' .. loc .. '" is not recognized')
end
format_text = preprocess_wugniu(format_text, loc)
end
rom_check(t, s[1])
local wugniu_text = export.wugniu_format(format_text, locs[1])
table.insert(roms,wugniu_text)
local names = {}
-- distinct MiniDict results in first-seen order
local minidicts = {}
-- MiniDict result -> list of lect links that share it
local minidicts_locs = {}
local IPAs = {}
-- walk the canonical order so the output does not depend on the input order
for _, loc in ipairs(order) do if list[loc] then
table.insert(names, export.name_boilerplate(loc_names[loc], wiki_names[loc]))
if minidict[loc] then
local minidict_result = export.wugniu_to_minidict(format_text, loc)
if not minidicts_locs[minidict_result] then
table.insert(minidicts, minidict_result)
minidicts_locs[minidict_result] = {}
end
table.insert(minidicts_locs[minidict_result], export.name_boilerplate(loc_names[loc], wiki_names[loc]))
end
local ipa_text = preprocess_IPA(t, loc)
ipa_text = export.wugniu_to_ipa(ipa_text, loc, get_initial, get_final, ipa_syllabic, get_tone)
table.insert(IPAs,export.IPA_boilerplate(ipa_text, loc_names[loc], wiki_names[loc]))
end end
hide = hide .. '\n** <small>(' .. table.concat(names,', ') .. ')</small>'
hide = hide .. export.wugniu_boilerplate(wugniu_text)
for _,minidict_text in ipairs(minidicts) do
hide = hide .. export.minidict_boilerplate(minidict_text)
-- name the lects only when the lects disagree on the MiniDict spelling
if #minidicts > 1 then
hide = hide .. ' <sup>(' .. table.concat(minidicts_locs[minidict_text], ', ') .. ')</sup>'
end
end
-- the Wiktionary Romanisation line is only shown when sh is in the group
if list.sh then
hide = hide .. export.wikt_boilerplate(wugniu_to_wikt(format_text))
end
hide = hide .. table.concat(IPAs, '')
end
show = '\n** <small>(<i>[[w:Taihu Wu|Northern]]</i>)</small>: ' .. export.consolas(table.concat(roms, ' / '))
return show, hide
end
return export