Module:User:Justinrleung

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of Justinrleung, for his own experimentation. Items in this module may be added and removed at Justinrleung's discretion; do not rely on this module's stability.


-- Table of the functions this module exposes; it is filled in below and
-- returned at the end of the file.
local export = {}

-- Short local aliases for the Scribunto library functions used below.
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local gsplit = mw.text.gsplit

--- Extract a short gloss from the wikitext of an entry.
--
-- The level-2 "Chinese" section of `content` is located (the "Translingual"
-- section is used instead when there is no Chinese section, and appended
-- when both exist). Up to two definition lines starting with "# " that do
-- not mention "rfdef" or "defn" are collected, the `lit=` value found after
-- "zh-forms" is prepended when present, and templates in the result are
-- removed or simplified.
--
-- @param content  string: wikitext of the page.
-- @param useetc   truthy: append "; etc." when some definitions were left out.
-- @return string: the gloss. "" is returned when neither section exists, and
--         also when "{{" remains in the text and no "; etc." was appended.
function export.extract_gloss(content, useetc)
	-- The original body called a bare `match`, which is not defined in the
	-- visible module and therefore failed with "attempt to call a nil value".
	-- The string-library function is bound locally; the patterns involved only
	-- use ASCII delimiters, so byte-wise matching is sufficient.
	local match = string.match
	local len = mw.ustring.len
	local senses = {}
	local literally = match(content, 'zh%-forms[^}]*|lit=([^{|}]+)[|}]')
	local sense_id = 0
	local etc = false
	local translingual_section, zh_section, pos

	while true do
		-- Find language sections beginning with ==...== and ending with the same
		-- or an empty string. Grab the Chinese and Translingual ones.
		-- (All loop variables are local; the original leaked them as globals.)
		local _, j, language_name, section = content:find("%f[=]==%s*([^=]+)%s*==(\n.-)\n==%f[^=]", pos)

		if j == nil then
			-- No following level-2 header: take everything up to the end.
			_, j, language_name, section = content:find("%f[=]==%s*([^=]+)%s*==(\n.+)", pos)
		end

		if j == nil then
			break
		end

		-- Move to the beginning of "==" at the end of the current match.
		pos = j - 1

		if language_name == 'Translingual' then
			translingual_section = section
		elseif language_name == 'Chinese' then
			zh_section = section
			break
		end
	end

	if not zh_section then
		zh_section = translingual_section
		if not zh_section then
			return ""
		end
	elseif translingual_section then
		-- Both sections exist: append the Translingual text so that its
		-- definitions are available too (e.g. when the Chinese section
		-- contains only rfdef).
		zh_section = zh_section..translingual_section
	end

	-- Delete etymology sections, because they sometimes contain ordered lists,
	-- which would then be interpreted as definitions.
	zh_section = zh_section:gsub("\n===+Etymology.-(\n==)", "%1")

	for sense in zh_section:gmatch('\n# ([^\n]+)') do
		if not sense:match('rfdef') and not sense:match('defn') then
			sense_id = sense_id + 1
			if sense_id > 2 then
				-- A third usable definition exists but is not included.
				etc = true
				break
			end
			table.insert(senses, sense)
		end
	end

	local gloss_text = (literally and literally .. "; " or "") .. (senses[1] or "")
	local gloss_text_extend = gloss_text .. (senses[2] and "; " .. senses[2] or "")
	-- The second sense is only kept when the first part is shorter than 80
	-- characters and the combined text is shorter than 160 characters.
	gloss_text = (len(gloss_text) < 80 and len(gloss_text_extend) < 160) and gloss_text_extend or gloss_text
	if gloss_text ~= gloss_text_extend then etc = true end

	-- Simplifies or removes the templates in `text`, then trims the
	-- ";"-separated parts and drops the parts without any alphanumeric
	-- character.
	local function replace_gloss(text)
		-- Turns {{w|target|display}} into [[w:target|display]]; the target is
		-- reused as display text when no display text is given.
		local function replace_wp(text)
			return text:gsub('{{w|([^|}]+)|?([^|}]*)}}',
				function(w_link, w_display)
					return '[[w:'..w_link..'|'..(w_display~='' and w_display or w_link)..']]'
			end)
		end

		if text:find("{{") then
			text = replace_wp(text)
			text = text:gsub(' %({{taxlink[^}%)]+}}%)', '')
				:gsub('{{zh%-l|%*([^}]*)}}', '%1')
				:gsub('{{lb|zh|[^}]*}}', '')
				:gsub('{{zh%-erhua form of|word=[^}]+}}', '')
				:gsub('{{zh%-erhua form of|([^}]+)}}', '%1')
				:gsub('{{zh%-alt%-name|[^}]+|([^\n]+)}}', '%1')
				:gsub('{{zh%-short%-comp|[^}]+|t=([^\n}|]+)[^}]*}}', '%1')
				:gsub('{{zh%-short%-comp|[^}]+}}', '')
				:gsub('{{zh%-classifier|[^}]+|t=([^\n}|]+)[^}]*}}', '%1')
				:gsub('{{zh%-classifier|[^}]+}}', '')
				:gsub('{{zh%-alt%-form|[^}]+}}', '')
				:gsub('{{zh%-[^dm|}][^|}]+|[^|}]+|([^\n}|]+)}}', '%1')
				:gsub('{{place|zh|[^}]*t=([^\n}|]+)[^}]*}}', '%1')
				:gsub('{{vern', '{{w')
				-- NOTE(review): as written this replaces "|" with itself (a
				-- no-op); possibly an HTML entity for "|" was lost here - confirm.
				:gsub('|', "|")
		end
		-- Remaining runs wrapped in braces or parentheses are dropped, except
		-- that {{taxlink|X...}} becomes ''X'' and {{w|...}} is kept for the
		-- next pass.
		text = text:gsub('( ?)([{%(]+[^}%){%(]+[}%)]+)', function(space, captured)
			local taxlink = captured:match("{{taxlink|([^|}]+)")
			local wiki_link =
				 taxlink and "''" .. taxlink .. "''" or
				(match(captured, "({{w|.+}})") or false)
			return wiki_link and space..wiki_link or "" end)
		text = mw.text.split(text, ';')
		local text_sec = {}
		for _, s in ipairs(text) do
			if s:find'%w' then
				table.insert(text_sec, (s:gsub('^%s+',''):gsub('%s+$','')))
			end
		end
		return table.concat(text_sec, '; ')
	end

	-- Two passes, as in the original: the first pass can leave {{w|...}}
	-- behind (e.g. from {{vern}}), which the second pass converts.
	gloss_text = replace_gloss(gloss_text)
	gloss_text = replace_gloss(gloss_text)
	if etc and useetc and gloss_text ~= "" then
		gloss_text = gloss_text .. "; etc."
	elseif gloss_text:find("{{") then --temporary solution to suppress wikitext issues
		gloss_text = ""
	end
	return gloss_text
end

--- Report whether the page named by the first argument is a redirect.
-- @param frame  table with an `args` field; `frame.args[1]` is the page title.
-- @return 1 when the title is a redirect, otherwise 0. 0 is also returned
--         when the argument is missing or is not a valid title.
function export.is_redirect(frame)
	local page_name = frame.args[1]
	-- mw.title.new returns nil for an invalid title; without this guard the
	-- `.isRedirect` access raised "attempt to index a nil value".
	local title = page_name and mw.title.new(page_name)
	if title and title.isRedirect then
		return 1
	end
	return 0
end

--- Create a link for `text` using the language object for code "zh".
-- Delegates to `language_link` of Module:links, passing `text`, nil and the
-- language object obtained from Module:languages.
function export.link(text)
	local links = require("Module:links")
	local zh = require("Module:languages").getByCode("zh")
	return links.language_link(text, nil, zh)
end

--- Map the empty string to nil; every other value is passed through unchanged.
local function ine(var)
	if var ~= "" then
		return var
	end
	return nil
end

-- Pattern character class listing the combining tone marks recognised by
-- py_transform.
local tones = '[̄́̌̀]'
-- Maps each combining tone mark to its tone number (as a string); used by
-- py_transform when tone marks are replaced with digits.
local py_tone = {
	['̄'] = '1',
	['́'] = '2',
	['̌'] = '3',
	['̀'] = '4'
}

--- Validate a Pinyin string and split it into space-separated syllables.
--
-- May be called with a table that has an `args` field, in which case
-- `args[1]`, `args[2]` and `args[3]` supply the three parameters.
--
-- @param text        Pinyin string (or a table with `args`).
-- @param detone      non-empty value: replace tone marks with a tone digit at
--                    the end of each syllable ("5" when there is no mark).
-- @param not_spaced  non-empty value: remove all spaces from the result and
--                    skip the missing-apostrophe check.
-- @return a single string. Input without tone marks that contains a digit
--         1-5 is only lower-cased and gets a space inserted between a digit
--         and a following lower-case letter; otherwise the result is the
--         processed text in NFC.
-- Raises an error when the text contains U+200B, when an apostrophe appears
-- to be missing between two tone-marked vowel groups, or when a tone mark is
-- not placed on the vowel this function expects.
function export.py_transform(text, detone, not_spaced)
	if type(text) == 'table' then text, detone, not_spaced = text.args[1], text.args[2], text.args[3] end
	-- "\226\128\139" is the UTF-8 encoding of U+200B (zero-width space); it is
	-- written as an escape so the invisible character cannot get lost.
	if find(text, '\226\128\139') then
		error("Pinyin contains the hidden character: \226\128\139 (U+200B). Please remove that character from the text.")
	end
	detone = ine(detone)
	not_spaced = ine(not_spaced)
	-- Decompose so that tone marks become separate combining characters, but
	-- keep "ê" and "ü" as single precomposed letters.
	text = gsub(gsub(mw.ustring.toNFD(text), mw.ustring.toNFD('ê'), 'ê'), mw.ustring.toNFD('ü'), 'ü')
	if find(mw.ustring.lower(text), '[aeiouêü]' .. tones .. '[aeiou]?[aeiouêü]' .. tones .. '') and not not_spaced then
		error(("Missing apostrophe before null-initial syllable - should be \"%s\" instead."):format(gsub(text, '([aeiouêü]' .. tones .. '[aeiou]?)([aeiouêü]' .. tones .. ')', "%1'%2")))
	end
	-- `original_text` was an accidental global in the original; keep it local.
	local original_text = text
	-- Move misplaced tone marks to the expected vowel; any change means the
	-- input was wrong, and the corrected form is reported in the error.
	text = gsub(text,'([aoeAOE])([iou])(' .. tones .. ')', '%1%3%2')
	text = gsub(text,'([iuü])(' .. tones .. ')([aeiou])', '%1%3%2')
	if text ~= original_text then
		error("Incorrect diacritic placement in Pinyin - should be \"".. text .. "\" instead.")
	end
	text = mw.ustring.lower(text)
	if not find(text, tones) and text:find('[1-5]') then
		-- Parenthesised so only the string is returned. The original also
		-- returned gsub's match count, which an #invoke call would append to
		-- the output.
		return (gsub(text, '(%d)(%l)', '%1 %2'))
	end
	text = gsub(text, "#", " #")
	if find(text, '[一不,.?]') then
		text = gsub(text, '([一不])$', {['一'] = ' yī', ['不'] = ' bù'})
		text = gsub(text, '([一不])', ' %1 ')
		text = gsub(text, '([,.?])', ' %1 ')
		text = gsub(text, ' +', ' ')
		text = gsub(text, '^ ', '')
		text = gsub(text, ' $', '')
		text = gsub(text, '%. %. %.', '...')
	end
	text = gsub(text, "['%-]", ' ')
	-- Insert a space before each consonant that follows a vowel, "m" or "n"
	-- (with an optional tone mark and optional n, g, r).
	text = gsub(text, '([aeiouêümn]' .. tones .. '?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)', '%1 %2')
	-- Re-attach a "g", "r" or "n" that ended up standing alone.
	text = gsub(text, ' ([grn])$', '%1')
	text = gsub(text, ' ([grn]) ', '%1 ')
	if detone then
		text = gsub(text, tones, py_tone)
		-- Move each tone digit to the end of its syllable.
		text = gsub(text, '([1234])([^ ]*)', '%2%1')
		-- Syllables that still end in a letter had no tone mark: give them "5".
		text = gsub(text, '([%lüê]) ', '%15 ')
		text = gsub(text, '([%lüê])$', '%15')
	end
	if not_spaced then
		text = gsub(text, ' ', '')
	end
	return mw.ustring.toNFC(text)
end

--- Convert Pinyin text to another romanisation (presumably Tongyong Pinyin,
--- judging by the function name - confirm).
--
-- Each space-separated word is passed through export.py_transform with
-- `detone` set, every resulting syllable is respelled with the substitutions
-- below, and the tone digit is turned back into a combining mark (tone 1 is
-- left unmarked). The syllables of a word are joined without spaces.
--
-- @param text  Pinyin string, or a table with an `args` field whose first
--              entry is the string.
-- @return string in NFC, words separated by single spaces.
function export.py_tongyong(text)
	if type(text) == 'table' then text = text.args[1] end

	-- Combining mark (UTF-8 bytes) appended for each tone digit.
	local ty_tone = {
		["1"] = "", ["2"] = "\204\129", ["3"] = "\204\140", ["4"] = "\204\128", ["5"] = "\204\138"
	}

	-- Places the mark for `tone` after a/e/ê if present, otherwise after "o",
	-- otherwise after i/u.
	local function num_to_mark(syllable, tone)
		tone = ty_tone[tone]
		if tone ~= "" then
			-- "ê" is two bytes in UTF-8. A byte-wise character class matched
			-- each byte separately and inserted the mark inside the letter, so
			-- the Unicode-aware functions are used for this class.
			if find(syllable, '[aeê]') then
				syllable = gsub(syllable, "([aeê])", "%1" .. tone)
			elseif syllable:find('o') then
				syllable = syllable:gsub("(o)", "%1" .. tone)
			elseif syllable:find('[iu]') then
				syllable = syllable:gsub("([iu])", "%1" .. tone)
			end
		end
		return syllable
	end

	local words = {}
	for word in gsplit(text, " ") do
		-- Decompose first so that an accented capital such as "Ā" is also
		-- recognised as starting with A-Z (the precomposed letter is not).
		local cap = mw.ustring.toNFD(word):find("^[A-Z]")
		word = export.py_transform(word, true)
		local syllables = {}
		for syllable in gsplit(word, " ") do
			syllable = syllable:gsub("([zcs]h?)i", "%1ih")
			syllable = syllable:gsub("ü", "yu")
			syllable = syllable:gsub("([jqx])u", "%1yu")
			syllable = syllable:gsub("iu", "iou")
			syllable = syllable:gsub("ui", "uei")
			syllable = syllable:gsub("([wf])eng", "%1ong")
			syllable = syllable:gsub("wen", "wun")
			syllable = syllable:gsub("iong", "yong")
			syllable = syllable:gsub("^zh", "jh")
			syllable = syllable:gsub("^q", "c")
			syllable = syllable:gsub("^x", "s")
			-- A non-initial syllable starting with a, e or o gets an apostrophe.
			syllable = #syllables ~= 0 and syllable:gsub("^([aeo])", "'%1") or syllable
			syllable = syllable:gsub("^([^1-5]+)([1-5])$", num_to_mark)

			table.insert(syllables, syllable)
		end
		word = table.concat(syllables, "")
		-- Restore the capital that py_transform lower-cased.
		word = cap and word:gsub("^.", string.upper) or word
		table.insert(words, word)
	end

	return mw.ustring.toNFC(table.concat(words, " "))
end

--- Return `text` with a misplaced tone mark in "oa"/"oe" moved onto the "o".
--
-- The mark is moved when it sits on the second vowel and the pair is
-- followed by "ⁿ", a hyphen, a slash, a space or the end of the string.
-- Despite the name nothing is raised: the check that would report an error
-- is commented out, as are the other correction rules.
--
-- @param text  string to correct (presumably Pha̍k-fa-sṳ, judging by the
--              function name - confirm).
-- @return corrected string in NFC.
function export.pfs_check_invalid(text)
	local accent = "[́̀̂̍]"
	local switch = "%1%3%2%4"
	-- Work on the decomposed form; the appended "-" lets a syllable at the
	-- very end of the string match the same pattern as one before a hyphen.
	local correct = gsub(
		mw.ustring.toNFD(text) .. "-",
		"(o)([ae])(" .. accent .. ")([ⁿ%-/ ])",
		switch
	)
	--correct = gsub(correct, "(o)(" .. accent .. ")([ae])([imnptkh][gh]?ⁿ?)", switch)
	--correct = gsub(correct, "(oa)(i)(" .. accent .. ")(h?ⁿ?)", switch)
	--correct = gsub(correct, "(a)([iu])(" .. accent .. ")(h?ⁿ?)", switch)
	--correct = gsub(correct, "(i)(" .. accent .. ")([aou])(u?[mnptkh]?g?ⁿ?)", switch)
	--correct = gsub(correct, "(ia)(u)(" .. accent .. ")(h?ⁿ?)", switch)
	--correct = gsub(correct, "(u)(i)(" .. accent .. ")([hⁿ]?)", switch)
	--correct = gsub(correct, "(e)(e)(" .. accent .. ")(h?ⁿ?)", switch)
	--correct = gsub(correct, "(o" .. accent .. ")[ou·]", "%1͘")
	-- Drop the sentinel hyphen again before recomposing.
	correct = gsub(correct, "-$", "")
	--if text ~= correct then
		--error("invalid poj \"" .. gsub(text, "-$", "") .. "\": correct poj is \"" .. correct .. "\"")
	--end
	return mw.ustring.toNFC(correct)
end

--- Convert space-separated romanised syllables with tone digits 1-6 to IPA.
--
-- Each syllable is split into an initial, a final and a trailing tone digit,
-- and each part is looked up in the tables below. Any "gd=" in the input is
-- removed first.
--
-- @param text  string, or a table with an `args` field whose first entry is
--              the string.
-- @return string: the IPA syllables joined by single spaces.
-- Raises an error for a syllable without a tone digit and for an initial or
-- final that is not in the tables.
function export.gd_to_ipa(text)
	local initial_conv = {
		["b"] = "p", ["p"] = "pʰ", ["m"] = "m", ["f"] = "f", ["v"] = "ʋ",
		["d"] = "t", ["t"] = "tʰ", ["n"] = "n", ["l"] = "l", 
		["g"] = "k", ["k"] = "kʰ", ["ng"] = "ŋ", ["h"] = "h", [""] = "",
		["z"] = "t͡s", ["c"] = "t͡sʰ", ["s"] = "s",
		["j"] = "t͡ɕ", ["q"] = "t͡ɕʰ", ["x"] = "ɕ"
	}
	local final_conv = {
		["ii"] = "z̩", ["i"] = "i", ["u"] = "u",
		["a"] = "a", ["ia"] = "ia", ["ua"] = "ua",
		["ê"] = "e", ["iê"] = "ie", ["uê"] = "ue",
		["o"] = "o", ["io"] = "io", ["uo"] = "uo",
		["m"] = "m̩", ["n"] = "n̩",
		["ai"] = "aɪ", ["iai"] = "iaɪ", ["uai"] = "uaɪ",
		["oi"] = "oɪ",
		["ui"] = "uɪ", ["iui"] = "iuɪ",
		["au"] = "au", ["iau"] = "iau",
		["êu"] = "eu",
		["iu"] = "iu",
		["em"] = "əm", ["im"] = "im",
		["am"] = "am", ["iam"] = "iam",
		["êm"] = "ɛm",
		["en"] = "ən", ["in"] = "in",
		["an"] = "an", ["ian"] = "ian", ["uan"] = "uan",
		["ên"] = "ɛn", ["iên"] = "iɛn", ["uên"] = "uɛn",
		["on"] = "ɔn", ["ion"] = "iɔn", ["uon"] = "uɔn",
		["un"] = "un", ["iun"] = "iun",
		["ang"] = "aŋ", ["iang"] = "iaŋ", ["uang"] = "uaŋ",
		["ong"] = "ɔŋ", ["iong"] = "iɔŋ", ["uong"] = "uɔŋ",
		["ung"] = "ʊŋ", ["iung"] = "iʊŋ",
		["eb"] = "əp̚", ["ib"] = "ip̚",
		["ab"] = "ap̚", ["iab"] = "iap̚",
		["êb"] = "ɛp̚",
		["ed"] = "ət̚", ["id"] = "it̚",
		["ad"] = "at̚", ["iad"] = "iat̚", ["uad"] = "uat̚",
		["êd"] = "ɛt̚", ["iêd"] = "iɛt̚", ["uêd"] = "uɛt̚",
		["od"] = "ɔt̚",
		["ud"] = "ut̚", ["iud"] = "iut̚",
		["ag"] = "ak̚", ["iag"] = "iak̚", ["uag"] = "uak̚",
		["og"] = "ɔk̚", ["iog"] = "iɔk̚", ["uog"] = "uɔk̚",
		["ug"] = "ʊk̚", ["iug"] = "iʊk̚"
	}
	-- Keys ending in "*" are the values used for tones 1 and 4 when the next
	-- syllable has tone 2-5 (see the second loop).
	local tone_conv = {
		["1"] = "⁴⁴", ["2"] = "¹¹",
		["3"] = "³¹",
		["4"] = "⁵³",
		["5"] = "¹", ["6"] = "⁵",
		["1*"] = "⁴⁴⁻³⁵",
		["4*"] = "⁵³⁻⁵⁵"
	}
	-- Initials used instead of initial_conv when the final starts with "i".
	local palatal = {
		['g'] = 'c',
		['k'] = 'cʰ',
		['ng'] = 'ɲ',
		['h'] = 'ç'
	}
	
	if type(text) == 'table' then text = text.args[1] end
	local syllables = mw.text.split(mw.ustring.gsub(text, 'gd=', ''), ' ')
	local initial, final, tone, ipa = {}, {}, {}, {}

	-- First pass: split every syllable into initial, final and tone digit.
	for i, syllable in ipairs(syllables) do
		initial[i] = mw.ustring.match(syllable, "^[bpmfvdtnlgkhzcsjqx]?g?")
		-- The original indexed `initial[index]`; `index` is undefined, so the
		-- length call received nil and every invocation failed.
		final[i] = mw.ustring.match(mw.ustring.sub(syllable, mw.ustring.len(initial[i]) + 1, -1), "^[^1-6]*")
		final[i] = mw.ustring.gsub(mw.ustring.gsub(final[i], "^yi", "i"), "^y", "")
		if mw.ustring.find(initial[i], "[zcs]") and final[i] == "i" then
			final[i] = "ii"
		end
		if final[i] == "" then
			-- Nothing follows the initial: treat it as the final instead.
			final[i] = initial[i]
			initial[i] = ""
		end
		tone[i] = mw.ustring.match(syllable, "[1-6]$")
		if not tone[i] then
			-- Previously this surfaced later as an opaque "bad argument" error.
			error(("Missing tone number (1-6) in syllable: \"%s\""):format(syllable))
		end
	end

	-- Second pass: convert the parts. This needs its own loop because the
	-- tone change looks at the raw tone digit of the following syllable.
	for i = 1, #syllables do
		initial[i] = (mw.ustring.find(final[i], "^i") and palatal[initial[i]] or initial_conv[initial[i]]) or error(("Unrecognised initial: \"%s\""):format(initial[i]))
		final[i] = final_conv[final[i]] or error(("Unrecognised final: \"%s\""):format(final[i]))
		if mw.ustring.match(tone[i], "[14]") and mw.ustring.match(tone[i+1] or "", "[2345]") then
			tone[i] = tone[i] .. "*"
		end
		tone[i] = tone_conv[tone[i]]
		ipa[i] = initial[i] .. final[i] .. tone[i]
	end
	
	return table.concat(ipa, " ")
end

--- Respell romanised syllables and replace their tone diacritics with a tone
--- symbol appended to each syllable.
--
-- NOTE(review): presumably converts Pha̍k-fa-sṳ to another Hakka
-- romanisation, judging by the function name - confirm.
--
-- The input is lower-cased, any "pfs=" is removed, and spaces and hyphens
-- both act as syllable separators.
--
-- @param text  string, or a table with an `args` field whose first entry is
--              the string.
-- @return string: the converted syllables joined by single spaces.
function export.pfs_to_hrs(text)
	if type(text) == 'table' then
		text = text.args[1]
	end

	local ugsub, ufind = mw.ustring.gsub, mw.ustring.find

	-- Lookup tables used for every syllable.
	local single_letters = {['p']='b',['t']='d',['k']='g',['y']='i'}
	local digraphs = {['bh']='p',['dh']='t',['gh']='k',['ch']='z'}
	local palatal = {['z']='j',['c']='q',['s']='x',['i']=''}
	local final_conv = {
		['á'] = 'a', ['é'] = 'e', ['í'] = 'i', ['ó'] = 'o', ['ú'] = 'u', ['́'] = '',
		['à'] = 'a', ['è'] = 'e', ['ì'] = 'i', ['ò'] = 'o', ['ù'] = 'u', ['̀'] = '',
		['â'] = 'a', ['ê'] = 'e', ['î'] = 'i', ['ô'] = 'o', ['û'] = 'u', ['̂'] = '',
		['ń'] = 'n', ['ǹ'] = 'n',
		['̍'] = '',
		['ṳ'] = 'ii',
	}

	local normalised = mw.ustring.lower(text)
	normalised = ugsub(normalised, 'pfs=', '')
	normalised = ugsub(normalised, ' ', '-')

	local converted = {}
	for i, syllable in ipairs(mw.text.split(normalised, "-")) do
		-- change consonants
		syllable = ugsub(syllable, '[ptky]', single_letters)
		syllable = ugsub(syllable, '[bdgc]h', digraphs)
		syllable = ugsub(syllable, 'zh', 'c')
		syllable = ugsub(syllable, '([zcsi])([iíìî])', function(consonant, vowel)
			return palatal[consonant] .. vowel
		end)

		-- find tones (checked before the diacritics are stripped below)
		local tone
		if ufind(syllable, '[âêîôû̂]') then
			tone = '´'
		elseif ufind(syllable, '[àèìòùǹ̀]') then
			tone = 'ˇ'
		elseif ufind(syllable, '[áéíóúń́]')
			or (ufind(syllable, '[aeiouṳ][bdg]$') and not ufind(syllable, '̍')) then
			tone = '`'
		else
			tone = ''
		end

		-- remove tone marks and fix vowels
		syllable = ugsub(syllable, '[âêîôû̂àèìòù̀áéíóú́ńǹ̍ṳ]', final_conv)
		syllable = ugsub(syllable, 'o([ae])', 'u%1')

		-- add new tone marks
		converted[i] = syllable .. tone
	end

	return table.concat(converted, " ")
end

--- Scratch check: a value stored under one string key is read back through a
--- second, equal string and compared with itself.
-- @return boolean result of the comparison.
function export.test()
	local first_key, second_key = "abc", "abc"
	local lookup = { [first_key] = 5 }
	return lookup[second_key] == lookup[first_key]
end

-- Hand the table of public functions to whoever loads this module.
return export