Module:grc-pronunciation/sandbox: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
No edit summary
transcribing long diphthongs with i and nonsyllabic diacritic
Line 32: Line 32:
--Combining diacritics are tricky.
--Combining diacritics are tricky.
local tie = U(0x35C) -- tie bar
local tie = U(0x35C) -- tie bar
local nonsyllabic = U(0x32F)
local high = U(0x341) -- combining acute tone mark
local high = U(0x341) -- combining acute tone mark
local low = U(0x340) -- combining grave tone mark
local low = U(0x340) -- combining grave tone mark
Line 235: Line 236:
elseif m_data[vCurrent].subi then
elseif m_data[vCurrent].subi then
if m_data[vCurrent].accent == 'acute' then
if m_data[vCurrent].accent == 'acute' then
vFP = vIP..tie..'ːj'..high
vFP = vIP..'ːi'..nonsyllabic..high
elseif m_data[vCurrent].accent == 'grave' then
elseif m_data[vCurrent].accent == 'grave' then
vFP = vIP..tie..'ːj'..low
vFP = vIP..'ːi'..nonsyllabic..low
elseif m_data[vCurrent].accent == 'circum' then
elseif m_data[vCurrent].accent == 'circum' then
vFP = vIP..high..tie..'ːj'
vFP = vIP..high..'ːi'..nonsyllabic
else
else
vFP = vIP..tie..'ːj'
vFP = vIP..'ːi'..nonsyllabic
end
end
elseif vLength == 'long' then
elseif vLength == 'long' then

Revision as of 07:29, 30 January 2017


Data for the sandbox module: Module:grc-pronunciation/sandbox/data.

Testcases

Lua error in package.lua at line 80: module 'Module:a' not found


local export = {}

local strip_accent = require('Module:grc-accent').strip_accent
-- [[Module:grc-utilities]] converts sequences of diacritics to the order required by this module,
-- then replaces combining macrons and breves with spacing ones.
local rearrangeDiacritics = require("Module:grc-utilities").pronunciationOrder
local m_data = mw.loadData("Module:grc-pronunciation/sandbox/data")
local m_IPA = require("Module:IPA")
local m_a = require("Module:a")
local lang = require("Module:languages").getByCode("grc")
local m_links = require("Module:links")
local tag_text = require("Module:script utilities").tag_text

-- Period codes for which a pronunciation is generated, in output order.
-- Each code is also used as the suffix of the label passed to m_a.show
-- ('grc-' .. period) in make_table.
local periods = {'cla', 'koi1', 'koi2', 'byz1', 'byz2'}
-- Subset of the periods shown in the collapsed, single-line view built by make_table.
local inlinePeriods = {'cla', 'koi2', 'byz2'}

-- Short aliases for the Unicode-aware string functions of mw.ustring.
local rfind = mw.ustring.find
local usub = mw.ustring.sub
local rmatch = mw.ustring.match
-- NOTE: like string.gsub, this returns two values (the new string and a count).
local rsubn = mw.ustring.gsub
local ulen = mw.ustring.len
local ulower = mw.ustring.lower
-- Builds a string from code points; used below for the combining diacritics.
local U = mw.ustring.char
local function fetch(s, i)
	--[==[
	Returns the single character of s at position i.
	Single-character lookups are so frequent in this module that they get
	their own helper. A position outside the string yields ''.
	]==]
	local position = i
	return usub(s, position, position)
end

--Combining diacritics are tricky.
--They are therefore built from code points instead of being typed literally.
local tie = U(0x35C)		-- tie bar
local nonsyllabic = U(0x32F)	-- appended after the 'i' of a long diphthong written with iota subscript (see convert_term)
local high = U(0x341)		-- combining acute tone mark
local low = U(0x340)		-- combining grave tone mark
local midHigh = U(0x1DC4)	-- mid–high pitch
local midLow = U(0x1DC6)	-- mid–low pitch
local highMid = U(0x1DC7)	-- high–mid pitch
local voiceless = U(0x325)	-- combining ring below
local aspirated = 'ʰ'		-- follows an obstruent; syllabify keeps it attached to that obstruent
-- Spacing length marks; per the note on rearrangeDiacritics above, combining
-- macrons and breves have been replaced by these before conversion.
local macron = '¯'
local breve = '˘'

-- Tests whether `text` belongs to the character class named `X` in m_data.chars.
-- @param text  string to test (normally a single character; two for "frontDiphth")
-- @param X     key of m_data.chars naming the class
-- @return      the result of rfind (truthy start index) on a match, nil on no match,
--              or false when either argument is missing
-- Raises an error (reported at the caller) when m_data.chars has no entry for X.
local function is(text, X)
	if not text or not X then
		return false
	end
	-- Declared local: the original assigned to a global `pattern`.
	local pattern = m_data.chars[X] or error("No data for \"" .. X .. "\".", 2)
	if X == "frontDiphth" then
		-- This entry is used as a complete pattern rather than as a character set.
		pattern = "^" .. pattern .. "$"
	else
		-- Every other entry is a list of characters, wrapped into a set.
		pattern = "^[" .. pattern .. "]$"
	end
	return rfind(text, pattern)
end

-- Returns the data entry for the character immediately after `index`,
-- or nil when there is none (fetch gives '' past the end of the term).
local function following_data(term, index)
	return m_data[fetch(term, index + 1)]
end

-- Predicates callable from pronunciation conditions via the "~" syntax handled
-- in decode. Each takes the term and a character index and inspects the
-- character(s) FOLLOWING that index. All of them answer falsy, instead of
-- raising an "index a nil value" error, when the following character has no
-- entry in the data module (e.g. at the end of the term).
local env_functions = {
	-- True if the next letter is a front vowel, or the next two letters form a
	-- front diphthong whose second letter is not in the "iDiaer" class.
	preFront = function(term, index)
		local letter1, letter2 = fetch(term, index + 1), fetch(term, index + 2)
		return is(strip_accent(letter1), "frontVowel") or (is(strip_accent(letter1 .. letter2), "frontDiphth") and not is(letter2, "iDiaer"))
	end,
	-- True if the next letter is an iota whose data entry lacks the `diar` flag.
	isIDiphth = function(term, index)
		local letter = fetch(term, index + 1)
		local data = m_data[letter]
		return strip_accent(letter) == 'ι' and data ~= nil and not data.diar
	end,
	-- True if the next letter is an upsilon whose data entry lacks the `diar` flag.
	isUDiphth = function(term, index)
		local letter = fetch(term, index + 1)
		local data = m_data[letter]
		return strip_accent(letter) == 'υ' and data ~= nil and not data.diar
	end,
	-- True if the next letter carries a rough breathing.
	isAspDiphth = function(term, index)
		local data = following_data(term, index)
		return data ~= nil and data.breath == 'rough'
	end,
	-- True if the next letter carries an acute accent.
	isAcuteDiphth = function(term, index)
		local data = following_data(term, index)
		return data ~= nil and data.accent == 'acute'
	end,
	-- True if the next letter carries a grave accent.
	isGraveDiphth = function(term, index)
		local data = following_data(term, index)
		return data ~= nil and data.accent == 'grave'
	end,
	-- True if the next letter carries a circumflex accent.
	isCircumDiphth = function(term, index)
		local data = following_data(term, index)
		return data ~= nil and data.accent == 'circum'
	end,
	-- Returns the next letter's accent value (truthy) if it has one, else nil.
	isAccentDiphth = function(term, index)
		local data = following_data(term, index)
		return data and data.accent
	end,
}

local function decode(condition, x, term)
	--[==[
		Evaluates a condition string from the data module against the
		character at index x of term. Returns a truthy value if it holds.

		"If" and "and"/"or" statements.
		The condition is split at its FIRST operator: + ("and") or / ("or").
		The part before the operator is a single if-statement; the part after
		it may contain further operators and is decoded recursively. There is
		therefore no operator precedence: "a+b/c" means a and (b or c).
		In if-statements:
		* A number represents the character under consideration:
			1 the next character, 0 the current, and -1 the previous.
		* Equals sign (=) checks to see if the character under consideration
			is equal to a character.
		* Period (.) plus a word sends the module to the corresponding entry
			in the letter's data table.
		* Tilde (~) calls a function on the character under consideration,
			if the function exists.
	]==]
	if rfind(condition, '[+/]') then
		-- Find the first slash or plus sign, whatever precedes it, and everything
		-- that follows (including further operators).
		-- The match must be assigned on its own: writing `rmatch(...) or error(...)`
		-- truncates the call to its first result, leaving sep and condition2 nil.
		local condition1, sep, condition2 = rmatch(condition, "^([^/+]*)([/+])(.*)$")
		if not sep then
			error('Condition "' .. tostring(condition) .. '" is improperly formed')
		end
		if sep == '/' then		-- logical operator: or
			return decode(condition1, x, term) or decode(condition2, x, term)
		elseif sep == '+' then	-- logical operator: and
			return decode(condition1, x, term) and decode(condition2, x, term)
		end
	elseif rfind(condition, '=') then				-- check character identity
		local offset, char = unpack(mw.text.split(condition, "="))
		return char == fetch(term, x + offset) -- out of bounds fetch gives ''
	elseif rfind(condition, '%.') then				-- check character quality
		local offset, quality = unpack(mw.text.split(condition, "%."))
		-- A character without a data entry (including '' from an out-of-bounds
		-- fetch) has no qualities; answer nil rather than indexing nil.
		local data = m_data[fetch(term, x + offset)]
		return data and data[quality]
	elseif rfind(condition, '~') then				-- check character(s) using function
		local offset, func = unpack(mw.text.split(condition, "~"))
		return env_functions[func] and env_functions[func](term, x + offset) or false
	end
end

-- Resolves a pronunciation specification to a string.
-- @param p       either a string (returned as is) or a sequential table whose
--                entries are strings (unconditional results) or
--                {condition, result} pairs
-- @param period  period code; only forwarded to recursive calls
-- @param x       index in term of the character being pronounced
-- @param term    the whole term
-- @return        the first applicable result, or nil if no entry applies
local function check(p, period, x, term)
	if type(p) == 'string' then
		return p
	elseif type(p) == 'table' then   --This table is sequential, with a variable number of entries.
		for _, possP in ipairs(p) do
			if type(possP) == 'string' then
				return possP
			elseif type(possP) == 'table' then    --This table is paired, with two values: a condition and a result.
				-- Declared local: the original leaked both names as globals.
				local rawCondition, rawResult = possP[1], possP[2]
				if decode(rawCondition, x, term) then
					-- A non-string result is itself a specification; resolve it recursively.
					return (type(rawResult) == 'string') and rawResult or check(rawResult, period, x, term)
				end
			end
		end
	end
end

-- Converts a prepared term (lowercased, diacritics rearranged) into
-- unsyllabified IPA for every period.
-- @param term  the term to transcribe
-- @return IPAs   table keyed by period code; each value is
--                { IPA = <string>, notes = <list of strings> }
-- @return ambig  list of tagged vowels whose length could not be determined
local function convert_term(term)
	local IPAs = {}
	for _, period in ipairs(periods) do
		IPAs[period] = { notes = {} }
	end
	local length = ulen(term)
	local x = 1
	local letter = ''
	local nextLetter = ''
	local ambig = {}
	while x <= length do
		letter = fetch(term, x)
		nextLetter = fetch(term, x + 1)
		local data = m_data[letter]
		if not data then
			-- Characters without a data entry contribute nothing to the output.
		elseif data.type == 'consonant' or strip_accent(letter) == 'ω' or strip_accent(letter) == 'η' then
			for _, period in ipairs(periods) do
				table.insert(IPAs[period], check(data.p[period], period, x, term))
			end
		elseif data.type == 'vowel' then
			--Start with a diphthong check
			local diphthong = false
			local vCurrent, vForm, vLength
			local nextData = m_data[nextLetter]
			if is(letter, 'iDiphth') and (nextData and nextData.type == 'vowel' and ( strip_accent(nextLetter) == 'ι' and not nextData.diar)) then
				diphthong = letter..'ι'
			elseif is(letter, 'uDiphth') and (nextData and nextData.type == 'vowel' and (strip_accent(nextLetter) == 'υ' and not nextData.diar)) then
				diphthong = letter..'υ'
			end
			
			if diphthong then
				-- The second letter supplies breathing and accent; consume it here.
				vCurrent = nextLetter
				vForm = diphthong
				vLength = 'long'
				x = x + 1
				nextLetter = fetch(term, x + 1)
			else 
				vCurrent = letter
				vForm = strip_accent(letter)
				vLength = m_data[vCurrent].length or m_data[vForm].length
			end
			
			for _, period in ipairs(periods) do
				if m_data[vCurrent].breath == 'rough' then
					if period == 'cla' then
						table.insert(IPAs['cla'], 'h')
					elseif period == 'koi1' then
						table.insert(IPAs['koi1'], '(h)')
					end
				end 
				if period ~= 'cla' then
					--All other periods have a stress accent, instead of Classical's tonal accent.
					--The stress diacritic is initially placed immediately preceding the vowel,
					--and is moved to the front of the syllable during syllabification.
					if m_data[vCurrent].accent then
						table.insert(IPAs[period], 'ˈ')
					end
					local vIP = check(m_data[vForm].p[period], period, x, term)
					table.insert(IPAs[period], vIP)
				else
					-- Classical vowels are hard.
					-- [[Module:grc-utilities]] converts combining macrons and breves to spacing ones.
					if vLength == 'either' then
						if m_data[vCurrent].accent == 'circum' or nextLetter == macron or m_data[vCurrent].subi then
							vLength = 'long'			
						elseif nextLetter == breve then
							vLength = 'short'
						else
							-- Length stays 'either', which is transcribed as short below.
							local ambiguousVowel = tag_text(vCurrent, lang, nil, 'term')
							table.insert(IPAs.cla.notes, 'ambiguous vowel ' .. ambiguousVowel
								.. ' at ' .. x)
							table.insert(ambig, ambiguousVowel)
						end
					end
					
					-- The original passed an undefined global `cla` (nil) as the period;
					-- `period` is 'cla' in this branch.
					local vIP = check(m_data[vForm].p.cla, period, x, term)  --vIP stands for initial pronunciation
					-- Final pronunciation: vIP plus length and pitch marks.
					-- Declared local: the original leaked vFP as a global.
					local vFP
					
					--There has to be a prettier way to do this, but I just can't think of it.
					if ulen(vIP) > 1 then  --i.e. if it's a phonetic diphthong
						if m_data[vCurrent].accent == 'acute' then
							vFP = vIP..high
						elseif m_data[vCurrent].accent == 'grave' then
							vFP = vIP..low
						elseif m_data[vCurrent].accent == 'circum' then
							-- Circumflex: the high tone goes on the first element.
							vFP = fetch(vIP, 1)..high..usub(vIP, 2)
						else
							vFP = vIP
						end
					elseif m_data[vCurrent].subi then
						-- Iota subscript: long vowel followed by a nonsyllabic i.
						if m_data[vCurrent].accent == 'acute' then
							vFP = vIP..'ːi'..nonsyllabic..high
						elseif m_data[vCurrent].accent == 'grave' then
							vFP = vIP..'ːi'..nonsyllabic..low
						elseif m_data[vCurrent].accent == 'circum' then
							vFP = vIP..high..'ːi'..nonsyllabic
						else
							vFP = vIP..'ːi'..nonsyllabic
						end
					elseif vLength == 'long' then
						if m_data[vCurrent].accent == 'acute' then
							vFP = vIP..midHigh..'ː'
						elseif m_data[vCurrent].accent == 'grave' then
							vFP = vIP..midLow..'ː'
						elseif m_data[vCurrent].accent == 'circum' then
							vFP = vIP..highMid..'ː'
						else
							vFP = vIP..'ː'
						end
					else
						if m_data[vCurrent].accent == 'acute' then
							vFP = vIP..high
						elseif m_data[vCurrent].accent == 'grave' then
							vFP = vIP..low
						else
							vFP = vIP
						end
					end
					table.insert(IPAs['cla'], vFP)
				end
			end
		end
		x = x + 1
	end
	
	--Concatenate the IPAs
	for _, period in ipairs(periods) do
		IPAs[period] = { IPA = table.concat(IPAs[period], ''), notes = IPAs[period]['notes'] }
	end
	
	return IPAs, ambig
end

-- Splits the IPA string of every period into syllables joined by '.', and
-- moves each stress mark 'ˈ' to the front of its syllable (dropping it
-- entirely for a monosyllabic word).
-- @param IPAs  table as returned by convert_term; modified in place
-- @return      the same table
local function syllabify(IPAs)
	--Syllabify
	for _, period in ipairs(periods) do
		local word = IPAs[period].IPA
		local syllables = {}
		local cVowel, nVowel, sBreak, stress, wordEnd, searching
		-- Declared local: the original leaked both names as globals.
		local cVowelFound, letter
		while word ~= '' do
			cVowel, nVowel, sBreak, stress = false, false, false, false
			
			--First thing is to find the first vowel.
			--cVowel ends up as the index of the last character belonging to it.
			searching = 1
			cVowelFound = false
			while not cVowel do
				letter = fetch(word, searching)
				if cVowelFound then
					if is(letter, "vowel") or is(letter, "cons") or letter == '' or letter == 'ˈ' then
						cVowel = searching - 1
					elseif is(letter, "diacritic") then
						searching = searching + 1
					elseif letter == tie then
						-- A tie bar joins the vowel to whatever follows; keep looking.
						cVowelFound = false
						searching = searching + 1
					else
						searching = searching + 1
					end
				else
					if is(letter, "vowel") then
						cVowelFound = true
					elseif letter == 'ˈ' then
						stress = true
					elseif letter == '' then  --This shouldn't happen.
						-- No vowel at all: treat the whole remainder as one syllable.
						cVowel = true
						wordEnd = true
					end
					searching = searching + 1
				end
			end
		
			--Next we try and find the next vowel or the end.
			--In the vowel-less case cVowel is the boolean true, so it must not
			--be used in arithmetic; the search below is skipped anyway.
			if not wordEnd then
				searching = cVowel + 1
			end
			while (not nVowel) and (not wordEnd) do
				letter = fetch(word, searching)
				if is(letter, "vowel") or letter == 'ˈ' then
					nVowel = searching
				elseif letter == '' then
					wordEnd = true
				else
					searching = searching + 1
				end
			end
			
			--Finally we find the syllable break point.
			if wordEnd then
				sBreak = ulen(word)
			elseif is(fetch(word, nVowel - 1), "liquid") then
				-- Obstruent (optionally aspirated) + liquid stays together in the onset.
				if is(fetch(word, nVowel - 2), "obst") then
					sBreak = nVowel - 3
				elseif fetch(word, nVowel - 2) == aspirated and is(fetch(word, nVowel - 3), "obst") then
					sBreak = nVowel - 4
				else
					sBreak = nVowel - 2
				end
			elseif is(fetch(word, nVowel - 1), "cons") then
				sBreak = nVowel - 2
			elseif fetch(word, nVowel - 1) == aspirated and is(fetch(word, nVowel - 2), "obst") then
				sBreak = nVowel - 3
			elseif fetch(word, nVowel - 1) == voiceless and fetch(word, nVowel - 2) == 'r' then
				sBreak = nVowel - 3
			else
				sBreak = nVowel - 1
			end
			
			--Pull everything up to and including the syllable Break.
			local syllable = usub(word, 1, sBreak)
			
			--If there is a stress accent, then we need to move it to the 
			--beginning of the syllable, unless it is a monosyllabic word,
			--in which case we remove it altogether.
			if stress then
				if next(syllables) or syllable ~= word then
					syllable = 'ˈ' .. rsubn(syllable, 'ˈ', '')
				else 
					syllable = rsubn(syllable, 'ˈ', '')
				end
				stress = false
			end
			table.insert(syllables, syllable)
			word = usub(word, sBreak + 1)
		end
		
		if #syllables > 0 then
			IPAs[period].IPA = table.concat(syllables, '.')
			-- The stress mark already marks a syllable boundary; drop the dot before it.
			IPAs[period].IPA = rsubn(IPAs[period].IPA, '%.ˈ', 'ˈ')
		end
	end
	return IPAs
end

-- Builds the wikitext/HTML for the pronunciation box: a collapsed one-line
-- view (inlinePeriods joined by arrows) and an expanded list with one line
-- per period, wrapped in a vsSwitcher container.
-- @param IPAs   table keyed by period code with fields IPA and notes
-- @param ambig  list of tagged ambiguous vowels; when non-empty a
--               preview-only hint for editors is appended
-- @return       the markup as a string
local function make_table(IPAs, ambig)
	--Final format
	local inlineProns = {}
	local listOfProns = {}
	
	for _, period in ipairs(inlinePeriods) do
		local pron = '/' .. IPAs[period].IPA .. '/'
		table.insert(inlineProns, {pron = pron})
		table.insert(listOfProns, pron)
	end
	
	-- Character count of the collapsed line; used below to size the box.
	local inlineIPAlength = ulen("IPA(key): " .. table.concat(listOfProns, ' → '))
	
	local inline = '<div class="vsShow" style="display:none">\n* ' .. m_IPA.format_IPA_full(lang, inlineProns, nil, ' → ') .. '</div>'
	
	local fullProns = {}
	for _, period in ipairs(periods) do
		local notes = (#IPAs[period].notes > 0) and ('<span class="previewonly"><br>' .. table.concat(IPAs[period].notes, ', ') .. '</span>') or ''
		table.insert(fullProns, '* ' .. m_a.show({'grc-' .. period}) .. ' ' ..  m_IPA.format_IPA_full(lang, {{pron = '/' .. IPAs[period].IPA .. '/'}}) .. notes)
	end
	
	local ambignote = ''
	-- The table ambig is filled with all the ambiguous vowels that have been found in the term.
	if #ambig > 0 then
		-- Number agreement: { plural suffix plus space, pronoun }.
		local agr
		if #ambig > 1 then
			agr = { 's ', 'each one' }
		else
			agr = { ' ', 'it' }
		end
			
		-- The original closed a <span> here that was never opened; the stray
		-- closing tag has been removed so the markup is balanced.
		ambignote = '\n<p class="previewonly">Mark the vowel length of the ambiguous vowel' .. agr[1]
			.. mw.text.listToText(ambig) .. ' by adding a macron after ' .. agr[2]
			.. ' if it is long, or a breve if it is short. By default, [[Module:grc-pronunciation]] assumes it is short if unmarked.<br><small>[This message shows only in preview mode.]</small></p>\n'
	end
	
	local full = '<div class="vsHide">\n' .. table.concat(fullProns, '\n') .. ambignote .. '</div>'
	
	return '<div class="vsSwitcher vsToggleCategory-pronunciations" style="width: ' .. inlineIPAlength * 0.68 .. 'em;"><span class="vsToggleElement" style="float: right;">&nbsp;</span>' .. inline .. full .. '</div>'
end

-- Template entry point: produces the pronunciation box for a term.
-- Reads the parent frame's arguments: `w` (or positional 1) is the term,
-- defaulting to the current page title.
-- @param frame  the Scribunto frame object
-- @return       markup from make_table
function export.create(frame)
	local params = {
		[1] = {alias_of = 'w'},
		w = {default = mw.title.getCurrentTitle().text},
	}
	local args = require("Module:parameters").process(frame:getParent().args, params)
	
	-- All working values are local; the original leaked term, IPAs and ambig
	-- as globals.
	-- Each rsubn call is wrapped in parentheses to drop its second result (the count).
	local term = ulower(args.w)
	term = (rsubn(term, 'ς', 'σ'))	-- final sigma is treated like medial sigma
	term = (rsubn(term, 'ῤ', 'ρ'))	-- the breathing mark on rho is dropped
	term = rearrangeDiacritics(term)
	
	local IPAs, ambig = convert_term(term)
	
	IPAs = syllabify(IPAs)
	
	return make_table(IPAs, ambig)
end

-- Produces a documentation table row: a link to the term given as the first
-- parent-frame argument, the cell separator " || ", then the output of
-- export.create for the same frame.
function export.example(frame)
	local parentArgs = frame:getParent().args
	local term = parentArgs[1]
	local link = m_links.full_link({term = term, lang = lang})
	return link .. " || " .. export.create(frame)
end

return export
--Things we still need:
--Voicing of sigma around (after?) voiced stops. 
--Proper alerts for editors, especially on ambiguous vowels.