Module:ru-pron: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
No edit summary
+ого -> ово
Line 80: Line 80:
['ндш'] = 'нш',
['ндш'] = 'нш',
['нтг'] = 'нг',
['нтг'] = 'нг',
['(.[ое])го'] = '%1во',
['э'] = 'ɛ',
['э'] = 'ɛ',

Revision as of 10:21, 7 June 2014

This module generates IPA for Russian words. There are three entry points:

  • ru_IPA() directly implements Template:ru-IPA, and is meant to be called from that template.
  • ipa() generates the raw IPA for Cyrillic text and is meant to be called from other code. It returns a list.
  • ipa_string() generates the raw IPA for Cyrillic text and is meant to be called from other code (e.g. from Module:ru-pron/testcases) or as a template (this is used in User:Benwing2's bot scripts). It returns a string, with multiple variants separated by comma+space.

The primary documentation for this module can be found in the documentation for Template:ru-IPA.

Testcases


local export = {}
local gsub = mw.ustring.gsub
local sub = mw.ustring.sub
local find = mw.ustring.find
local len = mw.ustring.len
local match = mw.ustring.match

local m_ru_translit = require("Module:ru-translit")
-- Lua pattern fragments over the transliterated vowel inventory.
-- The `_c` variants are the same character classes wrapped in a capture.
local vowels = '[aäeëɛəiyoöuü]'
local vowels_c = '([aäeëɛəiyoöuü])'
-- complement classes: anything that is not one of the vowels above
local non_vowels = '[^aäeëɛəiyoöuü]'
local non_vowels_c = '([^aäeëɛəiyoöuü])'
-- Cyrillic vowel letters, captured (used on text before transliteration)
local cyr_vowels_c = '([аяеёэиыоую])'

-- Set of consonant clusters (in transliteration) that the syllabifier in
-- export.ipa keeps together at the start of a syllable. Only membership is
-- tested, so the set is built from a plain list.
local perm_syl_onset = {}
for _, cluster in ipairs({
	'str',
	'sp', 'st', 'sk', 'sf', 'sx', 'sc',
	'pr', 'kr', 'fr', 'xr',
	'pl', 'tl', 'kl', 'fl', 'xl',
	'ml', 'mn',
	'šč', 'dž',
	-- clusters currently left out of the set:
	--'zb', 'zd', 'zg', 'zv',
	--'br', 'dr', 'gr', 'vr',
	--'tr',
	--'bl', 'dl', 'gl', 'vl',
}) do
	perm_syl_onset[cluster] = true
end

-- Final substitution of the internal single-character consonant symbols
-- by their IPA spelling (applied by export.ipa once per word, at the end).
local translit_conv = {
	c = 't͡s',
	['č'] = 't͡ɕ',
	['ĵ'] = 'd͡z',
	['ǰ'] = 'd͡ʑ',
	['ǯ'] = 'ɕ',
	['š'] = 'ʂ',
	['ž'] = 'ʐ',
	['χ'] = 'ɣ',
}

-- Vowel realisations, keyed by the internal vowel symbol.
-- export.ipa picks one of the three slots per syllable:
--   [1] the syllable is treated as stressed,
--   [2] the next syllable is stressed, or this is a vowel-initial first syllable,
--   [3] every other syllable.
local allophones = {
	-- symbols written with plain Latin letters
	a = { 'a', 'ɐ', 'ə' },
	o = { 'o', 'ɐ', 'ə' },
	e = { 'e', 'ɪ', 'ɪ' },
	i = { 'i', 'ɪ', 'ɪ' },
	u = { 'u', 'ʊ', 'ʊ' },
	y = { 'ɨ', 'ɨ', 'ɨ' },
	-- remaining symbols
	['ɛ'] = { 'ɛ', 'ɨ', 'ɨ' },
	['ä'] = { 'æ', 'ɪ', 'ɪ' },
	['ë'] = { 'e', 'ɪ', 'ɪ' },
	['ö'] = { 'ɵ', 'ɪ', 'ɪ' },
	['ü'] = { 'ʉ', 'ʉ', 'ʉ' },
	['ə'] = { 'ə', 'ə', 'ə' },
}

-- Voiced -> voiceless consonant map. export.ipa uses it both before a
-- following voiceless consonant and for word-final consonants.
local devoicing = {
	b = 'p',
	d = 't',
	g = 'k',
	z = 's',
	v = 'f',
	['ž'] = 'š',

	-- keys carrying the ʲ mark (there is no entry for gʲ)
	['bʲ'] = 'pʲ',
	['dʲ'] = 'tʲ',
	['zʲ'] = 'sʲ',
	['vʲ'] = 'fʲ',
	['žʲ'] = 'šʲ',
}

-- Voiceless -> voiced consonant map, applied by export.ipa to a consonant
-- standing before one of b, d, g, z, ž.
local voicing = {
	p = 'b',
	t = 'd',
	k = 'g',
	s = 'z',
	f = 'v',
	['š'] = 'ž',
	c = 'ĵ',
	['č'] = 'ǰ',
	x = 'χ',
}

-- Phonetic respelling rules applied to the raw (not yet transliterated) text.
-- Each key is a pattern and each value its replacement; export.ipa passes
-- every pair to gsub.
-- NOTE(review): export.ipa walks this table with pairs(), whose visiting order
-- is unspecified. Several rules can match overlapping text (e.g. '[дт]с' and
-- 'дс' followed by a vowel), so the result may depend on an order nobody
-- controls - confirm whether an ordered list is needed.
local phon_respellings = {
	-- vowel letters after ш/ж/ц (only when a transliterated-vowel match precedes)
	[vowels_c .. '([шж])ю'] = '%1%2у', [vowels_c .. '([шжц])е'] = '%1%2э', [vowels_c .. '([шжц])и'] = '%1%2ы', [vowels_c .. '([шж])ё'] = '%1%2о́',
	-- д/т + с/з sequences
	['дс' .. cyr_vowels_c] = 'цс%1', ['[дт]з' .. cyr_vowels_c] = 'ĵз%1', ['[дт]с'] = 'ц',
	-- vowel letters after щ/ч
	['([щч])о'] = '%1ё', ['([щч])а'] = '%1я', ['([щч])у'] = '%1ю',
	
	-- д/т before ц or ч
	['([^р])[дт]ц'] = '%1цц', ['[тд]ч'] = 'чч',
	-- -тся / -ться, with and without a preceding accent mark
	['́ть?ся'] = 'цца', ['([^́])ть?ся'] = '%1ца',

	-- soft sign dropped after ш/ж/щ/ч at the end of the text
	['([шжщч])ь$'] = '%1',
	
	-- consonant cluster simplifications
	['рдц'] = 'рц', ['рдч'] = 'рч',
	['здн'] = 'зн', ['здц'] = 'сц',
	['лнц'] = 'нц',	['ндц'] = 'нц',
	['стл'] = 'сл', ['стн'] = 'сн',
	['[сз]ч'] = 'щ', ['сщ'] = 'щ',
	['сш'] = 'шш', ['сж'] = 'жж',
	['н[дт]ск'] = 'нск',
	['[сз]ск'] = 'ск',
	['стск'] = 'ск',	
	['вств'] = 'ств',
	['гк'] = 'хк',
	['ндш'] = 'нш',
	['нтг'] = 'нг',
	-- "го" after any character followed by о/е is respelled "во"
	-- NOTE(review): the pattern is not anchored, so it matches anywhere in the
	-- text, not only at the end of a word - confirm this is intended.
	['(.[ое])го'] = '%1во',
	
	-- Cyrillic э is replaced directly by the symbol ɛ
	['э'] = 'ɛ',
}

-- Two-consonant sequences in which export.ipa marks the first consonant as
-- palatalised: with a plain ʲ for the "compulsory" set and with the
-- bracketed ⁽ʲ⁾ for the "optional" set.
local cons_assim_palatal = {
	compulsory = {
		['stʲ'] = true,
		['zdʲ'] = true,
		['nč'] = true,
		['nǯ'] = true,
	},

	optional = {
		['slʲ'] = true,
		['zlʲ'] = true,
		['snʲ'] = true,
		['znʲ'] = true,
		['tnʲ'] = true,
		['dnʲ'] = true,
		['nsʲ'] = true,
		['nzʲ'] = true,
		['ntʲ'] = true,
		['ndʲ'] = true,
	},
}

-- Prepositions and particles that export.ipa attaches to the following word.
--@Wyang - they may carry the stress too, as alternatives - по́ небу/по не́бу, etc.
-- Only membership is tested, so the set is built from a plain list.
local accentless_prep = {}
for _, particle in ipairs({
	'без', 'близ',
	'в', 'вдоль', 'вне', 'внутрь', 'во', 'вслед',
	'для', 'до',
	'за',
	'из',
	'к', 'ко',
	'меж',
	'на', 'над',
	'о', 'об', 'от',
	'по', 'под', 'пред', 'при', 'про',
	'с', 'сверх', 'свыше', 'сквозь', 'со', 'средь',
	'у',
	-- negative particle
	'не',
}) do
	accentless_prep[particle] = true
end

--- Convert Russian Cyrillic text to an IPA string.
-- The text goes through these stages, in order: phonetic respelling,
-- attachment of prepositions/particles to the following word,
-- transliteration, voicing assimilation, and then per word: syllabification,
-- stress detection, palatalisation, vowel allophony and final devoicing.
-- @param text  Cyrillic string, or a table with an `args` field whose first
--   positional entry is that string.
-- @return string  The converted words of `text`, concatenated.
function export.ipa(text)
	if type(text) == 'table' then text = text.args[1] end
	
	--phonetic respellings
	-- NOTE(review): pairs() gives no ordering guarantee, so overlapping rules
	-- in phon_respellings may be applied in any order.
	for a, b in pairs(phon_respellings) do
		text = gsub(text, a, b)
	end

	--make monosyllabic prepositions liaise with the following word
	local word = mw.text.split(text, " ", true)
	-- Walk only up to the second-to-last word: a preposition with nothing
	-- after it has no word to attach to and is left as it is (previously this
	-- raised an error by concatenating nil).
	local w = 1
	while w < #word do
		if accentless_prep[word[w]] then
			word[w+1] = word[w] .. '‿' .. word[w+1]
			word[w+1] = gsub(word[w+1], '([бдкствхзж])‿и', '%1‿ы')
			table.remove(word, w)
		end
		w = w + 1
	end
	text = table.concat(word, " ")

	--transliterate and tidy up
	text = m_ru_translit.tr(mw.ustring.lower(text))
	text = gsub(text, 'šč', 'ǯǯ')
	text = gsub(text, 'ó', 'o' .. '́')
	
	--rewrite iotated vowels
	text = gsub(text, 'j[aeou]', {
		['ja'] = 'ä',
		['je'] = 'ë',
		['jo'] = 'ö',
		['ju'] = 'ü'})

	--voicing/devoicing assimilations
	text = gsub(text, '([bdgzvž])([ %-%‿]?[ptksčšǯcx])', function(a, b)
		return devoicing[a] .. b end)
	text = gsub(text, '([ptksfšcčx])([ %-%‿]?[bdgzž])', function(a, b)
		return voicing[a] .. b end)

	word = mw.text.split(text, " ", true)
	for i = 1, #word do
		-- syl_conv: converted syllables; pos: indexes of stressed syllables;
		-- stress: the same indexes as a set
		local syl_conv, pos, stress = {}, {}, {}
		local count = 0
		local pron = word[i]
		
		--re-notate orthographic geminate consonants
		pron = gsub(pron, (non_vowels_c) .. '%1', '%1ː')

		--optional iotation of 'e' in a two-vowel sequence and reduction of word-final 'e'
		pron = gsub(pron, '([aäeëɛiyoöuü]́?)ë([^́])', '%1(j)ë%2')
		pron = gsub(pron, 'e$', 'ə')
		pron = gsub(pron, '([aäeëɛəiyoöuüʹ])(́?)[äë]$', '%1%2jə')
		pron = gsub(pron, non_vowels_c .. 'ä$', '%1ʲə')
		pron = gsub(pron, '%(j%)jə', 'jə')

		--syllabify ('/' marks a syllable boundary)
		pron = gsub(pron, 'ʹ([äëöü])', 'ʹ/%1')
		pron = gsub(pron, '([aäeëɛəiyoöuü]́?)', '%1/')
		pron = gsub(pron, '/+$', '')
		pron = gsub(pron, '/([^/aäeëɛəiyoöuü]*)([^/aäeëɛəiyoöuüʹːʲ])(ʹ?ʲ?ː?[aäeëɛəiyoöuü])', '%1/%2%3')
		-- move the boundary leftwards when the consonants form a permitted onset
		pron = gsub(pron, '([^/aäeëɛəiyoöuü]?)([^/aäeëɛəiyoöuü])/([^/aäeëɛəiyoöuüʹːʲ])(ʹ?ʲ?ː?[aäeëɛəiyoöuü])', function(a, b, c, d)
			if perm_syl_onset[a .. b .. c] then
				return '/' .. a .. b .. c .. d
			elseif perm_syl_onset[b .. c] then
				return a .. '/' .. b .. c .. d
			end end)
		pron = gsub(pron, '/([^/aäeëɛəiyoöuü]+)$', '%1')
		
		--remove accent marks from monosyllables
		if len(gsub(pron, non_vowels_c, '')) == 1 then
			pron = gsub(pron, '\204\129', '')
		end
		
		--write syllable indexes of stressed syllables to a table
		local trimmed_pron = pron
		while match(trimmed_pron, '\204\129') do -- U+0301 COMBINING ACUTE ACCENT
			local accent_pos = find(trimmed_pron, '\204\129')
			-- number of '/' before the accent = number of completed syllables
			count = count + len(gsub(sub(trimmed_pron, 1, accent_pos - 1), '[^%/]', ''))
			table.insert(pos, count + 1)
			trimmed_pron = sub(trimmed_pron, accent_pos + 1, -1)
		end
		
		--treated monosyllabic non-prepositions as if accented
		-- (the callback returns nothing, so pron itself is left unchanged)
		pron = gsub(pron, '(.*)' .. vowels_c .. '(.*)', function(a, b, c)
			if not match(a .. c, vowels) then
				table.insert(pos, 1)
			end end)

		--split by syllable
		local syllable = mw.text.split(pron, '/', true)
		if #syllable == 1 then
			table.insert(pos, 1)
		end

		--transform the table of stress positions
		for _, stressed_index in ipairs(pos) do
			stress[stressed_index] = true
		end

		for j = 1, #syllable do
			local syl = syllable[j]

			--remove consonant geminacy if non-initial and non-post-tonic
			if match(syl, 'ː') then
				local keep_length = (j == 1 and not match(syl, 'ː$'))
					or stress[j-1]
					or match((syllable[j-1] or '') .. syl, '^o[cčǰ]ː')
				if not keep_length then
					syl = gsub(syl, '([^ǯ])ː', '%1')
				end
			end

			--assimilative palatalisation of consonants when followed by front vowels
			if match(syl, '^[^cĵšžaäeëɛiyoöuü]*[eiəäëöüʹ]') or match(syl, '^[cĵ][äëü]') then
				syl = gsub(syl, '^([ʺʹ]?)([äëöü])', '%1j%2')
				if not match(syl, 'ʺ') then
					syl = gsub(syl, non_vowels_c .. '([ʹːj]?[aäeëɛəiyoöuüʹ])', function(a, b)
						return gsub(a, '([mnpbtdkgcfvszxrl])', '%1ʲ') .. b end)
				end
			end
			syl = gsub(syl, 'ʺ', '')
			syl = gsub(syl, '(.?ː?)ʹ', function(a)
				if match(a, '[čǰšǯ]') then
					return a
				elseif a ~= 'ʲ' then
					return a .. 'ʲ'
				else
					return 'ʲ'
				end end)
			
			--retraction of front vowels in syllables blocking assimilative palatalisation
			if not match(syl, 'ʲː?' .. vowels) and not match(syl, '[čǰǯ]ː?[ei]') and not match(syl, '^i') then
				syl = gsub(syl, '[ei]', {['e'] = 'ɛ', ['i'] = 'y'})
			end
			
			syl = gsub(syl, '-', '')
			
			--vowel allophony: choose which slot of `allophones` applies
			local slot
			if stress[j] or (j == #syllable and match(syllable[j-1] .. syllable[j], '[aieäëü]́?o')) then
				-- replace the accent mark by a stress mark at the syllable start
				syl = gsub(syl, '(.*)́', 'ˈ%1')
				syl = gsub(syl, '([ʲčǰǯ]ː?)o', '%1ö')
				slot = 1
			elseif stress[j+1] or (j == 1 and match(syl, '^' .. vowels)) then
				slot = 2
			else
				slot = 3
			end
			syl = gsub(syl, vowels_c, function(a)
				return allophones[a][slot] end)
			
			syl_conv[j] = syl
		end

		pron = table.concat(syl_conv, "")
		
		--long vowels
		pron = gsub(pron, '[ɐə]ɐ(%l?)ˈ', 'ɐː%1ˈ')
		pron = gsub(pron, 'əə', 'əː')
		
		--consonant assimilative palatalisation
		pron = gsub(pron, '([szntd])(ˈ?)([tdčǰǯlnsz]ʲ?)', function(a, b, c)
			if cons_assim_palatal['compulsory'][a..c] then
				return a .. 'ʲ' .. b .. c
			elseif cons_assim_palatal['optional'][a..c] then
				return a .. '⁽ʲ⁾' .. b .. c
			end end)
		
		--fronting of stressed 'a' between soft consonants
		pron = gsub(pron, 'ˈ(..?.?)a(...)', function(a, b)
			if match(a, '[ʲčǰǯ]') and match(b, '[ʲčǰǯ]') then
				return 'ˈ' .. a .. 'æ' .. b
			end end)

		--final devoicing
		pron = gsub(pron, '([bdgzvž]ʲ?)$', devoicing)

		pron = gsub(pron, '[cčĵǰšžǯχ]', translit_conv)
		-- word[i] still holds the transliterated form at this point
		if match(word[i], 'sä$') then
			pron = gsub(pron, 'sʲə$', 's⁽ʲ⁾ə')
		end
		word[i] = pron
	end
	
	-- NOTE(review): the words are joined with an empty separator, so the
	-- spaces between the words of a phrase do not appear in the result -
	-- confirm whether " " was intended.
	return table.concat(word, "")
end

return export