Module:mr-Modi-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module transliterates Marathi language text written in the Modi script, per WT:MR TR. It is also used to transliterate Varhadi. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:mr-Modi-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Table of functions exported by this module; it is returned at the end of the file.
local export = {}

-- `u` comes from Module:string/char; it is called below with a code point number.
local u = require("Module:string/char")
-- Local aliases for the mw.ustring pattern functions used throughout the module.
local gsub = mw.ustring.gsub
local find = mw.ustring.find

-- Built from code point U+200D; used below as the tail of one key in `conv`.
local ZWJ = u(0x200D)

-- Lookup table from source-script strings to their Latin transliteration.
-- export.tr applies it with gsub(text, '.', conv), i.e. one character at a time;
-- characters without an entry are left unchanged by that call.
-- NOTE(review): because that lookup passes a single character, the keys below that
-- are longer than one character cannot be reached from the code visible in this
-- file -- confirm whether they are still wanted.
-- NOTE(review): in this copy several keys look identical to each other (for
-- example the keys paired with 'kh', 'g' and 't'); only one entry per distinct
-- key survives in a table constructor, so this may indicate characters lost in
-- transit -- verify against the canonical module text.
local conv = {
	-- consonants
	['๐‘˜Ž']='k', ['๐‘˜']='kh', ['๐‘˜']='g', ['๐‘˜‘']='gh', ['๐‘˜’']='แน…',
	['๐‘˜“']='c', ['๐‘˜”']='ch', ['๐‘˜•']='j', ['๐‘˜–']='jh', ['๐‘˜—']='รฑ', 
	['๐‘˜˜']='แนญ', ['๐‘˜™']='แนญh', ['๐‘˜š']='แธ', ['๐‘˜›']='แธh', ['๐‘˜œ']='แน‡', 
	['๐‘˜']='t', ['๐‘˜ž']='th', ['๐‘˜Ÿ']='d', ['๐‘˜ ']='dh', ['๐‘˜ก']='n', 
	['๐‘˜ข']='p', ['๐‘˜ฃ']='ph', ['๐‘˜ค']='b', ['๐‘˜ฅ']='bh', ['๐‘˜ฆ']='m',
	['๐‘˜ง']='y', ['๐‘˜จ']='r', ['๐‘˜ฉ']='l', ['๐‘˜ช']='v', ['๐‘˜ฏ']='แธท',
	['๐‘˜ซ']='ล›', ['๐‘˜ฌ']='แนฃ', ['๐‘˜ญ']='s', ['๐‘˜ฎ']='h',
	-- key is a two-character string followed by ZWJ
	['๐‘˜จ๐‘˜ฟ'..ZWJ] = 'r',
	-- ['๐‘˜•๐‘˜ฟ๐‘˜—'] = 'dny',

	-- vowel diacritics
	----  only in script charts: ['๐‘˜ฑ'] = 'i', ['๐‘˜ด'] ='ลซ',
	['๐‘˜ณ'] = 'u', ['๐‘˜น'] = 'e', ['๐‘˜ป'] = 'o', 
	['๐‘˜ฐ'] = 'ฤ', ['๐‘˜ฒ'] = 'ฤซ',
	['๐‘˜ต'] = 'ru',
	['๐‘˜บ'] = 'ai', ['๐‘˜ผ'] = 'au',
	-- ['๐‘˜ฐ๐‘™€'] = 'ล',
	['๐‘™€'] = 'ฤ•',

	-- vowel signs
	----  only in script charts: ['๐‘˜‚'] = 'i', ['๐‘˜…'] ='ลซ',
	['๐‘˜€'] = 'a', ['๐‘˜„'] = 'u', ['๐‘˜Š'] = 'e', ['๐‘˜Œ'] = 'o',
	['๐‘˜'] = 'ฤ', ['๐‘˜ƒ'] = 'ฤซ',
	['๐‘˜†'] = 'ล•', 
	['๐‘˜‹'] = 'ai', ['๐‘˜'] = 'au', 
	['๐‘˜๐‘™€'] = 'ล',
	['๐‘˜€๐‘™€'] = 'ฤ•', ['๐‘˜Š๐‘™€'] = 'ฤ•',
	
	['๐‘˜Œ๐‘˜ฆ๐‘˜ฟ'] = 'om',
	
	-- chandrabindu
	--- ['๐‘™€๐‘˜ฝ'] = 'ฬƒ',
	
	-- anusvara
	['๐‘˜ฝ'] = 'แน',
	
	-- visarga
	['๐‘˜พ'] = 'แธฅ',
	
	-- virama (maps to the empty string, i.e. it is removed from the output)
	['๐‘˜ฟ'] = '',
	
	-- numerals
	['๐‘™'] = '0', ['๐‘™‘'] = '1', ['๐‘™’'] = '2', ['๐‘™“'] = '3', ['๐‘™”'] = '4',
	['๐‘™•'] = '5', ['๐‘™–'] = '6', ['๐‘™—'] = '7', ['๐‘™˜'] = '8', ['๐‘™™'] = '9',
	
	-- punctuation
	['๐‘™'] = '.', -- danda
	['๐‘™‚'] = '.', -- double danda
	['+'] = '', -- compound separator
	
	-- abbreviation sign
	['๐‘™ƒ'] = '.',
}

-- Replacement text for an anusvara, keyed by a neighbouring character.
-- export.tr indexes this table with the character it captures immediately before
-- the anusvara in the *reversed* word, and falls back to "n" when the key is absent.
-- The chosen value is inserted into the word before the final per-character
-- `conv` lookup runs.
-- NOTE(review): one value is the Latin letter 'i' while all other values are
-- source-script strings -- confirm this is intentional.
local nasal_assim = {
	['๐‘˜Ž'] = '๐‘˜’', ['๐‘˜'] = '๐‘˜’', ['๐‘˜'] = '๐‘˜’', ['๐‘˜‘'] = '๐‘˜’', 
	['๐‘˜“'] = '๐‘˜—', ['๐‘˜”'] = '๐‘˜—', ['๐‘˜•'] = '๐‘˜—', ['๐‘˜–'] = '๐‘˜—',  
	['๐‘˜˜'] = '๐‘˜œ', ['๐‘˜™'] = '๐‘˜œ', ['๐‘˜š'] = '๐‘˜œ', ['๐‘˜›'] = '๐‘˜œ',
	['๐‘˜ข'] = '๐‘˜ฆ', ['๐‘˜ฃ'] = '๐‘˜ฆ', ['๐‘˜ค'] = '๐‘˜ฆ', ['๐‘˜ฅ'] = '๐‘˜ฆ', ['๐‘˜ฆ'] = '๐‘˜ฆ',
	['๐‘˜ง'] = 'i', ['๐‘˜จ'] = '๐‘˜„', ['๐‘˜ฉ'] = '๐‘˜ฉ', ['๐‘˜ช'] = '๐‘˜„',
	['๐‘˜ซ'] = '๐‘˜„', ['๐‘˜ฌ'] = '๐‘˜„', ['๐‘˜ญ'] = '๐‘˜„', ['๐‘˜ฎ'] = '๐‘˜„',
}

-- Set (string -> true) of three multi-character sequences.
-- NOTE(review): nothing in the code visible in this file reads this table;
-- confirm whether it is still needed before relying on or removing it.
local perm_cl = {
	['๐‘˜ฆ๐‘˜ฟ๐‘˜ฉ'] = true, ['๐‘˜ช๐‘˜ฟ๐‘˜ฉ'] = true, ['๐‘˜ก๐‘˜ฟ๐‘˜ฉ'] = true,
	
}

-- Character inventories. Each string is spliced into a [...] character class
-- below and in export.tr, so the order of characters inside a string is irrelevant.
-- `all_cons` is the consonant inventory; `special_cons` is not referenced by the
-- code visible in this file.
local all_cons, special_cons = '๐‘˜Ž๐‘˜๐‘˜๐‘˜‘๐‘˜’๐‘˜“๐‘˜”๐‘˜•๐‘˜–๐‘˜—๐‘˜˜๐‘˜™๐‘˜š๐‘˜›๐‘˜๐‘˜ž๐‘˜Ÿ๐‘˜ ๐‘˜ข๐‘˜ฃ๐‘˜ค๐‘˜ฅ๐‘˜ซ๐‘˜ฌ๐‘˜ญ๐‘˜ง๐‘˜จ๐‘˜ฉ๐‘˜ช๐‘˜ฎ๐‘˜œ๐‘˜ก๐‘˜ฆ๐‘˜ฏ', '๐‘˜Ÿ๐‘˜๐‘˜ง๐‘˜จ๐‘˜ฉ๐‘˜ช๐‘˜ฎ๐‘˜ก๐‘˜ฆ'
-- `vowel` starts with an escaped literal '*' and the Latin 'a' that export.tr
-- inserts after bare consonants; `vowel_sign` is the second vowel inventory.
local vowel, vowel_sign = '%*a๐‘˜ฑ๐‘˜ณ๐‘˜ต๐‘˜น๐‘˜ป๐‘˜ฐ๐‘˜ฒ๐‘˜ด๐‘˜บ๐‘˜ผ๐‘™€', '๐‘˜€๐‘˜‚๐‘˜„๐‘˜Š๐‘˜Œ๐‘˜๐‘˜ƒ๐‘˜…๐‘˜†๐‘˜‹๐‘˜๐‘˜€๐‘™€'
-- Matched against the REVERSED word in export.tr:
--   (vowel)(consonant) a (consonant)(optional anusvara + vowel)
-- export.tr replaces a match with '%1%2%3%4', i.e. it drops the inserted 'a'
-- that sits between the two consonants.
-- The optional class must hold the same anusvara literal that `conv` and the
-- other patterns in this file use: words are extracted with a class limited to
-- this script's range plus 'a', so an anusvara written in any other script could
-- never occur in the word and the optional part would be dead.
local syncope_pattern = '([' .. vowel .. vowel_sign .. '])([' .. all_cons .. '])a([' .. all_cons .. '])([๐‘˜ฝ]?[' .. vowel .. vowel_sign .. '])'

-- Returns `text` with its UTF-8 characters in reverse order.
-- The string is split with a byte-level pattern: one lead byte (NUL via %z,
-- \1-\127, or \194-\244) followed by any run of continuation bytes (\128-\191).
local function rev_string(text)
	local pieces = {}
	for piece in string.gmatch(text, "[%z\1-\127\194-\244][\128-\191]*") do -- UTF-8 character pattern
		pieces[#pieces + 1] = piece
	end
	-- Reversal of the collected array is delegated to Module:table.
	local reversed = require("Module:table").reverse(pieces)
	return table.concat(reversed)
end

-- Transliterates `text` into Latin script.
--
-- Parameters:
--   text  string to transliterate.
--   lang  language code; accepted for interface compatibility, not read here.
--   sc    script code; accepted for interface compatibility, not read here.
-- Returns: the transliterated string, passed through mw.ustring.toNFC.
--
-- Steps, in order:
--   1. An anusvara that follows a non-vowel and precedes a space or the end of
--      the text is replaced by another character.
--   2. Every consonant not followed by a vowel mark or virama gets an explicit 'a'.
--   3. Each word is reversed, has selected 'a's removed (word-final and
--      syncope), has its anusvaras resolved, and is reversed back.
--   4. Remaining characters are mapped one by one through `conv`, followed by a
--      few Latin-level clean-ups.
function export.tr(text, lang, sc)
	-- text = gsub(text, 'เคพเค', 'เฅ‰' .. 'เค‚')
	-- text = gsub(text, 'เค', 'เฅ…' .. 'เค‚')
	text = gsub(text, '([^' .. vowel .. vowel_sign .. '])๐‘˜ฝ ', '%1๐‘˜€ ')
	text = gsub(text, '([^' .. vowel .. vowel_sign .. '])๐‘˜ฝ$', '%1๐‘˜€')
	-- Make the inherent vowel explicit: consonant + nothing -> consonant + 'a'.
	text = gsub(text, '([' .. all_cons .. '])([' .. vowel .. '๐‘˜ฟ]?)', function(c, d)
		return c .. (d == "" and 'a' or d) end)
	-- Process every word exactly once, in place. Writing the result back through
	-- the gsub callback (instead of a second gsub keyed on the word's text) means
	-- a word that is also a substring of another word cannot overwrite part of it.
	text = gsub(text, "[๐‘˜€-๐‘™™a]+", function(word)
		-- Work on the reversed word so that "end of word" rules anchor with '^'.
		word = rev_string(word)
		-- Drop a word-final 'a' when the word continues with consonant + vowel.
		word = gsub(word, '^a([' .. all_cons .. '][' .. vowel .. vowel_sign .. '])', '%1')
		-- Repeat until no syncope site is left; each pass removes at least one 'a'.
		while find(word, syncope_pattern) do
			word = gsub(word, syncope_pattern, '%1%2%3%4')
		end
		-- `succ` precedes the anusvara in the reversed word (so it follows it in
		-- the original); `prev` follows it in the reversed word.
		word = gsub(word, '(.?)๐‘˜ฝ(.)', function(succ, prev)
			local nasal
			if succ .. prev == "a" then
				-- anusvara at the very end of the word, right after an 'a'
				nasal = "๐‘˜ฟ๐‘˜ฆ"
			elseif succ == "" and find(prev, '[' .. vowel .. ']') then
				-- anusvara at the very end of the word, after another vowel
				nasal = "ฬƒ"
			else
				nasal = nasal_assim[succ] or "n"
			end
			return succ .. nasal .. prev
		end)
		return rev_string(word)
	end)
	-- Single-character lookup; characters without a `conv` entry stay as they are.
	text = gsub(text, '.', conv)
	text = gsub(text, 'a([iu])ฬƒ', 'aอ %1')
	text = gsub(text, 'aa', 'a')
	text = gsub(text, 'รฑjรฑ', 'ndny')
	text = gsub(text, 'jรฑ', 'dny')
	return mw.ustring.toNFC(text)
end

-- Hand the module table (holding `tr`) back to whoever requires this module.
return export