Module:grc-translit: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
m unused variable
moved from Module:grc-translit/sandbox; commented out code that is not yet agreed on, except for xi = ks
Line 3: Line 3:
local m_data = mw.loadData("Module:grc-utilities/data")
local m_data = mw.loadData("Module:grc-utilities/data")
local chars = m_data.named
local chars = m_data.named
local gsub = mw.ustring.gsub


local tt = {
local tt = {
Line 10: Line 8:
["α"] = "a",
["α"] = "a",
["ε"] = "e",
["ε"] = "e",
["η"] = "ē",
["η"] = "e"..chars.macron,
["ι"] = "i",
["ι"] = "i",
["ο"] = "o",
["ο"] = "o",
["υ"] = "u",
["υ"] = "u",
["ω"] = "ō",
-- ["υ"] = "y",
["ω"] = "o"..chars.macron,


-- Consonants
-- Consonants
Line 26: Line 25:
["μ"] = "m",
["μ"] = "m",
["ν"] = "n",
["ν"] = "n",
["ξ"] = "x",
["ξ"] = "ks",
["π"] = "p",
["π"] = "p",
["ρ"] = "r",
["ρ"] = "r",
Line 58: Line 57:
}
}


local diacritics = m_data.all
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local match = mw.ustring.match


function export.tr(text, lang, sc)
local function get(text, index)
return sub(text, index, index)
-- If the script is given as Cprt, then forward the transliteration to that module
end
if sc == "Cprt" then

return require("Module:Cprt-translit").tr(text, lang, sc)
-- Concatenate or insert.
local function add(list, index, text)
if list[index] then
list[index] = list[index] .. text
else
list[index] = text
end
end
end


local i_diphthong = "[ΑΕΗΟΥΩαεηουω]ι"
-- decompose text
local u_diphthong = "[ΑΕΗΟΩαεηοω]υ"
text = mw.ustring.toNFD(text)
local diacritic = m_data.all

-- tokenize
local function is(char, X)
return match(char, '^' .. X .. '$')
end

local function getTokens(text)
if type(text) ~= "string" then
error("Text is not a string", 2)
end
local tokens = {}
local tokens = {}
-- token tracks our position in the table of tokens.
local prev = 0
local token = 1
for i = 1, mw.ustring.len(text) do
local char = mw.ustring.sub(text, i, i)
while mw.ustring.len(text) > 0 do
local char1 = get(text, 1)
if char == 'ι' and tokens[prev] and mw.ustring.match(tokens[prev], '[ΑΕΗΟΥΩαεηουω]') then
local char2 = get(text, 2)
tokens[prev] = tokens[prev]..'ι'
local twoChars = char2 and char1 .. char2
elseif char == 'υ' and tokens[prev] and mw.ustring.match(tokens[prev], '[ΑΕΗΟΩαεηοω]') then
-- Look for a diacritic and add it to the current token. Remove it from the text.
tokens[prev] = tokens[prev]..'υ'
elseif mw.ustring.match(char, diacritics) then
if char1 and is(char1, diacritic) then
tokens[prev] = tokens[prev]..char
add(tokens, token, char1)
text = sub(text, 2)
-- Look for a diphthong and add it to the current token. Remove it from the text.
elseif twoChars and (is(twoChars, u_diphthong) or is(twoChars, i_diphthong)) then
add(tokens, token, twoChars)
text = sub(text, 3)
else
else
-- Add the current character to the next token. Remove it from the text.
prev = prev + 1
tokens[prev] = char
token = token + 1
add(tokens, token, char1)
text = sub(text, 2)
end
end
end
end
--now read the tokens
return tokens
end
out = ''

local function interpret(tokens)
if type(tokens) ~= "table" then
error("Tokens is not a table", 2)
end
out = {}
for i, token in pairs(tokens) do
for i, token in pairs(tokens) do
t = mw.ustring.gsub(mw.ustring.lower(token), '.', function(x) return tt[x] end)
t = gsub(mw.ustring.lower(token), '.', function(x) return tt[x] end)
-- elseif is misleading (these are independent) but it's more concise this way
-- elseif is misleading (these are independent) but it's more concise this way
--[[if #token > 1 then
if token == 'γ' and tokens[i+1] and mw.ustring.match(tokens[i+1], '[κγχξ]') then
if match(token, chars.macron .. '.*' .. chars.diaeresis) then
t = gsub(t, chars.diaeresis, '')
elseif match(token, u_diphthong) and not match(token, chars.diaeresis) then
t = gsub(t, 'y', 'u')
end
end]]
if token == 'γ' and tokens[i+1] and match(tokens[i+1], '[κγχξ]') then
t = 'n'
t = 'n'
elseif token == 'ρ' and tokens[i-1] and tokens[i-1] == 'ρ' then
elseif token == 'ρ' and tokens[i-1] and tokens[i-1] == 'ρ' then
t = 'rh'
t = 'rh'
elseif mw.ustring.match(token, '[ΑΕΗΟΩαεηοω]υ') or mw.ustring.match(token, '[Υυ]ι') then
elseif match(token, '[αΑ].*' .. chars.subscript) then
t = gsub(t, 'y', 'u')
elseif mw.ustring.match(token, '[αΑ].*' .. chars.subscript) then
t = gsub(t, '([aA])', '%1' .. chars.macron)
t = gsub(t, '([aA])', '%1' .. chars.macron)
end
end
--[[ for moving an acute to the offglide of a long diphthong
if mw.ustring.match(token, chars.rough) then
if match(token, chars.subscript) and match(token, chars.acute) then
t = gsub(t, '(' .. chars.acute .. ')(i)', '%2%1')
if mw.ustring.match(token, '[Ρρ]') then
end
]]
if match(token, chars.rough) then
if match(token, '[Ρρ]') then
-- Could add Ϝϝ, but such words would likely be written with ϝη
t = t .. 'h'
t = t .. 'h'
else
else
Line 110: Line 150:
end
end
if match(t, chars.Latin_circum) then
t = mw.ustring.toNFD(t) -- we can't manually enter them as e/o + macron in the table because it'll recombine apparently
if mw.ustring.match(t, chars.Latin_circum) then
t = gsub(t, chars.macron, '')
t = gsub(t, chars.macron, '')
end
end
if token ~= mw.ustring.lower(token) then
if token ~= mw.ustring.lower(token) then
t = mw.ustring.upper(mw.ustring.sub(t, 1, 1) ) .. mw.ustring.lower(mw.ustring.sub(t, 2) )
t = mw.ustring.upper(get(t, 1) ) .. mw.ustring.lower(mw.ustring.sub(t, 2) )
end
end
out = out .. t
table.insert(out, t)
end
end
return out
return table.concat(out)
end

function export.tr(text, lang, sc)
-- If the script is given as Cprt, then forward the transliteration to that module
if sc == "Cprt" then
return require("Module:Cprt-translit").tr(text, lang, sc)
end

-- decompose text
text = mw.ustring.toNFD(text)
tokens = getTokens(text)
return interpret(tokens)
end
end



Revision as of 22:49, 30 January 2017

This module transliterates Ancient Greek text per WT:GRC TR. It is also used to transliterate Demotic, Greek, Paeonian, Old Ossetic, Oscan, Dacian, Ancient Macedonian, and Phrygian. Preferably, the module should not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

8 of 36 tests failed. (refresh)

TextExpectedActual
testcases for tr function in Module:grc-translit:
Passedλόγοςlógoslógos
Failedσφίγξsphínxsphínks
Failedϝάναξwánaxwánaks
Passedοἷαιhoîaihoîai
current problems
FailedΙΧΘΥΣIKHTHUSIKhThUS
FailedΥἱός'''Hu'''iós'''U'''hiós
u/y
Passedταῦροςtaûrostaûros
Failedνηῦςnēûsneûs
Passedσῦςsûssûs
Passedὗςhûshûs
Passedγυῖονguîonguîon
Passedἀναῡ̈τέωanaṻtéōanaṻtéō
Passedδαΐφρωνdaḯphrōndaḯphrōn
vowel length
Passedτῶνtôntôn
Passedτοὶtoìtoì
Passedτῷtôitôi
Passedτούτῳtoútōitoútōi
Passedσοφίᾳsophíāisophíāi
Passedμᾱ̆νόςmānósmānós
h (rough breathing)
Passedhoho
Passedοἱhoihoi
Passedεὕρισκεheúriskeheúriske
Passedὑϊκόςhuïkóshuïkós
Passedπυρρόςpurrhóspurrhós
Passedῥέωrhéōrhéō
Passedσάἁμονsáhamonsáhamon
capitals
PassedὈδυσσεύςOdusseúsOdusseús
PassedΕἵλωςHeílōsHeílōs
PassedᾍδηςHā́idēsHā́idēs
Passedἡ Ἑλήνηhē Helḗnēhē Helḗnē
punctuation
Failedἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ;
Failedτί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?tí tēnikáde aphîksai, ô Krítōn; ḕ ou prṑi éti estín;
Failedτούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.toútōn phōnḗenta mén estin heptá· a e ē i o u ō.
Passedπήγ(νῡμῐ)pḗg(nūmi)pḗg(nūmi)
HTML entities
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós

local export = {}

local m_data = mw.loadData("Module:grc-utilities/data")
local chars = m_data.named

-- Transliteration table: maps one lowercase Greek letter or one combining
-- diacritic to the Latin text that replaces it. Characters without an entry
-- are left untouched by the caller's gsub.
-- The diacritic characters themselves come from m_data.named; the code points
-- in the comments are the values noted by the original author.
local tt = {
	-- Vowels
	['α'] = 'a',
	['ε'] = 'e',
	['η'] = 'e' .. chars.macron,
	['ι'] = 'i',
	['ο'] = 'o',
	['υ'] = 'u', -- alternative under discussion: 'y'
	['ω'] = 'o' .. chars.macron,

	-- Consonants
	['β'] = 'b',
	['γ'] = 'g',
	['δ'] = 'd',
	['ζ'] = 'z',
	['θ'] = 'th',
	['κ'] = 'k',
	['λ'] = 'l',
	['μ'] = 'm',
	['ν'] = 'n',
	['ξ'] = 'ks',
	['π'] = 'p',
	['ρ'] = 'r',
	['σ'] = 's',
	['ς'] = 's',
	['τ'] = 't',
	['φ'] = 'ph',
	['χ'] = 'kh',
	['ψ'] = 'ps',

	-- Archaic letters
	['ϝ'] = 'w',
	['ϻ'] = 'ś',
	['ϙ'] = 'q',
	['ϡ'] = 'š',
	['ͷ'] = 'v',

	-- Diacritics: kept, dropped (mapped to ''), or replaced
	[chars.macron]    = chars.macron,       -- macron (304): kept
	[chars.breve]     = '',                 -- breve (306): dropped
	[chars.smooth]    = '',                 -- psili (313): dropped
	[chars.rough]     = '',                 -- dasia (314): dropped here
	[chars.diaeresis] = chars.diaeresis,    -- trema (308): kept
	[chars.grave]     = chars.grave,        -- grave (300): kept
	[chars.acute]     = chars.acute,        -- acute (301): kept
	[chars.circum]    = chars.Latin_circum, -- circumflex (342): replaced
	[chars.subscript] = 'i',                -- hypogegrammene (345): written as i

	-- For internal processing of diaeresis
	['+'] = '',
}

local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local match = mw.ustring.match

-- Return the single character of `text` found at character position `index`,
-- using the Unicode-aware `sub` alias (mw.ustring.sub).
local function get(text, index)
	local char = sub(text, index, index)
	return char
end

-- Append `text` to the string already stored at list[index]; when that slot
-- is empty, store `text` there instead. Modifies `list` in place.
local function add(list, index, text)
	local existing = list[index]
	-- A concatenation result is always a (truthy) string, so and/or is safe here.
	list[index] = existing and (existing .. text) or text
end

-- Patterns for a two-character sequence: a vowel followed by ι.
local i_diphthong = "[ΑΕΗΟΥΩαεηουω]ι"
-- Same, for a vowel followed by υ (υ itself is not allowed as the first vowel).
local u_diphthong = "[ΑΕΗΟΩαεηοω]υ"
-- Pattern used to recognise a single diacritic character.
-- NOTE(review): presumably a character class of all combining diacritics;
-- confirm against Module:grc-utilities/data.
local diacritic = m_data.all

-- Test whether `char` matches the pattern `X` in its entirety. The pattern is
-- anchored at both ends; the result of `match` is returned as-is (nil when
-- there is no match).
local function is(char, X)
	local anchored = '^' .. X .. '$'
	return match(char, anchored)
end

-- Split decomposed (NFD) Greek text into tokens.
--
-- Each token is one base character or one two-vowel diphthong, followed by
-- any diacritics that come directly after it.
--
-- @param text  string to tokenize; expected to be NFD-normalised by the caller
-- @return      table of token strings, indexed consecutively from 1
-- Raises an error (reported at the caller's position) if `text` is not a string.
local function getTokens(text)
	if type(text) ~= "string" then
		error("Text is not a string", 2)
	end

	local tokens = {}
	-- Index of the token currently being built; 0 means no token exists yet.
	local count = 0
	local length = mw.ustring.len(text)
	-- Walk the text by character position instead of repeatedly slicing off
	-- the front of the string, which copied the remainder on every step.
	local i = 1
	while i <= length do
		local char1 = get(text, i)
		-- At the last character this is a single character and therefore
		-- cannot match the (anchored, two-character) diphthong patterns.
		local twoChars = sub(text, i, i + 1)

		if is(char1, diacritic) then
			-- A diacritic belongs to the token before it. If the text starts
			-- with a diacritic there is no such token, so it opens token 1.
			if count == 0 then
				count = 1
			end
			add(tokens, count, char1)
			i = i + 1
		elseif is(twoChars, u_diphthong) or is(twoChars, i_diphthong) then
			-- A diphthong starts a token of its own. Appending it to the
			-- previous token would glue it to a preceding consonant or space,
			-- so that a rough breathing on the diphthong put the "h" in front
			-- of that consonant/space, and "ρ" + diphthong no longer equalled
			-- the bare "ρ" the caller compares against.
			count = count + 1
			tokens[count] = twoChars
			i = i + 2
		else
			-- Any other character starts a new token.
			count = count + 1
			tokens[count] = char1
			i = i + 1
		end
	end

	return tokens
end

-- Turn a table of Greek tokens into the transliterated Latin string.
--
-- Every token is lowercased and mapped character by character through `tt`,
-- then adjusted for context (γ before a velar, doubled ρ, iota subscript on α,
-- rough breathing, circumflex, capitalisation) and appended to the output.
--
-- @param tokens  table of token strings with numeric indices
-- @return        the concatenated transliteration
-- Raises an error (reported at the caller's position) if `tokens` is not a table.
local function interpret(tokens)
	if type(tokens) ~= "table" then
		error("Tokens is not a table", 2)
	end

	-- The output depends on token order, and neighbouring tokens are looked up
	-- by index. The table is not guaranteed to start at index 1, so ipairs is
	-- not usable, and pairs visits keys in an unspecified order. Collect the
	-- indices and sort them instead.
	local indices = {}
	for index in pairs(tokens) do
		indices[#indices + 1] = index
	end
	table.sort(indices)

	local out = {}
	for _, i in ipairs(indices) do
		local token = tokens[i]
		-- Characters without an entry in tt are kept unchanged.
		local t = gsub(mw.ustring.lower(token), '.', tt)

		-- elseif is misleading (these are independent) but it's more concise this way
	--[[if #token > 1 then
			if match(token, chars.macron .. '.*' .. chars.diaeresis) then
				t = gsub(t, chars.diaeresis, '')
			elseif match(token, u_diphthong) and not match(token, chars.diaeresis) then
				t = gsub(t, 'y', 'u') 
			end
		end]]
		if token == 'γ' and tokens[i+1] and match(tokens[i+1], '[κγχξ]') then
			-- γ before a velar is written n.
			t = 'n'
		elseif token == 'ρ' and tokens[i-1] and tokens[i-1] == 'ρ' then
			-- The second ρ of a doubled ρρ is written rh.
			t = 'rh'
		elseif match(token, '[αΑ].*' .. chars.subscript) then
			-- α with iota subscript: mark the a with a macron.
			t = gsub(t, '([aA])', '%1' .. chars.macron)
		end
		--[[ for moving an acute to the offglide of a long diphthong
		if match(token, chars.subscript) and match(token, chars.acute) then
			t = gsub(t, '(' .. chars.acute .. ')(i)', '%2%1')
		end
		]]
		if match(token, chars.rough) then
			if match(token, '[Ρρ]') then
			-- Could add Ϝϝ, but such words would likely be written with ϝη
				t = t .. 'h'
			else
				t = 'h' .. t
			end
		end

		-- A circumflex replaces the macron on the same token.
		if match(t, chars.Latin_circum) then
			t = gsub(t, chars.macron, '')
		end

		-- A token containing an uppercase character gets only its first output
		-- character capitalised, so a capital Χ comes out as "Kh", not "KH".
		if token ~= mw.ustring.lower(token) then
			t = mw.ustring.upper(get(t, 1)) .. mw.ustring.lower(sub(t, 2))
		end

		out[#out + 1] = t
	end
	return table.concat(out)
end

-- Transliterate `text` into Latin script.
--
-- @param text  the text to transliterate
-- @param lang  language code; only passed through to the Cprt module
-- @param sc    script code; "Cprt" forwards the call to Module:Cprt-translit
-- @return      the transliterated string
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module
	if sc == "Cprt" then
		return require("Module:Cprt-translit").tr(text, lang, sc)
	end

	-- Decompose the text so that every diacritic is a separate character,
	-- which is what the tokenizer expects.
	local decomposed = mw.ustring.toNFD(text)

	-- Kept local so the token table does not leak into the global environment.
	local tokens = getTokens(decomposed)
	return interpret(tokens)
end

return export