Module:grc-translit: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
whoops
using variables for diacritics makes code more readable
Line 1: Line 1:
local export = {}
local export = {}

local PSILI = mw.ustring.char(0x313)
local DASIA = mw.ustring.char(0x314)
local SUBSCRIPT = mw.ustring.char(0x345)

local TREMA = mw.ustring.char(0x308)

local GRAVE = mw.ustring.char(0x300)
local ACUTE = mw.ustring.char(0x301)
local GREEKCIRCUMFLEX = mw.ustring.char(0x342)
local CIRCUMFLEX = mw.ustring.char(0x302)

local MACRON = mw.ustring.char(0x304)
local BREVE = mw.ustring.char(0x306)


local tt = {
local tt = {
Line 39: Line 53:
-- Diacritics
-- Diacritics
['̄'] = '̄', -- macron 304
[MACRON] = MACRON,
['̆'] = '', -- breve 306
[BREVE] = '',
['̓'] = '', -- psili 313
[PSILI] = '',
['̔'] = '', -- dasia 314
[DASIA] = '',
['̈'] = '̈', -- trema 308
[TREMA] = TREMA,
['̀'] = '̀', -- grave 300
[GRAVE] = GRAVE,
['́'] = '́', -- acute 301
[ACUTE] = ACUTE,
[GREEKCIRCUMFLEX] = CIRCUMFLEX,
['͂'] = '̂', -- circumflex 342
['ͅ'] = 'i', -- hypogegrammene 345
[SUBSCRIPT] = 'i',
-- For internal processing of diaeresis
-- For internal processing of diaeresis
Line 53: Line 67:
}
}


local diacritics = PSILI..DASIA..SUBSCRIPT..MACRON..BREVE..TREMA..GRAVE..ACUTE..GREEKCIRCUMFLEX
local diacritics = '[̄̆̓̔̈̀́͂ͅ]'


function export.tr(text, lang, sc)
function export.tr(text, lang, sc)
Line 66: Line 80:
text = mw.ustring.toNFD(text)
text = mw.ustring.toNFD(text)
text = gsub(text,'([ιυ])([̄̆]?)̈','+%1%')
text = gsub(text,'([ιυ])(['..BREVE..MACRON..']?)'..TREMA,'+%1%2'..TREMA)
--tokenize
--tokenize
Line 97: Line 111:
elseif mw.ustring.match(token,'[ΑΕΗΟΩαεηοω]υ') or mw.ustring.match(token,'[Υυ]ι') then
elseif mw.ustring.match(token,'[ΑΕΗΟΩαεηοω]υ') or mw.ustring.match(token,'[Υυ]ι') then
t = mw.ustring.gsub(t,'y','u')
t = mw.ustring.gsub(t,'y','u')
elseif mw.ustring.match(token,'[αΑ].*ͅ') then
elseif mw.ustring.match(token,'[αΑ].*'..SUBSCRIPT) then
t = mw.ustring.gsub(t,'([aA])','%')
t = mw.ustring.gsub(t,'([aA])','%1'..MACRON)
end
end
if mw.ustring.match(token,'̔') then
if mw.ustring.match(token,DASIA) then
if mw.ustring.match(token,'[Ρρ]') then
if mw.ustring.match(token,'[Ρρ]') then
t = t .. 'h'
t = t .. 'h'
Line 110: Line 124:
t = mw.ustring.toNFD(t) -- we can't manually enter them as e/o + macron in the table because it'll recombine apparently
t = mw.ustring.toNFD(t) -- we can't manually enter them as e/o + macron in the table because it'll recombine apparently
if mw.ustring.match(t,'̂') then
if mw.ustring.match(t,CIRCUMFLEX) then
t = mw.ustring.gsub(t,'̄','')
t = mw.ustring.gsub(t,MACRON,'')
end
end

Revision as of 20:53, 7 January 2017

This module will transliterate Ancient Greek language text per WT:GRC TR. It is also used to transliterate Demotic, Greek, Paeonian, Old Ossetic, Oscan, Dacian, Ancient Macedonian, and Phrygian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc and in the language specified by the code lang.
When the transliteration fails, returns nil.

18 of 36 tests failed. (refresh)

Text | Expected | Actual
testcases for tr function in Module:grc-translit:
Passedλόγοςlógoslógos
Passedσφίγξsphínxsphínx
Passedϝάναξwánaxwánax
Failedοἷαιhoîaioiĥai
current problems
FailedΙΧΘΥΣIKHTHUSIKhThUS
FailedΥἱός'''Hu'''iós'''U'''ihós
u/y
Passedταῦροςtaûrostaûros
Passedνηῦςnēûsnēûs
Passedσῦςsûssûs
Failedὗςhûsuĥs
Passedγυῖονguîonguîon
Passedἀναῡ̈τέωanaṻtéōanaṻtéō
Passedδαΐφρωνdaḯphrōndaḯphrōn
vowel length
Failedτῶνtôntō̂n
Passedτοὶtoìtoì
Failedτῷtôitō̂i
Passedτούτῳtoútōitoútōi
Failedσοφίᾳsophíāisophíai
Passedμᾱ̆νόςmānósmānós
h (rough breathing)
Failedhooh
Failedοἱhoioih
Failedεὕρισκεheúriskeeuh́riske
Failedὑϊκόςhuïkósuhïkós
Passedπυρρόςpurrhóspurrhós
Passedῥέωrhéōrhéō
Failedσάἁμονsáhamonsáahmon
capitals
PassedὈδυσσεύςOdusseúsOdusseús
FailedΕἵλωςHeílōsEih́lōs
FailedᾍδηςHā́idēsAh́idēs
Failedἡ Ἑλήνηhē Helḗnēēh Ehlḗnē
punctuation
Failedἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?ékheis moi eipeîn, ō̂ Sṓkrates, âra didaktòn ēh aretḗ;
Failedτί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?tí tēnikáde aphîxai, ō̂ Krítōn; ḕ ou prṑi éti estín;
Failedτούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.toútōn phōnḗenta mén estin ehptá· a e ē i o u ō.
Passedπήγ(νῡμῐ)pḗg(nūmi)pḗg(nūmi)
HTML entities
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós
Passedκαλός καὶ ἀγαθόςkalós kaì agathóskalós kaì agathós

-- Table of functions exported by this module.
local export = {}

-- Builds a string from Unicode code points.
local char = mw.ustring.char

-- Breathings and iota subscript (ypogegrammeni).
local PSILI, DASIA = char(0x313), char(0x314)
local SUBSCRIPT = char(0x345)

-- Diaeresis.
local TREMA = char(0x308)

-- Accents. GREEKCIRCUMFLEX (U+0342) is the mark found in Greek text;
-- CIRCUMFLEX (U+0302) is the one written in the Latin transliteration.
local GRAVE, ACUTE = char(0x300), char(0x301)
local GREEKCIRCUMFLEX = char(0x342)
local CIRCUMFLEX = char(0x302)

-- Vowel-length marks.
local MACRON, BREVE = char(0x304), char(0x306)

-- Per-character transliteration table. Keys are lowercase Greek letters or
-- single combining marks; values are the Latin replacement for that character.
-- An empty string means the character is dropped from the output.
local tt = {
	-- Vowels
	["α"] = "a", ["ε"] = "e", ["η"] = "ē", ["ι"] = "i",
	["ο"] = "o", ["υ"] = "u", ["ω"] = "ō",

	-- Consonants
	["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z",
	["θ"] = "th", ["κ"] = "k", ["λ"] = "l", ["μ"] = "m",
	["ν"] = "n", ["ξ"] = "x", ["π"] = "p", ["ρ"] = "r",
	["σ"] = "s", ["ς"] = "s", ["τ"] = "t",
	["φ"] = "ph", ["χ"] = "kh", ["ψ"] = "ps",

	-- Archaic letters
	["ϝ"] = "w", ["ϻ"] = "ś", ["ϙ"] = "q", ["ϡ"] = "š", ["ͷ"] = "v",

	-- Diacritics: kept as-is, dropped, or replaced
	[MACRON] = MACRON,
	[BREVE] = "",
	[PSILI] = "",
	[DASIA] = "",
	[TREMA] = TREMA,
	[GRAVE] = GRAVE,
	[ACUTE] = ACUTE,
	[GREEKCIRCUMFLEX] = CIRCUMFLEX,
	[SUBSCRIPT] = "i",

	-- For internal processing of diaeresis: the "+" marker is removed again
	["+"] = "",
}

-- Pattern (character class) matching any ONE combining diacritic handled by
-- this module. The surrounding brackets are essential: this string is passed
-- to mw.ustring.match as a pattern, and without them it would only match the
-- nine marks appearing consecutively in exactly this order, so no single
-- diacritic would ever be recognised.
local diacritics = '[' .. PSILI .. DASIA .. SUBSCRIPT .. MACRON .. BREVE .. TREMA .. GRAVE .. ACUTE .. GREEKCIRCUMFLEX .. ']'

--- Transliterates Greek-script text into the Latin alphabet.
-- The text is decomposed (NFD), split into tokens (a base letter or diphthong
-- plus the combining marks that follow it), and each token is converted
-- through the `tt` table with a few context-dependent adjustments.
-- @param text  string to transliterate
-- @param lang  language code; only forwarded to the Cprt module here
-- @param sc    script code; "Cprt" delegates to Module:Cprt-translit
-- @return the transliterated string (or whatever the Cprt module returns)
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module
	if sc == "Cprt" then
		return require("Module:Cprt-translit").tr(text, lang, sc)
	end

	local ustring = mw.ustring
	local gsub, match = ustring.gsub, ustring.match
	local lower, upper, sub = ustring.lower, ustring.upper, ustring.sub
	local toNFD = ustring.toNFD

	-- Decompose so that every diacritic is a separate combining character.
	text = toNFD(text)

	-- Mark ι/υ that carry a diaeresis with a leading '+'. The '+' becomes a
	-- token of its own, which stops the tokenizer from merging the vowel into
	-- a diphthong with the preceding vowel; tt maps '+' to '' afterwards.
	text = gsub(text, '([ιυ])([' .. BREVE .. MACRON .. ']?)' .. TREMA, '+%1%2' .. TREMA)

	-- Tokenize. All working variables are local so nothing leaks into the
	-- global environment between calls.
	local tokens = {}
	local ti = 0 -- index of the token currently being built
	for i = 1, ustring.len(text) do
		local ch = sub(text, i, i)
		local current = tokens[ti]
		if ch == 'ι' and current and match(current, '[ΑΕΗΟΥΩαεηουω]') then
			tokens[ti] = current .. 'ι'
		elseif ch == 'υ' and current and match(current, '[ΑΕΗΟΩαεηοω]') then
			tokens[ti] = current .. 'υ'
		elseif current and match(ch, diacritics) then
			-- `current` is nil only when the text starts with a combining
			-- mark; in that case fall through and start a new token instead
			-- of raising an error on nil concatenation.
			tokens[ti] = current .. ch
		else
			ti = ti + 1
			tokens[ti] = ch
		end
	end

	-- Now read the tokens, in order (ipairs guarantees 1..n; pairs does not).
	local pieces = {}
	for i, token in ipairs(tokens) do
		-- Characters missing from tt make the callback return nil, which
		-- leaves them unchanged in the output.
		local t = gsub(lower(token), '.', function(x) return tt[x] end)

		-- elseif is misleading (these are independent) but it's more concise this way
		local nxt = tokens[i + 1]
		if token == 'γ' and nxt and match(nxt, '[κγχξ]') then
			-- gamma before a velar is written n
			t = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- second rho of a double rho
			t = 'rh'
		elseif match(token, '[ΑΕΗΟΩαεηοω]υ') or match(token, '[Υυ]ι') then
			-- NOTE(review): tt already maps υ to "u", so this substitution
			-- looks like a no-op today; kept so the branch structure (and
			-- therefore which later branches are skipped) is unchanged.
			t = gsub(t, 'y', 'u')
		elseif match(token, '[αΑ].*' .. SUBSCRIPT) then
			-- alpha with iota subscript is long: add a macron to the a
			t = gsub(t, '([aA])', '%1' .. MACRON)
		end

		-- Rough breathing: h follows rho, precedes anything else.
		if match(token, DASIA) then
			if match(token, '[Ρρ]') then
				t = t .. 'h'
			else
				t = 'h' .. t
			end
		end

		t = toNFD(t) -- we can't manually enter them as e/o + macron in the table because it'll recombine apparently
		-- A circumflex already implies length, so drop the macron beside it.
		if match(t, CIRCUMFLEX) then
			t = gsub(t, MACRON, '')
		end

		-- Token contained a capital: capitalise only the first output character.
		if token ~= lower(token) then
			t = upper(sub(t, 1, 1)) .. lower(sub(t, 2))
		end

		pieces[#pieces + 1] = t
	end

	return table.concat(pieces)
end

-- Hand the module table (containing tr) back to whoever loads this module.
return export