Module:grc-translit: difference between revisions

Content deleted Content added

Inline

Revision as of 22:49, 30 January 2017

The following documentation is located at Module:grc-translit/documentation. ^[edit] Categories were auto-generated by Module:module categorization. ^[edit]

Useful links: subpage list • links • transclusions • testcases • sandbox (diff)

This module will transliterate Ancient Greek language text per WT:GRC TR. It is also used to transliterate Demotic, Greek, Paeonian, Old Ossetic, Oscan, Dacian, Ancient Macedonian, and Phrygian. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:grc-translit/testcases.

Functions

tr(text, lang, sc): Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang. When the transliteration fails, returns nil.

8 of 36 tests failed. (refresh)

testcases for `tr` function in Module:grc-translit:
	Text	Expected	Actual
	λόγος	lógos	lógos
	σφίγξ	sphínx	sphínks
	ϝάναξ	wánax	wánaks
	οἷαι	hoîai	hoîai
current problems
	ΙΧΘΥΣ	IKHTHUS	IKhThUS
	Υἱός	'''Hu'''iós	'''U'''hiós
u/y
	ταῦρος	taûros	taûros
	νηῦς	nēûs	neûs
	σῦς	sûs	sûs
	ὗς	hûs	hûs
	γυῖον	guîon	guîon
	ἀναῡ̈τέω	anaṻtéō	anaṻtéō
	δαΐφρων	daḯphrōn	daḯphrōn
vowel length
	τῶν	tôn	tôn
	τοὶ	toì	toì
	τῷ	tôi	tôi
	τούτῳ	toútōi	toútōi
	σοφίᾳ	sophíāi	sophíāi
	μᾱ̆νός	mānós	mānós
h (rough breathing)
	ὁ	ho	ho
	οἱ	hoi	hoi
	εὕρισκε	heúriske	heúriske
	ὑϊκός	huïkós	huïkós
	πυρρός	purrhós	purrhós
	ῥέω	rhéō	rhéō
	σάἁμον	sáhamon	sáhamon
capitals
	Ὀδυσσεύς	Odusseús	Odusseús
	Εἵλως	Heílōs	Heílōs
	ᾍδης	Hā́idēs	Hā́idēs
	ἡ Ἑλήνη	hē Helḗnē	hē Helḗnē
punctuation
	ἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?	ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ;
	τί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;	tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?	tí tēnikáde aphîksai, ô Krítōn; ḕ ou prṑi éti estín;
	τούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.	toútōn phōnḗenta mén estin heptá; a e ē i o u ō.	toútōn phōnḗenta mén estin heptá· a e ē i o u ō.
	πήγ(νῡμῐ)	pḗg(nūmi)	pḗg(nūmi)
HTML entities
	καλός καὶ ἀγαθός	kalós kaì agathós	kalós kaì agathós
	καλός καὶ ἀγαθός	kalós kaì agathós	kalós kaì agathós

-- Table of functions exported by this module (returned at the end of the file).
local export = {}

-- Data loaded from Module:grc-utilities/data. `chars` is its `named` field,
-- used below to refer to diacritics by name (chars.macron, chars.rough, ...).
local m_data = mw.loadData("Module:grc-utilities/data")
local chars = m_data.named

-- Transliteration table: maps one lowercase Greek character (or one combining
-- diacritic) to its Latin replacement. Characters without an entry are left
-- unchanged by the caller. Capitals are handled by the caller, which
-- lowercases each token before the lookup and re-capitalises afterwards.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e"..chars.macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
--	["υ"] = "y",
	["ω"] = "o"..chars.macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	-- "x", not "ks": the module's testcases expect σφίγξ → sphínx.
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",
	
	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",
	
	-- Punctuation
	-- Ano teleia (Greek semicolon). Written as the UTF-8 bytes of U+00B7
	-- MIDDLE DOT, which is what U+0387 becomes after NFD normalization.
	["\194\183"] = ";",
	
	-- Diacritics. The number in each comment is the hexadecimal Unicode code
	-- point of the combining character the name is expected to refer to
	-- (e.g. 304 = U+0304); the actual values come from the data module.
	[chars.macron] = chars.macron, -- macron U+0304
	[chars.breve] = '', -- breve U+0306
	[chars.smooth] = '', -- psili U+0313
	[chars.rough] = '', -- dasia U+0314 (the "h" is added separately by interpret)
	[chars.diaeresis] = chars.diaeresis, -- trema U+0308
	[chars.grave] = chars.grave, -- grave U+0300
	[chars.acute] = chars.acute, -- acute U+0301
	[chars.circum] = chars.Latin_circum, -- Greek circumflex U+0342 → Latin circumflex
	[chars.subscript] = 'i', -- hypogegrammene (iota subscript) U+0345
	
	-- For internal processing of diaeresis
	['+'] = '',
}

-- Local shortcuts for the mw.ustring functions used throughout this module.
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local match = mw.ustring.match

-- Returns the one-character substring of `text` at position `index`,
-- as computed by mw.ustring.sub.
local function get(text, index)
	local char = sub(text, index, index)
	return char
end

-- Appends `text` to list[index] if that slot already holds a value;
-- otherwise stores `text` there as the slot's first value.
local function add(list, index, text)
	local existing = list[index]
	if not existing then
		list[index] = text
		return
	end
	list[index] = existing .. text
end

-- Patterns for a two-letter sequence: a vowel followed by ι, and a vowel
-- other than υ followed by υ. getTokens uses them to detect diphthongs.
local i_diphthong = "[ΑΕΗΟΥΩαεηουω]ι"
local u_diphthong = "[ΑΕΗΟΩαεηοω]υ"
-- Pattern taken from the data module; getTokens tests single characters
-- against it to decide whether they are diacritics.
-- NOTE(review): presumably a character class listing every combining
-- diacritic — confirm in Module:grc-utilities/data.
local diacritic = m_data.all

-- Tests whether `char` matches the pattern `X` in its entirety, by anchoring
-- the pattern at both ends. Returns whatever mw.ustring.match returns
-- (nil when there is no match).
local function is(char, X)
	local anchored = '^' .. X .. '$'
	return match(char, anchored)
end

-- Splits decomposed (NFD) text into tokens. Each token is either a single
-- character or a two-letter diphthong, followed by any diacritics that come
-- directly after it in the text.
--
-- text: string to tokenize; raises an error if it is not a string.
-- Returns a sequence (indices 1..n, no holes) of token strings.
--
-- A diphthong always starts a NEW token. Appending it to the previous token
-- would mix letters that interpret must treat separately: the "h" of a rough
-- breathing would be placed relative to the wrong letter (e.g. after the
-- whole of "ῥευ" rather than after "ῥ"), and a capitalised diphthong after a
-- space would lose its capital.
local function getTokens(text)
	if type(text) ~= "string" then
		error("Text is not a string", 2)
	end
	local tokens = {}
	-- Index of the token currently being built; 0 means no token exists yet.
	local token = 0
	local length = mw.ustring.len(text)
	-- Walk the text by position instead of repeatedly slicing off its head.
	local pos = 1
	while pos <= length do
		local char1 = get(text, pos)
		-- get returns "" past the end of the text, so at the last character
		-- twoChars is just char1 and cannot match a diphthong pattern.
		local twoChars = char1 .. get(text, pos + 1)
		if is(char1, diacritic) then
			-- A diacritic belongs to the current token. If the text starts
			-- with a diacritic there is no token yet, so it opens the first.
			if token == 0 then
				token = 1
			end
			add(tokens, token, char1)
			pos = pos + 1
		elseif is(twoChars, u_diphthong) or is(twoChars, i_diphthong) then
			-- A diphthong forms a token of its own.
			token = token + 1
			add(tokens, token, twoChars)
			pos = pos + 2
		else
			-- Any other character starts a new token.
			token = token + 1
			add(tokens, token, char1)
			pos = pos + 1
		end
	end
	
	return tokens
end

-- Transliterates a table of tokens (as produced by getTokens) and returns the
-- concatenated Latin-script string.
--
-- tokens: table with positive integer keys holding token strings; index 1 may
--         be absent. Raises an error if it is not a table.
--
-- For each token: every character is looked up in tt (characters without an
-- entry are kept), then context rules are applied — γ before a velar becomes
-- "n", the second ρ of ρρ becomes "rh", α with iota subscript gets a macron,
-- a rough breathing adds "h", and a token containing a capital is rendered
-- with an initial capital.
local function interpret(tokens)
	if type(tokens) ~= "table" then
		error("Tokens is not a table", 2)
	end
	
	-- Output order matters, and pairs() does not guarantee any order, so find
	-- the highest index and walk the table numerically. (# is not used
	-- because the table may have no entry at index 1.)
	local last = 0
	for index in pairs(tokens) do
		if type(index) == "number" and index > last then
			last = index
		end
	end
	
	local out = {}
	for i = 1, last do
		local token = tokens[i]
		if token then
			local lowered = mw.ustring.lower(token)
			-- The callback returns nil for characters missing from tt, which
			-- makes gsub keep the original character.
			local t = gsub(lowered, '.', function(x) return tt[x] end)
			
			-- Disabled y/u handling, kept for reference.
			-- elseif is misleading (these are independent) but it's more concise this way
		--[[if #token > 1 then
				if match(token, chars.macron .. '.*' .. chars.diaeresis) then
					t = gsub(t, chars.diaeresis, '')
				elseif match(token, u_diphthong) and not match(token, chars.diaeresis) then
					t = gsub(t, 'y', 'u') 
				end
			end]]
			if token == 'γ' and tokens[i+1] and match(tokens[i+1], '[κγχξ]') then
				-- Gamma before a velar is a nasal.
				t = 'n'
			elseif token == 'ρ' and tokens[i-1] and tokens[i-1] == 'ρ' then
				-- Second rho of a double rho.
				t = 'rh'
			elseif match(token, '[αΑ].*' .. chars.subscript) then
				-- Alpha with iota subscript is long: add a macron after the a.
				t = gsub(t, '([aA])', '%1' .. chars.macron)
			end
			--[[ for moving an acute to the offglide of a long diphthong
			if match(token, chars.subscript) and match(token, chars.acute) then
				t = gsub(t, '(' .. chars.acute .. ')(i)', '%2%1')
			end
			]]
			if match(token, chars.rough) then
				if match(token, '[Ρρ]') then
				-- Could add Ϝϝ, but such words would likely be written with ϝη
					t = t .. 'h'
				else
					t = 'h' .. t
				end
			end
		
			-- A circumflex already implies a long vowel, so drop the macron of
			-- the vowel that carries it. Only a macron directly before the
			-- circumflex (optionally with a diaeresis in between) is removed;
			-- a macron on another vowel of the same token (e.g. the η of ηῦ)
			-- must stay.
			if match(t, chars.Latin_circum) then
				t = gsub(t, chars.macron .. '(' .. chars.diaeresis .. '?' .. chars.Latin_circum .. ')', '%1')
			end
			
			-- The token contained a capital: capitalise the first character
			-- of the result and lowercase the rest.
			if token ~= lowered then
				t = mw.ustring.upper(get(t, 1) ) .. mw.ustring.lower(mw.ustring.sub(t, 2) )
			end
			out[#out + 1] = t
		end
	end
	return table.concat(out)
end

-- Transliterates `text` into Latin script.
--
-- text: the string to transliterate.
-- lang: language code; only passed on to Module:Cprt-translit.
-- sc:   script code; "Cprt" forwards the whole call to Module:Cprt-translit.
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module
	if sc == "Cprt" then
		return require("Module:Cprt-translit").tr(text, lang, sc)
	end

	-- Decompose the text so that every diacritic is a separate combining
	-- character. NFD also turns the Greek question mark (U+037E) into an
	-- ordinary semicolon.
	text = mw.ustring.toNFD(text)
	
	-- A semicolon is the Greek question mark, so transliterate it as "?".
	-- Semicolons that follow an ASCII letter or digit are left alone, because
	-- those terminate HTML entities such as &nbsp; or &#160;.
	text = gsub(text, "([^A-Za-z0-9]);", "%1?")
	
	local tokens = getTokens(text)
	return interpret(tokens)
end

return export

Module:grc-translit: difference between revisions

Revision as of 22:49, 30 January 2017

Functions

Navigation menu

Search