Module:grc-translit/sandbox

From Wiktionary, the free dictionary
Jump to navigation Jump to search

2 of 35 tests failed. (refresh)

TextExpectedActual
test_links:
PassedΛΌΓΟΣLÓGOSLÓGOS
FailedΟἿΑΙHOÎAIHoîAi
FailedῬΉΤΩΡRHḖTŌRRhḖTŌR
Passedλόγοςlógoslógos
Passedσφίγξsphínxsphínx
Passedϝάναξwánaxwánax
Passedοἷαιhoîaihoîai
Passedταῦροςtaûrostaûros
Passedνηῦςnēûsnēûs
Passedσῦςsûssûs
Passedὗςhûshûs
Passedγυῖονguîonguîon
Passedἀναῡ̈τέωanaṻtéōanaṻtéō
Passedδαΐφρωνdaḯphrōndaḯphrōn
Passedτῶνtôntôn
Passedτοὶtoìtoì
Passedτῷtôitôi
Passedτούτῳtoútōitoútōi
Passedσοφίᾳsophíāisophíāi
Passedμᾱ̆νόςmānósmānós
Passedhoho
Passedοἱhoihoi
Passedεὕρισκεheúriskeheúriske
Passedὑϊκόςhuïkóshuïkós
Passedπυρρόςpurrhóspurrhós
Passedῥέωrhéōrhéō
Passedσάἁμονsáhamonsáhamon
PassedὈδυσσεύςOdusseúsOdusseús
PassedΕἵλωςHeílōsHeílōs
PassedᾍδηςHā́idēsHā́idēs
Passedἡ Ἑλήνηhē Helḗnēhē Helḗnē
Passed𐠠𐠒𐠯𐠗pi-lo-ti-mopi-lo-ti-mo
Passedἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή;ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ?
Passedτί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν;tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?tí tēnikáde aphîxai, ô Krítōn? ḕ ou prṑi éti estín?
Passedτούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.toútōn phōnḗenta mén estin heptá; a e ē i o u ō.

local export = {}

local m_data = require('Module:grc-utilities/data')
local tokenize = require('Module:grc-utilities').tokenize

local ufind = mw.ustring.find
local ugsub = mw.ustring.gsub
local U = mw.ustring.char
local ulower = mw.ustring.lower
local uupper = mw.ustring.upper

-- Byte-level Lua pattern for one UTF-8 encoded character: a lead byte
-- (ASCII 1-127 or a multi-byte lead 194-244) followed by any number of
-- continuation bytes (128-191).
-- Can't do range with null byte apparently.
local UTF8char = '[\1-\127\194-\244][\128-\191]*'

-- Diacritics
-- Named diacritic strings supplied by Module:grc-utilities/data.
local diacritics = m_data.named

-- Greek
local acute = diacritics.acute
local grave = diacritics.grave
local circumflex = diacritics.circum
local diaeresis = diacritics.diaeresis
local smooth = diacritics.smooth
local rough = diacritics.rough
local macron = diacritics.macron
local breve = diacritics.breve
local subscript = diacritics.subscript

-- Latin
local hat = diacritics.Latin_circum

-- Pattern: macron, then an optional diaeresis, then the Latin circumflex.
-- Despite its name it detects a macron that is followed by a circumflex;
-- export.tr uses it (via mw.ustring.find) to decide when to drop the macron.
local macron_diaeresis = macron .. diaeresis .. "?" .. hat
-- equivalent to '[αΑ]'
local alpha = '\206[\177\145]'
-- Byte pattern for a token that starts with alpha and ends with the
-- subscript diacritic, with anything in between.
local a_subscript = '^' .. alpha .. '.*' .. subscript .. '$'
-- Set of lowercase letters before which export.tr transliterates γ as "n".
local is_velar = {
	['κ'] = true,
	['γ'] = true,
	['χ'] = true,
	['ξ'] = true,
}

-- Character-by-character transliteration table. export.tr lowercases each
-- token and passes this table to gsub as the replacement, so keys are
-- lowercase; a character with no entry here is left as it is.
local tt = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["χ"] = "kh",
	["ψ"] = "ps",
	
	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",
	
	-- Diacritics
	-- unchanged: macron, diaeresis, grave, acute
	-- Breve and both breathings are deleted here; export.tr re-adds the rough
	-- breathing as "h" by inspecting the original token.
	[breve] = '',
	[smooth] = '',
	[rough] = '',
	-- The Greek circumflex becomes the Latin circumflex.
	[circumflex] = hat,
	-- The subscript diacritic is written out as a full "i".
	[subscript] = 'i',
}

-- Reports the letter case found in a token.
-- Returns two values: the number of uppercase characters, and whether any
-- lowercase character is present. Characters that have no case (spaces,
-- punctuation, most diacritics) affect neither value.
local function case_profile(token)
	local upper_count, has_lower = 0, false
	for char in token:gmatch(UTF8char) do
		-- The subscript diacritic is skipped: it is not a letter, and it may
		-- carry an uppercase mapping of its own that would otherwise make a
		-- capital letter plus subscript look like it contains lowercase.
		if char ~= subscript then
			if char ~= ulower(char) then
				upper_count = upper_count + 1
			elseif char ~= uupper(char) then
				has_lower = true
			end
		end
	end
	return upper_count, has_lower
end

-- Returns true if the token consists of capitals only, false if it contains
-- any lowercase letter, and nil if it is missing or has no cased letters.
local function is_all_caps(token)
	if not token then
		return nil
	end
	local upper_count, has_lower = case_profile(token)
	if upper_count == 0 and not has_lower then
		return nil
	end
	return not has_lower
end

-- Transliterates polytonic Ancient Greek text into Latin script.
--   text: the text to transliterate
--   lang: language code; only forwarded to Module:Cprt-translit
--   sc:   script code; "Cprt" forwards the whole call to Module:Cprt-translit
-- Returns the transliteration as a string.
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module.
	-- This should not be necessary, as [[Module:translit-redirect]] redirects
	-- to this module only if script is polytonic.
	if sc == "Cprt" then
		-- [[Special:WhatLinksHere/Wiktionary:Tracking/grc-translit/Cprt]]
		require('Module:debug').track('grc-translit/Cprt')
		return require('Module:Cprt-translit').tr(text, lang, sc)
	end
	
	-- A lone rough-breathing sign is transliterated as "h".
	if text == '῾' then
		return 'h'
	end
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	]]
	text = ugsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	-- This runs after the question-mark step so the new semicolons survive.
	text = text:gsub("·", ";")
	
	local tokens = tokenize(text)

	-- Now read the tokens. ipairs keeps them in sequence order, which the
	-- concatenation below relies on.
	local output = {}
	for i, token in ipairs(tokens) do
		local lower_token = ulower(token)
		local prev_token = tokens[i - 1]
		local next_token = tokens[i + 1]
		
		-- Substitute each character of the lowercased token
		-- for its transliteration.
		local translit = lower_token:gsub(UTF8char, tt)
		
		-- The two contextual rules compare lowercased tokens so that they
		-- also apply in capitalized and all-caps text.
		if lower_token == 'γ' and next_token and is_velar[ulower(next_token)] then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif lower_token == 'ρ' and prev_token and ulower(prev_token) == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif token:find(a_subscript) then
			-- add macron to ᾳ
			translit = translit:gsub('[Aa]', '%0' .. macron)
		end
		
		-- The rough breathing was deleted by the table lookup; write it as
		-- "h" after rho and before a vowel.
		if token:find(rough) then
			if ufind(token, '^[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if ufind(translit, macron_diaeresis) then
			translit = translit:gsub(macron, '')
		end
		
		-- Restore capitalization.
		if token ~= lower_token then
			local upper_count, has_lower = case_profile(token)
			local all_caps = false
			if not has_lower then
				if upper_count > 1 then
					-- Several capitals and no lowercase (e.g. an all-caps
					-- diphthong): the token is unambiguously all caps.
					all_caps = true
				else
					-- A single capital may start a capitalized word or sit
					-- inside an all-caps one; the nearest cased neighbour
					-- decides. The following token is preferred, the
					-- preceding one is the fallback.
					local neighbour = is_all_caps(next_token)
					if neighbour == nil then
						neighbour = is_all_caps(prev_token)
					end
					all_caps = neighbour == true
				end
			end
			
			if all_caps then
				-- Uppercase every letter, so that multi-letter output such
				-- as "rh", "th" or "hoi" becomes "RH", "TH", "HOI".
				translit = uupper(translit)
			else
				-- Capitalize first character of transliteration.
				translit = translit:gsub("^" .. UTF8char, uupper)
			end
		end
		
		table.insert(output, translit)
	end
	
	return table.concat(output)
end

return export