Module:grc-translit
Appearance
- The following documentation is located at Module:grc-translit/documentation. [edit] Categories were auto-generated by Module:module categorization. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate Ancient Greek language text per WT:GRC TR. It is also used to transliterate Proto-Brythonic, Gaulish, Messapic, Eteocretan, Demotic, Paeonian, Pre-Samnite, Oscan, Sicel, Thracian, Bactrian, Dacian, Galatian, Alanic, Elymian, Old Median, Ancient Macedonian, Phrygian, and Punic.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:grc-translit/testcases.
Functions
tr(text, lang, sc) - Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the transliteration fails, returns nil.
2 of 66 tests failed. (refresh)
| Text | Expected | Actual | |
|---|---|---|---|
| λόγος | lógos | lógos | |
| οἷαι | hoîai | hoîai | |
| velar | |||
| ἄγγελος | ángelos | ángelos | |
| ἔγκειμαι | énkeimai | énkeimai | |
| σφίγξ | sphínx | sphínx | |
| τυγχάνω | tunkhánō | tunkhánō | |
| Ἀγϙυλίων | Anqulíōn | Anqulíōn | |
| archaic letters | |||
| Ϙόρῐνθοϻ | Qórĭnthos | Qórĭnthos | |
| ϝάναξ | wánax | wánax | |
| ἀρκͱᾱγέτας | arkhāgétas | arkhāgétas | |
| *-ϳω | *-jō | *-jō | |
| current problems | |||
| Υἱός | '''Hu'''iós | '''U'''hiós | |
| u/y | |||
| ταῦρος | taûros | taûros | |
| νηῦς | nēûs | nēûs | |
| σῦς | sûs | sûs | |
| ὗς | hûs | hûs | |
| γυῖον | guîon | guîon | |
| αὐτοϋιός | autoüiós | autoüiós | |
| ἀναῡ̈τέω | anaṻtéō | anaṻtéō | |
| δαΐφρων | daḯphrōn | daḯphrōn | |
| πρηῠ́ς | prēŭ́s | prēŭ́s | |
| ταὧς | tahôs | tahôs | |
| vowel length | |||
| τῶν | tôn | tôn | |
| τοὶ | toì | toì | |
| τῷ | tōî | tōî | |
| τούτῳ | toútōi | toútōi | |
| σοφίᾳ | sophíāi | sophíāi | |
| Θρᾴκη | Thrāíkē | Thrāíkē | |
| προσηύδᾱ | prosēúdā | prosēúdā | |
| Καῖσᾰρ | Kaîsăr | Kaîsăr | |
| ᾰ̓γᾰ́πη | ăgắpē | ăgắpē | |
| μᾱ̆νός | mā̆nós | mā̆nós | |
| ὑπόγυͅον | hupógūion | hupógūion | |
| αὐτοϋιός | autoüiós | autoüiós | |
| τετληϋῖα | tetlēüîa | tetlēüîa | |
| ἑᾱυτοῦ | heāutoû | heāutoû | |
| ᾱὑτοῦ | hāutoû | āhutoû | |
| **αὐτουͅός | **autoūiós | **autoūiós | |
| **αὐτοῧͅος | **autoṻîos | **autoṻîos | |
| breathing | |||
| ὁ | ho | ho | |
| οἱ | hoi | hoi | |
| εὕρισκε | heúriske | heúriske | |
| ὑϊκός | huïkós | huïkós | |
| πυρρός | purrhós | purrhós | |
| ῥέω | rhéō | rhéō | |
| μύῤῥᾱ | múrrhā | múrrhā | |
| ῤάριον | rárion | rárion | |
| Ρ̓ᾶρος | Râros | Râros | |
| σάἁμον | sáhamon | sáhamon | |
| ταὧς | tahôs | tahôs | |
| ϝ̔έ | whé | whé | |
| **ἔῥῥευσᾰ | **érhrheusă | **érhrheusă | |
| **Βοῤῤᾶς | **Borrâs | **Borrâs | |
| capitals | |||
| Ὀδυσσεύς | Odusseús | Odusseús | |
| Εἵλως | Heílōs | Heílōs | |
| ᾍδης | Hāídēs | Hāídēs | |
| ἡ Ἑλήνη | hē Helḗnē | hē Helḗnē | |
| ΙΧΘΥΣ | IKHTHUS | IKHTHUS | |
| punctuation | |||
| ἔχεις μοι εἰπεῖν, ὦ Σώκρατες, ἆρα διδακτὸν ἡ ἀρετή; | ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ? | ékheis moi eipeîn, ô Sṓkrates, âra didaktòn hē aretḗ? | |
| τί τηνικάδε ἀφῖξαι, ὦ Κρίτων; ἢ οὐ πρῲ ἔτι ἐστίν; | tí tēnikáde aphîxai, ô Krítōn? ḕ ou prōì éti estín? | tí tēnikáde aphîxai, ô Krítōn? ḕ ou prōì éti estín? | |
| τούτων φωνήεντα μέν ἐστιν ἑπτά· α ε η ι ο υ ω. | toútōn phōnḗenta mén estin heptá; a e ē i o u ō. | toútōn phōnḗenta mén estin heptá; a e ē i o u ō. | |
| πήγ(νῡμῐ) | pḗg(nūmĭ) | pḗg(nūmĭ) | |
| ἄ(γ)γελος | á(n)gelos | á(n)gelos | |
| ἄγκυρ(ρ)α | ánkur(rh)a | ánkur(rh)a | |
| HTML entities | |||
| καλός καὶ ἀγαθός | kalós kaì agathós | kalós kaì agathós | |
| καλός καὶ ἀγαθός | kalós kaì agathós | kalós kaì agathós | |
-- Table of functions exported by this module.
local export = {}

-- Project modules this transliterator depends on.
local m_grc_utils = require("Module:grc-utilities")
local m_grc_utils_data = require("Module:grc-utilities/data")
local m_str_utils = require("Module:string utilities")

-- Helpers from Module:grc-utilities. Both are taken from the table that was
-- already required above rather than calling require() a second time.
local canonicalize = m_grc_utils.canonicalize
local tokenize = m_grc_utils.tokenize

-- Standard table helpers.
local concat = table.concat
local insert = table.insert

-- String helpers from Module:string utilities, plus mw.ustring.match.
local split = m_str_utils.split
local u = m_str_utils.char
local ugsub = m_str_utils.gsub
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local uupper = m_str_utils.upper
-- Data shared with Module:grc-utilities/data.
-- `diacritic` is used below as a pattern fragment inside a capture;
-- `diacritics` maps names to individual marks; `vowel` is used as a pattern
-- to test whether a token contains a vowel.
local diacritic = m_grc_utils_data.diacritic
local diacritics = m_grc_utils_data.diacritics
local vowel = m_grc_utils_data.vowel

-- Marks that occur in the Greek input.
local acute, grave, circumflex = diacritics.acute, diacritics.grave, diacritics.circum
local smooth, rough = diacritics.smooth, diacritics.rough
local breve, macron = diacritics.breve, diacritics.macron
local subscript = diacritics.subscript

-- Mark used in the Latin output in place of the Greek circumflex.
local hat = diacritics.Latin_circum

-- Greek question mark (U+037E).
local question_mark = u(0x37E)
-- Letters before which γ is transliterated as "n".
local velar = "[γκξχϙ]"
-- Token that starts with α or υ and ends with an iota subscript.
local au_subscript = "^[αυ].*" .. subscript .. "$"
-- Inherently long vowels and their base Latin letter. translit_letter appends
-- a macron to the base letter unless the trailing marks contain a breve.
local long_vowels = {["η"] = "e", ["ω"] = "o"}
-- Transliteration table: Greek letter or diacritic -> Latin replacement.
-- Anything not listed here is passed through unchanged by translit_letter.
local tt = {
	-- Vowels
	["α"] = "a", ["ε"] = "e", ["ι"] = "i", ["ο"] = "o", ["υ"] = "u",

	-- Consonants
	["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z", ["θ"] = "th",
	["κ"] = "k", ["λ"] = "l", ["μ"] = "m", ["ν"] = "n", ["ξ"] = "x",
	["π"] = "p", ["ρ"] = "r", ["σ"] = "s", ["ς"] = "s", ["τ"] = "t",
	["φ"] = "ph", ["χ"] = "kh", ["ψ"] = "ps",

	-- Other letters
	["ϛ"] = "st", ["ϝ"] = "w", ["ͱ"] = "h", ["ϳ"] = "j",
	["ϙ"] = "q", ["ϻ"] = "s", ["ϸ"] = "š", ["ͳ"] = "s",
	--["ͷ"] = "v", Differs by dialect.

	-- Diacritics (macron, diaeresis, grave and acute are left unchanged).
	-- Breathings map to the empty string here; export.tr adds the "h" for
	-- rough breathing itself.
	[smooth] = "",
	[rough] = "",
	[circumflex] = hat,
	[subscript] = "i",
}
-- Looks ahead from index `i` for the next token that contains no bracket
-- character ( ) [ ] { }.
-- Returns four values:
--   1. the index of that token (one past the end of `tokens` if none is left),
--   2. the token itself, or nil,
--   3. the token lowercased, or nil,
--   4. the skipped bracket tokens joined into one string ("" if none).
local function get_next_token(tokens, i)
	local first = i + 1
	local pos = first
	local candidate = tokens[pos]
	-- Step over bracket tokens; they are handed back verbatim as value 4.
	while candidate and candidate:match("[()[%]{}]") do
		pos = pos + 1
		candidate = tokens[pos]
	end
	local lowered = candidate and ulower(candidate)
	local skipped = concat(tokens, nil, first, pos - 1)
	return pos, candidate, lowered, skipped
end
-- Transliterates one base character plus the non-word characters following it.
-- `letter` is the base character and `trail` is the run of marks after it.
-- Used as the replacement callback in do_translit.
local function translit_letter(letter, trail)
	local base
	local long_base = long_vowels[letter]
	if long_base then
		-- η/ω get a macron unless the trail explicitly carries a breve.
		if trail:find(breve) then
			base = long_base
		else
			base = long_base .. macron
		end
	else
		-- Fall back to the letter itself when it is not in the table.
		base = tt[letter] or letter
	end
	-- Map each UTF-8 character of the trail through the table; characters
	-- without an entry are kept as they are.
	local marks = trail:gsub(".[\128-\191]*", tt)
	return base .. marks
end
-- Transliterates a single (lowercased) token, one base character plus its
-- trailing marks at a time, via translit_letter.
local function do_translit(token)
	-- Move an iota subscript in front of any accent marks that precede it,
	-- so that the accents end up on the resulting "i".
	local accents = "([" .. acute .. grave .. circumflex .. "]+)"
	local reordered = ugsub(token, accents .. subscript, subscript .. "%1")
	return ugsub(reordered, "(.)(%W*)", translit_letter)
end
-- Given a run of marks `m`, strips every macron from it when the run also
-- contains the Latin circumflex; otherwise returns `m` unchanged.
local function remove_macron_if_hat(m)
	if not m:find(hat) then
		return m
	end
	local stripped = m:gsub(macron, "")
	return stripped
end
-- Cleans up the transliteration of one token, applies capitalisation, and
-- appends it (followed by `suffix`) to the `output` list.
--   token / lower_token:           the original token and its lowercase form
--   next_token / next_token_lower: the same for the following token (may be nil)
--   suffix:                        text to append after the transliteration
local function insert_translit(output, translit, token, lower_token, next_token, next_token_lower, suffix)
	-- Collapse repeated diacritics until none remain (not expected to occur).
	local dup_pattern = "(" .. diacritic .. ")(%W-)%1"
	local count
	repeat
		translit, count = ugsub(translit, dup_pattern, "%1%2")
	until count == 0

	-- A vowel carrying a circumflex does not also keep its macron.
	translit = ugsub(translit, "%W+", remove_macron_if_hat)

	local cased
	if token == lower_token then
		-- Token was not capitalised: leave as is.
		cased = translit
	elseif next_token == next_token_lower then
		-- Capitalised token followed by a non-capitalised one (or by nothing):
		-- uppercase only the first UTF-8 character.
		cased = translit:gsub("^" .. ".[\128-\191]*", uupper)
	else
		-- Following token is capitalised too: uppercase everything.
		cased = uupper(translit)
	end
	insert(output, cased .. suffix)
end
-- Transliterates `text` into Latin script.
--   text: the string to transliterate
--   lang: language code; "grc" enables the ρρ -> "rrh" rule and "xbc" turns
--         φ into "f"
--   sc:   script code (not used by this function)
-- Returns the transliterated string.
function export.tr(text, lang, sc)
	-- A bare rough-breathing sign is transliterated on its own.
	if text == "῾" then
		return "h"
	end

	-- Disabled: handling of bold/italic markup between the vowels of a
	-- diphthong. It only works in the testcases submodule, not in the
	-- sandbox, so it is commented out.
	-- local remove_rough = {
	-- 	['ἱ'] = 'ι', ['ἵ'] = 'ί', ['ἳ'] = 'ὶ', ['ἷ'] = 'ῖ',
	-- 	['ὑ'] = 'υ', ['ὕ'] = 'ύ', ['ὓ'] = 'ὺ', ['ὗ'] = 'ῦ',
	-- }
	-- text = ugsub(text, "([αᾰᾱΑᾸᾹεΕηΗοΟυῠῡΥῨῩωΩ])(\'\'\'?)([ἱἵἳἷὑὕὓὗ])",
	-- 	function(a,b,c)
	-- 		return a .. rough .. b .. remove_rough[c]
	-- 	end)

	-- Turn semicolons and Greek question marks into regular question marks,
	-- but not inside HTML entities. Splitting on a captured entity pattern
	-- puts the text between entities at the odd indices, so only those
	-- chunks are rewritten.
	local chunks = split(canonicalize(text), "(&#?%w+;)")
	for idx = 1, #chunks, 2 do
		local chunk = chunks[idx]:gsub(";", "?")
		chunks[idx] = chunk:gsub(question_mark, "?")
	end

	-- The middle dot is equivalent to a semicolon or colon; semicolon is
	-- probably the more common reading.
	local normalized = concat(chunks):gsub("·", ";")

	local tokens = tokenize(normalized)

	-- Walk the tokens with one token of lookahead. `trailing` holds any
	-- bracket tokens skipped between the current token and the lookahead.
	local peek_i, peek, peek_lower, trailing = get_next_token(tokens, 0)
	local output = {trailing}
	while peek do
		local cur_i, cur, cur_lower = peek_i, peek, peek_lower
		local after_rho = false
		local translit = do_translit(cur_lower)
		peek_i, peek, peek_lower, trailing = get_next_token(tokens, cur_i)

		if cur_lower:find("γ") and peek_lower and umatch(peek_lower, velar) then
			-- γ before a velar is written <n>.
			translit = translit:gsub("g", "n")
		elseif lang == "xbc" and cur_lower:find("φ") then
			translit = translit:gsub("ph", "f")
		elseif cur == "ρ" .. rough then
			translit = "rh"
		elseif cur == "ρ" .. smooth then
			translit = "r"
		elseif lang == "grc" and cur_lower:find("ρ") then
			-- ρ after ρ is written <rh>. Emit each ρ that is followed by
			-- another ρ, and flag the last one so that "h" gets appended.
			while peek_lower and peek_lower:find("ρ") do
				insert_translit(output, translit, cur, cur_lower, peek, peek_lower, trailing)
				cur_i, cur, cur_lower, after_rho = peek_i, peek, peek_lower, true
				translit = do_translit(cur_lower)
				peek_i, peek, peek_lower, trailing = get_next_token(tokens, cur_i)
			end
		elseif umatch(cur_lower, au_subscript) then
			-- α or υ with an iota subscript gets a macron.
			translit = translit:gsub("[au]", "%0" .. macron)
		end

		if after_rho or cur_lower:find(rough) then
			if umatch(cur_lower, vowel) then
				-- Rough breathing on a vowel: "h" goes in front.
				translit = "h" .. translit
			else
				-- Otherwise "h" goes after, unless the last word character
				-- is already "h".
				local last = umatch(translit, "(%w)%W*$")
				if last and last ~= "h" then
					translit = translit .. "h"
				end
			end
		end

		insert_translit(output, translit, cur, cur_lower, peek, peek_lower, trailing)
	end

	return concat(output)
end
return export
Categories:
- Transliteration modules used by 20 languages
- Ancient Greek modules
- Transliteration modules
- Failing testcase modules
- Oscan modules
- Thracian modules
- Gaulish modules
- Galatian modules
- Old Median modules
- Messapic modules
- Demotic modules
- Proto-Brythonic modules
- Phrygian modules
- Alanic modules
- Elymian modules
- Punic modules
- Eteocretan modules
- Dacian modules
- Ancient Macedonian modules
- Pre-Samnite modules
- Paeonian modules
- Bactrian modules
- Sicel modules