-- Module:User:Mzajac/transform
--
-- This is a private module sandbox of Mzajac, for their own experimentation. Items in this module may be
-- added and removed at Mzajac's discretion; do not rely on this module's stability.
-- This module lacks a documentation subpage.
--
-- Transliterations
--
-- invoke with {{#invoke:User:Mzajac/transform|romanize|[text]|method=[method]}}
--
-- where method=
-- scholarly (DEFAULT): According to [[Wiktionary:Ukrainian transliteration]], following Daniels and
-- Bright (1996) ''World’s Writing Systems''.
-- alaloc: ALA–LC, 1997 http://www.loc.gov/catdir/cpso/roman.html
-- bgn: BGN/PCGN 1965 http://libraries.ucsd.edu/bib/fed/USBGN_romanization.pdf
-- iso-1968: ISO/R 9:1968, (Ukrainian language-specific) variant 1.
-- iso: ISO 9:1995
-- ungegn: UNGEGN, after the Ukrainian National system, 2012 http://www.eki.ee/wgrs/rom1_uk.pdf.
-- Bugs
-- [none]
-- To do
--
-- Distinguish all caps from initial caps in context,
-- e.g., ХАТА = KHATA (not KhATA); Хата = Khata (not KHata); хата = khata
-- Convert only single apostrophes within words, or before soft vowels, to distinguish them from single
-- quotation marks and wikitext emphasis.
-- Set the order for method="all"
--
-- Tables for
-- uk-Latn-t-uk-Cyrl-m0-iso-1968-v2 ?= uk-Latn-x-british (British Standard)
-- Configuration
-- default romanization method
local methodDefault = "scholarly"
-- Character class matching every single character handled by the one-letter replacement tables below.
-- NOTE: the letter between И and Ї is CYRILLIC CAPITAL І (U+0406). It previously was the look-alike
-- Latin "I" (U+0049), so Cyrillic І was never romanized and Latin I in the input was rewritten
-- (e.g. to "Ì" by the ISO 9:1995 table).
local searchDefault = "[АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯЪЁЫѢЭѪѲѴабвгґдеєжзиіїйклмнопрстуфхцчшщьюяъёыѣэѫѳѵ'’ʼ]"
local transform = {}
-- load transliteration tables from a data module
-- * Cyrillic characters as table indexes seem to fail when imported through mw.loadData *
-- * USING LOCAL DATA INSTEAD *
--[[
local ttable = mw.loadData('Module:User:Mzajac/transform/uk-Latn-t-uk-Cyrl')
]]--
-- Ukrainian Romanization tables
--
-- Layout of each entry, keyed by method name:
--   shortname  label shown after the romanization when all methods are listed
--   name       full name, used in title attributes
--   lang       language tag placed in the lang attribute of the output span
--   patterns   sequence of { search = <ustring pattern>, replace = <string or lookup table> },
--              applied in order; multi-letter and word-initial rules therefore precede the
--              one-letter default table.
-- Values of "�" (U+FFFD) are kept exactly as found; presumably they are placeholders for letters
-- the scheme does not define -- TODO confirm.
local ttable = {
	["scholarly"] = {
		["shortname"] = "Scholarly",
		["name"] = "Scholarly",
		["lang"] = "uk-Latn-t-uk-Cyrl-x-scholarly",
		["patterns"] = {
			[1] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "H", ["г"] = "h",
					["Ґ"] = "G", ["ґ"] = "g",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "Je", ["є"] = "je",
					["Ж"] = "Ž", ["ж"] = "ž",
					["З"] = "Z", ["з"] = "z",
					["И"] = "Y", ["и"] = "y",
					["І"] = "I", ["і"] = "i", -- key is Cyrillic І (U+0406)
					["Ї"] = "Ji", ["ї"] = "ji",
					["Й"] = "J", ["й"] = "j",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "X", ["х"] = "x",
					["Ц"] = "C", ["ц"] = "c",
					["Ч"] = "Č", ["ч"] = "č",
					["Ш"] = "Š", ["ш"] = "š",
					["Щ"] = "Šč", ["щ"] = "šč",
					["Ь"] = "ʹ", ["ь"] = "ʹ",
					["Ю"] = "Ju", ["ю"] = "ju",
					["Я"] = "Ja", ["я"] = "ja",
					["'"] = "ʺ", -- apostrophe
					["’"] = "ʺ", -- right single quotation mark
					["ʼ"] = "ʺ", -- modifier letter apostrophe
					-- Archaic letters
					["Ъ"] = "ʺ", ["ъ"] = "ʺ",
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "È", ["э"] = "è",
					["Ѫ"] = "�", ["ѫ"] = "�",
					["Ѳ"] = "�", ["ѳ"] = "�",
					["Ѵ"] = "�", ["ѵ"] = "�",
				}
			}
		}
	},
	["ungegn"] = {
		["shortname"] = "UNGEGN",
		["name"] = "Ukrainian National/UNGEGN",
		["lang"] = "uk-Latn-t-uk-Cyrl-m0-ungegn-2012",
		["patterns"] = {
			-- note 1 [Г]: “gh is used in the romanization of зг (zgh)”
			[1] = {
				["search"] = "[зЗ][гГ]",
				["replace"] = {
					["ЗГ"] = "ZGH",
					["Зг"] = "Zgh",
					["зг"] = "zgh",
				}
			},
			-- note 2 [Є, Ї, Й, Ю, Я]: “The second variant is used at the beginning of a word”
			-- Each rule captures the preceding non-letter, non-apostrophe character and puts it back.
			[2] = { ["search"] = "([^%a'’ʼ])Є", ["replace"] = "%1Ye" },
			[3] = { ["search"] = "([^%a'’ʼ])є", ["replace"] = "%1ye" },
			[4] = { ["search"] = "([^%a'’ʼ])Ї", ["replace"] = "%1Yi" },
			[5] = { ["search"] = "([^%a'’ʼ])ї", ["replace"] = "%1yi" },
			[6] = { ["search"] = "([^%a'’ʼ])Й", ["replace"] = "%1Y" },
			[7] = { ["search"] = "([^%a'’ʼ])й", ["replace"] = "%1y" },
			[8] = { ["search"] = "([^%a'’ʼ])Ю", ["replace"] = "%1Yu" },
			[9] = { ["search"] = "([^%a'’ʼ])ю", ["replace"] = "%1yu" },
			[10] = { ["search"] = "([^%a'’ʼ])Я", ["replace"] = "%1Ya" },
			[11] = { ["search"] = "([^%a'’ʼ])я", ["replace"] = "%1ya" },
			-- Default 1-letter replacements
			[12] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "H", ["г"] = "h",
					["Ґ"] = "G", ["ґ"] = "g",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "Ie", ["є"] = "ie",
					["Ж"] = "Zh", ["ж"] = "zh",
					["З"] = "Z", ["з"] = "z",
					["И"] = "Y", ["и"] = "y",
					["І"] = "I", ["і"] = "i", -- key is Cyrillic І (U+0406)
					["Ї"] = "I", ["ї"] = "i",
					["Й"] = "I", ["й"] = "i",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "Kh", ["х"] = "kh",
					["Ц"] = "Ts", ["ц"] = "ts",
					["Ч"] = "Ch", ["ч"] = "ch",
					["Ш"] = "Sh", ["ш"] = "sh",
					["Щ"] = "Shch", ["щ"] = "shch",
					["Ь"] = "", ["ь"] = "",
					["Ю"] = "Iu", ["ю"] = "iu",
					["Я"] = "Ia", ["я"] = "ia",
					["'"] = "", -- apostrophe
					["’"] = "", -- right single quotation mark
					["ʼ"] = "", -- modifier letter apostrophe
					-- Archaic letters (non-standard)
					["Ъ"] = "", ["ъ"] = "",
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "E", ["э"] = "e",
					["Ѫ"] = "�", ["ѫ"] = "�",
					["Ѳ"] = "�", ["ѳ"] = "�",
					["Ѵ"] = "�", ["ѵ"] = "�",
				}
			}
		}
	},
	["iso-1968"] = {
		["shortname"] = "ISO 1968",
		["name"] = "ISO/R 9:1968, Ukrainian variant",
		["lang"] = "uk-Latn-t-uk-Cyrl-m0-iso-1968",
		["patterns"] = {
			[1] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "H", ["г"] = "h",
					["Ґ"] = "G", ["ґ"] = "g",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "Je", ["є"] = "je",
					["Ж"] = "Ž", ["ж"] = "ž",
					["З"] = "Z", ["з"] = "z",
					["И"] = "Y", ["и"] = "y",
					["І"] = "I", ["і"] = "i", -- key is Cyrillic І (U+0406)
					["Ї"] = "Ï", ["ї"] = "ï",
					["Й"] = "J", ["й"] = "j",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "Ch", ["х"] = "ch",
					["Ц"] = "C", ["ц"] = "c",
					["Ч"] = "Č", ["ч"] = "č",
					["Ш"] = "Š", ["ш"] = "š",
					["Щ"] = "Šč", ["щ"] = "šč",
					["Ь"] = "’", ["ь"] = "’",
					["Ю"] = "Ju", ["ю"] = "ju",
					["Я"] = "Ja", ["я"] = "ja",
					["'"] = "", -- apostrophe
					["’"] = "", -- right single quotation mark
					["ʼ"] = "", -- modifier letter apostrophe
					-- Archaic letters
					["Ъ"] = "Ǎ", ["ъ"] = "ǎ",
					-- Archaic letters (borrowed from other language columns in ISO/R 9:1968)
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "Ė", ["э"] = "ė",
					["Ѫ"] = "ʺ̣", ["ѫ"] = "ʺ̣",
					["Ѳ"] = "Ḟ", ["ѳ"] = "ḟ",
					["Ѵ"] = "Ẏ", ["ѵ"] = "ẏ",
				}
			}
		}
	},
	["iso"] = {
		["shortname"] = "ISO",
		["name"] = "ISO 9:1995",
		["lang"] = "uk-Latn-t-uk-Cyrl-m0-iso-1995",
		["patterns"] = {
			[1] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "G", ["г"] = "g",
					["Ґ"] = "G̀", ["ґ"] = "g̀",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "Ê", ["є"] = "ê",
					["Ж"] = "Ž", ["ж"] = "ž",
					["З"] = "Z", ["з"] = "z",
					["И"] = "I", ["и"] = "i",
					["І"] = "Ì", ["і"] = "ì", -- key is Cyrillic І (U+0406)
					["Ї"] = "Ï", ["ї"] = "ï",
					["Й"] = "J", ["й"] = "j",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "H", ["х"] = "h",
					["Ц"] = "C", ["ц"] = "c",
					["Ч"] = "Č", ["ч"] = "č",
					["Ш"] = "Š", ["ш"] = "š",
					["Щ"] = "Ŝ", ["щ"] = "ŝ",
					["Ь"] = "ʹ", ["ь"] = "ʹ",
					["Ю"] = "Û", ["ю"] = "û",
					["Я"] = "Â", ["я"] = "â",
					["'"] = "ˋ", -- apostrophe
					["’"] = "ˋ", -- right single quotation mark
					["ʼ"] = "ˋ", -- modifier letter apostrophe
					-- Archaic letters
					["Ъ"] = "ʺ", ["ъ"] = "ʺ",
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "È", ["э"] = "è",
					["Ѫ"] = "Ǎ", ["ѫ"] = "ǎ",
					["Ѳ"] = "F̀", ["ѳ"] = "f̀",
					["Ѵ"] = "Ỳ", ["ѵ"] = "ỳ",
				}
			}
		}
	},
	["alaloc"] = {
		["shortname"] = "ALA–LC",
		["name"] = "US Library of Congress",
		["lang"] = "uk-Latn-t-uk-Cyrl-m0-alaloc-1997",
		["patterns"] = {
			[1] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "H", ["г"] = "h",
					["Ґ"] = "G", ["ґ"] = "g",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "I͡e", ["є"] = "i͡e",
					["Ж"] = "Z͡h", ["ж"] = "z͡h",
					["З"] = "Z", ["з"] = "z",
					["И"] = "Y", ["и"] = "y",
					["І"] = "I", ["і"] = "i", -- key is Cyrillic І (U+0406)
					["Ї"] = "Ï", ["ї"] = "ï",
					["Й"] = "Ĭ", ["й"] = "ĭ",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "Kh", ["х"] = "kh",
					["Ц"] = "T͡s", ["ц"] = "t͡s",
					["Ч"] = "Ch", ["ч"] = "ch",
					["Ш"] = "Sh", ["ш"] = "sh",
					["Щ"] = "Shch", ["щ"] = "shch",
					["Ь"] = "ʹ", ["ь"] = "ʹ",
					["Ю"] = "I͡u", ["ю"] = "i͡u",
					["Я"] = "I͡a", ["я"] = "i͡a",
					["'"] = "", -- apostrophe
					["’"] = "", -- right single quotation mark
					["ʼ"] = "", -- modifier letter apostrophe
					-- Archaic letters (non-standard)
					["Ъ"] = "", ["ъ"] = "",
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "E", ["э"] = "e",
					["Ѫ"] = "�", ["ѫ"] = "�",
					["Ѳ"] = "�", ["ѳ"] = "�",
					["Ѵ"] = "�", ["ѵ"] = "�",
				}
			}
		}
	},
	["bgn"] = {
		["shortname"] = "BGN/PCGN",
		["name"] = "US Board on Geographic Names and British Permanent Committee on Geographical Names",
		["lang"] = "uk-Latn-t-uk-Cyrl-m0-bgn-1965",
		["patterns"] = {
			-- note 1: “The character sequences зг, кг, сг, тс, and цг may be romanized z·h, k·h, s·h, t·s, and ts·h
			-- in order to differentiate those romanizations from the digraphs zh, kh, sh, ts, and the letter
			-- sequence tsh, which are used to render the characters ж, х, ш, ц, and the character sequence тш.”
			[1] = {
				["search"] = "[зЗкКсСцЦ][гГ]",
				["replace"] = {
					["ЗГ"] = "Z·H", ["Зг"] = "Z·h", ["зг"] = "z·h",
					["КГ"] = "K·H", ["Кг"] = "K·h", ["кг"] = "k·h",
					["СГ"] = "S·H", ["Сг"] = "S·h", ["сг"] = "s·h",
					["ЦГ"] = "TS·H", ["Цг"] = "Ts·h", ["цг"] = "ts·h",
				}
			},
			[2] = {
				["search"] = "[тТ][сС]",
				["replace"] = {
					["ТС"] = "T·S", ["Тс"] = "T·s", ["тс"] = "t·s",
				}
			},
			-- Default 1-letter replacements
			[3] = {
				["search"] = searchDefault,
				["replace"] = {
					["А"] = "A", ["а"] = "a",
					["Б"] = "B", ["б"] = "b",
					["В"] = "V", ["в"] = "v",
					["Г"] = "H", ["г"] = "h",
					["Ґ"] = "G", ["ґ"] = "g",
					["Д"] = "D", ["д"] = "d",
					["Е"] = "E", ["е"] = "e",
					["Є"] = "Ye", ["є"] = "ye",
					["Ж"] = "Zh", ["ж"] = "zh",
					["З"] = "Z", ["з"] = "z",
					["И"] = "Y", ["и"] = "y",
					["І"] = "I", ["і"] = "i", -- key is Cyrillic І (U+0406)
					["Ї"] = "Yi", ["ї"] = "yi",
					["Й"] = "Y", ["й"] = "y",
					["К"] = "K", ["к"] = "k",
					["Л"] = "L", ["л"] = "l",
					["М"] = "M", ["м"] = "m",
					["Н"] = "N", ["н"] = "n",
					["О"] = "O", ["о"] = "o",
					["П"] = "P", ["п"] = "p",
					["Р"] = "R", ["р"] = "r",
					["С"] = "S", ["с"] = "s",
					["Т"] = "T", ["т"] = "t",
					["У"] = "U", ["у"] = "u",
					["Ф"] = "F", ["ф"] = "f",
					["Х"] = "Kh", ["х"] = "kh",
					["Ц"] = "Ts", ["ц"] = "ts",
					["Ч"] = "Ch", ["ч"] = "ch",
					["Ш"] = "Sh", ["ш"] = "sh",
					["Щ"] = "Shch", ["щ"] = "shch",
					["Ь"] = "’", ["ь"] = "’",
					["Ю"] = "Yu", ["ю"] = "yu",
					["Я"] = "Ya", ["я"] = "ya",
					["'"] = "ˮ", -- apostrophe
					["’"] = "ˮ", -- right single quotation mark
					["ʼ"] = "ˮ", -- modifier letter apostrophe
					-- Archaic letters (non-standard)
					["Ъ"] = "", ["ъ"] = "",
					["Ё"] = "Ë", ["ё"] = "ë",
					["Ы"] = "Y", ["ы"] = "y",
					["Ѣ"] = "Ě", ["ѣ"] = "ě",
					["Э"] = "E", ["э"] = "e",
					["Ѫ"] = "�", ["ѫ"] = "�",
					["Ѳ"] = "�", ["ѳ"] = "�",
					["Ѵ"] = "�", ["ѵ"] = "�",
				}
			}
		}
	},
}
-- handle input
--- Entry point for {{#invoke:User:Mzajac/transform|romanize|[text]|method=[method]}}.
-- Reads the text from the first positional argument and the scheme from the
-- "method" argument.
--   * a method name present in ttable: returns that single romanization;
--   * "all": returns every romanization, each followed by its label, separated
--     by ", " and ordered by method name so the output is stable between runs;
--   * missing or empty method: falls back to methodDefault;
--   * anything else: raises an error naming the unsupported method.
-- A missing text argument is treated as the empty string.
-- @param frame the frame object passed in by #invoke
-- @return wikitext/HTML string with the romanized text
function transform.romanize(frame)
	-- Without this guard a missing argument fails later with a nil concatenation.
	local inputText = frame.args[1] or ""
	local method = frame.args.method
	if method == nil or method == "" then
		method = methodDefault
	end

	if ttable[method] then -- just do a conversion
		return transform.convert(inputText, method)
	end

	if method == "all" then -- loop through all methods
		-- pairs() visits keys in an unspecified order, so collect and sort the
		-- method names first to make the listing deterministic.
		local methodNames = {}
		for name in pairs(ttable) do
			methodNames[#methodNames + 1] = name
		end
		table.sort(methodNames)

		local pieces = {}
		for i, name in ipairs(methodNames) do
			pieces[i] = "<span>" .. transform.convert(inputText, name, true) .. "</span>"
		end
		return table.concat(pieces, ", ")
	end

	error("Transliteration method “" .. method .. "” is not supported")
end
-- do the conversion
--- Romanize a string with one scheme and wrap it in a span.
-- Applies every entry of ttable[method]["patterns"] in order with
-- mw.ustring.gsub, then wraps the result in a span carrying the scheme's
-- language tag and a descriptive title.
-- @param inputText the text to romanize; nil is treated as the empty string
-- @param method    key of the scheme in ttable; an unknown key raises an error
-- @param showLabel when truthy, append " (shortname)" after the span, wrapped
--                  in an abbr element if the short name differs from the full name
-- @return the resulting wikitext/HTML string
function transform.convert(inputText, method, showLabel)
	local scheme = ttable[method]
	if not scheme then
		-- Fail with the same message romanize uses instead of "attempt to index a nil value".
		error("Transliteration method “" .. tostring(method) .. "” is not supported")
	end

	-- pad with spaces to allow boundary patterns (working around lack of %f pattern)
	local text = " " .. (inputText or "") .. " "
	for _, pattern in ipairs(scheme["patterns"]) do
		-- Parentheses keep only the resulting string and drop gsub's match count.
		text = (mw.ustring.gsub(text, pattern["search"], pattern["replace"]))
	end
	text = mw.ustring.sub(text, 2, -2) -- un-pad: drop the first and last character

	local parts = {
		"<span",
		" lang='", scheme["lang"], "'",
		" title='Romanized Ukrainian (", scheme["name"], ")'",
		">",
		text,
		"</span>",
	}

	if showLabel then
		-- Use an abbr element if the short name doesn’t match name
		local abbreviated = scheme["shortname"] ~= scheme["name"]
		parts[#parts + 1] = " ("
		if abbreviated then
			parts[#parts + 1] = "<abbr title='" .. scheme["name"] .. "'>"
		end
		parts[#parts + 1] = scheme["shortname"]
		if abbreviated then
			parts[#parts + 1] = "</abbr>"
		end
		parts[#parts + 1] = ")"
	end

	return table.concat(parts)
end
-- export the module table holding romanize and convert
return transform