Module:okm-translit

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will transliterate Middle Korean language text. It is also used to transliterate Early Modern Korean. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{xlit}}. Within a module, use Module:languages#Language:transliterate.

For testcases, see Module:okm-translit/testcases.

Functions

tr(text, lang, sc)
Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the transliteration fails, returns nil.

-- Table returned by this module; the transliteration entry point is attached to it below.
local export = {}
-- Short alias for the mw.ustring version of gsub.
local gsub = mw.ustring.gsub

-- Character sets of the Han and Hangul scripts as returned by getCharacters().
-- Below they are spliced into bracket classes ('[' .. chars .. ']') when matching text.
local chars_Hani = require('Module:scripts').getByCode('Hani'):getCharacters()
local chars_Hang = require('Module:scripts').getByCode('Hang'):getCharacters()

-- https://github.com/szc126/rime-slg-korean/blob/main/slg_break_jamo.yaml
-- https://github.com/szc126/rime-slg-korean/blob/main/soolegi_yethangeul.custom.yaml
--
-- Lookup table used by export.tr as the replacement argument of a per-character
-- gsub: every character that appears as a key is replaced by the string on the
-- right; characters that are not keys are left unchanged.
-- Keys are single cluster/compound jamo (conjoining or compatibility forms);
-- values are sequences of simple conjoining jamo that the rule list `tt` knows
-- how to romanize. In vowel values, '@' is a placeholder that the rule list
-- later turns into 'y'.
local tt_complex = {
-- initial consonant clusters: three-letter decompositions
['ᄢ']='ᄇᄉᄀ',
['ᄣ']='ᄇᄉᄃ',
['ᄤ']='ᄇᄉᄇ',
['ᄥ']='ᄇᄉᄉ',
['ᄦ']='ᄇᄉᄌ',
['ᄳ']='ᄉᄇᄀ',
['ᄴ']='ᄉᄉᄉ',
['ꥥ']='ᄅᄀᄀ',
['ꥧ']='ᄅᄃᄃ',
['ꥪ']='ᄅᄇᄇ',
['ꥲ']='ᄇᄉᄐ',
['ꥵ']='ᄉᄉᄇ',
['ꥸ']='ᄌᄌᄒ',
-- initial consonant clusters: two-letter decompositions
['ᄁ']='ᄀᄀ',
['ᄄ']='ᄃᄃ',
['ᄈ']='ᄇᄇ',
['ᄊ']='ᄉᄉ',
['ᄍ']='ᄌᄌ',
['ᄓ']='ᄂᄀ',
['ᄔ']='ᄂᄂ',
['ᄕ']='ᄂᄃ',
['ᄖ']='ᄂᄇ',
['ᄗ']='ᄃᄀ',
['ᄘ']='ᄅᄂ',
['ᄙ']='ᄅᄅ',
['ᄚ']='ᄅᄒ',
['ᄜ']='ᄆᄇ',
['ᄞ']='ᄇᄀ',
['ᄟ']='ᄇᄂ',
['ᄠ']='ᄇᄃ',
['ᄡ']='ᄇᄉ',
['ᄧ']='ᄇᄌ',
['ᄨ']='ᄇᄎ',
['ᄩ']='ᄇᄐ',
['ᄪ']='ᄇᄑ',
['ᄬ']='ᄫᄫ',
['ᄭ']='ᄉᄀ',
['ᄮ']='ᄉᄂ',
['ᄯ']='ᄉᄃ',
['ᄰ']='ᄉᄅ',
['ᄱ']='ᄉᄆ',
['ᄲ']='ᄉᄇ',
['ᄵ']='ᄉᄋ',
['ᄶ']='ᄉᄌ',
['ᄷ']='ᄉᄎ',
['ᄸ']='ᄉᄏ',
['ᄹ']='ᄉᄐ',
['ᄺ']='ᄉᄑ',
['ᄻ']='ᄉᄒ',
['ᄽ']='ᄼᄼ',
['ᄿ']='ᄾᄾ',
['ᅁ']='ᄋᄀ',
['ᅂ']='ᄋᄃ',
['ᅃ']='ᄋᄆ',
['ᅄ']='ᄋᄇ',
['ᅅ']='ᄋᄉ',
['ᅆ']='ᄋᅀ',
['ᅇ']='ᄋᄋ',
['ᅈ']='ᄋᄌ',
['ᅉ']='ᄋᄎ',
['ᅊ']='ᄋᄐ',
['ᅋ']='ᄋᄑ',
['ᅍ']='ᄌᄋ',
['ᅏ']='ᅎᅎ',
['ᅑ']='ᅐᅐ',
['ᅒ']='ᄎᄏ',
['ᅓ']='ᄎᄒ',
['ᅖ']='ᄑᄇ',
['ᅘ']='ᄒᄒ',
['ᅚ']='ᄀᄃ',
['ᅛ']='ᄂᄉ',
['ᅜ']='ᄂᄌ',
['ᅝ']='ᄂᄒ',
['ᅞ']='ᄃᄅ',
['ꥠ']='ᄃᄆ',
['ꥡ']='ᄃᄇ',
['ꥢ']='ᄃᄉ',
['ꥣ']='ᄃᄌ',
['ꥤ']='ᄅᄀ',
['ꥦ']='ᄅᄃ',
['ꥨ']='ᄅᄆ',
['ꥩ']='ᄅᄇ',
['ꥫ']='ᄅᄫ',
['ꥬ']='ᄅᄉ',
['ꥭ']='ᄅᄌ',
['ꥮ']='ᄅᄏ',
['ꥯ']='ᄆᄀ',
['ꥰ']='ᄆᄃ',
['ꥱ']='ᄆᄉ',
['ꥳ']='ᄇᄏ',
['ꥴ']='ᄇᄒ',
['ꥶ']='ᄋᄅ',
['ꥷ']='ᄋᄒ',
['ꥹ']='ᄐᄐ',
['ꥺ']='ᄑᄒ',
['ꥻ']='ᄒᄉ',
['ꥼ']='ᅙᅙ',

-- compound vowels; '@' stands for the glide that the rule list romanizes as 'y'
['ᆅ']='@ᅩ@ᅡ@',
['ᆒ']='@ᅮ@ᅥ@',
['ᅹ']='@ᅡ@ᅩ',
['ᆄ']='@ᅩ@ᅡ',
['ᆆ']='@ᅩ@ᅥ',
['ᆑ']='@ᅮ@ᅥ',
['ᆥ']='@ᅥ@ᅡ',
['ᆐ']='@ᅮᅥ@',
['ힳ']='@ᅩᅡ@',
['ힷ']='@ᅮᅡ@',
['ᆁ']='ᅩ@ᅥ@',
['ᆌ']='ᅮ@ᅥ@',
['ᆧ']='ᅩ@ᅡ@',
['ힽ']='ᅵ@ᅡᅩ',
['ힾ']='ᅵ@ᅡ@',
['ퟀ']='ᅵ@ᅥ@',
['ᅤ']='@ᅡ@',
['ᅨ']='@ᅥ@',
['ᅸ']='@ᅡᅩ',
['ᅽ']='@ᅥᅩ',
['ᅾ']='@ᅥᅮ',
['ᆇ']='@ᅩᅩ',
['ᆈ']='@ᅩ@',
['ᆎ']='@ᅮᅡ',
['ᆏ']='@ᅮᅥ',
['ᆓ']='@ᅮᅮ',
['ᆔ']='@ᅮ@',
['ᆤ']='@ᅡᅮ',
['ힲ']='@ᅩᅡ',
['ힴ']='@ᅩᅥ',
['ힸ']='@ᅮᅩ',
['ᆙ']='ᅵ@ᅡ',
['ᆦ']='ᅩ@ᅡ',
['ힰ']='ᅩ@ᅥ',
['ힵ']='ᅮ@ᅥ',
['ힿ']='ᅵ@ᅥ',
['ퟂ']='ᅵ@ᅩ',
['ퟃ']='ᅵ@ᅮ',
['ᅫ']='ᅩᅡ@',
['ᅰ']='ᅮᅥ@',
['ᆀ']='ᅩᅥ@',
['ᆊ']='ᅮᅡ@',
['ᆋ']='ᅮᅥᅳ',
['ᆗ']='ᅳᅵᅮ',
['ힱ']='ᅩᅩᅵ',
['ힶ']='ᅮᅵ@',
['ힻ']='ᅳᅥ@',
['ퟁ']='ᅵᅩᅵ',
['ퟆ']='ᆞᅥ@',
['ᅣ']='@ᅡ',
['ᅧ']='@ᅥ',
['ᅭ']='@ᅩ',
['ᅲ']='@ᅮ',
['ᅢ']='ᅡ@',
['ᅦ']='ᅥ@',
['ᅪ']='ᅩᅡ',
['ᅬ']='ᅩ@',
['ᅯ']='ᅮᅥ',
['ᅱ']='ᅮ@',
['ᅴ']='ᅳ@',
['ᅶ']='ᅡᅩ',
['ᅷ']='ᅡᅮ',
['ᅺ']='ᅥᅩ',
['ᅻ']='ᅥᅮ',
['ᅼ']='ᅥᅳ',
['ᅿ']='ᅩᅥ',
['ᆂ']='ᅩᅩ',
['ᆃ']='ᅩᅮ',
['ᆉ']='ᅮᅡ',
['ᆍ']='ᅮᅮ',
['ᆕ']='ᅳᅮ',
['ᆖ']='ᅳᅳ',
['ᆘ']='ᅵᅡ',
['ᆚ']='ᅵᅩ',
['ᆛ']='ᅵᅮ',
['ᆜ']='ᅵᅳ',
['ᆝ']='ᅵᆞ',
['ᆟ']='ᆞᅥ',
['ᆠ']='ᆞᅮ',
['ᆡ']='ᆞ@',
['ᆢ']='ᆞᆞ',
['ᆣ']='ᅡᅳ',
['ힹ']='ᅳᅡ',
['ힺ']='ᅳᅥ',
['ힼ']='ᅳᅩ',
['ퟄ']='ᅵ@',
['ퟅ']='ᆞᅡ',

-- final consonant clusters: three-letter decompositions first, then two-letter ones
['ᇄ']='ᆨᆺᆨ',
['ᇌ']='ᆯᆨᆺ',
['ᇏ']='ᆯᆮᇂ',
['ᇑ']='ᆯᆷᆨ',
['ᇒ']='ᆯᆷᆺ',
['ᇓ']='ᆯᆸᆺ',
['ᇔ']='ᆯᆸᇂ',
['ᇖ']='ᆯᆺᆺ',
['ᇞ']='ᆷᆺᆺ',
['ᇭ']='ᇰᆨᆨ',
['ퟎ']='ᆮᆮᆸ',
['ퟑ']='ᆮᆺᆨ',
['ퟕ']='ᆯᆨᆨ',
['ퟖ']='ᆯᆨᇂ',
['ퟗ']='ᆯᆯᆿ',
['ퟘ']='ᆯᆷᇂ',
['ퟙ']='ᆯᆸᆮ',
['ퟚ']='ᆯᆸᇁ',
['ퟜ']='ᆯᇹᇂ',
['ퟟ']='ᆷᆫᆫ',
['ퟡ']='ᆷᆸᆺ',
['ퟤ']='ᆸᆯᇁ',
['ퟧ']='ᆸᆺᆮ',
['ퟬ']='ᆺᆺᆨ',
['ퟭ']='ᆺᆺᆮ',
['ퟸ']='ᆽᆸᆸ',
['ᆩ']='ᆨᆨ',
['ᆪ']='ᆨᆺ',
['ᆬ']='ᆫᆽ',
['ᆭ']='ᆫᇂ',
['ᆰ']='ᆯᆨ',
['ᆱ']='ᆯᆷ',
['ᆲ']='ᆯᆸ',
['ᆳ']='ᆯᆺ',
['ᆴ']='ᆯᇀ',
['ᆵ']='ᆯᇁ',
['ᆶ']='ᆯᇂ',
['ᆹ']='ᆸᆺ',
['ᆻ']='ᆺᆺ',
['ᇃ']='ᆨᆯ',
['ᇅ']='ᆫᆨ',
['ᇆ']='ᆫᆮ',
['ᇇ']='ᆫᆺ',
['ᇈ']='ᆫᇫ',
['ᇉ']='ᆫᇀ',
['ᇊ']='ᆮᆨ',
['ᇋ']='ᆮᆯ',
['ᇍ']='ᆯᆫ',
['ᇎ']='ᆯᆮ',
['ᇐ']='ᆯᆯ',
['ᇕ']='ᆯᇦ',
['ᇗ']='ᆯᇫ',
['ᇘ']='ᆯᆿ',
['ᇙ']='ᆯᇹ',
['ᇚ']='ᆷᆨ',
['ᇛ']='ᆷᆯ',
['ᇜ']='ᆷᆸ',
['ᇝ']='ᆷᆺ',
['ᇟ']='ᆷᇫ',
['ᇠ']='ᆷᆾ',
['ᇡ']='ᆷᇂ',
['ᇣ']='ᆸᆯ',
['ᇤ']='ᆸᇁ',
['ᇥ']='ᆸᇂ',
['ᇧ']='ᆺᆨ',
['ᇨ']='ᆺᆮ',
['ᇩ']='ᆺᆯ',
['ᇪ']='ᆺᆸ',
['ᇬ']='ᇰᆨ',
['ᇮ']='ᇰᇰ',
['ᇯ']='ᇰᆿ',
['ᇱ']='ᇰᆺ',
['ᇲ']='ᇰᇫ',
['ᇳ']='ᇁᆸ',
['ᇵ']='ᇂᆫ',
['ᇶ']='ᇂᆯ',
['ᇷ']='ᇂᆷ',
['ᇸ']='ᇂᆸ',
['ᇺ']='ᆨᆫ',
['ᇻ']='ᆨᆸ',
['ᇼ']='ᆨᆾ',
['ᇽ']='ᆨᆿ',
['ᇾ']='ᆨᇂ',
['ᇿ']='ᆫᆫ',
['ퟋ']='ᆫᆯ',
['ퟌ']='ᆫᆾ',
['ퟍ']='ᆮᆮ',
['ퟏ']='ᆮᆸ',
['ퟐ']='ᆮᆺ',
['ퟒ']='ᆮᆽ',
['ퟓ']='ᆮᆾ',
['ퟔ']='ᆮᇀ',
['ퟛ']='ᆯᇰ',
['ퟞ']='ᆷᆫ',
['ퟠ']='ᆷᆷ',
['ퟢ']='ᆷᆽ',
['ퟣ']='ᆸᆮ',
['ퟥ']='ᆸᆷ',
['ퟦ']='ᆸᆸ',
['ퟨ']='ᆸᆽ',
['ퟩ']='ᆸᆾ',
['ퟪ']='ᆺᆷ',
['ퟫ']='ᆺᇦ',
['ퟮ']='ᆺᇫ',
['ퟯ']='ᆺᆽ',
['ퟰ']='ᆺᆾ',
['ퟱ']='ᆺᇀ',
['ퟲ']='ᆺᇂ',
['ퟳ']='ᇫᆸ',
['ퟴ']='ᇫᇦ',
['ퟵ']='ᇰᆷ',
['ퟶ']='ᇰᇂ',
['ퟷ']='ᆽᆸ',
['ퟹ']='ᆽᆽ',
['ퟺ']='ᇁᆺ',
['ퟻ']='ᇁᇀ',

-- compatibility jamo
-- consonants: every compatibility consonant is mapped to initial-consonant jamo
['ㅩ']='ᄅᄀᄉ',
['ㅫ']='ᄅᄇᄉ',
['ㅴ']='ᄇᄉᄀ',
['ㅵ']='ᄇᄉᄃ',
['ㄲ']='ᄀᄀ',
['ㄸ']='ᄃᄃ',
['ㅃ']='ᄇᄇ',
['ㄳ']='ᄀᄉ',
['ㄵ']='ᄂᄌ',
['ㄶ']='ᄂᄒ',
['ㄺ']='ᄅᄀ',
['ㄻ']='ᄅᄆ',
['ㄼ']='ᄅᄇ',
['ㄽ']='ᄅᄉ',
['ㄾ']='ᄅᄐ',
['ㄿ']='ᄅᄑ',
['ㅀ']='ᄅᄒ',
['ㅄ']='ᄇᄉ',
['ㅆ']='ᄉᄉ',
['ㅉ']='ᄌᄌ',
['ㅥ']='ᄂᄂ',
['ㅦ']='ᄂᄃ',
['ㅧ']='ᄂᄉ',
['ㅨ']='ᄂᅀ',
['ㅪ']='ᄅᄃ',
['ㅬ']='ᄅᅀ',
['ㅭ']='ᄅᅙ',
['ㅮ']='ᄆᄇ',
['ㅯ']='ᄆᄉ',
['ㅰ']='ᄆᅀ',
['ㅲ']='ᄇᄀ',
['ㅳ']='ᄇᄃ',
['ㅶ']='ᄇᄌ',
['ㅷ']='ᄇᄐ',
['ㅹ']='ᄫᄫ',
['ㅺ']='ᄉᄀ',
['ㅻ']='ᄉᄂ',
['ㅼ']='ᄉᄃ',
['ㅽ']='ᄉᄇ',
['ㅾ']='ᄉᄌ',
['ㆀ']='ᄋᄋ',
['ㆂ']='ᅌᄉ',
['ㆃ']='ᅌᅀ',
['ㆅ']='ᄒᄒ',
['ㄱ']='ᄀ',
['ㄴ']='ᄂ',
['ㄷ']='ᄃ',
['ㄹ']='ᄅ',
['ㅁ']='ᄆ',
['ㅂ']='ᄇ',
['ㅅ']='ᄉ',
['ㅇ']='ᄋ',
['ㅈ']='ᄌ',
['ㅊ']='ᄎ',
['ㅋ']='ᄏ',
['ㅌ']='ᄐ',
['ㅍ']='ᄑ',
['ㅎ']='ᄒ',
['ㅤ']='ᅟ', -- filler
['ㅱ']='ᄝ',
['ㅸ']='ᄫ',
['ㅿ']='ᅀ',
['ㆁ']='ᅌ',
['ㆄ']='ᅗ',
['ㆆ']='ᅙ',

-- compatibility vowels
-- NOTE(review): these decompose a trailing "i" element as the vowel ᅵ, whereas the
-- conjoining forms above use the '@' placeholder (e.g. ᅢ → ᅡ@ but ㅐ → ᅡᅵ), so the
-- two spellings romanize differently — confirm that this is intentional.
['ㆈ']='@ᅩ@ᅡᅵ',
['ㆋ']='@ᅮ@ᅥᅵ',
['ㆇ']='@ᅩ@ᅡ',
['ㆊ']='@ᅮ@ᅥ',
['ㅒ']='@ᅡᅵ',
['ㅖ']='@ᅥᅵ',
['ㅙ']='ᅩᅡᅵ',
['ㅞ']='ᅮᅥᅵ',
['ㆉ']='@ᅩᅵ',
['ㆌ']='@ᅮᅵ',
['ㅐ']='ᅡᅵ',
['ㅑ']='@ᅡ',
['ㅔ']='ᅥᅵ',
['ㅕ']='@ᅥ',
['ㅘ']='ᅩᅡ',
['ㅚ']='ᅩᅵ',
['ㅛ']='@ᅩ',
['ㅝ']='ᅮᅥ',
['ㅟ']='ᅮᅵ',
['ㅠ']='@ᅮ',
['ㅢ']='ᅳᅵ',
['ㅏ']='ᅡ',
['ㅓ']='ᅥ',
['ㅗ']='ᅩ',
['ㅜ']='ᅮ',
['ㅡ']='ᅳ',
['ㅣ']='ᅵ',
['ㆍ']='ᆞ',
}

-- Ordered rule list consumed line by line by export.tr.
-- Format of each line: <pattern> TAB <replacement>
--   * <pattern> is used as an mw.ustring pattern and <replacement> as a gsub
--     replacement string (so %1, %2, ... refer to captures).
--   * '#' starts a comment; comments and blank lines are stripped right after
--     this literal, before the list is ever used.
--   * A replacement of '×' means "replace with the empty string".
--   * '◆' marks a letter that has no romanization here; export.tr tracks any
--     output that still contains it.
--   * 'BREAK 1' and 'BREAK 2' are not substitutions: export.tr runs special
--     processing steps when it reaches them.
-- Rules are applied in the order written, so the order of lines is significant.
local tt = [==[
BREAK	1

# remove hanja from (ex.) 사뎐(辭典)
# caps prob. isn't necessary since the "base" text is actually hangeul?
# Hani regex is a reasonable subset of Hani from [[Module:scripts/data]],
# last checked on 20220221
%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%)	×

# to yale

# non-simple
gᄋ	Ğ # voiced velar fricative /ɣ/
ᄋᄋ	Ő
@ᅮ	yu
@ᅩ	yo
ᅩᅡ	wa
ᅮᅥ	we
ᅵᆞ	yo
ᆞᆞ	yo

# choseong
ᄀ	K
ᄂ	N
ᄃ	T
ᄅ	L
ᄆ	M
ᄇ	P
ᄉ	S
ᄋ	Ø
ᄌ	C
ᄎ	CH
ᄏ	KH
ᄐ	TH
ᄑ	PH
ᄒ	H
ᄝ	◆
ᄫ	Ƃ
ᅗ	◆
ᄛ	◆
ᅌ	Ŋ
ᅀ	Z
ᅙ	Q
ᄼ	◆
ᅎ	◆
ᅔ	◆
ᄾ	◆
ᅐ	◆
ᅕ	◆
ᅟ	× # filler

# jungseong
@	y
ᅡ	a
ᅥ	e
ᅩ	wo
ᅮ	wu
ᅳ	u
ᅵ	i
ᆞ	o
ᅠ	× # filler

# jongseong
ᆨ	k
ᆫ	n
ᆮ	t
ᆯ	l
ᆷ	m
ᆸ	p
ᆺ	s
ᆼ	ø
ᆽ	c
ᆾ	ch
ᆿ	kh
ᇀ	th
ᇁ	ph
ᇂ	h
ᇢ	◆
ᇦ	ƃ
ᇴ	◆
ퟝ	◆
ᇰ	ŋ
ᇫ	z
ᇹ	q

# tone
〮	↑
〯	→

# tone diacritic location
([aiueo]+)([y]?)([↑→↓])	%1%3%2

# hyphens within syllables
# CV-y
# CVC-C
# CV-C
# C-V
%-%-%-%-(.-[wyaiueo↑→↓]+)(y)	%1-%2
%-%-%-(.-[wyaiueo↑→↓]+[^wyaiueo ])([^wyaiueo ])	%1-%2
%-%-%-(.-[wyaiueo↑→↓]+)	%1-
%-%-(.-)([wyaiueo])	%1-%2

# 子(ᄌᆞ)ㅣ
(%))(%-?)i	%1%2y

Ø	×

BREAK	2

↑	́
→	̌
↓	̀

ğ	G
ő	OO
Ø	NG # capitalized hanja readings
ø	ng
ƃ	W
Ŋ	NG # capitalized hanja readings
ŋ	ng
]==]

-- Normalise the rule list once, when the module is loaded:
--   1. trim whitespace from both ends of the whole list,
--   2. delete every "#" comment together with the whitespace in front of it,
--   3. collapse runs of newlines so that no empty lines are left.
do
	local rules = mw.text.trim(tt)
	rules = mw.ustring.gsub(rules, '%s*#[^\n]+', '')
	rules = mw.ustring.gsub(rules, '\n+', '\n')
	tt = rules
end

-- Character sets that export.tr splices into bracket classes:
--   a: the initial consonants (choseong) handled by the rule list, ending with the choseong filler
--   b: the '@' placeholder plus the vowels (jungseong) handled by the rule list, ending with the jungseong filler
--   c: the final consonants (jongseong) handled by the rule list
--   d: the two tone marks handled by the rule list
local a, b, c, d = 'ᄀᄂᄃᄅᄆᄇᄉᄋᄌᄎᄏᄐᄑᄒᄝᄫᅗᄛᅌᅀᅙᄼᅎᅔᄾᅐᅕᅟ', '@ᅡᅥᅩᅮᅳᅵᆞᅠ', 'ᆨᆫᆮᆯᆷᆸᆺᆼᆽᆾᆿᇀᇁᇂᇢᇦᇴퟝᇰᇫᇹ', '〮〯'

--- Romanize hangul text by running it through the ordered rule list `tt`.
-- Ruby markup tags are removed first. The text is then NFD-normalized,
-- cluster and compatibility jamo are broken up via `tt_complex`, and every
-- rule of `tt` is applied in order, with two special steps at the BREAK lines.
-- @param text string to transliterate
-- @param lang language code; accepted but not read by this function
-- @param sc script code; accepted but not read by this function
-- @return the romanized string, or nil when `text` contains no Hangul character
function export.tr(text, lang, sc)
	-- Drop <ruby>, <rp> and <rt> tags (opening and closing forms).
	text = gsub(text, "%<%/?r[pt]%>", "")
	text = gsub(text, "%<%/?ruby%>", "")

	-- Nothing to romanize unless at least one Hangul character is present.
	if not mw.ustring.match(text, '[' .. chars_Hang .. ']') then
		return nil
	end

	-- Truthy when the input carries a tone mark (checked before normalization).
	local has_tone_marks = mw.ustring.find(text, '[' .. d .. ']')

	-- Decompose to NFD, then split cluster/compatibility jamo into simple jamo.
	text = gsub(mw.ustring.toNFD(text), '.', tt_complex)

	for rule in mw.text.gsplit(tt, '\n') do
		local pattern, repl = mw.ustring.match(rule, '(.+)\t(.+)')
		local directive = pattern .. repl

		if directive == 'BREAK1' then
			-- add period between hanja readings
			text = gsub(text, '([' .. chars_Hani .. '])%((.-)%)', function(hanja, reading)
				local dotted = gsub(reading, '([' .. a .. ']+)', '.%1')
				return hanja .. '(' .. dotted .. ')'
			end)

			if has_tone_marks then
				-- move the tone mark so it directly follows the vowel (ahead of any
				-- final consonant) and mark toneless syllables as low tone
				local syllable = '([' .. a .. ']+)([' .. b .. ']+)([' .. c .. ']*)([' .. d .. ']*)'
				text = gsub(text, syllable, function(initial, medial, final, tone)
					if tone == '' then
						tone = '↓'
					end
					return initial .. medial .. tone .. final
				end)
			end
		elseif directive == 'BREAK2' then
			text = mw.ustring.lower(text)

			-- hanja readings: replace "hanja(reading)" by the uppercased reading
			-- ref. [[Module:Ethi-translit]]
			text = gsub(text, '()([' .. chars_Hani .. ']+)%((.-)%)()', function(_start, _hanja, reading)
				-- treat final ieung as null if tones are marked (is this a safe assumption?)
				if has_tone_marks then
					reading = gsub(reading, 'ø', '')
				end
				return (mw.ustring.upper(reading))
			end)

			-- remove the leading period of a hanja reading: at the very start of
			-- the text, right after ''' and right after whitespace
			text = gsub(text, '^%.', '')
			text = gsub(text, "'''%.", "'''")
			text = gsub(text, '(%s)%.', '%1')
		else
			-- ordinary rule; '×' is the rule list's notation for "delete"
			text = gsub(text, pattern, repl == '×' and '' or repl)
		end
	end

	-- track failed romanizations
	-- (black diamond instead of U+FFFD to avoid warnings when saving this page)
	if mw.ustring.match(text, '◆') then
		require('Module:debug').track('okm-translit/failed romanization')
	end

	return text
end

-- Hand the module table (with its `tr` function) to whoever requires this module.
return export