-- Module:User:Erutuon/he-translit-circumflex
--
-- From Wiktionary, the free dictionary
-- Jump to navigation Jump to search
--
-- This is a private module sandbox of Erutuon, for his own experimentation. Items in this module may be added and removed at Erutuon's discretion; do not rely on this module's stability.


-- Currently based on
-- https://ia803104.us.archive.org/7/items/A_Students_Vocabulary_For_Biblical_Hebrew_And_Aramaic/A%20Student%27s%20Vocabulary%20for%20Biblical%20Hebrew%20and%20Aramaic_text.pdf
local export = {}

local Array = require "Module:array"

local U = mw.ustring.char
local ufind  = mw.ustring.find
local ugsub  = mw.ustring.gsub
local ulen   = mw.ustring.len
local umatch = mw.ustring.match
local usub   = mw.ustring.sub

-- Hebrew combining marks, each built from its Unicode code point.
-- Most vowel points sit in the contiguous run U+05B0..U+05BB.
local sheva = U(0x05B0)
local hataf_segol = U(0x05B1)
local hataf_patah = U(0x05B2)
local hataf_qamats = U(0x05B3)
local hiriq = U(0x05B4)
local tsere = U(0x05B5)
local segol = U(0x05B6)
local patah = U(0x05B7)
local qamats = U(0x05B8)
-- Qamats qatan has its own code point outside the contiguous run.
local qamats_qatan = U(0x05C7)
local holam = U(0x05B9)
local holam_haser_for_waw = U(0x05BA)
local qubuts = U(0x05BB)
-- One code point serves as both dagesh and mappiq.
local dagesh_mappiq = U(0x05BC)
local shin_dot = U(0x05C1)
local sin_dot = U(0x05C2)

-- Combining macrons used in the Latin output, and a character class that
-- matches either of them.
local macron_above = U(0x0304)
local macron_below = U(0x0331)
local macron = "[" .. macron_above .. macron_below .. "]"

-- Letters that can stand for a vowel instead of a consonant; may_be_silent
-- looks tokens up in vowel_letters.
local alef = "א"
local he = "ה"
local waw = "ו"
local yod = "י"
local vowel_letters = alef .. he .. waw .. yod
-- Base letter shared by shin and sin; the attached dot tells them apart.
local shin_sin = 'ש'
-- local vowel_letter = "[" .. vowel_letters .. "]"

-- -- '0' represents silent sheva
-- local vowel_points = (
-- 	sheva .. hataf_segol .. hataf_patah .. hataf_qamats .. hiriq .. tsere ..
-- 	segol .. patah .. qamats .. qamats_qatan .. holam .. qubuts .. '0' ..
-- 	holam_haser_for_waw
-- )
-- local vowel_point = "[" .. vowel_points .. "]"
-- Short vowel points and a character class matching any one of them.
-- NOTE(review): short_vowel is not referenced in the visible part of this
-- module; confirm whether it is still needed.
local short_vowels = segol .. patah .. hiriq .. qubuts .. qamats_qatan
local short_vowel = "[" .. short_vowels .. "]"

-- Two-code-point sequences: waw + dagesh and waw + holam.
local shuruq = waw .. dagesh_mappiq
local holam_male = waw .. holam

-- Latin output for vocalic sheva and for furtive patah.
local schwa = 'ə'
local superscript_a = 'ᵃ'

-- Default transliteration of each vowel.
-- Keys are either a single vowel point or a two-code-point sequence
-- (holam_male, shuruq). The single-code-point keys also define
-- vowel_diacritics, which is_vowel and export.normalize rely on, so adding
-- a point here makes it a recognised vowel everywhere in the module.
local vowel_map = {
	-- Sheva is silent by default; export.transliterate emits schwa itself
	-- wherever it decides the sheva is vocalic.
	[sheva] = '',
	[hataf_segol] = 'ĕ',
	[hataf_patah] = 'ă',
	[hataf_qamats] = 'ŏ',
	[hiriq] = 'i',
	[tsere] = 'ē',
	[segol] = 'e',
	[patah] = 'a',
	[qamats] = 'ā',
	[qamats_qatan] = 'o',
	[qubuts] = 'u',
	[holam] = 'ō',
	-- U+05BA marks a holam that belongs to a consonantal waw. Without this
	-- entry the point was not treated as a vowel at all and was copied into
	-- the output untransliterated. Since waw + U+05BA never matches
	-- holam_male, the waw is emitted as "w" followed by this vowel.
	[holam_haser_for_waw] = 'ō',
	-- [shin_dot] = '',
	-- [sin_dot] = '',
	[holam_male] = 'ô',
	[shuruq] = 'û',
}

-- Transliterations used when a vowel point is written plene, that is,
-- followed by a mater lectionis that export.transliterate then skips.
-- Only the uncommented entries are ever looked up: hiriq or tsere before
-- yod, and qamats before he.
local plene_map = {
	-- [sheva] = '', -- ə
	-- [hataf_segol] = 'ĕ',
	-- [hataf_patah] = 'ă',
	-- [hataf_qamats] = 'ŏ',
	[hiriq] = 'î',
	[tsere] = 'ê',
	-- [segol] = 'ệ', -- Lambdin's Introduction to Biblical Hebrew uses this.
	-- [patah] = 'a',
	[qamats] = 'â',
	-- [qamats_qatan] = 'o', -- if plene, then misspelling?
	-- [qubuts] = 'u',
	-- [holam] = 'ō',
	-- [holam_male] = 'ô',
	-- [shuruq] = 'û',
	
}

-- Every key of vowel_map that is exactly one code point long, concatenated
-- into one string. It is used as the contents of a character class and for
-- plain substring lookup, so the order of the keys does not matter.
local vowel_diacritics = Array.keys(vowel_map):filter(function(vowel) return ulen(vowel) == 1 end):concat()

-- The letters whose transliteration depends on dagesh, final forms included.
local bet = 'ב'
local gimel = 'ג'
local dalet = 'ד'
local kaf = 'כ'
local kaf_final = 'ך'
local pe = 'פ'
local pe_final = 'ף'
local tav = 'ת'
local bgdkpt = bet .. gimel .. dalet .. kaf .. kaf_final .. pe .. pe_final .. tav

-- Together with he, these are the letters makes_furtive_patah tests for.
local het = 'ח'
local ayn = 'ע'

-- Transliteration of each consonant letter without a dagesh.
-- The bgdkpt letters carry a combining macron here; export.transliterate
-- strips it again when the letter has a dagesh. Shin/sin is absent because
-- its value depends on the dot (see shin_sin_map).
local letter_map = {
	[alef] = 'ʾ',
	[bet] = 'b' .. macron_below,
	[gimel] = 'g' .. macron_above,
	[dalet] = 'd' .. macron_below,
	['ה'] = 'h',
	[waw] = 'w',
	['ז'] = 'z',
	[het] = 'ḥ',
	['ט'] = 'ṭ',
	['י'] = 'y',
	[kaf] = 'k' .. macron_below,
	[kaf_final] = 'k' .. macron_below,
	['ל'] = 'l',
	['מ'] = 'm',
	['ם'] = 'm',
	['נ'] = 'n',
	['ן'] = 'n',
	['ס'] = 's',
	[ayn] = 'ʿ',
	[pe] = 'p' .. macron_above,
	[pe_final] = 'p' .. macron_above,
	['צ'] = 'ṣ',
	['ץ'] = 'ṣ',
	['ק'] = 'q',
	['ר'] = 'r',
	[tav] = 't' .. macron_below,
}

-- Transliteration of the shin/sin letter, keyed by the dot attached to it.
local shin_sin_map = {
	[shin_dot] = "š",
	[sin_dot] = "ś",
}

-- Shin/sin plus every single-code-point key of letter_map, concatenated.
-- Used as the contents of character classes that recognise a consonant.
local letters = shin_sin .. Array.keys(letter_map):filter(function(letter) return ulen(letter) == 1 end):concat()

-- Hebrew punctuation replaced in the output: maqaf becomes a hyphen and
-- sof pasuq becomes a full stop. Other unrecognised tokens pass through.
local punctuation_map = {
	["־"] = "-",
	["׃"] = ".",
}

-- Rearrange combining marks into the order the tokenizer expects.
-- As noted in [[Module:he-translit]], Unicode normalization yields
--   consonant, vowel point, dagesh or mappiq, shin or sin dot
-- whereas this module wants
--   consonant, shin or sin dot, dagesh or mappiq, vowel point.
function export.normalize(text)
	local vowel_run = "([" .. vowel_diacritics .. "]*)"
	local dagesh_run = "(" .. dagesh_mappiq .. "*)"
	local dot_run = "([" .. shin_dot .. sin_dot .. "]*)"
	local reordered = ugsub(text, vowel_run .. dagesh_run .. dot_run, "%3%2%1")

	-- Two vowel points stacked on one letter (implicit ktiv/qre spellings
	-- such as יְרוּשָׁלִַם and יְרוּשָׁלְַמָה): swap the pair when a hiriq comes first,
	-- or when a sheva comes first and a patah/qamats follows.
	local stacked_vowels = "[" .. hiriq .. patah .. qamats .. qamats_qatan .. sheva .. "]+"
	local function reorder_pair(vowels)
		if ulen(vowels) == 2 then
			local first, second = usub(vowels, 1, 1), usub(vowels, 2, 2)
			local hiriq_leads = first == hiriq and second ~= hiriq
			local sheva_leads = first == sheva
				and (second == patah or second == qamats or second == qamats_qatan)
			if hiriq_leads or sheva_leads then
				return second .. first
			end
		end
		-- Anything else falls through so that gsub keeps the original run.
	end

	local result = ugsub(reordered, stacked_vowels, reorder_pair)
	return result
end

-- Try each pattern in turn and accept the first one whose match begins
-- exactly at code_point_pos (a code point index, not a byte index).
-- Returns the first capture of that pattern and the index where the match
-- ends; returns nothing when no pattern matches at that position.
local function match_alt_one(text, code_point_pos, patterns)
	local index = 1
	local pattern = patterns[index]
	while pattern ~= nil do
		local first, last, captured = ufind(text, pattern, code_point_pos)
		if first == code_point_pos then
			return captured, last
		end
		index = index + 1
		pattern = patterns[index]
	end
end

-- Token shapes, tried in this order at each position:
--   1. holam male (waw followed by holam) as a single token;
--   2. a letter with an optional shin/sin dot and an optional dagesh/mappiq;
--   3. any single code point (vowel points, punctuation, everything else).
-- Each pattern has exactly one capture, which becomes the token.
local token_patterns = {
	"(" .. holam_male .. ")",
	"([" .. letters .. waw .. "][" .. shin_dot .. sin_dot .. "]?" .. dagesh_mappiq .. "?)",
	"(.)",
}

-- Read the token that starts at code_point_pos.
-- Returns the token and the index of its last code point, or nothing when
-- no pattern in token_patterns matches there (for instance past the end of
-- the text).
local function next_token(text, code_point_pos)
	return match_alt_one(text, code_point_pos, token_patterns)
end

-- Split text into a list of tokens by calling next_token repeatedly until
-- it reports no further match.
-- TODO from the original author: validate shin dot and sin dot?
local function tokenize(text)
	local tokens = {}
	local cursor = 1
	local token, last = next_token(text, cursor)
	while last do
		tokens[#tokens + 1] = token
		-- Resume right after the last code point of the token just read.
		cursor = last + 1
		token, last = next_token(text, cursor)
	end
	return tokens
end

export.tokenize = tokenize

-- Reports whether a token may be a consonant, i.e. whether it begins with
-- one of the characters in letters. A nil token is never a consonant.
local function is_consonant(token)
	if token == nil then
		return false
	end
	local first_hit = ufind(token, "[" .. letters .. "]", 1)
	return first_hit == 1
end

-- Reports whether a token occurs, as a plain substring, in vowel_letters
-- (alef, he, waw, yod). A nil token yields false.
local function may_be_silent(token)
	if token == nil then
		return false
	end
	local found_at = vowel_letters:find(token, 1, true)
	return found_at ~= nil
end

-- Reports whether a token is definitely a vowel: either holam male or one
-- of the single-code-point vowel diacritics.
-- Shuruq is deliberately left out because the same token could be a
-- doubled consonantal waw ("ww").
local function is_vowel(token)
	if token == nil then
		return false
	end
	if token == holam_male then
		return true
	end
	local found_at = vowel_diacritics:find(token, 1, true)
	return found_at ~= nil
end

-- Reports whether the token at index i comes right after an unchangeable
-- vowel: a shuruq, a holam male, or a yod that itself follows hiriq, tsere
-- or segol.
local function is_preceded_by_unchangeable_vowel(tokens, i)
	local before = tokens[i - 1]
	-- A token equal to shuruq is accepted without checking whether it is
	-- really a vowel or a waw with dagesh.
	if before == shuruq or before == holam_male then
		return true
	end
	if before ~= yod then
		return false
	end
	local vowel = tokens[i - 2]
	return vowel == hiriq or vowel == tsere or vowel == segol
end

-- Reports whether a token contains a dagesh/mappiq point.
-- A nil token (an index past either end of the token list) counts as
-- having no dagesh, in line with is_consonant and is_vowel. The vocalic
-- sheva check in export.transliterate probes tokens[i + 1], which is nil
-- when the sheva is the last token; indexing nil there used to raise an
-- error.
local function has_dagesh(token)
	if token == nil then
		return false
	end
	return token:find(dagesh_mappiq, 1, true) ~= nil
end

-- Reports whether a token begins with the letter waw (byte-wise prefix
-- comparison). The token must not be nil.
local function is_waw(token)
	local prefix = token:sub(1, #waw)
	return prefix == waw
end

-- Reports whether a token begins with the letter he (byte-wise prefix
-- comparison). The token must not be nil.
local function is_he(token)
	local prefix = token:sub(1, #he)
	return prefix == he
end

-- Reports whether a token begins with one of the bgdkpt letters
-- (final kaf and final pe included).
local function is_bgdkpt(token)
	local leading = umatch(token, "^[" .. bgdkpt .. "]")
	return leading ~= nil
end

-- Reports whether a token marks a word boundary: either there is no token
-- (nil, i.e. before the start or after the end of the list) or the token is
-- a single whitespace or punctuation character.
local function is_word_boundary(token)
	if token == nil then
		return true
	end
	local hit = ufind(token, "^[%s%p]$")
	return hit ~= nil
end

-- Returns the first code point of a token (its base letter when the token
-- is a consonant). Returns nothing for a nil token.
local function get_letter(token)
	-- assert(ufind(token, "[" .. letters .. "]") == 1)
	if token == nil then
		return
	end
	return usub(token, 1, 1)
end

-- Returns the shin dot or sin dot contained in a token, or nil when the
-- token has neither.
local function get_dot(token)
	local dot_class = "[" .. shin_dot .. sin_dot .. "]"
	local dot = umatch(token, dot_class)
	return dot
end

-- Reports whether the token right after index i is a vowel, counting a
-- token equal to shuruq as one.
local function is_followed_by_vowel(tokens, i)
	local following = tokens[i + 1]
	if is_vowel(following) then
		return true
	end
	return following == shuruq
end

-- Reports whether the token at index i is preceded by a vowel, looking back
-- past any run of possibly silent letters (alef, he, waw, yod). A token
-- equal to shuruq counts as a vowel.
local function is_preceded_by_vowel(tokens, i)
	local cursor = i
	repeat
		cursor = cursor - 1
	until not may_be_silent(tokens[cursor])
	local candidate = tokens[cursor]
	return is_vowel(candidate) or candidate == shuruq
end

-- Reports whether a token is a letter that can take a furtive patah:
-- ayin, het, or he. A token that is exactly a bare he does not qualify.
local function makes_furtive_patah(token)
	local first_guttural = ufind(token, "([" .. ayn .. het .. he .. "])")
	if first_guttural ~= 1 then
		return false
	end
	if token ~= he then
		return true
	end
	return has_dagesh(token)
end

-- Transliterate pointed Hebrew text into Latin script.
-- The text is passed through export.normalize and export.tokenize, and the
-- tokens are then visited left to right. Each token appends zero or more
-- strings to a buffer, which is concatenated and returned at the end.
-- Tokens fall into four groups, tested in this order: waw (including holam
-- male and waw with dagesh), other consonants, vowel points, and anything
-- else (punctuation and unrecognised characters).
function export.transliterate(text)
	local tokens = export.tokenize(export.normalize(text))
	local transliteration = {}
	-- Append one piece of output; a nil here means a lookup table has no
	-- entry for the token, so fail loudly.
	local function add_tr(val)
		assert(type(val) == "string")
		table.insert(transliteration, val)
	end
	-- Use a manually incremented loop so we can skip
	-- furtive patah and matres lectionis tokens.
	local i = 1
	while true do
		local token = tokens[i]
		if not token then
			break
		end
		if is_waw(token) then
			if token == holam_male then
				-- Right after a sheva the waw is read as a consonant
				-- carrying holam rather than as holam male.
				if tokens[i - 1] == sheva then
					add_tr(letter_map[waw] .. vowel_map[holam])
				else
					add_tr(vowel_map[holam_male])
				end
			-- waw with dagesh, shuruq
			elseif has_dagesh(token) then
				-- After a consonant or at the start of a word it is the vowel
				-- shuruq; otherwise it is a doubled consonantal waw.
				if is_consonant(tokens[i - 1]) or is_word_boundary(tokens[i - 1]) then
					add_tr(vowel_map[shuruq])
				else
					add_tr("ww")
				end
			else
				add_tr("w")
			end
		elseif is_consonant(token) then
			local letter = get_letter(token)
			-- Look the letter up directly; shin/sin is resolved by its dot,
			-- and an undotted shin/sin falls back to the sin value. The
			-- token is the assertion message if nothing matches.
			local tr = assert(letter_map[letter] or shin_sin_map[get_dot(token)] or letter == shin_sin and shin_sin_map[sin_dot], token)
			if has_dagesh(token) then
				-- A dagesh removes the macron that letter_map puts on the
				-- bgdkpt letters.
				tr = ugsub(tr, macron, "")
				-- Don't double he.
				-- Don't double bgdkpt after sheva or at beginning of word.
				if not is_he(token) and not (is_bgdkpt(token) and (tokens[i - 1] == sheva or is_word_boundary(tokens[i - 1]))) then
					tr = tr .. tr
				end
			end
			-- Transcribe furtive patah before its consonant and skip it.
			if makes_furtive_patah(token) and tokens[i + 1] == patah and is_word_boundary(tokens[i + 2]) then
				add_tr(superscript_a)
				i = i + 1
			end
			add_tr(tr)
		elseif is_vowel(token) then
			-- Plene spellings: hiriq or tsere before yod, and qamats before a
			-- he that is not itself followed by a vowel.
			if ((token == tsere or token == hiriq) and tokens[i + 1] == yod)
			or (token == qamats and tokens[i + 1] == he and not is_vowel(tokens[i + 2])) then
				add_tr(plene_map[token])
				i = i + 1 -- Skip mater lectionis.
			-- Handle vocalic sheva
			elseif token == sheva
			and (
				-- after initial consonant unless following consonant has dagesh
				-- NOTE(review): tokens[i + 1] is nil when the sheva is the
				-- final token, so has_dagesh must tolerate a nil argument.
				(is_word_boundary(tokens[i - 2]) and not has_dagesh(tokens[i + 1]))
				-- after another sheva not at end of word
				or (tokens[i - 2] == sheva and not is_word_boundary(tokens[i + 1]))
				-- between identical consonants
				or get_letter(tokens[i - 1]) == get_letter(tokens[i + 1])
				-- after unchangeable vowel
				or is_preceded_by_unchangeable_vowel(tokens, i - 1)
			) then
				add_tr(schwa)
			elseif
				-- implicit ktiv/qre from [[Module:he-translit/testcases]]:
				-- יְרוּשָׁלְַמָה, יְרוּשָׁלְָמָה
				token == sheva
				and (tokens[i - 1] == patah or tokens[i - 1] == qamats
					or tokens[i - 1] == qamats_qatan)
			then
				add_tr("y")
			elseif
				-- implicit ktiv/qre from [[Module:he-translit/testcases]]:
				-- יְרוּשָׁלִַם, יְרוּשָׁלִָם
				token == hiriq
				and (tokens[i - 1] == patah or tokens[i - 1] == qamats
					or tokens[i - 1] == qamats_qatan)
			then
				add_tr("yi")
			-- qamats in possibly closed syllable,
			-- as long as following two consonants are not identical, in which
			-- case the sheva has to be pronounced, putting the qamats
			-- in an open syllable
			-- NOTE(review): this compares whole tokens (dot and dagesh
			-- included), whereas the sheva rule above compares only the
			-- base letters via get_letter; confirm this is intended.
			elseif token == qamats and tokens[i + 2] == sheva and not (is_consonant(tokens[i + 1]) and is_consonant(tokens[i + 3]) and tokens[i + 1] == tokens[i + 3]) then
				add_tr(vowel_map[qamats_qatan])
			-- patah or qamats followed by a bare yod and then a consonant:
			-- emit the vowel plus "i" and skip the yod.
			elseif (token == patah or token == qamats) and tokens[i + 1] == yod and is_consonant(tokens[i + 2]) then
				add_tr(vowel_map[token])
				add_tr("i") -- ???
				i = i + 1
			else
				-- Default reading; a sheva that reaches this point is silent
				-- and maps to the empty string.
				add_tr(vowel_map[token])
			end
		else
			-- Not a letter or a vowel: map known Hebrew punctuation and
			-- pass everything else through unchanged.
			add_tr(punctuation_map[token] or token)
		end
		i = i + 1
	end
	return table.concat(transliteration)
end

return export