Module:grc-utilities

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

This module contains four functions, three of which are called by other modules.

standardDiacritics takes spacing or nonstandard diacritics and converts them to standard combining diacritics. This function is used by pronunciationOrder.

reorderDiacritics decomposes the text (mw.ustring.toNFD) so that the diacritics are separated from their letters, and then reorders each sequence of diacritics so that the macron or breve is first; the diaeresis or breathing mark is second; the acute, grave, or circumflex is third; and the iota subscript is last. Aside from the iota subscript part, this is the only order in which the diacritics can display correctly, as explained elsewhere. This function is used by Module:typing-aids and {{chars}}.

  • ά̓̆νερ (α◌́◌̓◌̆νερ) → ᾰ̓́νερ (α◌̆◌̓◌́νερ)

pronunciationOrder does the same thing, except that it puts the macron or breve after the accent mark (the iota subscript still comes last) and recombines the diacritics (mw.ustring.toNFC) after reordering them. The diaeresis or breathing mark and the accent mark will recombine, while the macron and breve do not recombine: the function replaces them with their spacing forms. This function is used by Module:grc-pronunciation and {{grc-IPA}}.

Module:grc-utilities/data holds the diacritic definitions and substitutions that are used by this module.

Tokenization[edit]

The function tokenize breaks the text into meaningful units of a single consonant or monophthong letter, or diphthong, with any diacritics, as shown below. This function is used by Module:grc-translit and Module:grc-accent, and by the sandbox module Module:grc-pronunciation/sandbox.

The first argument is the word to be tokenized. The second is a boolean: if true, the function will group εω together as a diphthong, for instance in πόλεως (póleōs), genitive of πόλῐς (pólis, city state).

word tokens
ἡμεῖς ἡ, μ, εῖ, ς
οἷαι οἷ, αι
ἀναῡ̈τέω ἀ, ν, α, ῡ̈, τ, έ, ω
δαΐφρων δ, α, ΐ, φ, ρ, ω, ν
τούτῳ τ, ού, τ, ῳ
ὑϊκός ὑ, ϊ, κ, ό, ς
ἡ Ἑλήνη ἡ,  , Ἑ, λ, ή, ν, η
νηῦς ν, ηῦ, ς
υἱός υἱ, ό, ς
ὄργυιᾰ ὄ, ρ, γ, υι, ᾰ
οὐ δοκεῖν ἀλλ᾽ εἶναι ἀγαθὸν οὐ,  , δ, ο, κ, εῖ, ν,  , ἀ, λ, λ, ᾽,  , εἶ, ν, αι,  , ἀ, γ, α, θ, ὸ, ν

Testcases[edit]

 

local export = {}

local m_script_utils = require("Module:script utilities")
local m_links = require("Module:links")
local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("polytonic")

local m_data = mw.loadData("Module:grc-utilities/data")
local groups = m_data.groups
local diacritic_order = m_data.diacritic_order
local conversions = m_data.conversions
local diacritics = m_data.diacritics
local diacritic = m_data.diacritic
local diaeresis = diacritics.diaeresis
local macron = diacritics.macron
local breve = diacritics.breve
local spacing_macron = diacritics.spacing_macron
local spacing_breve = diacritics.spacing_breve
local circumflex = diacritics.circum
local subscript = diacritics.subscript
local combining_diacritic = m_data.combining_diacritic

-- Patterns used by the tokenizer. The first three are anchored at both ends,
-- so they only match a string that consists of exactly the two letters.
-- Α, Ε, Η, Ο, Υ or Ω (either case) followed by iota.
local i_diphthong = "^[ΑΕΗΟΥΩαεηουω][Ιι]$"
-- Α, Ε, Η, Ο or Ω (either case) followed by upsilon.
local u_diphthong = "^[ΑΕΗΟΩαεηοω][Υυ]$"
-- Epsilon followed by omega (either case).
local synaeresis = "^[Εε][Ωω]$"
-- A diaeresis at the start of the string, optionally preceded by a macron
-- or breve. Presumably macron, breve and diaeresis are each a single
-- combining character; they come from Module:grc-utilities/data -- confirm.
local diaer_patt = "^[" .. macron .. breve .. "]?" .. diaeresis
-- Byte-level pattern for one UTF-8 encoded character: a lead byte
-- (\1-\127 or \194-\244) followed by any number of continuation bytes
-- (\128-\191). Meant for the byte-oriented string library, not mw.ustring.
local UTF8char = "[\1-\127\194-\244][\128-\191]*"

local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

local sparseConcat = require("Module:table").sparseConcat

local checkType = require "libraryUtil".checkType

-- Build an argument validator bound to funcName. The returned function
-- forwards (argIndex, arg, expectType, nilOk) to checkType with funcName
-- prepended, so callers do not have to repeat the function name.
local function _check(funcName)
	local function validate(argIndex, arg, expectType, nilOk)
		return checkType(funcName, argIndex, arg, expectType, nilOk)
	end
	return validate
end

-- Call func once, in order, for every match of the UTF8char pattern in str.
-- The byte-oriented string.gmatch is used; UTF8char is what groups the
-- bytes of each character together.
local function forEach(str, func)
	local nextChar = string.gmatch(str, UTF8char)
	local char = nextChar()
	while char do
		func(char)
		char = nextChar()
	end
end

-- Append chars to list[index] (or store it there when the slot is empty),
-- then return text with its first #chars bytes dropped.
-- Raises an error when chars is nil or false.
local function add(list, index, chars, text)
	if not chars then
		error("The function add cannot act on a nil character.")
	end
	local existing = list[index]
	list[index] = existing and (existing .. chars) or chars
	-- Byte-based slicing is enough here: #chars is the byte length of the
	-- prefix that is being removed.
	return text:sub(#chars + 1)
end

-- Pass term to tag_text from Module:script utilities together with this
-- module's language object (code "grc") and script object (code
-- "polytonic"). face is forwarded unchanged; whatever tag_text returns is
-- returned as-is.
function export.tag(term, face)
	return m_script_utils.tag_text(term, lang, sc, face)
end

-- Call full_link from Module:links for term, filling in this module's
-- language and script objects. alt and tr are passed along in the data
-- table; face is passed as the second argument.
function export.link(term, face, alt, tr)
	local data = {
		term = term,
		alt = alt,
		lang = lang,
		sc = sc,
		tr = tr,
	}
	return m_links.full_link(data, face)
end

-- Call language_link from Module:links for term with this module's language
-- object and an optional alt form.
-- NOTE(review): nothing in the visible part of this file calls this helper.
local function linkNoTag(term, alt)
	local data = {
		term = term,
		lang = lang,
		alt = alt,
	}
	return m_links.language_link(data)
end

-- Convert spacing to combining diacritics, and nonstandard to standard
-- polytonic Greek. The text is decomposed (NFD) first; then every UTF-8
-- character is looked up in the conversions table and replaced when the
-- table has an entry for it.
function export.standardDiacritics(text)
	local decomposed = toNFD(text)
	
	-- The parentheses keep only the resulting string and drop gsub's
	-- replacement count.
	return (decomposed:gsub(UTF8char, conversions))
end

--[=[	This function arranges diacritics in the following order:
			1. macron or breve
			2. breathings or diaeresis
			3. acute, circumflex, or grave
			4. iota subscript
		Used by [[Module:typing-aids]].
		
		The slot of each diacritic is looked up in diacritic_order (loaded
		from [[Module:grc-utilities/data]]). When a sequence contains more
		than one diacritic for the same slot, no error is raised (the error
		call below is commented out): the extra diacritic is inserted into
		the output table and a tracking template is recorded instead.
		
		Receives a string of diacritics and returns the reordered string.
]=]
local function reorderDiacriticSequence(diacritics)
	-- Sparse table: each diacritic is stored at the slot number that
	-- diacritic_order assigns to it.
	local output = {}
	forEach(diacritics,
		function (diacritic)
			-- NOTE(review): index is nil when diacritic has no entry in
			-- diacritic_order, and the assignment below would then fail.
			-- Presumably every character matched by the caller's pattern
			-- has an entry -- confirm against the data module.
			local index = diacritic_order[diacritic]
			if not output[index] then
				output[index] = diacritic
			else
				-- The slot is already taken by an earlier diacritic.
				-- Place breve after macron.
				if diacritic == breve then
					index = index + 1
				end
				-- The following might have odd results when there
				-- are three or more diacritics.
				-- (table.insert shifts later entries up, and output may
				-- have gaps at this point.)
				table.insert(output, index, diacritic)
				-- [[Special:WhatLinksHere/Template:tracking/grc-utils/too many diacritics]]
				require("Module:debug").track("grc-utils/too many diacritics")
				--[[
				local m_templates = require("Module:grc-utilities/templates")
				error("There are two diacritics, " ..
						m_templates.addDottedCircle(output[index]) .. " and " ..
						m_templates.addDottedCircle(diacritic) ..
						" that belong in the same position. There should be only one."
				)
				--]]
			end
		end)
	-- Join the filled slots into one string with sparseConcat from
	-- Module:table (used because output can contain gaps).
	return sparseConcat(output)
end

-- Decompose text (NFD) and rewrite every run matched by two or more
-- repetitions of the combining_diacritic pattern with
-- reorderDiacriticSequence. A single diacritic needs no reordering, so it
-- is left untouched.
-- Returns the decomposed, reordered string.
function export.reorderDiacritics(text)
	-- The outer parentheses drop gsub's second return value (the number of
	-- replacements), so callers receive the string only.
	return (gsub(toNFD(text),
		combining_diacritic .. combining_diacritic .. "+",
		reorderDiacriticSequence))
end

--[=[
		This breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].
--]=]

-- Decide whether the two characters in chars should be kept together as one
-- token.
--   chars:     the next two characters of the text
--   nextChars: the two characters after them, checked for a diaeresis
--              (optionally preceded by a macron or breve), which would
--              mark the second vowel as separate
--   isNoun:    when truthy, epsilon + omega is also accepted as a pair
-- The result is only meant to be tested for truthiness.
local function isDiphthong(chars, nextChars, isNoun)
	local vowelPair = find(chars, i_diphthong) or find(chars, u_diphthong)
	if vowelPair and not find(nextChars, diaer_patt) then
		return true
	end
	-- Not an ordinary diphthong: fall back to the synaeresis check, which
	-- only applies when the caller asked for it.
	return isNoun and find(chars, synaeresis)
end

--[=[
		Split text into tokens: a single letter or a diphthong, each with
		the diacritics that follow it. For example, ἡμεῖς gives
		ἡ, μ, εῖ, ς.
		
		text:   string to tokenize; any other type raises an error.
		isNoun: optional boolean; when true, εω is grouped as one token.
		
		Returns the tokens table after passing it through
		compressSparseArray from [[Module:table]].
--]=]
function export.tokenize(text, isNoun)
	local check = _check "tokenize"
	-- checkType raises for any non-string text, so no further type test is
	-- needed after these two calls.
	check(1, text, "string")
	check(2, isNoun, "boolean", true)
	
	-- decompose and reorder diacritics
	text = export.reorderDiacritics(text)
	
	local tokens = {}
	-- i is the index of the token that is currently being built.
	local i = 0
	while #text > 0 do
		local char = sub(text, 1, 1) or ""
		local chars = sub(text, 1, 2) or ""
		local nextChars = sub(text, 3, 4) or ""
		if find(char, diacritic) then
			-- A diacritic belongs to the current token: append it there
			-- and remove it from the text. (A diacritic that comes before
			-- any letter is stored at index 0.)
			text = add(tokens, i, char, text)
		elseif isDiphthong(chars, nextChars, isNoun) then
			-- The next two characters form a diphthong: start a new token
			-- with both of them and remove them from the text. A diaeresis,
			-- if present, comes immediately after the second character, or
			-- after a macron or breve; isDiphthong checks for it.
			i = i + 1
			text = add(tokens, i, chars, text)
		else
			-- Any other character starts a new token on its own. Remove it
			-- from the text.
			i = i + 1
			text = add(tokens, i, char, text)
		end
	end
	
	return require("Module:table").compressSparseArray(tokens)
end

--[=[	Places diacritics in the following order:
			1. breathings or diaeresis
			2. acute, circumflex, or grave
			3. macron or breve
			4. iota subscript
		and then recomposes the text (NFC). Combining macrons and breves
		are replaced by spacing_macron and spacing_breve before
		recomposition.
		Used by [[Module:grc-pronunciation]].		]=]
function export.pronunciationOrder(text)
	local standardized = export.standardDiacritics(text)
	
	-- Reordering is only needed when the text contains a character from
	-- groups[1] (macron or breve).
	if find(standardized, groups[1]) then
		-- Rebuild one run of diacritics, taking at most one match from
		-- each group.
		local function rearrange(sequence)
			local breathing = match(sequence, groups[2]) or ""
			local accent = match(sequence, groups[3]) or ""
			local length = match(sequence, groups[1]) or ""
			local iota = match(sequence, groups[4]) or ""
			return breathing .. accent .. length .. iota
		end
		
		-- Only runs of two or more diacritics are rewritten.
		standardized = gsub(standardized, diacritic .. diacritic .. "+", rearrange)
		-- combining to spacing macron
		standardized = gsub(standardized, macron, spacing_macron)
		-- combining to spacing breve
		standardized = gsub(standardized, breve, spacing_breve)
	end
	
	return toNFC(standardized)
end


-- Collect the vowels of text whose length is ambiguous: a token that is α, ι
-- or υ with no macron, breve, circumflex or iota subscript among its
-- diacritics.
-- Returns two tables:
--   1. a list with one entry per ambiguous vowel: the bare vowel when noTag
--      is truthy, otherwise the vowel plus its diacritics passed through
--      export.tag
--   2. a set keyed by the lowercased vowel letters that were found
-- Raises an error when text is not a string.
function export.findAmbig(text, noTag)
	if type(text) ~= "string" then
		error("The input to function findAmbig is nonexistent or not a string")
	end
	
	local lengthDiacritic = "[" .. macron .. breve .. circumflex .. subscript .. "]"
	local aiu_diacritic = "^([" .. "αιυ" .. "])(" .. diacritic .. "*)$"
	
	local output, vowels = {}, {}
	-- Examine the word one token at a time.
	for _, token in ipairs(export.tokenize(text)) do
		local vowel, marks
		-- Tokens that contain a consonant are never candidates.
		if not find(token, m_data.consonant) then
			vowel, marks = match(token, aiu_diacritic)
		end
		
		local ambiguous = vowel
			and (marks == "" or not find(marks, lengthDiacritic))
		if ambiguous then
			output[#output + 1] = noTag and vowel or export.tag(vowel .. marks)
			
			-- Record which vowel letters are ambiguous, for categorization
			-- purposes.
			vowels[mw.ustring.lower(vowel)] = true
		end
	end
	
	return output, vowels
end

return export