Module:User:AmazingJus/sco

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Word Expected Actual
stane ˈsten sta_en
yowe ˈjʌʉː yowe
thousand ˈhʉsən Lua error: bad argument #1 to 'match' (string expected, got nil)
traivel ˈtreːvəl traivɛˑ~æˑl
efternuin ˌɛftərˈnøn ɛˑ~æˑftɛˑ~æˑr+nuin
TextExpectedActual
test_pron:
Script error during testing: Module:User:AmazingJus/sco/testcases:10: attempt to call field 'link' (a nil value)
stack traceback:
	[C]: in function 'link'
	Module:User:AmazingJus/sco/testcases:10: in function 'check_IPA'
	Module:User:AmazingJus/sco/testcases:31: in function <Module:User:AmazingJus/sco/testcases:17>
	(tail call): ?
	[C]: in function 'xpcall'
	Module:UnitTests:369: in function <Module:UnitTests:328>
	(tail call): ?
	mw.lua:527: in function <mw.lua:507>
	[C]: ?
	[C]: in function 'expandTemplate'
	mw.lua:333: in function 'expandTemplate'
	Module:documentation:864: in function 'chunk'
	mw.lua:527: in function <mw.lua:507>
	[C]: ?

local export = {}

local lang = require("Module:languages").getByCode("sco")
local m_IPA = require("Module:IPA")

local gmatch = mw.ustring.gmatch
local gsplit = mw.text.gsplit
local match = mw.ustring.match
local gsubn = mw.ustring.gsub
local len = mw.ustring.len
local lower = mw.ustring.lower
local sub = mw.ustring.sub

-- single-result wrapper around gsubn(): returns only the substituted string
-- and drops the replacement count, so the result is safe to use as the last
-- item of an argument list or table constructor
local function gsub(term, foo, bar, n)
	-- the extra parentheses truncate the call to exactly one value
	return (gsubn(term, foo, bar, n))
end

--[[ Dialect abbreviations:
* Insular:
** Orkney: or
** Shetland: sh
* Northern:
** North Northern: nn
** Mid Northern: mn
** South Northern: sn
* Central:
** North East Central: nec
** South East Central: sec
** West Central: wc
** South West Central: swc
* Southern: s
* Ulster:
** Western Ulster: wu
** Central Ulster: cu
** Eastern Ulster: eu
--]]

--[[
TODO:
* Handle unstressed vowels (schwa)
* Move the morpheme splitting into the main evaluation function
* Work on consonant rules
* Consider adding unique dialects based on the word entered
* Consider unique pronunciations for suffixes
--]]

--[[ DATA STRUCTURES
--]]
-- list pronunciations for different vowel spellings, keyed by grapheme
-- each grapheme maps a context rule to the replacement string; a rule key is one of:
--   * {pattern, pos}: pattern describes the neighbouring token, pos says which
--     neighbour is meant: the one before (-1) or the one after (1) the grapheme
--   * {nil, pos}: intended for the case where there is no neighbouring token on
--     that side, i.e. the edge of the token list
--   * false: represents every other condition (the fallback)
-- "ˑ" indicates a vowel affected by the scottish vowel length rule
-- NOTE(review): "~" and "," inside the replacement strings presumably separate
-- alternative pronunciations - confirm the intended meaning of each
-- NOTE: every {pattern, pos} key is a distinct table object, so the rules can only
-- be reached by traversing with pairs(), and the traversal order is unspecified
local s = {
	["a"] = {
		[{"n[gd]?", 1}] = "a~ɑ",
		[{nil, 1}] = "ɑˑ,e",
		[false] = "aˑ"
	},
	["e"] = {
		[false] = "ɛˑ~æˑ"
	},
	["i"] = {
		[{"n?g", 1}] = "əi",
		[{"ch", 1}] = "əi",
		[{"wh", -1}] = "ʌ",
		[false] = "ɪ"
	},
	["o"] = {
		[{nil, -1}] = "wʌˑ",
		[{"ch", 1}] = "ʌu",
		[false] = "ɔˑ"
	},
	["u"] = {
		[false] = "ʌ"
	}
}

-- all possible multi-letter graphemes needed for tokenisation
-- order matters: tokenise() scans this list front to back and takes the first
-- entry that matches at the current position, so the three-letter graphemes
-- are listed before the two-letter ones they overlap with (e.g. "owe" before "ow")
-- rows: split digraphs and "owe", then vowel digraphs, then consonant digraphs
local multigraphs = {
	"a_e", "e_e", "i_e", "o_e", "owe", "u_e", "y_e",
	"aa", "ae", "ai", "au", "aw", "ay", "ea", "ee", "ei", "eu", "ew", "ey", "ie", "oa", "oi", "oo", "ou", "ow", "oy", "ui",
	"ch", "ck", "kn", "ld", "mb", "nd", "ng", "nk", "qu", "sh", "th", "wh", "wr"
}

-- common morphemes
-- each field is a plain array (integer keys only), so membership has to be
-- tested by scanning the list, not by indexing it with the word itself
-- suffixes are tried in the order listed and the first match wins
-- NOTE(review): prefixes is not read by any code visible in this file
local morphemes = {
	unstressed = {"ae", "ane", "dae", "hae", "na", "nae", "sae", "tae", "the"}, -- unstressed particles
	prefixes = {"a"}, -- prefixes
	suffixes = {"fu", "le", "na", "the", "se"} -- suffixes
}

--[[ HELPER FUNCTIONS
--]]
-- resolve every "ˑ" length placeholder according to the scottish vowel length rule
-- returns the word with each placeholder turned into "ː" (long) or removed (short)
local function handle_vowel_length(word)
	-- rewrites are applied in this order, each one as {pattern, replacement}
	local rewrites = {
		{"ˑ([rvzðʒ])", "ː%1"}, -- long before /r/ and voiced fricatives
		{"ˑ$", "ː"}, -- long morpheme-finally
		{"ˑ", ""}, -- short everywhere else
	}

	for _, rewrite in ipairs(rewrites) do
		word = gsub(word, rewrite[1], rewrite[2])
	end

	return word
end

-- handle stress
-- adds a primary stress mark "ˈ" to a word unless it already carries one or
-- is one of the unstressed particles; returns the (possibly unchanged) word
local function handle_stress(word)
	-- an explicit stress marker always wins
	if match(word, "ˈ") then
		return word
	end

	-- unstressed particles stay unmarked; morphemes.unstressed is an array,
	-- so it has to be scanned (indexing it by the word is always nil)
	for _, particle in ipairs(morphemes.unstressed) do
		if word == particle then
			return word
		end
	end

	-- stress after prefix "a-" (a + consonant + vowel)
	if match(word, "^a[^aeiou][aeiou]") then
		return "aˈ" .. sub(word, 2)
	end

	-- otherwise add stress on the first syllable of a morpheme
	return "ˈ" .. word
end

-- detach a known suffix from the end of a word
-- returns the stem and the suffix for the first listed suffix the word ends
-- with, or the unchanged word and nil when none of them applies
local function split_suffixes(word)
	local suffixes = morphemes.suffixes

	for index = 1, #suffixes do
		local candidate = suffixes[index]
		local size = len(candidate)

		-- compare the last `size` characters of the word with the suffix
		if sub(word, -size) == candidate then
			local stem = sub(word, 1, -size - 1)
			return stem, candidate
		end
	end

	-- nothing matched: the whole word is the stem
	return word, nil
end

--[[ MAIN FUNCTIONS
--]]
-- break a word into grapheme tokens
-- returns an array of tokens: multigraphs where they match, single characters
-- otherwise, followed by the detached suffix (if any) as one final token
local function tokenise(word)
	-- peel off a recognised suffix first; only the stem is split into graphemes
	local stem, ending = split_suffixes(word)

	-- move the "e" of vowel + consonant + e next to its vowel (written "_e"),
	-- so that e.g. "a_e" becomes one contiguous multigraph
	stem = gsub(stem, "([^aeiou][aeiouy])([^aeiouwy])e([^aeiou])", "%1_e%2%3")
	stem = gsub(stem, "([^aeiou][aeiouy])([^aeiouwy])e$", "%1_e%2")

	-- first listed multigraph that starts at `pos`, or nil when there is none
	local function multigraph_at(pos)
		for _, candidate in ipairs(multigraphs) do
			if sub(stem, pos, pos + len(candidate) - 1) == candidate then
				return candidate
			end
		end
		return nil
	end

	local tokens = {}
	local pos, total = 1, len(stem)

	-- walk the stem, consuming one grapheme per step
	while pos <= total do
		local grapheme = multigraph_at(pos) or sub(stem, pos, pos)
		tokens[#tokens + 1] = grapheme
		pos = pos + len(grapheme)
	end

	-- the suffix, when present, is kept whole as the last token
	if ending then
		tokens[#tokens + 1] = ending
	end

	return tokens
end

-- process phonemes for tokens
-- takes the array of grapheme tokens of one word and returns the concatenated
-- phoneme string; tokens without an entry in the rule table are copied as is
--
-- each rule key of a grapheme is either {pattern, pos} or false:
--   * pos -1 tests the token before the grapheme, any other pos the token after
--   * a nil pattern matches only when there is no token on that side
--   * a string pattern is matched against the neighbouring token as written
--     (no anchors are added here)
--   * false is the fallback, used only when no contextual rule matched
-- NOTE: contextual rules are still visited in pairs() order, which is
-- unspecified; if two of them match the same token, either one may win
local function to_phonemes(tokens)
	local phonemes = {}

	for i = 1, #tokens do
		local char = tokens[i]
		local rules = char and s[char] -- use s temporarily

		if rules then
			-- determine surrounding context (nil at either edge of the word)
			local before = i > 1 and tokens[i - 1] or nil
			local after = i < #tokens and tokens[i + 1] or nil

			local replacement = nil
			for rule, output in pairs(rules) do
				-- the false key is the fallback and is handled after the loop,
				-- so it can never pre-empt a contextual rule
				if type(rule) == "table" then
					local pattern, pos = rule[1], rule[2]

					-- explicit branch: "pos == -1 and before or after" would
					-- wrongly pick `after` whenever `before` is nil
					local context
					if pos == -1 then
						context = before
					else
						context = after
					end

					local matched
					if pattern == nil then
						-- edge rule: applies only when there is no neighbour
						matched = context == nil
					else
						-- the neighbouring token is the subject, the rule
						-- supplies the pattern
						matched = context ~= nil and match(context, pattern) ~= nil
					end

					if matched then
						replacement = output
						break
					end
				end
			end

			-- no contextual rule applied: use the fallback, if one is defined
			if replacement == nil then
				replacement = rules[false]
			end

			-- keep the grapheme itself when nothing applies at all
			phonemes[#phonemes + 1] = replacement or char
		else
			-- otherwise append char as is
			phonemes[#phonemes + 1] = char or ''
		end
	end

	return table.concat(phonemes)
end

-- generate IPA pronunciation of an entry
-- entry is either the text itself or a table whose args[1] holds the text
-- the text is lowercased and split on single whitespace characters; each piece
-- is tokenised and converted, and the results are joined with single spaces
function export.toIPA(entry)
	local text = entry
	if type(text) == "table" then
		text = text.args[1]
	end

	local pronunciations = {}
	for word in gsplit(lower(text), "%s") do
		-- graphemes first, then phonemes
		pronunciations[#pronunciations + 1] = to_phonemes(tokenise(word))
	end

	return table.concat(pronunciations, " ")
end

-- entry point for displaying the pronunciation
-- accepts either the text itself or a table whose args[1] holds the text,
-- and returns the result of export.toIPA for that text
function export.show(entry)
	local text = entry
	if type(text) == "table" then
		text = text.args[1]
	end

	return export.toIPA(text)
end

return export