Module:User:AmazingJus/sco

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This is a private module sandbox of AmazingJus, for his own experimentation. Items in this module may be added and removed at AmazingJus's discretion; do not rely on this module's stability.


local export = {}

local lang = require("Module:languages").getByCode("sco")
local m_IPA = require("Module:IPA")

local gmatch = mw.ustring.gmatch
local gsplit = mw.text.gsplit
local match = mw.ustring.match
local gsubn = mw.ustring.gsub
local len = mw.ustring.len
local lc = mw.ustring.lower
local sub = mw.ustring.sub

-- single-result wrapper around gsubn(): returns the substituted string only,
-- dropping the substitution count that gsubn() returns as a second value
local function gsub(term, foo, bar, n)
	-- the extra parentheses truncate the call to exactly one return value
	return (gsubn(term, foo, bar, n))
end

--[[ Dialect abbreviations:
* Insular:
** Orkney: or
** Shetland: sh
* Northern:
** North Northern: nn
** Mid Northern: mn
** South Northern: sn
* Central:
** North East Central: nec
** South East Central: sec
** West Central: wc
** South West Central: swc
* Southern: s
* Ulster:
** Western Ulster: wu
** Central Ulster: cu
** Eastern Ulster: eu
--]]

--[[
TODO:
-- * Consider unstressed vowels (schwa)
-- * Place the morpheme splitting in the main evaluation function
-- * Work on consonant rules
-- * Consider adding unique dialects based on the word entered
-- * Consider unique pronunciation for suffixes
--]]

--[[ DATA STRUCTURES
--]]
-- all possible multi-letter graphemes needed for tokenisation
-- NOTE: order matters. tokenise() walks this list with ipairs() and takes the
-- first entry that matches at the current position, so the three-character
-- graphemes are listed before the two-character ones.
local multigraphs = {
	-- three-character graphemes; the "X_e" forms are produced by the
	-- vowel + consonant + e respelling done at the start of tokenise()
	"a_e", "e_e", "i_e", "o_e", "owe", "u_e", "y_e",
	-- two-letter vowel graphemes
	"aa", "ae", "ai", "au", "aw", "ay", "ea", "ee", "ei", "eu", "ew", "ey", "ie", "oa", "oi", "oo", "ou", "ow", "oy", "ui",
	-- two-letter consonant graphemes and clusters
	"ch", "ck", "kn", "ld", "mb", "nd", "ng", "nk", "qu", "sh", "th", "wh", "wr"
}

-- common morphemes
-- each field is a plain list (array) of spellings, not a set: indexing a field
-- by spelling (e.g. morphemes.unstressed["the"]) yields nil, so a membership
-- test has to scan the list
local morphemes = {
	unstressed = {"ae", "ane", "dae", "hae", "na", "nae", "sae", "tae", "the"}, -- unstressed particles
	prefixes = {"a"}, -- prefixes
	suffixes = {"ae", "ie", "fu", "le", "na"} -- suffixes
}

--[[ HELPER FUNCTIONS
--]]

-- resolve every half-long marker (ˑ) according to the scottish vowel length
-- rule: it becomes a long marker (ː) in lengthening environments and is
-- removed everywhere else
local function handle_vowel_length(word)
	-- rewrite rules, applied strictly in this order; the catch-all must be last
	local rules = {
		{"ˑ([rvzðʒ])", "ː%1"}, -- long before /r/ and voiced fricatives
		{"ˑ$", "ː"}, -- long at the very end of the string
		{"ˑ", ""}, -- anything still marked is short
	}

	for _, rule in ipairs(rules) do
		word = gsub(word, rule[1], rule[2])
	end

	return word
end

-- handle stress
-- Adds a primary stress mark (ˈ) to a word that does not already carry one.
--   word: the string to mark
--   returns: the word unchanged if it already contains ˈ or is listed in
--            morphemes.unstressed; otherwise the word with ˈ inserted
local function handle_stress(word)
	-- an explicit stress marker always wins
	if match(word, "ˈ") then
		return word
	end

	-- common unstressed particles stay unmarked; morphemes.unstressed is a
	-- plain list, so it must be scanned (indexing it by the word is always nil,
	-- which previously made this exemption never apply)
	for _, particle in ipairs(morphemes.unstressed) do
		if particle == word then
			return word
		end
	end

	-- stress after prefix "a-" (a + one non-vowel + vowel)
	if match(word, "^a[^aeiou][aeiou]") then
		return "aˈ" .. sub(word, 2)
	end

	-- otherwise add stress on the first syllable of a morpheme
	return "ˈ" .. word
end

--[[ MAIN FUNCTIONS
--]]
-- tokenise word into individual graphemes and affixes
--   word: a single word (string)
--   returns: an array of grapheme strings in order; at each position the first
--            matching entry of `multigraphs` is taken, otherwise one character
local function tokenise(word)
	local tokenised = {}

	-- respell vowel + consonant + e as vowel + _e + consonant for easier parsing
	word = gsub(word, "([^aeiou][aeiouy])([^aeiouwy])e([^aeiou])", "%1_e%2%3")
	word = gsub(word, "([^aeiou][aeiouy])([^aeiouwy])e$", "%1_e%2")

	-- the word no longer changes, so its length can be computed once
	local word_len = len(word)
	local i = 1

	-- loop over entire word
	while i <= word_len do
		local token

		-- look for a multigraph starting at the current position
		for _, multigraph in ipairs(multigraphs) do
			if sub(word, i, i + len(multigraph) - 1) == multigraph then
				token = multigraph
				break
			end
		end

		-- fall back to a single grapheme ONLY when no multigraph matched;
		-- emitting one unconditionally would swallow the character after a
		-- multigraph without checking whether it starts another multigraph
		if not token then
			token = sub(word, i, i)
		end

		table.insert(tokenised, token)
		i = i + len(token)
	end

	return tokenised
end

-- split an entry into words and tokenise each one (first stage of generating
-- the pronunciation)
--   entry: the text to process (string); it is lowercased first
--   returns: an array with one element per word, each element being the array
--            of graphemes returned by tokenise()
local function pron(entry)
	-- make text lowercase
	entry = lc(entry)
	local words = {}

	-- loop over each whitespace-separated word
	for word in gsplit(entry, "%s") do
		-- repeated, leading or trailing whitespace produces empty pieces;
		-- skip them so they do not show up as empty words in the result
		if word ~= "" then
			table.insert(words, tokenise(word))
		end
	end

	return words
end

-- exported entry point
--   word: either the text itself, or a table whose `args[1]` holds the text
--         (presumably a Scribunto frame when called via #invoke; confirm with callers)
--   returns: whatever pron() returns for that text
function export.toIPA(word)
	local text = word

	-- unwrap the first positional argument when given a table
	if type(text) == "table" then
		text = text.args[1]
	end

	-- process pronunciation
	local result = pron(text)
	return result
end

return export