Module:IPA

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

This module is used by the templates {{IPA}} and {{IPAchar}} to format IPA in entries, and it also converts X-SAMPA (an ASCII version of IPA) to IPA for the templates {{x2i}}, {{x2ipa}}, and {{x2ipachar}}. The actual functions called by these templates are found in Module:IPA/templates.

The function format_IPA_full generates the content of the template {{IPA}}. It should also be used by pronunciation modules for specific languages. (It is used, for instance, by Module:ru-pron.) It generates a label IPA (key), followed by a list of IPA transcriptions with the class attribute "IPA" added to them, and a language-specific category (such as Category:English terms with IPA pronunciation).

The function format_IPA_multiple generates the content of the template {{IPAchar}}. It is similar to format_IPA_full, but does not add a label or categories.

Data is in Module:IPA/data, Module:IPA/data/symbols, and Module:IPA/data/X-SAMPA.

Unit tests

See also: Module:IPA/testcases

IPA to X-SAMPA back to IPA

Term IPA Generated X-SAMPA Regenerated IPA Matched?
dictionary /ˈdɪkʃən(ə)ɹi/ /"dIkS@n(@)r\i/ /ˈdɪkʃən(ə)ɹi/ yes
/ˈdɪkʃənɛɹi/ /"dIkS@nEr\i/ /ˈdɪkʃənɛɹi/ yes
Україна (Ukrajina) /ukrɑˈjɪnɑ/ /ukrA"jInA/ /ukrɑˈjɪnɑ/ yes
نوروز [næu̯ˈɾoːz] [n{u_^"4o:z] [næu̯ˈɾoːz] yes
[nou̯ˈɾuːz] [nou_^"4u:z] [nou̯ˈɾuːz] yes
[noːˈɾuːz] [no:"4u:z] [noːˈɾuːz] yes
[næu̯ˈɾɵːz] [n{u_^"48:z] [næu̯ˈɾɵːz] yes
新年 [ɕɪn˥˥niɛn˧˥] [s\In__T__TniEn__M__T] [ɕɪn˥˥niɛn˧˥] yes
battleship [ˈbætl̩ʃɪp] ["b{tl=SIp] [ˈbætl̩ʃɪp] yes
báid [bˠɑːdʲ] [b_GA:d_j] [bˠɑːdʲ] yes
Deutsch [dɔʏ̯t͡ʃ] [dOY_^t__S] [dɔʏ̯t͡ʃ] yes
dóigh [d̪ˠoːɟ] [d_d_Go:J\] [d̪ˠoːɟ] yes
murder [ˈmɝdɚ] ["m3`d@`] [ˈmɝdɚ] yes

local export = {}
-- [[Module:IPA/data]]

local m_data = mw.loadData('Module:IPA/data') -- [[Module:IPA/data]]
local m_symbols = mw.loadData('Module:IPA/data/symbols') -- [[Module:IPA/data/symbols]]
local m_XSAMPA = mw.loadData('Module:IPA/data/X-SAMPA')
local m_syllables = require('Module:syllables') -- [[Module:syllables]]
local m_languages = require('Module:languages')
local m_links = require('Module:links')

local sub = mw.ustring.sub
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local U = mw.ustring.char

-- Builds the full output used by {{IPA}}: the linked "IPA" label, a
-- superscript annotation (a pronunciation-key link, or an error message in its
-- place), the formatted transcriptions and, in the main and Reconstruction
-- namespaces, the language's "terms with IPA pronunciation" category.
--
-- Parameters:
--   lang      - language object (must provide getCode and getCanonicalName),
--               or nil.
--   items     - list of transcription items, passed to format_IPA_multiple.
--   err       - optional error text shown instead of the key link.
--   separator - optional separator, passed to format_IPA_multiple.
--
-- Returns the concatenated wikitext string.
function export.format_IPA_full(lang, items, err, separator)
	local annotation

	if err then
		annotation = '<span class="error">' .. err .. '</span>'
	elseif lang then
		-- Languages listed in langs_with_infopages link to an Appendix page;
		-- all others link to a Wikipedia phonology article.
		local IPA_key
		if m_data.langs_with_infopages[lang:getCode()] then
			IPA_key = "Appendix:" .. lang:getCanonicalName() .. " pronunciation"
		else
			IPA_key = "wikipedia:" .. lang:getCanonicalName() .. " phonology"
		end

		annotation = "[[" .. IPA_key .. "|key]]"
	end

	local prefix = "[[Wiktionary:International Phonetic Alphabet|IPA]]"
	if annotation then
		prefix = prefix .. "<sup>(" .. annotation .. ")</sup>"
	end
	-- With neither a language nor an error there is nothing to annotate, so
	-- the superscript is left out instead of raising an error.
	prefix = prefix .. ":&#32;"

	local formatted = export.format_IPA_multiple(lang, items, separator)

	local namespace = mw.title.getCurrentTitle().nsText
	local category = ""
	if lang and (namespace == "" or namespace == "Reconstruction") then
		category = "[[Category:" .. lang:getCanonicalName() .. " terms with IPA pronunciation]]"
	end

	return prefix .. formatted .. category
end

-- Classifies a transcription by the marks that enclose it.
--
-- A leading asterisk (used on some Reconstruction pages) is ignored for the
-- classification and reported through the second return value.
--
-- Returns:
--   1. 'phonemic' for /.../, 'phonetic' for [...], 'orthographic' for ⟨...⟩,
--      'rhyme' for a leading "-", or nil when the marks are not recognised or
--      the closing mark does not pair with the opening one.
--   2. true if the transcription started with "*", otherwise nil.
local function determine_repr(pron)
	local reconstructed
	if find(pron, "^%*") then
		reconstructed = true
		pron = sub(pron, 2)
	end

	-- Opening mark -> type of representation.
	local kind_of = {
		['/'] = 'phonemic',
		['['] = 'phonetic',
		['⟨'] = 'orthographic',
		['-'] = 'rhyme',
	}
	-- Opening mark -> required closing mark; a rhyme has no closing mark.
	local closing_for = {
		['/'] = '/',
		['['] = ']',
		['⟨'] = '⟩',
	}

	-- First and last character; both are nil for strings shorter than two
	-- characters, because the pattern needs two distinct positions.
	local _, _, opening, closing = find(pron, '^(.).-(.)$')

	local kind = kind_of[opening]
	if not kind then
		return nil, reconstructed
	end

	local expected = closing_for[opening]
	if expected and closing ~= expected then
		return nil, reconstructed
	end

	return kind, reconstructed
end

-- Returns true if the transcription contains a syllable break (".")
-- immediately followed by a primary or secondary stress mark, false otherwise.
local function hasInvalidSeparators(transcription)
	return find(transcription, "%.[ˈˌ]") ~= nil
end

-- Formats a list of transcriptions (the content of {{IPAchar}}) and joins
-- them with a separator, followed by any maintenance categories.
--
-- Parameters:
--   lang      - language object or nil; enables the syllable-count and
--               English separator categories.
--   items     - list of tables with a `pron` string and optional `qualifiers`
--               (list) and `note` (reference text) fields. The table is not
--               modified.
--   separator - string placed between transcriptions; defaults to ", ".
--
-- Returns the concatenated wikitext string.
function export.format_IPA_multiple(lang, items, separator)
	local categories = {}
	separator = separator or ', '

	-- The current title does not change during the call, so fetch it once.
	local title = mw.title.getCurrentTitle()

	if #items == 0 then
		if title.nsText == "Template" then
			-- Show a sample transcription on template pages. A fresh list is
			-- used so that the caller's table is left untouched.
			items = { { pron = "/aɪ piː ˈeɪ/" } }
		else
			table.insert(categories, "[[Category:Pronunciation templates without a pronunciation]]")
		end
	end

	local in_mainspace = title.namespace == 0
	local lang_code = lang and lang:getCode()
	local bits = {}

	for _, item in ipairs(items) do
		local bit = export.format_IPA(lang, item.pron)

		if item.qualifiers and #item.qualifiers > 0 then
			bit = require("Module:qualifier").format_qualifier(item.qualifiers) .. " " .. bit
		end

		if item.note then
			bit = bit .. mw.getCurrentFrame():extensionTag("ref", item.note)
		end

		table.insert(bits, bit)

		--[=[	[[Special:WhatLinksHere/Template:tracking/IPA/syntax-error]]
				The length or gemination symbol should not appear after a syllable break or stress symbol.	]=]
		if find(item.pron, "[ˈˌ%.][ːˑ]") then
			require("Module:debug").track("IPA/syntax-error")
		end

		if lang then
			-- Add syllable count if the language's diphthongs are listed in [[Module:syllables]].
			-- Only phonemic transcriptions are counted, except for Russian ("ru"),
			-- where every transcription is counted.
			if in_mainspace
				and m_syllables.hasDiphthongs(lang)
				and (determine_repr(item.pron) == "phonemic" or lang_code == "ru")
			then
				local syllable_count = m_syllables.getVowels(item.pron, lang)
				if syllable_count then
					table.insert(categories, "[[Category:" .. lang:getCanonicalName() .. " " .. syllable_count .. "-syllable words]]")
				end
			end

			if lang_code == "en" and hasInvalidSeparators(item.pron) then
				table.insert(categories, "[[Category:IPA for English using .ˈ or .ˌ]]")
			end
		end
	end

	return table.concat(bits, separator) .. table.concat(categories)
end

-- Takes an IPA pronunciation and formats it and adds cleanup categories.
--
-- Parameters:
--   lang         - language object or nil. When given and m_data.phonemes has
--                  an entry for its code, phonemic and rhyme transcriptions
--                  are checked against that phoneme list.
--   pron         - the transcription including its representation marks
--                  (/.../, [...], ⟨...⟩ or a leading "-"), optionally preceded
--                  by "*".
--   split_output - if truthy, the pieces are returned separately.
--
-- Returns:
--   without split_output: one string (IPA span, then the preview-only error
--     span, then categories);
--   with split_output: three strings (IPA span, categories, error span).
function export.format_IPA(lang, pron, split_output)
	local err = {}
	local categories = {}
	local len = mw.ustring.len

	-- Detect whether this is a phonemic or phonetic transcription
	local repr, reconstructed = determine_repr(pron)

	if reconstructed then
		pron = sub(pron, 2)
	end

	-- If valid, strip the representation marks
	if repr == "phonemic" or repr == "phonetic" or repr == "orthographic" then
		pron = sub(pron, 2, -2)
	elseif repr == "rhyme" then
		-- A rhyme only has a leading mark.
		pron = sub(pron, 2)
	else
		table.insert(categories, "[[Category:IPA pronunciations with invalid representation marks]]")
		-- table.insert(err, "invalid representation marks")
		-- Removed because it's annoying when previewing pronunciation pages.
	end

	-- Check for obsolete and nonstandard symbols.
	-- Only the first pattern that matches is reported.
	for _, symbol in ipairs(m_data.nonstandard) do
		local found = {}
		for nonstandard in gmatch(pron, symbol) do
			table.insert(found, nonstandard)
			table.insert(categories, "[[Category:IPA pronunciations with obsolete or nonstandard characters|" .. nonstandard .. "]]")
		end

		if #found > 0 then
			table.insert(err, "obsolete or nonstandard characters (" .. table.concat(found) .. ")")
			break
		end
	end

	--[[ Check for invalid symbols after removing the following:
			1. wikilinks
			2. paired HTML tags
			3. bolding
			4. italics
			5. HTML entity for space
			6. asterisk at beginning of transcription
			7. comma followed by spacing characters
			8. superscripts enclosed in superscript parentheses		]]
	-- NOTE(review): the wikilink pattern is greedy, so with several links in
	-- one transcription only the outermost pair of brackets is removed.
	local leftover = gsub(pron, '%[%[(.*)%]%]', '%1')
	leftover = gsub(leftover, "<(%l+)[^>]*>([^<]+)</%1>", "%2")
	leftover = gsub(leftover, "'''([^']*)'''", "%1")
	leftover = gsub(leftover, "''([^']*)''", "%1")
	leftover = gsub(leftover, "&#32;", "")
	leftover = gsub(leftover, "^%*", "")
	leftover = gsub(leftover, ",%s+", "")
	leftover = gsub(leftover, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
	leftover = gsub(leftover, '[' .. m_symbols.valid .. ']', '')

	-- Whatever survives the removals above is treated as invalid.
	if leftover ~= '' then
		local suggestions = {}
		mw.log(pron, leftover)
		local namespace = mw.title.getCurrentTitle().namespace
		local category
		if namespace == 0 then
			-- main namespace
			category = "IPA pronunciations with invalid IPA characters"
		elseif namespace == 118 then
			-- reconstruction namespace
			category = "IPA pronunciations with invalid IPA characters/reconstruction"
		else
			category = "IPA pronunciations with invalid IPA characters/non_mainspace"
		end
		for character in gmatch(leftover, ".") do
			local suggestion = m_symbols.suggestions[character]
			if suggestion then
				table.insert(suggestions, character .. " with " .. suggestion)
			end
			table.insert(categories, "[[Category:" .. category .. "|" .. character .. "]]")
		end
		table.insert(err, "invalid IPA characters (" .. leftover .. ")")
		if #suggestions > 0 then
			table.insert(err, "replace " .. table.concat(suggestions, ", "))
		end
	end

	-- Reference inside IPA template usage
	-- FIXME: Doesn't work; you can't put HTML in module output.
	--if mw.ustring.find(pron, '</ref>') then
	--	table.insert(categories, "[[Category:IPA pronunciations with reference]]")
	--end

	if repr == "phonemic" or repr == "rhyme" then
		local valid_phonemes = lang and m_data.phonemes[lang:getCode()]
		if valid_phonemes then
			local rest = pron
			local phonemes = {}

			-- Consume the transcription from the left, always taking the
			-- longest listed phoneme that matches at the current position.
			while len(rest) > 0 do
				local longestmatch = ""
				local first = sub(rest, 1, 1)

				if first == "(" or first == ")" then
					-- Parentheses are always accepted.
					longestmatch = first
				else
					local longest_len = 0
					for _, phoneme in ipairs(valid_phonemes) do
						local phoneme_len = len(phoneme)
						if phoneme_len > longest_len and sub(rest, 1, phoneme_len) == phoneme then
							longestmatch = phoneme
							longest_len = phoneme_len
						end
					end
				end

				if len(longestmatch) > 0 then
					table.insert(phonemes, longestmatch)
					rest = sub(rest, len(longestmatch) + 1)
				else
					-- No listed phoneme starts here: highlight the character,
					-- categorise and track it, then continue after it.
					table.insert(phonemes, "<span style=\"color: red\">" .. first .. "</span>")
					rest = sub(rest, 2)
					table.insert(categories, "[[Category:IPA pronunciations with invalid phonemes/" .. lang:getCode() .. "]]")
					require("Module:debug").track("IPA/invalid phonemes/" .. first)
				end
			end

			pron = table.concat(phonemes)
		end

		-- Restore the representation marks that were stripped above.
		if repr == "phonemic" then
			pron = "/" .. pron .. "/"
		else
			pron = "-" .. pron
		end
	elseif repr == "phonetic" then
		pron = "[" .. pron .. "]"
	elseif repr == "orthographic" then
		pron = "⟨" .. pron .. "⟩"
	end

	if reconstructed then
		pron = "*" .. pron
	end

	local err_text = ""
	if #err > 0 then
		-- The style attribute needs its closing quote, otherwise the rest of
		-- the tag is swallowed into the attribute value.
		err_text = ' <span class="previewonly error" style="font-size: small;">' .. table.concat(err, ', ') .. '</span>'
	end

	local formatted = '<span class="IPA" lang="">' .. pron .. '</span>'

	if split_output then -- for use of IPA in links
		return formatted, table.concat(categories), err_text
	else
		return formatted .. err_text .. table.concat(categories)
	end
end

-- IPA <-> XSAMPA lookup tables
-- i2x_lookup maps an IPA symbol (and its with_descender variant, if any) to
-- the X-SAMPA symbol it is listed under in m_XSAMPA. It is filled on first use.
local i2x_lookup = {}

-- Fills i2x_lookup from m_XSAMPA if it is still empty and returns it.
local function Populate_IPA_XSAMPA_LookupTables()
	-- The table is keyed by strings, so the length operator would always
	-- report 0 and the table would be rebuilt on every call; next() tests
	-- emptiness for any kind of key.
	if next(i2x_lookup) == nil then
		for XSAMPA_symbol, data in pairs(m_XSAMPA) do
			local IPA_symbol = data[1]
			i2x_lookup[IPA_symbol] = XSAMPA_symbol

			local with_descender = data.with_descender
			if with_descender then
				i2x_lookup[with_descender] = XSAMPA_symbol
			end
		end
	end
	return i2x_lookup
end


-- Converts IPA text to X-SAMPA, one character at a time, using the lookup
-- table built from m_XSAMPA. Characters without an entry are left unchanged.
--
-- `text` is either a string or a frame. For a frame, the first argument is
-- used: {{=}} and {{!}} are replaced by "=" and "|", HTML entities are
-- decoded, and the result is passed through mw.text.nowiki before returning.
function export.IPA_to_XSAMPA(text)
	local lookup = Populate_IPA_XSAMPA_LookupTables()

	local from_frame = type(text) == 'table'
	if from_frame then -- a frame, extract args
		local arg = text.args[1]
		arg = arg:gsub('{{=}}','=')
		arg = arg:gsub('{{!}}','|')
		text = mw.text.decode(arg) -- XXX
	end

	-- A doubled length mark is collapsed before the per-character lookup.
	text = gsub(text, 'ːː', ':') -- this basically sums up m_symbols[2].XSAMPA
	text = gsub(text, '.', lookup)

	if from_frame then
		text = mw.text.nowiki(text)
	end

	return text
end

-- Converts X-SAMPA text to IPA using the data in m_XSAMPA.
--
-- `text` is either a string or a frame; for a frame, the first argument is
-- used after decoding HTML entities. Text wrapped in <...> is converted
-- without the angle brackets and returned wrapped in ⟨...⟩.
--
-- At each position the longest X-SAMPA symbol (up to four characters) that
-- has an entry is used; a single character without an entry is copied as is.
-- The result is not passed through mw.text.nowiki.
function export.XSAMPA_to_IPA(text)
	local data = m_XSAMPA

	if type(text) == 'table' then -- a frame, extract args
		text = text.args[1]
		text = mw.text.decode(text) -- XXX
	end

	-- Simpler function adapted from [[w:Module:Sandbox/Erutuon/X-SAMPA]]
	-- `characteristics` runs parallel to `output` and records, for each
	-- emitted symbol, whether it has a descender and whether it is a diacritic.
	local output, characteristics = {}, {}
	local angle_bracket
	if sub(text, 1, 1) == "<" and sub(text, -1) == ">" then
		table.insert(output, "⟨")
		angle_bracket = "⟩"
		text = sub(text, 2, -2)
	end

	while #text > 0 do
		-- Candidates, longest first.
		local substrings = {
			sub(text, 1, 4),
			sub(text, 1, 3),
			sub(text, 1, 2),
			sub(text, 1, 1)
		}

		for i, substring in ipairs(substrings) do
			local IPA, has_descender, is_diacritic
			local result = data[substring]

			if result then
				IPA = result[1]
				has_descender = result.has_descender
				-- Must be stored in the local: it is recorded in
				-- `characteristics` below and read by the look-back loop.
				is_diacritic = result.is_diacritic

				local with_descender = result.with_descender
				if with_descender then
					-- Go backwards through the transcription, skipping any diacritics.
					local index = #characteristics
					while index > 0 and characteristics[index].is_diacritic do
						index = index - 1
					end
					--[[	Look at the first non-diacritic symbol before the current symbol.
							If it has a descender, use the descender form of the current symbol.
							There may be no such symbol (start of the text). ]]
					local base = characteristics[index]
					if base and base.has_descender then
						IPA = with_descender
					end
				end
			elseif not substrings[i + 1] then
				-- Shortest candidate and still no entry: keep the character.
				IPA = substring
			end

			if IPA then
				-- Candidate i is up to 5 - i characters long, so the rest
				-- starts at position 6 - i.
				text = sub(text, 6 - i)
				table.insert(output, IPA)
				table.insert(characteristics, { has_descender = has_descender, is_diacritic = is_diacritic } )
				break
			end
		end
	end

	if angle_bracket then
		table.insert(output, angle_bracket)
	end

	-- Escaping of frame input is switched off:
	--		output = mw.text.nowiki(output)
	return table.concat(output)
end

-- Builds a wikitable that round-trips example transcriptions through
-- IPA_to_XSAMPA and XSAMPA_to_IPA and reports whether the regenerated IPA
-- equals the original.
--
-- frame.args[1] is a comma-separated list of examples. Each example contains
-- an IPA transcription in /.../ or [...] and may start with "code:word "
-- (a two- or three-letter language code), in which case the word is linked.
--
-- Raises an error for an unknown language code or an example without a
-- transcription.
function export.example(frame)
	local output = {}

	table.insert(
		output,
[[
{| class="wikitable"
! Term !! IPA !! Generated X-SAMPA !! Regenerated IPA !! Matched?
]]
	)
	-- Each word in this template is replaced by the field of the same name.
	local row_template =
[[
|-
| link || IPA || XSAMPA || regenerated_IPA || matched
]]

	local examples = mw.text.split(frame.args[1], ",%s*")

	-- ipairs keeps the rows in the order in which the examples were given.
	for _, example in ipairs(examples) do
		local lang, word = match(example, "(%l%l%l?):(.+) [/%[]")

		if lang then
			lang = m_languages.getByCode(lang) or error('"' .. lang .. '" is not a valid language code.')
		end

		local IPA = match(example, "/[^/]+/")
			or match(example, "%[[^%]]+%]")
			or error('No IPA transcription found in "' .. example .. '".')
		local XSAMPA = export.IPA_to_XSAMPA(IPA)
		local regenerated_IPA = export.XSAMPA_to_IPA(XSAMPA)

		local content = {
			link = lang and word and m_links.full_link{ term = word, lang = lang },
			matched = IPA == regenerated_IPA
				and '<span style="color: green;">yes</span>'
				or '<span style="color: red;">no</span>',
			IPA = '<span class="IPA">' .. IPA .. '</span>',
			XSAMPA = '<code>' .. XSAMPA .. '</code>',
			regenerated_IPA = '<span class="IPA">' .. regenerated_IPA .. '</span>'
		}

		-- A function is used instead of the table itself so that a missing
		-- field (no link) becomes an empty cell, not the placeholder word.
		local function add_content(item)
			return content[item] or ""
		end
		-- gsub also returns a match count; the assignment keeps only the string.
		local row = gsub(row_template, "[%a_]+", add_content)
		table.insert(output, row)
	end

	table.insert(output, "|}")

	return table.concat(output)
end

return export