Module:za-sortkey

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module will sort Zhuang language text. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{sortkey}}. Within a module, use Module:languages#Language:makeSortKey.

For testcases, see Module:za-sortkey/testcases.

Functions

makeSortKey(text, lang, sc)
Generates a sortkey for a given piece of text written in the script specified by the code sc, and language specified by the code lang.
When the sort fails, returns nil.
Alphabetic order
a ae (ə) b by c d e f g gv gy h i k l m mb (ƃ) my n nd (ƌ) ng (ŋ) ngv (ŋv) ny o oe (ɵ) p r s t u v w (ɯ) y
A A₂ (A₂ₐ) B B₂ C D E F G G₂ G₃ H I K L M M₂ (M₂ₐ) M₃ N N₂ (N₂ₐ) N₃ (N₃ₐ) N₄ (N₄ₐ) N₅ O O₂ (O₂ₐ) P R S T U V W (Wₐ) Y

Note: letters from the old orthography (in brackets) are sorted immediately after their new equivalents.

Order of tones
z (ƨ) j (з) x (ч) q (ƽ) h (ƅ)
² (²ᵃ) ³ (³ᵃ) ⁴ (⁴ᵃ) ⁵ (⁵ᵃ) ⁶ (⁶ᵃ)

Note: "h" will sort as H if used as a consonant, or as ⁶ if used as a tone letter.

If a syllable has no tone letter but ends with a consonant, then the following tone values are used:

m n ng (ŋ) k p t b d g
M¹ N¹ N₃¹ (N₃ₐ¹) K⁷ P⁷ T⁷ B⁸ D⁸ G⁸

If new_bor=y or new_bor=1 is detected as a parameter of {{za-pron}} on the page, then tone 5 is substituted for tone 1 in the sortkey. If {{za-1957 spelling of}} or {{za-1957 orthography of}} is detected on the page, then the page for the new orthography is checked for new_bor=y or new_bor=1 as well.

Examples

  • N₂A₂³ DAN₃³ VUEN₃²DA₂⁵ SIEN₃¹ DAN₃³ SIEN¹
ndaej dangj vuengzdaeq sieng dangj sien
  • LWG⁸FWN₃²G₃AN₃¹
lwgfwngzgyang
  • LAN₃⁶ BIT⁷ RO₂N₃² RA₂M⁴
langh bit roengz raemx
  • FAN₃²CWN₃²GAN₃³ᵃ
Faŋƨcɯŋƨgaŋз
  • GIEN²N₂AN₃¹CAN₃⁵
gienzndangcangq

Tone 5 substitution:

  • GUN₃⁵CAN³DAN₃³
gungcanjdangj
  • GUN₃⁵CAN³DAN₃³ᵃ
guŋcanзdaŋз
(due to the new_bor=1 parameter on gungcanjdangj)


  • bya (B₂A¹)
  • byaz (B₂A²)
  • byaƨ (B₂A²ᵃ)
  • byaj (B₂A³)
  • byaз (B₂A³ᵃ)
  • byax (B₂A⁴)
  • byaч (B₂A⁴ᵃ)
  • byaq (B₂A⁵)
  • byaƽ (B₂A⁵ᵃ)
  • byah (B₂A⁶)
  • byaƅ (B₂A⁶ᵃ)
  • byab (B₂AB⁸)
  • byad (B₂AD⁸)
  • byag (B₂AG⁸)
  • byak (B₂AK⁷)
  • byam (B₂AM¹)
  • byan (B₂AN¹)
  • byang (B₂AN₃¹)
  • byaŋ (B₂AN₃¹!)
  • byap (B₂AP⁷)
  • byat (B₂AT⁷)


  • a'a (A¹A¹)
  • aba (A¹BA¹)
  • a'ba (A¹BA¹)
  • a'da (A¹DA¹)
  • ada (A¹DA¹)
  • a'ga (A¹GA¹)
  • aga (A¹GA¹)
  • a'ha (A¹HA¹)
  • aha (A¹HA¹)
  • aka (A¹KA¹)
  • a'ma (A¹MA¹)
  • ama (A¹MA¹)
  • a'na (A¹NA¹)
  • ana (A¹NA¹)
  • anga (A¹N₃A¹)
  • apa (A¹PA¹)
  • ata (A¹TA¹)
  • aza (A²A¹)
  • aƨa (A²A¹!)
  • aja (A³A¹)
  • aзa (A³A¹!)
  • axa (A⁴A¹)
  • aчa (A⁴A¹!)
  • aqa (A⁵A¹)
  • aƽa (A⁵A¹!)
  • ah'a (A⁶A¹)
  • aƅa (A⁶A¹!)
  • abza (AB²A¹)
  • abƨa (AB²A¹!)
  • abja (AB³A¹)
  • abзa (AB³A¹!)
  • abxa (AB⁴A¹)
  • abчa (AB⁴A¹!)
  • abqa (AB⁵A¹)
  • abƽa (AB⁵A¹!)
  • abh'a (AB⁶A¹)
  • abƅa (AB⁶A¹!)
  • ab'a (AB⁸A¹)
  • ab'ha (AB⁸HA¹)
  • abha (AB⁸HA¹)
  • adza (AD²A¹)
  • adƨa (AD²A¹!)
  • adja (AD³A¹)
  • adзa (AD³A¹!)
  • adxa (AD⁴A¹)
  • adчa (AD⁴A¹!)
  • adqa (AD⁵A¹)
  • adƽa (AD⁵A¹!)
  • adh'a (AD⁶A¹)
  • adƅa (AD⁶A¹!)
  • ad'a (AD⁸A¹)
  • ad'ha (AD⁸HA¹)
  • adha (AD⁸HA¹)
  • agza (AG²A¹)
  • agƨa (AG²A¹!)
  • agja (AG³A¹)
  • agзa (AG³A¹!)
  • agxa (AG⁴A¹)
  • agчa (AG⁴A¹!)
  • agqa (AG⁵A¹)
  • agƽa (AG⁵A¹!)
  • agƅ (AG⁶ᵃ)
  • agh'a (AG⁶A¹)
  • ag'a (AG⁸A¹)
  • ag'ha (AG⁸HA¹)
  • agha (AG⁸HA¹)
  • akza (AK²A¹)
  • akƨa (AK²A¹!)
  • akja (AK³A¹)
  • akзa (AK³A¹!)
  • akxa (AK⁴A¹)
  • akчa (AK⁴A¹!)
  • akqa (AK⁵A¹)
  • akƽa (AK⁵A¹!)
  • akh'a (AK⁶A¹)
  • akƅa (AK⁶A¹!)
  • akha (AK⁷HA¹)
  • ak'ha (AK⁷HA¹)
  • am'a (AM¹A¹)
  • am'ha (AM¹HA¹)
  • amha (AM¹HA¹)
  • amza (AM²A¹)
  • amƨa (AM²A¹!)
  • amja (AM³A¹)
  • amзa (AM³A¹!)
  • amxa (AM⁴A¹)
  • amчa (AM⁴A¹!)
  • amqa (AM⁵A¹)
  • amƽa (AM⁵A¹!)
  • amh'a (AM⁶A¹)
  • amƅa (AM⁶A¹!)
  • an'a (AN¹A¹)
  • an'ga (AN¹GA¹)
  • an'ha (AN¹HA¹)
  • anha (AN¹HA¹)
  • anza (AN²A¹)
  • anƨa (AN²A¹!)
  • anja (AN³A¹)
  • anзa (AN³A¹!)
  • anxa (AN⁴A¹)
  • anчa (AN⁴A¹!)
  • anqa (AN⁵A¹)
  • anƽa (AN⁵A¹!)
  • anh'a (AN⁶A¹)
  • anƅa (AN⁶A¹!)
  • aŋ (AN₃¹!)
  • ang'a (AN₃¹A¹)
  • ang'ha (AN₃¹HA¹)
  • angha (AN₃¹HA¹)
  • angza (AN₃²A¹)
  • aŋƨa (AN₃²A¹!)
  • angja (AN₃³A¹)
  • aŋзa (AN₃³A¹!)
  • angxa (AN₃⁴A¹)
  • aŋчa (AN₃⁴A¹!)
  • angqa (AN₃⁵A¹)
  • aŋƽa (AN₃⁵A¹!)
  • angh'a (AN₃⁶A¹)
  • aŋƅa (AN₃⁶A¹!)
  • apza (AP²A¹)
  • apƨa (AP²A¹!)
  • apja (AP³A¹)
  • apзa (AP³A¹!)
  • apxa (AP⁴A¹)
  • apчa (AP⁴A¹!)
  • apqa (AP⁵A¹)
  • apƽa (AP⁵A¹!)
  • aph'a (AP⁶A¹)
  • apƅa (AP⁶A¹!)
  • ap'ha (AP⁷HA¹)
  • apha (AP⁷HA¹)
  • atza (AT²A¹)
  • atƨa (AT²A¹!)
  • atja (AT³A¹)
  • atзa (AT³A¹!)
  • atxa (AT⁴A¹)
  • atчa (AT⁴A¹!)
  • atqa (AT⁵A¹)
  • atƽa (AT⁵A¹!)
  • ath'a (AT⁶A¹)
  • atƅa (AT⁶A¹!)
  • at'ha (AT⁷HA¹)
  • atha (AT⁷HA¹)

local export = {}
local u = require("Module:string/char")

-- Lua byte pattern for one UTF-8 encoded character: an ASCII or lead byte
-- followed by any number of continuation bytes.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"

-- Placeholder characters built from code points U+F000..U+F003. They are
-- appended to a base letter in the final sort key (see oneCharFinal and
-- twoCharsFinal) to distinguish letters that share that base letter.
-- Only four values are produced, so only four names are declared.
local a, b, c, d = u(0xF000), u(0xF001), u(0xF002), u(0xF003)

-- Single-character stand-ins for consonant clusters, so that later patterns
-- can treat each cluster as one character.
local b2 = u(0xF100)
local g2, g3 = u(0xF200), u(0xF201)
local m2, m4 = u(0xF300), u(0xF301)
local n2, n4, n6, n7, n8 = u(0xF400), u(0xF401), u(0xF402), u(0xF403), u(0xF404)

-- Characters stripped from the decomposed sort key before it is returned.
local remove_diacritics = "'" -- apostrophe

-- Unconditional tone letters -> tone number; "!" marks an old-orthography letter.
local oneCharInit = {
	["z"] = "2",
	["ƨ"] = "2!",
	["j"] = "3",
	["з"] = "3!",
	["x"] = "4",
	["ч"] = "4!",
	["q"] = "5",
	["ƽ"] = "5!",
	["ƅ"] = "6!"
}

-- Two-letter clusters -> single-character stand-in.
local twoCharsInit = {
	["by"] = b2,
	["gv"] = g2,
	["gy"] = g3,
	["mb"] = m2,
	["my"] = m4,
	["nd"] = n2,
	["ng"] = n4,
	["ŋv"] = n7,
	["ny"] = n8
}

-- Three-letter clusters -> single-character stand-in (applied before twoCharsInit).
local threeCharsInit = {
	["ngv"] = n6
}

-- Letters that are a tone letter only in certain positions ("h" may also be a consonant).
local conditionalTones1 = {
	["h"] = "6"
}

-- Syllable-final consonants that imply a tone number when no tone letter follows.
local conditionalTones2 = {
	["m"] = "m1",
	["n"] = "n1",
	[n4] = n4 .. "1",
	["ŋ"] = "ŋ1",
	["k"] = "k7",
	["p"] = "p7",
	["t"] = "t7",
	["b"] = "b8",
	["d"] = "d8",
	["g"] = "g8"
}

-- Stand-ins and old-orthography letters -> base letter plus placeholder,
-- with "!" appended for old-orthography forms.
local oneCharFinal = {
	["ə"] = "a" .. a .. "!",
	[b2] = "b" .. a,
	[g2] = "g" .. a,
	[g3] = "g" .. b,
	[m2] = "m" .. a,
	["ƃ"] = "m" .. a .. "!",
	[m4] = "m" .. b,
	[n2] = "n" .. a,
	["ƌ"] = "n" .. a .. "!",
	[n4] = "n" .. b,
	["ŋ"] = "n" .. b .. "!",
	[n6] = "n" .. c,
	[n7] = "n" .. c .. "!",
	[n8] = "n" .. d,
	["ɵ"] = "o" .. a .. "!",
	["ɯ"] = "w!"
}

-- Vowel digraphs -> base letter plus placeholder.
local twoCharsFinal = {
	["ae"] = "a" .. a,
	["oe"] = "o" .. a
}

-- Lua-pattern forms of the template names that mark a page as an
-- old-orthography spelling; checked in this order.
local oldOrthographyTemplates = { "za%-1957 spelling of", "za%-1957 orthography of" }

-- Returns the content of the page named `pageName`, or "" when the name is
-- missing or empty, is not a valid title (mw.title.new returns nil), or the
-- page has no content.
local function getPageContent(pageName)
	if not pageName or pageName == "" then
		return ""
	end
	local title = mw.title.new(pageName)
	if not title then
		return ""
	end
	return title:getContent() or ""
end

-- Reports whether `content` contains "{{za-pron|" followed anywhere later by
-- "new_bor=1}}" or "new_bor=y}}".
local function hasNewBor(content)
	return mw.ustring.match(content, "{{za%-pron|.*new_bor=1}}") ~= nil
		or mw.ustring.match(content, "{{za%-pron|.*new_bor=y}}") ~= nil
end

-- Builds the sort key for `text`.
-- `text` is also used as a page title to look up the tone-5 substitution.
-- `lang` and `sc` are accepted for interface compatibility and are not read.
-- Returns the upper-cased sort key string.
function export.makeSortKey(text, lang, sc)
	local origText = text
	text = mw.ustring.lower(text)

	-- convert any consonant clusters to single characters, which is necessary for later regexes, and unconditional tone letters to numbers
	for from, to in pairs(threeCharsInit) do
		text = text:gsub(from, to)
	end

	for from, to in pairs(twoCharsInit) do
		text = text:gsub(from, to)
	end

	text = text:gsub(UTF8_char, oneCharInit)

	-- conditionally convert any conditional tone letters to numbers (e.g. "h" can be a consonant or a tone letter)
	for from, to in pairs(conditionalTones1) do
		text = text:gsub(from .. "$", to)
		text = mw.ustring.gsub(text, from .. "([^1-8aeiouwəɵɯ])", to .. "%1")
	end

	-- conditionally add a tone number to any syllable-final consonants which do not have them
	for from, to in pairs(conditionalTones2) do
		text = text:gsub(from .. "$", to)
		text = mw.ustring.gsub(text, from .. "([^1-8aeiouwəɵɯ])", to .. "%1")
	end

	-- conditionally add a tone number to any syllable-final vowels which do not have them
	text = mw.ustring.gsub(text, "([^1-8%s%p])$", "%11")
	text = mw.ustring.gsub(text, "([1-8][" .. a .. "-" .. d .. "])1$", "%1")
	text = mw.ustring.gsub(text, "([aeiouwəɵɯ])([^1-8aeiouwəɵɯ][^1-8])", "%11%2")

	-- convert clusters and non-ASCII characters to final form, to achieve correct order
	for from, to in pairs(twoCharsFinal) do
		text = text:gsub(from, to)
	end

	text = text:gsub(UTF8_char, oneCharFinal)

	-- move "!" to the end and remove any duplicates, to ensure old orthography terms are sorted immediately after their new equivalents
	-- (one pass per "!" present; each pass moves the first "!" that has text after it to the end)
	for _ in text:gmatch("!") do
		text = text:gsub("(!)(.+)", "%2%1")
	end
	text = text:gsub("!+", "!")

	-- if tone 5 is substituted for tone 1 in pronunciation, also substitute in sortkey (i.e. as though "q" were written)
	local page = getPageContent(origText)
	if hasNewBor(page) then
		text = mw.ustring.gsub(text, "1", "5")
	else
		-- if the page has an old orthography template, then check the modern orthography page and substitute if present there (i.e. as though "ƽ" were written)
		for _, template in ipairs(oldOrthographyTemplates) do
			local target = mw.ustring.match(page, "{{" .. template .. "|(.-)}}")
			if target then
				if hasNewBor(getPageContent(target)) then
					text = mw.ustring.gsub(text, "1", "5" .. a)
				end
				-- only the first template found is consulted
				break
			end
		end
	end

	-- decompose, remove appropriate diacritics, then recompose again
	local decomposed = mw.ustring.toNFD(text)
	local stripped = mw.ustring.gsub(decomposed, "[" .. remove_diacritics .. "]", "")
	return mw.ustring.upper(mw.ustring.toNFC(stripped))
end

-- Language object for the code "za", used by the display helpers in this module.
local za = require("Module:languages").getByCode("za")

-- Passes `text` and the "za" language object to tag_text in
-- Module:script utilities and returns its result.
local function tag(text)
	local script_utilities = require("Module:script utilities")
	return script_utilities.tag_text(text, za)
end

-- Display substitutions, second pass: bare tone digits -> superscript digits.
local showsubst1 = {
	["0"] = "⁰",
	["1"] = "¹",
	["2"] = "²",
	["3"] = "³",
	["4"] = "⁴",
	["5"] = "⁵",
	["6"] = "⁶",
	["7"] = "⁷",
	["8"] = "⁸"
}

-- Display substitutions, first pass: old-orthography tone markers and
-- letter-plus-placeholder pairs -> readable superscript/subscript forms.
local showsubst2 = {
	["2!"] = "²ᵃ",
	["3!"] = "³ᵃ",
	["4!"] = "⁴ᵃ",
	["5!"] = "⁵ᵃ",
	["6!"] = "⁶ᵃ",
	["A" .. a] = "A₂",
	["A" .. a .. "!"] = "A₂ₐ",
	["B" .. a] = "B₂",
	["G" .. a] = "G₂",
	["G" .. b] = "G₃",
	["M" .. a] = "M₂",
	["M" .. a .. "!"] = "M₂ₐ",
	["M" .. b] = "M₃",
	["N" .. a] = "N₂",
	["N" .. a .. "!"] = "N₂ₐ",
	["N" .. b] = "N₃",
	["N" .. b .. "!"] = "N₃ₐ",
	["N" .. c] = "N₄",
	["N" .. c .. "!"] = "N₄ₐ",
	["N" .. d] = "N₅",
	["O" .. a] = "O₂",
	["O" .. a .. "!"] = "O₂ₐ",
	["W!"] = "Wₐ"
}

-- For each positional argument in `frame.args`, emits a bullet holding the
-- readable form of its sort key, followed by a line holding the tagged term.
-- Returns all entries concatenated into one string.
function export.showSortkey(frame)
	local entries = {}

	for _, word in ipairs(frame.args) do
		local scriptCode = za:findBestScript(word):getCode()
		local key = export.makeSortKey(word, "za", scriptCode)

		-- multi-character substitutions first, then the remaining bare digits
		for pattern, replacement in pairs(showsubst2) do
			key = mw.ustring.gsub(key, pattern, replacement)
		end
		for pattern, replacement in pairs(showsubst1) do
			key = mw.ustring.gsub(key, pattern, replacement)
		end

		entries[#entries + 1] = "\n* <code>" .. key .. "</code>\n: " .. tag(word)
	end

	return table.concat(entries)
end

-- Sorts the positional arguments in `frame.args` by their sort keys and
-- returns a bulleted list of each tagged term followed by the readable form
-- of its sort key.
function export.showSorting(frame)
	local terms = {}

	for _, term in ipairs(frame.args) do
		terms[#terms + 1] = term
	end

	-- One memoized wrapper serves both the comparator and the display loop, so
	-- each term's key (and the page lookups behind it) need not be recomputed.
	-- NOTE(review): assumes Module:fun's memoize caches results per argument.
	local makeSortKey = require("Module:fun").memoize(export.makeSortKey)

	table.sort(terms, function(term1, term2)
		return makeSortKey(term1) < makeSortKey(term2)
	end)

	for i = 1, #terms do
		local term = terms[i]
		-- makeSortKey does not read its lang/sc arguments, so the single-argument
		-- memoized call yields the same key as export.makeSortKey(term, "za", sc).
		local sortkey = makeSortKey(term)

		-- multi-character substitutions first, then the remaining bare digits
		for from, to in pairs(showsubst2) do
			sortkey = mw.ustring.gsub(sortkey, from, to)
		end
		for from, to in pairs(showsubst1) do
			sortkey = mw.ustring.gsub(sortkey, from, to)
		end

		terms[i] = "\n* " .. tag(term) .. " (<code>" .. sortkey .. "</code>)"
	end

	return table.concat(terms)
end

return export