Module:Jpan-headword: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
No edit summary
trying to make romaji be tagged with Latn rather than Jpan
Line 6: Line 6:
local lang = require("Module:languages").getByCode("ja")
local lang = require("Module:languages").getByCode("ja")
local sc = require("Module:scripts").getByCode("Jpan")
local sc = require("Module:scripts").getByCode("Jpan")
local Latn = require("Module:scripts").getByCode("Latn")


local Japanese_symbols = '%ー・=?!。、'
local Japanese_symbols = '%ー・=?!。、'
Line 135: Line 136:
-- add link manually for WT:ACCEL unless headword is for suru verb
-- add link manually for WT:ACCEL unless headword is for suru verb
if data.pos_category == "suru verbs" then
if data.pos_category == "suru verbs" then
table.insert(data.inflections, {label = "rōmaji", "[[" .. rom .. "]] [[suru]]"})
table.insert(data.inflections, {label = "rōmaji", "[[" .. rom .. "]] [[suru]]", sc = Latn})
elseif detect_result then
elseif detect_result then
-- only accelerate romaji creation for kana entries
-- only accelerate romaji creation for kana entries
table.insert(data.inflections, {label = "rōmaji", accel = "romanized-form-of", rom})
table.insert(data.inflections, {label = "rōmaji", accel = "romanized-form-of", rom, sc = Latn})
else
else
table.insert(data.inflections, {label = "rōmaji", rom})
table.insert(data.inflections, {label = "rōmaji", rom, sc = Latn})
end
end
end
end

Revision as of 17:18, 17 May 2017

This module implements the Japanese headword-line templates, together with the categorization and error checking that those templates rely on.


local m_ja = require("Module:ja")

local export = {}
local pos_functions = {}

local lang = require("Module:languages").getByCode("ja")
local sc = require("Module:scripts").getByCode("Jpan")
local Latn = require("Module:scripts").getByCode("Latn")

-- Character-class fragments. Each of the *_range strings below is meant to be
-- placed between '[' and ']' in an mw.ustring pattern, never used on its own.

-- punctuation and symbols that are accepted alongside kana
-- NOTE(review): the leading '%' presumably escapes the following 'ー' inside the class -- confirm
local Japanese_symbols = '%ー・=?!。、'
-- katakana block (ァ-ヺ) plus the long-vowel mark and katakana iteration marks
local katakana_range = 'ァ-ヺーヽヾ'
-- hiragana block (ぁ-ゖ) plus the long-vowel mark and hiragana iteration marks
local hiragana_range = 'ぁ-ゖーゞゝ'
-- either kind of kana, or any of the symbols above
local kana_range = katakana_range .. hiragana_range .. Japanese_symbols
-- kana, symbols, the 一-鿌 kanji range, and the marks ・ and 々
local Japanese_scripts_range = kana_range .. '一-鿌・々'

-- whole-string patterns; because of the '*' they also match the empty string
local katakana_pattern = '^[' .. katakana_range .. Japanese_symbols .. ']*$'
local hiragana_pattern = '^[' .. hiragana_range .. Japanese_symbols .. ']*$'
local kana_pattern = '^[' .. kana_range .. ']*$'
-- like kana_pattern, but additionally allows whitespace, '.', '-' and '^',
-- the markup characters that find_inflections strips before display
local kana_pattern_full = '^[、' .. kana_range .. '%s%.%-%^]*$'
-- single-character version of kana_pattern_full (no anchors, no repetition)
local kana_pattern_char = '[、' .. kana_range .. '%s%.%-%^]'

-- module-level flag: set to true by export.show when the part of speech is
-- "suru verbs", and read by kana_to_romaji
local is_suru_verb = false

-- Classify a string by the kana script it is written in.
-- Returns 'kata' when it matches katakana_pattern, 'hira' when it matches
-- hiragana_pattern, 'both' when it matches kana_pattern, and nil otherwise.
local function detect_kana_script(kana)
	-- tried in this order, so the katakana test wins when several patterns match
	local candidates = {
		{ katakana_pattern, 'kata' },
		{ hiragana_pattern, 'hira' },
		{ kana_pattern, 'both' },
	}
	for _, candidate in ipairs(candidates) do
		if mw.ustring.find(kana, candidate[1]) then
			return candidate[2]
		end
	end
	return nil
end

-- Romanize a kana string for the headword line.
--   kana : the kana to romanize
--   data : headword data table; only data.pos_category is read
--   args : template arguments; only args["infl"] is read
-- Returns the romaji string, capitalized for proper nouns and with a hyphen
-- attached for prefixes, suffixes, counters and classifiers.
local function kana_to_romaji(kana, data, args)
	local pos = data.pos_category
	local infl = args["infl"]

	-- -u verbs and -i adjectives get a period inserted before a final う/い so that
	-- the ending is not merged into a long vowel written with a macron
	local is_plain_verb = pos == "verbs" and not is_suru_verb
	local is_i_adjective = pos == "adjectives" and (infl == "i" or infl == "い")
	if is_plain_verb or is_i_adjective then
		kana = mw.ustring.gsub(kana, '([うい])$', '.%1')
	end

	local romaji = m_ja.kana_to_romaji(kana)

	-- proper nouns: capitalize the first letter and any letter after a space or hyphen
	if pos == "proper nouns" then
		for _, pattern in ipairs({ "^%l", " %l", "-%l" }) do
			romaji = mw.ustring.gsub(romaji, pattern, mw.ustring.upper)
		end
	end

	-- affix-like parts of speech carry a hyphen on the side where they attach
	if pos == "prefixes" then
		return romaji .. "-"
	end
	if pos == "suffixes" or pos == "counters" or pos == "classifiers" then
		return "-" .. romaji
	end
	return romaji
end

-- English number words, indexed by the number of kanji in the page name;
-- categorize_by_kanji uses them to build the "written with N Han script
-- characters" category names (counts above 15 have no entry and get no category)
local en_numerals = {
	"one", "two", "three", "four", "five",
	"six", "seven", "eight", "nine", "ten",
	"eleven", "twelve", "thirteen", "fourteen", "fifteen"
}

-- Names used in the "Japanese terms spelled with ... kanji" categories,
-- indexed by the value returned from m_ja.kanji_grade
local en_grades = {
	"first grade", "second grade", "third grade",
	"fourth grade", "fifth grade", "sixth grade",
	"secondary school", "jinmeiyō", "hyōgaiji"
}

-- Categorizes the page by the kanji (characters in the 一-鿌 range) in its name:
--   * one "Japanese terms spelled with <grade> kanji" category per kanji, where
--     <grade> is looked up in en_grades using m_ja.kanji_grade
--   * one category for the number of kanji, for counts listed in en_numerals
-- Nothing is added when the page name contains no kanji.
local function categorize_by_kanji(data, PAGENAME)
	-- drop every character that is not a kanji
	local kanji_only = mw.ustring.gsub(PAGENAME, '[^一-鿌]', '')
	local kanji_count = mw.ustring.len(kanji_only)
	if kanji_count == 0 then
		return
	end

	-- grade category for each individual kanji
	for position = 1, kanji_count do
		local kanji = mw.ustring.sub(kanji_only, position, position)
		local grade_name = en_grades[m_ja.kanji_grade(kanji)]
		table.insert(data.categories, ("Japanese terms spelled with %s kanji"):format(grade_name))
	end

	-- category for the number of kanji (singular wording for exactly one)
	if kanji_count == 1 then
		table.insert(data.categories, "Japanese terms written with one Han script character")
	else
		local count_name = en_numerals[kanji_count]
		if count_name then
			table.insert(data.categories, ("Japanese terms written with %s Han script characters"):format(count_name))
		end
	end
end

-- A page whose name is exactly one kanji has no kanjitab / kanji reading tab to
-- generate its "Japanese terms spelled with ..." category, so that category is
-- added here, together with a maintenance category for single-kanji terms
-- (many of which need attention).
local function singlekanji_term(data, PAGENAME)
	local is_single_character = mw.ustring.len(PAGENAME) == 1
	if not (is_single_character and mw.ustring.match(PAGENAME, '[一-鿌]')) then
		return
	end

	local categories = data.categories
	table.insert(categories, "Japanese terms spelled with " .. PAGENAME)
	table.insert(categories, "Japanese single-kanji terms")
end

-- Pick the kana form to use. Candidates are tried in this order:
--   1. the first unnamed parameter that matches kana_pattern_full
--   2. the page name, if it matches kana_pattern_full
--   3. |hira=, if non-empty
--   4. |kata=, if non-empty
-- Raises an error when none of these yields a value.
local function find_kana(args, PAGENAME)
	for _, value in ipairs(args) do
		if value and mw.ustring.find(value, kana_pattern_full) then
			return value
		end
	end

	if mw.ustring.find(PAGENAME, kana_pattern_full) then
		return PAGENAME
	end

	for _, name in ipairs({ "hira", "kata" }) do
		local value = args[name] or ""
		if value ~= "" then
			return value
		end
	end

	error("No kana detected in the unnamed parameters, |hira= and |kata= parameter. See template documentation for details.")
end

-- Builds the kana and rōmaji entries of data.inflections from whatever kana were given.
--   args     : template arguments; the unnamed parameters and "hira", "kata", "rom",
--              "hhira" and "hkata" are read in this function
--   data     : headword data table; data.inflections and data.categories are appended to,
--              data.pos_category is read
--   PAGENAME : title of the current page
-- Also adds the category "Japanese words with multiple readings" when more than one
-- distinct reading (ignoring the hiragana/katakana distinction) was collected.
local function find_inflections(args, data, PAGENAME)
	-- nil when the page name is not written purely in kana
	local detect_result = detect_kana_script(PAGENAME)
	-- Adds one rōmaji inflection. auto_rom is the automatic romanization; a non-empty
	-- |rom= overrides it. The inflection is tagged with the Latn script object.
	local function romanization(auto_rom)
		-- accept the automatic romanization generated in function kana_to_romaji() above
		-- compare that to the manual romanization if it exists and add it to inflections
		local rom = args["rom"] or ""
		if rom == "" then rom = auto_rom end

		-- check auto rom against manual and put in hidden category if they differ
		if rom ~= auto_rom then
			table.insert(data.categories, "Japanese terms with romaji needing attention")
		end

		-- throw an error if there is no romanization
		if rom == "" then
			error("Japanese terms must have a kana form.")
		end

		-- add romaji
		-- add link manually for WT:ACCEL unless headword is for suru verb
		if data.pos_category == "suru verbs" then
			table.insert(data.inflections, {label = "rōmaji", "[[" .. rom .. "]] [[suru]]", sc = Latn})
		elseif detect_result then
			-- only accelerate romaji creation for kana entries
			table.insert(data.inflections, {label = "rōmaji", accel = "romanized-form-of", rom, sc = Latn})
		else
			table.insert(data.inflections, {label = "rōmaji", rom, sc = Latn})
		end
	end

	-- allkana:  kana candidates (markup is stripped from them in the first loop below)
	-- original: the same candidates with their markup intact
	-- readings: set of distinct readings, keyed by the hiragana form
	-- romajis:  automatic romanization of each candidate, same index as allkana
	-- (romaji_lookup is not used anywhere in this function)
	local allkana,original,readings,romajis,romaji_lookup = {},{},{},{},{}

	-- collect every non-empty unnamed parameter that looks like kana (markup allowed)
	for i,arg in ipairs(args) do
		if arg and arg ~= "" and mw.ustring.find(arg, kana_pattern_full) then table.insert(allkana, arg) end
	end

	-- accept "hira" and "kata" but let Lua decide if they are really hiragana or katakana
	if args["hira"] and args["hira"] ~= "" and mw.ustring.find(args["hira"], kana_pattern_full) then table.insert(allkana, args["hira"]) end
	if args["kata"] and args["kata"] ~= "" and mw.ustring.find(args["kata"], kana_pattern_full) then table.insert(allkana, args["kata"]) end

	-- fall back to the page name only when no kana was supplied through the arguments
	if mw.ustring.find(PAGENAME, kana_pattern_full) then
		if #allkana == 0 then table.insert(allkana, PAGENAME) end
	end

	for i = 1, #allkana do	
		-- auto_romanization
		-- (done before the markup is removed, so kana_to_romaji receives the marked-up string)
		romajis[i] = kana_to_romaji(allkana[i], data, args)
		-- remove markup
		table.insert(original,allkana[i])
		allkana[i] = mw.ustring.gsub(allkana[i], '[%s%.%-%^]', '')
	end
	for i = 1, #allkana do
		-- if this is not kana, blank it out
		-- NOTE(review): `allkana` is the table itself and therefore always truthy here;
		-- possibly `allkana[i]` was intended -- confirm
		if allkana and not mw.ustring.match(allkana[i], kana_pattern_char) then
			allkana[i] = ""
		else
			-- if this is kana, count it as another effective reading (ignoring hiragana-katakana distinction)
			readings[m_ja.kata_to_hira(allkana[i])] = 1
		end
		-- only if this kana is different from the page name
		if allkana[i] ~= PAGENAME and allkana[i] ~= "" then
			-- find script type and put it in "label"
			local labelval = ""
			-- stays true only if no earlier candidate has the same romanization
			local alternative = true
			for j = 1, i-1 do
				if allkana[j] and romajis[i] == romajis[j] then
					alternative = false
				end
			end
			if i>1 and alternative then labelval = "alternative reading"
			elseif detect_kana_script(allkana[i]) == 'both' then labelval = "hiragana and katakana"
			elseif detect_kana_script(allkana[i]) == 'hira' then labelval = "hiragana"
			else labelval = "katakana" end

			-- add everything to inflections, except historical hiragana which is next
			if data.pos_category == "nouns" or data.pos_category == "proper nouns" or data.pos_category == "verbs" or data.pos_category == "adjectives" or data.pos_category == "adverbs" then
				-- enable accelerated entry creation using hiragana links for certain parts of speech
				-- (the accel string uses the category name minus its last character, spaces turned into hyphens)
				if mw.ustring.match(original[i],"[%. ]") then
					-- the marked-up form contained a period or space: pass it along as a
					-- transliteration hint, with spaces replaced by hyphens
					local tr = mw.ustring.gsub(original[i], " ", "-")
					table.insert(data.inflections, {label = labelval, accel = ("kana-%s-form-of transliteration-%s"):format(data.pos_category:sub(1, data.pos_category:len()-1):gsub(' ','-'), tr), allkana[i]})
				else
					table.insert(data.inflections, {label = labelval, accel = ("kana-%s-form-of"):format(data.pos_category:sub(1, data.pos_category:len()-1):gsub(' ','-')), allkana[i]})
				end
			elseif data.pos_category == "suru verbs" then
				table.insert(data.inflections, {label = labelval, "[[" .. allkana[i] .. "]][[する]]"})
			else
				table.insert(data.inflections, {label = labelval, allkana[i]})
			end
		end

		-- do the romanization business if it passes through every check
		local undergo_romanization = true
		if allkana[i] ~= "" then
			if allkana[i] == PAGENAME and not mw.ustring.find(PAGENAME, kana_pattern_full) then
				undergo_romanization = false
			else
				-- skip this one when a later candidate has the same romanization, so that
				-- each distinct romanization is emitted once, after the last kana sharing it
				for j=i+1, #allkana do
					if allkana[j] and romajis[i] == romajis[j] then
						undergo_romanization = false
					end
				end
			end
		end
		if undergo_romanization then romanization(romajis[i]) end
	end

	-- historical kana spellings are appended after the modern ones
	local hhira = args["hhira"] or ""
	if hhira ~= "" then
		if data.pos_category == "suru verbs" then
			table.insert(data.inflections, {label = "historical hiragana", "[[" .. hhira .. "]][[する]]"})
		else
			table.insert(data.inflections, {label = "historical hiragana", hhira})
		end
	end

	local hkata = args["hkata"] or ""
	if hkata ~= "" then
		if data.pos_category == "suru verbs" then
			table.insert(data.inflections, {label = "historical katakana", "[[" .. hkata .. "]][[する]]"})
		else
			table.insert(data.inflections, {label = "historical katakana", hkata})
		end
	end
	
	-- count the distinct keys collected in `readings`
	local num_readings = 0
	for _ in pairs(readings) do
		num_readings = num_readings + 1
	end
	
	if num_readings > 1 then
		table.insert(data.categories, "Japanese words with multiple readings")
	end
end

-- Adds categories that depend on the script of the page name or on particular
-- characters it contains.
--   data              : headword data table; data.categories is appended to
--   PAGENAME          : title of the current page
--   katakana_category : separate list that receives "Japanese katakana", so the
--                       caller can format it on its own at the end
local function extra_categorization(data, PAGENAME, katakana_category)
	-- pure-hiragana and pure-katakana page names
	local script = detect_kana_script(PAGENAME)
	if script == 'hira' then
		table.insert(data.categories, "Japanese hiragana")
	elseif script == 'kata' then
		table.insert(katakana_category, "Japanese katakana")
	end

	-- page names mixing Japanese-script characters with anything else
	local outside_japanese = "[^" .. Japanese_scripts_range .. "]"
	local inside_japanese = '[' .. Japanese_scripts_range .. ']'
	if mw.ustring.find(PAGENAME, outside_japanese) and mw.ustring.find(PAGENAME, inside_japanese) then
		table.insert(data.categories, "Japanese terms written in multiple scripts")
	end

	-- one category per special character that occurs in the page name
	local special_characters = {'々','〆','ヶ','ゝ','ゞ','ヽ','ヾ','ゐ','ヰ','ゑ','ヱ','ゔ','ヷ','ヸ','ヹ','ヺ','・'}
	for index = 1, #special_characters do
		local character = special_characters[index]
		if mw.ustring.match(PAGENAME, character) then
			table.insert(data.categories, ("Japanese terms spelled with %s"):format(character))
		end
	end
end

-- Maps the long spellings accepted in the |tr= and |type= parameters to the short
-- codes that the pos_functions handlers compare against; values without an entry
-- here are used unchanged.
local aliases = {
	['transitive']='tr', ['trans']='tr',
	['intransitive']='in', ['intrans']='in', ['intr']='in',
	['godan']='1', ['ichidan']='2', ['irregular']='3'
}

-- Adds transitivity and conjugation-type labels and categories for verbs.
--   args : template arguments; "tr" and "type" are read, with long spellings
--          resolved through the aliases table
--   data : headword data table; data.inflections and data.categories are appended to
pos_functions["verbs"] = function(args, data)
	-- Look the page title up here instead of depending on a global PAGENAME, which
	-- only exists if some other function happened to assign it before this one runs.
	local pagename = mw.title.getCurrentTitle().text

	-- transitivity
	local tr = args["tr"] or ""
	tr = aliases[tr] or tr
	if tr ~= "" then
		-- unrecognized values are silently ignored
		if tr == "tr" then table.insert(data.inflections, {label = "transitive"}) end
		if tr == "in" then table.insert(data.inflections, {label = "intransitive"}) end
		if tr == "both" then table.insert(data.inflections, {label = "transitive and intransitive"}) end
	else
		table.insert(data.categories, "Japanese verbs without transitivity")
	end

	-- conjugation type
	local conjugation = args["type"] or ""
	conjugation = aliases[conjugation] or conjugation

	if conjugation == "1" then
		table.insert(data.inflections, {label = "godan conjugation"})
		table.insert(data.categories, "Japanese type 1 verbs")
	elseif conjugation == "2" then
		table.insert(data.inflections, {label = "ichidan conjugation"})
		table.insert(data.categories, "Japanese type 2 verbs")
	elseif conjugation == "3" then
		table.insert(data.inflections, {label = "irregular conjugation"})
		table.insert(data.categories, "Japanese type 3 verbs")

		-- hidden temporary maintenance category
		-- (suru verbs should use ja-verb-suru but sometimes erroneously use ja-verb with type=3 instead)
		if mw.ustring.match(pagename, 'する$') then
			table.insert(data.categories, "Japanese terms using ja-verb with type 3")
		end
	elseif conjugation == "yo" then
		table.insert(data.inflections, {label = "yodan conjugation"})
		table.insert(data.categories, "Japanese yodan verbs")
	elseif conjugation == "ni" then
		table.insert(data.inflections, {label = "nidan conjugation"})
		table.insert(data.categories, "Japanese nidan verbs")
	end

	-- >> maintenance category <<
	-- check if this ends in something other than acceptable kana in a modern verb
	-- (and isn't already categorised as yodan or nidan)
	if not mw.ustring.match(pagename, '[うくぐすつぬぶむる]$') and conjugation ~= "yo" and conjugation ~= "ni" then
		table.insert(data.categories, "Japanese verbs without modern conjugations")
	end
end

-- Auxiliary verbs are handled as ordinary verbs from here on (pos_category is
-- rewritten to "verbs") and additionally get their own category.
pos_functions["auxiliary verbs"] = function(args, data)
	table.insert(data.categories, "Japanese auxiliary verbs")
	data.pos_category = "verbs"
end

-- Suru verbs: pos_category is rewritten to "verbs", the type 3 category is added,
-- and the |tr= parameter is turned into a transitivity label (or, when it is
-- absent or empty, into a maintenance category).
pos_functions["suru verbs"] = function(args, data)
	data.pos_category = "verbs"
	table.insert(data.categories, "Japanese type 3 verbs")

	-- label shown for each recognized transitivity code
	local transitivity_labels = {
		["tr"] = "transitive",
		["in"] = "intransitive",
		["both"] = "transitive and intransitive",
	}

	local tr = args["tr"] or ""
	tr = aliases[tr] or tr

	local label = transitivity_labels[tr]
	if label then
		table.insert(data.inflections, {label = label})
	elseif tr == "" then
		table.insert(data.categories, "Japanese verbs without transitivity")
	end
	-- any other value is ignored
end

-- Adjectives: adds an inflection-type label and the matching category based on
-- |infl=, which may be given either in rōmaji or in kana. Other values add nothing.
pos_functions["adjectives"] = function(args, data)
	local infl = args["infl"] or ""

	-- { rōmaji spelling, kana spelling } of each recognized inflection type
	local inflection_types = {
		{ "i", "い" },
		{ "na", "な" },
		{ "nari", "なり" },
		{ "tari", "たり" },
	}

	for _, spelling in ipairs(inflection_types) do
		local latin, kana = spelling[1], spelling[2]
		if infl == latin or infl == kana then
			table.insert(data.inflections, {label = ("-%s inflection"):format(latin)})
			table.insert(data.categories, ("Japanese %s-%s adjectives"):format(kana, latin))
			return
		end
	end
end

-- Nouns: handles the counter (classifier) parameter |count=.
-- "-" marks the noun as uncountable; any other non-empty value is shown as its counter.
pos_functions["nouns"] = function(args, data)
	local counter = args["count"] or ""
	if counter == "" then
		return
	end

	if counter == "-" then
		table.insert(data.inflections, {label = "uncountable"})
	else
		table.insert(data.inflections, {label = "counter", counter})
	end
end

-- The main entry point, and the only function meant to be invoked from a template.
--   frame.args[1]          : part-of-speech category (plural form, e.g. "nouns"); required
--   frame:getParent().args : the arguments given to the calling template
-- Returns the formatted headword line followed by the separately formatted katakana category.
function export.show(frame)
	-- Left as a global assignment exactly as before.
	-- NOTE(review): other code in this module may read PAGENAME as a global rather than
	-- receiving it as a parameter -- check every reader before turning this into a local.
	PAGENAME = mw.title.getCurrentTitle().text

	local args = frame:getParent().args
	local poscat = frame.args[1]
	if not poscat then
		error("Part of speech has not been specified. Please pass parameter 1 to the module invocation.")
	end

	-- displayed headword; suru verbs get a linked する appended
	local head = args["head"] or PAGENAME
	if poscat == "suru verbs" then
		is_suru_verb = true
		head = head .. "[[する]]"
	end

	-- |decl= is accepted as a fallback for a missing or empty |infl=
	local infl = args["infl"]
	if args["decl"] and (not infl or infl == "") then
		args["infl"] = args["decl"]
	end

	local katakana_category = {}
	local data = {
		lang = lang,
		sc = sc,
		pos_category = poscat,
		categories = {},
		heads = {head},
		inflections = {},
	}

	local kana = find_kana(args, PAGENAME)

	-- the presence of kyūjitai param indicates that this is shinjitai kanji entry and vice versa
	local kyu = args["kyu"] or ""
	if kyu ~= "" then
		table.insert(data.inflections, {label = "[[shinjitai]] kanji"})
		table.insert(data.inflections, {label = "[[kyūjitai]] kanji", kyu})
	end

	local shin = args["shin"] or ""
	if shin ~= "" then
		table.insert(data.inflections, {label = "[[kyūjitai]] kanji"})
		table.insert(data.inflections, {label = "[[shinjitai]] kanji", shin})
	end

	-- part-of-speech specific "inflections" and categories (adjectives, verbs, nouns, ...)
	local pos_handler = pos_functions[poscat]
	if pos_handler then
		pos_handler(args, data)
	end

	-- sort out all the kanas and do the romanization business
	find_inflections(args, data, PAGENAME)

	-- categorize by kanji grade and number of kanji
	categorize_by_kanji(data, PAGENAME)
	-- "Japanese terms spelled with ..." for single-kanji terms
	singlekanji_term(data, PAGENAME)
	-- categories for terms with iteration marks and similar characters
	-- (which are not kanji and hence are not categorized by ja-kanjitab)
	extra_categorization(data, PAGENAME, katakana_category)

	-- a sort key is only set when it differs from the page name with its katakana
	-- converted to hiragana
	local sort_key = m_ja.jsort(kana)
	local m_headword = require("Module:headword")
	local m_utilities = require("Module:utilities")

	if sort_key ~= m_ja.kata_to_hira(PAGENAME) then
		data.sort_key = sort_key
		-- the katakana category is sorted by the katakana version of the sort key
		return m_headword.full_headword(data) ..
			m_utilities.format_categories(katakana_category, lang, m_ja.hira_to_kata(sort_key))
	end

	return m_headword.full_headword(data) ..
		m_utilities.format_categories(katakana_category, lang)
end

return export