Module:ja-parse

From Wiktionary, the free dictionary
Jump to navigation Jump to search

Auxiliary functions to parse the source of Japanese entries. Currently only used by Module:ja-see.


local export = {}

local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch

local m_ja = require('Module:ja')

-- Auxiliary functions

-- The three *_pattern strings below are bodies of character classes: they are
-- always spliced between '[' and ']' at the point of use, never used alone.

-- Character ranges treated as kanji.
local kanji_pattern = "一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𮯯𰀀-𱍏"
-- Hiragana and katakana ranges plus the prolonged sound mark 'ー'.
local kana_pattern = 'ぁ-ゖァ-ヺー'
-- Everything accepted inside a spelling: kana, kanji, ASCII letters and
-- digits, and the marks '〆' and '々'.
local japanese_pattern = kana_pattern .. kanji_pattern .. 'a-zA-Z0-9〆々'

-- Set of headword template names (name -> true). parse_word keeps templates
-- with these names when rebuilding wikitext for category extraction.
-- find_headword_template hard-codes search patterns for the same names, so
-- the two lists have to be kept in sync by hand.
local headword_templates = {
	['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
	['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true,
}

--- Locate a Japanese headword template in a piece of wikitext.
-- The template names are tried in a fixed priority order; the first name that
-- occurs anywhere in the text wins, even if a lower-priority template appears
-- earlier in the text.
-- @param wikitext string to search
-- @return the full template text (balanced braces) or nothing if none is found
local function find_headword_template(wikitext)
	-- Opening patterns in priority order; '[|}]' makes sure the template name
	-- ends right there (so 'ja-verb' does not match 'ja-verb-suru').
	local openers = {
		'{{ja%-noun[|}]',
		'{{ja%-adj[|}]',
		'{{ja%-pos[|}]',
		'{{ja%-phrase[|}]',
		'{{ja%-verb[|}]',
		'{{ja%-verb form[|}]',
		'{{ja%-verb%-suru[|}]',
	}
	for _, opener in ipairs(openers) do
		local start = wikitext:find(opener)
		if start then
			-- This assumes that the template has matching braces.
			return wikitext:match('%b{}', start)
		end
	end
end

--- Split a template invocation into its name and arguments.
-- Only the simplest format is supported: nested templates are not handled,
-- and the only pipes that are protected are those of [[target|label]] links.
-- @param wikitext the template text, e.g. '{{name|a|k=v}}'
-- @return name the text before the first pipe
-- @return args table with positional values at 1..n and 'key=value' pieces
--   stored under their (string) key
local function parse_template(wikitext) -- only supports the simplest format
	-- Hide the pipe of piped wikilinks behind a backtick so that the split on
	-- '|' below does not cut a link in half.
	local shielded = wikitext:gsub('%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')
	-- Drop the enclosing braces.
	local inner = shielded:gsub('^{{', '')
	inner = inner:gsub('}}$', '')

	local name
	local args = {}
	for piece in mw.text.gsplit(inner, '|') do
		if name == nil then
			-- the first piece is the template name
			name = piece
		else
			-- put the hidden pipes back
			local restored = piece:gsub('`', '|')
			local key, value = match(restored, "(.-)=(.*)")
			if key and value then
				args[key] = value
			else
				args[#args + 1] = restored
			end
		end
	end
	return name, args
end

--- Tell whether a sequence holds a given value.
-- @param list sequence to scan (indices 1..#list)
-- @param item value compared with ==
-- @return true if some element equals item, false otherwise
local function contains(list, item)
	local found = false
	local position = 1
	local size = #list
	while not found and position <= size do
		found = (list[position] == item)
		position = position + 1
	end
	return found
end

-- Part I: functions to parse entries into words

--- Split the Japanese section of a page into words and collect their spellings.
--
-- The ==Japanese== section is cut into level-3 sections. If an "Etymology 1"
-- header exists, each "Etymology N" section is one word; otherwise all
-- level-3 sections except "Kanji" / "Kanji N" are joined into a single word.
--
-- @param page_title title of the page whose wikitext is read
-- @return a list of entry tables, one per word, each holding
--   [1]                   the wikitext of the word
--   type                  'redirect' if it contains {{ja-see}} or
--                         {{ja-see-kango}}; 'lemma' if it contains
--                         {{ja-kanjitab}} or a headword template; '' otherwise
--   kana_spellings        spellings whose script (per Module:ja) lacks 'Hani'
--   kanji_spellings       spellings whose script contains 'Hani'
--   historical_spellings  values of the headword's hhira= / hkata= arguments
function export.words(page_title)
	-- mw.title.new returns nil for an invalid title and getContent returns nil
	-- when there is no content; both cases are treated as an empty page.
	local title = mw.title.new(page_title)
	local page = title and title:getContent() or ''
	local l2 = match(page, '==Japanese==\n(.-)\n==[^=]+==\n') or match(page, '==Japanese==\n(.*)') or ''

	-- split into L3 sections
	local l3_sections = {}
	local multi_etym = false

	-- special hack for kanji entries: without etymology headers, put a
	-- synthetic level-3 header in front of every {{ja-kanjitab}} so that the
	-- text following it is split off from the preceding (Kanji) section
	if not find(l2, '===Etymology 1===') and (find(l2, '===Kanji===') or find(l2, '===Kanji %d+===')) then
		l2 = gsub(l2, '{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
	end

	local current_l3_title = ''
	local current_l3_content = {}
	for v in l2:gmatch('[^\n]+') do
		if find(v, '^===[^=]') then
			-- a new level-3 header closes the section collected so far
			table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
			current_l3_title = match(v, '^===([^=]+)')
			if current_l3_title == 'Etymology 1' then multi_etym = true end
			current_l3_content = {}
		end
		table.insert(current_l3_content, v)
	end
	table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })

	-- group the L3 sections into words
	local words = {}
	if multi_etym then
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if find(header, '^Etymology %d+$') then
				table.insert(words, content)
			end
		end
	else
		local word = {}
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if not (header == 'Kanji' or find(header, '^Kanji %d+$')) then
				table.insert(word, content)
			end
		end
		word = table.concat(word, '\n')
		table.insert(words, word)
	end

	local result = {}

	-- append item to list unless it is missing or already present
	local function add(list, item)
		if item ~= nil and not contains(list, item) then table.insert(list, item) end
	end

	-- file a spelling under kanji or kana depending on its script
	local function insert_spelling(entry, spelling)
		if spelling then
			if find(m_ja.script(spelling), 'Hani') then
				add(entry.kanji_spellings, spelling)
			else
				add(entry.kana_spellings, spelling)
			end
		end
	end

	for _, word in ipairs(words) do
		local entry = {
			word,
			type = '',
			kana_spellings = {},
			kanji_spellings = {},
			historical_spellings = {},
		}
		insert_spelling(entry, page_title)

		-- Every '-' must be escaped: an unescaped '-' after a character is the
		-- lazy repetition operator, so 'see-kango' would never match the
		-- literal template name.
		local ja_see = find(word, '{{ja%-see[|}]') or find(word, '{{ja%-see%-kango[|}]')
		if ja_see then
			entry.type = 'redirect'
			-- text from the template opening up to the first '}}'
			local see_template = match(word, '.-}}', ja_see)
			if see_template then
				-- NOTE(review): the scanned text starts at '{{ja-see', so the
				-- ASCII runs of the template name itself ('ja', 'see', ...)
				-- also match japanese_pattern and reach insert_spelling;
				-- confirm whether that is intended.
				for link_title in gmatch(see_template, '[' .. japanese_pattern .. ']+') do
					insert_spelling(entry, link_title)
				end
			end
		else
			local ja_kanjitab = word:find('{{ja%-kanjitab[|}]')
			local headword_template = find_headword_template(word)
			if ja_kanjitab then
				entry.type = 'lemma'
				local _, args = parse_template(word:match('%b{}', ja_kanjitab))
				if args.alt and args.alt ~= "" and args.alt ~= "-" then
					for alt_spelling in mw.text.gsplit(args.alt, ',') do
						-- keep only the part before the first ':'; the
						-- parentheses drop gsub's replacement count
						insert_spelling(entry, (alt_spelling:gsub(':.+', '')))
					end
				end
			end
			if headword_template then
				entry.type = 'lemma'
				local _, args = parse_template(headword_template)
				for i = 1, #args do
					if find(args[i], '[' .. japanese_pattern .. ']') then
						insert_spelling(entry, m_ja.remove_ruby_markup(args[i]))
					end
				end
				-- either argument may be absent; add() ignores nil
				add(entry.historical_spellings, args.hhira)
				add(entry.historical_spellings, args.hkata)
			end
		end
		table.insert(result, entry)
	end
	return result
end

-- Part II: functions to extract definitions and categories from a word

--- Extract the definition lines and the categories of one word.
--
-- @param wikitext  wikitext of the word (one element [1] of export.words)
-- @param lemma     lemma spelling; injected as hira= into headword templates
--                  when Module:ja reports it as Hira, Kana or Hira+Kana
-- @param nonlemma  spelling the definitions are collected for; if it contains
--                  kanji, definitions whose {{ja-def}} does not list it, or
--                  that carry <!-- kana only -->, are dropped
-- @param frame     frame object, used for frame:preprocess
-- @param reading   passed to makeSortKey to build the category sort key
-- @return def  list of { text, pos = header } tables, one per kept
--              definition line ('# ...'), <ref>...</ref> removed; pos is the
--              text of the closest preceding header line starting with '==='
-- @return cat  string of concatenated [[Category:...|sortkey]] links
function export.parse_word(wikitext, lemma, nonlemma, frame, reading)
	local def = {}
	local other_lines = {}
	local current_section = ''

	-- whether nonlemma contains a kanji; computed on first use only
	local nonlemma_has_kanji

	-- true if a definition line must not be shown for this nonlemma spelling
	local function hidden_for_nonlemma(line)
		if nonlemma_has_kanji == nil then
			nonlemma_has_kanji = find(nonlemma, '[' .. kanji_pattern .. ']') ~= nil
		end
		-- the restrictions only apply to kanji spellings
		if not nonlemma_has_kanji then return false end
		-- the lemma entry has <!-- kana only -->
		if line:find('<!%-%- kana only %-%->') then return true end
		if not line:find('{{ja%-def|') then return false end
		-- hidden unless nonlemma is listed in {{ja-def}}; plain searches are
		-- used so that magic characters in the title are taken literally
		return not (
			line:find('|' .. nonlemma .. '|', 1, true) or
			line:find('|' .. nonlemma .. '}', 1, true)
		)
	end

	for line in wikitext:gmatch('[^\n]+') do
		if line:find('^#+ ') then
			if not line:find('{{rfdef') and not hidden_for_nonlemma(line) then
				table.insert(def, { (line:gsub("<ref[ >].-</ref>", "")), pos = current_section })
			end
		elseif line:find('^===') then
			-- header text without the surrounding '=' signs
			current_section = (line:gsub("^=*(.-)=*$", "%1"))
		else
			table.insert(other_lines, line)
		end
	end

	-- expand the other parts for categories
	local cat = table.concat(other_lines, '\n')
	cat = gsub(cat, '<ref', '')

	local templates_to_include = {
		-- Categories generated by these templates are copied.
		-- It is currently empty here.
		-- ['ExampleTemplateName'] = true,
	}
	-- Decide what to do with a template opening.
	-- If the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
	-- Templates that are not kept are renamed to "{{=".
	local function process_template_header(a, b)
		if headword_templates[a] then
			local source_script = m_ja.script(lemma)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				return '{{' .. a .. '|hira=' .. lemma .. b
			else
				return '{{' .. a .. b
			end
		elseif a:find('^R%:') then
			return '{{=' .. b
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			return '[[Category:Japanese terms with usage examples]]{{=' .. b
		elseif not templates_to_include[a] then
			return '{{=' .. b
		else
			return '{{' .. a .. b
		end
	end

	-- {{ja-pron}} has to be handled BEFORE the generic pass below: the generic
	-- pass rewrites "{{ja-pron|" to "{{=|", after which this pattern could no
	-- longer match and the pronunciation categories would never be produced.
	-- NOTE(review): this makes the IPA/audio categories appear; confirm that
	-- they are wanted on the pages using this module.
	cat = gsub(cat, '{{ja%-pron.-}}', function(pron)
		local result = ''
		if not find(pron, '|noipa=') then result = result .. '[[Category:Japanese terms with IPA pronunciation]]' end
		if find(pron, '|a=') or find(pron, '|audio=') then result = result .. '[[Category:Japanese terms with audio links]]' end
		return result
	end)
	cat = gsub(cat, '{{([^|}\n]+)\n?([|}])', process_template_header)
	cat = frame:preprocess(cat)

	-- sort key shared by all categories; computed when the first one is seen
	local sortkey
	local links = {}
	for link in gmatch(cat, '%[%[Category:.-%]%]') do
		-- drop any sort key the category link already has
		link = gsub(link, '|.*', ']]')
		if link == '[[Category:Japanese lemmas]]' then link = '[[Category:Japanese non-lemma forms]]' end
		if sortkey == nil then
			sortkey = require("Module:languages").getByCode("ja"):makeSortKey(reading)
		end
		-- link ends with ']]' here; splice the sort key in by concatenation
		-- so that a '%' in the key is not read as a gsub escape
		link = link:sub(1, -3) .. '|' .. sortkey .. ']]'
		table.insert(links, link)
	end
	cat = table.concat(links)
	-- one might want to modify the sortkeys here

	return def, cat
end

return export