-- Module:Ja-headword

-- Shared Japanese helper module; this file calls its kana_to_romaji, kanji_grade,
-- kata_to_hira, hira_to_kata and jsort functions.
local m_ja = require("Module:ja")

-- Shorthand for the Unicode-aware find from Scribunto's mw.ustring library.
local find = mw.ustring.find

-- export is the table returned at the end of the module.
-- pos_functions maps a part-of-speech category name (e.g. "verbs") to a handler
-- function(args, data) that export.show calls when one exists for the requested POS.
local export = {}
local pos_functions = {}

-- Language and script objects placed into the headword data (lang, sc) or attached
-- to romaji inflections (Latn).
local lang = require("Module:languages").getByCode("ja")
local sc = require("Module:scripts").getByCode("Jpan")
local Latn = require("Module:scripts").getByCode("Latn")

-- The strings below are fragments meant to be spliced into a [...] character class.
-- Japanese_symbols lists the non-kana characters accepted alongside kana
-- (the leading % escapes the character that follows it).
local Japanese_symbols = '%ｰ・＝？！。、'
local katakana_range = 'ァ-ヺーヽヾ'
local hiragana_range = 'ぁ-ゖーゞゝ'
local kana_range = katakana_range .. hiragana_range .. Japanese_symbols
-- kana plus the 一-鿌 kanji range, ・ and the iteration mark 々
local Japanese_scripts_range = kana_range .. '一-鿌・々'

-- Whole-string patterns. All of them use *, so the empty string matches each one.
local katakana_pattern = '^[' .. katakana_range .. Japanese_symbols .. ']*$'
local hiragana_pattern = '^[' .. hiragana_range .. Japanese_symbols .. ']*$'
local kana_pattern = '^[' .. kana_range .. ']*$'
-- Like kana_pattern, but additionally accepts whitespace and the characters . - ^
-- (find_inflections strips those four kinds of character as "markup").
local kana_pattern_full = '^[、' .. kana_range .. '%s%.%-%^]*$'
-- Unanchored single-character version of kana_pattern_full.
local kana_pattern_char = '[、' .. kana_range .. '%s%.%-%^]'

-- Module-level flag: export.show sets it to true when the requested POS is
-- "suru verbs"; kana_to_romaji reads it when deciding how to romanize verbs.
local is_suru_verb = false

-- Classify a string by the kana script it is written in.
--   'kata' : only katakana (and the accepted symbols)
--   'hira' : only hiragana (and the accepted symbols)
--   'both' : a mixture of katakana and hiragana
--   nil    : the string contains something that is not kana
-- The katakana test runs first, so a string that satisfies both the katakana and
-- the hiragana pattern (e.g. an empty string) is reported as 'kata'.
local function detect_kana_script(kana)
	if find(kana, katakana_pattern) then
		return 'kata'
	end
	if find(kana, hiragana_pattern) then
		return 'hira'
	end
	if find(kana, kana_pattern) then
		return 'both'
	end
	return nil
end

-- Romanize a kana string for the headword line.
--   kana : the kana string (may still contain markup such as spaces or periods)
--   data : headword data table; only data.pos_category is read
--   args : template arguments; only args["infl"] is read
-- Returns the romaji string, with a hyphen added for affix-like parts of speech.
local function kana_to_romaji(kana, data, args)
	local pos = data.pos_category

	-- -u verbs and -i adjectives get a period before a final う/い so that the
	-- ending is not merged into a long vowel written with a macron
	local is_plain_verb = pos == "verbs" and not is_suru_verb
	local is_i_adjective = pos == "adjectives" and (args["infl"] == "i" or args["infl"] == "い")
	if is_plain_verb or is_i_adjective then
		kana = mw.ustring.gsub(kana, '([うい])$', '.%1')
	end

	local romaji = m_ja.kana_to_romaji(kana)

	-- proper nouns: capitalize the first letter, and any lowercase letter that
	-- follows a space or a hyphen
	if pos == "proper nouns" then
		for _, pattern in ipairs({ "^%l", " %l", "-%l" }) do
			romaji = mw.ustring.gsub(romaji, pattern, mw.ustring.upper)
		end
	end

	-- prefixes take a trailing hyphen; suffixes and counters (classifiers) a leading one
	if pos == "prefixes" then
		return romaji .. "-"
	end
	if pos == "suffixes" or pos == "counters" or pos == "classifiers" then
		return "-" .. romaji
	end
	return romaji
end

-- English number words, indexed by the number of kanji in the page name.
-- categorize_by_kanji uses them to build
-- "Japanese terms written with <numeral> Han script characters"; counts above
-- fifteen have no entry and therefore produce no such category.
local en_numerals = {
	"one", "two", "three", "four", "five",
	"six", "seven", "eight", "nine", "ten",
	"eleven", "twelve", "thirteen", "fourteen", "fifteen"
}

-- Grade names, indexed by the value returned from m_ja.kanji_grade() and inserted
-- into "Japanese terms spelled with <grade> kanji".
-- NOTE(review): this assumes kanji_grade() returns an integer from 1 to 9 - confirm
-- against Module:ja.
local en_grades = {
	"first grade", "second grade", "third grade",
	"fourth grade", "fifth grade", "sixth grade",
	"secondary school", "jinmeiyō", "hyōgaiji"
}

-- Adds kanji-based categories for PAGENAME to data.categories:
--   * one "Japanese terms spelled with <grade> kanji" entry per kanji character,
--     where <grade> comes from en_grades[m_ja.kanji_grade(character)]
--   * one category for the total number of kanji (only for counts listed in en_numerals)
-- Nothing is added when the page name contains no kanji.
local function categorize_by_kanji(data, PAGENAME)
	-- keep only the characters inside the 一-鿌 range
	local kanji_only = mw.ustring.gsub(PAGENAME, '[^一-鿌]', '')
	local kanji_count = mw.ustring.len(kanji_only)

	if kanji_count == 0 then
		return
	end

	-- grade category for each kanji, in order of appearance
	for position = 1, kanji_count do
		local grade = m_ja.kanji_grade(mw.ustring.sub(kanji_only, position, position))
		table.insert(data.categories, ("Japanese terms spelled with %s kanji"):format(en_grades[grade]))
	end

	-- category for the number of kanji
	if kanji_count == 1 then
		table.insert(data.categories, "Japanese terms written with one Han script character")
	else
		local numeral = en_numerals[kanji_count]
		if numeral then
			table.insert(data.categories, ("Japanese terms written with %s Han script characters"):format(numeral))
		end
	end
end

-- A term consisting of exactly one kanji has no kanjitab / kanji reading tab to
-- generate its "Japanese terms spelled with ..." category, so that category is
-- added here, together with a maintenance category for single-kanji terms
-- (many of these need attention).
-- Does nothing unless PAGENAME is a single character in the 一-鿌 range.
local function singlekanji_term(data, PAGENAME)
	local is_single_character = mw.ustring.len(PAGENAME) == 1
	if not (is_single_character and mw.ustring.match(PAGENAME, '[一-鿌]')) then
		return
	end

	local categories = data.categories
	table.insert(categories, "Japanese terms spelled with " .. PAGENAME)
	table.insert(categories, "Japanese single-kanji terms")
end

-- Pick the kana form used for the sort key.
-- Order of preference:
--   1. the first non-empty unnamed parameter that consists only of kana/markup
--   2. the page name, if it consists only of kana/markup
--   3. |hira=
--   4. |kata=
-- Parameters:
--   args     : template arguments
--   PAGENAME : title text of the current page
-- Returns the chosen kana string; raises an error when none of the sources yields one.
local function find_kana(args, PAGENAME)
	for _, arg in ipairs(args) do
		-- kana_pattern_full also matches the empty string, so a blank unnamed
		-- parameter (as in {{...||かな}}) has to be skipped explicitly; this mirrors
		-- the check find_inflections applies to the same parameters
		if arg ~= "" and find(arg, kana_pattern_full) then
			return arg
		end
	end

	if find(PAGENAME, kana_pattern_full) then
		return PAGENAME
	end

	local hira = args["hira"] or ""
	if hira ~= "" then
		return hira
	end

	local kata = args["kata"] or ""
	if kata ~= "" then
		return kata
	end

	error("No kana detected in the unnamed parameters, |hira= and |kata= parameter. See template documentation for details.")
end

-- Go through args, collect whatever kana forms were given, and turn them into
-- headword "inflections" (kana readings and romaji) plus a few categories.
--
-- Parameters:
--   args     : template arguments (unnamed parameters, hira, kata, rom, hhira, hkata)
--   data     : headword data table; data.inflections and data.categories are appended to
--   PAGENAME : title text of the current page
--
-- Suru verbs are recognised through the module-level is_suru_verb flag rather than
-- data.pos_category: the "suru verbs" POS handler runs before this function and
-- rewrites data.pos_category to "verbs", so a comparison against "suru verbs" can
-- never succeed here.
local function find_inflections(args, data, PAGENAME)
	local detect_result = detect_kana_script(PAGENAME)

	-- Adds one romaji inflection. auto_rom is the automatic romanization from
	-- kana_to_romaji(); a manual |rom= takes precedence over it.
	local function romanization(auto_rom)
		local rom = args["rom"] or ""
		if rom == "" then rom = auto_rom end

		-- check auto rom against manual and put in hidden category if they differ
		if rom ~= auto_rom then
			table.insert(data.categories, "Japanese terms with romaji needing attention")
		end

		-- throw an error if there is no romanization
		if rom == "" then
			error("Japanese terms must have a kana form.")
		end

		-- add romaji
		-- add link manually for WT:ACCEL unless headword is for suru verb
		if is_suru_verb then
			table.insert(data.inflections, {label = "rōmaji", "[[" .. rom .. "]] [[suru]]", sc = Latn})
		elseif detect_result then
			-- only accelerate romaji creation for kana entries
			table.insert(data.inflections, {label = "rōmaji", accel = "romanized-form-of", rom, sc = Latn})
		else
			table.insert(data.inflections, {label = "rōmaji", rom, sc = Latn})
		end
	end

	-- allkana  : candidate kana strings (markup is stripped from them further down)
	-- original : the same strings as given, markup included
	-- readings : set of distinct readings, keyed by the hiragana form
	-- romajis  : automatic romanization of each candidate
	local allkana, original, readings, romajis = {}, {}, {}, {}

	for _, arg in ipairs(args) do
		if arg ~= "" and find(arg, kana_pattern_full) then
			table.insert(allkana, arg)
		end
	end

	-- accept "hira" and "kata" but let Lua decide if they are really hiragana or katakana
	for _, name in ipairs({"hira", "kata"}) do
		local value = args[name]
		if value and value ~= "" and find(value, kana_pattern_full) then
			table.insert(allkana, value)
		end
	end

	-- fall back on the page name when no kana was given and the page name is kana
	if #allkana == 0 and find(PAGENAME, kana_pattern_full) then
		table.insert(allkana, PAGENAME)
	end

	for i = 1, #allkana do
		-- auto_romanization (done before the markup is removed)
		romajis[i] = kana_to_romaji(allkana[i], data, args)
		original[i] = allkana[i]
		-- remove markup
		allkana[i] = mw.ustring.gsub(allkana[i], '[%s%.%-%^]', '')
	end

	for i = 1, #allkana do
		-- if this is not kana, blank it out
		if not mw.ustring.match(allkana[i], kana_pattern_char) then
			allkana[i] = ""
		else
			-- if this is kana, count it as another effective reading (ignoring hiragana-katakana distinction)
			readings[m_ja.kata_to_hira(allkana[i])] = 1
		end

		-- only if this kana is different from the page name
		if allkana[i] ~= PAGENAME and allkana[i] ~= "" then
			-- a reading counts as "alternative" when no earlier candidate has the same romaji
			local alternative = true
			for j = 1, i - 1 do
				if romajis[i] == romajis[j] then
					alternative = false
				end
			end

			-- find script type and put it in "label"
			local script = detect_kana_script(allkana[i])
			local labelval
			if i > 1 and alternative then
				labelval = "alternative reading"
			elseif script == 'both' then
				labelval = "hiragana and katakana"
			elseif script == 'hira' then
				labelval = "hiragana"
			else
				labelval = "katakana"
			end

			-- add everything to inflections, except historical kana which comes later
			if is_suru_verb then
				-- must be tested before the generic "verbs" case, because
				-- data.pos_category is "verbs" for suru verbs as well
				table.insert(data.inflections, {label = labelval, "[[" .. allkana[i] .. "]][[する]]"})
			elseif data.pos_category == "nouns" or data.pos_category == "proper nouns" or data.pos_category == "verbs" or data.pos_category == "adjectives" or data.pos_category == "adverbs" then
				-- enable accelerated entry creation using hiragana links for certain parts of speech
				-- singular POS name with spaces turned into hyphens, e.g. "proper-noun"
				local accel_pos = data.pos_category:sub(1, data.pos_category:len() - 1):gsub(' ', '-')
				if mw.ustring.match(original[i], "[%. ]") then
					local tr = mw.ustring.gsub(original[i], " ", "-")
					table.insert(data.inflections, {label = labelval, accel = ("kana-%s-form-of transliteration-%s"):format(accel_pos, tr), allkana[i]})
				else
					table.insert(data.inflections, {label = labelval, accel = ("kana-%s-form-of"):format(accel_pos), allkana[i]})
				end
			else
				table.insert(data.inflections, {label = labelval, allkana[i]})
			end
		end

		-- do the romanization business if it passes through every check
		local undergo_romanization = true
		if allkana[i] ~= "" then
			if allkana[i] == PAGENAME and not find(PAGENAME, kana_pattern_full) then
				undergo_romanization = false
			else
				-- skip it when a later candidate yields the same romaji, so that each
				-- distinct romaji is added only once (for its last occurrence)
				for j = i + 1, #allkana do
					if romajis[i] == romajis[j] then
						undergo_romanization = false
					end
				end
			end
		end
		if undergo_romanization then romanization(romajis[i]) end
	end

	local hhira = args["hhira"] or ""
	if hhira ~= "" then
		if is_suru_verb then
			table.insert(data.inflections, {label = "historical hiragana", "[[" .. hhira .. "]][[する]]"})
		else
			table.insert(data.inflections, {label = "historical hiragana", hhira})
		end
	end

	local hkata = args["hkata"] or ""
	if hkata ~= "" then
		if is_suru_verb then
			table.insert(data.inflections, {label = "historical katakana", "[[" .. hkata .. "]][[する]]"})
		else
			table.insert(data.inflections, {label = "historical katakana", hkata})
		end
	end

	-- readings is a set, so its size has to be counted by iterating
	local num_readings = 0
	for _ in pairs(readings) do
		num_readings = num_readings + 1
	end

	if num_readings > 1 then
		table.insert(data.categories, "Japanese words with multiple readings")
	end
end

-- Categorize by the script of the page name and by special characters it contains.
--   data              : headword data table; data.categories is appended to
--   PAGENAME          : title text of the current page
--   katakana_category : separate list that receives "Japanese katakana", so the
--                       caller can format that category on its own
local function extra_categorization(data, PAGENAME, katakana_category)
	-- pure-hiragana and pure-katakana page names
	local script = detect_kana_script(PAGENAME)
	if script == 'hira' then
		table.insert(data.categories, "Japanese hiragana")
	elseif script == 'kata' then
		table.insert(katakana_category, "Japanese katakana")
	end

	-- page names containing at least one character outside the Japanese ranges
	-- and at least one character inside them
	local outside_japanese = "[^" .. Japanese_scripts_range .. "]"
	local inside_japanese = '[' .. Japanese_scripts_range .. ']'
	if find(PAGENAME, outside_japanese) and find(PAGENAME, inside_japanese) then
		table.insert(data.categories, "Japanese terms written in multiple scripts")
	end

	-- one category per special character that occurs in the page name
	local special_characters = {'々','〆','ヶ','ゝ','ゞ','ヽ','ヾ','ゐ','ヰ','ゑ','ヱ','ゔ','ヷ','ヸ','ヹ','ヺ','・'}
	for index = 1, #special_characters do
		local character = special_characters[index]
		if mw.ustring.match(PAGENAME, character) then
			table.insert(data.categories, ("Japanese terms spelled with %s"):format(character))
		end
	end
end

-- Alternative spellings accepted for template parameter values, mapped to the
-- canonical value the POS handlers compare against: the verb handlers look up
-- |tr= here (canonical 'tr' / 'in'), and the "verbs" handler also looks up
-- |type= (canonical '1' / '2' / '3'). Values without an entry are used unchanged.
local aliases = {
	['transitive']='tr', ['trans']='tr',
	['intransitive']='in', ['intrans']='in', ['intr']='in',
	['godan']='1', ['ichidan']='2', ['irregular']='3'
}

-- POS handler for verbs: adds transitivity and conjugation-type labels to
-- data.inflections and the matching categories to data.categories.
--   args : template arguments; |tr= and |type= are read (both go through aliases)
--   data : headword data table
pos_functions["verbs"] = function(args, data)
	-- Fetch the page title here instead of reading an undeclared global PAGENAME:
	-- the name is neither a parameter nor a local of this function, so the handler
	-- would otherwise only work when some caller happened to set that global first.
	local pagename = mw.title.getCurrentTitle().text

	-- transitivity
	local tr = args["tr"] or ""
	tr = aliases[tr] or tr
	if tr == "" then
		table.insert(data.categories, "Japanese verbs without transitivity")
	elseif tr == "tr" then
		table.insert(data.inflections, {label = "transitive"})
		table.insert(data.categories, "Japanese transitive verbs")
	elseif tr == "in" then
		table.insert(data.inflections, {label = "intransitive"})
		table.insert(data.categories, "Japanese intransitive verbs")
	elseif tr == "both" then
		table.insert(data.inflections, {label = "transitive and intransitive"})
		table.insert(data.categories, "Japanese transitive verbs")
		table.insert(data.categories, "Japanese intransitive verbs")
	end
	-- any other non-empty value adds neither a label nor a category

	-- conjugation type
	local conjugation = args["type"] or ""
	conjugation = aliases[conjugation] or conjugation

	if conjugation == "1" then
		table.insert(data.inflections, {label = "godan conjugation"})
		table.insert(data.categories, "Japanese type 1 verbs")
	elseif conjugation == "2" then
		table.insert(data.inflections, {label = "ichidan conjugation"})
		table.insert(data.categories, "Japanese type 2 verbs")
	elseif conjugation == "3" then
		table.insert(data.inflections, {label = "irregular conjugation"})
		table.insert(data.categories, "Japanese type 3 verbs")

		-- hidden temporary maintenance category
		-- (suru verbs should use ja-verb-suru but sometimes erroneously use ja-verb with type=3 instead)
		if mw.ustring.match(pagename, 'する$') then
			table.insert(data.categories, "Japanese terms using ja-verb with type 3")
		end
	elseif conjugation == "yo" then
		table.insert(data.inflections, {label = "yodan conjugation"})
		table.insert(data.categories, "Japanese yodan verbs")
	elseif conjugation == "ni" then
		table.insert(data.inflections, {label = "nidan conjugation"})
		table.insert(data.categories, "Japanese nidan verbs")
	end

	-- >> maintenance category <<
	-- check if this ends in something other than acceptable kana in a modern verb
	-- (and isn't already categorised as yodan or nidan)
	if not mw.ustring.match(pagename, '[うくぐすつぬぶむる]$') and conjugation ~= "yo" and conjugation ~= "ni" then
		table.insert(data.categories, "Japanese verbs without modern conjugations")
	end
end

-- POS handler for auxiliary verbs: adds the auxiliary-verb category explicitly
-- and switches the headword's own POS category to plain "verbs".
-- args is accepted for a uniform handler signature but is not used.
pos_functions["auxiliary verbs"] = function(args, data)
	table.insert(data.categories, "Japanese auxiliary verbs")
	data.pos_category = "verbs"
end

-- POS handler for suru verbs: switches the headword's POS category to "verbs",
-- adds the type 3 category, and adds a transitivity label taken from |tr=.
-- Unlike the plain "verbs" handler, no transitive/intransitive categories are added.
pos_functions["suru verbs"] = function(args, data)
	data.pos_category = "verbs"
	table.insert(data.categories, "Japanese type 3 verbs")

	-- canonical |tr= value -> label shown in the headword line
	local transitivity_labels = {
		["tr"] = "transitive",
		["in"] = "intransitive",
		["both"] = "transitive and intransitive",
	}

	local tr = args["tr"] or ""
	tr = aliases[tr] or tr

	local label = transitivity_labels[tr]
	if label then
		table.insert(data.inflections, {label = label})
	elseif tr == "" then
		table.insert(data.categories, "Japanese verbs without transitivity")
	end
end

-- POS handler for adjectives: categorizes by inflection type.
-- |infl= may be given in romaji or in kana; unrecognised or missing values add nothing.
pos_functions["adjectives"] = function(args, data)
	local i_type = {label = "-i inflection", category = "Japanese い-i adjectives"}
	local na_type = {label = "-na inflection", category = "Japanese な-na adjectives"}
	local nari_type = {label = "-nari inflection", category = "Japanese なり-nari adjectives"}
	local tari_type = {label = "-tari inflection", category = "Japanese たり-tari adjectives"}

	-- accepted |infl= value -> inflection type
	local inflection_types = {
		["i"] = i_type, ["い"] = i_type,
		["na"] = na_type, ["な"] = na_type,
		["nari"] = nari_type, ["なり"] = nari_type,
		["tari"] = tari_type, ["たり"] = tari_type,
	}

	local chosen = inflection_types[args["infl"] or ""]
	if chosen then
		table.insert(data.inflections, {label = chosen.label})
		table.insert(data.categories, chosen.category)
	end
end

-- POS handler for nouns: handles the counter (classifier) parameter |count=.
--   "-"             -> the noun is labelled "uncountable"
--   any other value -> shown as the noun's counter
--   empty / missing -> nothing is added
pos_functions["nouns"] = function(args, data)
	local counter = args["count"] or ""
	if counter == "" then
		return
	end

	if counter == "-" then
		table.insert(data.inflections, {label = "uncountable"})
	else
		table.insert(data.inflections, {label = "counter", counter})
	end
end

-- The main entry point; the only function meant to be invoked from a template.
--   frame.args[1]          : part-of-speech category (e.g. "nouns", "suru verbs"); required
--   frame:getParent().args : the arguments given to the calling template
-- Returns the formatted headword line followed by the separately formatted
-- katakana category.
function export.show(frame)
	-- NOTE(review): PAGENAME is assigned without `local`, exactly as before, so it
	-- becomes a global. Check whether any other code in this module still reads
	-- that global before turning it into a local.
	PAGENAME = mw.title.getCurrentTitle().text
	local args = frame:getParent().args
	local poscat = frame.args[1] or error("Part of speech has not been specified. Please pass parameter 1 to the module invocation.")

	-- suru verbs are displayed with a linked する appended to the head
	local head = args["head"] or PAGENAME
	if poscat == "suru verbs" then
		is_suru_verb = true
		head = head .. "[[する]]"
	end

	-- |decl= is accepted as a fallback for a missing or empty |infl=
	if args["decl"] and (not args["infl"] or args["infl"] == "") then
		args["infl"] = args["decl"]
	end

	local data = {lang = lang, sc = sc, pos_category = poscat, categories = {}, heads = {head}, inflections = {}}
	local katakana_category = {}

	local kana = find_kana(args, PAGENAME)

	-- the presence of kyūjitai param indicates that this is shinjitai kanji entry and vice versa
	local kyujitai = args["kyu"] or ""
	if kyujitai ~= "" then
		table.insert(data.inflections, {label = "[[shinjitai]] kanji"})
		table.insert(data.inflections, {label = "[[kyūjitai]] kanji", kyujitai})
	end

	local shinjitai = args["shin"] or ""
	if shinjitai ~= "" then
		table.insert(data.inflections, {label = "[[kyūjitai]] kanji"})
		table.insert(data.inflections, {label = "[[shinjitai]] kanji", shinjitai})
	end

	-- add certain "inflections" and categories for adjectives, verbs, or nouns
	local pos_handler = pos_functions[poscat]
	if pos_handler then
		pos_handler(args, data)
	end

	-- sort out all the kanas and do the romanization business
	find_inflections(args, data, PAGENAME)

	-- categorize by kanji grade and number of kanji
	categorize_by_kanji(data, PAGENAME)
	-- generate "Japanese terms spelled with ..." for single-kanji terms
	singlekanji_term(data, PAGENAME)
	-- add categories for terms with iteration marks (which are not kanji and hence are not categorized by ja-kanjitab)
	extra_categorization(data, PAGENAME, katakana_category)

	local has_katakana = find(PAGENAME, "[" .. katakana_range .. "]")
	local has_hiragana = find(PAGENAME, "[" .. hiragana_range .. "]")
	if has_katakana and has_hiragana then
		table.insert(data.categories, "Japanese terms spelled with mixed kana")
	end

	-- will only use sortkey if sortkey is different from PAGENAME
	-- when katakana in PAGENAME is converted to hiragana
	local sort_key = m_ja.jsort(kana)
	local m_headword = require("Module:headword")
	local m_utilities = require("Module:utilities")

	if sort_key == m_ja.kata_to_hira(PAGENAME) then
		return m_headword.full_headword(data) ..
			m_utilities.format_categories(katakana_category, lang)
	end

	-- convert sortkey to katakana version for katakana terms category (should sort by katakana)
	data.sort_key = sort_key
	return m_headword.full_headword(data) ..
		m_utilities.format_categories(katakana_category, lang, m_ja.hira_to_kata(sort_key))
end

return export