-- Module:Grc-utilities

-- Table of functions exported by this module; it is returned at the end of the file.
local export = {}

local m_script_utils = require("Module:script utilities")
local m_links = require("Module:links")
-- Language object for code "grc" and script object for code "polytonic";
-- both are handed to the tagging and linking helpers below.
local lang = require("Module:languages").getByCode("grc")
local sc = require("Module:scripts").getByCode("polytonic")

-- Shared data from the data submodule.
local m_data = mw.loadData("Module:grc-utilities/data")
-- groups: list of patterns, one per class of diacritic; matched in index order
-- when a run of diacritics is reordered.
local groups = m_data.groups
-- conversions: maps a nonstandard pattern to its standard replacement.
local conversions = m_data.conversions
-- diacritics: table of individual diacritics by name;
-- diacritic: pattern fragment that matches a single diacritic.
local diacritics = m_data.diacritics
local diacritic = m_data.diacritic
local diaeresis = diacritics.diaeresis
local macron = diacritics.macron
local breve = diacritics.breve
local spacing_macron = diacritics.spacing_macron
local spacing_breve = diacritics.spacing_breve
local circumflex = diacritics.circum
local subscript = diacritics.subscript

-- Patterns for a vowel letter followed by ι or by υ; tokenize treats a
-- two-character match as a candidate diphthong.
local i_diphthong = "[ΑΕΗΟΥΩαεηουω]ι"
local u_diphthong = "[ΑΕΗΟΩαεηοω]υ"

-- Short local aliases for the mw.ustring functions used in this module.
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local U = mw.ustring.char
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

-- U+25CC, the character that addDottedCircle inserts before each diacritic.
local dottedCircle = U(0x25CC)

-- Stores chars in list[index] (appending when that slot already holds
-- something) and returns text with its first len(chars) characters removed.
-- Raises an error when chars is nil or false.
local function add(list, index, chars, text)
	if not chars then
		error("The function add cannot act on a nil character.")
	end
	-- Extend the existing entry if there is one, otherwise start a new entry.
	local existing = list[index]
	list[index] = existing and (existing .. chars) or chars
	-- Skip past the characters that were just consumed.
	local consumed = mw.ustring.len(chars)
	return sub(text, consumed + 1)
end

-- Inserts a dotted circle in front of every diacritic found in text and
-- returns the modified string. Returns nothing when text is not a string.
function export.addDottedCircle(text)
	if type(text) ~= "string" then
		return
	end
	-- Parentheses drop gsub's second result (the substitution count).
	return (gsub(text, "(" .. diacritic .. ")", dottedCircle .. "%1"))
end

-- Forwards term and face to tag_text in Module:script utilities, supplying
-- this module's language object (grc) and script object (polytonic).
function export.tag(term, face)
	return m_script_utils.tag_text(
		term,
		lang,
		sc,
		face
	)
end

-- Builds the argument table for full_link in Module:links from term, alt and
-- tr plus this module's language and script objects, then calls it with face.
function export.link(term, face, alt, tr)
	local data = {
		term = term,
		alt = alt,
		lang = lang,
		sc = sc,
		tr = tr,
	}
	return m_links.full_link(data, face)
end

-- Calls language_link in Module:links for term with display text alt, using
-- this module's language object. No script object is passed.
local function linkNoTag(term, alt)
	local data = {
		term = term,
		lang = lang,
		alt = alt,
	}
	return m_links.language_link(data)
end

-- Normalizes text to NFD and then applies every substitution in the
-- conversions data table (nonstandard pattern -> standard replacement),
-- turning spacing diacritics into combining ones and nonstandard characters
-- into standard polytonic Greek.
function export.standardDiacritics(text)
	local result = toNFD(text)
	for from, to in pairs(conversions) do
		result = gsub(result, from, to)
	end
	return result
end

--[=[	Rewrites every run of two or more diacritics so that its members
		appear in this order:
			1. macron or breve
			2. breathings or diaeresis
			3. acute, circumflex, or grave
			4. iota subscript
		The text is decomposed (NFD) first and is returned decomposed.
		Used by [[Module:typing-aids]].
]=]
function export.reorderDiacritics(text)
	text = toNFD(text)
	
	-- Each run is taken from the decomposed input; substitutions are then
	-- applied to the working copy of the text.
	for run in gmatch(text, diacritic .. diacritic .. "+") do
		-- One slot per group, in group order; a group with no match
		-- contributes the empty string.
		local ordered = {}
		for position, pattern in ipairs(groups) do
			ordered[position] = match(run, pattern) or ""
		end
		
		text = gsub(text, run, table.concat(ordered))
	end
	
	return text
end

--[=[	Breaks a word into meaningful "tokens", which are
		individual letters or diphthongs with their diacritics.
		
		text:    the string to tokenize. Anything that is neither a string
		         nor a number raises "Text is not a string".
		Returns: the list of tokens, and an error message string that is
		         empty unless the number of tokens produced differs from
		         the number of tokens counted while scanning.
		
		Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].	]=]

function export.tokenize(text)
	
	-- Validate before any string function touches the input; checking
	-- afterwards can never fire. Numbers are stringified rather than
	-- rejected so that callers relying on implicit coercion keep working.
	-- NOTE(review): confirm whether numeric input ever occurs in practice.
	if type(text) == "number" then
		text = tostring(text)
	elseif type(text) ~= "string" then
		error("Text is not a string", 2)
	end
	
	-- standardize, decompose, and reorder diacritics
	text = export.standardDiacritics(text)
	text = export.reorderDiacritics(text)
	
	local tokens = {}
	-- i tracks our position in the table of tokens.
	local i = 0
	while mw.ustring.len(text) > 0 do
		local char = sub(text, 1, 1) or ""
		local chars = sub(text, 1, 2) or ""
		local nextchars = sub(text, 3, 4) or ""
		-- Look for a diacritic and add it to the current token. Remove it from the text.
		if find(char, diacritic) then
			text = add(tokens, i, char, text)
	--[[	See if the next two characters form a diphthong and if so,
			add them to the current token. Remove them from the text.
			If there's a diaeresis, it will be immediately after
			the second of the two characters, or after a macron or breve.	]]
		elseif ( find(chars, '^' .. i_diphthong .. '$') or  find(chars, '^' .. u_diphthong .. '$') ) and not match(nextchars, "^[" .. macron .. breve .. "]?" .. diaeresis) then
			i = i + 1
			text = add(tokens, i, chars, text)
		else
		-- Add the current character to the next token. Remove it from the text.
			i = i + 1
			text = add(tokens, i, char, text)
		end
	end
	
	-- NOTE(review): kept for compatibility; whether this field survives
	-- compressSparseArray is not visible from here, so the check below
	-- uses the local counter instead.
	tokens.maxindex = i
	
	tokens = require("Module:table").compressSparseArray(tokens)
	
	-- Compare with ~=. Writing "not a == b" parses as "(not a) == b",
	-- which compares a boolean with a number and is therefore never true.
	local err = ""
	if i ~= #tokens then
		err = "There must have been a nil value in the tokens table."
	end
	
	return tokens, err
end

-- Template entry point. Tokenizes frame.args[1] and returns one wikitable
-- row ("|-" followed by three cells): the tagged input, the tagged
-- comma-separated tokens, and any error message from tokenize.
-- A token consisting of a single space is shown as a gray non-breaking space.
-- Raises an error when the first parameter is missing.
function export.printTokens(frame)
	-- Declared local so that the value does not leak into the global environment.
	local text = frame.args[1]
	
	if not text then
		error("Provide text to tokenize in first parameter.")
	end
	
	local tokens, err = export.tokenize(text)
	-- Only existing keys are reassigned, which is permitted during pairs().
	for i, token in pairs(tokens) do
		if token == " " then
			tokens[i] = '<span style="background-color: lightgray;">&nbsp;</span>'
		end
	end
	return "|-\n| " .. export.tag(text) .. " || " .. export.tag(table.concat(tokens, ", ")) .. " || " .. err
end

--[=[	Standardizes the diacritics in text, then rewrites every run of two
		or more diacritics into this order:
			1. breathings or diaeresis
			2. acute, circumflex, or grave
			3. macron or breve
			4. iota subscript
		Afterwards combining macrons and breves are replaced by their
		spacing counterparts and the result is returned in NFC.
		Used by [[Module:grc-pronunciation]].		]=]
function export.pronunciationOrder(text)
	text = export.standardDiacritics(text)
	
	-- Indices into groups, listed in the order the output should have:
	-- breathing/diaeresis, accent, macron/breve, iota subscript.
	local group_order = { 2, 3, 1, 4 }
	
	for run in gmatch(text, diacritic .. diacritic .. "+") do
		local pieces = {}
		for slot, group_index in ipairs(group_order) do
			pieces[slot] = match(run, groups[group_index]) or ""
		end
		
		text = gsub(text, run, table.concat(pieces))
	end
	
	-- combining to spacing macron, then combining to spacing breve
	text = gsub(text, macron, spacing_macron)
	text = gsub(text, breve, spacing_breve)
	
	return toNFC(text)
end

-- Finds vowels of ambiguous length in text.
-- A token counts as ambiguous when it contains no consonant, consists of
-- α, ι or υ followed only by diacritics, and carries none of: macron, breve,
-- circumflex, iota subscript.
-- Returns two tables: a list with one entry per ambiguous vowel (the vowel
-- plus its diacritics, language-tagged, or just the bare vowel when noTag is
-- set), and a set keyed by the lowercased vowel letters that were found.
-- Raises an error when text is missing or not a string.
function export.findAmbig(text, noTag)
	if type(text) ~= "string" then
		error("The input to function findAmbig is nonexistent or not a string")
	end
	
	-- breaks the word into units
	local tokens = export.tokenize(text)
	if not tokens then
		error("No tokens.")
	end
	if type(tokens) ~= "table" then
		error("tokens aren't a table.")
	end
	
	-- A lone α, ι or υ followed by any number of diacritics.
	local ambiguous_pattern = "^([αιυ])(" .. diacritic .. "*)$"
	
	local output, vowels = {}, {}
	-- NOTE(review): pairs() is used because an earlier note claimed tokens[1]
	-- may be nil; re-check against tokenize before switching to ipairs().
	for _, token in pairs(tokens) do
		local vowel, marks
		if not find(token, m_data.consonant) then
			vowel, marks = match(token, ambiguous_pattern)
		end
		
		if vowel then
			-- Any of these marks settles the vowel's length.
			local length_marked = marks and (
				find(marks, macron)
				or find(marks, breve)
				or find(marks, circumflex)
				or find(marks, subscript) )
			
			if not length_marked then
				local entry = vowel
				if not noTag then
					entry = export.tag(vowel .. marks)
				end
				table.insert(output, entry)
				
				-- Lists the vowel letters that are ambiguous, for categorization purposes.
				vowels[mw.ustring.lower(vowel)] = true
			end
		end
	end
	
	return output, vowels
end

-- Template entry point for demonstrating one of this module's functions.
-- frame.args[1] names the exported function to run and frame.args[2] is the
-- text passed to it. Returns a string of the form
--   " term (term_decomposition) → result (result_decomposition)"
-- where each decomposition shows the diacritics on dotted circles.
-- Raises an error when either parameter is missing or when the first
-- parameter does not name an exported function.
function export.printDiacritics(frame)
	local functionToPrint = frame.args[1] or error('Specify a function in the first parameter.')
	local term = frame.args[2] or error('Add text in the second parameter.')
	
	-- Fail with a readable message instead of "attempt to call a nil value".
	local func = export[functionToPrint]
	if type(func) ~= "function" then
		error('No function named "' .. tostring(functionToPrint) .. '" is exported by this module.')
	end
	
	local result = func(term)
	
	-- Show diacritics above or below a dotted circle.
	-- Declared local so that the table does not leak into the global environment.
	local content = {
		term = export.tag(term),
		term_decomposition = export.tag(export.addDottedCircle(toNFD(term))),
		result = export.tag(result),
		result_decomposition = export.tag(export.addDottedCircle(result)),
	}
	
	local output = [[ term (term_decomposition) → result (result_decomposition)]]
	
	-- Each word in the template is replaced by the content entry of the same
	-- name; a word with no entry is replaced by the empty string.
	local function addContent(item)
		return content[item] or ""
	end
	
	output = gsub(output, "[%a_]+", addContent)

	return output
end

-- Template entry point. Shows the text in parameter 1 followed, in
-- parentheses, by its decomposed (NFD) form with every diacritic placed on a
-- dotted circle. When the boolean parameter "link" is set, the composed form
-- is a full link (with transliteration "-") and each character of the
-- decomposition is linked individually.
function export.decompose(frame)
	local params = {
		[1] = {},
		["link"] = { type = "boolean" },
	}
	
	-- Declared local so that the table does not leak into the global environment.
	local args = require("Module:parameters").process(frame.args, params)
	
	local text = toNFD(args[1])
	local link = args.link
	local composed
	
	if link then
		composed = export.link(text, nil, nil, "-")
	else
		composed = export.tag(text)
	end
	
	local decomposed = export.addDottedCircle(text)
	
	if link then
		local result = {}
		-- seat is either a dotted circle or the empty string; letter is
		-- always exactly one character, so no nil check is needed.
		for seat, letter in gmatch(decomposed, "(" .. dottedCircle .. "?)(.)") do
			-- Link to the bare character, displaying it together with its seat.
			-- The intermediate local keeps only the first return value.
			local linked = linkNoTag(letter, seat .. letter)
			table.insert(result, linked)
		end
		decomposed = table.concat(result)
	end
	
	decomposed = export.tag(decomposed)
	
	return composed .. " (" .. decomposed .. ")"
end

return export