-- Table of functions exported by this module.
local p = {}

-- Shortcuts into the Unicode-aware string library supplied by Scribunto,
-- plus the byte-oriented gmatch from the standard string library.
local ustring = mw.ustring
local U, find, gsub = ustring.char, ustring.find, ustring.gsub
local decompose = ustring.toNFD
local lower, upper = ustring.lower, ustring.upper
local str_gmatch = string.gmatch

-- Combining diacritics, built from their code points.
local macron, breve = U(0x304), U(0x306)
local rough, smooth = U(0x314), U(0x313)
local diaeresis = U(0x308)
local acute, grave = U(0x301), U(0x300)
local circumflex, Latin_circumflex = U(0x342), U(0x302)
local subscript = U(0x345)

-- Pattern: a macron, an optional diaeresis, then a Latin circumflex.
local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

-- Letters before which γ is transliterated as "n".
local is_velar = {
    ['κ'] = true,
    ['γ'] = true,
    ['χ'] = true,
    ['ξ'] = true,
}

-- Byte-level pattern matching one UTF-8 encoded character.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
-- Byte-level pattern matching one two-byte character with lead byte 206 or 207.
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ

-- Maps a single character to its character-class table (filled in below).
local info = {}
-- Character-class tables. Each one is shared by every character of its class,
-- which saves space and lets code test a character's class with a plain
-- equality (identity) comparison.
local vowel = {
    vowel = true,
    diacritic_seat = true,
}
-- iota and upsilon carry the same flags but must stay separate tables so that
-- they can be told apart by identity.
local iota = {
    vowel = true,
    diacritic_seat = true,
    offglide = true,
}
local upsilon = {
    vowel = true,
    diacritic_seat = true,
    offglide = true,
}
-- Technically rho is only a seat for rough or smooth breathing.
local rho = {
    consonant = true,
    diacritic_seat = true,
}
local consonant = {
    consonant = true,
}
local diacritic = {
    diacritic = true,
}
-- Same flag as `diacritic`, but a distinct table: needed for equality
-- comparisons that single out the breathings.
local breathing = {
    diacritic = true,
}
-- Registers the class table `t` in `info` for every character in `characters`.
-- `characters` is either a string, which is split into UTF-8 characters,
-- or an array of single-character strings.
local function add_info(characters, t)
    local function register(character)
        info[character] = t
    end

    if type(characters) ~= "string" then
        for _, character in ipairs(characters) do
            register(character)
        end
        return
    end

    for character in str_gmatch(characters, UTF8_char) do
        register(character)
    end
end
-- Ordinary combining diacritics.
add_info({ macron, breve, diaeresis, acute, grave, circumflex, subscript }, diacritic)
-- Breathings get their own class table so they can be recognised by identity.
add_info({ rough, smooth }, breathing)

add_info("ΑΕΗΟΩαεηοω", vowel)
add_info("Ιι", iota)
add_info("Υυ", upsilon)

-- The consonant list includes Ρ and ρ; the following call replaces their
-- entries with the rho class, so the order of these two calls matters.
add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant)
add_info("Ρρ", rho)

-- Any character that was not registered above resolves to this empty table,
-- so that fields of info[character] can always be indexed safely.
local not_recognized = {}
setmetatable(info, {
    __index = function()
        return not_recognized
    end,
})
-- Wraps `str` in curly double quotation marks.
local function quote(str)
    local opening, closing = "“", "”"
    return opening .. str .. closing
end
-- Transliteration table: maps one lowercase Greek letter or one combining
-- diacritic to its Latin replacement. Characters without an entry
-- (macron, diaeresis, grave, acute among them) are left unchanged.
local tt = {
    -- Vowels; the long vowels receive a combining macron.
    ["α"] = "a", ["ε"] = "e", ["ι"] = "i", ["ο"] = "o", ["υ"] = "u",
    ["η"] = "e" .. macron,
    ["ω"] = "o" .. macron,

    -- Consonants written with a single Latin letter.
    ["β"] = "b", ["γ"] = "g", ["δ"] = "d", ["ζ"] = "z",
    ["κ"] = "k", ["λ"] = "l", ["μ"] = "m", ["ν"] = "n",
    ["ξ"] = "x", ["π"] = "p", ["ρ"] = "r", ["τ"] = "t",
    -- Both forms of sigma.
    ["σ"] = "s", ["ς"] = "s",
    -- Consonants written with two Latin letters.
    ["θ"] = "th", ["φ"] = "ph", ["χ"] = "kh", ["ψ"] = "ps",

    -- Archaic letters
    ["ϝ"] = "w", ["ϻ"] = "ś", ["ϙ"] = "q", ["ϡ"] = "š", ["ͷ"] = "v",

    -- Diacritics that are dropped from the output.
    [breve] = "",
    [smooth] = "",
    [rough] = "",
    -- Diacritics that are replaced.
    [circumflex] = Latin_circumflex,
    [subscript] = "i",
}
--[=[
    Breaks a word into meaningful "tokens", which are individual letters or
    diphthongs together with the diacritics that follow them.
    Used by [[Module:grc-accent]] and [[Module:grc-pronunciation]].

    Parameter: text, a string; it is NFD-decomposed before being scanned.
    Returns: an array of token strings, in decomposed form.
    Raises an error when a diacritic follows a character that cannot carry
    it. The function never returns anything but a table, so callers can
    iterate over the result unconditionally.
--]=]
local function tokenize(text)
    local tokens, prev_info = {}, {}
    local token_i = 1
    local prev
    for character in str_gmatch(decompose(text), UTF8_char) do
        local curr_info = info[character]
        if curr_info.vowel then
            -- Split vowels between tokens if not a diphthong.
            if prev and (not (curr_info.offglide and prev_info.vowel)
                    -- υυ → υ, υ
                    -- ιυ → ι, υ
                    or prev_info.offglide and curr_info == upsilon) then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        elseif curr_info.diacritic then
            -- A diacritic always joins the token of the letter before it.
            tokens[token_i] = (tokens[token_i] or "") .. character
            if prev_info.vowel or prev_info.diacritic then
                if character == diaeresis then
                    -- Current token is vowel, vowel, possibly other diacritics,
                    -- and a diaeresis.
                    -- Split the current token into two:
                    -- the first letter, then the second letter plus any diacritics.
                    local previous_vowel, vowel_with_diaeresis = string.match(
                        tokens[token_i],
                        "^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)"
                    )
                    if previous_vowel then
                        tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
                        token_i = token_i + 1
                    end
                end
            elseif prev_info == rho then
                -- Rho accepts only a breathing. Raise, like the branch
                -- below, rather than returning the message as if it were
                -- the token list.
                if curr_info ~= breathing then
                    error(string.format("The character %s canna hae the accent %s on it.", prev, "◌" .. character))
                end
            else
                error("The character " .. quote(prev) .. " cannot have a diacritic on it.")
            end
        elseif curr_info == rho then
            -- Start a new token unless the current token is a rho followed
            -- by a breathing, in which case this rho is appended to it.
            if prev and not (prev_info == breathing and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        else
            -- Any other character (consonant or unrecognized) is a token
            -- of its own.
            if prev then
                token_i = token_i + 1
            end
            tokens[token_i] = (tokens[token_i] or "") .. character
        end
        prev = character
        prev_info = curr_info
    end
    return tokens
end
--[=[
    Transliterates Ancient Greek text into Latin script.

    Parameter: text, a string of Greek text.
    Returns: the transliteration as a string (in decomposed form, since the
    input is NFD-decomposed and diacritics are handled as combining marks).
    Errors raised by tokenize propagate to the caller.
--]=]
function p.transliterate(text)
    text = decompose(text)
    --[[
    if text == '῾' then
        return 'h'
    end
    --]]
    --[[
        Replace semicolon or Greek question mark with regular question mark,
        except after an ASCII alphanumeric character (to avoid converting
        semicolons in HTML entities).
    --]]
    text = gsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
    -- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
    text = text:gsub("·", ";")
    local tokens = tokenize(text)
    -- Now read the tokens. ipairs is used because the output must follow the
    -- order of the tokens and neighbouring tokens are inspected by index;
    -- pairs does not guarantee any traversal order.
    local output = {}
    for i, token in ipairs(tokens) do
        -- Substitute each character in the token for its transliteration.
        local translit = gsub(lower(token), '.', tt)
        if token == 'γ' and is_velar[tokens[i + 1]] then
            -- γ before a velar should be <n>
            translit = 'n'
        elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
            -- ρ after ρ should be <rh>
            translit = 'rh'
        elseif find(token, '^[αΑ].*' .. subscript .. '$') then
            -- add macron to ᾳ
            translit = gsub(translit, '([aA])', '%1' .. macron)
        end
        -- Rough breathing becomes "h": after rho, before a vowel.
        if token:find(rough) then
            if find(token, '[Ρρ]') then
                translit = translit .. 'h'
            else -- vowel
                translit = 'h' .. translit
            end
        end
        -- Remove macron from a vowel that has a circumflex.
        if find(translit, macron_circumflex) then
            translit = translit:gsub(macron, '')
        end
        -- Capitalize first character of transliteration when the token
        -- contains an uppercase letter.
        if token ~= lower(token) then
            translit = gsub(translit, "^.", upper)
        end
        output[#output + 1] = translit
    end
    return table.concat(output)
end
-- Template entry point: transliterates the first positional argument (taken
-- from the invocation itself, else from the parent frame) and wraps the
-- result in an italic span marked as lang="grc-Latn".
function p.translit(frame)
    local parent_args = frame:getParent().args
    local text = frame.args[1]
    if not text then
        text = parent_args[1]
    end

    local opening = '<span title="Ancient Greek transliteration" lang="grc-Latn"><i>'
    local closing = '</i></span>'
    return opening .. p.transliterate(text) .. closing
end
-- Template entry point: returns the plain transliteration of the first
-- positional argument, without any HTML wrapper. The parent frame is
-- consulted only when the invocation itself supplies no argument.
function p.bare_translit(frame)
    local text = frame.args[1]
    if not text then
        text = frame:getParent().args[1]
    end
    return p.transliterate(text)
end
return p