-- Name of this module; used as the prefix of the error messages raised by
-- export.format().
local module_name = "string_utilities"
-- Table collecting the module's public functions; it is returned at the end
-- of the file.
local export = {}
-- Short alias for mw.ustring.find.
local rfind = mw.ustring.find
-- Escape names understood by export.format(): a placeholder written with a
-- single backslash, such as {\op} or {\cl}, is replaced by the literal brace
-- stored here under that name.
local format_escapes = {
	op = "{",
	cl = "}",
}
--- Fill in the brace-delimited placeholders of a template string.
--
-- Every "{...}" group in `str` whose content contains no braces is replaced:
--   * "{name}" becomes tbl[name];
--   * "{\name}" (exactly one backslash) is an escape sequence and becomes
--     format_escapes[name], i.e. "{\op}" yields "{" and "{\cl}" yields "}";
--   * with two leading backslashes the second one stays part of the key
--     that is looked up in `tbl`.
--
-- @param str  Template string containing the placeholders.
-- @param tbl  Table mapping placeholder names to their replacements.
-- @return     The string with all placeholders substituted (the substitution
--             count that string.gsub also produces is discarded).
-- Raises an error when an escape name is unknown or when a placeholder name
-- has no (truthy) entry in `tbl`.
function export.format(str, tbl)
	local function substitute(outer_slash, key, inner_slash)
		local slash_count = #outer_slash + #inner_slash
		if slash_count ~= 1 then
			-- Zero or two backslashes: ordinary lookup in the caller's table.
			return tbl[key] or error(module_name .. ".format: '" .. key .. "' not found in table")
		end
		-- Exactly one backslash: built-in escape sequence.
		return format_escapes[key] or error(module_name .. ".format: unrecognized escape sequence '{\\" .. key .. "}'")
	end

	-- Assigning to a single local keeps only gsub's first result.
	local formatted = string.gsub(str, "{(\\?)((\\?)[^{}]*)}", substitute)
	return formatted
end
--- Split `str` on `pattern`, keeping whatever the pattern's capturing groups
-- matched.
--
-- Reimplementation of mw.ustring.split() that includes any capturing groups
-- of the splitting pattern in the result. This works like Python's
-- re.split(), except that it has Lua's behavior when the split pattern
-- matches the empty string: it advances one character at a time, whereas
-- Python returns the whole remainder of the string.
--
-- @param str      String to split.
-- @param pattern  Ustring pattern describing the separator; it may contain
--                 capturing groups.
-- @return         List holding the pieces of `str` between separator
--                 matches; each piece is followed by the values captured by
--                 the separator match that ended it.
function export.capturing_split(str, pattern)
	local ret = {}
	-- Every position used below is a character offset: both the `init`
	-- argument of mw.ustring.match() and the value of a "()" position capture
	-- count characters, not bytes. The end of the string must therefore be
	-- detected with the character count; comparing against the byte length
	-- (#str) lets the loop run past the end of non-ASCII strings when the
	-- pattern can match the empty string. mw.ustring.len() returns nil for
	-- invalid UTF-8; fall back to the byte length in that case so that
	-- mw.ustring.match() below is still reached.
	local len = mw.ustring.len(str) or #str
	-- (.-) corresponds to (.*?) in Python or Perl; () captures the
	-- current position after matching.
	pattern = "(.-)" .. pattern .. "()"
	local start = 1
	while true do
		-- Did we reach the end of the string? Then the last separator match
		-- ended exactly at the end, so the final piece is empty.
		if start > len then
			table.insert(ret, "")
			return ret
		end
		-- match() returns all captures as multiple return values;
		-- we need to insert into a table to get them all.
		local captures = {mw.ustring.match(str, pattern, start)}
		-- If no match, add the remainder of the string.
		if #captures == 0 then
			table.insert(ret, mw.ustring.sub(str, start))
			return ret
		end
		-- The last capture is the "()" appended above: the position just
		-- after the separator match.
		local newstart = table.remove(captures)
		-- Special case: If we don't advance by any characters, then advance
		-- by one character; this avoids an infinite loop, and makes splitting
		-- by an empty string work the way mw.ustring.split() does. If we
		-- reach the end of the string this way, return immediately, so we
		-- don't get a final empty string.
		if newstart == start then
			table.insert(ret, mw.ustring.sub(str, start, start))
			-- Drop the (empty) text captured by the leading (.-).
			table.remove(captures, 1)
			start = start + 1
			if start > len then
				return ret
			end
		else
			-- The first capture is the text preceding the separator.
			table.insert(ret, table.remove(captures, 1))
			start = newstart
		end
		-- Insert any captures from the splitting pattern.
		for _, x in ipairs(captures) do
			table.insert(ret, x)
		end
	end
end
--- Change the case of the first letter of `text`.
--
-- When `text` begins with a wikilink ("[[target]]" or "[[target|label]]"),
-- the first letter of the displayed text is re-cased instead and the link is
-- always emitted in piped form "[[target|Label]]", followed by the rest of
-- the input unchanged.
--
-- @param text     String to process.
-- @param dolower  Truthy to lowercase the first letter, falsy to uppercase it.
-- @return         The re-cased string.
local function uclcfirst(text, dolower)
	local ustring = mw.ustring
	-- Pick the case conversion once; it is applied to a single character.
	local convert = dolower and ustring.lower or ustring.upper

	local function recase_first(s)
		return convert(ustring.sub(s, 1, 1)) .. ustring.sub(s, 2)
	end

	-- Matches a leading link, piped or not; for an unpiped link the second
	-- capture (the label) is the empty string.
	local target, label, rest = ustring.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")
	if not target then
		return recase_first(text)
	end

	-- An unpiped link displays its target, so that is what gets re-cased.
	if label == "" then
		label = target
	end
	return "[[" .. target .. "|" .. recase_first(label) .. "]]" .. rest
end
--- Uppercase the first letter of `text`.
-- Delegates to uclcfirst(), which re-cases the displayed text of a leading
-- wikilink when there is one.
-- @param text  String to process.
-- @return      The string with its first letter uppercased.
function export.ucfirst(text)
	local dolower = false
	return uclcfirst(text, dolower)
end
--- Lowercase the first letter of `text`.
-- Delegates to uclcfirst(), which re-cases the displayed text of a leading
-- wikilink when there is one.
-- @param text  String to process.
-- @return      The string with its first letter lowercased.
function export.lcfirst(text)
	local dolower = true
	return uclcfirst(text, dolower)
end
--- Prepend "a" or "an" (followed by a space) to `text`.
--
-- "an" is chosen when the first character is one of A, E, I, O, U in either
-- case. The test looks only at the spelling of that first letter. When
-- `text` begins with a wikilink, the letter examined is the first one of the
-- link's displayed text: the label of a piped link, otherwise the target.
--
-- @param text       String to prefix.
-- @param uppercase  Truthy to capitalize the article ("A " / "An ").
-- @return           The article, a space, and the unchanged `text`.
function export.add_indefinite_article(text, uppercase)
	-- Matches a leading link, piped or not; for an unpiped link the second
	-- capture (the label) is the empty string.
	local target, label = mw.ustring.match(text, "^%[%[([^|%]]+)%|?(.-)%]%](.*)$")

	-- Decide which string supplies the letter to examine.
	local examined = text
	if target then
		if label ~= "" then
			examined = label
		else
			examined = target
		end
	end

	local article
	if rfind(examined, "^[AEIOUaeiou]") then
		article = uppercase and "An " or "an "
	else
		article = uppercase and "A " or "a "
	end
	return article .. text
end
-- Return the table of public functions as the value of this module.
return export