Aller au contenu

Module:bac à sable/Darmo117/regex

Définition, traduction, prononciation, anagramme et synonyme sur le dictionnaire libre Wiktionnaire.

La documentation pour ce module peut être créée à Module:bac à sable/Darmo117/regex/Documentation

local m_table = require('Module:table')

-- Table of public functions and flags; it is returned at the end of the module.
local p = {}

-- Constants

-- Marker inserted in the tree when an unescaped "^" is read.
local START = 'start'
-- Marker inserted in the tree when an unescaped "$" is read.
local END = 'end'

-- Key of an alternation node; its value is the array of alternative sequences.
local OR = 'or'
-- Key of a group node; its value is the compiled sub-tree of the group.
local GROUP = 'group'
-- Key of a character class node; its value is a predicate function.
local CLASS = 'class'

-- Keys of a quantifier node (see createQuantifier):
-- the element being repeated,
local QUANTIFIED = 'quantified'
-- the quantifier symbol ("*", "+", "?" or "{}"),
local SYMBOL = 'symbol'
-- and whether the quantifier has been marked non-greedy.
local NON_GREEDY = 'non-greedy'

-- TODO find the other classes
--- Predicates of the predefined character classes, indexed by the character
--- that follows the backslash. Each predicate receives a single character and
--- tells whether it belongs to the class.
--- The definitions are the ASCII ones PCRE uses by default (no Unicode
--- properties), so accented letters are not word characters.
local PREDEFINED_CLASSES = {
  -- Word character: ASCII letter, ASCII digit or underscore.
  ['w'] = function(c)
    return type(c) == 'string' and string.find(c, '^[A-Za-z0-9_]$') ~= nil
  end,
  -- Decimal digit.
  ['d'] = function(c)
    return type(c) == 'string' and string.find(c, '^[0-9]$') ~= nil
  end,
  -- White space: space, tab, line feed, vertical tab, form feed, carriage return.
  ['s'] = function(c)
    return type(c) == 'string' and string.find(c, '^[ \t\n\v\f\r]$') ~= nil
  end,
  -- TODO "\b" is a word boundary: it depends on the neighbouring characters,
  -- so it cannot be decided from a single character and is left unimplemented.
  ['b'] = function(c)
    return nil -- TODO
  end,
}

-- Register the negated counterpart of every class defined so far under the
-- upper-case key (e.g. "D" for "d").
-- The negations are first collected in a temporary table: adding new keys to a
-- table while it is being traversed with pairs() has undefined behaviour, and
-- a freshly added upper-case key could itself be visited and negated again.
do
  local negatedClasses = {}

  for class, predicate in pairs(PREDEFINED_CLASSES) do
    negatedClasses[mw.ustring.upper(class)] = function(c)
      return not predicate(c)
    end
  end

  for class, predicate in pairs(negatedClasses) do
    PREDEFINED_CLASSES[class] = predicate
  end
end

-- Classes without a negated upper-case counterpart: line feed, carriage
-- return, and the dot, which accepts every character.
PREDEFINED_CLASSES.n = function(c)
  local isLineFeed = (c == '\n')
  return isLineFeed
end

PREDEFINED_CLASSES.r = function(c)
  local isCarriageReturn = (c == '\r')
  return isCarriageReturn
end

PREDEFINED_CLASSES['.'] = function()
  return true
end

-- Utility functions

--- Tells whether the given tree element is a group node,
--- i.e. a table that has an entry under the GROUP key.
local function isGroup(element)
  if type(element) ~= 'table' then
    return false
  end
  return element[GROUP] ~= nil
end

--- Tells whether the given tree element is a character class node,
--- i.e. a table whose CLASS entry is a predicate function.
local function isClass(element)
  if type(element) ~= 'table' then
    return false
  end
  return type(element[CLASS]) == 'function'
end

--- Tells whether the given tree element is a literal, i.e. a string.
local function isLiteral(element)
  local kind = type(element)
  return kind == 'string'
end

--- Tells whether the given tree element is a quantifier node whose symbol
--- is "*" or "+".
local function isStarOrPlusQuantifier(element)
  if type(element) ~= 'table' then
    return false
  end
  local symbol = element[SYMBOL]
  return symbol == '*' or symbol == '+'
end

--- Raises a "nothing to repeat" error unless the given element is a group,
--- a character class or a literal.
--- @param element any The tree element a quantifier is about to be applied to.
--- @param i number The position reported in the error message.
local function checkIfQuantifiable(element, i)
  local quantifiable = isGroup(element) or isClass(element) or isLiteral(element)
  if not quantifiable then
    error(mw.ustring.format('nothing to repeat at position %d', i), 0)
  end
end

--- Builds a quantifier node; the node starts out greedy.
--- @param min number The minimum number of repetitions.
--- @param max number The maximum number of repetitions.
--- @param value any The tree element being repeated.
--- @param symbol string The quantifier symbol.
--- @return table The quantifier node.
local function createQuantifier(min, max, value, symbol)
  local quantifier = {}
  quantifier[QUANTIFIED] = value
  quantifier[SYMBOL] = symbol
  quantifier[NON_GREEDY] = false
  quantifier.min = min
  quantifier.max = max
  return quantifier
end

-- Functions

--- Applies a simple quantifier (*, + or ?) to the last element of the tree:
--- that element is replaced by a quantifier node wrapping it.
--- Raises an error if the last element cannot be quantified.
--- @param tree table The current regex tree.
--- @param c string The current character.
--- @param i number The current index.
local function handleQuantifier(tree, c, i)
  local position = m_table.length(tree)
  local target = tree[position]

  checkIfQuantifiable(target, i - 1)

  -- { min, max } repetition bounds of each simple quantifier.
  local bounds = {
    ['*'] = { 0, math.huge },
    ['+'] = { 1, math.huge },
    ['?'] = { 0, 1 },
  }
  local range = bounds[c] or {}

  tree[position] = createQuantifier(range[1], range[2], target, c)
end

--- Compiles the given regex.
--- May throw an error if a syntax error is encountered.
--- The function calls itself for every "(" group and every "{" quantifier
--- range. No branch handles bracket classes: "[" is stored as a literal.
--- @param regex string The regular expression to compile.
--- @param endChar string|nil The character to stop at: ")" for a group, "}" for
---                       a quantifier range, nil for the root regex.
---                       If it is not found, an error is raised, except when a
---                       quantifier range turns out to be literal text.
--- @param startIndex number The index to start at in the regex.
--- @return table The compiled regular expression as a tree. For a quantifier
---               range, the table holds the consumed characters plus the fields
---               "min" and "max" (both nil if the range is literal text).
--- @return number The index where parsing stopped; the caller resumes at the
---                index that follows it.
local function pcreCompile_impl(regex, endChar, startIndex)
  local regexTree = {}
  local endCharFound = false
  local len = mw.ustring.len(regex)
  local inGroup = endChar == ')'
  local inOr = false
  local inQuantifierRange = endChar == '}'
  local commaFoundInRange = false
  local escape = false
  local i = startIndex
  local currentTree = regexTree

  if inQuantifierRange then
    if i > len then
      -- The "{" was the last character of the regex: it is literal text.
      -- Step back so that the caller resumes right after the "{".
      return regexTree, i - 1
    end
    -- -1 means "no digit read yet" for that bound.
    currentTree.min = -1
    currentTree.max = -1
  end

  -- A "while" loop (rather than "repeat ... until i == len + 1") guarantees
  -- termination when parsing starts past the end of the regex, e.g. for an
  -- empty regex or a "(" that is the last character.
  while not endCharFound and i <= len do
    local c = mw.ustring.sub(regex, i, i)

    if not escape and c == endChar then
      if inQuantifierRange then
        if currentTree.min ~= -1 then
          if currentTree.max == -1 then
            if commaFoundInRange then
              -- "{n,}": no upper bound.
              currentTree.max = math.huge
            else
              -- "{n}": exact count.
              currentTree.max = currentTree.min
            end
            -- Quantifier range is out of order.
          elseif currentTree.max < currentTree.min then
            error(mw.ustring.format('quantifier range min greater than max at position %d', i), 0)
          end
        else
          -- No minimum was given: the braces are literal text.
          currentTree.min = nil
          currentTree.max = nil
          -- To keep the "}" after returning.
          i = i - 1
        end
      end

      endCharFound = true
    else

      if inQuantifierRange then
        -- Keep the raw character in case the range turns out to be literal text.
        table.insert(currentTree, c)

        local digit = tonumber(c)

        if digit ~= nil then
          if commaFoundInRange then
            if currentTree.max == -1 then
              currentTree.max = 0
            end
            currentTree.max = currentTree.max * 10 + digit
          else
            if currentTree.min == -1 then
              currentTree.min = 0
            end
            currentTree.min = currentTree.min * 10 + digit
          end
        elseif c == ',' then
          commaFoundInRange = true
        else
          -- Neither a digit nor a comma: the braces are literal text.
          currentTree.min = nil
          currentTree.max = nil
          -- Set "endChar" to nil to avoid raising the error before the last return.
          endChar = nil
          -- Remove last char and step back in case "c" is a "(".
          table.remove(currentTree, m_table.length(currentTree))
          i = i - 1
          break
        end

        -- In group or root regex
      else

        -- Capture group
        if not escape and c == '(' then
          -- TODO handle ?:, ?<=, ?=, ?<!, ?! and ?> (atomic group = no backtrack)
          local subGroup, endIndex = pcreCompile_impl(regex, ')', i + 1)
          table.insert(currentTree, { [GROUP] = subGroup })
          i = endIndex

          -- Stray end of group
        elseif not escape and c == ')' and not inGroup then
          error(mw.ustring.format('unexpected ")" at position %d', i), 0)

          -- Alternative
        elseif not escape and c == '|' then
          if not inOr then
            inOr = true
            -- What was compiled so far becomes the first alternative.
            regexTree = { [OR] = { regexTree } }
          end
          table.insert(regexTree[OR], {})
          currentTree = regexTree[OR][m_table.length(regexTree[OR])]

          -- Line/string start
        elseif not escape and c == '^' then
          table.insert(currentTree, START)

          -- Line/string end
        elseif not escape and c == '$' then
          table.insert(currentTree, END)

          -- Dot special character or escaped predefined class.
          -- An escaped dot is excluded: it must fall through to the literal case.
        elseif (not escape and c == '.') or (escape and c ~= '.' and PREDEFINED_CLASSES[c]) then
          table.insert(currentTree, { [CLASS] = PREDEFINED_CLASSES[c] })
          -- The escape only applies to this character.
          escape = false

          -- Quantifiers (*, + and ?)
        elseif not escape and (c == '*' or c == '+' or c == '?') then
          local prevValue = currentTree[m_table.length(currentTree)]

          if c == '?' and isStarOrPlusQuantifier(prevValue) then
            -- A "?" right after "*" or "+" marks that quantifier as non-greedy.
            prevValue[NON_GREEDY] = true
          else
            handleQuantifier(currentTree, c, i)
          end

          -- Quantifier range
        elseif not escape and c == '{' then
          local repetition, endIndex = pcreCompile_impl(regex, '}', i + 1)

          if repetition.min ~= nil then
            local lastIndex = m_table.length(currentTree)
            local prevValue = currentTree[lastIndex]
            checkIfQuantifiable(prevValue, i - 1)
            currentTree[lastIndex] = createQuantifier(repetition.min, repetition.max, prevValue, '{}')
          else
            -- Not a valid range: keep the "{" and what was consumed as literals.
            table.insert(currentTree, '{')
            m_table.arrayConcat(currentTree, repetition)
          end
          i = endIndex

          -- Escape next character
        elseif not escape and c == '\\' then
          escape = true

          -- Default case: add literal character to the current sequence
        else
          table.insert(currentTree, c)
          escape = false
        end
      end

      i = i + 1
    end
  end

  if not endCharFound and endChar ~= nil then
    error(mw.ustring.format('expected "%s" at position %d', endChar, i), 0)
  end

  return regexTree, i
end

--- Compiles the given regular expression, starting at its first character.
--- Any error raised during compilation is raised again with the prefix
--- "Regex syntax error: ".
--- @param regex string The regular expression to compile.
--- @return table The compiled regular expression as a tree.
function p.pcreCompile(regex)
  local ok, result = pcall(pcreCompile_impl, regex, nil, 1)

  if not ok then
    error('Regex syntax error: ' .. result)
  end

  return result
end

-- Flags
-- Values meant for the "flags" argument of pcreMatch and pcreFind.
-- NOTE(review): presumably the usual case-insensitive (i), global (g) and
-- multiline (m) flags; no visible code reads them yet, confirm when the
-- matcher is implemented.
p.I = 'i'
p.G = 'g'
p.M = 'm'

--- Tests whether the given string matches the given regex
--- then returns all captured matches.
--- NOTE: matching is not implemented yet. The regex is compiled if it was
--- given as a string (which may raise a syntax error), then nil is returned.
--- @param s string The string to test.
--- @param regex string|table The regex (as a string or compiled).
--- @param offset number The index to start from in parameter "s" (defaults to 1).
--- @param flags table|nil The flags (i, g or m).
--- @return table|nil An array containing all matches or nil if the match failed.
---                   The array index 0 contains the whole match, subsequent indices
---                   contain the values from the corresponding capture group if any.
function p.pcreMatch(s, regex, offset, flags)
  local compiledRegex
  if type(regex) == 'table' then
    compiledRegex = regex
  else
    compiledRegex = p.pcreCompile(regex)
  end

  local startIndex = 1
  if offset then
    startIndex = offset
  end

  if not flags then
    flags = {}
  end

  -- TODO

  return nil
end

--- Tests whether the given string matches the given regex.
--- Delegates to pcreMatch and reports whether it returned something.
--- @param s string The string to test.
--- @param regex string|table The regex (as a string or compiled).
--- @param offset number The index to start from in parameter "s".
--- @param flags table|nil The flags (i, g or m).
--- @return boolean True if and only if parameter "s" matches the
---                 given regex from the given index.
function p.pcreFind(s, regex, offset, flags)
  local matches = p.pcreMatch(s, regex, offset, flags)
  return matches ~= nil
end

-- Export the public functions and flags.
return p