263 lines
7.1 KiB
Python
263 lines
7.1 KiB
Python
"""
|
|
Parser for parsing a regular expression.
|
|
Take a string representing a regular expression and return the root node of its
|
|
parse tree.
|
|
|
|
usage::
|
|
|
|
root_node = parse_regex(tokenize_regex('(hello|world)'))
|
|
|
|
Remarks:
|
|
- The regex parser processes multiline input: it ignores all whitespace and supports
|
|
multiple named groups with the same name and #-style comments.
|
|
|
|
Limitations:
|
|
- Lookahead is not supported.
|
|
"""
|
|
from __future__ import unicode_literals
|
|
import re
|
|
|
|
# Names exported by ``from <module> import *``: the node classes first,
# followed by the two entry points.
__all__ = (
    'Repeat', 'Variable', 'Regex', 'Lookahead',
    'tokenize_regex', 'parse_regex',
)
|
|
|
|
|
|
class Node(object):
    """
    Common base class of the grammar nodes.
    (Not meant to be instantiated directly.)

    It provides the two operators used to combine grammars: ``a + b``
    builds a `Sequence` and ``a | b`` builds an `Any`.
    """
    def __add__(self, other_node):
        # Concatenation of exactly these two nodes.
        pair = [self, other_node]
        return Sequence(pair)

    def __or__(self, other_node):
        # Choice between exactly these two nodes.
        pair = [self, other_node]
        return Any(pair)
|
|
|
|
|
|
class Any(Node):
    """
    OR operation (union) over several grammars. Normally not created by
    hand: it is what a "Grammar1 | Grammar2" expression evaluates to.

    :param children: List of alternative nodes.
    """
    def __init__(self, children):
        self.children = children

    def __or__(self, other_node):
        # Extend the existing alternatives instead of nesting a new `Any`
        # around this one; a new list is built, ``self.children`` is untouched.
        alternatives = self.children + [other_node]
        return Any(alternatives)

    def __repr__(self):
        class_name = type(self).__name__
        return '%s(%r)' % (class_name, self.children)
|
|
|
|
|
|
class Sequence(Node):
    """
    Concatenation of several grammars. Normally not created by hand: it is
    what a "Grammar1 + Grammar2" expression evaluates to.

    :param children: List of nodes, in order.
    """
    def __init__(self, children):
        self.children = children

    def __add__(self, other_node):
        # Extend the existing sequence instead of nesting a new `Sequence`
        # around this one; a new list is built, ``self.children`` is untouched.
        items = self.children + [other_node]
        return Sequence(items)

    def __repr__(self):
        class_name = type(self).__name__
        return '%s(%r)' % (class_name, self.children)
|
|
|
|
|
|
class Regex(Node):
    """
    Leaf node holding a plain regular expression.

    :param regex: The regular expression. It is passed through
        ``re.compile`` once, so an invalid pattern fails at construction
        time.
    """
    def __init__(self, regex):
        # Compile only to validate: the compiled object is thrown away and
        # the pattern is stored exactly as it was given.
        re.compile(regex)
        self.regex = regex

    def __repr__(self):
        class_name = type(self).__name__
        return '%s(/%s/)' % (class_name, self.regex)
|
|
|
|
|
|
class Lookahead(Node):
    """
    Lookahead assertion around another grammar.

    :param childnode: The grammar inside the assertion.
    :param negative: Boolean. The parser in this module passes True for
        ``(?!`` and False for ``(?=``.
    """
    def __init__(self, childnode, negative=False):
        self.negative = negative
        self.childnode = childnode

    def __repr__(self):
        # Only the child is shown; the ``negative`` flag is not part of the
        # representation.
        class_name = type(self).__name__
        return '%s(%r)' % (class_name, self.childnode)
|
|
|
|
|
|
class Variable(Node):
    """
    Marks a variable in the regular grammar; the parser in this module
    creates one for every ``(?P<name>...)`` group. Each variable can have
    its own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """
    def __init__(self, childnode, varname=None):
        self.varname = varname
        self.childnode = childnode

    def __repr__(self):
        details = (type(self).__name__, self.childnode, self.varname)
        return '%s(childnode=%r, varname=%r)' % details
|
|
|
|
|
|
class Repeat(Node):
    """
    Repetition of another grammar.

    :param childnode: The grammar that is repeated.
    :param min_repeat: Integer. The parser in this module uses 0 for ``*``
        and ``?`` and 1 for ``+``.
    :param max_repeat: Integer or None. The parser in this module uses 1
        for ``?`` and leaves it at None for ``*`` and ``+``.
    :param greedy: Boolean. The parser in this module passes False for the
        ``*?``, ``+?`` and ``??`` forms.
    """
    def __init__(self, childnode, min_repeat=0, max_repeat=None, greedy=True):
        self.childnode = childnode
        self.greedy = greedy
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat

    def __repr__(self):
        # Only the child is shown; bounds and greediness are left out.
        class_name = type(self).__name__
        return '%s(childnode=%r)' % (class_name, self.childnode)
|
|
|
|
|
|
def tokenize_regex(input):
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    Tokens that consist only of whitespace are dropped. Comments (both
    ``(?#...)`` groups and ``#`` comments that end with a newline) are
    returned as tokens.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    :raises Exception: when no token matches at the current position.
    """
    if not input:
        return []

    # Regular expression for tokenizing other regular expressions.
    #
    # Alternatives are tried in order, so every '(?...' form has to be listed
    # before the bare '('. The '?' after the parenthesis has to be escaped
    # (an unescaped '?' would make the parenthesis optional), and so does '#'
    # (in verbose mode an unescaped '#' starts a comment inside the pattern).
    # No '^' anchor is used: ``match`` already anchors at the given position,
    # while '^' would only match at the real start of the string.
    token_re = re.compile(r'''
        \(\?P<[a-zA-Z0-9_-]+>      | # Start of named group.
        \(\?P=[a-zA-Z0-9_-]+\)     | # Back reference to named group.
        \(\?\#[^)]*\)              | # Comment group.
        \(\?=                      | # Start of lookahead assertion.
        \(\?!                      | # Start of negative lookahead assertion.
        \(\?<=                     | # If preceded by.
        \(\?<                      | # If not preceded by.
        \(\?:                      | # Start of group. (non capturing.)
        \(\?[iLmsux]               | # Flags.
        \(                         | # Start of group.
        \)                         | # End of group.
        \{[^{}]*\}                 | # Repetition.
        \*\? | \+\? | \?\?         | # Non greedy repetition.
        \* | \+ | \?               | # Repetition.
        \#.*\n                     | # Comment up to the end of the line.
        \\.                        | # Escaped character.
        \[ (?: [^\]\\] | \\. )* \] | # Character group.
        [^(){}]                    | # Any other single character.
        .
        ''', re.VERBOSE)

    tokens = []
    position = 0
    length = len(input)

    # Match at an advancing offset rather than re-slicing the remaining
    # input after every token.
    while position < length:
        match = token_re.match(input, position)
        if match is None:
            raise Exception('Could not tokenize input regex.')

        token = match.group(0)
        position = match.end()

        if not token.isspace():
            tokens.append(token)

    return tokens
|
|
|
|
|
|
def parse_regex(regex_tokens):
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.

    :param regex_tokens: Iterable of token strings, as produced by
        `tokenize_regex`. It is copied; the caller's object is not modified.
    :returns: Root node of the parse tree.
    :raises Exception: for a repetition operator with nothing in front of it,
        for unbalanced parentheses, for ``{...}`` repetition and for
        ``(?...`` constructs other than named groups, non capturing groups,
        lookaheads and ``(?#...)`` comments.
    """
    # Reversed copy, so that ``pop()`` hands out the tokens left to right.
    # We add a closing brace because that represents the final pop of the stack.
    tokens = [')'] + list(regex_tokens)[::-1]

    def wrap(lst):
        """ Turn list into sequence when it contains several items. """
        if len(lst) == 1:
            return lst[0]
        else:
            return Sequence(lst)

    def _parse():
        # Parses up to and including the next ')' at this nesting level.
        # `or_list` collects the alternatives that were completed by a '|',
        # `result` holds the nodes of the alternative currently being built.
        or_list = []
        result = []

        def wrapped_result():
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return Any([wrap(i) for i in or_list])

        def repeat_last(**repeat_options):
            """ Replace the last node by a `Repeat` wrapped around it. """
            # Every repetition operator goes through this check, so a leading
            # '*' or '+' gives the same error as a leading '?' instead of an
            # IndexError.
            if result == []:
                raise Exception('Nothing to repeat.' + repr(tokens))
            result[-1] = Repeat(result[-1], **repeat_options)

        while tokens:
            t = tokens.pop()

            if t.startswith('(?P<'):
                # '(?P<name>' -> the name sits between '(?P<' and '>'.
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ('*', '*?'):
                repeat_last(greedy=(t == '*'))

            elif t in ('+', '+?'):
                repeat_last(min_repeat=1, greedy=(t == '+'))

            elif t in ('?', '??'):
                repeat_last(min_repeat=0, max_repeat=1, greedy=(t == '?'))

            elif t == '|':
                # Close the current alternative and start a new one.
                or_list.append(result)
                result = []

            elif t in ('(', '(?:'):
                result.append(_parse())

            elif t == '(?!':
                result.append(Lookahead(_parse(), negative=True))

            elif t == '(?=':
                result.append(Lookahead(_parse(), negative=False))

            elif t == ')':
                return wrapped_result()

            elif t.startswith('#') or t.startswith('(?#'):
                # Comments ('# ...' as well as '(?#...)') carry no grammar.
                pass

            elif t.startswith('{'):
                # TODO: implement!
                raise Exception(
                    '{}-style repetition not yet supported: %r' % (t,))

            elif t.startswith('(?'):
                raise Exception('%r not supported' % t)

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        # The token stack ran empty before this level saw its ')'.
        raise Exception("Expecting ')' token")

    result = _parse()

    # Tokens are left over when a ')' closed the outermost level too early.
    if len(tokens) != 0:
        raise Exception("Unmatched parantheses.")
    else:
        return result
|