You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

155 lines
3.4 KiB

#=======================================================================
#
# Python Lexical Analyser
#
# Traditional Regular Expression Syntax
#
#=======================================================================
from Regexps import *
from Errors import PlexError
class RegexpSyntaxError(PlexError):
pass
def re(s):
"""
Convert traditional string representation of regular expression |s|
into Plex representation.
"""
return REParser(s).parse_re()
class REParser:
def __init__(self, s):
self.s = s
self.i = -1
self.end = 0
self.next()
def parse_re(self):
re = self.parse_alt()
if not self.end:
self.error("Unexpected %s" % repr(self.c))
return re
def parse_alt(self):
"""Parse a set of alternative regexps."""
re = self.parse_seq()
if self.c == '|':
re_list = [re]
while self.c == '|':
self.next()
re_list.append(self.parse_seq())
re = apply(Alt, tuple(re_list))
return re
def parse_seq(self):
"""Parse a sequence of regexps."""
re_list = []
while not self.end and not self.c in "|)":
re_list.append(self.parse_mod())
return apply(Seq, tuple(re_list))
def parse_mod(self):
"""Parse a primitive regexp followed by *, +, ? modifiers."""
re = self.parse_prim()
while not self.end and self.c in "*+?":
if self.c == '*':
re = Rep(re)
elif self.c == '+':
re = Rep1(re)
else: # self.c == '?'
re = Opt(re)
self.next()
return re
def parse_prim(self):
"""Parse a primitive regexp."""
c = self.get()
if c == '.':
re = AnyBut("\n")
elif c == '^':
re = Bol
elif c == '$':
re = Eol
elif c == '(':
re = self.parse_alt()
self.expect(')')
elif c == '[':
re = self.parse_charset()
self.expect(']')
else:
if c == '\\':
c = self.get()
re = Char(c)
return re
def parse_charset(self):
"""Parse a charset. Does not include the surrounding []."""
char_list = []
invert = 0
if self.c == '^':
invert = 1
self.next()
if self.c == ']':
char_list.append(']')
self.next()
while not self.end and self.c <> ']':
c1 = self.get()
if self.c == '-' and self.lookahead(1) <> ']':
self.next()
c2 = self.get()
for a in xrange(ord(c1), ord(c2) + 1):
char_list.append(chr(a))
else:
char_list.append(c1)
chars = string.join(char_list, "")
if invert:
return AnyBut(chars)
else:
return Any(chars)
def next(self):
"""Advance to the next char."""
s = self.s
i = self.i = self.i + 1
if i < len(s):
self.c = s[i]
else:
self.c = ''
self.end = 1
def get(self):
if self.end:
self.error("Premature end of string")
c = self.c
self.next()
return c
def lookahead(self, n):
"""Look ahead n chars."""
j = self.i + n
if j < len(self.s):
return self.s[j]
else:
return ''
def expect(self, c):
"""
Expect to find character |c| at current position.
Raises an exception otherwise.
"""
if self.c == c:
self.next()
else:
self.error("Missing %s" % repr(c))
def error(self, mess):
"""Raise exception to signal syntax error in regexp."""
raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
repr(self.s), self.i, mess))