You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

558 lines
16 KiB

#=======================================================================
#
# Python Lexical Analyser
#
# Regular Expressions
#
#=======================================================================
import array
import string
import types
from sys import maxint
import Errors
#
# Constants
#
BOL = 'bol'
EOL = 'eol'
EOF = 'eof'
nl_code = ord('\n')
#
# Helper functions
#
def chars_to_ranges(s):
"""
Return a list of character codes consisting of pairs
[code1a, code1b, code2a, code2b,...] which cover all
the characters in |s|.
"""
char_list = list(s)
char_list.sort()
i = 0
n = len(char_list)
result = []
while i < n:
code1 = ord(char_list[i])
code2 = code1 + 1
i = i + 1
while i < n and code2 >= ord(char_list[i]):
code2 = code2 + 1
i = i + 1
result.append(code1)
result.append(code2)
return result
def uppercase_range(code1, code2):
"""
If the range of characters from code1 to code2-1 includes any
lower case letters, return the corresponding upper case range.
"""
code3 = max(code1, ord('a'))
code4 = min(code2, ord('z') + 1)
if code3 < code4:
d = ord('A') - ord('a')
return (code3 + d, code4 + d)
else:
return None
def lowercase_range(code1, code2):
"""
If the range of characters from code1 to code2-1 includes any
upper case letters, return the corresponding lower case range.
"""
code3 = max(code1, ord('A'))
code4 = min(code2, ord('Z') + 1)
if code3 < code4:
d = ord('a') - ord('A')
return (code3 + d, code4 + d)
else:
return None
def CodeRanges(code_list):
"""
Given a list of codes as returned by chars_to_ranges, return
an RE which will match a character in any of the ranges.
"""
re_list = []
for i in xrange(0, len(code_list), 2):
re_list.append(CodeRange(code_list[i], code_list[i + 1]))
return apply(Alt, tuple(re_list))
def CodeRange(code1, code2):
"""
CodeRange(code1, code2) is an RE which matches any character
with a code |c| in the range |code1| <= |c| < |code2|.
"""
if code1 <= nl_code < code2:
return Alt(RawCodeRange(code1, nl_code),
RawNewline,
RawCodeRange(nl_code + 1, code2))
else:
return RawCodeRange(code1, code2)
#
# Abstract classes
#
class RE:
"""RE is the base class for regular expression constructors.
The following operators are defined on REs:
re1 + re2 is an RE which matches |re1| followed by |re2|
re1 | re2 is an RE which matches either |re1| or |re2|
"""
nullable = 1 # True if this RE can match 0 input symbols
match_nl = 1 # True if this RE can match a string ending with '\n'
str = None # Set to a string to override the class's __str__ result
def build_machine(self, machine, initial_state, final_state,
match_bol, nocase):
"""
This method should add states to |machine| to implement this
RE, starting at |initial_state| and ending at |final_state|.
If |match_bol| is true, the RE must be able to match at the
beginning of a line. If nocase is true, upper and lower case
letters should be treated as equivalent.
"""
raise exceptions.UnimplementedMethod("%s.build_machine not implemented" %
self.__class__.__name__)
def build_opt(self, m, initial_state, c):
"""
Given a state |s| of machine |m|, return a new state
reachable from |s| on character |c| or epsilon.
"""
s = m.new_state()
initial_state.link_to(s)
initial_state.add_transition(c, s)
return s
def __add__(self, other):
return Seq(self, other)
def __or__(self, other):
return Alt(self, other)
def __str__(self):
if self.str:
return self.str
else:
return self.calc_str()
def check_re(self, num, value):
if not isinstance(value, RE):
self.wrong_type(num, value, "Plex.RE instance")
def check_string(self, num, value):
if type(value) <> type(''):
self.wrong_type(num, value, "string")
def check_char(self, num, value):
self.check_string(num, value)
if len(value) <> 1:
raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
"Expected a string of length 1, got: %s" % (
num, self.__class__.__name__, repr(value)))
def wrong_type(self, num, value, expected):
if type(value) == types.InstanceType:
got = "%s.%s instance" % (
value.__class__.__module__, value.__class__.__name__)
else:
got = type(value).__name__
raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
"(expected %s, got %s" % (
num, self.__class__.__name__, expected, got))
#
# Primitive RE constructors
# -------------------------
#
# These are the basic REs from which all others are built.
#
## class Char(RE):
## """
## Char(c) is an RE which matches the character |c|.
## """
## nullable = 0
## def __init__(self, char):
## self.char = char
## self.match_nl = char == '\n'
## def build_machine(self, m, initial_state, final_state, match_bol, nocase):
## c = self.char
## if match_bol and c <> BOL:
## s1 = self.build_opt(m, initial_state, BOL)
## else:
## s1 = initial_state
## if c == '\n' or c == EOF:
## s1 = self.build_opt(m, s1, EOL)
## if len(c) == 1:
## code = ord(self.char)
## s1.add_transition((code, code+1), final_state)
## if nocase and is_letter_code(code):
## code2 = other_case_code(code)
## s1.add_transition((code2, code2+1), final_state)
## else:
## s1.add_transition(c, final_state)
## def calc_str(self):
## return "Char(%s)" % repr(self.char)
def Char(c):
"""
Char(c) is an RE which matches the character |c|.
"""
if len(c) == 1:
result = CodeRange(ord(c), ord(c) + 1)
else:
result = SpecialSymbol(c)
result.str = "Char(%s)" % repr(c)
return result
class RawCodeRange(RE):
"""
RawCodeRange(code1, code2) is a low-level RE which matches any character
with a code |c| in the range |code1| <= |c| < |code2|, where the range
does not include newline. For internal use only.
"""
nullable = 0
match_nl = 0
range = None # (code, code)
uppercase_range = None # (code, code) or None
lowercase_range = None # (code, code) or None
def __init__(self, code1, code2):
self.range = (code1, code2)
self.uppercase_range = uppercase_range(code1, code2)
self.lowercase_range = lowercase_range(code1, code2)
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
initial_state.add_transition(self.range, final_state)
if nocase:
if self.uppercase_range:
initial_state.add_transition(self.uppercase_range, final_state)
if self.lowercase_range:
initial_state.add_transition(self.lowercase_range, final_state)
def calc_str(self):
return "CodeRange(%d,%d)" % (self.code1, self.code2)
class _RawNewline(RE):
"""
RawNewline is a low-level RE which matches a newline character.
For internal use only.
"""
nullable = 0
match_nl = 1
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
s = self.build_opt(m, initial_state, EOL)
s.add_transition((nl_code, nl_code + 1), final_state)
RawNewline = _RawNewline()
class SpecialSymbol(RE):
"""
SpecialSymbol(sym) is an RE which matches the special input
symbol |sym|, which is one of BOL, EOL or EOF.
"""
nullable = 0
match_nl = 0
sym = None
def __init__(self, sym):
self.sym = sym
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
# Sequences 'bol bol' and 'bol eof' are impossible, so only need
# to allow for bol if sym is eol
if match_bol and self.sym == EOL:
initial_state = self.build_opt(m, initial_state, BOL)
initial_state.add_transition(self.sym, final_state)
class Seq(RE):
"""Seq(re1, re2, re3...) is an RE which matches |re1| followed by
|re2| followed by |re3|..."""
def __init__(self, *re_list):
nullable = 1
for i in xrange(len(re_list)):
re = re_list[i]
self.check_re(i, re)
nullable = nullable and re.nullable
self.re_list = re_list
self.nullable = nullable
i = len(re_list)
match_nl = 0
while i:
i = i - 1
re = re_list[i]
if re.match_nl:
match_nl = 1
break
if not re.nullable:
break
self.match_nl = match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
re_list = self.re_list
if len(re_list) == 0:
initial_state.link_to(final_state)
else:
s1 = initial_state
n = len(re_list)
for i in xrange(n):
if i < n - 1:
s2 = m.new_state()
else:
s2 = final_state
re = re_list[i]
re.build_machine(m, s1, s2, match_bol, nocase)
s1 = s2
match_bol = re.match_nl or (match_bol and re.nullable)
def calc_str(self):
return "Seq(%s)" % string.join(map(str, self.re_list), ",")
class Alt(RE):
"""Alt(re1, re2, re3...) is an RE which matches either |re1| or
|re2| or |re3|..."""
def __init__(self, *re_list):
self.re_list = re_list
nullable = 0
match_nl = 0
nullable_res = []
non_nullable_res = []
i = 1
for re in re_list:
self.check_re(i, re)
if re.nullable:
nullable_res.append(re)
nullable = 1
else:
non_nullable_res.append(re)
if re.match_nl:
match_nl = 1
i = i + 1
self.nullable_res = nullable_res
self.non_nullable_res = non_nullable_res
self.nullable = nullable
self.match_nl = match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
for re in self.nullable_res:
re.build_machine(m, initial_state, final_state, match_bol, nocase)
if self.non_nullable_res:
if match_bol:
initial_state = self.build_opt(m, initial_state, BOL)
for re in self.non_nullable_res:
re.build_machine(m, initial_state, final_state, 0, nocase)
def calc_str(self):
return "Alt(%s)" % string.join(map(str, self.re_list), ",")
class Rep1(RE):
"""Rep1(re) is an RE which matches one or more repetitions of |re|."""
def __init__(self, re):
self.check_re(1, re)
self.re = re
self.nullable = re.nullable
self.match_nl = re.match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
s1 = m.new_state()
s2 = m.new_state()
initial_state.link_to(s1)
self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
s2.link_to(s1)
s2.link_to(final_state)
def calc_str(self):
return "Rep1(%s)" % self.re
class SwitchCase(RE):
"""
SwitchCase(re, nocase) is an RE which matches the same strings as RE,
but treating upper and lower case letters according to |nocase|. If
|nocase| is true, case is ignored, otherwise it is not.
"""
re = None
nocase = None
def __init__(self, re, nocase):
self.re = re
self.nocase = nocase
self.nullable = re.nullable
self.match_nl = re.match_nl
def build_machine(self, m, initial_state, final_state, match_bol, nocase):
self.re.build_machine(m, initial_state, final_state, match_bol,
self.nocase)
def calc_str(self):
if self.nocase:
name = "NoCase"
else:
name = "Case"
return "%s(%s)" % (name, self.re)
#
# Composite RE constructors
# -------------------------
#
# These REs are defined in terms of the primitive REs.
#
Empty = Seq()
Empty.__doc__ = \
"""
Empty is an RE which matches the empty string.
"""
Empty.str = "Empty"
def Str1(s):
"""
Str1(s) is an RE which matches the literal string |s|.
"""
result = apply(Seq, tuple(map(Char, s)))
result.str = "Str(%s)" % repr(s)
return result
def Str(*strs):
"""
Str(s) is an RE which matches the literal string |s|.
Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
"""
if len(strs) == 1:
return Str1(strs[0])
else:
result = apply(Alt, tuple(map(Str1, strs)))
result.str = "Str(%s)" % string.join(map(repr, strs), ",")
return result
def Any(s):
"""
Any(s) is an RE which matches any character in the string |s|.
"""
#result = apply(Alt, tuple(map(Char, s)))
result = CodeRanges(chars_to_ranges(s))
result.str = "Any(%s)" % repr(s)
return result
def AnyBut(s):
"""
AnyBut(s) is an RE which matches any character (including
newline) which is not in the string |s|.
"""
ranges = chars_to_ranges(s)
ranges.insert(0, -maxint)
ranges.append(maxint)
result = CodeRanges(ranges)
result.str = "AnyBut(%s)" % repr(s)
return result
AnyChar = AnyBut("")
AnyChar.__doc__ = \
"""
AnyChar is an RE which matches any single character (including a newline).
"""
AnyChar.str = "AnyChar"
def Range(s1, s2 = None):
"""
Range(c1, c2) is an RE which matches any single character in the range
|c1| to |c2| inclusive.
Range(s) where |s| is a string of even length is an RE which matches
any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
"""
if s2:
result = CodeRange(ord(s1), ord(s2) + 1)
result.str = "Range(%s,%s)" % (s1, s2)
else:
ranges = []
for i in range(0, len(s1), 2):
ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))
result = apply(Alt, tuple(ranges))
result.str = "Range(%s)" % repr(s1)
return result
def Opt(re):
"""
Opt(re) is an RE which matches either |re| or the empty string.
"""
result = Alt(re, Empty)
result.str = "Opt(%s)" % re
return result
def Rep(re):
"""
Rep(re) is an RE which matches zero or more repetitions of |re|.
"""
result = Opt(Rep1(re))
result.str = "Rep(%s)" % re
return result
def NoCase(re):
"""
NoCase(re) is an RE which matches the same strings as RE, but treating
upper and lower case letters as equivalent.
"""
return SwitchCase(re, nocase = 1)
def Case(re):
"""
Case(re) is an RE which matches the same strings as RE, but treating
upper and lower case letters as distinct, i.e. it cancels the effect
of any enclosing NoCase().
"""
return SwitchCase(re, nocase = 0)
#
# RE Constants
#
Bol = Char(BOL)
Bol.__doc__ = \
"""
Bol is an RE which matches the beginning of a line.
"""
Bol.str = "Bol"
Eol = Char(EOL)
Eol.__doc__ = \
"""
Eol is an RE which matches the end of a line.
"""
Eol.str = "Eol"
Eof = Char(EOF)
Eof.__doc__ = \
"""
Eof is an RE which matches the end of the file.
"""
Eof.str = "Eof"