import base64
import binascii
import re

# Partial implementation of Rivest's proposed S-Expressions standard
# as documented at
#     http://people.csail.mit.edu/rivest/Sexp.txt
#
# It's slightly optimized.
#
# Not implemented:
#     [display hints]
#     {basic transport}

__all__ = ['FormatError', 'parse']


class FormatError(Exception):
    """Raised when parsing fails."""
    pass


# A token: starts with a letter or punctuation character (never a digit),
# followed by any number of letters, digits or punctuation characters.
_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*"

# Regular expression to match a single lexeme from an encoded s-expression.
# For the length-prefixed and delimited string forms only the *start* of the
# lexeme is matched; _lexItems consumes the payload itself.
_LEXEME_START_RE = re.compile(
    r""" \s* (?: (%s) |                        # Grp 0: A token.
                 ([0-9]*(?: [\:\|\#\{] |       # Grp 1: start of string...
                    \"(?:[^\\\"]+|\\.)*\")) |  #        or qstring.
                 ([\(\)])                      # Grp 2: a paren of some kind.
             )""" % _TOKEN_PAT, re.X | re.M | re.S)


class _P(object):
    """Helper class for parenthesis tokens."""

    def __init__(self, val):
        self.val = val

    def __repr__(self):
        return "_P(%r)" % self.val


# Sentinels yielded by the lexer; compared by identity so that they can never
# be confused with a string lexeme such as "(".
_OPEN_PAREN = _P("(")
_CLOSE_PAREN = _P(")")
del _P

_SPACE_RE = re.compile(r'\s+')

# Matches every backslash escape in a quoted string that we know how to
# unquote.  The pattern is laid out in verbose style, so it MUST be compiled
# with re.X; without the flag the literal spaces would have to be present in
# the input and no escape would ever be recognised.
_UNQUOTE_CHAR_RE = re.compile(r'''
    \\ (?: [abtnvfr"'\\]        # single-character escapes
         | \r\n?                # backslash-newline: line continuation
         | \n\r?
         | [xX][A-Fa-f0-9]{2}   # \xHH
         | [0-7]{1,3}           # \ooo (octal)
       )
    ''', re.X)

# Map from quoted representation to unquoted format.
_UNQUOTE_CHAR_MAP = {
    'a': '\a',
    'b': '\b',
    't': '\t',
    'n': '\n',
    'v': '\v',
    'f': '\f',
    'r': '\r',
    '"': '"',
    "'": "'",
    '\\': '\\',
}


def _unquoteChar(ch, _U=_UNQUOTE_CHAR_MAP):
    """Return the character(s) that one backslash escape stands for.

    ch -- either the match object handed over by _UNQUOTE_CHAR_RE.sub(), or
          the matched escape text itself (including the leading backslash).
    _U -- lookup table for the single-character escapes.

    Raises FormatError if an octal escape encodes a value above 255.
    """
    # re.sub() calls its replacement function with a match object.
    text = ch.group(0) if hasattr(ch, "group") else ch
    body = text[1:]  # Drop the leading backslash.
    try:
        return _U[body]
    except KeyError:
        pass
    if body[0] in "\n\r":
        # Backslash followed by a line break: the break is swallowed.
        return ""
    if body[0] in "xX":
        return chr(int(body[1:], 16))
    # Octal: every remaining character is a digit of the value.
    value = int(body, 8)
    if value >= 256:
        raise FormatError("Octal character format out of range.")
    return chr(value)


def _toStr(data):
    """Return decoded binary data as a native string, one char per byte.

    binascii returns bytes on Python 3; every other lexeme is a str, so the
    payload is mapped through latin-1 to keep the lexer's output uniform
    (and losslessly recoverable with .encode('latin-1')).
    """
    if isinstance(data, str):
        return data
    return data.decode("latin-1")


def _lexItems(s):
    """Generator that iterates over the lexical items in an encoded
       s-expression.  Yields a string for strings, or the special objects
       _OPEN_PAREN and _CLOSE_PAREN.

       Raises FormatError on malformed input, including the unsupported
       {basic transport} encoding.  Backslash escapes that are not
       recognised are left in the string untouched.

       >>> list(_lexItems('(4:a)b hello) (world 1:a 0: '))
       [_P('('), 'a)b ', 'hello', _P(')'), _P('('), 'world', 'a', '']

       >>> list(_lexItems('a b-c 1#20#2#2061##686877# |aGVsbG8gd29ybGQ|'))
       ['a', 'b-c', ' ', ' a', 'hhw', 'hello world']

       >>> list(_lexItems('#2 0# |aGVs\\nbG8 gd29yb GQ| '))
       [' ', 'hello world']

       >>> list(_lexItems('|YWJjZA==| x |YWJjZA| 3|Y W J j|'))
       ['abcd', 'x', 'abcd', 'abc']

       >>> list(_lexItems('("1""234""hello world" 3"abc" 4"    " )'))
       [_P('('), '1', '234', 'hello world', 'abc', '    ', _P(')')]

       >>> list(_lexItems('"tab\\\\there" "\\\\x41\\\\102"'))
       ['tab\\there', 'AB']
    """
    # Walk the input with an index instead of re-slicing it after every
    # lexeme; re-slicing copies the remainder each time (quadratic).
    pos = 0
    end = len(s)
    match = _LEXEME_START_RE.match
    while pos < end:
        m = match(s, pos)
        if not m:
            if not s[pos:].strip():
                break  # Nothing but trailing whitespace is left.
            raise FormatError("No pattern match at %r" % s[pos:pos + 30])
        token, strStart, paren = m.groups()
        pos = m.end()

        if paren:
            yield _OPEN_PAREN if paren == "(" else _CLOSE_PAREN
            continue
        if token:
            # We have a token.  Go with that.
            yield token
            continue

        lastChar = strStart[-1]
        if lastChar == '"':
            # Quoted string, matched in full by the regex.
            qidx = strStart.index('"')
            quoted = strStart[qidx + 1:-1]  # All but quotes.
            data = _UNQUOTE_CHAR_RE.sub(_unquoteChar, quoted)
            if qidx != 0:
                # An optional length prefix must agree with the decoded data.
                if int(strStart[:qidx], 10) != len(data):
                    raise FormatError("Bad length on quoted string")
            yield data
            continue

        # Everything else is an optional decimal length plus an introducer.
        prefix = strStart[:-1]
        num = int(prefix, 10) if prefix else None

        if lastChar == ':':
            # Verbatim string: exactly `num` raw characters follow.
            if num is None:
                raise FormatError("Missing length on verbatim string")
            if end - pos < num:
                raise FormatError("Verbatim string runs past end of input")
            yield s[pos:pos + num]
            pos += num
        elif lastChar == '#':
            # Hexadecimal string, terminated by the next '#'.
            close = s.find('#', pos)
            if close < 0:
                raise FormatError("Unterminated # string")
            dataStr = _SPACE_RE.sub("", s[pos:close])
            try:
                data = binascii.a2b_hex(dataStr)
            except (TypeError, ValueError, binascii.Error):
                # TypeError on Python 2, binascii.Error/ValueError on 3.
                raise FormatError("Bad hex string")
            if num is not None and len(data) != num:
                raise FormatError("Bad number on hex string")
            yield _toStr(data)
            pos = close + 1
        elif lastChar == '|':
            # Base64 string, terminated by the next '|'.
            close = s.find('|', pos)
            if close < 0:
                raise FormatError("Unterminated | string")
            dataStr = _SPACE_RE.sub("", s[pos:close])
            # Re-pad.
            mod = len(dataStr) % 4
            if mod:
                dataStr += "=" * (4 - mod)
            try:
                data = binascii.a2b_base64(dataStr)
            except (TypeError, ValueError, binascii.Error):
                raise FormatError("Bad base64 string")
            if num is not None and len(data) != num:
                raise FormatError("Bad number on base64 string")
            yield _toStr(data)
            pos = close + 1
        else:
            # '{': the lexer recognises the introducer but cannot decode it.
            # Raise rather than assert, since asserts vanish under -O and
            # the loop would then never advance.
            raise FormatError(
                "Basic transport encoding ({...}) is not supported")


def parse(s):
    """Parse the encoded s-expression in the string s.

       Lists are returned as (possibly nested) Python lists and strings as
       str.  The input must hold exactly one top-level item, which is
       returned.

       Raises FormatError if the input is malformed, if parentheses are
       unbalanced, or if there is not exactly one top-level item.

       >>> parse("()")
       []
       >>> parse("(1:X3:abc1:d)")
       ['X', 'abc', 'd']
       >>> parse("(1:X((3:abc))1:d)")
       ['X', [['abc']], 'd']
       >>> parse("(a b (d\\ne f) (g) #616263# |aGVsbG8gd29ybGQ|)")
       ['a', 'b', ['d', 'e', 'f'], ['g'], 'abc', 'hello world']
       >>> parse("(#ff00ff#)") == ['\\xff\\x00\\xff']
       True
    """
    outermost = []
    # Stack of the `append` methods of the enclosing, still-open lists.
    stack = []
    add = outermost.append
    for item in _lexItems(s):
        if item is _OPEN_PAREN:
            sublist = []
            add(sublist)
            stack.append(add)
            add = sublist.append
        elif item is _CLOSE_PAREN:
            if not stack:
                raise FormatError("Unbalanced close parenthesis")
            add = stack.pop()
        else:
            # It's a string.
            add(item)

    if stack:
        raise FormatError("Unterminated list")
    if len(outermost) != 1:
        raise FormatError("No enclosing parenthesis on list")
    return outermost[0]