diff options
Diffstat (limited to 'lib/sexp/encode.py')
| -rw-r--r-- | lib/sexp/encode.py | 223 | 
1 files changed, 223 insertions, 0 deletions
diff --git a/lib/sexp/encode.py b/lib/sexp/encode.py new file mode 100644 index 0000000..1df6406 --- /dev/null +++ b/lib/sexp/encode.py @@ -0,0 +1,223 @@ + + +import base64 +import binascii +import re +import hashlib + +def _encodeHex(s): +    """ +      Encode a string in hex format. + +      >>> _encodeHex("Hello world") +      '#48656c6c6f20776f726c64#' +      >>> _encodeHex("") +      '##' +    """ +    return "#%s#"%binascii.b2a_hex(s) + +def _encodeBase64(s): +    """ +       Encode a string in base64 format, with embedded newlines. + +       >>> _encodeBase64("") +       '||' +       >>> _encodeBase64("Hello world") +       '|SGVsbG8gd29ybGQ=|' +       >>> print _encodeBase64("Hello world") +       |SGVsbG8gd29ybGQ=| +       >>> _encodeBase64("Good night, sweet prince! A flock of angels " +       ...               "sing thee to thy rest") +       '|R29vZCBuaWdodCwgc3dlZXQgcHJpbmNlISBBIGZsb2NrIG9mIGFuZ2VscyBzaW5nIHRoZWUgdG8g\\ndGh5IHJlc3Q=|' + +    """ +    return "|%s|"%base64.encodestring(s).strip() + +# Map from a character value to its representation in a quoted-string. +_QUOTED_MAP = { '\b' : "\\b", +                '\t' : "\\t", +                '\v' : "\\v", +                '\n' : "\\n", +                '\f' : "\\f", +                '\r' : "\\r", +                '"'  : "\"", +                '\b' : "\\b", +                '\\' : "\\", } +for x in xrange(256): +    if 32 <= x <= 126: +        _QUOTED_MAP[chr(x)] = chr(x) +    elif not _QUOTED_MAP.has_key(chr(x)): +        _QUOTED_MAP[chr(x)] = "\\x%02x"%x +del x + + +_QUOTED_CHAR_RE = re.compile(r'[^\ -\~]') +def _replaceQuotedChar(match, _Q=_QUOTED_MAP): +    """Helper function for replacing .""" +    return _Q[match.group(0)] + +def _encodeQuoted(s, _Q=_QUOTED_MAP): +    """ +       >>> _encodeQuoted("") +       '""' +       >>> _encodeQuoted("Hello world") +       '"Hello world"' +       >>> print _encodeQuoted("Hello \xff\b") +       "Hello \\xff\\b" +    """ +    # This implementation is a slower for the case where lots of stuff +    # needs quoting, but faster for the case where only some stuff +    # needs quoting.  If more than about 1/4 of the characters need +    # quoting, then the commented-out version below is faster.  Yes, +    # this is a stupid overoptimization. +    return '"%s"'%(_QUOTED_CHAR_RE.sub(_replaceQuotedChar, s)) + +    #return '"%s"'%("".join(map(_QUOTED_MAP.__getitem__, s))) + +def _encodeRaw(s): +    """ +       Encode a string in the "raw" format used for canonical encodings. + +       >>> _encodeRaw("") +       '0:' +       >>> _encodeRaw(" ") +       '1: ' +       >>> _encodeRaw(" \\n") +       '2: \\n' +    """ +    return "%d:%s"%(len(s),s) + +_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*" + +_TOKEN_RE = re.compile(_TOKEN_PAT) +def _writeToken(write,s): +    """Write a string in the token (unencoded) format.  Only works for strings +       matching _TOKEN_RE. +    """ +    assert _TOKEN_RE.match(s) +    return s + +def _encodeCleanest(s, indent=0): +    """Encode s in whatever format seems most human-readable.""" + +    if _TOKEN_RE.match(s): +        return s +    n = 0 +    for ch in s: +        if _QUOTED_MAP[ch] != ch: +            n += 1 +    if n > 3 and n > len(s)//4: +        if len(s) > 16: +            return _encodeBase64(s).replace("\n", " "*(indent+1)+"\n") +        else: +            return _encodeHex(s) +    else: +        return _encodeQuoted(s) + +def _encodePrettyPrint(s, write, indent=0, niceWidth=80): +    if isinstance(s, str): +        write(_encodeCleanest(s)) +        return +    elif len(s) == 0: +        write("()") +        return + +    if isinstance(s[0], str): +        parts = [ " "*indent, "(", _encodeCleanest(s), "\n" ] +    else: +        parts = [ "(" ] + +def _encodeCanonical(rep, append): +    """Given an s-expression in <rep>, encode it in canonical format, +       passing each part to the function "append" as it is done. +    """ +    if isinstance(rep, str): +        append(_encodeRaw(rep)) +        return + +    append("(") + +    stack = [ ] +    push = stack.append +    pop = stack.pop +    idx = 0 +    while 1: +        while idx == len(rep): +            append(")") +            try: +                rep,idx = pop() +            except IndexError: +                return +        if isinstance(rep[idx], str): +            append(_encodeRaw(rep[idx])) +            idx += 1 +            continue +        push((rep,idx+1)) +        rep = rep[idx] +        idx = 0 +        append("(") + +def encode_canonical(rep): +    """Return the canonical encoding of the s-expression <rep>. + +       >>> encode_canonical("abc") +       '3:abc' +       >>> encode_canonical(["a"]) +       '(1:a)' +       >>> encode_canonical(["a", "bc"]) +       '(1:a2:bc)' +       >>> encode_canonical([[["X", "ab c"]], "d"]) +       '(((1:X4:ab c))1:d)' +    """ +    parts = [] +    _encodeCanonical(rep, parts.append) +    return "".join(parts) + +def hash_canonical(rep, hashobj): +    """Given a hashlib hash object <hashobj>, adds the canonical +       encoding of the s-expression <rep> to hashobj. + +       >>> import hashlib +       >>> s = hashlib.sha256() +       >>> s.update("(3:abc(6:hello 5:world)(1:9))") +       >>> s.hexdigest() +       '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' +       >>> s = hashlib.sha256() +       >>> hash_canonical(["abc", ["hello ", "world"], ["9"] ], s) +       >>> s.hexdigest() +       '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' +    """ +    _encodeCanonical(rep, hashobj.update) + +def _encodePretty(rep, append, indent_step=2, niceWidth=80): +    stack = [] +    idx = 0 +    indent = 0 +    append("(") +    pop = stack.pop +    push = stack.append + +    while 1: +        while idx == len(rep): +            append(")") +            indent -= indent_step +            try: +                rep,idx = pop() +            except IndexError: +                append("\n") +                return +            else: +                append(" ") +        if isinstance(rep[idx], str): +            _encodePrettyPrint(rep[idx], append, indent, niceWidth) +            idx += 1 +            if idx < len(rep): +                append(" ") +            continue +        push((rep,idx+1)) +        rep = rep[idx] +        idx = 0 +        indent += indent_step +        append("\n%s("%(" "*indent)) + +  | 
