From 7b434c8d229b867df4523a13ec97e30ec26b6b7b Mon Sep 17 00:00:00 2001 From: Nick Mathewson Date: Sat, 30 Aug 2008 16:33:47 +0000 Subject: Initial auto-updater commit: s-expression libray and format spec. git-svn-id: file:///home/or/svnrepo/updater/trunk@16692 55e972cd-5a19-0410-ae62-a4d7a52db4cd --- lib/sexp/encode.py | 223 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 lib/sexp/encode.py (limited to 'lib/sexp/encode.py') diff --git a/lib/sexp/encode.py b/lib/sexp/encode.py new file mode 100644 index 0000000..1df6406 --- /dev/null +++ b/lib/sexp/encode.py @@ -0,0 +1,223 @@ + + +import base64 +import binascii +import re +import hashlib + +def _encodeHex(s): + """ + Encode a string in hex format. + + >>> _encodeHex("Hello world") + '#48656c6c6f20776f726c64#' + >>> _encodeHex("") + '##' + """ + return "#%s#"%binascii.b2a_hex(s) + +def _encodeBase64(s): + """ + Encode a string in base64 format, with embedded newlines. + + >>> _encodeBase64("") + '||' + >>> _encodeBase64("Hello world") + '|SGVsbG8gd29ybGQ=|' + >>> print _encodeBase64("Hello world") + |SGVsbG8gd29ybGQ=| + >>> _encodeBase64("Good night, sweet prince! A flock of angels " + ... "sing thee to thy rest") + '|R29vZCBuaWdodCwgc3dlZXQgcHJpbmNlISBBIGZsb2NrIG9mIGFuZ2VscyBzaW5nIHRoZWUgdG8g\\ndGh5IHJlc3Q=|' + + """ + return "|%s|"%base64.encodestring(s).strip() + +# Map from a character value to its representation in a quoted-string. +_QUOTED_MAP = { '\b' : "\\b", + '\t' : "\\t", + '\v' : "\\v", + '\n' : "\\n", + '\f' : "\\f", + '\r' : "\\r", + '"' : "\"", + '\b' : "\\b", + '\\' : "\\", } +for x in xrange(256): + if 32 <= x <= 126: + _QUOTED_MAP[chr(x)] = chr(x) + elif not _QUOTED_MAP.has_key(chr(x)): + _QUOTED_MAP[chr(x)] = "\\x%02x"%x +del x + + +_QUOTED_CHAR_RE = re.compile(r'[^\ -\~]') +def _replaceQuotedChar(match, _Q=_QUOTED_MAP): + """Helper function for replacing .""" + return _Q[match.group(0)] + +def _encodeQuoted(s, _Q=_QUOTED_MAP): + """ + >>> _encodeQuoted("") + '""' + >>> _encodeQuoted("Hello world") + '"Hello world"' + >>> print _encodeQuoted("Hello \xff\b") + "Hello \\xff\\b" + """ + # This implementation is a slower for the case where lots of stuff + # needs quoting, but faster for the case where only some stuff + # needs quoting. If more than about 1/4 of the characters need + # quoting, then the commented-out version below is faster. Yes, + # this is a stupid overoptimization. + return '"%s"'%(_QUOTED_CHAR_RE.sub(_replaceQuotedChar, s)) + + #return '"%s"'%("".join(map(_QUOTED_MAP.__getitem__, s))) + +def _encodeRaw(s): + """ + Encode a string in the "raw" format used for canonical encodings. + + >>> _encodeRaw("") + '0:' + >>> _encodeRaw(" ") + '1: ' + >>> _encodeRaw(" \\n") + '2: \\n' + """ + return "%d:%s"%(len(s),s) + +_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*" + +_TOKEN_RE = re.compile(_TOKEN_PAT) +def _writeToken(write,s): + """Write a string in the token (unencoded) format. Only works for strings + matching _TOKEN_RE. + """ + assert _TOKEN_RE.match(s) + return s + +def _encodeCleanest(s, indent=0): + """Encode s in whatever format seems most human-readable.""" + + if _TOKEN_RE.match(s): + return s + n = 0 + for ch in s: + if _QUOTED_MAP[ch] != ch: + n += 1 + if n > 3 and n > len(s)//4: + if len(s) > 16: + return _encodeBase64(s).replace("\n", " "*(indent+1)+"\n") + else: + return _encodeHex(s) + else: + return _encodeQuoted(s) + +def _encodePrettyPrint(s, write, indent=0, niceWidth=80): + if isinstance(s, str): + write(_encodeCleanest(s)) + return + elif len(s) == 0: + write("()") + return + + if isinstance(s[0], str): + parts = [ " "*indent, "(", _encodeCleanest(s), "\n" ] + else: + parts = [ "(" ] + +def _encodeCanonical(rep, append): + """Given an s-expression in , encode it in canonical format, + passing each part to the function "append" as it is done. + """ + if isinstance(rep, str): + append(_encodeRaw(rep)) + return + + append("(") + + stack = [ ] + push = stack.append + pop = stack.pop + idx = 0 + while 1: + while idx == len(rep): + append(")") + try: + rep,idx = pop() + except IndexError: + return + if isinstance(rep[idx], str): + append(_encodeRaw(rep[idx])) + idx += 1 + continue + push((rep,idx+1)) + rep = rep[idx] + idx = 0 + append("(") + +def encode_canonical(rep): + """Return the canonical encoding of the s-expression . + + >>> encode_canonical("abc") + '3:abc' + >>> encode_canonical(["a"]) + '(1:a)' + >>> encode_canonical(["a", "bc"]) + '(1:a2:bc)' + >>> encode_canonical([[["X", "ab c"]], "d"]) + '(((1:X4:ab c))1:d)' + """ + parts = [] + _encodeCanonical(rep, parts.append) + return "".join(parts) + +def hash_canonical(rep, hashobj): + """Given a hashlib hash object , adds the canonical + encoding of the s-expression to hashobj. + + >>> import hashlib + >>> s = hashlib.sha256() + >>> s.update("(3:abc(6:hello 5:world)(1:9))") + >>> s.hexdigest() + '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' + >>> s = hashlib.sha256() + >>> hash_canonical(["abc", ["hello ", "world"], ["9"] ], s) + >>> s.hexdigest() + '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' + """ + _encodeCanonical(rep, hashobj.update) + +def _encodePretty(rep, append, indent_step=2, niceWidth=80): + stack = [] + idx = 0 + indent = 0 + append("(") + pop = stack.pop + push = stack.append + + while 1: + while idx == len(rep): + append(")") + indent -= indent_step + try: + rep,idx = pop() + except IndexError: + append("\n") + return + else: + append(" ") + if isinstance(rep[idx], str): + _encodePrettyPrint(rep[idx], append, indent, niceWidth) + idx += 1 + if idx < len(rep): + append(" ") + continue + push((rep,idx+1)) + rep = rep[idx] + idx = 0 + indent += indent_step + append("\n%s("%(" "*indent)) + + -- cgit v1.2.3