summary refs log tree commit diff
path: root/lib/sexp/encode.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sexp/encode.py')
-rw-r--r-- lib/sexp/encode.py 223
1 files changed, 223 insertions, 0 deletions
diff --git a/lib/sexp/encode.py b/lib/sexp/encode.py
new file mode 100644
index 0000000..1df6406
--- /dev/null
+++ b/lib/sexp/encode.py
@@ -0,0 +1,223 @@
+
+
+import base64
+import binascii
+import re
+import hashlib
+
+def _encodeHex(s):
+    """
+    Return <s> in hexadecimal display form: the hex digits of each
+    byte of <s>, enclosed in a pair of '#' delimiters.
+
+    >>> _encodeHex("Hello world")
+    '#48656c6c6f20776f726c64#'
+    >>> _encodeHex("")
+    '##'
+    """
+    digits = binascii.b2a_hex(s)
+    return "#%s#" % digits
+
+def _encodeBase64(s):
+    """
+    Return <s> in base64 display form, enclosed in a pair of '|'
+    delimiters.  Leading and trailing whitespace produced by the
+    encoder is stripped; newlines between output lines are kept.
+
+    >>> _encodeBase64("")
+    '||'
+    >>> _encodeBase64("Hello world")
+    '|SGVsbG8gd29ybGQ=|'
+    >>> print _encodeBase64("Hello world")
+    |SGVsbG8gd29ybGQ=|
+    >>> _encodeBase64("Good night, sweet prince! A flock of angels "
+    ...               "sing thee to thy rest")
+    '|R29vZCBuaWdodCwgc3dlZXQgcHJpbmNlISBBIGZsb2NrIG9mIGFuZ2VscyBzaW5nIHRoZWUgdG8g\\ndGh5IHJlc3Q=|'
+
+    """
+    body = base64.encodestring(s)
+    body = body.strip()
+    return "|%s|" % body
+
+# Map from a character value to its representation in a quoted-string.
+# The entries listed explicitly are the characters that have a short
+# backslash escape.  The double quote and the backslash must be escaped
+# even though they are printable: left bare, they would end the
+# quoted-string early or start a bogus escape sequence.
+_QUOTED_MAP = { '\b' : "\\b",
+                '\t' : "\\t",
+                '\v' : "\\v",
+                '\n' : "\\n",
+                '\f' : "\\f",
+                '\r' : "\\r",
+                '"'  : "\\\"",
+                '\\' : "\\\\", }
+# Fill in every remaining byte value: printable ASCII stands for itself,
+# anything else becomes a \xNN escape.  Characters that already have an
+# explicit escape above are left alone.
+for x in xrange(256):
+    if chr(x) in _QUOTED_MAP:
+        continue
+    if 32 <= x <= 126:
+        _QUOTED_MAP[chr(x)] = chr(x)
+    else:
+        _QUOTED_MAP[chr(x)] = "\\x%02x"%x
+del x  # don't leave the loop variable behind as a module attribute
+
+
+# Matches each single character that cannot appear as-is inside a
+# quoted-string: anything outside the printable range ' '..'~', plus the
+# double quote and the backslash.
+_QUOTED_CHAR_RE = re.compile(r'[^\ -\~]|["\\]')
+def _replaceQuotedChar(match, _Q=_QUOTED_MAP):
+    """Helper function for re.sub: given a match for one character,
+       return that character's quoted-string representation from <_Q>.
+    """
+    return _Q[match.group(0)]
+
+def _encodeQuoted(s, _Q=_QUOTED_MAP):
+    """
+    Return <s> as a double-quoted string.  Every character matched by
+    _QUOTED_CHAR_RE is replaced by its entry in _QUOTED_MAP.
+
+    >>> _encodeQuoted("")
+    '""'
+    >>> _encodeQuoted("Hello world")
+    '"Hello world"'
+    >>> print _encodeQuoted("Hello \xff\b")
+    "Hello \\xff\\b"
+    """
+    # Substituting through a regex is slower when lots of the string
+    # needs quoting, but faster when only a little of it does.  Once
+    # more than about 1/4 of the characters need quoting, the
+    # commented-out version at the bottom is faster.  Yes, this is a
+    # stupid overoptimization.
+    escaped = _QUOTED_CHAR_RE.sub(_replaceQuotedChar, s)
+    return '"%s"' % escaped
+
+    #return '"%s"'%("".join(map(_QUOTED_MAP.__getitem__, s)))
+
+def _encodeRaw(s):
+    """
+    Return <s> in the "raw" format used for canonical encodings: the
+    decimal length of <s>, a colon, then <s> itself unmodified.
+
+    >>> _encodeRaw("")
+    '0:'
+    >>> _encodeRaw(" ")
+    '1: '
+    >>> _encodeRaw(" \\n")
+    '2: \\n'
+    """
+    length = len(s)
+    return "%d:%s" % (length, s)
+
+# Pattern for a token: one character that is a letter or one of
+# - . / _ : * + = followed by any number of characters that are letters,
+# digits, or one of those same punctuation characters.  The pattern is
+# not anchored at its end.
+_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*"
+
+_TOKEN_RE = re.compile(_TOKEN_PAT)
+def _writeToken(write,s):
+    """Return a string in the token (unencoded) format.  Only works for
+       strings that consist, in their entirety, of a single token.
+
+       <write> is accepted for interface compatibility but is not used;
+       the encoded token is returned rather than written.
+
+       Raises AssertionError if <s> is not a token.
+    """
+    # _TOKEN_RE.match() only anchors at the start of the string, so on its
+    # own it would also accept strings that merely begin with a token
+    # (such as "ab c").  Require the match to cover all of <s>.
+    m = _TOKEN_RE.match(s)
+    assert m is not None and m.end() == len(s)
+    return s
+
+def _encodeCleanest(s, indent=0):
+    """Encode s in whatever format seems most human-readable.
+
+       If the whole of <s> is a token, it is returned unchanged.
+       Otherwise, the characters of <s> whose _QUOTED_MAP entry differs
+       from the character itself are counted.  If there are more than 3
+       of them and they make up more than a quarter of <s>, the string
+       is encoded with _encodeBase64 (when longer than 16 characters) or
+       _encodeHex; in every other case it is encoded with _encodeQuoted.
+
+       <indent> is the current indentation level; continuation lines of
+       a multi-line base64 encoding are indented by indent+1 spaces.
+    """
+    # _TOKEN_RE.match() only anchors at the start of the string; require
+    # the match to cover all of <s> so that strings which merely begin
+    # with a token (such as "ab c") are not emitted unquoted.
+    m = _TOKEN_RE.match(s)
+    if m is not None and m.end() == len(s):
+        return s
+
+    n = 0
+    for ch in s:
+        if _QUOTED_MAP[ch] != ch:
+            n += 1
+
+    if n > 3 and n > len(s)//4:
+        if len(s) > 16:
+            # The indentation belongs after each embedded newline, so
+            # that it lands at the start of the continuation line.
+            return _encodeBase64(s).replace("\n", "\n"+" "*(indent+1))
+        else:
+            return _encodeHex(s)
+    else:
+        return _encodeQuoted(s)
+
+def _encodePrettyPrint(s, write, indent=0, niceWidth=80):
+ """Pass a human-readable encoding of <s> to the function <write>.
+ A str is written using _encodeCleanest; an empty sequence is
+ written as "()".
+
+ NOTE(review): the handling of non-empty sequences looks unfinished;
+ see the comments below.  <niceWidth> is not used in this function.
+ """
+ if isinstance(s, str):
+ # NOTE(review): <indent> is not forwarded to _encodeCleanest here, so
+ # it is called with its default indent -- confirm that is intended.
+ write(_encodeCleanest(s))
+ return
+ elif len(s) == 0:
+ write("()")
+ return
+
+ # NOTE(review): <parts> is assembled below but never passed to <write>
+ # or returned, so nothing is emitted for a non-empty sequence.
+ if isinstance(s[0], str):
+ # NOTE(review): the test above is on s[0], but the whole sequence <s>
+ # is handed to _encodeCleanest -- presumably s[0] was meant; confirm.
+ parts = [ " "*indent, "(", _encodeCleanest(s), "\n" ]
+ else:
+ parts = [ "(" ]
+
+def _encodeCanonical(rep, append):
+    """Encode the s-expression <rep> in canonical format, handing each
+       piece of the output to the function "append" as soon as it is
+       produced.  Strings are emitted in raw format; sequences are
+       wrapped in parentheses.
+    """
+    if isinstance(rep, str):
+        append(_encodeRaw(rep))
+        return
+
+    append("(")
+
+    # Nested sequences are walked with an explicit stack instead of
+    # recursion.  Each entry is a (sequence, index to resume at) pair for
+    # a sequence we have descended out of.
+    pending = []
+    current, pos = rep, 0
+    while True:
+        # Close every sequence that has been fully consumed, resuming
+        # its parent each time; we are done once no parent is left.
+        while pos == len(current):
+            append(")")
+            if not pending:
+                return
+            current, pos = pending.pop()
+
+        item = current[pos]
+        if isinstance(item, str):
+            append(_encodeRaw(item))
+            pos += 1
+        else:
+            # Descend into the nested sequence, remembering where to
+            # pick up in the enclosing one.
+            pending.append((current, pos + 1))
+            current, pos = item, 0
+            append("(")
+
+def encode_canonical(rep):
+    """Encode the s-expression <rep> in canonical format and return the
+       result as a single string.
+
+    >>> encode_canonical("abc")
+    '3:abc'
+    >>> encode_canonical(["a"])
+    '(1:a)'
+    >>> encode_canonical(["a", "bc"])
+    '(1:a2:bc)'
+    >>> encode_canonical([[["X", "ab c"]], "d"])
+    '(((1:X4:ab c))1:d)'
+    """
+    # Collect the pieces as they are generated, then join them once.
+    pieces = []
+    collect = pieces.append
+    _encodeCanonical(rep, collect)
+    return "".join(pieces)
+
+def hash_canonical(rep, hashobj):
+    """Feed the canonical encoding of the s-expression <rep> into the
+       hashlib hash object <hashobj>, piece by piece, by calling its
+       update() method.
+
+    >>> import hashlib
+    >>> s = hashlib.sha256()
+    >>> s.update("(3:abc(6:hello 5:world)(1:9))")
+    >>> s.hexdigest()
+    '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397'
+    >>> s = hashlib.sha256()
+    >>> hash_canonical(["abc", ["hello ", "world"], ["9"] ], s)
+    >>> s.hexdigest()
+    '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397'
+    """
+    feed = hashobj.update
+    _encodeCanonical(rep, feed)
+
+def _encodePretty(rep, append, indent_step=2, niceWidth=80):
+ """Walk the s-expression <rep> and pass the pieces of an indented,
+ multi-line encoding to the function <append>.  Each nested sequence
+ is started on a new line, indented <indent_step> spaces deeper than
+ the sequence that contains it.  <niceWidth> is only forwarded to
+ _encodePrettyPrint.
+
+ NOTE(review): unlike _encodeCanonical, there is no special case for
+ <rep> being a str; "(" is appended unconditionally -- confirm that
+ callers only pass sequences.
+ """
+ # Entries on <stack> are (sequence, index to resume at) pairs for the
+ # sequences that enclose the one currently being walked.
+ stack = []
+ idx = 0
+ indent = 0
+ append("(")
+ pop = stack.pop
+ push = stack.append
+
+ while 1:
+ # The current sequence is exhausted: close it and resume its parent.
+ while idx == len(rep):
+ append(")")
+ indent -= indent_step
+ try:
+ rep,idx = pop()
+ except IndexError:
+ # No parent left: the outermost sequence has just been closed.
+ append("\n")
+ return
+ # NOTE(review): indentation was lost in this listing; this "else" is
+ # presumably the else-clause of the try above (run after a successful
+ # pop) rather than of the inner while loop -- confirm against the
+ # original file.
+ else:
+ append(" ")
+ if isinstance(rep[idx], str):
+ _encodePrettyPrint(rep[idx], append, indent, niceWidth)
+ idx += 1
+ # Separate this string from whatever follows it in the sequence.
+ if idx < len(rep):
+ append(" ")
+ continue
+ # Nested sequence: remember where to resume in the parent, descend.
+ push((rep,idx+1))
+ rep = rep[idx]
+ idx = 0
+ indent += indent_step
+ append("\n%s("%(" "*indent))
+
+