diff options
-rw-r--r-- | lib/sexp/__init__.py | 2 | ||||
-rw-r--r-- | lib/sexp/access.py | 373 | ||||
-rw-r--r-- | lib/sexp/encode.py | 223 | ||||
-rw-r--r-- | lib/sexp/parse.py | 210 | ||||
-rw-r--r-- | lib/sexp/tests.py | 31 | ||||
-rw-r--r-- | specs/U1-automatic-updates.txt | 377 | ||||
-rw-r--r-- | specs/U2-formats.txt | 430 |
7 files changed, 1646 insertions, 0 deletions
diff --git a/lib/sexp/__init__.py b/lib/sexp/__init__.py new file mode 100644 index 0000000..1ccccc3 --- /dev/null +++ b/lib/sexp/__init__.py @@ -0,0 +1,2 @@ + +__all__ = [ 'parse', 'encode', 'access', 'tests' ] diff --git a/lib/sexp/access.py b/lib/sexp/access.py new file mode 100644 index 0000000..d9b40dc --- /dev/null +++ b/lib/sexp/access.py @@ -0,0 +1,373 @@ + +import re +import sys + +def s_tag(s): + """ + >>> s_tag("a string") is None + True + >>> s_tag(["a-tagged", "list"]) + 'a-tagged' + >>> s_tag([["untagged"], "list"]) is None + True + """ + if len(s) and not isinstance(s, str) and isinstance(s[0],str): + return s[0] + else: + return None + +def s_child(s, tag): + for child in s: + if s_tag(child) == tag: + return child + return None + +def s_children(s, tag): + return (ch for ch in s if s_tag(ch) == tag) + +def s_descendants(s, tags=()): + stack = [ ] + push = stack.append + pop = stack.pop + + idx = 0 + while 1: + while idx == len(s): + try: + s, idx = pop() + except IndexError: + return + if isinstance(s[idx], str): + idx += 1 + continue + if s_tag(s[idx]) in tags: + yield s[idx] + + +class SExpr(list): + def __init__(self, stuff=()): + list.__init__(self, stuff) + self._d = None + + def __getattr__(self, item): + if self._d is None: self._buildDict() + return self[self._d[item]] + + def __getitem__(self, idx): + item = list.__getitem__(self, idx) + if type(item) in (list, tuple): #exact match only. + item = self[idx] = SExpr(item) + return item + + def _buildDict(self): + self._d = d = {} + for idx in xrange(len(self)): + item = list.__getitem__(self, idx) + t = s_tag(item) + if t is not None: + d[t] = idx + +def _s_lookup_all(s, path, callback): + + # XXXX: Watch out; ** gets pretty heavy pretty fast. + + if isinstance(path, str): + path = path.split(".") + + if len(path) == 0: + callback(s) + return + + for p_idx in xrange(len(path)): + p_item = path[p_idx] + + if p_item == '*': + for ch in s: + if not isinstance(ch, str): + _s_lookup_all(s, path[p_idx+1:], callback) + return + elif p_item == '**': + for ch in s_descendants(s): + if not isinstance(ch, str): + _s_lookup_all(s, path[p_idx+1:], callback) + return + elif p_item.startswith('**'): + for ch in s_descendants(s): + if s_tag(ch) == p_item[2:]: + _s_lookup_all(s, path[p_idx+1:], callback) + else: + s = s_child(s, p_item) + if s is None: + return + + callback(s) + +def s_lookup_all(s, path): + result = [] + _s_lookup_all(s, path, result.append) + return result + +def s_lookup(s, path): + r = s_lookup_all(s, path) + if len(r): + return r[0] + return None + +class _Singleton: + def isSingleton(self): + return True + def matches(self, item): + raise NotImplemented() + + def clear(self): + self._got = False + def matchItem(self, item): + if not self._got and self.matches(item): + self._got = True + return True + else: + return False + def isSatisfied(self): + return self._got + +class _Span: + def isSingleton(self): + return False + def clear(self): + pass + def matchItem(self, item): + raise NotImplemented() + def isSatisfied(self): + raise NotImplemented() + +class _AnyItem(_Singleton): + def matches(self,item): + return True + def rep(self): + return "?" +class _AnyString(_Singleton): + def matches(self,item): + return isinstance(item, str) + def rep(self): + return "." +class _ExactString(_Singleton): + def __init__(self, s): + self._s = s + def matches(self, item): + return item == self._s + def rep(self): + return "=%s"%self._s +class _ReString(_Singleton): + def __init__(self, s, regex=None): + if regex is None: + regex = re.compile(s) + self._re = regex + self._s = s + def matches(self, item): + if not isinstance(item, str): + return False + m = self._re.match(item) + return m and m.end() == len(item) + def rep(self): + return "%s"%self._s +class _List(_Singleton): + def __init__(self, subpatterns): + self._pats = subpatterns + def clear(self): + _Singleton.clear(self) + for p in self._pats: + p.clear() + def matches(self, item): + if isinstance(item, str): + return False + + i_idx = 0 + pat_idx = 0 + while i_idx < len(item): + try: + subpat = self._pats[pat_idx] + except: + return False # Too many items. + + if subpat.isSingleton(): + if not subpat.matches(item[i_idx]): + return False + i_idx += 1 + pat_idx += 1 + else: + subpat.clear() + while i_idx < len(item) and subpat.matchItem(item[i_idx]): + i_idx += 1 + if not subpat.isSatisfied(): + return False + pat_idx += 1 + + # Out of items, but we have more patterns. Make sure they all accept + # 0 items. + if pat_idx < len(self._pats): + for subpat in self._pats[pat_idx:]: + subpat.clear() + if not subpat.isSatisfied(): + return False + return True + + def rep(self): + return [ p.rep() for p in self._pats ] + +class _AnyItems(_Span): + def matchItem(self, item): + return True + def isSatisfied(self): + return True + def rep(self): + return "*" + +class _NMatches(_Span): + def __init__(self, alternatives, lo, hi): + self.lo = lo + self.hi = hi + self.count = 0 + self.alternatives = alternatives + for a in alternatives: + if not a.isSingleton(): + raise SchemaFormatError("Nexted span inside span") + def clear(self): + self.count = 0 + for p in self.alternatives: + p.clear() + def matchItem(self, item): + if self.count == self.hi: + return False + for p in self.alternatives: + if p.matches(item): + self.count += 1 + return True + return False + def isSatisfied(self): + return self.lo <= self.count <= self.hi + + def rep(self): + name = { (1,1): ":oneof", + (0,1): ":maybe", + (0,sys.maxint): ":anyof", + (1,sys.maxint): ":someof" }.get((self.lo, self.hi)) + if name is None: + name = ":%d-%d"%(self.lo, self.hi) + result = [ name ] + result.extend(p.rep() for p in self.alternatives) + return result + +class _Unordered(_Span): + def __init__(self, alternatives): + self.alternatives = alternatives + def clear(self): + for p in self.alternatives: + p.clear() + def matchItem(self, item): + for p in self.alternatives: + if p.matchItem(item): + return True + return False + def isSatisfied(self): + for p in self.alternatives: + if not p.isSatisfied(): + return False + return True + def rep(self): + result = [ ":unordered" ] + result.extend(p.rep() for p in self.alternatives) + return result + +class SchemaFormatError(Exception): + pass + +_RE_PAT = re.compile(r'/((?:[\\.]|[^\\/]+)*)/([ilmstx]*)', re.I) + +def parseSchema(s, table): + + if isinstance(s, str): + if not len(s): + raise SchemaFormatError("Empty string encountered") + if s == '*': + return _AnyItems() + elif s == '?': + return _AnyItem() + elif s == '.': + return _AnyString() + elif s.startswith('='): + return _ExactString(s[1:]) + elif s.startswith('.'): + try: + return table[s[1:]] + except KeyError: + raise SchemaFormatError("Unknown reference %s"%s) + else: + m = _RE_PAT.match(s) + if m: + flags = 0 + for char in m.group(2): + flags |= { "i":re.I, "l":re.L, "m":re.M, "s":re.S, + "t":re.T, "x":re.X }[char.lower()] + try: + p = re.compile(m.group(1), flags) + except re.error, e: + raise SchemaFormatError("Couldn't compile %s"%s) + + return _ReString(s, p) + + raise SchemaFormatError("Confusing entry %s"%s) + elif len(s) and isinstance(s[0], str) and s[0].startswith(':'): + tag = s[0] + + m = re.match(r'\:(\d*)(\-\d*)?$', tag) + if m: + g = m.groups() + if g[0]: + lo = int(g[0], 10) + else: + lo = 0 + if g[1]: + if len(g[1]) > 1: + hi = int(g[1][1:], 10) + else: + hi = sys.maxint + else: + hi = lo + else: + try: + lo,hi = { ":maybe": (0,1), + ":oneof": (1,1), + ":anyof": (0,sys.maxint), + ":someof":(1,sys.maxint), + ":unordered": (None, None) }[tag] + except KeyError: + raise SchemaFormatError("Unknown tag %s"%tag) + + subitems = [ parseSchema(i, table) for i in s[1:] ] + if lo is not None: + return _NMatches(subitems, lo, hi) + else: + return _Unordered(subitems) + else: + return _List([ parseSchema(i, table) for i in s ]) + +# Schema format: +# "=string" matches a string itself. +# "*" matches any number of items and can occur only at the end of a list. +# "?" matches a single item. +# "." matches any string. +# "/re/" matches a regex. +# ".name" matches a named schema +# +# (i1 i2 i3) matches a list of i1 followed by i2 followed by i3. +# +# (:maybe i1) matches zero or one of i1. +# (:oneof i1 i2 i3) matches any one of the items i1, i2, i3. +# (:anyof i1 i2 i3) matches zero or more of the items i1, i2, i3. +# (:someof i1 i2 i3) matches one or more of i1, i2, and i3. +# +# (:unordered i1 i2 i3) matches all of i1, i2, and i3, in any order. +# +# The matching algorithm is a stupid greedy algorithm. If you need to +# check stuff it can't handle, write a new thing. + diff --git a/lib/sexp/encode.py b/lib/sexp/encode.py new file mode 100644 index 0000000..1df6406 --- /dev/null +++ b/lib/sexp/encode.py @@ -0,0 +1,223 @@ + + +import base64 +import binascii +import re +import hashlib + +def _encodeHex(s): + """ + Encode a string in hex format. + + >>> _encodeHex("Hello world") + '#48656c6c6f20776f726c64#' + >>> _encodeHex("") + '##' + """ + return "#%s#"%binascii.b2a_hex(s) + +def _encodeBase64(s): + """ + Encode a string in base64 format, with embedded newlines. + + >>> _encodeBase64("") + '||' + >>> _encodeBase64("Hello world") + '|SGVsbG8gd29ybGQ=|' + >>> print _encodeBase64("Hello world") + |SGVsbG8gd29ybGQ=| + >>> _encodeBase64("Good night, sweet prince! A flock of angels " + ... "sing thee to thy rest") + '|R29vZCBuaWdodCwgc3dlZXQgcHJpbmNlISBBIGZsb2NrIG9mIGFuZ2VscyBzaW5nIHRoZWUgdG8g\\ndGh5IHJlc3Q=|' + + """ + return "|%s|"%base64.encodestring(s).strip() + +# Map from a character value to its representation in a quoted-string. +_QUOTED_MAP = { '\b' : "\\b", + '\t' : "\\t", + '\v' : "\\v", + '\n' : "\\n", + '\f' : "\\f", + '\r' : "\\r", + '"' : "\"", + '\b' : "\\b", + '\\' : "\\", } +for x in xrange(256): + if 32 <= x <= 126: + _QUOTED_MAP[chr(x)] = chr(x) + elif not _QUOTED_MAP.has_key(chr(x)): + _QUOTED_MAP[chr(x)] = "\\x%02x"%x +del x + + +_QUOTED_CHAR_RE = re.compile(r'[^\ -\~]') +def _replaceQuotedChar(match, _Q=_QUOTED_MAP): + """Helper function for replacing .""" + return _Q[match.group(0)] + +def _encodeQuoted(s, _Q=_QUOTED_MAP): + """ + >>> _encodeQuoted("") + '""' + >>> _encodeQuoted("Hello world") + '"Hello world"' + >>> print _encodeQuoted("Hello \xff\b") + "Hello \\xff\\b" + """ + # This implementation is a slower for the case where lots of stuff + # needs quoting, but faster for the case where only some stuff + # needs quoting. If more than about 1/4 of the characters need + # quoting, then the commented-out version below is faster. Yes, + # this is a stupid overoptimization. + return '"%s"'%(_QUOTED_CHAR_RE.sub(_replaceQuotedChar, s)) + + #return '"%s"'%("".join(map(_QUOTED_MAP.__getitem__, s))) + +def _encodeRaw(s): + """ + Encode a string in the "raw" format used for canonical encodings. + + >>> _encodeRaw("") + '0:' + >>> _encodeRaw(" ") + '1: ' + >>> _encodeRaw(" \\n") + '2: \\n' + """ + return "%d:%s"%(len(s),s) + +_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*" + +_TOKEN_RE = re.compile(_TOKEN_PAT) +def _writeToken(write,s): + """Write a string in the token (unencoded) format. Only works for strings + matching _TOKEN_RE. + """ + assert _TOKEN_RE.match(s) + return s + +def _encodeCleanest(s, indent=0): + """Encode s in whatever format seems most human-readable.""" + + if _TOKEN_RE.match(s): + return s + n = 0 + for ch in s: + if _QUOTED_MAP[ch] != ch: + n += 1 + if n > 3 and n > len(s)//4: + if len(s) > 16: + return _encodeBase64(s).replace("\n", " "*(indent+1)+"\n") + else: + return _encodeHex(s) + else: + return _encodeQuoted(s) + +def _encodePrettyPrint(s, write, indent=0, niceWidth=80): + if isinstance(s, str): + write(_encodeCleanest(s)) + return + elif len(s) == 0: + write("()") + return + + if isinstance(s[0], str): + parts = [ " "*indent, "(", _encodeCleanest(s), "\n" ] + else: + parts = [ "(" ] + +def _encodeCanonical(rep, append): + """Given an s-expression in <rep>, encode it in canonical format, + passing each part to the function "append" as it is done. + """ + if isinstance(rep, str): + append(_encodeRaw(rep)) + return + + append("(") + + stack = [ ] + push = stack.append + pop = stack.pop + idx = 0 + while 1: + while idx == len(rep): + append(")") + try: + rep,idx = pop() + except IndexError: + return + if isinstance(rep[idx], str): + append(_encodeRaw(rep[idx])) + idx += 1 + continue + push((rep,idx+1)) + rep = rep[idx] + idx = 0 + append("(") + +def encode_canonical(rep): + """Return the canonical encoding of the s-expression <rep>. + + >>> encode_canonical("abc") + '3:abc' + >>> encode_canonical(["a"]) + '(1:a)' + >>> encode_canonical(["a", "bc"]) + '(1:a2:bc)' + >>> encode_canonical([[["X", "ab c"]], "d"]) + '(((1:X4:ab c))1:d)' + """ + parts = [] + _encodeCanonical(rep, parts.append) + return "".join(parts) + +def hash_canonical(rep, hashobj): + """Given a hashlib hash object <hashobj>, adds the canonical + encoding of the s-expression <rep> to hashobj. + + >>> import hashlib + >>> s = hashlib.sha256() + >>> s.update("(3:abc(6:hello 5:world)(1:9))") + >>> s.hexdigest() + '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' + >>> s = hashlib.sha256() + >>> hash_canonical(["abc", ["hello ", "world"], ["9"] ], s) + >>> s.hexdigest() + '43f7726155f2700ff0d84240f3aaa9e5a1ee2e2c9e4702f7ac3ebcd45fd2f397' + """ + _encodeCanonical(rep, hashobj.update) + +def _encodePretty(rep, append, indent_step=2, niceWidth=80): + stack = [] + idx = 0 + indent = 0 + append("(") + pop = stack.pop + push = stack.append + + while 1: + while idx == len(rep): + append(")") + indent -= indent_step + try: + rep,idx = pop() + except IndexError: + append("\n") + return + else: + append(" ") + if isinstance(rep[idx], str): + _encodePrettyPrint(rep[idx], append, indent, niceWidth) + idx += 1 + if idx < len(rep): + append(" ") + continue + push((rep,idx+1)) + rep = rep[idx] + idx = 0 + indent += indent_step + append("\n%s("%(" "*indent)) + + diff --git a/lib/sexp/parse.py b/lib/sexp/parse.py new file mode 100644 index 0000000..a33b79a --- /dev/null +++ b/lib/sexp/parse.py @@ -0,0 +1,210 @@ + +import re +import base64 +import binascii +import re + +# Partial implementation of Rivest's proposed S-Expressions standard +# as documented at +# http://people.csail.mit.edu/rivest/Sexp.txt +# +# It's slightly optimized. +# +# Not implemented: +# [display hints] +# {basic transport} + +__all__ = [ 'FormatError', 'parse' ] + +class FormatError(Exception): + """Raised when parsing fails.""" + pass + +_TOKEN_PAT = r"[a-zA-Z\-\.\/\_\:\*\+\=][a-zA-Z0-9\-\.\/\_\:\*\+\=]*" +# Regular expression to match a single lexeme from an encode s-expression. +_LEXEME_START_RE = re.compile( + r""" \s* (?: (%s) | # Grp 0: A token. + ([0-9]*(?: [\:\|\#\{] | # Grp1 : start of string... + \"(?:[^\\\"]+|\\.)*\")) | # or qstring. + ([\(\)]) # Grp 2: a paren of some kind. + )""" + %_TOKEN_PAT,re.X|re.M|re.S) + +class _P: + """Helper class for parenthesis tokens.""" + def __init__(self, val): + self.val = val + def __repr__(self): + return "_P(%r)"%self.val + +_OPEN_PAREN = _P("(") +_CLOSE_PAREN = _P(")") +del _P +_SPACE_RE = re.compile(r'\s+') + +# Matches all characters in a string that we need to unquote. +_UNQUOTE_CHAR_RE = re.compile(r''' + \\ (?: [abtnvfr] | \r \n ? | \n \r ? | [xX] [A-Fa-f0-9]{2} | [0-8]{1,3} ) + ''') + +# Map from quoted representation to unquoted format. +_UNQUOTE_CHAR_MAP = { 'a': '\a', + 'b': '\b', + 't': '\t', + 'n': '\n', + 'v': '\v', + 'f': '\f', + 'r': '\r' } +def _unquoteChar(ch, _U=_UNQUOTE_CHAR_MAP): + ch = ch[1:] + try: + return _U[ch] + except KeyError: + pass + if ch[0] in "\n\r": + return "" + elif ch[0] in 'xX': + return chr(int(ch[1:], 16)) + else: + i = int(ch[1:], 8) + if i >= 256: + raise FormatError("Octal character format out of range.") + return chr(i) + +def _lexItems(s): + """Generator that iterates over the lexical items in an encoded + s-expression. Yields a string for strings, or the special objects + _OPEN_PAREN and _CLOSE_PAREN. + + >>> list(_lexItems('(4:a)b hello) (world 1:a 0: ')) + [_P('('), 'a)b ', 'hello', _P(')'), _P('('), 'world', 'a', ''] + + >>> list(_lexItems('a b-c 1#20#2#2061##686877# |aGVsbG8gd29ybGQ|')) + ['a', 'b-c', ' ', ' a', 'hhw', 'hello world'] + + >>> list(_lexItems('#2 0# |aGVs\\nbG8 gd29yb GQ| ')) + [' ', 'hello world'] + + >>> list(_lexItems('|YWJjZA==| x |YWJjZA| 3|Y W J j|')) + ['abcd', 'x', 'abcd', 'abc'] + + >>> list(_lexItems('("1""234""hello world" 3"abc" 4" " )')) + [_P('('), '1', '234', 'hello world', 'abc', ' ', _P(')')] + + """ + s = s.strip() + while s: + m = _LEXEME_START_RE.match(s) + if not m: + raise FormatError("No pattern match at %r"%s[:30]) + g = m.groups() + if g[2]: + if g[2] == "(": + yield _OPEN_PAREN + else: + yield _CLOSE_PAREN + s = s[m.end():] + elif g[0]: + # we have a token. Go with that. + yield g[0] + s = s[m.end():] + else: + assert g[1] + lastChar = g[1][-1] + if lastChar == '"': + qidx = g[1].index('"') + quoted = g[1][qidx+1:-1] # All but quotes. + data = _UNQUOTE_CHAR_RE.sub(_unquoteChar, quoted) + if qidx != 0: + num = int(g[1][:qidx], 10) + if num != len(data): + raise FormatError("Bad length on quoted string") + yield data + s = s[m.end():] + continue + + num = g[1][:-1] + if len(num): + num = int(num, 10) + else: + num = None + + if lastChar == ':': + if num is None: + raise FormatError() + s = s[m.end():] + if len(s) < num: + raise FormatError() + yield s[:num] + s = s[num:] + elif lastChar == '#': + s = s[m.end():] + try: + nextHash = s.index('#') + except ValueError: + raise FormatError("Unterminated # string") + dataStr = _SPACE_RE.sub("", s[:nextHash]) + try: + data = binascii.a2b_hex(dataStr) + except TypeError: + raise FormatError("Bad hex string") + if num is not None and len(data) != num: + raise FormatError("Bad number on hex string") + yield data + s = s[nextHash+1:] + elif lastChar == '|': + s = s[m.end():] + try: + nextBar = s.index('|') + except ValueError: + raise FormatError("Unterminated | string") + dataStr = _SPACE_RE.sub("", s[:nextBar]) + # Re-pad. + mod = len(dataStr) % 4 + if mod: + dataStr += "=" * (4 - mod) + try: + data = binascii.a2b_base64(dataStr) + except TypeError: + raise FormatError("Bad base64 string") + if num is not None and len(data) != num: + raise FormatError("Bad number on base64 string") + yield data + s = s[nextBar+1:] + else: + assert None + +def parse(s): + """ + >>> parse("()") + [] + >>> parse("(1:X3:abc1:d)") + ['X', 'abc', 'd'] + >>> parse("(1:X((3:abc))1:d)") + ['X', [['abc']], 'd'] + >>> parse("(a b (d\\ne f) (g) #ff00ff# |aGVsbG8gd29ybGQ|)") + ['a', 'b', ['d', 'e', 'f'], ['g'], '\\xff\\x00\\xff', 'hello world'] + + """ + outermost = [] + stack = [ ] + push = stack.append + pop = stack.pop + add = outermost.append + + for item in _lexItems(s): + if item is _OPEN_PAREN: + next = [] + add(next) + push(add) + add = next.append + elif item is _CLOSE_PAREN: + add = pop() + else: + # it's a string. + add(item) + + if len(outermost) != 1: + raise FormatError("No enclosing parenthesis on list") + return outermost[0] + diff --git a/lib/sexp/tests.py b/lib/sexp/tests.py new file mode 100644 index 0000000..7decd03 --- /dev/null +++ b/lib/sexp/tests.py @@ -0,0 +1,31 @@ + +import unittest +import doctest + +import sexp.parse +import sexp.access +import sexp.encode + +class EncodingTest(unittest.TestCase): + def testQuotedString(self): + self.assertEquals(1,1) + + + +def suite(): + import sexp.tests + suite = unittest.TestSuite() + + suite.addTest(doctest.DocTestSuite(sexp.encode)) + suite.addTest(doctest.DocTestSuite(sexp.parse)) + suite.addTest(doctest.DocTestSuite(sexp.access)) + + loader = unittest.TestLoader() + suite.addTest(loader.loadTestsFromModule(sexp.tests)) + + return suite + + +if __name__ == '__main__': + + unittest.TextTestRunner(verbosity=1).run(suite()) diff --git a/specs/U1-automatic-updates.txt b/specs/U1-automatic-updates.txt new file mode 100644 index 0000000..b234748 --- /dev/null +++ b/specs/U1-automatic-updates.txt @@ -0,0 +1,377 @@ + +Filename: xxx-automatic-updates.txt +Title: Automatic Software Update Protocol +Version: $Revision$ +Last-Modified: $Date$ +Author: Matt Edman +Created: 30-July-2008 +Status: Open + + +Scope + + This proposal specifies the method by which an automatic update client can + determine the most recent recommended Tor installation package for the + user's platform, download the package, and then verify that the package was + downloaded successfully. While this proposal focuses on only the Tor + software, the protocol defined is sufficiently extensible such that other + components of the Tor bundles, like Vidalia, Polipo, and Torbutton, can be + managed and updated by the automatic update client as well. + + The initial target platform for the automatic update framework is Windows, + given that's the platform used by a majority of our users and that it lacks + a sane package management system that many Linux distributions already have. + Our second target platform will be Mac OS X, and so the protocol will be + designed with this near-future direction in mind. + + Other client-side aspects of the automatic update process, such as user + interaction, the interface presented, and actual package installation + procedure, are outside the scope of this proposal. + + +Motivation + + Tor releases new versions frequently, often with important security, + anonymity, and stability fixes. Thus, it is important for users to be able + to promptly recognize when new versions are available and to easily + download, authenticate, and install updated Tor and Tor-related software + packages. + + Tor's control protocol [2] provides a method by which controllers can + identify when the user's Tor software is obsolete or otherwise no longer + recommended. Currently, however, no mechanism exists for clients to + automatically download and install updated Tor and Tor-related software for + the user. + + +Design Overview + + The core of the automatic update framework is a well-defined file called a + "recommended-packages" file. The recommended-packages file is accessible via + HTTP[S] at one or more well-defined URLs. An example recommended-packages + URL may be: + + https://updates.torproject.org/recommended-packages + + The recommended-packages document is formatted according to Section 1.2 + below and specifies the most recent recommended installation package + versions for Tor or Tor-related software, as well as URLs at which the + packages and their signatures can be downloaded. + + An automatic update client process runs on the Tor user's computer and + periodically retrieves the recommended-packages file according to the method + described in Section 2.0. As described further in Section 1.2, the + recommended-packages file is signed and can be verified by the automatic + update client with one or more public keys included in the client software. + Since it is signed, the recommended-packages file can be mirrored by + multiple hosts (e.g., Tor directory authorities), whose URLs are included in + the automatic update client's configuration. + + After retrieving and verifying the recommended-packages file, the automatic + update client compares the versions of the recommended software packages + listed in the file with those currently installed on the end-user's + computer. If one or more of the installed packages is determined to be out + of date, an updated package and its signature will be downloaded from one of + the package URLs listed in the recommended-packages file as described in + Section 2.2. + + The automatic update system uses a multilevel signing key scheme for package + signatures. There are a small number of entities we call "packaging + authorities" that each have their own signing key. A packaging authority is + responsible for signing and publishing the recommended-packages file. + Additionally, each individual packager responsible for producing an + installation package for one or more platforms has their own signing key. + Every packager's signing key must be signed by at least one of the packaging + authority keys. + + +Specification + + 1. recommended-packages Specification + + In this section we formally specify the format of the published + recommended-packages file. + + 1.1. Document Meta-format + + The recommended-packages document follows the lightweight extensible + information format defined in Tor's directory protocol specification [1]. In + the interest of self-containment, we have reproduced the relevant portions + of that format's specification in this Section. (Credits to Nick Mathewson + for much of the original format definition language.) + + The highest level object is a Document, which consists of one or more + Items. Every Item begins with a KeywordLine, followed by zero or more + Objects. A KeywordLine begins with a Keyword, optionally followed by + whitespace and more non-newline characters, and ends with a newline. A + Keyword is a sequence of one or more characters in the set [A-Za-z0-9-]. + An Object is a block of encoded data in pseudo-Open-PGP-style + armor. (cf. RFC 2440) + + More formally: + + Document ::= (Item | NL)+ + Item ::= KeywordLine Object* + KeywordLine ::= Keyword NL | Keyword WS ArgumentChar+ NL + Keyword ::= KeywordChar+ + KeywordChar ::= 'A' ... 'Z' | 'a' ... 'z' | '0' ... '9' | '-' + ArgumentChar ::= any printing ASCII character except NL. + WS ::= (SP | TAB)+ + Object ::= BeginLine Base-64-encoded-data EndLine + BeginLine ::= "-----BEGIN " Keyword "-----" NL + EndLine ::= "-----END " Keyword "-----" NL + + The BeginLine and EndLine of an Object must use the same keyword. + + In our Document description below, we also tag Items with a multiplicity in + brackets. Possible tags are: + + "At start, exactly once": These items MUST occur in every instance of the + document type, and MUST appear exactly once, and MUST be the first item in + their documents. + + "Exactly once": These items MUST occur exactly one time in every + instance of the document type. + + "Once or more": These items MUST occur at least once in any instance + of the document type, and MAY occur more than once. + + "At end, exactly once": These items MUST occur in every instance of + the document type, and MUST appear exactly once, and MUST be the + last item in their documents. + + 1.2. recommended-packages Document Format + + When interpreting a recommended-packages Document, software MUST ignore + any KeywordLine that starts with a keyword it doesn't recognize; future + implementations MUST NOT require current automatic update clients to + understand any KeywordLine not currently described. + + In lines that take multiple arguments, extra arguments SHOULD be + accepted and ignored. + + The currently defined Items contained in a recommended-packages document + are: + + "recommended-packages-format" SP number NL + + [Exactly once] + + This Item specifies the version of the recommended-packages format that + is contained in the subsequent document. The version defined in this + proposal is version "1". Subsequent iterations of this protocol MUST + increment this value if they introduce incompatible changes to the + document format and MAY increment this value if they only introduce + additional Keywords. + + "published" SP YYYY-MM-DD SP HH:MM:SS NL + + [Exactly once] + + The time, in GMT, when this recommended-packages document was generated. + Automatic update clients SHOULD ignore Documents over 60 days old. + + "tor-stable-win32-version" SP TorVersion NL + + [Exactly once] + + This keyword specifies the latest recommended release of Tor's "stable" + branch for the Windows platform that has an installation package + available. Note that this version does not necessarily correspond to the + most recently tagged stable Tor version, since that version may not yet + have an installer package available, or may have known issues on + Windows. + + The TorVersion field is formatted according to Section 2 of Tor's + version specification [3]. + + "tor-stable-win32-package" SP Url NL + + [Once or more] + + This Item specifies the location from which the most recent + recommended Windows installation package for Tor's stable branch can be + downloaded. + + When this Item appears multiple times within the Document, automatic + update clients SHOULD select randomly from the available package + mirrors. + + "tor-dev-win32-version" SP TorVersion NL + + [Exactly once] + + This Item specifies the latest recommended release of Tor's + "development" branch for the Windows platform that has an installation + package available. The same caveats from the description of + "tor-stable-win32-version" also apply to this keyword. + + The TorVersion field is formatted according to Section 2 of Tor's + version specification [3]. + + "tor-dev-win32-package" SP Url NL + + [Once or more] + + This Item specifies the location from which the most recent recommended + Windows installation package and its signature for Tor's development + branch can be downloaded. + + When this Keyword appears multiple times within the Document, automatic + update clients SHOULD select randomly from the available package + mirrors. + + "signature" NL SIGNATURE NL + + [At end, exactly once] + + The "SIGNATURE" Object contains a PGP signature (using a packaging + authority signing key) of the entire document, taken from the beginning + of the "recommended-packages-format" keyword, through the newline after + the "signature" Keyword. + + + 2. Automatic Update Client Behavior + + The client-side component of the automatic update framework is an + application that runs on the end-user's machine. It is responsible for + fetching and verifying a recommended-packages document, as well as + downloading, verifying, and subsequently installing any necessary updated + software packages. + + 2.1. Download and verify a recommended-packages document + + The first step in the automatic update process is for the client to download + a copy of the recommended-packages file. The automatic update client + contains a (hardcoded and/or user-configurable) list of URLs from which it + will attempt to retrieve a recommended-packages file. + + Connections to each of the recommended-packages URLs SHOULD be attempted in + the following order: + + 1) HTTPS over Tor + 2) HTTP over Tor + 3) Direct HTTPS + 4) Direct HTTP + + If the client fails to retrieve a recommended-packages document via any of + the above connection methods from any of the configured URLs, the client + SHOULD retry its download attempts following an exponential back-off + algorithm. After the first failed attempt, the client SHOULD delay one hour + before attempting again, up to a maximum of 24 hours delay between retry + attempts. + + After successfully downloading a recommended-packages file, the automatic + update client will verify the signature using one of the public keys + distributed with the client software. If more than one recommended-packages + file is downloaded and verified, the file with the most recent "published" + date that is verified will be retained and the rest discarded. + + 2.2. Download and verify the updated packages + + The automatic update client next compares the latest recommended package + version from the recommended-packages document with the currently installed + Tor version. If the user currently has installed a Tor version from Tor's + "development" branch, then the version specified in "tor-dev-*-version" Item + is used for comparison. Similarly, if the user currently has installed a Tor + version from Tor's "stable" branch, then the version specified in the + "tor-stable-*version" Item is used for comparison. Version comparisons are + done according to Tor's version specification [3]. + + If the automatic update client determines an installation package newer than + the user's currently installed version is available, it will attempt to + download a package appropriate for the user's platform and Tor branch from a + URL specified by a "tor-[branch]-[platform]-package" Item. If more than one + mirror for the selected package is available, a mirror will be chosen at + random from all those available. + + The automatic update client must also download a ".asc" signature file for + the retrieved package. The URL for the package signature is the same as that + for the package itself, except with the extension ".asc" appended to the + package URL. + + Connections to download the updated package and its signature SHOULD be + attempted in the same order described in Section 2.1. + + After completing the steps described in Sections 2.1 and 2.2, the automatic + update client will have downloaded and verified a copy of the latest Tor + installation package. It can then take whatever subsequent platform-specific + steps are necessary to install the downloaded software updates. + + 2.3. Periodic checking for updates + + The automatic update client SHOULD maintain a local state file in which it + records (at a minimum) the timestamp at which it last retrieved a + recommended-packages file and the timestamp at which the client last + successfully downloaded and installed a software update. + + Automatic update clients SHOULD check for an updated recommended-packages + document at most once per day but at least once every 30 days. + + + 3. Future Extensions + + There are several possible areas for future extensions of this framework. + The extensions below are merely suggestions and should be the subject of + their own proposal before being implemented. + + 3.1. Additional Software Updates + + There are several software packages often included in Tor bundles besides + Tor, such as Vidalia, Privoxy or Polipo, and Torbutton. The versions and + download locations of updated installation packages for these bundle + components can be easily added to the recommended-packages document + specification above. + + 3.2. Including ChangeLog Information + + It may be useful for automatic update clients to be able to display for + users a summary of the changes made in the latest Tor or Tor-related + software release, before the user chooses to install the update. In the + future, we can add keywords to the specification in Section 1.2 that specify + the location of a ChangeLog file for the latest recommended package + versions. It may also be desirable to allow localized ChangeLog information, + so that the automatic update client can fetch release notes in the + end-user's preferred language. + + 3.3. Weighted Package Mirror Selection + + We defined in Section 1.2 a method by which automatic update clients can + select from multiple available package mirrors. We may want to add a Weight + argument to the "*-package" Items that allows the recommended-packages file + to suggest to clients the probability with which a package mirror should be + chosen. This will allow clients to more appropriately distribute package + downloads across available mirrors proportional to their approximate + bandwidth. + + +Implementation + + Implementation of this proposal will consist of two separate components. + + The first component is a small "au-publish" tool that takes as input a + configuration file specifying the information described in Section 1.2 and a + private key. The tool is run by a "packaging authority" (someone responsible + for publishing updated installation packages), who will be prompted to enter + the passphrase for the private key used to sign the recommended-packages + document. The output of the tool is a document formatted according to + Section 1.2, with a signature appended at the end. The resulting document + can then be published to any of the update mirrors. + + The second component is an "au-client" tool that is run on the end-user's + machine. It periodically checks for updated installation packages according + to Section 2 and fetches the packages if necessary. The public keys used + to sign the recommended-packages file and any of the published packages are + included in the "au-client" tool. + + +References + + [1] Tor directory protocol (version 3), + https://tor-svn.freehaven.net/svn/tor/trunk/doc/spec/dir-spec.txt + + [2] Tor control protocol (version 2), + https://tor-svn.freehaven.net/svn/tor/trunk/doc/spec/control-spec.txt + + [3] Tor version specification, + https://tor-svn.freehaven.net/svn/tor/trunk/doc/spec/version-spec.txt diff --git a/specs/U2-formats.txt b/specs/U2-formats.txt new file mode 100644 index 0000000..4b8da57 --- /dev/null +++ b/specs/U2-formats.txt @@ -0,0 +1,430 @@ + + +Scope + + This document descibes a repository and document format for use in + distributing Tor bundle updates. It is meant to be a component of + an overall automatic update system. + + Not described in this document is the design the packages or their + install process, though some requirements are listed. + +Proposed code name + + Since "auto-update" is so generic, I've been thinking about going with + with "hapoc" or "glider" or "petaurus", all based on the sugar + glider you get when you search for "handy pocket creature". + +Metaformat + + All documents use Rivest's SEXP meta-format as documented at + http://people.csail.mit.edu/rivest/sexp.html + with the restriction that no "display hint" fields are to be used, + and the base64 transit encoding isn't used either. + + In descriptions of syntax below, we use regex-style qualifiers, so + that in + (sofa slipcover? occupant* leg+) + the sofa will have an optional slipcover, zero or more occupants, + and one or more legs. This pattern matches (sofa leg) and (sofa + slipcover occupant occupant leg leg leg leg) but not (sofa leg + slipcover). + + We also use a braces notation to indicate elements that can occur + in any order. For example, + (bread {flour+ eggs? yeast}) + matches a list starting with "bread", and then containing a one or + more occurances of flours, zero or one occurances of eggs, and one + occurance of yeast, in any order. This pattern matches (bread eggs + yeast flour) but not (bread yeast) or (bread flour eggs yeast + macadamias). + + +Goals + + It should be possible to mirror a repository using only rsync and cron. + + Separate keys should be used for different people and different + roles. + + Only a minimal set of keys should have to be kept online to keep + the system running. + + The system should handle any single computer or system or person + being unavailable. + + The system should be pretty future-proof. + + The client-side of the architecture should be really easy to implement. + +Non-goals: + + This is not a package format. Instead, we reuse existing package + formats for each platform. + + This is not a general-purpose package manager like yum or apt: it + assumes that users will want to have one or more of a set of + "bundles", not an arbitary selection of packages dependant on one + another. + + This is also not a general-purpose package format. It assumes the + existance of an external package format that can handle install, + update, remove, and version query. + +Architecture: Repository + + A "repository" is a directory hierarchy containing packages, + bundles, and metadata, all signed. + + A "package" is a single independent downloadable, installable + binary archive. It could be an MSI, an RPM, a DMG, or so on. + Every package is compiled version of some version of some piece of + software (an 'application') for some (os, architecture, + version) combinations. Some packages are "glue" that make other + packages work well together or get configured properly. + + A "bundle" is a list of of packages to be installed together. + Examples might be "Tor Browser Bundle" or "Tor plus controller". A + bundle is versioned, and every bundle is for a particular (os, + architecture) combination. Bundles specify which order to install + or update their components. + + Metadata is used to: + - Find mirrors + - Validate packages, bundles, and metadata + - Make sure information is up-to-date + - Determine which packages are in a bundle + + The filesystem layout in the repository is used for two purposes: + - To give mirrors an easy way to mirror only some of the repository. + - To specify which parts of the repository a given key has the + authority to sign. + +Architecture: Roles + + Every role in the system are associated with a key. Replacing + anything but a root key is supposed to be relatively easy. + + Root-keys sign other keys, and certify them as belonging to roles. + Clients are configured to know the root keys. + + Bundle keys certify the contents of a bundle. + + Package keys certify packages for a given program or set of + programs. + + Mirror keys certify a list of mirrors. We expect this to be an + automated process. + + Timestamp keys certify that given versions of other metadata + documents are up-to-date. They are the only keys that absolutely + need to be kept online. (If they are not, timestamps won't be + generated.) + +Directory layout + + The following files exist in all repositories and mirrors: + + /meta/keys.txt + + Signed by the root keys; indicates keys and roles. + [XXXX I'm using the txt extension here. Is that smart?] + + /meta/mirrors.txt + + Signed by the mirror key; indicates which parts of the repo + are mirrored where. + + /meta/timestamp.txt + + Signed by the timestamp key; indicates hashes and timestamps + for the latest versions of keys.txt and mirrors.txt. Also + indicates the latest version of each bundle for each os/arch. + + /bundleinfo/bundlename/os-arch/bundlename-os-arch-bundleversion.txt + + Signed by the appropriate bundle key. Describes what + packages make up a bundle, and what order to install, + uninstall, and upgrade them in. + + /pkginfo/packagename/os-arch/version/packagename-os-arch-packageversion.txt + + Signed by the appropriate package key. Tells the name of the + file that makes up the bundle, its hash, and what procedure + is used to install it. + + /packages/packagename/os-arch/version/(some filename) + + The actual files + +File formats: general principles + + We use tagged lists (lists whose first element is a string) to + indicate typed objects. Tags are generally lower-case, with + hyphens used for separation. Think Lispy. + + We use attrlists [lists of (key value) lists] to indicate a + multimap from keys to values. Clients MUST accept unrecognized + keys in these attrlists. The syntax for an attrlist with two + recognized and required keys is typically given as ({(key1 val1) + (key2 val2) (ATTR VAL)*}), indicating that the keys can occur in + any order, intermixed with other attributes. + + Timestamp files will be downloaded very frequently; all other files + will be much smaller in size than package files. Thus, + size-optimization for timestamp files makes sense and most other + other space optimizations don't. + + Versions are represented as lists of the form (v I1 I2 I3 I4 ...) + where each item is a number or alphanumeric version component. For + example, the version "0.2.1.5-alpha" is represented as (v 0 2 1 5 + alpha). + + All signed files are of the format: + + (signed + X + (signature ({(keyid K) (method M) (ATTR VAL)*}) SIG)+ + ) + + where: X is a list whose fist element describes the signed object. + K is the identifier of a key signing the document + M is the method to be used to make the signature + (ATTR VAL) is an arbitrary list whose first element is a + string. + SIG is a signature of the canonical encoding of X using the + identified key. + + We define two signing methods at present: + sha256-oaep : A signature of the SHA256 hash of the canonical + encoding of X, using OAEP+ padding. [XXXX say more about mgf] + + All times are given as strings of the format "YYYY-MM-DD HH:MM:SS", + in UTC. + + All keys are of the format: + (pubkey ({(type TYPE) (ATTR VAL)*}) KEYVAL) + where TYPE is a string describing the type of the key and how it's + used to sign documents. The type determines the interpretation of + KEYVAL. + + The ID of a key is the type field concatenated with the SHA-256 + hash of the canonical encoding of the KEYVAL field. + + We define one keytype at present: 'rsa'. The KEYVAL in this case is a + 2-element list of (e p), with both values given in big-endian + binary format. [This makes keys 45-60% more compact.] + +File formats: key list + + (keylist + (ts TIME) + (keys + ((key ({(roles (ROLE PATH)+) (ATTR VAL)*}) KEY)*) + ... + ) + + The "ts" line describes when the keys file was updated. Clients + MUST NOT replace a file with an older one, and SHOULD NOT accept a + file too far in the future. + + A ROLE is one of "timestamp" "mirrors" "bundle" or "package" + + PATH is a path relative to the top of the directory hierarchy. It + may contain "*" elements to indicate "any file", and may end with a + "/**" element to indicate all files under a given point. + +File formats: mirror list + + (mirrorlist + (ts TIME) + (mirrors + ( (mirror ({(name N) (urlbase U) (contents PATH+) (ATTR VAL)})) * ) + ... + ) + + Every mirror is a copy of some or all of the directory hierarchy + containing at least the /meta, /bundles/, and /pkginfo directories. + + N is a descriptive name for the mirror; U is the URL of the mirror's + base (i.e., the parent of the "meta" directory); and the PATH + elements are the components describing how much of the packages + directory is mirrored. Their format is as in the keylist file. + +File formats: timestamp files + (ts + ({(at TIME) + (m TIME MIRRORLISTHASH) + (k TIME KEYLISTHASH) + (b NAME VERSION TIME PATH HASH)*}) + ) + + TIME is when the timestamp was signed. MIRRORLISTHASH is the digest + of the mirror-list file; KEYLISTHASH is the digest of the key list + file; and the 'b' entries are a list of the latest version of each + bundles and their locations and hashes. + +File formats: bundle files + + (bundle + (at TIME) + (os OS) + [(arch ARCH)] + (version V) + (packages + (NAME VERSION PATH HASH ({(order INST UPDATE REMOVE) + (optional)? + (gloss LANG TEXT)* + (longloss LANG TEXT)* + (ATTR VAL)*})? )* ) + ) + + Most elements are self-explanatory; the INST, UPDATE, and REMOVE + elements of the order element are numbers defining the order in + which the packages are installed, updated, and removed respectively. + The "optional" element is present if the package is optional. + "Gloss" is an short utf-8 human-readable string explaining what the + package provides for the bundle; "longloss" is a longer such + utf-8 string. + + (Note that the gloss strings are meant, not to describe the package, + but to describe what the package provides for the bundle. For + example, "The Anonymous Email Bundle needs the Python Runtime to run + Mixminion.") + +File formats: package files + (package + ({(name NAME) + (version VERSION) + (format FMT ((ATTR VAL)*)? ) + (path PATH) + (ts TIME) + (digest HASH) + (shortdesc LANG TEXT)* + (longdesc LANG TEXT)* + (ATTR VAL)* }) + ) + + Most elements are self-explanatory. The "FMT" element describes the + file format of the package, which should give enough information + about how to install it. + + No two package files in the same repository should have the same + name and version. If a package needs to be changed, the version + MUST be incremented. + +Workflows: The client application + + Periodically, the client updater fetches a timestamp file from a + mirror. If the timestamp in the file is up-to-date, the client + first checks to see whether the keys file listed is one that the + client has. If not, the client fetches it, makes sure the hash of + the keys file matches the hash in the timestamp file, makes sure its + date is more recent than any keys file they have but not too far in + the future, and that it is signed by enough root keys that the + client recognizes. + + [If the timestamp file is not up-to-date, the client tries a + few mirrors until it finds one with a good timestamp.] + + [If the keys file from a mirror does not match the timestamp + file, the client tries a new mirror for both.] + + [If the keys file is not signed by enough root keys, the client + warns the user and tries another mirror for both the timestamp + file and the keys file.] + + Once the client has an up-to-date keys file, the client checks the + signature on the timestamp file. Assuming it checks out, the client + refreshes the mirror list as needed, and refreshes any bundle files + to which the user is subscribed if the client does not have + the latest version of those files. The client checks signatures on + these files, and fetches package metadata for any packages listed in + the bundle file that the client does not have, checks signatures on + these, and fetches binaries for packages that might need to be + installed or updated. As the packages arrive, clients check their + hashes. + + Once the client has gotten enough packages, it informs the user that + new packages have arrived, and asks them if they want to update. + + Clients SHOULD cache at least the latest versions they have received + of all files. + +Workflow: Mirrors + + Periodically, mirrors do an rsync or equivalent to fetch the latest + version of whatever parts of the repository have changed since the + version they currently hold. Mirrors SHOULD replace older versions + of the repository idempotently, so that clients are less likely to + see inconsistant state. Mirrors SHOULD validate the information + they receive, and not serve partial or inconsistant files. + +Workflow: Packagers + + When a new binary package is done, the person making the package + runs a tool to generate and sign a package file, and sends both the + package and the package file to a repository admin. Typically, the + base package file will be generated by inserting a version into a + template. + + Packages MAY have as part of their build process a script to + generate the appropriately versioned package file. This script + should at a minimum demand a build version, or use a timestamp in + place of a build version, to prevent two packages with the same + version from being created. + +Workflow: bundlers + + When the packages in a bundle are done, the bundler runs a tool on + the package files to generate and sign a bundle file. Typically, + this tool uses a template bundle file. + +Workflow: repository administrators + + Repository administrators use a tool to validate signed files into the + repository. The repository should not be altered manually. + + This tool acts as follows: + - Package files may be added, but never replaced. + - Bundle files may be added, but never replaced. + - No file may be added unless it is syntactically valid and + signed by a key in the keys file authorized to sign files of + this type in this file's location location. + + - A package file may not be added unless all of its binary + packages match their hashes. + + - A bundle file may not be added unless all of its package files + are present and match their hashes. + + - When adding a new keylist, bundle, or mirrors list, the + timestamp file must be regenerated immediately. + +Timing: + + The timestamp file SHOULD be regenerated every 15 minutes. Mirrors + SHOULD attempt to update every hour. Clients SHOULD accept a + timestamp file up to 6 hours old. + +Format versioning and forward-compatibility: + + All of the above formats include the ability to add more + attribute-value fields for backwards-compatible format changes. If + we need to make a backwards incompatible format change, we create a + new filename for the new format. + +Key management and migration: + + Root keys should be kept offline. All keys except timestamp and + mirror keys should be stored encrypted. + + All the formats above allow for multiple keys to sign a single + document. To replace a compromised master key, it suffices to sign + keylist documents with both the compromised key and its replacement + until all clients have updated to a new version of the autoupdater. + + To replace another key, it suffices to authorize the new key in the + keylist. Note that a new package or bundle key must re-sign and + issue new versions of all packages or bundles it has generated. + |