From 4b0948ddb3a428f8a5330e05745b2fbd4ccf9375 Mon Sep 17 00:00:00 2001
From: Robert Newson
Date: Mon, 26 Jul 2010 17:21:30 +0000
Subject: Add SSL support to CouchDB.

To enable SSL you need to do three things:

 1) Enable the httpsd daemon in local.ini (you can just uncomment the
    line).
 2) Supply your PEM-encoded cert and key files in the [ssl] section (a
    sample local.ini sketch appears after the patch below).
 3) Start CouchDB.

CouchDB will now, in addition to handling HTTP on port 5984, accept SSL
connections on port 6984.

The patch itself adds SSL support by updating the local version of
Mochiweb to the latest release. That upstream release includes our local
tweaks to support large numbers and to handle Accept-Encoding headers.
Our local Mochiweb fork had changed the default idle timeout from 10
seconds to 5 minutes, and it was agreed on IRC to revert this change.
The only tweaks to Mochiweb were in mochiweb.app.src (to record the git
commit I built from) and the removal of Makefile (replaced by
Makefile.am).

Futon received many tweaks, as it had 'http://' hardcoded all over; all
such instances now use window.location.protocol + '//'. CouchDB received
a tweak to use the right scheme in couch_httpd:absolute_uri (it now gets
the scheme from the Mochiweb request and not mochiweb_socket_server).

git-svn-id: https://svn.apache.org/repos/asf/couchdb/trunk@979368 13f79535-47bb-0310-9956-ffa450edef68
---
 src/mochiweb/mochiweb_html.erl | 668 ++++++++++++++++++++++++++---------------
 1 file changed, 418 insertions(+), 250 deletions(-)

diff --git a/src/mochiweb/mochiweb_html.erl b/src/mochiweb/mochiweb_html.erl
index 77100d50..a15c359c 100644
--- a/src/mochiweb/mochiweb_html.erl
+++ b/src/mochiweb/mochiweb_html.erl
@@ -4,9 +4,9 @@
 %% @doc Loosely tokenizes and generates parse trees for HTML 4.
 -module(mochiweb_html).
 -export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
-         escape_attr/1, to_html/1, test/0]).
+         escape_attr/1, to_html/1]).
 
-% This is a macro to placate syntax highlighters..
+%% This is a macro to placate syntax highlighters..
 -define(QUOTE, $\").
 -define(SQUOTE, $\').
 -define(ADV_COL(S, N),
@@ -35,6 +35,8 @@
 -define(IS_LITERAL_SAFE(C),
         ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
          orelse (C >= $0 andalso C =< $9))).
+-define(PROBABLE_CLOSE(C),
+        (C =:= $> orelse ?IS_WHITESPACE(C))).
 
 -record(decoder, {line=1,
                   column=1,
@@ -89,6 +91,7 @@ to_tokens(T={doctype, _}) ->
 to_tokens(T={comment, _}) ->
     [T];
 to_tokens({Tag0, Acc}) ->
+    %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
     to_tokens({Tag0, [], Acc});
 to_tokens({Tag0, Attrs, Acc}) ->
     Tag = to_tag(Tag0),
@@ -124,40 +127,6 @@ escape_attr(I) when is_integer(I) ->
 escape_attr(F) when is_float(F) ->
     escape_attr(mochinum:digits(F), []).
 
-%% @spec test() -> ok
-%% @doc Run tests for mochiweb_html.
-test() ->
-    test_destack(),
-    test_tokens(),
-    test_tokens2(),
-    test_parse(),
-    test_parse2(),
-    test_parse_tokens(),
-    test_escape(),
-    test_escape_attr(),
-    test_to_html(),
-    ok.
-
-
-%% Internal API
-
-test_to_html() ->
-    Expect = <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div><!-- comment! --></body></html>">>,
-    Expect = iolist_to_binary(
-                 to_html({html, [],
-                          [{<<"head">>, [],
-                            [{title, <<"hey!">>}]},
-                           {body, [],
-                            [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
-                             {'div', <<"sucka">>},
-                             {comment, <<" comment! ">>}]}]})),
-    Expect1 = <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
-    Expect1 = iolist_to_binary(
-                  to_html({doctype,
-                           [<<"html">>, <<"PUBLIC">>,
-                            <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
-                            <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]})),
-    ok.
 to_html([], Acc) ->
     lists:reverse(Acc);
 to_html([{'=', Content} | Rest], Acc) ->
@@ -205,16 +174,6 @@ attrs_to_html([{K, V} | Rest], Acc) ->
                   [[<<" ">>, escape(K), <<"=\"">>, escape_attr(V), <<"\"">>] | Acc]).
 
-test_escape() ->
-    <<"&amp;quot;\"word &lt;&lt;up!&amp;quot;">> =
-        escape(<<"&quot;\"word <<up!&quot;">>),
-    ok.
-
-test_escape_attr() ->
-    <<"&amp;quot;&quot;word &lt;&lt;up!&amp;quot;">> =
-        escape_attr(<<"&quot;\"word <<up!&quot;">>),
-    ok.
-
 escape([], Acc) ->
     list_to_binary(lists:reverse(Acc));
 escape("<" ++ Rest, Acc) ->
@@ -257,6 +216,9 @@ to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
 to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
     %% Allow {comment, iolist()}
     to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
+to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
+    %% Allow {pi, binary(), list()}
+    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
 to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
     %% Allow {p, [{"class", "foo"}]}
     to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
@@ -290,39 +252,6 @@ to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
     Tag = to_tag(Tag0),
     to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
 
-test_tokens() ->
-    [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
-                             {<<"wibble">>, <<"wibble">>},
-                             {<<"alice">>, <<"bob">>}], true}] =
-        tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>),
-    [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
-                             {<<"wibble">>, <<"wibble">>},
-                             {<<"alice">>, <<"bob">>}], true}] =
-        tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>),
-    [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}] =
-        tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>),
-    [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
-     {data, <<" A= B <= C ">>, false},
-     {end_tag, <<"script">>}] =
-        tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>),
-    [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
-     {data, <<" A= B <= C ">>, false},
-     {end_tag, <<"script">>}] =
-        tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>),
-    [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
-     {data, <<" A= B <= C ">>, false},
-     {end_tag, <<"script">>}] =
-        tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>),
-    [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
-     {data, <<" A= B <= C ">>, false},
-     {end_tag, <<"script">>}] =
-        tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>),
-    [{start_tag, <<"textarea">>, [], false},
-     {data, <<"<html></body>">>, false},
-     {end_tag, <<"textarea">>}] =
-        tokens(<<"<textarea><html></body></textarea>">>),
-    ok.
-
 tokens(B, S=#decoder{offset=O}, Acc) ->
     case B of
         <<_:O/binary>> ->
@@ -374,7 +303,8 @@ tokenize(B, S=#decoder{offset=O}) ->
             {{end_tag, Tag}, S2};
         <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
             %% This isn't really strict HTML
-            tokenize_data(B, ?INC_COL(S));
+            {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
+            {{data, <<$<, Data/binary>>, false}, S1};
         <<_:O/binary, "<", _/binary>> ->
             {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
             {Attrs, S2} = tokenize_attributes(B, S1),
@@ -385,149 +315,6 @@ tokenize(B, S=#decoder{offset=O}) ->
             tokenize_data(B, S)
     end.
 
-test_parse() ->
-    D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">
-<html>
- <head>
-   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
-   <title>Foo</title>
-   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
-   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
-   <!--[if lt IE 7]>
-   <style type=\"text/css\">
-     .no_ie { display: none; }
-   </style>
-   <![endif]-->
-   <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
-   <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
- </head>
- <body id=\"home\" class=\"tundra\"><![CDATA[&lt;<this<!-- is -->CDATA>&gt;]]></body>
-</html>">>,
-    Expect = {<<"html">>, [],
-              [{<<"head">>, [],
-                [{<<"meta">>,
-                  [{<<"http-equiv">>,<<"Content-Type">>},
-                   {<<"content">>,<<"text/html; charset=UTF-8">>}],
-                  []},
-                 {<<"title">>,[],[<<"Foo">>]},
-                 {<<"link">>,
-                  [{<<"rel">>,<<"stylesheet">>},
-                   {<<"type">>,<<"text/css">>},
-                   {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
-                   {<<"media">>,<<"screen">>}],
-                  []},
-                 {<<"link">>,
-                  [{<<"rel">>,<<"stylesheet">>},
-                   {<<"type">>,<<"text/css">>},
-                   {<<"href">>,<<"/static/foo.css">>},
-                   {<<"media">>,<<"screen">>}],
-                  []},
-                 {comment,<<"[if lt IE 7]>\n   <style type=\"text/css\">\n     .no_ie { display: none; }\n   </style>\n   <![endif]">>},
-                 {<<"link">>,
-                  [{<<"rel">>,<<"icon">>},
-                   {<<"href">>,<<"/static/images/favicon.ico">>},
-                   {<<"type">>,<<"image/x-icon">>}],
-                  []},
-                 {<<"link">>,
-                  [{<<"rel">>,<<"shortcut icon">>},
-                   {<<"href">>,<<"/static/images/favicon.ico">>},
-                   {<<"type">>,<<"image/x-icon">>}],
-                  []}]},
-               {<<"body">>,
-                [{<<"id">>,<<"home">>},
-                 {<<"class">>,<<"tundra">>}],
-                [<<"&lt;<this<!-- is -->CDATA>&gt;">>]}]},
-    Expect = parse(D0),
-    ok.
-
-test_tokens2() ->
-    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org</link><description>Bob's Rants</description></channel>">>,
-    Expect = [{start_tag,<<"channel">>,[],false},
-              {start_tag,<<"title">>,[],false},
-              {data,<<"from __future__ import *">>,false},
-              {end_tag,<<"title">>},
-              {start_tag,<<"link">>,[],true},
-              {data,<<"http://bob.pythonmac.org">>,false},
-              {end_tag,<<"link">>},
-              {start_tag,<<"description">>,[],false},
-              {data,<<"Bob's Rants">>,false},
-              {end_tag,<<"description">>},
-              {end_tag,<<"channel">>}],
-    Expect = tokens(D0),
-    ok.
-
-test_parse2() ->
-    D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org<br>foo</link><description>Bob's Rants</description></channel>">>,
-    Expect = {<<"channel">>,[],
-              [{<<"title">>,[],[<<"from __future__ import *">>]},
-               {<<"link">>,[],[
-                 <<"http://bob.pythonmac.org">>,
-                 {<<"br">>,[],[]},
-                 <<"foo">>]},
-               {<<"description">>,[],[<<"Bob's Rants">>]}]},
-    Expect = parse(D0),
-    ok.
-
-test_parse_tokens() ->
-    D0 = [{doctype,[<<"HTML">>,<<"PUBLIC">>,<<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
-          {data,<<"\n">>,true},
-          {start_tag,<<"html">>,[],false}],
-    {<<"html">>, [], []} = parse_tokens(D0),
-    D1 = D0 ++ [{end_tag, <<"html">>}],
-    {<<"html">>, [], []} = parse_tokens(D1),
-    D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
-    {<<"html">>, [], [{<<"body">>, [], []}]} = parse_tokens(D2),
-    D3 = D0 ++ [{start_tag, <<"head">>, [], false},
-                {end_tag, <<"head">>},
-                {start_tag, <<"body">>, [], false}],
-    {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]} = parse_tokens(D3),
-    D4 = D3 ++ [{data,<<"\n">>,true},
-                {start_tag,<<"div">>,[{<<"class">>,<<"a">>}],false},
-                {start_tag,<<"a">>,[{<<"name">>,<<"#anchor">>}],false},
-                {end_tag,<<"a">>},
-                {end_tag,<<"div">>},
-                {start_tag,<<"div">>,[{<<"class">>,<<"b">>}],false},
-                {start_tag,<<"div">>,[{<<"class">>,<<"c">>}],false},
-                {end_tag,<<"div">>},
-                {end_tag,<<"div">>}],
-    {<<"html">>, [],
-     [{<<"head">>, [], []},
-      {<<"body">>, [],
-       [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
-        {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
-       ]}]} = parse_tokens(D4),
-    D5 = [{start_tag,<<"html">>,[],false},
-          {data,<<"\n">>,true},
-          {data,<<"boo">>,false},
-          {data,<<"hoo">>,false},
-          {data,<<"\n">>,true},
-          {end_tag,<<"html">>}],
-    {<<"html">>, [], [<<"\nboohoo\n">>]} = parse_tokens(D5),
-    D6 = [{start_tag,<<"html">>,[],false},
-          {data,<<"\n">>,true},
-          {data,<<"\n">>,true},
-          {end_tag,<<"html">>}],
-    {<<"html">>, [], []} = parse_tokens(D6),
-    D7 = [{start_tag,<<"html">>,[],false},
-          {start_tag,<<"ul">>,[],false},
-          {start_tag,<<"li">>,[],false},
-          {data,<<"word">>,false},
-          {start_tag,<<"li">>,[],false},
-          {data,<<"up">>,false},
-          {end_tag,<<"li">>},
-          {start_tag,<<"li">>,[],false},
-          {data,<<"fdsa">>,false},
-          {start_tag,<<"br">>,[],true},
-          {data,<<"asdf">>,false},
-          {end_tag,<<"ul">>},
-          {end_tag,<<"html">>}],
-    {<<"html">>, [],
-     [{<<"ul">>, [],
-       [{<<"li">>, [], [<<"word">>]},
-        {<<"li">>, [], [<<"up">>]},
-        {<<"li">>, [], [<<"fdsa">>,{<<"br">>, [], []}, <<"asdf">>]}]}]} = parse_tokens(D7),
-    ok.
-
 tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
     tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
 tree_data(Rest, AllWhitespace, Acc) ->
@@ -556,7 +343,9 @@ tree(L=[{data, _Data, _Whitespace} | _], S) ->
             tree(Rest, S);
         {Data, false, Rest} ->
             tree(Rest, append_stack_child(Data, S))
-    end.
+    end;
+tree([{doctype, _} | Rest], Stack) ->
+    tree(Rest, Stack).
 
 norm({Tag, Attrs}) ->
     {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
@@ -565,21 +354,6 @@ norm(Tag) when is_binary(Tag) ->
 norm(Tag) ->
     list_to_binary(string:to_lower(Tag)).
 
-test_destack() ->
-    {<<"a">>, [], []} =
-        destack([{<<"a">>, [], []}]),
-    {<<"a">>, [], [{<<"b">>, [], []}]} =
-        destack([{<<"b">>, [], []}, {<<"a">>, [], []}]),
-    {<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]} =
-        destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
-    [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}] =
-        destack(<<"b">>,
-                [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
-    [{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}] =
-        destack(<<"c">>,
-                [{<<"c">>, [], []}, {<<"b">>, [], []},{<<"a">>, [], []}]),
-    ok.
-
 stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
         when TN =:= <<"li">> orelse TN =:= <<"option">> ->
     [T1 | destack(TN, Stack)];
@@ -719,9 +493,10 @@ find_qgt(Bin, S=#decoder{offset=O}) ->
     case Bin of
         <<_:O/binary, "?>", _/binary>> ->
             ?ADV_COL(S, 2);
-        <<_:O/binary, C, _/binary>> ->
-            find_qgt(Bin, ?INC_CHAR(S, C));
-        _ ->
+        %% tokenize_attributes takes care of this state:
+        %% <<_:O/binary, C, _/binary>> ->
+        %%     find_qgt(Bin, ?INC_CHAR(S, C));
+        <<_:O/binary>> ->
             S
     end.
 
@@ -766,7 +541,7 @@ tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
                            <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                            R;
                        Unichar ->
-                           list_to_binary(xmerl_ucs:to_utf8(Unichar))
+                           mochiutf8:codepoint_to_bytes(Unichar)
                    end,
             {{data, Data, false}, ?INC_COL(S)};
         _ ->
@@ -791,11 +566,10 @@ tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
 
 tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
     case Bin of
-        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
-            {error, {whitespace, [C], S}};
         <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
-        _ ->
+        <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
+            %% Sanity check for whitespace
            tokenize_literal(Bin, S, [])
     end.
 
@@ -852,13 +626,14 @@ tokenize_script(Bin, S=#decoder{offset=O}) ->
 tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
     case Bin of
         %% Just a look-ahead, we want the end_tag separately
-        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, _/binary>>
+        <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
             when (SS =:= $s orelse SS =:= $S) andalso
                  (CC =:= $c orelse CC =:= $C) andalso
                  (RR =:= $r orelse RR =:= $R) andalso
                  (II =:= $i orelse II =:= $I) andalso
                  (PP =:= $p orelse PP =:= $P) andalso
-                 (TT=:= $t orelse TT =:= $T) ->
+                 (TT=:= $t orelse TT =:= $T) andalso
+                 ?PROBABLE_CLOSE(ZZ) ->
             Len = O - Start,
             <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
             {{data, Raw, false}, S};
@@ -874,7 +649,7 @@ tokenize_textarea(Bin, S=#decoder{offset=O}) ->
 tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
     case Bin of
         %% Just a look-ahead, we want the end_tag separately
-        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, _/binary>>
+        <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
             when (TT =:= $t orelse TT =:= $T) andalso
                  (EE =:= $e orelse EE =:= $E) andalso
                  (XX =:= $x orelse XX =:= $X) andalso
@@ -882,7 +657,8 @@ tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
                  (AA =:= $a orelse AA =:= $A) andalso
                  (RR =:= $r orelse RR =:= $R) andalso
                  (EE2 =:= $e orelse EE2 =:= $E) andalso
-                 (AA2 =:= $a orelse AA2 =:= $A) ->
+                 (AA2 =:= $a orelse AA2 =:= $A) andalso
+                 ?PROBABLE_CLOSE(ZZ) ->
             Len = O - Start,
             <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
             {{data, Raw, false}, S};
@@ -891,3 +667,395 @@ tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
         <<_:Start/binary, Raw/binary>> ->
             {{data, Raw, false}, S}
     end.
+
+
+%%
+%% Tests
+%%
+-include_lib("eunit/include/eunit.hrl").
+-ifdef(TEST).
+
+to_html_test() ->
+    ?assertEqual(
+       <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div>RAW!<!-- comment! --></body></html>">>,
+       iolist_to_binary(
+         to_html({html, [],
+                  [{<<"head">>, [],
+                    [{title, <<"hey!">>}]},
+                   {body, [],
+                    [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
+                     {'div', <<"sucka">>},
+                     {'=', <<"RAW!">>},
+                     {comment, <<" comment! ">>}]}]}))),
+    ?assertEqual(
+       <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
+       iolist_to_binary(
+         to_html({doctype,
+                  [<<"html">>, <<"PUBLIC">>,
+                   <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
+                   <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))),
+    ?assertEqual(
+       <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>,
+       iolist_to_binary(
+         to_html({<<"html">>,[],
+                  [{pi, <<"xml:namespace">>,
+                    [{<<"prefix">>,<<"o">>},
+                     {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))),
+    ok.
+
+escape_test() ->
+    ?assertEqual(
+       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape(<<"&quot;\"word ><<up!&quot;">>)),
+    ?assertEqual(
+       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape("&quot;\"word ><<up!&quot;")),
+    ?assertEqual(
+       <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape('&quot;\"word ><<up!&quot;')),
+    ok.
+
+escape_attr_test() ->
+    ?assertEqual(
+       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape_attr(<<"&quot;\"word ><<up!&quot;">>)),
+    ?assertEqual(
+       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape_attr("&quot;\"word ><<up!&quot;")),
+    ?assertEqual(
+       <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+       escape_attr('&quot;\"word ><<up!&quot;')),
+    ?assertEqual(
+       <<"12345">>,
+       escape_attr(12345)),
+    ?assertEqual(
+       <<"1.5">>,
+       escape_attr(1.5)),
+    ok.
+
+tokens_test() ->
+    ?assertEqual(
+       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
+                                {<<"wibble">>, <<"wibble">>},
+                                {<<"alice">>, <<"bob">>}], true}],
+       tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>)),
+    ?assertEqual(
+       [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
+                                {<<"wibble">>, <<"wibble">>},
+                                {<<"alice">>, <<"bob">>}], true}],
+       tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>)),
+    ?assertEqual(
+       [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}],
+       tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>)),
+    ?assertEqual(
+       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+        {data, <<" A= B <= C ">>, false},
+        {end_tag, <<"script">>}],
+       tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>)),
+    ?assertEqual(
+       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+        {data, <<" A= B <= C ">>, false},
+        {end_tag, <<"script">>}],
+       tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>)),
+    ?assertEqual(
+       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+        {data, <<" A= B <= C ">>, false},
+        {end_tag, <<"script">>}],
+       tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>)),
+    ?assertEqual(
+       [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+        {data, <<" A= B <= C ">>, false},
+        {end_tag, <<"script">>}],
+       tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>)),
+    ?assertEqual(
+       [{start_tag, <<"textarea">>, [], false},
+        {data, <<"<html></body>">>, false},
+        {end_tag, <<"textarea">>}],
+       tokens(<<"<textarea><html></body></textarea>">>)),
+    ?assertEqual(
+       [{start_tag, <<"textarea">>, [], false},
+        {data, <<"<html></body></textareaz>">>, false}],
+       tokens(<<"
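
For reference, steps 1 and 2 from the commit message map onto local.ini
roughly as follows. This is a minimal sketch, assuming the daemon and
[ssl] option names shipped in the default local.ini of this era of
CouchDB (httpsd, cert_file, key_file); the .pem paths are placeholders
for your own files:

    [daemons]
    ; step 1: uncomment (or add) this line to enable the SSL listener
    httpsd = {couch_httpd, start_link, [https]}

    [ssl]
    ; step 2: point at your PEM-encoded certificate and private key
    cert_file = /full/path/to/server_cert.pem
    key_file = /full/path/to/server_key.pem

After step 3 (restarting CouchDB), the same node should answer HTTP on
port 5984 and HTTPS on port 6984, as described above.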