summaryrefslogtreecommitdiff
path: root/1.1.x/src/mochiweb/mochiweb_html.erl
diff options
context:
space:
mode:
authorRobert Newson <rnewson@apache.org>2011-05-17 11:15:14 +0000
committerRobert Newson <rnewson@apache.org>2011-05-17 11:15:14 +0000
commite8e4b0d293021fe90326a85828f3cfb087bf18b7 (patch)
tree986f544eac623ec23b769b36828894f93a173aa3 /1.1.x/src/mochiweb/mochiweb_html.erl
parentda6a5322b0b8084f434752060caa8be214c6f4fa (diff)
tagging 1.1.0
git-svn-id: https://svn.apache.org/repos/asf/couchdb/tags/1.1.0@1104149 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to '1.1.x/src/mochiweb/mochiweb_html.erl')
-rw-r--r--1.1.x/src/mochiweb/mochiweb_html.erl1061
1 files changed, 1061 insertions, 0 deletions
diff --git a/1.1.x/src/mochiweb/mochiweb_html.erl b/1.1.x/src/mochiweb/mochiweb_html.erl
new file mode 100644
index 00000000..a15c359c
--- /dev/null
+++ b/1.1.x/src/mochiweb/mochiweb_html.erl
@@ -0,0 +1,1061 @@
+%% @author Bob Ippolito <bob@mochimedia.com>
+%% @copyright 2007 Mochi Media, Inc.
+
+%% @doc Loosely tokenizes and generates parse trees for HTML 4.
+-module(mochiweb_html).
+-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
+ escape_attr/1, to_html/1]).
+
+%% This is a macro to placate syntax highlighters..
+-define(QUOTE, $\").
+-define(SQUOTE, $\').
+-define(ADV_COL(S, N),
+ S#decoder{column=N+S#decoder.column,
+ offset=N+S#decoder.offset}).
+-define(INC_COL(S),
+ S#decoder{column=1+S#decoder.column,
+ offset=1+S#decoder.offset}).
+-define(INC_LINE(S),
+ S#decoder{column=1,
+ line=1+S#decoder.line,
+ offset=1+S#decoder.offset}).
+-define(INC_CHAR(S, C),
+ case C of
+ $\n ->
+ S#decoder{column=1,
+ line=1+S#decoder.line,
+ offset=1+S#decoder.offset};
+ _ ->
+ S#decoder{column=1+S#decoder.column,
+ offset=1+S#decoder.offset}
+ end).
+
+-define(IS_WHITESPACE(C),
+ (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
+-define(IS_LITERAL_SAFE(C),
+ ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
+ orelse (C >= $0 andalso C =< $9))).
+-define(PROBABLE_CLOSE(C),
+ (C =:= $> orelse ?IS_WHITESPACE(C))).
+
+-record(decoder, {line=1,
+ column=1,
+ offset=0}).
+
+%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
+%% @type html_attr() = {string(), string()}
+%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
+%% @type html_data() = {data, string(), Whitespace::boolean()}
+%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
+%% @type end_tag() = {end_tag, Name}
+%% @type html_comment() = {comment, Comment}
+%% @type html_doctype() = {doctype, [Doctype]}
+%% @type inline_html() = {'=', iolist()}
+
+%% External API.
+
+%% @spec parse(string() | binary()) -> html_node()
+%% @doc tokenize and then transform the token stream into a HTML tree.
+parse(Input) ->
+ parse_tokens(tokens(Input)).
+
+%% @spec parse_tokens([html_token()]) -> html_node()
+%% @doc Transform the output of tokens(Doc) into a HTML tree.
+parse_tokens(Tokens) when is_list(Tokens) ->
+ %% Skip over doctype, processing instructions
+ F = fun (X) ->
+ case X of
+ {start_tag, _, _, false} ->
+ false;
+ _ ->
+ true
+ end
+ end,
+ [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens),
+ {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
+ Tree.
+
+%% @spec tokens(StringOrBinary) -> [html_token()]
+%% @doc Transform the input UTF-8 HTML into a token stream.
+tokens(Input) ->
+ tokens(iolist_to_binary(Input), #decoder{}, []).
+
+%% @spec to_tokens(html_node()) -> [html_token()]
+%% @doc Convert a html_node() tree to a list of tokens.
+to_tokens({Tag0}) ->
+ to_tokens({Tag0, [], []});
+to_tokens(T={'=', _}) ->
+ [T];
+to_tokens(T={doctype, _}) ->
+ [T];
+to_tokens(T={comment, _}) ->
+ [T];
+to_tokens({Tag0, Acc}) ->
+ %% This is only allowed in sub-tags: {p, [{"class", "foo"}]}
+ to_tokens({Tag0, [], Acc});
+to_tokens({Tag0, Attrs, Acc}) ->
+ Tag = to_tag(Tag0),
+ to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]).
+
+%% @spec to_html([html_token()] | html_node()) -> iolist()
+%% @doc Convert a list of html_token() to a HTML document.
+to_html(Node) when is_tuple(Node) ->
+ to_html(to_tokens(Node));
+to_html(Tokens) when is_list(Tokens) ->
+ to_html(Tokens, []).
+
+%% @spec escape(string() | atom() | binary()) -> binary()
+%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
+escape(B) when is_binary(B) ->
+ escape(binary_to_list(B), []);
+escape(A) when is_atom(A) ->
+ escape(atom_to_list(A), []);
+escape(S) when is_list(S) ->
+ escape(S, []).
+
+%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
+%% @doc Escape a string such that it's safe for HTML attrs
+%% (amp; lt; gt; quot;).
+escape_attr(B) when is_binary(B) ->
+ escape_attr(binary_to_list(B), []);
+escape_attr(A) when is_atom(A) ->
+ escape_attr(atom_to_list(A), []);
+escape_attr(S) when is_list(S) ->
+ escape_attr(S, []);
+escape_attr(I) when is_integer(I) ->
+ escape_attr(integer_to_list(I), []);
+escape_attr(F) when is_float(F) ->
+ escape_attr(mochinum:digits(F), []).
+
+to_html([], Acc) ->
+ lists:reverse(Acc);
+to_html([{'=', Content} | Rest], Acc) ->
+ to_html(Rest, [Content | Acc]);
+to_html([{pi, Tag, Attrs} | Rest], Acc) ->
+ Open = [<<"<?">>,
+ Tag,
+ attrs_to_html(Attrs, []),
+ <<"?>">>],
+ to_html(Rest, [Open | Acc]);
+to_html([{comment, Comment} | Rest], Acc) ->
+ to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
+to_html([{doctype, Parts} | Rest], Acc) ->
+ Inside = doctype_to_html(Parts, Acc),
+ to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
+to_html([{data, Data, _Whitespace} | Rest], Acc) ->
+ to_html(Rest, [escape(Data) | Acc]);
+to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
+ Open = [<<"<">>,
+ Tag,
+ attrs_to_html(Attrs, []),
+ case Singleton of
+ true -> <<" />">>;
+ false -> <<">">>
+ end],
+ to_html(Rest, [Open | Acc]);
+to_html([{end_tag, Tag} | Rest], Acc) ->
+ to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).
+
+doctype_to_html([], Acc) ->
+ lists:reverse(Acc);
+doctype_to_html([Word | Rest], Acc) ->
+ case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
+ binary_to_list(iolist_to_binary(Word))) of
+ true ->
+ doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
+ false ->
+ doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
+ end.
+
+attrs_to_html([], Acc) ->
+ lists:reverse(Acc);
+attrs_to_html([{K, V} | Rest], Acc) ->
+ attrs_to_html(Rest,
+ [[<<" ">>, escape(K), <<"=\"">>,
+ escape_attr(V), <<"\"">>] | Acc]).
+
+escape([], Acc) ->
+ list_to_binary(lists:reverse(Acc));
+escape("<" ++ Rest, Acc) ->
+ escape(Rest, lists:reverse("&lt;", Acc));
+escape(">" ++ Rest, Acc) ->
+ escape(Rest, lists:reverse("&gt;", Acc));
+escape("&" ++ Rest, Acc) ->
+ escape(Rest, lists:reverse("&amp;", Acc));
+escape([C | Rest], Acc) ->
+ escape(Rest, [C | Acc]).
+
+escape_attr([], Acc) ->
+ list_to_binary(lists:reverse(Acc));
+escape_attr("<" ++ Rest, Acc) ->
+ escape_attr(Rest, lists:reverse("&lt;", Acc));
+escape_attr(">" ++ Rest, Acc) ->
+ escape_attr(Rest, lists:reverse("&gt;", Acc));
+escape_attr("&" ++ Rest, Acc) ->
+ escape_attr(Rest, lists:reverse("&amp;", Acc));
+escape_attr([?QUOTE | Rest], Acc) ->
+ escape_attr(Rest, lists:reverse("&quot;", Acc));
+escape_attr([C | Rest], Acc) ->
+ escape_attr(Rest, [C | Acc]).
+
+to_tag(A) when is_atom(A) ->
+ norm(atom_to_list(A));
+to_tag(L) ->
+ norm(L).
+
+to_tokens([], Acc) ->
+ lists:reverse(Acc);
+to_tokens([{Tag, []} | Rest], Acc) ->
+ to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
+to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
+ %% Allow {br}
+ to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
+to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
+ %% Allow {'=', iolist()}
+ to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
+to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
+ %% Allow {comment, iolist()}
+ to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
+to_tokens([{Tag0, [T0={pi, _S0, _A0} | R1]} | Rest], Acc) ->
+ %% Allow {pi, binary(), list()}
+ to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
+to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
+ %% Allow {p, [{"class", "foo"}]}
+ to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
+to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
+ %% Allow {p, "content"} and {p, <<"content">>}
+ to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
+to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
+ %% Allow {"p", [{"class", "foo"}], <<"content">>}
+ to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
+to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
+ when is_integer(C) ->
+ %% Allow {"p", [{"class", "foo"}], "content"}
+ to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
+to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
+ %% Native {"p", [{"class", "foo"}], ["content"]}
+ Tag = to_tag(Tag0),
+ T1 = to_tag(T0),
+ case is_singleton(norm(T1)) of
+ true ->
+ to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
+ false ->
+ to_tokens([{T1, C1}, {Tag, R1} | Rest],
+ [{start_tag, T1, A1, false} | Acc])
+ end;
+to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
+ %% List text
+ Tag = to_tag(Tag0),
+ to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
+to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
+ %% Binary text
+ Tag = to_tag(Tag0),
+ to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).
+
+tokens(B, S=#decoder{offset=O}, Acc) ->
+ case B of
+ <<_:O/binary>> ->
+ lists:reverse(Acc);
+ _ ->
+ {Tag, S1} = tokenize(B, S),
+ case parse_flag(Tag) of
+ script ->
+ {Tag2, S2} = tokenize_script(B, S1),
+ tokens(B, S2, [Tag2, Tag | Acc]);
+ textarea ->
+ {Tag2, S2} = tokenize_textarea(B, S1),
+ tokens(B, S2, [Tag2, Tag | Acc]);
+ none ->
+ tokens(B, S1, [Tag | Acc])
+ end
+ end.
+
+parse_flag({start_tag, B, _, false}) ->
+ case string:to_lower(binary_to_list(B)) of
+ "script" ->
+ script;
+ "textarea" ->
+ textarea;
+ _ ->
+ none
+ end;
+parse_flag(_) ->
+ none.
+
+tokenize(B, S=#decoder{offset=O}) ->
+ case B of
+ <<_:O/binary, "<!--", _/binary>> ->
+ tokenize_comment(B, ?ADV_COL(S, 4));
+ <<_:O/binary, "<!DOCTYPE", _/binary>> ->
+ tokenize_doctype(B, ?ADV_COL(S, 10));
+ <<_:O/binary, "<![CDATA[", _/binary>> ->
+ tokenize_cdata(B, ?ADV_COL(S, 9));
+ <<_:O/binary, "<?", _/binary>> ->
+ {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
+ {Attrs, S2} = tokenize_attributes(B, S1),
+ S3 = find_qgt(B, S2),
+ {{pi, Tag, Attrs}, S3};
+ <<_:O/binary, "&", _/binary>> ->
+ tokenize_charref(B, ?INC_COL(S));
+ <<_:O/binary, "</", _/binary>> ->
+ {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
+ {S2, _} = find_gt(B, S1),
+ {{end_tag, Tag}, S2};
+ <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
+ %% This isn't really strict HTML
+ {{data, Data, _Whitespace}, S1} = tokenize_data(B, ?INC_COL(S)),
+ {{data, <<$<, Data/binary>>, false}, S1};
+ <<_:O/binary, "<", _/binary>> ->
+ {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
+ {Attrs, S2} = tokenize_attributes(B, S1),
+ {S3, HasSlash} = find_gt(B, S2),
+ Singleton = HasSlash orelse is_singleton(norm(binary_to_list(Tag))),
+ {{start_tag, Tag, Attrs, Singleton}, S3};
+ _ ->
+ tokenize_data(B, S)
+ end.
+
+tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
+ tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
+tree_data(Rest, AllWhitespace, Acc) ->
+ {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.
+
+tree([], Stack) ->
+ {destack(Stack), []};
+tree([{end_tag, Tag} | Rest], Stack) ->
+ case destack(norm(Tag), Stack) of
+ S when is_list(S) ->
+ tree(Rest, S);
+ Result ->
+ {Result, []}
+ end;
+tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
+ tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
+tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
+ tree(Rest, stack(norm({Tag, Attrs}), S));
+tree([T={pi, _Tag, _Attrs} | Rest], S) ->
+ tree(Rest, append_stack_child(T, S));
+tree([T={comment, _Comment} | Rest], S) ->
+ tree(Rest, append_stack_child(T, S));
+tree(L=[{data, _Data, _Whitespace} | _], S) ->
+ case tree_data(L, true, []) of
+ {_, true, Rest} ->
+ tree(Rest, S);
+ {Data, false, Rest} ->
+ tree(Rest, append_stack_child(Data, S))
+ end;
+tree([{doctype, _} | Rest], Stack) ->
+ tree(Rest, Stack).
+
+norm({Tag, Attrs}) ->
+ {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
+norm(Tag) when is_binary(Tag) ->
+ Tag;
+norm(Tag) ->
+ list_to_binary(string:to_lower(Tag)).
+
+stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
+ when TN =:= <<"li">> orelse TN =:= <<"option">> ->
+ [T1 | destack(TN, Stack)];
+stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
+ when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
+ (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
+ [T1 | destack(TN1, Stack)];
+stack(T1, Stack) ->
+ [T1 | Stack].
+
+append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
+ [{Name, Attrs, [StartTag | Acc]} | Stack].
+
+destack(TagName, Stack) when is_list(Stack) ->
+ F = fun (X) ->
+ case X of
+ {TagName, _, _} ->
+ false;
+ _ ->
+ true
+ end
+ end,
+ case lists:splitwith(F, Stack) of
+ {_, []} ->
+ %% If we're parsing something like XML we might find
+ %% a <link>tag</link> that is normally a singleton
+ %% in HTML but isn't here
+ case {is_singleton(TagName), Stack} of
+ {true, [{T0, A0, Acc0} | Post0]} ->
+ case lists:splitwith(F, Acc0) of
+ {_, []} ->
+ %% Actually was a singleton
+ Stack;
+ {Pre, [{T1, A1, []} | Post1]} ->
+ [{T0, A0, [{T1, A1, lists:reverse(Pre)} | Post1]}
+ | Post0]
+ end;
+ _ ->
+ %% No match, no state change
+ Stack
+ end;
+ {_Pre, [_T]} ->
+ %% Unfurl the whole stack, we're done
+ destack(Stack);
+ {Pre, [T, {T0, A0, Acc0} | Post]} ->
+ %% Unfurl up to the tag, then accumulate it
+ [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
+ end.
+
+destack([{Tag, Attrs, Acc}]) ->
+ {Tag, Attrs, lists:reverse(Acc)};
+destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
+ destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).
+
+is_singleton(<<"br">>) -> true;
+is_singleton(<<"hr">>) -> true;
+is_singleton(<<"img">>) -> true;
+is_singleton(<<"input">>) -> true;
+is_singleton(<<"base">>) -> true;
+is_singleton(<<"meta">>) -> true;
+is_singleton(<<"link">>) -> true;
+is_singleton(<<"area">>) -> true;
+is_singleton(<<"param">>) -> true;
+is_singleton(<<"col">>) -> true;
+is_singleton(_) -> false.
+
+tokenize_data(B, S=#decoder{offset=O}) ->
+ tokenize_data(B, S, O, true).
+
+tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
+ case B of
+ <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
+ tokenize_data(B, ?INC_CHAR(S, C), Start,
+ (Whitespace andalso ?IS_WHITESPACE(C)));
+ _ ->
+ Len = O - Start,
+ <<_:Start/binary, Data:Len/binary, _/binary>> = B,
+ {{data, Data, Whitespace}, S}
+ end.
+
+tokenize_attributes(B, S) ->
+ tokenize_attributes(B, S, []).
+
+tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
+ case B of
+ <<_:O/binary>> ->
+ {lists:reverse(Acc), S};
+ <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
+ {lists:reverse(Acc), S};
+ <<_:O/binary, "?>", _/binary>> ->
+ {lists:reverse(Acc), S};
+ <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
+ tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
+ _ ->
+ {Attr, S1} = tokenize_literal(B, S),
+ {Value, S2} = tokenize_attr_value(Attr, B, S1),
+ tokenize_attributes(B, S2, [{Attr, Value} | Acc])
+ end.
+
+tokenize_attr_value(Attr, B, S) ->
+ S1 = skip_whitespace(B, S),
+ O = S1#decoder.offset,
+ case B of
+ <<_:O/binary, "=", _/binary>> ->
+ S2 = skip_whitespace(B, ?INC_COL(S1)),
+ tokenize_word_or_literal(B, S2);
+ _ ->
+ {Attr, S1}
+ end.
+
+skip_whitespace(B, S=#decoder{offset=O}) ->
+ case B of
+ <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
+ skip_whitespace(B, ?INC_CHAR(S, C));
+ _ ->
+ S
+ end.
+
+tokenize_literal(Bin, S) ->
+ tokenize_literal(Bin, S, []).
+
+tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
+ case Bin of
+ <<_:O/binary, $&, _/binary>> ->
+ {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
+ tokenize_literal(Bin, S1, [Data | Acc]);
+ <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
+ orelse C =:= $>
+ orelse C =:= $/
+ orelse C =:= $=) ->
+ tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
+ _ ->
+ {iolist_to_binary(lists:reverse(Acc)), S}
+ end.
+
+find_qgt(Bin, S=#decoder{offset=O}) ->
+ case Bin of
+ <<_:O/binary, "?>", _/binary>> ->
+ ?ADV_COL(S, 2);
+ %% tokenize_attributes takes care of this state:
+ %% <<_:O/binary, C, _/binary>> ->
+ %% find_qgt(Bin, ?INC_CHAR(S, C));
+ <<_:O/binary>> ->
+ S
+ end.
+
+find_gt(Bin, S) ->
+ find_gt(Bin, S, false).
+
+find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
+ case Bin of
+ <<_:O/binary, $/, _/binary>> ->
+ find_gt(Bin, ?INC_COL(S), true);
+ <<_:O/binary, $>, _/binary>> ->
+ {?INC_COL(S), HasSlash};
+ <<_:O/binary, C, _/binary>> ->
+ find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
+ _ ->
+ {S, HasSlash}
+ end.
+
+tokenize_charref(Bin, S=#decoder{offset=O}) ->
+ tokenize_charref(Bin, S, O).
+
+tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
+ case Bin of
+ <<_:O/binary>> ->
+ <<_:Start/binary, Raw/binary>> = Bin,
+ {{data, Raw, false}, S};
+ <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
+ orelse C =:= ?SQUOTE
+ orelse C =:= ?QUOTE
+ orelse C =:= $/
+ orelse C =:= $> ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ {{data, Raw, false}, S};
+ <<_:O/binary, $;, _/binary>> ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ Data = case mochiweb_charref:charref(Raw) of
+ undefined ->
+ Start1 = Start - 1,
+ Len1 = Len + 2,
+ <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
+ R;
+ Unichar ->
+ mochiutf8:codepoint_to_bytes(Unichar)
+ end,
+ {{data, Data, false}, ?INC_COL(S)};
+ _ ->
+ tokenize_charref(Bin, ?INC_COL(S), Start)
+ end.
+
+tokenize_doctype(Bin, S) ->
+ tokenize_doctype(Bin, S, []).
+
+tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
+ case Bin of
+ <<_:O/binary>> ->
+ {{doctype, lists:reverse(Acc)}, S};
+ <<_:O/binary, $>, _/binary>> ->
+ {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
+ <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
+ tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
+ _ ->
+ {Word, S1} = tokenize_word_or_literal(Bin, S),
+ tokenize_doctype(Bin, S1, [Word | Acc])
+ end.
+
+tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
+ case Bin of
+ <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
+ tokenize_word(Bin, ?INC_COL(S), C);
+ <<_:O/binary, C, _/binary>> when not ?IS_WHITESPACE(C) ->
+ %% Sanity check for whitespace
+ tokenize_literal(Bin, S, [])
+ end.
+
+tokenize_word(Bin, S, Quote) ->
+ tokenize_word(Bin, S, Quote, []).
+
+tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
+ case Bin of
+ <<_:O/binary>> ->
+ {iolist_to_binary(lists:reverse(Acc)), S};
+ <<_:O/binary, Quote, _/binary>> ->
+ {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
+ <<_:O/binary, $&, _/binary>> ->
+ {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
+ tokenize_word(Bin, S1, Quote, [Data | Acc]);
+ <<_:O/binary, C, _/binary>> ->
+ tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
+ end.
+
+tokenize_cdata(Bin, S=#decoder{offset=O}) ->
+ tokenize_cdata(Bin, S, O).
+
+tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
+ case Bin of
+ <<_:O/binary, "]]>", _/binary>> ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ {{data, Raw, false}, ?ADV_COL(S, 3)};
+ <<_:O/binary, C, _/binary>> ->
+ tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
+ _ ->
+ <<_:O/binary, Raw/binary>> = Bin,
+ {{data, Raw, false}, S}
+ end.
+
+tokenize_comment(Bin, S=#decoder{offset=O}) ->
+ tokenize_comment(Bin, S, O).
+
+tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
+ case Bin of
+ <<_:O/binary, "-->", _/binary>> ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ {{comment, Raw}, ?ADV_COL(S, 3)};
+ <<_:O/binary, C, _/binary>> ->
+ tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
+ <<_:Start/binary, Raw/binary>> ->
+ {{comment, Raw}, S}
+ end.
+
+tokenize_script(Bin, S=#decoder{offset=O}) ->
+ tokenize_script(Bin, S, O).
+
+tokenize_script(Bin, S=#decoder{offset=O}, Start) ->
+ case Bin of
+ %% Just a look-ahead, we want the end_tag separately
+ <<_:O/binary, $<, $/, SS, CC, RR, II, PP, TT, ZZ, _/binary>>
+ when (SS =:= $s orelse SS =:= $S) andalso
+ (CC =:= $c orelse CC =:= $C) andalso
+ (RR =:= $r orelse RR =:= $R) andalso
+ (II =:= $i orelse II =:= $I) andalso
+ (PP =:= $p orelse PP =:= $P) andalso
+ (TT=:= $t orelse TT =:= $T) andalso
+ ?PROBABLE_CLOSE(ZZ) ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ {{data, Raw, false}, S};
+ <<_:O/binary, C, _/binary>> ->
+ tokenize_script(Bin, ?INC_CHAR(S, C), Start);
+ <<_:Start/binary, Raw/binary>> ->
+ {{data, Raw, false}, S}
+ end.
+
+tokenize_textarea(Bin, S=#decoder{offset=O}) ->
+ tokenize_textarea(Bin, S, O).
+
+tokenize_textarea(Bin, S=#decoder{offset=O}, Start) ->
+ case Bin of
+ %% Just a look-ahead, we want the end_tag separately
+ <<_:O/binary, $<, $/, TT, EE, XX, TT2, AA, RR, EE2, AA2, ZZ, _/binary>>
+ when (TT =:= $t orelse TT =:= $T) andalso
+ (EE =:= $e orelse EE =:= $E) andalso
+ (XX =:= $x orelse XX =:= $X) andalso
+ (TT2 =:= $t orelse TT2 =:= $T) andalso
+ (AA =:= $a orelse AA =:= $A) andalso
+ (RR =:= $r orelse RR =:= $R) andalso
+ (EE2 =:= $e orelse EE2 =:= $E) andalso
+ (AA2 =:= $a orelse AA2 =:= $A) andalso
+ ?PROBABLE_CLOSE(ZZ) ->
+ Len = O - Start,
+ <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
+ {{data, Raw, false}, S};
+ <<_:O/binary, C, _/binary>> ->
+ tokenize_textarea(Bin, ?INC_CHAR(S, C), Start);
+ <<_:Start/binary, Raw/binary>> ->
+ {{data, Raw, false}, S}
+ end.
+
+
+%%
+%% Tests
+%%
+-include_lib("eunit/include/eunit.hrl").
+-ifdef(TEST).
+
+to_html_test() ->
+ ?assertEqual(
+ <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div>RAW!<!-- comment! --></body></html>">>,
+ iolist_to_binary(
+ to_html({html, [],
+ [{<<"head">>, [],
+ [{title, <<"hey!">>}]},
+ {body, [],
+ [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
+ {'div', <<"sucka">>},
+ {'=', <<"RAW!">>},
+ {comment, <<" comment! ">>}]}]}))),
+ ?assertEqual(
+ <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
+ iolist_to_binary(
+ to_html({doctype,
+ [<<"html">>, <<"PUBLIC">>,
+ <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
+ <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]}))),
+ ?assertEqual(
+ <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>,
+ iolist_to_binary(
+ to_html({<<"html">>,[],
+ [{pi, <<"xml:namespace">>,
+ [{<<"prefix">>,<<"o">>},
+ {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]}))),
+ ok.
+
+escape_test() ->
+ ?assertEqual(
+ <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape(<<"&quot;\"word ><<up!&quot;">>)),
+ ?assertEqual(
+ <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape("&quot;\"word ><<up!&quot;")),
+ ?assertEqual(
+ <<"&amp;quot;\"word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape('&quot;\"word ><<up!&quot;')),
+ ok.
+
+escape_attr_test() ->
+ ?assertEqual(
+ <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape_attr(<<"&quot;\"word ><<up!&quot;">>)),
+ ?assertEqual(
+ <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape_attr("&quot;\"word ><<up!&quot;")),
+ ?assertEqual(
+ <<"&amp;quot;&quot;word &gt;&lt;&lt;up!&amp;quot;">>,
+ escape_attr('&quot;\"word ><<up!&quot;')),
+ ?assertEqual(
+ <<"12345">>,
+ escape_attr(12345)),
+ ?assertEqual(
+ <<"1.5">>,
+ escape_attr(1.5)),
+ ok.
+
+tokens_test() ->
+ ?assertEqual(
+ [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
+ {<<"wibble">>, <<"wibble">>},
+ {<<"alice">>, <<"bob">>}], true}],
+ tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>)),
+ ?assertEqual(
+ [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
+ {<<"wibble">>, <<"wibble">>},
+ {<<"alice">>, <<"bob">>}], true}],
+ tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>)),
+ ?assertEqual(
+ [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}],
+ tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>)),
+ ?assertEqual(
+ [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+ {data, <<" A= B <= C ">>, false},
+ {end_tag, <<"script">>}],
+ tokens(<<"<script type=\"text/javascript\"> A= B <= C </script>">>)),
+ ?assertEqual(
+ [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+ {data, <<" A= B <= C ">>, false},
+ {end_tag, <<"script">>}],
+ tokens(<<"<script type =\"text/javascript\"> A= B <= C </script>">>)),
+ ?assertEqual(
+ [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+ {data, <<" A= B <= C ">>, false},
+ {end_tag, <<"script">>}],
+ tokens(<<"<script type = \"text/javascript\"> A= B <= C </script>">>)),
+ ?assertEqual(
+ [{start_tag, <<"script">>, [{<<"type">>, <<"text/javascript">>}], false},
+ {data, <<" A= B <= C ">>, false},
+ {end_tag, <<"script">>}],
+ tokens(<<"<script type= \"text/javascript\"> A= B <= C </script>">>)),
+ ?assertEqual(
+ [{start_tag, <<"textarea">>, [], false},
+ {data, <<"<html></body>">>, false},
+ {end_tag, <<"textarea">>}],
+ tokens(<<"<textarea><html></body></textarea>">>)),
+ ?assertEqual(
+ [{start_tag, <<"textarea">>, [], false},
+ {data, <<"<html></body></textareaz>">>, false}],
+ tokens(<<"<textarea ><html></body></textareaz>">>)),
+ ?assertEqual(
+ [{pi, <<"xml:namespace">>,
+ [{<<"prefix">>,<<"o">>},
+ {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
+ tokens(<<"<?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?>">>)),
+ ?assertEqual(
+ [{pi, <<"xml:namespace">>,
+ [{<<"prefix">>,<<"o">>},
+ {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
+ tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office \n?>">>)),
+ ?assertEqual(
+ [{pi, <<"xml:namespace">>,
+ [{<<"prefix">>,<<"o">>},
+ {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}],
+ tokens(<<"<?xml:namespace prefix=o ns=urn:schemas-microsoft-com:office:office">>)),
+ ?assertEqual(
+ [{data, <<"<">>, false}],
+ tokens(<<"&lt;">>)),
+ ?assertEqual(
+ [{data, <<"not html ">>, false},
+ {data, <<"< at all">>, false}],
+ tokens(<<"not html < at all">>)),
+ ok.
+
+parse_test() ->
+ D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
+<html>
+ <head>
+ <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
+ <title>Foo</title>
+ <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
+ <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
+ <!--[if lt IE 7]>
+ <style type=\"text/css\">
+ .no_ie { display: none; }
+ </style>
+ <![endif]-->
+ <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
+ <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
+ </head>
+ <body id=\"home\" class=\"tundra\"><![CDATA[&lt;<this<!-- is -->CDATA>&gt;]]></body>
+</html>">>,
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"head">>, [],
+ [{<<"meta">>,
+ [{<<"http-equiv">>,<<"Content-Type">>},
+ {<<"content">>,<<"text/html; charset=UTF-8">>}],
+ []},
+ {<<"title">>,[],[<<"Foo">>]},
+ {<<"link">>,
+ [{<<"rel">>,<<"stylesheet">>},
+ {<<"type">>,<<"text/css">>},
+ {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
+ {<<"media">>,<<"screen">>}],
+ []},
+ {<<"link">>,
+ [{<<"rel">>,<<"stylesheet">>},
+ {<<"type">>,<<"text/css">>},
+ {<<"href">>,<<"/static/foo.css">>},
+ {<<"media">>,<<"screen">>}],
+ []},
+ {comment,<<"[if lt IE 7]>\n <style type=\"text/css\">\n .no_ie { display: none; }\n </style>\n <![endif]">>},
+ {<<"link">>,
+ [{<<"rel">>,<<"icon">>},
+ {<<"href">>,<<"/static/images/favicon.ico">>},
+ {<<"type">>,<<"image/x-icon">>}],
+ []},
+ {<<"link">>,
+ [{<<"rel">>,<<"shortcut icon">>},
+ {<<"href">>,<<"/static/images/favicon.ico">>},
+ {<<"type">>,<<"image/x-icon">>}],
+ []}]},
+ {<<"body">>,
+ [{<<"id">>,<<"home">>},
+ {<<"class">>,<<"tundra">>}],
+ [<<"&lt;<this<!-- is -->CDATA>&gt;">>]}]},
+ parse(D0)),
+ ?assertEqual(
+ {<<"html">>,[],
+ [{pi, <<"xml:namespace">>,
+ [{<<"prefix">>,<<"o">>},
+ {<<"ns">>,<<"urn:schemas-microsoft-com:office:office">>}]}]},
+ parse(
+ <<"<html><?xml:namespace prefix=\"o\" ns=\"urn:schemas-microsoft-com:office:office\"?></html>">>)),
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"dd">>, [], [<<"foo">>]},
+ {<<"dt">>, [], [<<"bar">>]}]},
+ parse(<<"<html><dd>foo<dt>bar</html>">>)),
+ %% Singleton sadness
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"link">>, [], []},
+ <<"foo">>,
+ {<<"br">>, [], []},
+ <<"bar">>]},
+ parse(<<"<html><link>foo<br>bar</html>">>)),
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"link">>, [], [<<"foo">>,
+ {<<"br">>, [], []},
+ <<"bar">>]}]},
+ parse(<<"<html><link>foo<br>bar</link></html>">>)),
+ ok.
+
+exhaustive_is_singleton_test() ->
+ T = mochiweb_cover:clause_lookup_table(?MODULE, is_singleton),
+ [?assertEqual(V, is_singleton(K)) || {K, V} <- T].
+
+tokenize_attributes_test() ->
+ ?assertEqual(
+ {<<"foo">>,
+ [{<<"bar">>, <<"b\"az">>},
+ {<<"wibble">>, <<"wibble">>},
+ {<<"taco", 16#c2, 16#a9>>, <<"bell">>},
+ {<<"quux">>, <<"quux">>}],
+ []},
+ parse(<<"<foo bar=\"b&quot;az\" wibble taco&copy;=bell quux">>)),
+ ok.
+
+tokens2_test() ->
+ D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org</link><description>Bob's Rants</description></channel>">>,
+ ?assertEqual(
+ [{start_tag,<<"channel">>,[],false},
+ {start_tag,<<"title">>,[],false},
+ {data,<<"from __future__ import *">>,false},
+ {end_tag,<<"title">>},
+ {start_tag,<<"link">>,[],true},
+ {data,<<"http://bob.pythonmac.org">>,false},
+ {end_tag,<<"link">>},
+ {start_tag,<<"description">>,[],false},
+ {data,<<"Bob's Rants">>,false},
+ {end_tag,<<"description">>},
+ {end_tag,<<"channel">>}],
+ tokens(D0)),
+ ok.
+
+to_tokens_test() ->
+ ?assertEqual(
+ [{start_tag, <<"p">>, [{class, 1}], false},
+ {end_tag, <<"p">>}],
+ to_tokens({p, [{class, 1}], []})),
+ ?assertEqual(
+ [{start_tag, <<"p">>, [], false},
+ {end_tag, <<"p">>}],
+ to_tokens({p})),
+ ?assertEqual(
+ [{'=', <<"data">>}],
+ to_tokens({'=', <<"data">>})),
+ ?assertEqual(
+ [{comment, <<"comment">>}],
+ to_tokens({comment, <<"comment">>})),
+ %% This is only allowed in sub-tags:
+ %% {p, [{"class", "foo"}]} as {p, [{"class", "foo"}], []}
+ %% On the outside it's always treated as follows:
+ %% {p, [], [{"class", "foo"}]} as {p, [], [{"class", "foo"}]}
+ ?assertEqual(
+ [{start_tag, <<"html">>, [], false},
+ {start_tag, <<"p">>, [{class, 1}], false},
+ {end_tag, <<"p">>},
+ {end_tag, <<"html">>}],
+ to_tokens({html, [{p, [{class, 1}]}]})),
+ ok.
+
+parse2_test() ->
+ D0 = <<"<channel><title>from __future__ import *</title><link>http://bob.pythonmac.org<br>foo</link><description>Bob's Rants</description></channel>">>,
+ ?assertEqual(
+ {<<"channel">>,[],
+ [{<<"title">>,[],[<<"from __future__ import *">>]},
+ {<<"link">>,[],[
+ <<"http://bob.pythonmac.org">>,
+ {<<"br">>,[],[]},
+ <<"foo">>]},
+ {<<"description">>,[],[<<"Bob's Rants">>]}]},
+ parse(D0)),
+ ok.
+
+parse_tokens_test() ->
+ D0 = [{doctype,[<<"HTML">>,<<"PUBLIC">>,<<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
+ {data,<<"\n">>,true},
+ {start_tag,<<"html">>,[],false}],
+ ?assertEqual(
+ {<<"html">>, [], []},
+ parse_tokens(D0)),
+ D1 = D0 ++ [{end_tag, <<"html">>}],
+ ?assertEqual(
+ {<<"html">>, [], []},
+ parse_tokens(D1)),
+ D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
+ ?assertEqual(
+ {<<"html">>, [], [{<<"body">>, [], []}]},
+ parse_tokens(D2)),
+ D3 = D0 ++ [{start_tag, <<"head">>, [], false},
+ {end_tag, <<"head">>},
+ {start_tag, <<"body">>, [], false}],
+ ?assertEqual(
+ {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]},
+ parse_tokens(D3)),
+ D4 = D3 ++ [{data,<<"\n">>,true},
+ {start_tag,<<"div">>,[{<<"class">>,<<"a">>}],false},
+ {start_tag,<<"a">>,[{<<"name">>,<<"#anchor">>}],false},
+ {end_tag,<<"a">>},
+ {end_tag,<<"div">>},
+ {start_tag,<<"div">>,[{<<"class">>,<<"b">>}],false},
+ {start_tag,<<"div">>,[{<<"class">>,<<"c">>}],false},
+ {end_tag,<<"div">>},
+ {end_tag,<<"div">>}],
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"head">>, [], []},
+ {<<"body">>, [],
+ [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
+ {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
+ ]}]},
+ parse_tokens(D4)),
+ D5 = [{start_tag,<<"html">>,[],false},
+ {data,<<"\n">>,true},
+ {data,<<"boo">>,false},
+ {data,<<"hoo">>,false},
+ {data,<<"\n">>,true},
+ {end_tag,<<"html">>}],
+ ?assertEqual(
+ {<<"html">>, [], [<<"\nboohoo\n">>]},
+ parse_tokens(D5)),
+ D6 = [{start_tag,<<"html">>,[],false},
+ {data,<<"\n">>,true},
+ {data,<<"\n">>,true},
+ {end_tag,<<"html">>}],
+ ?assertEqual(
+ {<<"html">>, [], []},
+ parse_tokens(D6)),
+ D7 = [{start_tag,<<"html">>,[],false},
+ {start_tag,<<"ul">>,[],false},
+ {start_tag,<<"li">>,[],false},
+ {data,<<"word">>,false},
+ {start_tag,<<"li">>,[],false},
+ {data,<<"up">>,false},
+ {end_tag,<<"li">>},
+ {start_tag,<<"li">>,[],false},
+ {data,<<"fdsa">>,false},
+ {start_tag,<<"br">>,[],true},
+ {data,<<"asdf">>,false},
+ {end_tag,<<"ul">>},
+ {end_tag,<<"html">>}],
+ ?assertEqual(
+ {<<"html">>, [],
+ [{<<"ul">>, [],
+ [{<<"li">>, [], [<<"word">>]},
+ {<<"li">>, [], [<<"up">>]},
+ {<<"li">>, [], [<<"fdsa">>,{<<"br">>, [], []}, <<"asdf">>]}]}]},
+ parse_tokens(D7)),
+ ok.
+
+destack_test() ->
+ {<<"a">>, [], []} =
+ destack([{<<"a">>, [], []}]),
+ {<<"a">>, [], [{<<"b">>, [], []}]} =
+ destack([{<<"b">>, [], []}, {<<"a">>, [], []}]),
+ {<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]} =
+ destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
+ [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}] =
+ destack(<<"b">>,
+ [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
+ [{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}] =
+ destack(<<"c">>,
+ [{<<"c">>, [], []}, {<<"b">>, [], []},{<<"a">>, [], []}]),
+ ok.
+
+doctype_test() ->
+ ?assertEqual(
+ {<<"html">>,[],[{<<"head">>,[],[]}]},
+ mochiweb_html:parse("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
+ "<html><head></head></body></html>")),
+ %% http://code.google.com/p/mochiweb/issues/detail?id=52
+ ?assertEqual(
+ {<<"html">>,[],[{<<"head">>,[],[]}]},
+ mochiweb_html:parse("<html>"
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"
+ "<head></head></body></html>")),
+ ok.
+
+-endif.