path: root/src/mochiweb/mochiutf8.erl
diff options
authorRobert Newson <>2010-07-26 17:21:30 +0000
committerRobert Newson <>2010-07-26 17:21:30 +0000
commit4b0948ddb3a428f8a5330e05745b2fbd4ccf9375 (patch)
tree5ab1dde286028653d5569ceae6dfc883fa365b7a /src/mochiweb/mochiutf8.erl
parentcd214b23e8129868d4a7020ddafd55a16e496652 (diff)
Add SSL support to CouchDB.
To enable SSL you need to do three things; 1) enable the httpsd daemon in local.ini (you can just uncomment the line). 2) supply your PEM-encoded cert and key files in the [ssl] section. 3) start CouchDB. CouchDB will now, in addition to handling HTTP on port 5984, accept SSL connections on port 6984. The patch itself adds SSL support by updating the local version of Mochiweb to the latest. The upstream release includes our local tweak to support large numbers and to handle Accept-Encoding headers. Our local Mochiweb fork changed the default idle timeout from 10 seconds to 5 minutes, and it was agreed on #irc to revert this change. The only tweaks to Mochiweb were in (to record the git commit I built from) and the removal of Makefile (replaced by Futon received many tweaks as we have 'http://' hardcoded all over. All such instances now use window.location.protocol + '//'. CouchDB received a tweak to use the right scheme in couch_httpd:absolute_uri (it now gets it from the Mochireq and not mochiweb_socket_server). git-svn-id: 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/mochiweb/mochiutf8.erl')
1 files changed, 316 insertions, 0 deletions
diff --git a/src/mochiweb/mochiutf8.erl b/src/mochiweb/mochiutf8.erl
new file mode 100644
index 00000000..206e1186
--- /dev/null
+++ b/src/mochiweb/mochiutf8.erl
@@ -0,0 +1,316 @@
+%% @copyright 2010 Mochi Media, Inc.
+%% @author Bob Ippolito <>
+%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring
+%% invalid bytes.
+%% codepoints_to_bytes/1 is documented below as part of the external API
+%% (it has a -spec and @doc), so it is exported together with its
+%% single-codepoint counterpart.
+-export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]).
+-export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]).
+-export([read_codepoint/1, len/1]).
+%% External API
+-type unichar_low() :: 0..16#d7ff.
+-type unichar_high() :: 16#e000..16#10ffff.
+-type unichar() :: unichar_low() | unichar_high().
+-spec codepoint_to_bytes(unichar()) -> binary().
+%% @doc Encode one Unicode codepoint as its UTF-8 byte sequence.
+%% Values in the UTF-16 surrogate range (U+D800 - U+DFFF), negative
+%% values and values above U+10FFFF match no clause and therefore
+%% raise function_clause.
+codepoint_to_bytes(C) when C >= 16#00, C =< 16#7F ->
+    %% U+0000 - U+007F: 0xxxxxxx
+    <<C>>;
+codepoint_to_bytes(C) when C >= 16#80, C =< 16#07FF ->
+    %% U+0080 - U+07FF: 110xxxxx 10xxxxxx (11 payload bits)
+    <<Hi:5, Lo:6>> = <<C:11>>,
+    <<2#110:3, Hi:5, 2#10:2, Lo:6>>;
+codepoint_to_bytes(C) when C >= 16#0800, C < 16#D800;
+                           C > 16#DFFF, C =< 16#FFFF ->
+    %% U+0800 - U+FFFF minus the UTF-16 surrogates:
+    %% 1110xxxx 10xxxxxx 10xxxxxx (16 payload bits)
+    <<Hi:4, Mid:6, Lo:6>> = <<C:16>>,
+    <<2#1110:4, Hi:4, 2#10:2, Mid:6, 2#10:2, Lo:6>>;
+codepoint_to_bytes(C) when C >= 16#010000, C =< 16#10FFFF ->
+    %% U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    %% (21 payload bits)
+    <<Top:3, Hi:6, Mid:6, Lo:6>> = <<C:21>>,
+    <<2#11110:5, Top:3, 2#10:2, Hi:6, 2#10:2, Mid:6, 2#10:2, Lo:6>>.
+-spec codepoints_to_bytes([unichar()]) -> binary().
+%% @doc Encode a list of codepoints as one UTF-8 binary by encoding
+%% each element with codepoint_to_bytes/1 and joining the results in
+%% list order.
+codepoints_to_bytes(L) ->
+    Encoded = [codepoint_to_bytes(Codepoint) || Codepoint <- L],
+    iolist_to_binary(Encoded).
+-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}.
+%% Decode the codepoint at the front of a UTF-8 binary. Returns
+%% {Codepoint, EncodedBytes, Remainder}, where EncodedBytes is the 1-4
+%% byte prefix that was consumed. A prefix that has the right bit
+%% layout but decodes to an overlong form, a UTF-16 surrogate or a
+%% value above U+10FFFF fails the inner case (case_clause); input that
+%% fits no layout at all (including <<>>) raises function_clause.
+read_codepoint(<<2#0:1, C:7, _/binary>> = Bin) ->
+    %% U+0000 - U+007F: one byte
+    {Bytes, Rest} = split_binary(Bin, 1),
+    {C, Bytes, Rest};
+read_codepoint(<<2#110:3, B1:5,
+                 2#10:2, B0:6,
+                 _/binary>> = Bin) ->
+    %% U+0080 - U+07FF: two bytes; anything below U+0080 is overlong
+    case <<B1:5, B0:6>> of
+        <<C:11>> when C >= 16#80 ->
+            {Bytes, Rest} = split_binary(Bin, 2),
+            {C, Bytes, Rest}
+    end;
+read_codepoint(<<2#1110:4, B2:4,
+                 2#10:2, B1:6,
+                 2#10:2, B0:6,
+                 _/binary>> = Bin) ->
+    %% U+0800 - U+FFFF: three bytes, UTF-16 surrogates excluded.
+    %% C is a 16-bit value here, so no explicit upper bound is needed.
+    case <<B2:4, B1:6, B0:6>> of
+        <<C:16>> when C >= 16#0800, C < 16#D800;
+                      C > 16#DFFF ->
+            {Bytes, Rest} = split_binary(Bin, 3),
+            {C, Bytes, Rest}
+    end;
+read_codepoint(<<2#11110:5, B3:3,
+                 2#10:2, B2:6,
+                 2#10:2, B1:6,
+                 2#10:2, B0:6,
+                 _/binary>> = Bin) ->
+    %% U+10000 - U+10FFFF: four bytes
+    case <<B3:3, B2:6, B1:6, B0:6>> of
+        <<C:21>> when C >= 16#010000, C =< 16#10FFFF ->
+            {Bytes, Rest} = split_binary(Bin, 4),
+            {C, Bytes, Rest}
+    end.
+-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _.
+%% Left fold over the codepoints of a UTF-8 binary: F(Codepoint, Acc)
+%% is applied front to back and the final accumulator is returned.
+%% Decoding is delegated to read_codepoint/1, so its failures
+%% propagate unchanged.
+codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) ->
+    Acc;
+codepoint_foldl(F, Acc, Bin) ->
+    {Codepoint, _Bytes, Tail} = read_codepoint(Bin),
+    NextAcc = F(Codepoint, Acc),
+    codepoint_foldl(F, NextAcc, Tail).
+-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _.
+%% Left fold over a UTF-8 binary one encoded codepoint at a time:
+%% F(EncodedBytes, Acc) receives the 1-4 byte slice of each codepoint
+%% rather than its numeric value. Decoding is delegated to
+%% read_codepoint/1, so its failures propagate unchanged.
+bytes_foldl(F, Acc, <<>>) when is_function(F, 2) ->
+    Acc;
+bytes_foldl(F, Acc, Bin) ->
+    {_Codepoint, Bytes, Tail} = read_codepoint(Bin),
+    NextAcc = F(Bytes, Acc),
+    bytes_foldl(F, NextAcc, Tail).
+-spec bytes_to_codepoints(binary()) -> [unichar()].
+%% Decode a UTF-8 binary into the list of its codepoints, in order.
+bytes_to_codepoints(B) ->
+    %% The fold prepends, so the list comes out back to front.
+    Prepend = fun (Codepoint, Seen) -> [Codepoint | Seen] end,
+    Reversed = codepoint_foldl(Prepend, [], B),
+    lists:reverse(Reversed).
+-spec len(binary()) -> non_neg_integer().
+%% Number of codepoints (not bytes) in a UTF-8 binary. Counting is a
+%% fold over the decoded codepoints, so decoding failures from
+%% read_codepoint/1 propagate unchanged.
+len(B) ->
+    CountOne = fun (_Codepoint, Count) -> Count + 1 end,
+    codepoint_foldl(CountOne, 0, B).
+-spec valid_utf8_bytes(B::binary()) -> binary().
+%% @doc Return B with every byte that is not part of valid UTF-8
+%% removed. The input is scanned once to collect the offending byte
+%% offsets: a byte that does not start a well-formed 1-4 byte sequence
+%% is dropped on its own, while a well-formed 2-4 byte sequence is
+%% dropped as a whole when it is an overlong encoding or encodes a bad
+%% code point (surrogate U+D800 - U+DFFF, or above U+10FFFF).
+valid_utf8_bytes(B) when is_binary(B) ->
+    BadIndexes = invalid_utf8_indexes(B),
+    binary_skip_bytes(B, BadIndexes).
+%% Internal API
+-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary().
+%% @doc Return B without the bytes at the 0-based offsets listed in L.
+%% With nothing to skip, B is handed back untouched.
+binary_skip_bytes(B, L) ->
+    case L of
+        [] ->
+            B;
+        _ ->
+            binary_skip_bytes(B, L, 0, [])
+    end.
+%% @private
+%% Worker for binary_skip_bytes/2. Pos is the offset of the next byte
+%% of the original binary; Kept holds the retained bytes newest-first.
+%% A byte is dropped exactly when Pos equals the head of the skip
+%% list. Once the skip list is exhausted, the unread remainder is
+%% appended in one piece.
+-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary().
+binary_skip_bytes(Remainder, [], _Pos, Kept) ->
+    iolist_to_binary(lists:reverse([Remainder | Kept]));
+binary_skip_bytes(<<_Dropped, Tail/binary>>, [Pos | Skips], Pos, Kept) ->
+    binary_skip_bytes(Tail, Skips, Pos + 1, Kept);
+binary_skip_bytes(<<Byte, Tail/binary>>, Skips, Pos, Kept) ->
+    binary_skip_bytes(Tail, Skips, Pos + 1, [Byte | Kept]).
+-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()].
+%% @doc List, in ascending order, the 0-based offsets of the bytes in
+%% B that are not part of valid UTF-8.
+invalid_utf8_indexes(B) ->
+    StartOffset = 0,
+    invalid_utf8_indexes(B, StartOffset, []).
+%% @private
+%% Scanner behind invalid_utf8_indexes/1. Pos is the offset of the
+%% first unread byte; Acc collects rejected offsets newest-first and
+%% is reversed when the input runs out. The lead-byte and
+%% continuation-byte tags are matched as bit patterns in the clause
+%% heads, and the payload bits are then combined into the code point
+%% for range checking.
+-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()].
+invalid_utf8_indexes(<<2#0:1, _:7, Rest/binary>>, Pos, Acc) ->
+    %% U+0000 - U+007F: single byte, always acceptable
+    invalid_utf8_indexes(Rest, Pos + 1, Acc);
+invalid_utf8_indexes(<<2#110:3, X1:5,
+                       2#10:2, X0:6,
+                       Rest/binary>>, Pos, Acc) ->
+    %% Two-byte form, U+0080 - U+07FF. Only the lower bound can be
+    %% violated (overlong encoding); 11 bits cannot exceed U+07FF.
+    Acc1 = case ((X1 bsl 6) bor X0) < 16#80 of
+               true -> [Pos + 1, Pos | Acc];
+               false -> Acc
+           end,
+    invalid_utf8_indexes(Rest, Pos + 2, Acc1);
+invalid_utf8_indexes(<<2#1110:4, X2:4,
+                       2#10:2, X1:6,
+                       2#10:2, X0:6,
+                       Rest/binary>>, Pos, Acc) ->
+    %% Three-byte form, U+0800 - U+FFFF. Rejected when overlong or a
+    %% surrogate; 16 bits cannot exceed U+FFFF.
+    Code = (X2 bsl 12) bor (X1 bsl 6) bor X0,
+    Rejected = Code < 16#800
+        orelse (Code >= 16#D800 andalso Code =< 16#DFFF),
+    Acc1 = case Rejected of
+               true -> [Pos + 2, Pos + 1, Pos | Acc];
+               false -> Acc
+           end,
+    invalid_utf8_indexes(Rest, Pos + 3, Acc1);
+invalid_utf8_indexes(<<2#11110:5, X3:3,
+                       2#10:2, X2:6,
+                       2#10:2, X1:6,
+                       2#10:2, X0:6,
+                       Rest/binary>>, Pos, Acc) ->
+    %% Four-byte form, U+10000 - U+10FFFF. Rejected when overlong or
+    %% beyond the last Unicode code point.
+    Code = (X3 bsl 18) bor (X2 bsl 12) bor (X1 bsl 6) bor X0,
+    Rejected = Code < 16#10000 orelse Code > 16#10FFFF,
+    Acc1 = case Rejected of
+               true -> [Pos + 3, Pos + 2, Pos + 1, Pos | Acc];
+               false -> Acc
+           end,
+    invalid_utf8_indexes(Rest, Pos + 4, Acc1);
+invalid_utf8_indexes(<<_, Rest/binary>>, Pos, Acc) ->
+    %% Not the start of any well-formed sequence: reject this byte
+    %% alone and resynchronise on the next one.
+    invalid_utf8_indexes(Rest, Pos + 1, [Pos | Acc]);
+invalid_utf8_indexes(<<>>, _Pos, Acc) ->
+    lists:reverse(Acc).
+%% Tests
+%% binary_skip_bytes/2: no offsets, one offset in the middle, a run of
+%% offsets up to the end, and the very first offset.
+binary_skip_bytes_test() ->
+    %% {Expected, Input, OffsetsToSkip}
+    Cases = [{<<"foo">>, <<"foo">>, []},
+             {<<"foobar">>, <<"foo bar">>, [3]},
+             {<<"foo">>, <<"foo bar">>, [3, 4, 5, 6]},
+             {<<"oo bar">>, <<"foo bar">>, [0]}],
+    lists:foreach(
+      fun ({Expected, Input, Offsets}) ->
+              ?assertEqual(Expected, binary_skip_bytes(Input, Offsets))
+      end,
+      Cases),
+    ok.
+%% invalid_utf8_indexes/1: clean input, a lone continuation byte, and
+%% a user-agent string with stray high bytes mixed into ASCII.
+invalid_utf8_indexes_test() ->
+    Snowman = <<"unicode snowman for you: ", 226, 152, 131>>,
+    ?assertEqual([], invalid_utf8_indexes(Snowman)),
+    LoneContinuation = <<128>>,
+    ?assertEqual([0], invalid_utf8_indexes(LoneContinuation)),
+    UserAgent = <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
+                  167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>,
+    ?assertEqual([57,59,60,64,66,67], invalid_utf8_indexes(UserAgent)),
+    ok.
+%% codepoint_to_bytes/1 at the edges of each encoding length:
+%%   U+0000  - U+007F   1 byte
+%%   U+0080  - U+07FF   2 bytes
+%%   U+0800  - U+FFFF   3 bytes (excluding UTF-16 surrogate code points)
+%%   U+10000 - U+10FFFF 4 bytes
+codepoint_to_bytes_test() ->
+    %% {ExpectedBytes, Codepoint}
+    Cases = [{<<"a">>, $a},
+             {<<16#c2, 16#80>>, 16#80},
+             {<<16#df, 16#bf>>, 16#07ff},
+             {<<16#ef, 16#bf, 16#bf>>, 16#ffff},
+             {<<16#f4, 16#8f, 16#bf, 16#bf>>, 16#10ffff}],
+    lists:foreach(
+      fun ({Expected, Codepoint}) ->
+              ?assertEqual(Expected, codepoint_to_bytes(Codepoint))
+      end,
+      Cases),
+    ok.
+%% bytes_foldl/3: re-joining the per-codepoint byte slices must give
+%% back the input, for plain ASCII and for a mix of 1-4 byte forms.
+bytes_foldl_test() ->
+    Append = fun (B, Acc) -> <<Acc/binary, B/binary>> end,
+    Ascii = <<"abc">>,
+    ?assertEqual(<<"abc">>, bytes_foldl(Append, <<>>, Ascii)),
+    Mixed = <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
+    ?assertEqual(
+       <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
+       bytes_foldl(Append, <<>>, Mixed)),
+    ok.
+%% bytes_to_codepoints/1 on input covering every encoding length.
+bytes_to_codepoints_test() ->
+    Input = <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
+    Expected = "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff],
+    ?assertEqual(Expected, bytes_to_codepoints(Input)),
+    ok.
+%% codepoint_foldl/3: prepending each codepoint yields the input's
+%% codepoints in reverse order.
+codepoint_foldl_test() ->
+    Prepend = fun (C, Acc) -> [C | Acc] end,
+    ?assertEqual("cba", codepoint_foldl(Prepend, [], <<"abc">>)),
+    Mixed = <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
+    ?assertEqual(
+       [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"],
+       codepoint_foldl(Prepend, [], Mixed)),
+    ok.
+%% len/1 counts codepoints, not bytes: 25 ASCII characters followed by
+%% four multi-byte codepoints.
+len_test() ->
+    Input = <<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>,
+    ?assertEqual(29, len(Input)),
+    ok.
+%% codepoints_to_bytes/1 must agree with encoding each codepoint
+%% separately and concatenating the results.
+codepoints_to_bytes_test() ->
+    Codepoints = lists:seq(1, 1000),
+    OneByOne = lists:map(fun codepoint_to_bytes/1, Codepoints),
+    ?assertEqual(iolist_to_binary(OneByOne),
+                 codepoints_to_bytes(Codepoints)),
+    ok.
+%% valid_utf8_bytes/1: out-of-range code points, overlong encodings,
+%% stray coding bytes and unpaired surrogates are stripped, while
+%% well-formed sequences pass through untouched.
+valid_utf8_bytes_test() ->
+    ?assertEqual(
+       <<"invalid U+11ffff: ">>,
+       valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)),
+    ?assertEqual(
+       <<"U+10ffff: ", 244, 143, 191, 191>>,
+       valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)),
+    ?assertEqual(
+       <<"overlong 2-byte encoding (a): ">>,
+       valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)),
+    ?assertEqual(
+       <<"overlong 2-byte encoding (!): ">>,
+       valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)),
+    ?assertEqual(
+       <<"mu: ", 194, 181>>,
+       valid_utf8_bytes(<<"mu: ", 194, 181>>)),
+    ?assertEqual(
+       <<"bad coding bytes: ">>,
+       valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)),
+    %% Surrogates. High (lead) surrogates are U+D800 - U+DBFF, encoded
+    %% as ED A0 80 - ED AF BF; low (trail) surrogates are U+DC00 -
+    %% U+DFFF, encoded as ED B0 80 - ED BF BF. Both ends of both
+    %% ranges are covered.
+    ?assertEqual(
+       <<"high surrogate U+D800 (unpaired): ">>,
+       valid_utf8_bytes(<<"high surrogate U+D800 (unpaired): ", 237, 160, 128>>)),
+    ?assertEqual(
+       <<"high surrogate U+DBFF (unpaired): ">>,
+       valid_utf8_bytes(<<"high surrogate U+DBFF (unpaired): ", 237, 175, 191>>)),
+    ?assertEqual(
+       <<"low surrogate U+DC00 (unpaired): ">>,
+       valid_utf8_bytes(<<"low surrogate U+DC00 (unpaired): ", 237, 176, 128>>)),
+    ?assertEqual(
+       <<"low surrogate U+DFFF (unpaired): ">>,
+       valid_utf8_bytes(<<"low surrogate U+DFFF (unpaired): ", 237, 191, 191>>)),
+    ?assertEqual(
+       <<"unicode snowman for you: ", 226, 152, 131>>,
+       valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)),
+    ?assertEqual(
+       <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>,
+       valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
+                          167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
+    ok.