diff options
Diffstat (limited to 'src/mochiweb/mochiutf8.erl')
-rw-r--r-- | src/mochiweb/mochiutf8.erl | 316 |
1 files changed, 316 insertions, 0 deletions
diff --git a/src/mochiweb/mochiutf8.erl b/src/mochiweb/mochiutf8.erl new file mode 100644 index 00000000..206e1186 --- /dev/null +++ b/src/mochiweb/mochiutf8.erl @@ -0,0 +1,316 @@ +%% @copyright 2010 Mochi Media, Inc. +%% @author Bob Ippolito <bob@mochimedia.com> + +%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring +%% invalid bytes. + +-module(mochiutf8). +-export([valid_utf8_bytes/1, codepoint_to_bytes/1, bytes_to_codepoints/1]). +-export([bytes_foldl/3, codepoint_foldl/3, read_codepoint/1, len/1]). + +%% External API + +-type unichar_low() :: 0..16#d7ff. +-type unichar_high() :: 16#e000..16#10ffff. +-type unichar() :: unichar_low() | unichar_high(). + +-spec codepoint_to_bytes(unichar()) -> binary(). +%% @doc Convert a unicode codepoint to UTF-8 bytes. +codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) -> + %% U+0000 - U+007F - 7 bits + <<C>>; +codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) -> + %% U+0080 - U+07FF - 11 bits + <<0:5, B1:5, B0:6>> = <<C:16>>, + <<2#110:3, B1:5, + 2#10:2, B0:6>>; +codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso + (C < 16#D800 orelse C > 16#DFFF) -> + %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) + <<B2:4, B1:6, B0:6>> = <<C:16>>, + <<2#1110:4, B2:4, + 2#10:2, B1:6, + 2#10:2, B0:6>>; +codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) -> + %% U+10000 - U+10FFFF - 21 bits + <<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>, + <<2#11110:5, B3:3, + 2#10:2, B2:6, + 2#10:2, B1:6, + 2#10:2, B0:6>>. + +-spec codepoints_to_bytes([unichar()]) -> binary(). +%% @doc Convert a list of codepoints to a UTF-8 binary. +codepoints_to_bytes(L) -> + <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>. + +-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}. +read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) -> + %% U+0000 - U+007F - 7 bits + <<B:1/binary, _/binary>> = Bin, + {C, B, Rest}; +read_codepoint(Bin = <<2#110:3, B1:5, + 2#10:2, B0:6, + Rest/binary>>) -> + %% U+0080 - U+07FF - 11 bits + case <<B1:5, B0:6>> of + <<C:11>> when C >= 16#80 -> + <<B:2/binary, _/binary>> = Bin, + {C, B, Rest} + end; +read_codepoint(Bin = <<2#1110:4, B2:4, + 2#10:2, B1:6, + 2#10:2, B0:6, + Rest/binary>>) -> + %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) + case <<B2:4, B1:6, B0:6>> of + <<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso + (C < 16#D800 orelse C > 16#DFFF) -> + <<B:3/binary, _/binary>> = Bin, + {C, B, Rest} + end; +read_codepoint(Bin = <<2#11110:5, B3:3, + 2#10:2, B2:6, + 2#10:2, B1:6, + 2#10:2, B0:6, + Rest/binary>>) -> + %% U+10000 - U+10FFFF - 21 bits + case <<B3:3, B2:6, B1:6, B0:6>> of + <<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) -> + <<B:4/binary, _/binary>> = Bin, + {C, B, Rest} + end. + +-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _. +codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) -> + Acc; +codepoint_foldl(F, Acc, Bin) -> + {C, _, Rest} = read_codepoint(Bin), + codepoint_foldl(F, F(C, Acc), Rest). + +-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _. +bytes_foldl(F, Acc, <<>>) when is_function(F, 2) -> + Acc; +bytes_foldl(F, Acc, Bin) -> + {_, B, Rest} = read_codepoint(Bin), + bytes_foldl(F, F(B, Acc), Rest). + +-spec bytes_to_codepoints(binary()) -> [unichar()]. +bytes_to_codepoints(B) -> + lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)). + +-spec len(binary()) -> non_neg_integer(). +len(<<>>) -> + 0; +len(B) -> + {_, _, Rest} = read_codepoint(B), + 1 + len(Rest). + +-spec valid_utf8_bytes(B::binary()) -> binary(). +%% @doc Return only the bytes in B that represent valid UTF-8. Uses +%% the following recursive algorithm: skip one byte if B does not +%% follow UTF-8 syntax (a 1-4 byte encoding of some number), +%% skip sequence of 2-4 bytes if it represents an overlong encoding +%% or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF). +valid_utf8_bytes(B) when is_binary(B) -> + binary_skip_bytes(B, invalid_utf8_indexes(B)). + +%% Internal API + +-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary(). +%% @doc Return B, but skipping the 0-based indexes in L. +binary_skip_bytes(B, []) -> + B; +binary_skip_bytes(B, L) -> + binary_skip_bytes(B, L, 0, []). + +%% @private +-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary(). +binary_skip_bytes(B, [], _N, Acc) -> + iolist_to_binary(lists:reverse([B | Acc])); +binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) -> + binary_skip_bytes(RestB, RestL, 1 + N, Acc); +binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) -> + binary_skip_bytes(RestB, L, 1 + N, [C | Acc]). + +-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()]. +%% @doc Return the 0-based indexes in B that are not valid UTF-8. +invalid_utf8_indexes(B) -> + invalid_utf8_indexes(B, 0, []). + +%% @private. +-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()]. +invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 -> + %% U+0000 - U+007F - 7 bits + invalid_utf8_indexes(Rest, 1 + N, Acc); +invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc) + when C1 band 16#E0 =:= 16#C0, + C2 band 16#C0 =:= 16#80 -> + %% U+0080 - U+07FF - 11 bits + case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of + C when C < 16#80 -> + %% Overlong encoding. + invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]); + _ -> + %% Upper bound U+07FF does not need to be checked + invalid_utf8_indexes(Rest, 2 + N, Acc) + end; +invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc) + when C1 band 16#F0 =:= 16#E0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80 -> + %% U+0800 - U+FFFF - 16 bits + case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F) of + C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) -> + %% Overlong encoding or surrogate. + invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]); + _ -> + %% Upper bound U+FFFF does not need to be checked + invalid_utf8_indexes(Rest, 3 + N, Acc) + end; +invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc) + when C1 band 16#F8 =:= 16#F0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80, + C4 band 16#C0 =:= 16#80 -> + %% U+10000 - U+10FFFF - 21 bits + case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of + C when (C < 16#10000) orelse (C > 16#10FFFF) -> + %% Overlong encoding or invalid code point. + invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]); + _ -> + invalid_utf8_indexes(Rest, 4 + N, Acc) + end; +invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) -> + %% Invalid char + invalid_utf8_indexes(Rest, 1 + N, [N | Acc]); +invalid_utf8_indexes(<<>>, _N, Acc) -> + lists:reverse(Acc). + +%% +%% Tests +%% +-include_lib("eunit/include/eunit.hrl"). +-ifdef(TEST). + +binary_skip_bytes_test() -> + ?assertEqual(<<"foo">>, + binary_skip_bytes(<<"foo">>, [])), + ?assertEqual(<<"foobar">>, + binary_skip_bytes(<<"foo bar">>, [3])), + ?assertEqual(<<"foo">>, + binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])), + ?assertEqual(<<"oo bar">>, + binary_skip_bytes(<<"foo bar">>, [0])), + ok. + +invalid_utf8_indexes_test() -> + ?assertEqual( + [], + invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)), + ?assertEqual( + [0], + invalid_utf8_indexes(<<128>>)), + ?assertEqual( + [57,59,60,64,66,67], + invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", + 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), + ok. + +codepoint_to_bytes_test() -> + %% U+0000 - U+007F - 7 bits + %% U+0080 - U+07FF - 11 bits + %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) + %% U+10000 - U+10FFFF - 21 bits + ?assertEqual( + <<"a">>, + codepoint_to_bytes($a)), + ?assertEqual( + <<16#c2, 16#80>>, + codepoint_to_bytes(16#80)), + ?assertEqual( + <<16#df, 16#bf>>, + codepoint_to_bytes(16#07ff)), + ?assertEqual( + <<16#ef, 16#bf, 16#bf>>, + codepoint_to_bytes(16#ffff)), + ?assertEqual( + <<16#f4, 16#8f, 16#bf, 16#bf>>, + codepoint_to_bytes(16#10ffff)), + ok. + +bytes_foldl_test() -> + ?assertEqual( + <<"abc">>, + bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)), + ?assertEqual( + <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>, + bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, + <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), + ok. + +bytes_to_codepoints_test() -> + ?assertEqual( + "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff], + bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), + ok. + +codepoint_foldl_test() -> + ?assertEqual( + "cba", + codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)), + ?assertEqual( + [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"], + codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], + <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), + ok. + +len_test() -> + ?assertEqual( + 29, + len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)), + ok. + +codepoints_to_bytes_test() -> + ?assertEqual( + iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))), + codepoints_to_bytes(lists:seq(1, 1000))), + ok. + +valid_utf8_bytes_test() -> + ?assertEqual( + <<"invalid U+11ffff: ">>, + valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)), + ?assertEqual( + <<"U+10ffff: ", 244, 143, 191, 191>>, + valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)), + ?assertEqual( + <<"overlong 2-byte encoding (a): ">>, + valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)), + ?assertEqual( + <<"overlong 2-byte encoding (!): ">>, + valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)), + ?assertEqual( + <<"mu: ", 194, 181>>, + valid_utf8_bytes(<<"mu: ", 194, 181>>)), + ?assertEqual( + <<"bad coding bytes: ">>, + valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)), + ?assertEqual( + <<"low surrogate (unpaired): ">>, + valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)), + ?assertEqual( + <<"high surrogate (unpaired): ">>, + valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)), + ?assertEqual( + <<"unicode snowman for you: ", 226, 152, 131>>, + valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)), + ?assertEqual( + <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>, + valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", + 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), + ok. + +-endif. |