diff options
author | Adam Kocoloski <kocolosk@apache.org> | 2009-06-08 14:24:54 +0000 |
---|---|---|
committer | Adam Kocoloski <kocolosk@apache.org> | 2009-06-08 14:24:54 +0000 |
commit | d4ac5c5083a310c16ac1754e3dbc3fafe8c5eb66 (patch) | |
tree | 8f9ae25b7e8d19dc741dadba461a3d75a287caa4 /src/mochiweb | |
parent | cfab1d22a87d1da8cbe15a7ac1886d67b4a928a3 (diff) |
accept UTF-16 surrogate pairs. Fixes COUCHDB-327, COUCHDB-333
git-svn-id: https://svn.apache.org/repos/asf/couchdb/trunk@782643 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/mochiweb')
-rw-r--r-- | src/mochiweb/mochijson2.erl | 18 |
1 files changed, 14 insertions, 4 deletions
diff --git a/src/mochiweb/mochijson2.erl b/src/mochiweb/mochijson2.erl
index 8bfd23c8..7d7a8aff 100644
--- a/src/mochiweb/mochijson2.erl
+++ b/src/mochiweb/mochijson2.erl
@@ -371,11 +371,20 @@ tokenize_string(B, S=#decoder{offset=O}, Acc) ->
             tokenize_string(B, ?ADV_COL(S, 2), [$\r | Acc]);
         <<_:O/binary, "\\t", _/binary>> ->
             tokenize_string(B, ?ADV_COL(S, 2), [$\t | Acc]);
-        <<_:O/binary, "\\u", C3, C2, C1, C0, _/binary>> ->
-            %% coalesce UTF-16 surrogate pair?
+        <<_:O/binary, "\\u", C3, C2, C1, C0, Rest/binary>> ->
             C = erlang:list_to_integer([C3, C2, C1, C0], 16),
-            Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
-            tokenize_string(B, ?ADV_COL(S, 6), Acc1);
+            if C > 16#D7FF, C < 16#DC00 ->
+                %% coalesce UTF-16 surrogate pair
+                <<"\\u", D3, D2, D1, D0, _/binary>> = Rest,
+                D = erlang:list_to_integer([D3,D2,D1,D0], 16),
+                [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer,
+                    D:16/big-unsigned-integer>>),
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc),
+                tokenize_string(B, ?ADV_COL(S, 12), Acc1);
+            true ->
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
+                tokenize_string(B, ?ADV_COL(S, 6), Acc1)
+            end;
         <<_:O/binary, C, _/binary>> ->
             tokenize_string(B, ?INC_CHAR(S, C), [C | Acc])
     end.
@@ -541,6 +550,7 @@ equiv_list([V1 | L1], [V2 | L2]) ->
 
 test_all() ->
     [1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
+    <<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]),
     test_one(e2j_test_vec(utf8), 1).
 
 test_one([], _N) ->