From 569c6676a6ddb0ff73821d7693b5e18ddef809b9 Mon Sep 17 00:00:00 2001
From: Hans-Christoph Steiner
Date: Thu, 16 Oct 2014 22:51:35 -0400
Subject: Imported Upstream version 3.2.0

---
 test/fts4unicode.test | 200 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 185 insertions(+), 15 deletions(-)

(limited to 'test/fts4unicode.test')

diff --git a/test/fts4unicode.test b/test/fts4unicode.test
index 8bd83f6..f237119 100644
--- a/test/fts4unicode.test
+++ b/test/fts4unicode.test
@@ -44,31 +44,36 @@ proc do_unicode_token_test3 {tn args} {
 }

 do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
-do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü}
-do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx}
+
+do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
+    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
+
+do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
+    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
 do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
-do_unicode_token_test 1.4 "\u1E9E" "0 ß \u1E9E"
-do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"
+do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

-do_unicode_token_test 1.6 "The quick brown fox" {
+do_unicode_token_test 1.5 "The quick brown fox" {
   0 the The 1 quick quick 2 brown brown 3 fox fox
 }
-do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
+do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
   0 the The 1 quick quick 2 brown brown 3 fox fox
 }

-do_unicode_token_test2 1.8 {a B c D} {0 a a 1 b B 2 c c 3 d D}
-do_unicode_token_test2 1.9 {Ä Ö Ü} {0 a Ä 1 o Ö 2 u Ü}
-do_unicode_token_test2 1.10 {xÄx xÖx xÜx} {0 xax xÄx 1 xox xÖx 2 xux xÜx}
+do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
+do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
+
+do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
+    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

 # Check that diacritics are removed if remove_diacritics=1 is specified.
 # And that they do not break tokens.
-do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"
+do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

 # Title-case mappings work
-do_unicode_token_test 1.12 "\u01c5" "0 \u01c6 \u01c5"
+do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"

 #-------------------------------------------------------------------------
 #
@@ -378,11 +383,176 @@ foreach T $tokenizers {
   do_isspace_test 6.$T.18 $T    12288

   do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
-  do_isspace_test 6.$T.19 $T   {8192 8193 8194 8195}
-  do_isspace_test 6.$T.19 $T   {8196 8197 8198 8199}
-  do_isspace_test 6.$T.19 $T   {8200 8201 8202 8239}
-  do_isspace_test 6.$T.19 $T   {8287 12288}
+  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
+  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
+  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
+  do_isspace_test 6.$T.23 $T   {8287 12288}
+}
+
+#-------------------------------------------------------------------------
+# Test that the private use ranges are treated as alphanumeric.
+#
+foreach {tn1 c} {
+  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
+} {
+  foreach {tn2 config res} {
+    1 ""             "0 hello*world hello*world"
+    2 "separators=*" "0 hello hello 1 world world"
+  } {
+    set config [string map [list * $c] $config]
+    set input  [string map [list * $c] "hello*world"]
+    set output [string map [list * $c] $res]
+    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
+  }
+}
+
+#-------------------------------------------------------------------------
+# Cursory test of remove_diacritics=0.
+#
+# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
+# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
+# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
+# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
+#
+do_execsql_test 8.1.1 "
+  CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
+  INSERT INTO t3 VALUES('o');
+  INSERT INTO t3 VALUES('a');
+  INSERT INTO t3 VALUES('O');
+  INSERT INTO t3 VALUES('A');
+  INSERT INTO t3 VALUES('\xD6');
+  INSERT INTO t3 VALUES('\xC4');
+  INSERT INTO t3 VALUES('\xF6');
+  INSERT INTO t3 VALUES('\xE4');
+"
+do_execsql_test 8.1.2 {
+  SELECT rowid FROM t3 WHERE t3 MATCH 'o';
+} {1 3 5 7}
+do_execsql_test 8.1.3 {
+  SELECT rowid FROM t3 WHERE t3 MATCH 'a';
+} {2 4 6 8}
+do_execsql_test 8.2.1 {
+  CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
+  INSERT INTO t4 SELECT * FROM t3;
+}
+do_execsql_test 8.2.2 {
+  SELECT rowid FROM t4 WHERE t4 MATCH 'o';
+} {1 3}
+do_execsql_test 8.2.3 {
+  SELECT rowid FROM t4 WHERE t4 MATCH 'a';
+} {2 4}
+
+#-------------------------------------------------------------------------
+#
+foreach {tn sql} {
+  1 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
+    CREATE VIRTUAL TABLE t6 USING fts4(
+        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
+  }
+  2 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
+  }
+  3 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
+  }
+  4 {
+    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
+    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
+    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
+  }
+} {
+  do_execsql_test 9.$tn.0 {
+    DROP TABLE IF EXISTS t5;
+    DROP TABLE IF EXISTS t5aux;
+    DROP TABLE IF EXISTS t6;
+    DROP TABLE IF EXISTS t6aux;
+    DROP TABLE IF EXISTS t7;
+    DROP TABLE IF EXISTS t7aux;
+  }
+  do_execsql_test 9.$tn.1 $sql
+
+  do_execsql_test 9.$tn.2 {
+    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
+    INSERT INTO t5 VALUES('one two three/four.five.six');
+    SELECT * FROM t5aux;
+  } {
+    four.five.six * 1 1 four.five.six 0 1 1
+    {one two three} * 1 1 {one two three} 0 1 1
+  }
+
+  do_execsql_test 9.$tn.3 {
+    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
+    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
+    SELECT * FROM t6aux;
+  } {
+    {alpha=beta"gamma} * 1 1 {alpha=beta"gamma} 0 1 1
+    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
+  }
+
+  do_execsql_test 9.$tn.4 {
+    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
+    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
+    SELECT * FROM t7aux;
+  } {
+    aleph * 1 1 aleph 0 1 1
+    beth * 1 1 beth 0 1 1
+    gimel * 1 1 gimel 0 1 1
+  }
+}
+
+# Check that multiple options are handled correctly.
+#
+do_execsql_test 10.1 {
+  DROP TABLE IF EXISTS t1;
+  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
+    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
+    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
+  );
+
+  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
+  INSERT INTO t1 VALUES('a.single=word');
+  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
+  SELECT * FROM t1aux;
+} {
+  .single=word * 1 1 .single=word 0 1 1
+  four * 1 1 four 0 1 1
+  one * 1 1 one 0 1 1
+  three * 1 1 three 0 1 1
+  two * 1 1 two 0 1 1
+}
+
+# Test that case folding happens after tokenization, not before.
+#
+do_execsql_test 10.2 {
+  DROP TABLE IF EXISTS t2;
+  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
+  INSERT INTO t2 VALUES('oneatwoBthree');
+  INSERT INTO t2 VALUES('onebtwoAthree');
+  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
+  SELECT * FROM t2aux;
+} {
+  one * 1 1 one 0 1 1
+  onebtwoathree * 1 1 onebtwoathree 0 1 1
+  three * 1 1 three 0 1 1
+  two * 1 1 two 0 1 1
 }

+# Test that the tokenchars and separators options work with the
+# fts3tokenize table.
+#
+do_execsql_test 11.1 {
+  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
+    "unicode61", "tokenchars=@.", "separators=1234567890"
+  );
+  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
+} {
+  berlin@street sydney.road
+}

 finish_test
-- 
cgit v1.2.3
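
For readers who want to try the unicode61 tokenizer options that the tests added by this patch exercise (remove_diacritics, tokenchars, separators), here is a minimal Tcl sketch. It is not part of the patch: it assumes the SQLite Tcl bindings (package require sqlite3) and an SQLite build with FTS4 and the unicode61 tokenizer enabled, and the table name "demo" is made up for illustration.

    # Minimal sketch (not from the patch): combining the unicode61 options
    # covered by the new tests on one FTS4 table. Assumes the SQLite Tcl
    # bindings and an FTS4/unicode61-enabled build; "demo" is illustrative.
    package require sqlite3
    sqlite3 db :memory:

    db eval {
      CREATE VIRTUAL TABLE demo USING fts4(tokenize=unicode61
        "remove_diacritics=1" "tokenchars=." "separators=x"
      );
      -- "one.two" stays a single token (tokenchars=.), "x" splits words
      -- (separators=x), and "Äpfel" is indexed as "apfel" because diacritics
      -- are removed after case folding.
      INSERT INTO demo VALUES('one.two xthreex Äpfel');
    }

    puts [db eval {SELECT rowid FROM demo WHERE demo MATCH 'apfel'}]   ;# -> 1
    puts [db eval {SELECT rowid FROM demo WHERE demo MATCH 'three'}]   ;# -> 1
    puts [db eval {SELECT rowid FROM demo WHERE demo MATCH 'one.two'}] ;# -> 1
    db close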