From 7bb481fda9ecb134804b49c2ce77ca28f7eea583 Mon Sep 17 00:00:00 2001 From: Hans-Christoph Steiner Date: Fri, 30 Mar 2012 20:42:12 -0400 Subject: Imported Upstream version 2.0.3 --- ext/fts3/README.tokenizers | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 ext/fts3/README.tokenizers (limited to 'ext/fts3/README.tokenizers') diff --git a/ext/fts3/README.tokenizers b/ext/fts3/README.tokenizers new file mode 100644 index 0000000..e06803a --- /dev/null +++ b/ext/fts3/README.tokenizers @@ -0,0 +1,133 @@ + +1. FTS3 Tokenizers + + When creating a new full-text table, FTS3 allows the user to select + the text tokenizer implementation to be used when indexing text + by specifying a "tokenize" clause as part of the CREATE VIRTUAL TABLE + statement: + + CREATE VIRTUAL TABLE USING fts3( + [, tokenize []] + ); + + The built-in tokenizers (valid values to pass as ) are + "simple" and "porter". + + should consist of zero or more white-space separated + arguments to pass to the selected tokenizer implementation. The + interpretation of the arguments, if any, depends on the individual + tokenizer. + +2. Custom Tokenizers + + FTS3 allows users to provide custom tokenizer implementations. The + interface used to create a new tokenizer is defined and described in + the fts3_tokenizer.h source file. + + Registering a new FTS3 tokenizer is similar to registering a new + virtual table module with SQLite. The user passes a pointer to a + structure containing pointers to various callback functions that + make up the implementation of the new tokenizer type. For tokenizers, + the structure (defined in fts3_tokenizer.h) is called + "sqlite3_tokenizer_module". + + FTS3 does not expose a C-function that users call to register new + tokenizer types with a database handle. Instead, the pointer must + be encoded as an SQL blob value and passed to FTS3 through the SQL + engine by evaluating a special scalar function, "fts3_tokenizer()". + The fts3_tokenizer() function may be called with one or two arguments, + as follows: + + SELECT fts3_tokenizer(); + SELECT fts3_tokenizer(, ); + + Where is a string identifying the tokenizer and + is a pointer to an sqlite3_tokenizer_module + structure encoded as an SQL blob. If the second argument is present, + it is registered as tokenizer and a copy of it + returned. If only one argument is passed, a pointer to the tokenizer + implementation currently registered as is returned, + encoded as a blob. Or, if no such tokenizer exists, an SQL exception + (error) is raised. + + SECURITY: If the fts3 extension is used in an environment where potentially + malicious users may execute arbitrary SQL (i.e. gears), they should be + prevented from invoking the fts3_tokenizer() function, possibly using the + authorisation callback. + + See "Sample code" below for an example of calling the fts3_tokenizer() + function from C code. + +3. ICU Library Tokenizers + + If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor + symbol defined, then there exists a built-in tokenizer named "icu" + implemented using the ICU library. The first argument passed to the + xCreate() method (see fts3_tokenizer.h) of this tokenizer may be + an ICU locale identifier. For example "tr_TR" for Turkish as used + in Turkey, or "en_AU" for English as used in Australia. For example: + + "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenizer icu th_TH)" + + The ICU tokenizer implementation is very simple. It splits the input + text according to the ICU rules for finding word boundaries and discards + any tokens that consist entirely of white-space. This may be suitable + for some applications in some locales, but not all. If more complex + processing is required, for example to implement stemming or + discard punctuation, this can be done by creating a tokenizer + implementation that uses the ICU tokenizer as part of its implementation. + + When using the ICU tokenizer this way, it is safe to overwrite the + contents of the strings returned by the xNext() method (see + fts3_tokenizer.h). + +4. Sample code. + + The following two code samples illustrate the way C code should invoke + the fts3_tokenizer() scalar function: + + int registerTokenizer( + sqlite3 *db, + char *zName, + const sqlite3_tokenizer_module *p + ){ + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; + + rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); + sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); + sqlite3_step(pStmt); + + return sqlite3_finalize(pStmt); + } + + int queryTokenizer( + sqlite3 *db, + char *zName, + const sqlite3_tokenizer_module **pp + ){ + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts3_tokenizer(?)"; + + *pp = 0; + rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return rc; + } + + sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); + if( SQLITE_ROW==sqlite3_step(pStmt) ){ + if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ + memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); + } + } + + return sqlite3_finalize(pStmt); + } -- cgit v1.2.3