summaryrefslogtreecommitdiff
path: root/ext/fts2/fts2_tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/fts2/fts2_tokenizer.c')
-rw-r--r--ext/fts2/fts2_tokenizer.c371
1 files changed, 371 insertions, 0 deletions
diff --git a/ext/fts2/fts2_tokenizer.c b/ext/fts2/fts2_tokenizer.c
new file mode 100644
index 0000000..f8b0663
--- /dev/null
+++ b/ext/fts2/fts2_tokenizer.c
@@ -0,0 +1,371 @@
+/*
+** 2007 June 22
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+** This is part of an SQLite module implementing full-text search.
+** This particular file implements the generic tokenizer interface.
+*/
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS2 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS2 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
+
+
+#include "sqlite3.h"
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT1
+
+#include "fts2_hash.h"
+#include "fts2_tokenizer.h"
+#include <assert.h>
+
+/*
+** Implementation of the SQL scalar function for accessing the underlying
+** hash table. This function may be called as follows:
+**
+** SELECT <function-name>(<key-name>);
+** SELECT <function-name>(<key-name>, <pointer>);
+**
+** where <function-name> is the name passed as the second argument
+** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
+**
+** If the <pointer> argument is specified, it must be a blob value
+** containing a pointer to be stored as the hash data corresponding
+** to the string <key-name>. If <pointer> is not specified, then
+** the string <key-name> must already exist in the has table. Otherwise,
+** an error is returned.
+**
+** Whether or not the <pointer> argument is specified, the value returned
+** is a blob containing the pointer stored as the hash data corresponding
+** to string <key-name> (after the hash-table is updated, if applicable).
+*/
+static void scalarFunc(
+ sqlite3_context *context,
+ int argc,
+ sqlite3_value **argv
+){
+ fts2Hash *pHash;
+ void *pPtr = 0;
+ const unsigned char *zName;
+ int nName;
+
+ assert( argc==1 || argc==2 );
+
+ pHash = (fts2Hash *)sqlite3_user_data(context);
+
+ zName = sqlite3_value_text(argv[0]);
+ nName = sqlite3_value_bytes(argv[0])+1;
+
+ if( argc==2 ){
+ void *pOld;
+ int n = sqlite3_value_bytes(argv[1]);
+ if( n!=sizeof(pPtr) ){
+ sqlite3_result_error(context, "argument type mismatch", -1);
+ return;
+ }
+ pPtr = *(void **)sqlite3_value_blob(argv[1]);
+ pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
+ if( pOld==pPtr ){
+ sqlite3_result_error(context, "out of memory", -1);
+ return;
+ }
+ }else{
+ pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
+ if( !pPtr ){
+ char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
+ sqlite3_result_error(context, zErr, -1);
+ sqlite3_free(zErr);
+ return;
+ }
+ }
+
+ sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
+}
+
+#ifdef SQLITE_TEST
+
+#include <tcl.h>
+#include <string.h>
+
+/*
+** Implementation of a special SQL scalar function for testing tokenizers
+** designed to be used in concert with the Tcl testing framework. This
+** function must be called with two arguments:
+**
+** SELECT <function-name>(<key-name>, <input-string>);
+** SELECT <function-name>(<key-name>, <pointer>);
+**
+** where <function-name> is the name passed as the second argument
+** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
+** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
+**
+** The return value is a string that may be interpreted as a Tcl
+** list. For each token in the <input-string>, three elements are
+** added to the returned list. The first is the token position, the
+** second is the token text (folded, stemmed, etc.) and the third is the
+** substring of <input-string> associated with the token. For example,
+** using the built-in "simple" tokenizer:
+**
+** SELECT fts_tokenizer_test('simple', 'I don't see how');
+**
+** will return the string:
+**
+** "{0 i I 1 dont don't 2 see see 3 how how}"
+**
+*/
+static void testFunc(
+ sqlite3_context *context,
+ int argc,
+ sqlite3_value **argv
+){
+ fts2Hash *pHash;
+ sqlite3_tokenizer_module *p;
+ sqlite3_tokenizer *pTokenizer = 0;
+ sqlite3_tokenizer_cursor *pCsr = 0;
+
+ const char *zErr = 0;
+
+ const char *zName;
+ int nName;
+ const char *zInput;
+ int nInput;
+
+ const char *zArg = 0;
+
+ const char *zToken;
+ int nToken;
+ int iStart;
+ int iEnd;
+ int iPos;
+
+ Tcl_Obj *pRet;
+
+ assert( argc==2 || argc==3 );
+
+ nName = sqlite3_value_bytes(argv[0]);
+ zName = (const char *)sqlite3_value_text(argv[0]);
+ nInput = sqlite3_value_bytes(argv[argc-1]);
+ zInput = (const char *)sqlite3_value_text(argv[argc-1]);
+
+ if( argc==3 ){
+ zArg = (const char *)sqlite3_value_text(argv[1]);
+ }
+
+ pHash = (fts2Hash *)sqlite3_user_data(context);
+ p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
+
+ if( !p ){
+ char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
+ sqlite3_result_error(context, zErr, -1);
+ sqlite3_free(zErr);
+ return;
+ }
+
+ pRet = Tcl_NewObj();
+ Tcl_IncrRefCount(pRet);
+
+ if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
+ zErr = "error in xCreate()";
+ goto finish;
+ }
+ pTokenizer->pModule = p;
+ if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
+ zErr = "error in xOpen()";
+ goto finish;
+ }
+ pCsr->pTokenizer = pTokenizer;
+
+ while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
+ Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
+ Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
+ zToken = &zInput[iStart];
+ nToken = iEnd-iStart;
+ Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
+ }
+
+ if( SQLITE_OK!=p->xClose(pCsr) ){
+ zErr = "error in xClose()";
+ goto finish;
+ }
+ if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
+ zErr = "error in xDestroy()";
+ goto finish;
+ }
+
+finish:
+ if( zErr ){
+ sqlite3_result_error(context, zErr, -1);
+ }else{
+ sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
+ }
+ Tcl_DecrRefCount(pRet);
+}
+
+static
+int registerTokenizer(
+ sqlite3 *db,
+ char *zName,
+ const sqlite3_tokenizer_module *p
+){
+ int rc;
+ sqlite3_stmt *pStmt;
+ const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
+
+ rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+ sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
+ sqlite3_step(pStmt);
+
+ return sqlite3_finalize(pStmt);
+}
+
+static
+int queryFts2Tokenizer(
+ sqlite3 *db,
+ char *zName,
+ const sqlite3_tokenizer_module **pp
+){
+ int rc;
+ sqlite3_stmt *pStmt;
+ const char zSql[] = "SELECT fts2_tokenizer(?)";
+
+ *pp = 0;
+ rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+ if( SQLITE_ROW==sqlite3_step(pStmt) ){
+ if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
+ memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
+ }
+ }
+
+ return sqlite3_finalize(pStmt);
+}
+
+void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
+
+/*
+** Implementation of the scalar function fts2_tokenizer_internal_test().
+** This function is used for testing only, it is not included in the
+** build unless SQLITE_TEST is defined.
+**
+** The purpose of this is to test that the fts2_tokenizer() function
+** can be used as designed by the C-code in the queryFts2Tokenizer and
+** registerTokenizer() functions above. These two functions are repeated
+** in the README.tokenizer file as an example, so it is important to
+** test them.
+**
+** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
+** function with no arguments. An assert() will fail if a problem is
+** detected. i.e.:
+**
+** SELECT fts2_tokenizer_internal_test();
+**
+*/
+static void intTestFunc(
+ sqlite3_context *context,
+ int argc,
+ sqlite3_value **argv
+){
+ int rc;
+ const sqlite3_tokenizer_module *p1;
+ const sqlite3_tokenizer_module *p2;
+ sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
+
+ /* Test the query function */
+ sqlite3Fts2SimpleTokenizerModule(&p1);
+ rc = queryFts2Tokenizer(db, "simple", &p2);
+ assert( rc==SQLITE_OK );
+ assert( p1==p2 );
+ rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
+ assert( rc==SQLITE_ERROR );
+ assert( p2==0 );
+ assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
+
+ /* Test the storage function */
+ rc = registerTokenizer(db, "nosuchtokenizer", p1);
+ assert( rc==SQLITE_OK );
+ rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
+ assert( rc==SQLITE_OK );
+ assert( p2==p1 );
+
+ sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
+}
+
+#endif
+
+/*
+** Set up SQL objects in database db used to access the contents of
+** the hash table pointed to by argument pHash. The hash table must
+** been initialised to use string keys, and to take a private copy
+** of the key when a value is inserted. i.e. by a call similar to:
+**
+** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
+**
+** This function adds a scalar function (see header comment above
+** scalarFunc() in this file for details) and, if ENABLE_TABLE is
+** defined at compilation time, a temporary virtual table (see header
+** comment above struct HashTableVtab) to the database schema. Both
+** provide read/write access to the contents of *pHash.
+**
+** The third argument to this function, zName, is used as the name
+** of both the scalar and, if created, the virtual table.
+*/
+int sqlite3Fts2InitHashTable(
+ sqlite3 *db,
+ fts2Hash *pHash,
+ const char *zName
+){
+ int rc = SQLITE_OK;
+ void *p = (void *)pHash;
+ const int any = SQLITE_ANY;
+ char *zTest = 0;
+ char *zTest2 = 0;
+
+#ifdef SQLITE_TEST
+ void *pdb = (void *)db;
+ zTest = sqlite3_mprintf("%s_test", zName);
+ zTest2 = sqlite3_mprintf("%s_internal_test", zName);
+ if( !zTest || !zTest2 ){
+ rc = SQLITE_NOMEM;
+ }
+#endif
+
+ if( rc!=SQLITE_OK
+ || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
+ || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
+#ifdef SQLITE_TEST
+ || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
+ || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
+ || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
+#endif
+ );
+
+ sqlite3_free(zTest);
+ sqlite3_free(zTest2);
+ return rc;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */