summaryrefslogtreecommitdiff
path: root/test/fts3_common.tcl
blob: 2ed1f70bf626bee9781bdba5393c3a004227c841 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# 2009 November 04
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# This file contains common code used the fts3 tests. At one point
# equivalent functionality was implemented in C code. But it is easier
# to use Tcl.
#

#-------------------------------------------------------------------------
# INSTRUCTIONS
#
# The following commands are available:
#
#   fts3_build_db_1 N
#     Using database handle [db] create an FTS4 table named t1 and populate
#     it with N rows of data. N must be less than 10,000. Refer to the
#     header comments above the proc implementation below for details.
#
#   fts3_build_db_2 N
#     Using database handle [db] create an FTS4 table named t2 and populate
#     it with N rows of data. N must be less than 100,000. Refer to the
#     header comments above the proc implementation below for details.
#
#   fts3_integrity_check TBL
#     TBL must be an FTS table in the database currently opened by handle
#     [db]. This proc loads and tokenizes all documents within the table,
#     then checks that the current contents of the FTS index matches the
#     results.
#
#   fts3_terms TBL WHERE
#     Todo.
#
#   fts3_doclist TBL TERM WHERE
#     Todo.
#
#
#

#-------------------------------------------------------------------------
# USAGE: fts3_build_db_1 SWITCHES N
#
# Build a sample FTS table in the database opened by database connection 
# [db]. The name of the new table is "t1".
#
proc fts3_build_db_1 {args} {

  set default(-module) fts4

  set nArg [llength $args]
  if {($nArg%2)==0} {
    error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
  }

  set n [lindex $args [expr $nArg-1]]
  array set opts [array get default]
  array set opts [lrange $args 0 [expr $nArg-2]]
  foreach k [array names opts] {
    if {0==[info exists default($k)]} { error "unknown option: $k" }
  }

  if {$n > 10000} {error "n must be <= 10000"}
  db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"

  set xwords [list zero one two three four five six seven eight nine ten]
  set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]

  for {set i 0} {$i < $n} {incr i} {
    set x ""
    set y ""

    set x [list]
    lappend x [lindex $xwords [expr ($i / 1000) % 10]]
    lappend x [lindex $xwords [expr ($i / 100)  % 10]]
    lappend x [lindex $xwords [expr ($i / 10)   % 10]]
    lappend x [lindex $xwords [expr ($i / 1)   % 10]]

    set y [list]
    lappend y [lindex $ywords [expr ($i / 1000) % 10]]
    lappend y [lindex $ywords [expr ($i / 100)  % 10]]
    lappend y [lindex $ywords [expr ($i / 10)   % 10]]
    lappend y [lindex $ywords [expr ($i / 1)   % 10]]

    db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
  }
}

#-------------------------------------------------------------------------
# USAGE: fts3_build_db_2 N ARGS
#
# Build a sample FTS table in the database opened by database connection 
# [db]. The name of the new table is "t2".
#
proc fts3_build_db_2 {args} {

  set default(-module) fts4
  set default(-extra)   ""

  set nArg [llength $args]
  if {($nArg%2)==0} {
    error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
  }

  set n [lindex $args [expr $nArg-1]]
  array set opts [array get default]
  array set opts [lrange $args 0 [expr $nArg-2]]
  foreach k [array names opts] {
    if {0==[info exists default($k)]} { error "unknown option: $k" }
  }

  if {$n > 100000} {error "n must be <= 100000"}

  set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
  if {$opts(-extra) != ""} {
    append sql ", " $opts(-extra)
  }
  append sql ")"
  db eval $sql

  set chars [list a b c d e f g h  i j k l m n o p  q r s t u v w x  y z ""]

  for {set i 0} {$i < $n} {incr i} {
    set word ""
    set nChar [llength $chars]
    append word [lindex $chars [expr {($i / 1)   % $nChar}]]
    append word [lindex $chars [expr {($i / $nChar)  % $nChar}]]
    append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]

    db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
  }
}

#-------------------------------------------------------------------------
# USAGE: fts3_integrity_check TBL
#
# This proc is used to verify that the full-text index is consistent with
# the contents of the fts3 table. In other words, it checks that the
# data in the %_contents table matches that in the %_segdir and %_segments 
# tables.
#
# This is not an efficient procedure. It uses a lot of memory and a lot
# of CPU. But it is better than not checking at all.
#
# The procedure is:
#
#   1) Read the entire full-text index from the %_segdir and %_segments
#      tables into memory. For each entry in the index, the following is
#      done:
#
#          set C($iDocid,$iCol,$iPosition) $zTerm
#
#   2) Iterate through each column of each row of the %_content table. 
#      Tokenize all documents, and check that for each token there is
#      a corresponding entry in the $C array. After checking a token,
#      [unset] the $C array entry.
#
#   3) Check that array $C is now empty.
#      
#
proc fts3_integrity_check {tbl} {

  fts3_read2 $tbl 1 A

  foreach zTerm [array names A] {
    #puts $zTerm
    foreach doclist $A($zTerm) {
      set docid 0
      while {[string length $doclist]>0} {
        set iCol 0
        set iPos 0
        set lPos [list]
        set lCol [list]

        # First varint of a doclist-entry is the docid. Delta-compressed
        # with respect to the docid of the previous entry.
        #
        incr docid [gobble_varint doclist]
        if {[info exists D($zTerm,$docid)]} {
          while {[set iDelta [gobble_varint doclist]] != 0} {}
          continue
        }
        set D($zTerm,$docid) 1

        # Gobble varints until the 0x00 that terminates the doclist-entry
        # is found.
        while {[set iDelta [gobble_varint doclist]] > 0} {
          if {$iDelta == 1} {
            set iCol [gobble_varint doclist]
            set iPos 0
          } else {
            incr iPos $iDelta
            incr iPos -2
            set C($docid,$iCol,$iPos) $zTerm
          }
        }
      }
    }
  }

  foreach key [array names C] {
    #puts "$key -> $C($key)"
  }


  db eval "SELECT * FROM ${tbl}_content" E {
    set iCol 0
    set iDoc $E(docid)
    foreach col [lrange $E(*) 1 end] {
      set c $E($col)
      set sql {SELECT fts3_tokenizer_test('simple', $c)}

      foreach {pos term dummy} [db one $sql] {
        if {![info exists C($iDoc,$iCol,$pos)]} {
          set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
          lappend errors $es
        } else {
          if {[string compare $C($iDoc,$iCol,$pos) $term]} {
            set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
            append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
            lappend errors $es
          }
          unset C($iDoc,$iCol,$pos)
        }
      }
      incr iCol
    }
  }

  foreach c [array names C] {
    lappend errors "Bad index entry: $c -> $C($c)"
  }

  if {[info exists errors]} { return [join $errors "\n"] }
  return "ok"
}

# USAGE: fts3_terms TBL WHERE
#
# Argument TBL must be the name of an FTS3 table. Argument WHERE is an
# SQL expression that will be used as the WHERE clause when scanning
# the %_segdir table. As in the following query:
#
#   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
#
# This function returns a list of all terms present in the segments
# selected by the statement above.
#
proc fts3_terms {tbl where} {
  fts3_read $tbl $where a
  return [lsort [array names a]]
}


# USAGE: fts3_doclist TBL TERM WHERE
#
# Argument TBL must be the name of an FTS3 table. TERM is a term that may
# or may not be present in the table. Argument WHERE is used to select a 
# subset of the b-tree segments in the associated full-text index as 
# described above for [fts3_terms].
#
# This function returns the results of merging the doclists associated
# with TERM in the selected segments. Each doclist is an element of the
# returned list. Each doclist is formatted as follows:
#
#   [$docid ?$col[$off1 $off2...]?...]
#
# The formatting is odd for a Tcl command in order to be compatible with
# the original C-language implementation. If argument WHERE is "1", then 
# any empty doclists are omitted from the returned list.
#
proc fts3_doclist {tbl term where} {
  fts3_read $tbl $where a


  foreach doclist $a($term) {
    set docid 0

    while {[string length $doclist]>0} {
      set iCol 0
      set iPos 0
      set lPos [list]
      set lCol [list]
      incr docid [gobble_varint doclist]
  
      while {[set iDelta [gobble_varint doclist]] > 0} {
        if {$iDelta == 1} {
          lappend lCol [list $iCol $lPos]
          set iPos 0
          set lPos [list]
          set iCol [gobble_varint doclist]
        } else {
          incr iPos $iDelta
          incr iPos -2
          lappend lPos $iPos
        }
      }
  
      if {[llength $lPos]>0} {
        lappend lCol [list $iCol $lPos]
      }
  
      if {$where != "1" || [llength $lCol]>0} {
        set ret($docid) $lCol
      } else {
        unset -nocomplain ret($docid)
      }
    }
  }

  set lDoc [list]
  foreach docid [lsort -integer [array names ret]] {
    set lCol [list]
    set cols ""
    foreach col $ret($docid) {
      foreach {iCol lPos} $col {}
      append cols " $iCol\[[join $lPos { }]\]"
    }
    lappend lDoc "\[${docid}${cols}\]"
  }

  join $lDoc " "
}

###########################################################################

proc gobble_varint {varname} {
  upvar $varname blob
  set n [read_fts3varint $blob ret]
  set blob [string range $blob $n end]
  return $ret
}
proc gobble_string {varname nLength} {
  upvar $varname blob
  set ret [string range $blob 0 [expr $nLength-1]]
  set blob [string range $blob $nLength end]
  return $ret
}

# The argument is a blob of data representing an FTS3 segment leaf. 
# Return a list consisting of alternating terms (strings) and doclists
# (blobs of data).
#
proc fts3_readleaf {blob} {
  set zPrev ""
  set terms [list]

  while {[string length $blob] > 0} {
    set nPrefix [gobble_varint blob]
    set nSuffix [gobble_varint blob]

    set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
    append zTerm [gobble_string blob $nSuffix]
    set nDoclist [gobble_varint blob]
    set doclist [gobble_string blob $nDoclist]

    lappend terms $zTerm $doclist
    set zPrev $zTerm
  }

  return $terms
}

proc fts3_read2 {tbl where varname} {
  upvar $varname a
  array unset a
  db eval " SELECT start_block, leaves_end_block, root 
            FROM ${tbl}_segdir WHERE $where
            ORDER BY level ASC, idx DESC
  " {
    set c 0
    binary scan $root c c
    if {$c==0} {
      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
    } else {
      db eval " SELECT block 
                FROM ${tbl}_segments 
                WHERE blockid>=$start_block AND blockid<=$leaves_end_block
                ORDER BY blockid
      " {
        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
      }
    }
  }
}

proc fts3_read {tbl where varname} {
  upvar $varname a
  array unset a
  db eval " SELECT start_block, leaves_end_block, root 
            FROM ${tbl}_segdir WHERE $where
            ORDER BY level DESC, idx ASC
  " {
    if {$start_block == 0} {
      foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
    } else {
      db eval " SELECT block 
                FROM ${tbl}_segments 
                WHERE blockid>=$start_block AND blockid<$leaves_end_block
                ORDER BY blockid
      " {
        foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }

      }
    }
  }
}

##########################################################################