summaryrefslogtreecommitdiff
path: root/ext/misc/percentile.c
blob: 74a4c9d12b45a12477cde8c131698760d469819c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
** 2013-05-28
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains code to implement the percentile(Y,P) SQL function
** as described below:
**
**   (1)  The percentile(Y,P) function is an aggregate function taking
**        exactly two arguments.
**
**   (2)  If the P argument to percentile(Y,P) is not the same for every
**        row in the aggregate then an error is thrown.  The word "same"
**        in the previous sentence means that the value differ by less
**        than 0.001.
**
**   (3)  If the P argument to percentile(Y,P) evaluates to anything other
**        than a number in the range of 0.0 to 100.0 inclusive then an
**        error is thrown.
**
**   (4)  If any Y argument to percentile(Y,P) evaluates to a value that
**        is not NULL and is not numeric then an error is thrown.
**
**   (5)  If any Y argument to percentile(Y,P) evaluates to plus or minus
**        infinity then an error is thrown.  (SQLite always interprets NaN
**        values as NULL.)
**
**   (6)  Both Y and P in percentile(Y,P) can be arbitrary expressions,
**        including CASE WHEN expressions.
**
**   (7)  The percentile(Y,P) aggregate is able to handle inputs of at least
**        one million (1,000,000) rows.
**
**   (8)  If there are no non-NULL values for Y, then percentile(Y,P)
**        returns NULL.
**
**   (9)  If there is exactly one non-NULL value for Y, the percentile(Y,P)
**        returns the one Y value.
**
**  (10)  If there N non-NULL values of Y where N is two or more and
**        the Y values are ordered from least to greatest and a graph is
**        drawn from 0 to N-1 such that the height of the graph at J is
**        the J-th Y value and such that straight lines are drawn between
**        adjacent Y values, then the percentile(Y,P) function returns
**        the height of the graph at P*(N-1)/100.
**
**  (11)  The percentile(Y,P) function always returns either a floating
**        point number or NULL.
**
**  (12)  The percentile(Y,P) is implemented as a single C99 source-code
**        file that compiles into a shared-library or DLL that can be loaded
**        into SQLite using the sqlite3_load_extension() interface.
*/
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#include <assert.h>
#include <string.h>
#include <stdlib.h>

/* The following object is the session context for a single percentile()
** function.  We have to remember all input Y values until the very end.
** Those values are accumulated in the Percentile.a[] array.
*/
typedef struct Percentile Percentile;
struct Percentile {
  unsigned nAlloc;     /* Number of slots allocated for a[] */
  unsigned nUsed;      /* Number of slots actually used in a[] */
  double rPct;         /* 1.0 more than the value for P */
  double *a;           /* Array of Y values */
};

/*
** Return TRUE if the input floating-point number is an infinity.
*/
static int isInfinity(double r){
  sqlite3_uint64 u;
  assert( sizeof(u)==sizeof(r) );
  memcpy(&u, &r, sizeof(u));
  return ((u>>52)&0x7ff)==0x7ff;
}

/*
** Return TRUE if two doubles differ by 0.001 or less
*/
static int sameValue(double a, double b){
  a -= b;
  return a>=-0.001 && a<=0.001;
}

/*
** The "step" function for percentile(Y,P) is called once for each
** input row.
*/
static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
  Percentile *p;
  double rPct;
  int eType;
  double y;
  assert( argc==2 );

  /* Requirement 3:  P must be a number between 0 and 100 */
  eType = sqlite3_value_numeric_type(argv[1]);
  rPct = sqlite3_value_double(argv[1]);
  if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT) ||
      ((rPct = sqlite3_value_double(argv[1]))<0.0 || rPct>100.0) ){
    sqlite3_result_error(pCtx, "2nd argument to percentile() is not "
                         "a number between 0.0 and 100.0", -1);
    return;
  }

  /* Allocate the session context. */
  p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  if( p==0 ) return;

  /* Remember the P value.  Throw an error if the P value is different
  ** from any prior row, per Requirement (2). */
  if( p->rPct==0.0 ){
    p->rPct = rPct+1.0;
  }else if( !sameValue(p->rPct,rPct+1.0) ){
    sqlite3_result_error(pCtx, "2nd argument to percentile() is not the "
                               "same for all input rows", -1);
    return;
  }

  /* Ignore rows for which Y is NULL */
  eType = sqlite3_value_type(argv[0]);
  if( eType==SQLITE_NULL ) return;

  /* If not NULL, then Y must be numeric.  Otherwise throw an error.
  ** Requirement 4 */
  if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
    sqlite3_result_error(pCtx, "1st argument to percentile() is not "
                               "numeric", -1);
    return;
  }

  /* Throw an error if the Y value is infinity or NaN */
  y = sqlite3_value_double(argv[0]);
  if( isInfinity(y) ){
    sqlite3_result_error(pCtx, "Inf input to percentile()", -1);
    return;
  }

  /* Allocate and store the Y */
  if( p->nUsed>=p->nAlloc ){
    unsigned n = p->nAlloc*2 + 250;
    double *a = sqlite3_realloc(p->a, sizeof(double)*n);
    if( a==0 ){
      sqlite3_free(p->a);
      memset(p, 0, sizeof(*p));
      sqlite3_result_error_nomem(pCtx);
      return;
    }
    p->nAlloc = n;
    p->a = a;
  }
  p->a[p->nUsed++] = y;
}

/*
** Compare to doubles for sorting using qsort()
*/
static int doubleCmp(const void *pA, const void *pB){
  double a = *(double*)pA;
  double b = *(double*)pB;
  if( a==b ) return 0;
  if( a<b ) return -1;
  return +1;
}

/*
** Called to compute the final output of percentile() and to clean
** up all allocated memory.
*/
static void percentFinal(sqlite3_context *pCtx){
  Percentile *p;
  unsigned i1, i2;
  double v1, v2;
  double ix, vx;
  p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
  if( p==0 ) return;
  if( p->a==0 ) return;
  if( p->nUsed ){
    qsort(p->a, p->nUsed, sizeof(double), doubleCmp);
    ix = (p->rPct-1.0)*(p->nUsed-1)*0.01;
    i1 = (unsigned)ix;
    i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
    v1 = p->a[i1];
    v2 = p->a[i2];
    vx = v1 + (v2-v1)*(ix-i1);
    sqlite3_result_double(pCtx, vx);
  }
  sqlite3_free(p->a);
  memset(p, 0, sizeof(*p));
}


#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_percentile_init(
  sqlite3 *db, 
  char **pzErrMsg, 
  const sqlite3_api_routines *pApi
){
  int rc = SQLITE_OK;
  SQLITE_EXTENSION_INIT2(pApi);
  (void)pzErrMsg;  /* Unused parameter */
  rc = sqlite3_create_function(db, "percentile", 2, SQLITE_UTF8, 0,
                               0, percentStep, percentFinal);
  return rc;
}