Main Page | Directories | File List

encode.c

00001 /*
00002 ** 2002 April 25
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** This file contains helper routines used to translate binary data into
00013 ** a null-terminated string (suitable for use in SQLite) and back again.
00014 ** These are convenience routines for use by people who want to store binary
00015 ** data in an SQLite database.  The code in this file is not used by any other
00016 ** part of the SQLite library.
00017 **
00018 ** $Id: encode.c,v 1.12 2004/03/17 18:44:46 drh Exp $
00019 */
00020 #include <string.h>
00021 #include <assert.h>
00022 
00023 /*
00024 ** How This Encoder Works
00025 **
00026 ** The output is allowed to contain any character except 0x27 (') and
00027 ** 0x00.  This is accomplished by using an escape character to encode
00028 ** 0x27 and 0x00 as a two-byte sequence.  The escape character is always
00029 ** 0x01.  An 0x00 is encoded as the two byte sequence 0x01 0x01.  The
00030 ** 0x27 character is encoded as the two byte sequence 0x01 0x28.  Finally,
00031 ** the escape character itself is encoded as the two-character sequence
00032 ** 0x01 0x02.
00033 **
00034 ** To summarize, the encoder works by using escape sequences as follows:
00035 **
00036 **       0x00  ->  0x01 0x01
00037 **       0x01  ->  0x01 0x02
00038 **       0x27  ->  0x01 0x28
00039 **
00040 ** If that were all the encoder did, it would work, but in certain cases
00041 ** it could double the size of the encoded string.  For example, to
00042 ** encode a string of 100 0x27 characters would require 100 instances of
00043 ** the 0x01 0x28 escape sequence resulting in a 200-character output.
00044 ** We would prefer to keep the size of the encoded string smaller than
00045 ** this.
00046 **
00047 ** To minimize the encoding size, we first subtract a fixed offset value
00048 ** from each byte in the sequence.  The subtraction is modulo 256.  (That
00049 ** is to say, if the original character value is less than the offset, 256
00050 ** is added so the result stays within 0x00..0xff.)  The offset is chosen to minimize
00051 ** the number of characters in the string that need to be escaped.  For
00052 ** example, in the case above where the string was composed of 100 0x27
00053 ** characters, the offset might be 0x01.  Each of the 0x27 characters would
00054 ** then be converted into an 0x26 character which would not need to be
00055 ** escaped at all and so the 100 character input string would be converted
00056 ** into just 100 characters of output.  Actually 101 characters of output -
00057 ** we have to record the offset used as the first byte in the sequence so
00058 ** that the string can be decoded.  Since the offset value is stored as
00059 ** part of the output string and the output string is not allowed to contain
00060 ** characters 0x00 or 0x27, the offset cannot be 0x00 or 0x27.
00061 **
00062 ** Here, then, are the encoding steps:
00063 **
00064 **     (1)   Choose an offset value and make it the first character of
00065 **           output.
00066 **
00067 **     (2)   Copy each input character into the output buffer, one by
00068 **           one, subtracting the offset value (modulo 256) as you copy.
00069 **
00070 **     (3)   If the value of an input character minus offset is 0x00, replace
00071 **           that one character by the two-character sequence 0x01 0x01.
00072 **           If the difference is 0x01, replace it with 0x01 0x02.  If the
00073 **           difference is 0x27, replace it with 0x01 0x28.
00074 **
00075 **     (4)   Put a 0x00 terminator at the end of the output.
00076 **
00077 ** Decoding is obvious:
00078 **
00079 **     (5)   Copy encoded characters except the first into the decode
00080 **           buffer.  Set the first encoded character aside for use as
00081 **           the offset in step 7 below.
00082 **
00083 **     (6)   Convert each 0x01 0x01 sequence into a single character 0x00.
00084 **           Convert 0x01 0x02 into 0x01.  Convert 0x01 0x28 into 0x27.
00085 **
00086 **     (7)   Add the offset value that was the first character of the
00087 **           encoded buffer to all characters in the output buffer (modulo 256).
00088 **
00089 ** The only tricky part is step (1) - how to compute an offset value to
00090 ** minimize the size of the output buffer.  This is accomplished by testing
00091 ** all offset values and picking the one that results in the fewest number
00092 ** of escapes.  To do that, we first scan the entire input and count the
00093 ** number of occurrences of each character value in the input.  Suppose
00094 ** the number of 0x00 characters is N(0), the number of occurrences of 0x01
00095 ** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
00096 ** An offset of 0 is not allowed so we don't have to test it.  The number
00097 ** of escapes required for an offset of 1 is N(1)+N(2)+N(40).  The number
00098 ** of escapes required for an offset of 2 is N(2)+N(3)+N(41).  And so forth.
00099 ** In this way we find the offset that gives the minimum number of escapes,
00100 ** and thus minimizes the length of the output string.
00101 */
00102 
/*
** Encode a binary buffer "in" of size n bytes so that it contains
** no instances of characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The first output byte is the offset "e".  Every input byte has e
** subtracted from it (modulo 256); a result of 0x00, 0x01 or 0x27 is
** then written as the escape byte 0x01 followed by the result plus one.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 +(257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** If n<=0 the encoding is the one-character string "x".
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j, e, m;
  unsigned char x;
  int cnt[256];
  if( n<=0 ){
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Histogram of the input byte values. */
  memset(cnt, 0, sizeof(cnt));
  for(i=0; i<n; i++){ cnt[in[i]]++; }

  /* Search for the offset e that needs the fewest escapes, m.  With offset
  ** i the bytes that must be escaped are those equal to i, i+1 and i+0x27
  ** (modulo 256).  Start from offset 1 so that e always holds a legal
  ** offset, then keep the first offset that strictly improves on the best
  ** seen so far.  Offsets 0x00 and 0x27 are never legal because the offset
  ** itself is stored in the output.
  */
  e = 1;
  m = cnt[1] + cnt[2] + cnt[1+'\''];
  for(i=2; i<256 && m>0; i++){
    int sum;
    if( i=='\'' ) continue;
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
    }
  }

  /* Size-only query: offset byte + n data bytes + m escape bytes. */
  if( out==0 ){
    return n+m+1;
  }

  out[0] = e;
  j = 1;
  for(i=0; i<n; i++){
    x = in[i] - e;
    if( x==0 || x==1 || x=='\''){
      /* Emit the escape byte, then the value bumped by one. */
      out[j++] = 1;
      x++;
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}
00164 
/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output is always at least one byte shorter than the input string
** because the leading offset byte is not copied.  The number of bytes
** of output is returned.
**
** If the input is not a well-formed encoding, -1 is returned.  Two
** malformed cases are detected: an empty string (no offset byte) and
** an escape byte 0x01 that is immediately followed by the terminator.
** In both cases reading stops at the terminator.
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i, e;
  unsigned char c;

  /* The first byte is the offset that was subtracted while encoding. */
  e = *(in++);
  if( e==0 ){
    /* Empty string: there is nothing after the terminator to decode. */
    return -1;
  }
  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      /* Escape byte: the following byte holds the real value plus one. */
      c = *(in++);
      if( c==0 ){
        /* Truncated escape; do not walk past the end of the string. */
        return -1;
      }
      c--;
    }
    /* Undo the offset; the store truncates the sum modulo 256. */
    out[i++] = c + e;
  }
  return i;
}
00188 
#ifdef ENCODER_TEST
#include <stdio.h>
#include <stdlib.h>
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration encodes a buffer of random length, checks the returned
** length against strlen() and against the size-only (out==NULL) call,
** checks the size bound and the absence of '\'' in the output, then
** decodes in place and compares with the original input.  Every 100th
** iteration uses a 0,1,2,...,255,0,... ramp instead of random bytes.
** The program exits with status 1 on the first failure.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;
  nByteIn = nByteOut = 0;
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      int k;
      for(j=k=0; j<n; j++){
        in[j] = k;
        k = (k+1)&0xff;
      }
    }else{
      for(j=0; j<n; j++) in[j] = rand() & 0xff;
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    /* Measure once, as an int, so comparisons and %d formats are well-typed */
    nLen = (int)strlen((const char*)out);
    if( nOut!=nLen ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    m = (256*n + 1262)/253;
    printf("size %d->%d (max %d)", n, nLen+1, m);
    if( nLen+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished.  Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
    (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */

Generated on Sun Dec 25 12:29:51 2005 for sqlite 2.8.17 by  doxygen 1.4.2