/*
** 2002 April 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains helper routines used to translate binary data into
** a null-terminated string (suitable for use in SQLite) and back again.
** These are convenience routines for use by people who want to store binary
** data in an SQLite database.  The code in this file is not used by any other
** part of the SQLite library.
**
** $Id: encode.c,v 1.12 2004/03/17 18:44:46 drh Exp $
*/
#include <string.h>
#include <assert.h>

/*
** How This Encoder Works
**
** The output is allowed to contain any character except 0x27 (') and
** 0x00.  This is accomplished by using an escape character to encode
** 0x27 and 0x00 as a two-byte sequence.  The escape character is always
** 0x01.  An 0x00 is encoded as the two byte sequence 0x01 0x01.  The
** 0x27 character is encoded as the two byte sequence 0x01 0x28.  Finally,
** the escape character itself is encoded as the two-character sequence
** 0x01 0x02.
**
** To summarize, the encoder works by using escape sequences as follows:
**
**       0x00  ->  0x01 0x01
**       0x01  ->  0x01 0x02
**       0x27  ->  0x01 0x28
**
** (The second byte of an escape sequence is always the escaped value
** plus one.)
**
** If that were all the encoder did, it would work, but in certain cases
** it could double the size of the encoded string.  For example, to
** encode a string of 100 0x27 characters would require 100 instances of
** the 0x01 0x28 escape sequence resulting in a 200-character output.
** We would prefer to keep the size of the encoded string smaller than
** this.
**
** To minimize the encoding size, we first subtract a fixed offset value
** from each byte in the sequence.  The subtraction is modulo 256.  (That
** is to say, if the offset is larger than the original character value,
** the result wraps around and only the low-order 8 bits are kept.)  The
** offset is chosen to minimize the number of characters in the string
** that need to be escaped.  For example, in the case above where the
** string was composed of 100 0x27 characters, the offset might be 0x01.
** Each of the 0x27 characters would then be converted into an 0x26
** character which would not need to be escaped at all and so the 100
** character input string would be converted into just 100 characters of
** output.  Actually 101 characters of output - we have to record the
** offset used as the first byte in the sequence so that the string can
** be decoded.  Since the offset value is stored as part of the output
** string and the output string is not allowed to contain characters 0x00
** or 0x27, the offset cannot be 0x00 or 0x27.
**
** Here, then, are the encoding steps:
**
**     (1)   Choose an offset value and make it the first character of
**           output.
**
**     (2)   Copy each input character into the output buffer, one by
**           one, subtracting the offset value as you copy.
**
**     (3)   If the value of an input character minus offset is 0x00,
**           replace that one character by the two-character sequence
**           0x01 0x01.  If the difference is 0x01, replace it with
**           0x01 0x02.  If the difference is 0x27, replace it with
**           0x01 0x28.
**
**     (4)   Put a 0x00 terminator at the end of the output.
**
** Decoding is obvious:
**
**     (5)   Copy encoded characters except the first into the decode
**           buffer.  Set the first encoded character aside for use as
**           the offset in step 7 below.
**
**     (6)   Convert each 0x01 0x01 sequence into a single character 0x00.
**           Convert 0x01 0x02 into 0x01.  Convert 0x01 0x28 into 0x27.
**
**     (7)   Add the offset value that was the first character of
**           the encoded buffer to all characters in the output buffer
**           (modulo 256).
**
** The only tricky part is step (1) - how to compute an offset value to
** minimize the size of the output buffer.  This is accomplished by testing
** all offset values and picking the one that results in the fewest number
** of escapes.  To do that, we first scan the entire input and count the
** number of occurrences of each character value in the input.  Suppose
** the number of 0x00 characters is N(0), the number of occurrences of 0x01
** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
** Offsets of 0x00 and 0x27 are not allowed so we don't have to test them.
** The number of escapes required for an offset of 1 is N(1)+N(2)+N(40).
** The number of escapes required for an offset of 2 is N(2)+N(3)+N(41).
** And so forth.  In this way we find the offset that gives the minimum
** number of escapes, and thus minimizes the length of the output string.
*/

/*
** Encode a binary buffer "in" of size n bytes so that it contains
** no instances of characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 + (257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** If n<=0 the encoding is the one-character string "x" (an offset
** byte followed by no payload) and 1 is returned.
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j, e, m;
  unsigned char x;
  int cnt[256];
  if( n<=0 ){
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Histogram of the input byte values. */
  memset(cnt, 0, sizeof(cnt));
  for(i=n-1; i>=0; i--){ cnt[in[i]]++; }

  /* Search for the offset e that needs the fewest escapes m.  A byte
  ** needs an escape when (byte - e) is 0x00, 0x01 or 0x27, that is when
  ** the byte equals e, e+1 or e+0x27 (modulo 256).  Some candidate always
  ** needs fewer than n escapes, so the initial value of e is never
  ** actually used; it is only there so that e is provably initialized.
  */
  m = n;
  e = 1;
  for(i=1; i<256; i++){
    int sum;
    if( i=='\'' ) continue;   /* 0x27 may not appear in the output */
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
      if( m==0 ) break;       /* Cannot do better than zero escapes */
    }
  }
  if( out==0 ){
    return n+m+1;
  }

  out[0] = (unsigned char)e;
  j = 1;
  for(i=0; i<n; i++){
    x = (unsigned char)(in[i] - e);
    if( x==0 || x==1 || x=='\''){
      /* Emit the escape byte; the escaped value is stored plus one. */
      out[j++] = 1;
      x++;
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}

/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output is always at least one byte shorter than the input string
** (the leading offset byte is dropped), so a buffer as large as the
** input is always big enough.  The number of bytes of output is returned.
**
** -1 is returned if the input is an empty string (there is no offset
** byte) or if it ends in the middle of an escape sequence.  In those
** cases the contents of "out" are unspecified; some bytes may already
** have been written.  An escape byte followed by a value that the
** encoder never generates is not diagnosed; it is decoded as if it
** were a valid escape.
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i, e;
  unsigned char c;
  e = *(in++);
  if( e==0 ){
    /* Empty input: do not read past the terminator. */
    return -1;
  }
  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      c = *(in++);
      if( c==0 ){
        /* Escape byte immediately followed by the terminator. */
        return -1;
      }
      c--;
    }
    out[i++] = (unsigned char)(c + e);
  }
  return i;
}

#ifdef ENCODER_TEST
#include <stdio.h>
#include <stdlib.h>
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration encodes a buffer of pseudo-random length, checks the
** reported length, the predicted length, the documented size bound and
** the absence of (') in the output, then decodes in place and compares
** against the original input.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nLen, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;
  nByteIn = nByteOut = 0;
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      /* Every 100th test uses a cyclic 0x00..0xff pattern. */
      int k;
      for(j=k=0; j<n; j++){
        in[j] = (unsigned char)k;
        k = (k+1)&0xff;
      }
    }else{
      for(j=0; j<n; j++) in[j] = (unsigned char)(rand() & 0xff);
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    nLen = (int)strlen((const char*)out);
    if( nOut!=nLen ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nLen);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    /* Buffer size documented on sqlite_encode_binary(). */
    m = 2 + (257*n)/254;
    printf("size %d->%d (max %d)", n, nLen+1, m);
    if( nLen+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished.  Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
    (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */