common/stringutils.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include "stringutils.h"
00024 
00025 namespace Xapian {
00026 namespace Internal {
00027 
00028 // FIXME: These tables assume ASCII or an ASCII compatible character set
00029 // such as ISO-8859-N or UTF-8.  EBCDIC would need some work (patches
00030 // welcome!).  For now, we use a compile time check: if '\x20' isn't a
00031 // space then the array dimension will be negative and compilation fails.
00032 
00033 // FIXME: look at using boost's static_assert for cleaner compile time
00034 // asserts...
00035 
00036 const unsigned char is_tab[('\x20' == ' ') ? 256 : -1] = {
00037  /* \x00     */ 0,
00038  /* \x01     */ 0,
00039  /* \x02     */ 0,
00040  /* \x03     */ 0,
00041  /* \x04     */ 0,
00042  /* \x05     */ 0,
00043  /* \x06     */ 0,
00044  /* \x07     */ 0,
00045  /* \x08     */ 0,
00046  /* \x09     */ IS_SPACE,
00047  /* \x0a     */ IS_SPACE,
00048  /* \x0b     */ 0,
00049  /* \x0c     */ IS_SPACE,
00050  /* \x0d     */ IS_SPACE,
00051  /* \x0e     */ 0,
00052  /* \x0f     */ 0,
00053  /* \x10     */ 0,
00054  /* \x11     */ 0,
00055  /* \x12     */ 0,
00056  /* \x13     */ 0,
00057  /* \x14     */ 0,
00058  /* \x15     */ 0,
00059  /* \x16     */ 0,
00060  /* \x17     */ 0,
00061  /* \x18     */ 0,
00062  /* \x19     */ 0,
00063  /* \x1a     */ 0,
00064  /* \x1b     */ 0,
00065  /* \x1c     */ 0,
00066  /* \x1d     */ 0,
00067  /* \x1e     */ 0,
00068  /* \x1f     */ 0,
00069  /* \x20 ( ) */ IS_SPACE,
00070  /* \x21 (!) */ 0,
00071  /* \x22 (") */ 0,
00072  /* \x23 (#) */ 0,
00073  /* \x24 ($) */ 0,
00074  /* \x25 (%) */ 0,
00075  /* \x26 (&) */ 0,
00076  /* \x27 (') */ 0,
00077  /* \x28 (() */ 0,
00078  /* \x29 ()) */ 0,
00079  /* \x2a (*) */ 0,
00080  /* \x2b (+) */ IS_SIGN,
00081  /* \x2c (,) */ 0,
00082  /* \x2d (-) */ IS_SIGN,
00083  /* \x2e (.) */ 0,
00084  /* \x2f (/) */ 0,
00085  /* \x30 (0) */ IS_DIGIT|IS_HEX,
00086  /* \x31 (1) */ IS_DIGIT|IS_HEX,
00087  /* \x32 (2) */ IS_DIGIT|IS_HEX,
00088  /* \x33 (3) */ IS_DIGIT|IS_HEX,
00089  /* \x34 (4) */ IS_DIGIT|IS_HEX,
00090  /* \x35 (5) */ IS_DIGIT|IS_HEX,
00091  /* \x36 (6) */ IS_DIGIT|IS_HEX,
00092  /* \x37 (7) */ IS_DIGIT|IS_HEX,
00093  /* \x38 (8) */ IS_DIGIT|IS_HEX,
00094  /* \x39 (9) */ IS_DIGIT|IS_HEX,
00095  /* \x3a (:) */ 0,
00096  /* \x3b (;) */ 0,
00097  /* \x3c (<) */ 0,
00098  /* \x3d (=) */ 0,
00099  /* \x3e (>) */ 0,
00100  /* \x3f (?) */ 0,
00101  /* \x40 (@) */ 0,
00102  /* \x41 (A) */ IS_UPPER|IS_HEX,
00103  /* \x42 (B) */ IS_UPPER|IS_HEX,
00104  /* \x43 (C) */ IS_UPPER|IS_HEX,
00105  /* \x44 (D) */ IS_UPPER|IS_HEX,
00106  /* \x45 (E) */ IS_UPPER|IS_HEX,
00107  /* \x46 (F) */ IS_UPPER|IS_HEX,
00108  /* \x47 (G) */ IS_UPPER,
00109  /* \x48 (H) */ IS_UPPER,
00110  /* \x49 (I) */ IS_UPPER,
00111  /* \x4a (J) */ IS_UPPER,
00112  /* \x4b (K) */ IS_UPPER,
00113  /* \x4c (L) */ IS_UPPER,
00114  /* \x4d (M) */ IS_UPPER,
00115  /* \x4e (N) */ IS_UPPER,
00116  /* \x4f (O) */ IS_UPPER,
00117  /* \x50 (P) */ IS_UPPER,
00118  /* \x51 (Q) */ IS_UPPER,
00119  /* \x52 (R) */ IS_UPPER,
00120  /* \x53 (S) */ IS_UPPER,
00121  /* \x54 (T) */ IS_UPPER,
00122  /* \x55 (U) */ IS_UPPER,
00123  /* \x56 (V) */ IS_UPPER,
00124  /* \x57 (W) */ IS_UPPER,
00125  /* \x58 (X) */ IS_UPPER,
00126  /* \x59 (Y) */ IS_UPPER,
00127  /* \x5a (Z) */ IS_UPPER,
00128  /* \x5b ([) */ 0,
00129  /* \x5c (\) */ 0,
00130  /* \x5d (]) */ 0,
00131  /* \x5e (^) */ 0,
00132  /* \x5f (_) */ 0,
00133  /* \x60 (`) */ 0,
00134  /* \x61 (a) */ IS_LOWER|IS_HEX,
00135  /* \x62 (b) */ IS_LOWER|IS_HEX,
00136  /* \x63 (c) */ IS_LOWER|IS_HEX,
00137  /* \x64 (d) */ IS_LOWER|IS_HEX,
00138  /* \x65 (e) */ IS_LOWER|IS_HEX,
00139  /* \x66 (f) */ IS_LOWER|IS_HEX,
00140  /* \x67 (g) */ IS_LOWER,
00141  /* \x68 (h) */ IS_LOWER,
00142  /* \x69 (i) */ IS_LOWER,
00143  /* \x6a (j) */ IS_LOWER,
00144  /* \x6b (k) */ IS_LOWER,
00145  /* \x6c (l) */ IS_LOWER,
00146  /* \x6d (m) */ IS_LOWER,
00147  /* \x6e (n) */ IS_LOWER,
00148  /* \x6f (o) */ IS_LOWER,
00149  /* \x70 (p) */ IS_LOWER,
00150  /* \x71 (q) */ IS_LOWER,
00151  /* \x72 (r) */ IS_LOWER,
00152  /* \x73 (s) */ IS_LOWER,
00153  /* \x74 (t) */ IS_LOWER,
00154  /* \x75 (u) */ IS_LOWER,
00155  /* \x76 (v) */ IS_LOWER,
00156  /* \x77 (w) */ IS_LOWER,
00157  /* \x78 (x) */ IS_LOWER,
00158  /* \x79 (y) */ IS_LOWER,
00159  /* \x7a (z) */ IS_LOWER,
00160  /* \x7b ({) */ 0,
00161  /* \x7c (|) */ 0,
00162  /* \x7d (}) */ 0,
00163  /* \x7e (~) */ 0,
00164  /* \x7f     */ 0,
00165  /* \x80     */ 0,
00166  /* \x81     */ 0,
00167  /* \x82     */ 0,
00168  /* \x83     */ 0,
00169  /* \x84     */ 0,
00170  /* \x85     */ 0,
00171  /* \x86     */ 0,
00172  /* \x87     */ 0,
00173  /* \x88     */ 0,
00174  /* \x89     */ 0,
00175  /* \x8a     */ 0,
00176  /* \x8b     */ 0,
00177  /* \x8c     */ 0,
00178  /* \x8d     */ 0,
00179  /* \x8e     */ 0,
00180  /* \x8f     */ 0,
00181  /* \x90     */ 0,
00182  /* \x91     */ 0,
00183  /* \x92     */ 0,
00184  /* \x93     */ 0,
00185  /* \x94     */ 0,
00186  /* \x95     */ 0,
00187  /* \x96     */ 0,
00188  /* \x97     */ 0,
00189  /* \x98     */ 0,
00190  /* \x99     */ 0,
00191  /* \x9a     */ 0,
00192  /* \x9b     */ 0,
00193  /* \x9c     */ 0,
00194  /* \x9d     */ 0,
00195  /* \x9e     */ 0,
00196  /* \x9f     */ 0,
00197  /* \xa0     */ 0,
00198  /* \xa1 (¡) */ 0,
00199  /* \xa2 (¢) */ 0,
00200  /* \xa3 (£) */ 0,
00201  /* \xa4 (¤) */ 0,
00202  /* \xa5 (¥) */ 0,
00203  /* \xa6 (¦) */ 0,
00204  /* \xa7 (§) */ 0,
00205  /* \xa8 (¨) */ 0,
00206  /* \xa9 (©) */ 0,
00207  /* \xaa (ª) */ 0,
00208  /* \xab («) */ 0,
00209  /* \xac (¬) */ 0,
00210  /* \xad (­) */ 0,
00211  /* \xae (®) */ 0,
00212  /* \xaf (¯) */ 0,
00213  /* \xb0 (°) */ 0,
00214  /* \xb1 (±) */ 0,
00215  /* \xb2 (²) */ 0,
00216  /* \xb3 (³) */ 0,
00217  /* \xb4 (´) */ 0,
00218  /* \xb5 (µ) */ 0,
00219  /* \xb6 (¶) */ 0,
00220  /* \xb7 (·) */ 0,
00221  /* \xb8 (¸) */ 0,
00222  /* \xb9 (¹) */ 0,
00223  /* \xba (º) */ 0,
00224  /* \xbb (») */ 0,
00225  /* \xbc (¼) */ 0,
00226  /* \xbd (½) */ 0,
00227  /* \xbe (¾) */ 0,
00228  /* \xbf (¿) */ 0,
00229  /* \xc0 (À) */ 0,
00230  /* \xc1 (Á) */ 0,
00231  /* \xc2 (Â) */ 0,
00232  /* \xc3 (Ã) */ 0,
00233  /* \xc4 (Ä) */ 0,
00234  /* \xc5 (Å) */ 0,
00235  /* \xc6 (Æ) */ 0,
00236  /* \xc7 (Ç) */ 0,
00237  /* \xc8 (È) */ 0,
00238  /* \xc9 (É) */ 0,
00239  /* \xca (Ê) */ 0,
00240  /* \xcb (Ë) */ 0,
00241  /* \xcc (Ì) */ 0,
00242  /* \xcd (Í) */ 0,
00243  /* \xce (Î) */ 0,
00244  /* \xcf (Ï) */ 0,
00245  /* \xd0 (Ð) */ 0,
00246  /* \xd1 (Ñ) */ 0,
00247  /* \xd2 (Ò) */ 0,
00248  /* \xd3 (Ó) */ 0,
00249  /* \xd4 (Ô) */ 0,
00250  /* \xd5 (Õ) */ 0,
00251  /* \xd6 (Ö) */ 0,
00252  /* \xd7 (×) */ 0,
00253  /* \xd8 (Ø) */ 0,
00254  /* \xd9 (Ù) */ 0,
00255  /* \xda (Ú) */ 0,
00256  /* \xdb (Û) */ 0,
00257  /* \xdc (Ü) */ 0,
00258  /* \xdd (Ý) */ 0,
00259  /* \xde (Þ) */ 0,
00260  /* \xdf (ß) */ 0,
00261  /* \xe0 (à) */ 0,
00262  /* \xe1 (á) */ 0,
00263  /* \xe2 (â) */ 0,
00264  /* \xe3 (ã) */ 0,
00265  /* \xe4 (ä) */ 0,
00266  /* \xe5 (å) */ 0,
00267  /* \xe6 (æ) */ 0,
00268  /* \xe7 (ç) */ 0,
00269  /* \xe8 (è) */ 0,
00270  /* \xe9 (é) */ 0,
00271  /* \xea (ê) */ 0,
00272  /* \xeb (ë) */ 0,
00273  /* \xec (ì) */ 0,
00274  /* \xed (í) */ 0,
00275  /* \xee (î) */ 0,
00276  /* \xef (ï) */ 0,
00277  /* \xf0 (ð) */ 0,
00278  /* \xf1 (ñ) */ 0,
00279  /* \xf2 (ò) */ 0,
00280  /* \xf3 (ó) */ 0,
00281  /* \xf4 (ô) */ 0,
00282  /* \xf5 (õ) */ 0,
00283  /* \xf6 (ö) */ 0,
00284  /* \xf7 (÷) */ 0,
00285  /* \xf8 (ø) */ 0,
00286  /* \xf9 (ù) */ 0,
00287  /* \xfa (ú) */ 0,
00288  /* \xfb (û) */ 0,
00289  /* \xfc (ü) */ 0,
00290  /* \xfd (ý) */ 0,
00291  /* \xfe (þ) */ 0,
00292  /* \xff (ÿ) */ 0,
00293 };
00294 
// Lower case folding table, indexed by unsigned character value: 'A'-'Z'
// map to 'a'-'z' and every other value maps to itself.
//
// The table is initialised from a string literal.  C allows a char array
// to be exactly as long as the string's visible contents (dropping the
// trailing '\0'), but C++ doesn't, so the array has 257 elements with the
// final one holding the terminating '\0'.
const unsigned char lo_tab[257] =
    /* \x00-\x0f */ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    /* \x10-\x1f */ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
    /* \x20-\x2f */ " !\"#$%&'()*+,-./"
    /* \x30-\x3f */ "0123456789:;<=>?"
    /* \x40-\x4f */ "@abcdefghijklmno"
    /* \x50-\x5f */ "pqrstuvwxyz[\\]^_"
    /* \x60-\x6f */ "`abcdefghijklmno"
    /* \x70-\x7f */ "pqrstuvwxyz{|}~\x7f"
    /* \x80-\x8f */ "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
    /* \x90-\x9f */ "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
    /* \xa0-\xaf */ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
    /* \xb0-\xbf */ "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
    /* \xc0-\xcf */ "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
    /* \xd0-\xdf */ "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
    /* \xe0-\xef */ "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
    /* \xf0-\xff */ "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
00313 
// Upper case folding table, indexed by unsigned character value: 'a'-'z'
// map to 'A'-'Z' and every other value maps to itself.  The array has 257
// elements because it is initialised from a 256 character string literal
// and C++ requires room for the literal's trailing '\0'.
const unsigned char up_tab[257] =
    /* \x00-\x0f */ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
    /* \x10-\x1f */ "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
    /* \x20-\x2f */ " !\"#$%&'()*+,-./"
    /* \x30-\x3f */ "0123456789:;<=>?"
    /* \x40-\x4f */ "@ABCDEFGHIJKLMNO"
    /* \x50-\x5f */ "PQRSTUVWXYZ[\\]^_"
    /* \x60-\x6f */ "`ABCDEFGHIJKLMNO"
    /* \x70-\x7f */ "PQRSTUVWXYZ{|}~\x7f"
    /* \x80-\x8f */ "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
    /* \x90-\x9f */ "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
    /* \xa0-\xaf */ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
    /* \xb0-\xbf */ "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
    /* \xc0-\xcf */ "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
    /* \xd0-\xdf */ "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
    /* \xe0-\xef */ "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
    /* \xf0-\xff */ "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
00328 
00329 }
00330 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.