00001 /*------------------------------------------------------------------------- 00002 * 00003 * like_match.c 00004 * LIKE pattern matching internal code. 00005 * 00006 * This file is included by like.c four times, to provide matching code for 00007 * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings, 00008 * and (4) case insensitive matches in single-byte encodings. 00009 * (UTF8 is a special case because we can use a much more efficient version 00010 * of NextChar than can be used for general multi-byte encodings.) 00011 * 00012 * Before the inclusion, we need to define the following macros: 00013 * 00014 * NextChar 00015 * MatchText - to name of function wanted 00016 * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar 00017 * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars 00018 * 00019 * Copyright (c) 1996-2013, PostgreSQL Global Development Group 00020 * 00021 * IDENTIFICATION 00022 * src/backend/utils/adt/like_match.c 00023 * 00024 *------------------------------------------------------------------------- 00025 */ 00026 00027 /* 00028 * Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986. 00029 * Rich $alz is now <[email protected]>. 00030 * Special thanks to Lars Mathiesen <[email protected]> for the LABORT code. 00031 * 00032 * This code was shamelessly stolen from the "pql" code by myself and 00033 * slightly modified :) 00034 * 00035 * All references to the word "star" were replaced by "percent" 00036 * All references to the word "wild" were replaced by "like" 00037 * 00038 * All the nice shell RE matching stuff was replaced by just "_" and "%" 00039 * 00040 * As I don't have a copy of the SQL standard handy I wasn't sure whether 00041 * to leave in the '\' escape character handling. 00042 * 00043 * Keith Parks. <[email protected]> 00044 * 00045 * SQL lets you specify the escape character by saying 00046 * LIKE <pattern> ESCAPE <escape character>. We are a small operation 00047 * so we force you to use '\'. - ay 7/95 00048 * 00049 * Now we have the like_escape() function that converts patterns with 00050 * any specified escape character (or none at all) to the internal 00051 * default escape character, which is still '\'. - tgl 9/2000 00052 * 00053 * The code is rewritten to avoid requiring null-terminated strings, 00054 * which in turn allows us to leave out some memcpy() operations. 00055 * This code should be faster and take less memory, but no promises... 00056 * - thomas 2000-08-06 00057 */ 00058 00059 00060 /*-------------------- 00061 * Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT. 00062 * 00063 * LIKE_TRUE: they match 00064 * LIKE_FALSE: they don't match 00065 * LIKE_ABORT: not only don't they match, but the text is too short. 00066 * 00067 * If LIKE_ABORT is returned, then no suffix of the text can match the 00068 * pattern either, so an upper-level % scan can stop scanning now. 00069 *-------------------- 00070 */ 00071 00072 #ifdef MATCH_LOWER 00073 #define GETCHAR(t) MATCH_LOWER(t) 00074 #else 00075 #define GETCHAR(t) (t) 00076 #endif 00077 00078 static int 00079 MatchText(char *t, int tlen, char *p, int plen, 00080 pg_locale_t locale, bool locale_is_c) 00081 { 00082 /* Fast path for match-everything pattern */ 00083 if (plen == 1 && *p == '%') 00084 return LIKE_TRUE; 00085 00086 /* 00087 * In this loop, we advance by char when matching wildcards (and thus on 00088 * recursive entry to this function we are properly char-synced). On other 00089 * occasions it is safe to advance by byte, as the text and pattern will 00090 * be in lockstep. This allows us to perform all comparisons between the 00091 * text and pattern on a byte by byte basis, even for multi-byte 00092 * encodings. 00093 */ 00094 while (tlen > 0 && plen > 0) 00095 { 00096 if (*p == '\\') 00097 { 00098 /* Next pattern byte must match literally, whatever it is */ 00099 NextByte(p, plen); 00100 /* ... and there had better be one, per SQL standard */ 00101 if (plen <= 0) 00102 ereport(ERROR, 00103 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), 00104 errmsg("LIKE pattern must not end with escape character"))); 00105 if (GETCHAR(*p) != GETCHAR(*t)) 00106 return LIKE_FALSE; 00107 } 00108 else if (*p == '%') 00109 { 00110 char firstpat; 00111 00112 /* 00113 * % processing is essentially a search for a text position at 00114 * which the remainder of the text matches the remainder of the 00115 * pattern, using a recursive call to check each potential match. 00116 * 00117 * If there are wildcards immediately following the %, we can skip 00118 * over them first, using the idea that any sequence of N _'s and 00119 * one or more %'s is equivalent to N _'s and one % (ie, it will 00120 * match any sequence of at least N text characters). In this way 00121 * we will always run the recursive search loop using a pattern 00122 * fragment that begins with a literal character-to-match, thereby 00123 * not recursing more than we have to. 00124 */ 00125 NextByte(p, plen); 00126 00127 while (plen > 0) 00128 { 00129 if (*p == '%') 00130 NextByte(p, plen); 00131 else if (*p == '_') 00132 { 00133 /* If not enough text left to match the pattern, ABORT */ 00134 if (tlen <= 0) 00135 return LIKE_ABORT; 00136 NextChar(t, tlen); 00137 NextByte(p, plen); 00138 } 00139 else 00140 break; /* Reached a non-wildcard pattern char */ 00141 } 00142 00143 /* 00144 * If we're at end of pattern, match: we have a trailing % which 00145 * matches any remaining text string. 00146 */ 00147 if (plen <= 0) 00148 return LIKE_TRUE; 00149 00150 /* 00151 * Otherwise, scan for a text position at which we can match the 00152 * rest of the pattern. The first remaining pattern char is known 00153 * to be a regular or escaped literal character, so we can compare 00154 * the first pattern byte to each text byte to avoid recursing 00155 * more than we have to. This fact also guarantees that we don't 00156 * have to consider a match to the zero-length substring at the 00157 * end of the text. 00158 */ 00159 if (*p == '\\') 00160 { 00161 if (plen < 2) 00162 ereport(ERROR, 00163 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), 00164 errmsg("LIKE pattern must not end with escape character"))); 00165 firstpat = GETCHAR(p[1]); 00166 } 00167 else 00168 firstpat = GETCHAR(*p); 00169 00170 while (tlen > 0) 00171 { 00172 if (GETCHAR(*t) == firstpat) 00173 { 00174 int matched = MatchText(t, tlen, p, plen, 00175 locale, locale_is_c); 00176 00177 if (matched != LIKE_FALSE) 00178 return matched; /* TRUE or ABORT */ 00179 } 00180 00181 NextChar(t, tlen); 00182 } 00183 00184 /* 00185 * End of text with no match, so no point in trying later places 00186 * to start matching this pattern. 00187 */ 00188 return LIKE_ABORT; 00189 } 00190 else if (*p == '_') 00191 { 00192 /* _ matches any single character, and we know there is one */ 00193 NextChar(t, tlen); 00194 NextByte(p, plen); 00195 continue; 00196 } 00197 else if (GETCHAR(*p) != GETCHAR(*t)) 00198 { 00199 /* non-wildcard pattern char fails to match text char */ 00200 return LIKE_FALSE; 00201 } 00202 00203 /* 00204 * Pattern and text match, so advance. 00205 * 00206 * It is safe to use NextByte instead of NextChar here, even for 00207 * multi-byte character sets, because we are not following immediately 00208 * after a wildcard character. If we are in the middle of a multibyte 00209 * character, we must already have matched at least one byte of the 00210 * character from both text and pattern; so we cannot get out-of-sync 00211 * on character boundaries. And we know that no backend-legal 00212 * encoding allows ASCII characters such as '%' to appear as non-first 00213 * bytes of characters, so we won't mistakenly detect a new wildcard. 00214 */ 00215 NextByte(t, tlen); 00216 NextByte(p, plen); 00217 } 00218 00219 if (tlen > 0) 00220 return LIKE_FALSE; /* end of pattern, but not of text */ 00221 00222 /* 00223 * End of text, but perhaps not of pattern. Match iff the remaining 00224 * pattern can match a zero-length string, ie, it's zero or more %'s. 00225 */ 00226 while (plen > 0 && *p == '%') 00227 NextByte(p, plen); 00228 if (plen <= 0) 00229 return LIKE_TRUE; 00230 00231 /* 00232 * End of text with no match, so no point in trying later places to start 00233 * matching this pattern. 00234 */ 00235 return LIKE_ABORT; 00236 } /* MatchText() */ 00237 00238 /* 00239 * like_escape() --- given a pattern and an ESCAPE string, 00240 * convert the pattern to use Postgres' standard backslash escape convention. 00241 */ 00242 #ifdef do_like_escape 00243 00244 static text * 00245 do_like_escape(text *pat, text *esc) 00246 { 00247 text *result; 00248 char *p, 00249 *e, 00250 *r; 00251 int plen, 00252 elen; 00253 bool afterescape; 00254 00255 p = VARDATA_ANY(pat); 00256 plen = VARSIZE_ANY_EXHDR(pat); 00257 e = VARDATA_ANY(esc); 00258 elen = VARSIZE_ANY_EXHDR(esc); 00259 00260 /* 00261 * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth 00262 * trying to calculate the size more accurately than that. 00263 */ 00264 result = (text *) palloc(plen * 2 + VARHDRSZ); 00265 r = VARDATA(result); 00266 00267 if (elen == 0) 00268 { 00269 /* 00270 * No escape character is wanted. Double any backslashes in the 00271 * pattern to make them act like ordinary characters. 00272 */ 00273 while (plen > 0) 00274 { 00275 if (*p == '\\') 00276 *r++ = '\\'; 00277 CopyAdvChar(r, p, plen); 00278 } 00279 } 00280 else 00281 { 00282 /* 00283 * The specified escape must be only a single character. 00284 */ 00285 NextChar(e, elen); 00286 if (elen != 0) 00287 ereport(ERROR, 00288 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), 00289 errmsg("invalid escape string"), 00290 errhint("Escape string must be empty or one character."))); 00291 00292 e = VARDATA_ANY(esc); 00293 00294 /* 00295 * If specified escape is '\', just copy the pattern as-is. 00296 */ 00297 if (*e == '\\') 00298 { 00299 memcpy(result, pat, VARSIZE_ANY(pat)); 00300 return result; 00301 } 00302 00303 /* 00304 * Otherwise, convert occurrences of the specified escape character to 00305 * '\', and double occurrences of '\' --- unless they immediately 00306 * follow an escape character! 00307 */ 00308 afterescape = false; 00309 while (plen > 0) 00310 { 00311 if (CHAREQ(p, e) && !afterescape) 00312 { 00313 *r++ = '\\'; 00314 NextChar(p, plen); 00315 afterescape = true; 00316 } 00317 else if (*p == '\\') 00318 { 00319 *r++ = '\\'; 00320 if (!afterescape) 00321 *r++ = '\\'; 00322 NextChar(p, plen); 00323 afterescape = false; 00324 } 00325 else 00326 { 00327 CopyAdvChar(r, p, plen); 00328 afterescape = false; 00329 } 00330 } 00331 } 00332 00333 SET_VARSIZE(result, r - ((char *) result)); 00334 00335 return result; 00336 } 00337 #endif /* do_like_escape */ 00338 00339 #ifdef CHAREQ 00340 #undef CHAREQ 00341 #endif 00342 00343 #undef NextChar 00344 #undef CopyAdvChar 00345 #undef MatchText 00346 00347 #ifdef do_like_escape 00348 #undef do_like_escape 00349 #endif 00350 00351 #undef GETCHAR 00352 00353 #ifdef MATCH_LOWER 00354 #undef MATCH_LOWER 00355 00356 #endif