Header And Logo

PostgreSQL
| The world's most advanced open source database.

like_match.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * like_match.c
00004  *    LIKE pattern matching internal code.
00005  *
00006  * This file is included by like.c four times, to provide matching code for
00007  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
00008  * and (4) case insensitive matches in single-byte encodings.
00009  * (UTF8 is a special case because we can use a much more efficient version
00010  * of NextChar than can be used for general multi-byte encodings.)
00011  *
00012  * Before the inclusion, we need to define the following macros:
00013  *
00014  * NextChar
00015  * MatchText - to name of function wanted
00016  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
00017  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
00018  *
00019  * Copyright (c) 1996-2013, PostgreSQL Global Development Group
00020  *
00021  * IDENTIFICATION
00022  *  src/backend/utils/adt/like_match.c
00023  *
00024  *-------------------------------------------------------------------------
00025  */
00026 
00027 /*
00028  *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
00029  *  Rich $alz is now <[email protected]>.
00030  *  Special thanks to Lars Mathiesen <[email protected]> for the LABORT code.
00031  *
00032  *  This code was shamelessly stolen from the "pql" code by myself and
00033  *  slightly modified :)
00034  *
00035  *  All references to the word "star" were replaced by "percent"
00036  *  All references to the word "wild" were replaced by "like"
00037  *
00038  *  All the nice shell RE matching stuff was replaced by just "_" and "%"
00039  *
00040  *  As I don't have a copy of the SQL standard handy I wasn't sure whether
00041  *  to leave in the '\' escape character handling.
00042  *
00043  *  Keith Parks. <[email protected]>
00044  *
00045  *  SQL lets you specify the escape character by saying
00046  *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
00047  *  so we force you to use '\'. - ay 7/95
00048  *
00049  *  Now we have the like_escape() function that converts patterns with
00050  *  any specified escape character (or none at all) to the internal
00051  *  default escape character, which is still '\'. - tgl 9/2000
00052  *
00053  * The code is rewritten to avoid requiring null-terminated strings,
00054  * which in turn allows us to leave out some memcpy() operations.
00055  * This code should be faster and take less memory, but no promises...
00056  * - thomas 2000-08-06
00057  */
00058 
00059 
00060 /*--------------------
00061  *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
00062  *
00063  *  LIKE_TRUE: they match
00064  *  LIKE_FALSE: they don't match
00065  *  LIKE_ABORT: not only don't they match, but the text is too short.
00066  *
00067  * If LIKE_ABORT is returned, then no suffix of the text can match the
00068  * pattern either, so an upper-level % scan can stop scanning now.
00069  *--------------------
00070  */
00071 
00072 #ifdef MATCH_LOWER
00073 #define GETCHAR(t) MATCH_LOWER(t)
00074 #else
00075 #define GETCHAR(t) (t)
00076 #endif
00077 
00078 static int
00079 MatchText(char *t, int tlen, char *p, int plen,
00080           pg_locale_t locale, bool locale_is_c)
00081 {
00082     /* Fast path for match-everything pattern */
00083     if (plen == 1 && *p == '%')
00084         return LIKE_TRUE;
00085 
00086     /*
00087      * In this loop, we advance by char when matching wildcards (and thus on
00088      * recursive entry to this function we are properly char-synced). On other
00089      * occasions it is safe to advance by byte, as the text and pattern will
00090      * be in lockstep. This allows us to perform all comparisons between the
00091      * text and pattern on a byte by byte basis, even for multi-byte
00092      * encodings.
00093      */
00094     while (tlen > 0 && plen > 0)
00095     {
00096         if (*p == '\\')
00097         {
00098             /* Next pattern byte must match literally, whatever it is */
00099             NextByte(p, plen);
00100             /* ... and there had better be one, per SQL standard */
00101             if (plen <= 0)
00102                 ereport(ERROR,
00103                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
00104                  errmsg("LIKE pattern must not end with escape character")));
00105             if (GETCHAR(*p) != GETCHAR(*t))
00106                 return LIKE_FALSE;
00107         }
00108         else if (*p == '%')
00109         {
00110             char        firstpat;
00111 
00112             /*
00113              * % processing is essentially a search for a text position at
00114              * which the remainder of the text matches the remainder of the
00115              * pattern, using a recursive call to check each potential match.
00116              *
00117              * If there are wildcards immediately following the %, we can skip
00118              * over them first, using the idea that any sequence of N _'s and
00119              * one or more %'s is equivalent to N _'s and one % (ie, it will
00120              * match any sequence of at least N text characters).  In this way
00121              * we will always run the recursive search loop using a pattern
00122              * fragment that begins with a literal character-to-match, thereby
00123              * not recursing more than we have to.
00124              */
00125             NextByte(p, plen);
00126 
00127             while (plen > 0)
00128             {
00129                 if (*p == '%')
00130                     NextByte(p, plen);
00131                 else if (*p == '_')
00132                 {
00133                     /* If not enough text left to match the pattern, ABORT */
00134                     if (tlen <= 0)
00135                         return LIKE_ABORT;
00136                     NextChar(t, tlen);
00137                     NextByte(p, plen);
00138                 }
00139                 else
00140                     break;      /* Reached a non-wildcard pattern char */
00141             }
00142 
00143             /*
00144              * If we're at end of pattern, match: we have a trailing % which
00145              * matches any remaining text string.
00146              */
00147             if (plen <= 0)
00148                 return LIKE_TRUE;
00149 
00150             /*
00151              * Otherwise, scan for a text position at which we can match the
00152              * rest of the pattern.  The first remaining pattern char is known
00153              * to be a regular or escaped literal character, so we can compare
00154              * the first pattern byte to each text byte to avoid recursing
00155              * more than we have to.  This fact also guarantees that we don't
00156              * have to consider a match to the zero-length substring at the
00157              * end of the text.
00158              */
00159             if (*p == '\\')
00160             {
00161                 if (plen < 2)
00162                     ereport(ERROR,
00163                             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
00164                              errmsg("LIKE pattern must not end with escape character")));
00165                 firstpat = GETCHAR(p[1]);
00166             }
00167             else
00168                 firstpat = GETCHAR(*p);
00169 
00170             while (tlen > 0)
00171             {
00172                 if (GETCHAR(*t) == firstpat)
00173                 {
00174                     int         matched = MatchText(t, tlen, p, plen,
00175                                                     locale, locale_is_c);
00176 
00177                     if (matched != LIKE_FALSE)
00178                         return matched; /* TRUE or ABORT */
00179                 }
00180 
00181                 NextChar(t, tlen);
00182             }
00183 
00184             /*
00185              * End of text with no match, so no point in trying later places
00186              * to start matching this pattern.
00187              */
00188             return LIKE_ABORT;
00189         }
00190         else if (*p == '_')
00191         {
00192             /* _ matches any single character, and we know there is one */
00193             NextChar(t, tlen);
00194             NextByte(p, plen);
00195             continue;
00196         }
00197         else if (GETCHAR(*p) != GETCHAR(*t))
00198         {
00199             /* non-wildcard pattern char fails to match text char */
00200             return LIKE_FALSE;
00201         }
00202 
00203         /*
00204          * Pattern and text match, so advance.
00205          *
00206          * It is safe to use NextByte instead of NextChar here, even for
00207          * multi-byte character sets, because we are not following immediately
00208          * after a wildcard character. If we are in the middle of a multibyte
00209          * character, we must already have matched at least one byte of the
00210          * character from both text and pattern; so we cannot get out-of-sync
00211          * on character boundaries.  And we know that no backend-legal
00212          * encoding allows ASCII characters such as '%' to appear as non-first
00213          * bytes of characters, so we won't mistakenly detect a new wildcard.
00214          */
00215         NextByte(t, tlen);
00216         NextByte(p, plen);
00217     }
00218 
00219     if (tlen > 0)
00220         return LIKE_FALSE;      /* end of pattern, but not of text */
00221 
00222     /*
00223      * End of text, but perhaps not of pattern.  Match iff the remaining
00224      * pattern can match a zero-length string, ie, it's zero or more %'s.
00225      */
00226     while (plen > 0 && *p == '%')
00227         NextByte(p, plen);
00228     if (plen <= 0)
00229         return LIKE_TRUE;
00230 
00231     /*
00232      * End of text with no match, so no point in trying later places to start
00233      * matching this pattern.
00234      */
00235     return LIKE_ABORT;
00236 }   /* MatchText() */
00237 
00238 /*
00239  * like_escape() --- given a pattern and an ESCAPE string,
00240  * convert the pattern to use Postgres' standard backslash escape convention.
00241  */
00242 #ifdef do_like_escape
00243 
00244 static text *
00245 do_like_escape(text *pat, text *esc)
00246 {
00247     text       *result;
00248     char       *p,
00249                *e,
00250                *r;
00251     int         plen,
00252                 elen;
00253     bool        afterescape;
00254 
00255     p = VARDATA_ANY(pat);
00256     plen = VARSIZE_ANY_EXHDR(pat);
00257     e = VARDATA_ANY(esc);
00258     elen = VARSIZE_ANY_EXHDR(esc);
00259 
00260     /*
00261      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
00262      * trying to calculate the size more accurately than that.
00263      */
00264     result = (text *) palloc(plen * 2 + VARHDRSZ);
00265     r = VARDATA(result);
00266 
00267     if (elen == 0)
00268     {
00269         /*
00270          * No escape character is wanted.  Double any backslashes in the
00271          * pattern to make them act like ordinary characters.
00272          */
00273         while (plen > 0)
00274         {
00275             if (*p == '\\')
00276                 *r++ = '\\';
00277             CopyAdvChar(r, p, plen);
00278         }
00279     }
00280     else
00281     {
00282         /*
00283          * The specified escape must be only a single character.
00284          */
00285         NextChar(e, elen);
00286         if (elen != 0)
00287             ereport(ERROR,
00288                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
00289                      errmsg("invalid escape string"),
00290                   errhint("Escape string must be empty or one character.")));
00291 
00292         e = VARDATA_ANY(esc);
00293 
00294         /*
00295          * If specified escape is '\', just copy the pattern as-is.
00296          */
00297         if (*e == '\\')
00298         {
00299             memcpy(result, pat, VARSIZE_ANY(pat));
00300             return result;
00301         }
00302 
00303         /*
00304          * Otherwise, convert occurrences of the specified escape character to
00305          * '\', and double occurrences of '\' --- unless they immediately
00306          * follow an escape character!
00307          */
00308         afterescape = false;
00309         while (plen > 0)
00310         {
00311             if (CHAREQ(p, e) && !afterescape)
00312             {
00313                 *r++ = '\\';
00314                 NextChar(p, plen);
00315                 afterescape = true;
00316             }
00317             else if (*p == '\\')
00318             {
00319                 *r++ = '\\';
00320                 if (!afterescape)
00321                     *r++ = '\\';
00322                 NextChar(p, plen);
00323                 afterescape = false;
00324             }
00325             else
00326             {
00327                 CopyAdvChar(r, p, plen);
00328                 afterescape = false;
00329             }
00330         }
00331     }
00332 
00333     SET_VARSIZE(result, r - ((char *) result));
00334 
00335     return result;
00336 }
00337 #endif   /* do_like_escape */
00338 
00339 #ifdef CHAREQ
00340 #undef CHAREQ
00341 #endif
00342 
00343 #undef NextChar
00344 #undef CopyAdvChar
00345 #undef MatchText
00346 
00347 #ifdef do_like_escape
00348 #undef do_like_escape
00349 #endif
00350 
00351 #undef GETCHAR
00352 
00353 #ifdef MATCH_LOWER
00354 #undef MATCH_LOWER
00355 
00356 #endif