LLVM API Documentation

Regex.cpp
Go to the documentation of this file.
00001 //===-- Regex.cpp - Regular Expression matcher implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements a POSIX regular expression matcher.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #include "llvm/Support/Regex.h"
00015 #include "regex_impl.h"
00016 #include "llvm/ADT/SmallVector.h"
00017 #include "llvm/Support/ErrorHandling.h"
00018 #include "llvm/Support/raw_ostream.h"
00019 #include <string>
00020 using namespace llvm;
00021 
00022 Regex::Regex(StringRef regex, unsigned Flags) {
00023   unsigned flags = 0;
00024   preg = new llvm_regex();
00025   preg->re_endp = regex.end();
00026   if (Flags & IgnoreCase) 
00027     flags |= REG_ICASE;
00028   if (Flags & Newline)
00029     flags |= REG_NEWLINE;
00030   if (!(Flags & BasicRegex))
00031     flags |= REG_EXTENDED;
00032   error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
00033 }
00034 
00035 Regex::~Regex() {
00036   if (preg) {
00037     llvm_regfree(preg);
00038     delete preg;
00039   }
00040 }
00041 
00042 bool Regex::isValid(std::string &Error) {
00043   if (!error)
00044     return true;
00045   
00046   size_t len = llvm_regerror(error, preg, nullptr, 0);
00047   
00048   Error.resize(len - 1);
00049   llvm_regerror(error, preg, &Error[0], len);
00050   return false;
00051 }
00052 
00053 /// getNumMatches - In a valid regex, return the number of parenthesized
00054 /// matches it contains.
00055 unsigned Regex::getNumMatches() const {
00056   return preg->re_nsub;
00057 }
00058 
00059 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
00060   unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
00061 
00062   // pmatch needs to have at least one element.
00063   SmallVector<llvm_regmatch_t, 8> pm;
00064   pm.resize(nmatch > 0 ? nmatch : 1);
00065   pm[0].rm_so = 0;
00066   pm[0].rm_eo = String.size();
00067 
00068   int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
00069 
00070   if (rc == REG_NOMATCH)
00071     return false;
00072   if (rc != 0) {
00073     // regexec can fail due to invalid pattern or running out of memory.
00074     error = rc;
00075     return false;
00076   }
00077 
00078   // There was a match.
00079 
00080   if (Matches) { // match position requested
00081     Matches->clear();
00082     
00083     for (unsigned i = 0; i != nmatch; ++i) {
00084       if (pm[i].rm_so == -1) {
00085         // this group didn't match
00086         Matches->push_back(StringRef());
00087         continue;
00088       }
00089       assert(pm[i].rm_eo >= pm[i].rm_so);
00090       Matches->push_back(StringRef(String.data()+pm[i].rm_so,
00091                                    pm[i].rm_eo-pm[i].rm_so));
00092     }
00093   }
00094 
00095   return true;
00096 }
00097 
00098 std::string Regex::sub(StringRef Repl, StringRef String,
00099                        std::string *Error) {
00100   SmallVector<StringRef, 8> Matches;
00101 
00102   // Reset error, if given.
00103   if (Error && !Error->empty()) *Error = "";
00104 
00105   // Return the input if there was no match.
00106   if (!match(String, &Matches))
00107     return String;
00108 
00109   // Otherwise splice in the replacement string, starting with the prefix before
00110   // the match.
00111   std::string Res(String.begin(), Matches[0].begin());
00112 
00113   // Then the replacement string, honoring possible substitutions.
00114   while (!Repl.empty()) {
00115     // Skip to the next escape.
00116     std::pair<StringRef, StringRef> Split = Repl.split('\\');
00117 
00118     // Add the skipped substring.
00119     Res += Split.first;
00120 
00121     // Check for terminimation and trailing backslash.
00122     if (Split.second.empty()) {
00123       if (Repl.size() != Split.first.size() &&
00124           Error && Error->empty())
00125         *Error = "replacement string contained trailing backslash";
00126       break;
00127     }
00128 
00129     // Otherwise update the replacement string and interpret escapes.
00130     Repl = Split.second;
00131 
00132     // FIXME: We should have a StringExtras function for mapping C99 escapes.
00133     switch (Repl[0]) {
00134       // Treat all unrecognized characters as self-quoting.
00135     default:
00136       Res += Repl[0];
00137       Repl = Repl.substr(1);
00138       break;
00139 
00140       // Single character escapes.
00141     case 't':
00142       Res += '\t';
00143       Repl = Repl.substr(1);
00144       break;
00145     case 'n':
00146       Res += '\n';
00147       Repl = Repl.substr(1);
00148       break;
00149 
00150       // Decimal escapes are backreferences.
00151     case '0': case '1': case '2': case '3': case '4':
00152     case '5': case '6': case '7': case '8': case '9': {
00153       // Extract the backreference number.
00154       StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
00155       Repl = Repl.substr(Ref.size());
00156 
00157       unsigned RefValue;
00158       if (!Ref.getAsInteger(10, RefValue) &&
00159           RefValue < Matches.size())
00160         Res += Matches[RefValue];
00161       else if (Error && Error->empty())
00162         *Error = "invalid backreference string '" + Ref.str() + "'";
00163       break;
00164     }
00165     }
00166   }
00167 
00168   // And finally the suffix.
00169   Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
00170 
00171   return Res;
00172 }
00173 
00174 // These are the special characters matched in functions like "p_ere_exp".
00175 static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
00176 
00177 bool Regex::isLiteralERE(StringRef Str) {
00178   // Check for regex metacharacters.  This list was derived from our regex
00179   // implementation in regcomp.c and double checked against the POSIX extended
00180   // regular expression specification.
00181   return Str.find_first_of(RegexMetachars) == StringRef::npos;
00182 }
00183 
00184 std::string Regex::escape(StringRef String) {
00185   std::string RegexStr;
00186   for (unsigned i = 0, e = String.size(); i != e; ++i) {
00187     if (strchr(RegexMetachars, String[i]))
00188       RegexStr += '\\';
00189     RegexStr += String[i];
00190   }
00191 
00192   return RegexStr;
00193 }