LLVM API Documentation
00001 //===-- Regex.cpp - Regular Expression matcher implementation -------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements a POSIX regular expression matcher. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "llvm/Support/Regex.h" 00015 #include "regex_impl.h" 00016 #include "llvm/ADT/SmallVector.h" 00017 #include "llvm/Support/ErrorHandling.h" 00018 #include "llvm/Support/raw_ostream.h" 00019 #include <string> 00020 using namespace llvm; 00021 00022 Regex::Regex(StringRef regex, unsigned Flags) { 00023 unsigned flags = 0; 00024 preg = new llvm_regex(); 00025 preg->re_endp = regex.end(); 00026 if (Flags & IgnoreCase) 00027 flags |= REG_ICASE; 00028 if (Flags & Newline) 00029 flags |= REG_NEWLINE; 00030 if (!(Flags & BasicRegex)) 00031 flags |= REG_EXTENDED; 00032 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); 00033 } 00034 00035 Regex::~Regex() { 00036 if (preg) { 00037 llvm_regfree(preg); 00038 delete preg; 00039 } 00040 } 00041 00042 bool Regex::isValid(std::string &Error) { 00043 if (!error) 00044 return true; 00045 00046 size_t len = llvm_regerror(error, preg, nullptr, 0); 00047 00048 Error.resize(len - 1); 00049 llvm_regerror(error, preg, &Error[0], len); 00050 return false; 00051 } 00052 00053 /// getNumMatches - In a valid regex, return the number of parenthesized 00054 /// matches it contains. 00055 unsigned Regex::getNumMatches() const { 00056 return preg->re_nsub; 00057 } 00058 00059 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){ 00060 unsigned nmatch = Matches ? preg->re_nsub+1 : 0; 00061 00062 // pmatch needs to have at least one element. 00063 SmallVector<llvm_regmatch_t, 8> pm; 00064 pm.resize(nmatch > 0 ? nmatch : 1); 00065 pm[0].rm_so = 0; 00066 pm[0].rm_eo = String.size(); 00067 00068 int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); 00069 00070 if (rc == REG_NOMATCH) 00071 return false; 00072 if (rc != 0) { 00073 // regexec can fail due to invalid pattern or running out of memory. 00074 error = rc; 00075 return false; 00076 } 00077 00078 // There was a match. 00079 00080 if (Matches) { // match position requested 00081 Matches->clear(); 00082 00083 for (unsigned i = 0; i != nmatch; ++i) { 00084 if (pm[i].rm_so == -1) { 00085 // this group didn't match 00086 Matches->push_back(StringRef()); 00087 continue; 00088 } 00089 assert(pm[i].rm_eo >= pm[i].rm_so); 00090 Matches->push_back(StringRef(String.data()+pm[i].rm_so, 00091 pm[i].rm_eo-pm[i].rm_so)); 00092 } 00093 } 00094 00095 return true; 00096 } 00097 00098 std::string Regex::sub(StringRef Repl, StringRef String, 00099 std::string *Error) { 00100 SmallVector<StringRef, 8> Matches; 00101 00102 // Reset error, if given. 00103 if (Error && !Error->empty()) *Error = ""; 00104 00105 // Return the input if there was no match. 00106 if (!match(String, &Matches)) 00107 return String; 00108 00109 // Otherwise splice in the replacement string, starting with the prefix before 00110 // the match. 00111 std::string Res(String.begin(), Matches[0].begin()); 00112 00113 // Then the replacement string, honoring possible substitutions. 00114 while (!Repl.empty()) { 00115 // Skip to the next escape. 00116 std::pair<StringRef, StringRef> Split = Repl.split('\\'); 00117 00118 // Add the skipped substring. 00119 Res += Split.first; 00120 00121 // Check for terminimation and trailing backslash. 00122 if (Split.second.empty()) { 00123 if (Repl.size() != Split.first.size() && 00124 Error && Error->empty()) 00125 *Error = "replacement string contained trailing backslash"; 00126 break; 00127 } 00128 00129 // Otherwise update the replacement string and interpret escapes. 00130 Repl = Split.second; 00131 00132 // FIXME: We should have a StringExtras function for mapping C99 escapes. 00133 switch (Repl[0]) { 00134 // Treat all unrecognized characters as self-quoting. 00135 default: 00136 Res += Repl[0]; 00137 Repl = Repl.substr(1); 00138 break; 00139 00140 // Single character escapes. 00141 case 't': 00142 Res += '\t'; 00143 Repl = Repl.substr(1); 00144 break; 00145 case 'n': 00146 Res += '\n'; 00147 Repl = Repl.substr(1); 00148 break; 00149 00150 // Decimal escapes are backreferences. 00151 case '0': case '1': case '2': case '3': case '4': 00152 case '5': case '6': case '7': case '8': case '9': { 00153 // Extract the backreference number. 00154 StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); 00155 Repl = Repl.substr(Ref.size()); 00156 00157 unsigned RefValue; 00158 if (!Ref.getAsInteger(10, RefValue) && 00159 RefValue < Matches.size()) 00160 Res += Matches[RefValue]; 00161 else if (Error && Error->empty()) 00162 *Error = "invalid backreference string '" + Ref.str() + "'"; 00163 break; 00164 } 00165 } 00166 } 00167 00168 // And finally the suffix. 00169 Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); 00170 00171 return Res; 00172 } 00173 00174 // These are the special characters matched in functions like "p_ere_exp". 00175 static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; 00176 00177 bool Regex::isLiteralERE(StringRef Str) { 00178 // Check for regex metacharacters. This list was derived from our regex 00179 // implementation in regcomp.c and double checked against the POSIX extended 00180 // regular expression specification. 00181 return Str.find_first_of(RegexMetachars) == StringRef::npos; 00182 } 00183 00184 std::string Regex::escape(StringRef String) { 00185 std::string RegexStr; 00186 for (unsigned i = 0, e = String.size(); i != e; ++i) { 00187 if (strchr(RegexMetachars, String[i])) 00188 RegexStr += '\\'; 00189 RegexStr += String[i]; 00190 } 00191 00192 return RegexStr; 00193 }