LLVM API Documentation

StringRef.cpp
Go to the documentation of this file.
00001 //===-- StringRef.cpp - Lightweight String References ---------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 
00010 #include "llvm/ADT/StringRef.h"
00011 #include "llvm/ADT/APInt.h"
00012 #include "llvm/ADT/Hashing.h"
00013 #include "llvm/ADT/edit_distance.h"
00014 #include <bitset>
00015 
00016 using namespace llvm;
00017 
00018 // MSVC emits references to this into the translation units which reference it.
00019 #ifndef _MSC_VER
00020 const size_t StringRef::npos;
00021 #endif
00022 
00023 static char ascii_tolower(char x) {
00024   if (x >= 'A' && x <= 'Z')
00025     return x - 'A' + 'a';
00026   return x;
00027 }
00028 
00029 static char ascii_toupper(char x) {
00030   if (x >= 'a' && x <= 'z')
00031     return x - 'a' + 'A';
00032   return x;
00033 }
00034 
00035 static bool ascii_isdigit(char x) {
00036   return x >= '0' && x <= '9';
00037 }
00038 
00039 // strncasecmp() is not available on non-POSIX systems, so define an
00040 // alternative function here.
00041 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
00042   for (size_t I = 0; I < Length; ++I) {
00043     unsigned char LHC = ascii_tolower(LHS[I]);
00044     unsigned char RHC = ascii_tolower(RHS[I]);
00045     if (LHC != RHC)
00046       return LHC < RHC ? -1 : 1;
00047   }
00048   return 0;
00049 }
00050 
00051 /// compare_lower - Compare strings, ignoring case.
00052 int StringRef::compare_lower(StringRef RHS) const {
00053   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
00054     return Res;
00055   if (Length == RHS.Length)
00056     return 0;
00057   return Length < RHS.Length ? -1 : 1;
00058 }
00059 
00060 /// Check if this string starts with the given \p Prefix, ignoring case.
00061 bool StringRef::startswith_lower(StringRef Prefix) const {
00062   return Length >= Prefix.Length &&
00063       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
00064 }
00065 
00066 /// Check if this string ends with the given \p Suffix, ignoring case.
00067 bool StringRef::endswith_lower(StringRef Suffix) const {
00068   return Length >= Suffix.Length &&
00069       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
00070 }
00071 
00072 /// compare_numeric - Compare strings, handle embedded numbers.
00073 int StringRef::compare_numeric(StringRef RHS) const {
00074   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
00075     // Check for sequences of digits.
00076     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
00077       // The longer sequence of numbers is considered larger.
00078       // This doesn't really handle prefixed zeros well.
00079       size_t J;
00080       for (J = I + 1; J != E + 1; ++J) {
00081         bool ld = J < Length && ascii_isdigit(Data[J]);
00082         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
00083         if (ld != rd)
00084           return rd ? -1 : 1;
00085         if (!rd)
00086           break;
00087       }
00088       // The two number sequences have the same length (J-I), just memcmp them.
00089       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
00090         return Res < 0 ? -1 : 1;
00091       // Identical number sequences, continue search after the numbers.
00092       I = J - 1;
00093       continue;
00094     }
00095     if (Data[I] != RHS.Data[I])
00096       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
00097   }
00098   if (Length == RHS.Length)
00099     return 0;
00100   return Length < RHS.Length ? -1 : 1;
00101 }
00102 
00103 // Compute the edit distance between the two given strings.
00104 unsigned StringRef::edit_distance(llvm::StringRef Other,
00105                                   bool AllowReplacements,
00106                                   unsigned MaxEditDistance) const {
00107   return llvm::ComputeEditDistance(
00108       makeArrayRef(data(), size()),
00109       makeArrayRef(Other.data(), Other.size()),
00110       AllowReplacements, MaxEditDistance);
00111 }
00112 
00113 //===----------------------------------------------------------------------===//
00114 // String Operations
00115 //===----------------------------------------------------------------------===//
00116 
00117 std::string StringRef::lower() const {
00118   std::string Result(size(), char());
00119   for (size_type i = 0, e = size(); i != e; ++i) {
00120     Result[i] = ascii_tolower(Data[i]);
00121   }
00122   return Result;
00123 }
00124 
00125 std::string StringRef::upper() const {
00126   std::string Result(size(), char());
00127   for (size_type i = 0, e = size(); i != e; ++i) {
00128     Result[i] = ascii_toupper(Data[i]);
00129   }
00130   return Result;
00131 }
00132 
00133 //===----------------------------------------------------------------------===//
00134 // String Searching
00135 //===----------------------------------------------------------------------===//
00136 
00137 
00138 /// find - Search for the first string \arg Str in the string.
00139 ///
00140 /// \return - The index of the first occurrence of \arg Str, or npos if not
00141 /// found.
00142 size_t StringRef::find(StringRef Str, size_t From) const {
00143   size_t N = Str.size();
00144   if (N > Length)
00145     return npos;
00146 
00147   // For short haystacks or unsupported needles fall back to the naive algorithm
00148   if (Length < 16 || N > 255 || N == 0) {
00149     for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i)
00150       if (substr(i, N).equals(Str))
00151         return i;
00152     return npos;
00153   }
00154 
00155   if (From >= Length)
00156     return npos;
00157 
00158   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
00159   uint8_t BadCharSkip[256];
00160   std::memset(BadCharSkip, N, 256);
00161   for (unsigned i = 0; i != N-1; ++i)
00162     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
00163 
00164   unsigned Len = Length-From, Pos = From;
00165   while (Len >= N) {
00166     if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
00167       return Pos;
00168 
00169     // Otherwise skip the appropriate number of bytes.
00170     uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]];
00171     Len -= Skip;
00172     Pos += Skip;
00173   }
00174 
00175   return npos;
00176 }
00177 
00178 /// rfind - Search for the last string \arg Str in the string.
00179 ///
00180 /// \return - The index of the last occurrence of \arg Str, or npos if not
00181 /// found.
00182 size_t StringRef::rfind(StringRef Str) const {
00183   size_t N = Str.size();
00184   if (N > Length)
00185     return npos;
00186   for (size_t i = Length - N + 1, e = 0; i != e;) {
00187     --i;
00188     if (substr(i, N).equals(Str))
00189       return i;
00190   }
00191   return npos;
00192 }
00193 
00194 /// find_first_of - Find the first character in the string that is in \arg
00195 /// Chars, or npos if not found.
00196 ///
00197 /// Note: O(size() + Chars.size())
00198 StringRef::size_type StringRef::find_first_of(StringRef Chars,
00199                                               size_t From) const {
00200   std::bitset<1 << CHAR_BIT> CharBits;
00201   for (size_type i = 0; i != Chars.size(); ++i)
00202     CharBits.set((unsigned char)Chars[i]);
00203 
00204   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00205     if (CharBits.test((unsigned char)Data[i]))
00206       return i;
00207   return npos;
00208 }
00209 
00210 /// find_first_not_of - Find the first character in the string that is not
00211 /// \arg C or npos if not found.
00212 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
00213   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00214     if (Data[i] != C)
00215       return i;
00216   return npos;
00217 }
00218 
00219 /// find_first_not_of - Find the first character in the string that is not
00220 /// in the string \arg Chars, or npos if not found.
00221 ///
00222 /// Note: O(size() + Chars.size())
00223 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
00224                                                   size_t From) const {
00225   std::bitset<1 << CHAR_BIT> CharBits;
00226   for (size_type i = 0; i != Chars.size(); ++i)
00227     CharBits.set((unsigned char)Chars[i]);
00228 
00229   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
00230     if (!CharBits.test((unsigned char)Data[i]))
00231       return i;
00232   return npos;
00233 }
00234 
00235 /// find_last_of - Find the last character in the string that is in \arg C,
00236 /// or npos if not found.
00237 ///
00238 /// Note: O(size() + Chars.size())
00239 StringRef::size_type StringRef::find_last_of(StringRef Chars,
00240                                              size_t From) const {
00241   std::bitset<1 << CHAR_BIT> CharBits;
00242   for (size_type i = 0; i != Chars.size(); ++i)
00243     CharBits.set((unsigned char)Chars[i]);
00244 
00245   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00246     if (CharBits.test((unsigned char)Data[i]))
00247       return i;
00248   return npos;
00249 }
00250 
00251 /// find_last_not_of - Find the last character in the string that is not
00252 /// \arg C, or npos if not found.
00253 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
00254   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00255     if (Data[i] != C)
00256       return i;
00257   return npos;
00258 }
00259 
00260 /// find_last_not_of - Find the last character in the string that is not in
00261 /// \arg Chars, or npos if not found.
00262 ///
00263 /// Note: O(size() + Chars.size())
00264 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
00265                                                  size_t From) const {
00266   std::bitset<1 << CHAR_BIT> CharBits;
00267   for (size_type i = 0, e = Chars.size(); i != e; ++i)
00268     CharBits.set((unsigned char)Chars[i]);
00269 
00270   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
00271     if (!CharBits.test((unsigned char)Data[i]))
00272       return i;
00273   return npos;
00274 }
00275 
00276 void StringRef::split(SmallVectorImpl<StringRef> &A,
00277                       StringRef Separators, int MaxSplit,
00278                       bool KeepEmpty) const {
00279   StringRef rest = *this;
00280 
00281   // rest.data() is used to distinguish cases like "a," that splits into
00282   // "a" + "" and "a" that splits into "a" + 0.
00283   for (int splits = 0;
00284        rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit);
00285        ++splits) {
00286     std::pair<StringRef, StringRef> p = rest.split(Separators);
00287 
00288     if (KeepEmpty || p.first.size() != 0)
00289       A.push_back(p.first);
00290     rest = p.second;
00291   }
00292   // If we have a tail left, add it.
00293   if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty))
00294     A.push_back(rest);
00295 }
00296 
00297 //===----------------------------------------------------------------------===//
00298 // Helpful Algorithms
00299 //===----------------------------------------------------------------------===//
00300 
00301 /// count - Return the number of non-overlapped occurrences of \arg Str in
00302 /// the string.
00303 size_t StringRef::count(StringRef Str) const {
00304   size_t Count = 0;
00305   size_t N = Str.size();
00306   if (N > Length)
00307     return 0;
00308   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
00309     if (substr(i, N).equals(Str))
00310       ++Count;
00311   return Count;
00312 }
00313 
00314 static unsigned GetAutoSenseRadix(StringRef &Str) {
00315   if (Str.startswith("0x")) {
00316     Str = Str.substr(2);
00317     return 16;
00318   }
00319   
00320   if (Str.startswith("0b")) {
00321     Str = Str.substr(2);
00322     return 2;
00323   }
00324 
00325   if (Str.startswith("0o")) {
00326     Str = Str.substr(2);
00327     return 8;
00328   }
00329 
00330   if (Str.startswith("0"))
00331     return 8;
00332   
00333   return 10;
00334 }
00335 
00336 
00337 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
00338 /// sequence of radix up to 36 to an unsigned long long value.
00339 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
00340                                 unsigned long long &Result) {
00341   // Autosense radix if not specified.
00342   if (Radix == 0)
00343     Radix = GetAutoSenseRadix(Str);
00344 
00345   // Empty strings (after the radix autosense) are invalid.
00346   if (Str.empty()) return true;
00347 
00348   // Parse all the bytes of the string given this radix.  Watch for overflow.
00349   Result = 0;
00350   while (!Str.empty()) {
00351     unsigned CharVal;
00352     if (Str[0] >= '0' && Str[0] <= '9')
00353       CharVal = Str[0]-'0';
00354     else if (Str[0] >= 'a' && Str[0] <= 'z')
00355       CharVal = Str[0]-'a'+10;
00356     else if (Str[0] >= 'A' && Str[0] <= 'Z')
00357       CharVal = Str[0]-'A'+10;
00358     else
00359       return true;
00360 
00361     // If the parsed value is larger than the integer radix, the string is
00362     // invalid.
00363     if (CharVal >= Radix)
00364       return true;
00365 
00366     // Add in this character.
00367     unsigned long long PrevResult = Result;
00368     Result = Result*Radix+CharVal;
00369 
00370     // Check for overflow by shifting back and seeing if bits were lost.
00371     if (Result/Radix < PrevResult)
00372       return true;
00373 
00374     Str = Str.substr(1);
00375   }
00376 
00377   return false;
00378 }
00379 
00380 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
00381                               long long &Result) {
00382   unsigned long long ULLVal;
00383 
00384   // Handle positive strings first.
00385   if (Str.empty() || Str.front() != '-') {
00386     if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
00387         // Check for value so large it overflows a signed value.
00388         (long long)ULLVal < 0)
00389       return true;
00390     Result = ULLVal;
00391     return false;
00392   }
00393 
00394   // Get the positive part of the value.
00395   if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
00396       // Reject values so large they'd overflow as negative signed, but allow
00397       // "-0".  This negates the unsigned so that the negative isn't undefined
00398       // on signed overflow.
00399       (long long)-ULLVal > 0)
00400     return true;
00401 
00402   Result = -ULLVal;
00403   return false;
00404 }
00405 
00406 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
00407   StringRef Str = *this;
00408 
00409   // Autosense radix if not specified.
00410   if (Radix == 0)
00411     Radix = GetAutoSenseRadix(Str);
00412 
00413   assert(Radix > 1 && Radix <= 36);
00414 
00415   // Empty strings (after the radix autosense) are invalid.
00416   if (Str.empty()) return true;
00417 
00418   // Skip leading zeroes.  This can be a significant improvement if
00419   // it means we don't need > 64 bits.
00420   while (!Str.empty() && Str.front() == '0')
00421     Str = Str.substr(1);
00422 
00423   // If it was nothing but zeroes....
00424   if (Str.empty()) {
00425     Result = APInt(64, 0);
00426     return false;
00427   }
00428 
00429   // (Over-)estimate the required number of bits.
00430   unsigned Log2Radix = 0;
00431   while ((1U << Log2Radix) < Radix) Log2Radix++;
00432   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
00433 
00434   unsigned BitWidth = Log2Radix * Str.size();
00435   if (BitWidth < Result.getBitWidth())
00436     BitWidth = Result.getBitWidth(); // don't shrink the result
00437   else if (BitWidth > Result.getBitWidth())
00438     Result = Result.zext(BitWidth);
00439 
00440   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
00441   if (!IsPowerOf2Radix) {
00442     // These must have the same bit-width as Result.
00443     RadixAP = APInt(BitWidth, Radix);
00444     CharAP = APInt(BitWidth, 0);
00445   }
00446 
00447   // Parse all the bytes of the string given this radix.
00448   Result = 0;
00449   while (!Str.empty()) {
00450     unsigned CharVal;
00451     if (Str[0] >= '0' && Str[0] <= '9')
00452       CharVal = Str[0]-'0';
00453     else if (Str[0] >= 'a' && Str[0] <= 'z')
00454       CharVal = Str[0]-'a'+10;
00455     else if (Str[0] >= 'A' && Str[0] <= 'Z')
00456       CharVal = Str[0]-'A'+10;
00457     else
00458       return true;
00459 
00460     // If the parsed value is larger than the integer radix, the string is
00461     // invalid.
00462     if (CharVal >= Radix)
00463       return true;
00464 
00465     // Add in this character.
00466     if (IsPowerOf2Radix) {
00467       Result <<= Log2Radix;
00468       Result |= CharVal;
00469     } else {
00470       Result *= RadixAP;
00471       CharAP = CharVal;
00472       Result += CharAP;
00473     }
00474 
00475     Str = Str.substr(1);
00476   }
00477 
00478   return false;
00479 }
00480 
00481 
00482 // Implementation of StringRef hashing.
00483 hash_code llvm::hash_value(StringRef S) {
00484   return hash_combine_range(S.begin(), S.end());
00485 }