LLVM: UnicodeCharRanges.h Source File

Go to the documentation of this file.
00001 //===--- UnicodeCharRanges.h - Types and functions for character ranges ---===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 #ifndef LLVM_SUPPORT_UNICODECHARRANGES_H
00010 #define LLVM_SUPPORT_UNICODECHARRANGES_H
00011 
00012 #include "llvm/ADT/ArrayRef.h"
00013 #include "llvm/ADT/SmallPtrSet.h"
00014 #include "llvm/Support/Compiler.h"
00015 #include "llvm/Support/Debug.h"
00016 #include "llvm/Support/Mutex.h"
00017 #include "llvm/Support/MutexGuard.h"
00018 #include "llvm/Support/raw_ostream.h"
00019 #include <algorithm>
00020 
00021 namespace llvm {
00022 namespace sys {
00023 
00024 #define DEBUG_TYPE "unicode"
00025 
/// \brief An inclusive interval of Unicode code points, [Lower, Upper].
///
/// Both endpoints are part of the interval. The type is a plain aggregate, so
/// it can be brace-initialized as {Lower, Upper}.
struct UnicodeCharRange {
  /// First code point that belongs to the interval.
  uint32_t Lower;
  /// Last code point that belongs to the interval.
  uint32_t Upper;
};
00031 
00032 inline bool operator<(uint32_t Value, UnicodeCharRange Range) {
00033   return Value < Range.Lower;
00034 }
00035 inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
00036   return Range.Upper < Value;
00037 }
00038 
00039 /// \brief Holds a reference to an ordered array of UnicodeCharRange and allows
00040 /// to quickly check if a code point is contained in the set represented by this
00041 /// array.
00042 class UnicodeCharSet {
00043 public:
00044   typedef ArrayRef<UnicodeCharRange> CharRanges;
00045 
00046   /// \brief Constructs a UnicodeCharSet instance from an array of
00047   /// UnicodeCharRanges.
00048   ///
00049   /// Array pointed by \p Ranges should have the lifetime at least as long as
00050   /// the UnicodeCharSet instance, and should not change. Array is validated by
00051   /// the constructor, so it makes sense to create as few UnicodeCharSet
00052   /// instances per each array of ranges, as possible.
00053   UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {
00054     assert(rangesAreValid());
00055   }
00056 
00057   /// \brief Returns true if the character set contains the Unicode code point
00058   /// \p C.
00059   bool contains(uint32_t C) const {
00060     return std::binary_search(Ranges.begin(), Ranges.end(), C);
00061   }
00062 
00063 private:
00064   /// \brief Returns true if each of the ranges is a proper closed range
00065   /// [min, max], and if the ranges themselves are ordered and non-overlapping.
00066   bool rangesAreValid() const {
00067     uint32_t Prev = 0;
00068     for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
00069          I != E; ++I) {
00070       if (I != Ranges.begin() && Prev >= I->Lower) {
00071         DEBUG(dbgs() << "Upper bound 0x");
00072         DEBUG(dbgs().write_hex(Prev));
00073         DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
00074         DEBUG(dbgs().write_hex(I->Lower) << "\n");
00075         return false;
00076       }
00077       if (I->Upper < I->Lower) {
00078         DEBUG(dbgs() << "Upper bound 0x");
00079         DEBUG(dbgs().write_hex(I->Lower));
00080         DEBUG(dbgs() << " should not be less than lower bound 0x");
00081         DEBUG(dbgs().write_hex(I->Upper) << "\n");
00082         return false;
00083       }
00084       Prev = I->Upper;
00085     }
00086 
00087     return true;
00088   }
00089 
00090   const CharRanges Ranges;
00091 };
00092 
00093 #undef DEBUG_TYPE // "unicode"
00094 
00095 } // namespace sys
00096 } // namespace llvm
00097 
00098 
00099 #endif // LLVM_SUPPORT_UNICODECHARRANGES_H
Generated on Thu Sep 18 2014 03:43:35 for LLVM by Doxygen