RTBKit  0.9
Open-source framework to create real-time ad bidding systems.
soa/utf8cpp/source/utf8/core.h
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 
00028 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030 
00031 #include <iterator>
00032 
00033 namespace utf8
00034 {
// Unsigned integer aliases for 8-bit, 16-bit and 32-bit quantities.
// The names intentionally match the ones provided by cstdint / boost/cstdint.
// Adjust the underlying types if they do not have these widths on your system.
typedef unsigned char      uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int       uint32_t;
00041 
00042 // Helper code - not intended to be directly called by the library users. May be changed at any time
00043 namespace internal
00044 {
// Unicode constants.
//
// UTF-16 surrogate ranges:
//   leading  (high) surrogates: 0xd800 - 0xdbff
//   trailing (low)  surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN  = 0xD800u;
const uint16_t LEAD_SURROGATE_MAX  = 0xDBFFu;
const uint16_t TRAIL_SURROGATE_MIN = 0xDC00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xDFFFu;

// Offsets derived from the surrogate ranges above. They are not used in this
// header; presumably they serve surrogate-pair arithmetic elsewhere in the
// library (confirm against the callers).
// SURROGATE_OFFSET relies on unsigned wrap-around of the 32-bit subtraction.
const uint16_t LEAD_OFFSET      = LEAD_SURROGATE_MIN - (0x10000 >> 10);
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Largest value that is a valid Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010FFFFu;
00057 
// Reduces `oc` to its low 8 bits and returns them as an unsigned octet.
// This keeps (possibly signed) char values from sign-extending in later arithmetic.
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    const uint8_t low_octet = static_cast<uint8_t>(0xff & oc);
    return low_octet;
}
// Reduces `oc` to its low 16 bits and returns them as an unsigned 16-bit value.
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    const uint16_t low_word = static_cast<uint16_t>(0xffff & oc);
    return low_word;
}
// Returns true when `oc` is a UTF-8 trail (continuation) octet, i.e. its low
// 8 bits have the form 10xxxxxx.
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
    // Same masking as mask8(): look only at the low 8 bits of the value.
    const uint8_t octet = static_cast<uint8_t>(0xff & oc);
    // Top two bits must be exactly "10".
    return (octet & 0xc0) == 0x80;
}
00073 
00074     template <typename u16>
00075     inline bool is_lead_surrogate(u16 cp)
00076     {
00077         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
00078     }
00079 
00080     template <typename u16>
00081     inline bool is_trail_surrogate(u16 cp)
00082     {
00083         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
00084     }
00085 
00086     template <typename u16>
00087     inline bool is_surrogate(u16 cp)
00088     {
00089         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
00090     }
00091 
00092     template <typename u32>
00093     inline bool is_code_point_valid(u32 cp)
00094     {
00095         return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
00096     }
00097 
// Returns the number of octets (1 to 4) announced by the lead octet that
// `lead_it` points at, or 0 when that octet cannot start a sequence.
//
//   0xxxxxxx -> 1
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
//   anything else (10xxxxxx trail octets, 11111xxx) -> 0
//
// `lead_it` is dereferenced unconditionally, so it must be dereferenceable.
template <typename octet_iterator>
inline int
sequence_length(octet_iterator lead_it)
{
    // Work on the low 8 bits only, as an unsigned value.
    const uint8_t lead = static_cast<uint8_t>(0xff & *lead_it);

    if (lead < 0x80)
        return 1;
    if ((lead & 0xe0) == 0xc0)
        return 2;
    if ((lead & 0xf0) == 0xe0)
        return 3;
    if ((lead & 0xf8) == 0xf0)
        return 4;
    return 0;
}
00114 
// Returns true when code point `cp` was encoded with a different number of
// octets (`length`) than the size class it falls into:
//   cp < 0x80    -> expects 1 octet
//   cp < 0x800   -> expects 2 octets
//   cp < 0x10000 -> expects 3 octets
// Code points of 0x10000 and above are never reported as overlong here.
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
    if (cp < 0x80)
        return length != 1;
    if (cp < 0x800)
        return length != 2;
    if (cp < 0x10000)
        return length != 3;
    return false;
}
00133 
// Result codes produced by the decoding / validation helpers in this namespace.
enum utf_error {
    UTF8_OK,              // a sequence was decoded and passed every check
    NOT_ENOUGH_ROOM,      // the input range ended before the sequence was complete
    INVALID_LEAD,         // the first octet cannot start a sequence
    INCOMPLETE_SEQUENCE,  // a trail octet (10xxxxxx) was expected but not found
    OVERLONG_SEQUENCE,    // the code point was encoded with more octets than needed
    INVALID_CODE_POINT    // decoded value is above CODE_POINT_MAX or is a surrogate
};
00135 
00137 
00138     template <typename octet_iterator>
00139     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00140     {
00141         if (it != end) {
00142             if (code_point)
00143                 *code_point = mask8(*it);
00144             return UTF8_OK;
00145         }
00146         return NOT_ENOUGH_ROOM;
00147     }
00148 
00149     template <typename octet_iterator>
00150     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00151     {
00152         utf_error ret_code = NOT_ENOUGH_ROOM;
00153 
00154         if (it != end) {
00155             uint32_t cp = mask8(*it);
00156             if (++it != end) {
00157                 if (is_trail(*it)) {
00158                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
00159 
00160                     if (code_point)
00161                         *code_point = cp;
00162                     ret_code = UTF8_OK;
00163                 }
00164                 else
00165                     ret_code = INCOMPLETE_SEQUENCE;
00166             }
00167             else
00168                 ret_code = NOT_ENOUGH_ROOM;
00169         }
00170 
00171         return ret_code;
00172     }
00173 
00174     template <typename octet_iterator>
00175     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00176     {
00177         utf_error ret_code = NOT_ENOUGH_ROOM;
00178 
00179         if (it != end) {
00180             uint32_t cp = mask8(*it);
00181             if (++it != end) {
00182                 if (is_trail(*it)) {
00183                     cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
00184                     if (++it != end) {
00185                         if (is_trail(*it)) {
00186                             cp += (*it) & 0x3f;
00187 
00188                             if (code_point)
00189                                 *code_point = cp;
00190                             ret_code = UTF8_OK;
00191                         }
00192                         else 
00193                             ret_code = INCOMPLETE_SEQUENCE;
00194                     }
00195                     else
00196                         ret_code = NOT_ENOUGH_ROOM;
00197                 }
00198                 else
00199                     ret_code = INCOMPLETE_SEQUENCE;
00200             }
00201             else
00202                 ret_code = NOT_ENOUGH_ROOM;
00203         }
00204 
00205         return ret_code;
00206     }
00207 
00208     template <typename octet_iterator>
00209     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00210     {
00211         utf_error ret_code = NOT_ENOUGH_ROOM;
00212 
00213         if (it != end) {
00214             uint32_t cp = mask8(*it);
00215             if (++it != end) {
00216                 if (is_trail(*it)) {
00217                     cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
00218                     if (++it != end) {
00219                         if (is_trail(*it)) {
00220                             cp += (mask8(*it) << 6) & 0xfff;
00221                             if (++it != end) {
00222                                 if (is_trail(*it)) {
00223                                     cp += (*it) & 0x3f;
00224 
00225                                     if (code_point)
00226                                         *code_point = cp;
00227                                     ret_code = UTF8_OK;
00228                                 }
00229                                 else
00230                                     ret_code = INCOMPLETE_SEQUENCE;
00231                             }
00232                             else
00233                                 ret_code = NOT_ENOUGH_ROOM;
00234                         }
00235                         else
00236                             ret_code = INCOMPLETE_SEQUENCE;
00237                     }
00238                     else
00239                         ret_code = NOT_ENOUGH_ROOM;
00240                 }
00241                 else 
00242                     ret_code = INCOMPLETE_SEQUENCE;
00243             }
00244             else
00245                 ret_code = NOT_ENOUGH_ROOM;
00246         }
00247 
00248         return ret_code;
00249     }
00250 
00251     template <typename octet_iterator>
00252     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
00253     {
00254         // Save the original value of it so we can go back in case of failure
00255         // Of course, it does not make much sense with i.e. stream iterators
00256         octet_iterator original_it = it;
00257 
00258         uint32_t cp = 0;
00259         // Determine the sequence length based on the lead octet
00260         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
00261         octet_difference_type length = sequence_length(it);
00262         if (length == 0)
00263             return INVALID_LEAD;
00264 
00265         // Now that we have a valid sequence length, get trail octets and calculate the code point
00266         utf_error err = UTF8_OK;
00267         switch (length) {
00268             case 1:
00269                 err = get_sequence_1(it, end, &cp);
00270                 break;
00271             case 2:
00272                 err = get_sequence_2(it, end, &cp);
00273             break;
00274             case 3:
00275                 err = get_sequence_3(it, end, &cp);
00276             break;
00277             case 4:
00278                 err = get_sequence_4(it, end, &cp);
00279             break;
00280         }
00281 
00282         if (err == UTF8_OK) {
00283             // Decoding succeeded. Now, security checks...
00284             if (is_code_point_valid(cp)) {
00285                 if (!is_overlong_sequence(cp, length)){
00286                     // Passed! Return here.
00287                     if (code_point)
00288                         *code_point = cp;
00289                     ++it;
00290                     return UTF8_OK;
00291                 }
00292                 else
00293                     err = OVERLONG_SEQUENCE;
00294             }
00295             else 
00296                 err = INVALID_CODE_POINT;
00297         }
00298 
00299         // Failure branch - restore the original value of the iterator
00300         it = original_it;
00301         return err;
00302     }
00303 
00304     template <typename octet_iterator>
00305     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
00306         return validate_next(it, end, 0);
00307     }
00308 
00309 } // namespace internal
00310 
00312 
// The three octets of the UTF-8 byte order mark
const uint8_t bom[] = {0xEF, 0xBB, 0xBF};
00315 
00316     template <typename octet_iterator>
00317     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00318     {
00319         octet_iterator result = start;
00320         while (result != end) {
00321             internal::utf_error err_code = internal::validate_next(result, end);
00322             if (err_code != internal::UTF8_OK)
00323                 return result;
00324         }
00325         return result;
00326     }
00327 
// Returns true when the whole octet range [start, end) validates, i.e.
// find_invalid() runs to `end` without reporting a bad sequence.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    octet_iterator first_bad = find_invalid(start, end);
    return first_bad == end;
}
00333 
00334     template <typename octet_iterator>
00335     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
00336     {
00337         return (
00338             ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
00339             ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
00340             ((it != end) && (internal::mask8(*it))   == bom[2])
00341            );
00342     }
00343     
00344     //Deprecated in release 2.3 
00345     template <typename octet_iterator>
00346     inline bool is_bom (octet_iterator it)
00347     {
00348         return (
00349             (internal::mask8(*it++)) == bom[0] &&
00350             (internal::mask8(*it++)) == bom[1] &&
00351             (internal::mask8(*it))   == bom[2]
00352            );
00353     }
00354 } // namespace utf8
00355 
00356 #endif // header guard
00357 
00358 
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator