RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
// Copyright 2006 Nemanja Trifunovic

/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/


#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731

#include <iterator>

namespace utf8
{
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // The two offsets below are not used in this header.
    // NOTE(review): presumably consumed by the UTF-16 conversion code elsewhere - confirm.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. Masking makes the
    // result independent of whether the source character type is signed.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True when oc is a UTF-8 trail (continuation) octet, i.e. has the form 10xxxxxx.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    // True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp lies anywhere in the surrogate range (lead or trail).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
    }

    // Returns the length (1-4) of the UTF-8 sequence announced by the lead
    // octet *lead_it, or 0 when that octet cannot start a sequence.
    // The iterator is dereferenced unconditionally, so it must be dereferenceable.
    template <typename octet_iterator>
    inline int
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;                   // 0xxxxxxx
        else if ((lead >> 5) == 0x6)
            return 2;                   // 110xxxxx
        else if ((lead >> 4) == 0xe)
            return 3;                   // 1110xxxx
        else if ((lead >> 3) == 0x1e)
            return 4;                   // 11110xxx
        else
            return 0;                   // trail octet or otherwise invalid lead
    }

    // True when cp was encoded with more octets than necessary: code points
    // below 0x80 need 1 octet, below 0x800 need 2, below 0x10000 need 3.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    // Result codes of the decoding helpers below:
    //   UTF8_OK             - the sequence was decoded
    //   NOT_ENOUGH_ROOM     - the range ended before the sequence was complete
    //   INVALID_LEAD        - the first octet cannot start a sequence
    //   INCOMPLETE_SEQUENCE - an octet that should be a trail octet is not one
    //   OVERLONG_SEQUENCE   - the code point was encoded with too many octets
    //   INVALID_CODE_POINT  - the decoded value is above CODE_POINT_MAX or a surrogate
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // get_sequence_x functions decode a UTF-8 sequence of x octets starting at it.
    // On success it is left on the LAST octet of the sequence (not one past it)
    // and, when code_point is non-null, *code_point receives the decoded value.
    // On failure it may already have been advanced; callers that need the
    // original position must save it themselves (validate_next does).

    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        if (it != end) {
            if (code_point)
                *code_point = mask8(*it);
            return UTF8_OK;
        }
        return NOT_ENOUGH_ROOM;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    // 5 payload bits from the lead octet, 6 from the trail octet
                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);

                    if (code_point)
                        *code_point = cp;
                    ret_code = UTF8_OK;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    // 4 payload bits from the lead octet, 6 from each trail octet
                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (*it) & 0x3f;

                            if (code_point)
                                *code_point = cp;
                            ret_code = UTF8_OK;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        utf_error ret_code = NOT_ENOUGH_ROOM;

        if (it != end) {
            uint32_t cp = mask8(*it);
            if (++it != end) {
                if (is_trail(*it)) {
                    // 3 payload bits from the lead octet, 6 from each trail octet
                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
                    if (++it != end) {
                        if (is_trail(*it)) {
                            cp += (mask8(*it) << 6) & 0xfff;
                            if (++it != end) {
                                if (is_trail(*it)) {
                                    cp += (*it) & 0x3f;

                                    if (code_point)
                                        *code_point = cp;
                                    ret_code = UTF8_OK;
                                }
                                else
                                    ret_code = INCOMPLETE_SEQUENCE;
                            }
                            else
                                ret_code = NOT_ENOUGH_ROOM;
                        }
                        else
                            ret_code = INCOMPLETE_SEQUENCE;
                    }
                    else
                        ret_code = NOT_ENOUGH_ROOM;
                }
                else
                    ret_code = INCOMPLETE_SEQUENCE;
            }
            else
                ret_code = NOT_ENOUGH_ROOM;
        }

        return ret_code;
    }

    // Decodes and validates the single UTF-8 sequence starting at it.
    //
    //   it         - in/out: start of the sequence; on UTF8_OK it is advanced
    //                one past the sequence, on any error it is left unchanged
    //   end        - one past the last octet that may be read
    //   code_point - optional out: receives the decoded code point on UTF8_OK;
    //                may be null
    //
    // Returns UTF8_OK, or the utf_error describing why the sequence was rejected.
    // An empty range (it == end) yields NOT_ENOUGH_ROOM.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        // sequence_length() dereferences the iterator, so an empty range must
        // be rejected first; otherwise we would read one past the end.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        // Now that we have a valid sequence length, get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 1:
                err = get_sequence_1(it, end, &cp);
                break;
            case 2:
                err = get_sequence_2(it, end, &cp);
                break;
            case 3:
                err = get_sequence_3(it, end, &cp);
                break;
            case 4:
                err = get_sequence_4(it, end, &cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (is_code_point_valid(cp)) {
                if (!is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    if (code_point)
                        *code_point = cp;
                    // get_sequence_x leaves it on the last octet; step past it
                    ++it;
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    // Convenience overload for callers that do not need the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal

    // The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    // Returns an iterator to the first octet in [start, end) that begins an
    // invalid UTF-8 sequence, or end when the whole range is valid.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            internal::utf_error err_code = internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    // True when [start, end) consists solely of valid UTF-8 sequences.
    // An empty range is valid.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (find_invalid(start, end) == end);
    }

    // True when [it, end) begins with the three-octet UTF-8 byte order mark.
    // Each octet is read only after checking that the range has not ended.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    // Compares the octets at it against the byte order mark. No end iterator
    // is taken, so up to three octets are read without any bounds check
    // (fewer if an early octet mismatches); prefer starts_with_bom.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (internal::mask8(*it++)) == bom[0] &&
            (internal::mask8(*it++)) == bom[1] &&
            (internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8

#endif // header guard
