RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
00001 // Copyright 2006 Nemanja Trifunovic 00002 00003 /* 00004 Permission is hereby granted, free of charge, to any person or organization 00005 obtaining a copy of the software and accompanying documentation covered by 00006 this license (the "Software") to use, reproduce, display, distribute, 00007 execute, and transmit the Software, and to prepare derivative works of the 00008 Software, and to permit third-parties to whom the Software is furnished to 00009 do so, all subject to the following: 00010 00011 The copyright notices in the Software and this entire statement, including 00012 the above license grant, this restriction and the following disclaimer, 00013 must be included in all copies of the Software, in whole or in part, and 00014 all derivative works of the Software, unless such copies or derivative 00015 works are solely in the form of machine-executable object code generated by 00016 a source language processor. 00017 00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 00024 DEALINGS IN THE SOFTWARE. 00025 */ 00026 00027 00028 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00029 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00030 00031 #include "core.h" 00032 #include <stdexcept> 00033 00034 namespace utf8 00035 { 00036 // Base for the exceptions that may be thrown from the library 00037 class exception : public std::exception { 00038 }; 00039 00040 // Exceptions that may be thrown from the library functions. 00041 class invalid_code_point : public exception { 00042 uint32_t cp; 00043 public: 00044 invalid_code_point(uint32_t cp) : cp(cp) {} 00045 virtual const char* what() const throw() { return "Invalid code point"; } 00046 uint32_t code_point() const {return cp;} 00047 }; 00048 00049 class invalid_utf8 : public exception { 00050 uint8_t u8; 00051 public: 00052 invalid_utf8 (uint8_t u) : u8(u) {} 00053 virtual const char* what() const throw() { return "Invalid UTF-8"; } 00054 uint8_t utf8_octet() const {return u8;} 00055 }; 00056 00057 class invalid_utf16 : public exception { 00058 uint16_t u16; 00059 public: 00060 invalid_utf16 (uint16_t u) : u16(u) {} 00061 virtual const char* what() const throw() { return "Invalid UTF-16"; } 00062 uint16_t utf16_word() const {return u16;} 00063 }; 00064 00065 class not_enough_room : public exception { 00066 public: 00067 virtual const char* what() const throw() { return "Not enough space"; } 00068 }; 00069 00071 00072 template <typename octet_iterator, typename output_iterator> 00073 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 00074 { 00075 while (start != end) { 00076 octet_iterator sequence_start = start; 00077 internal::utf_error err_code = internal::validate_next(start, end); 00078 switch (err_code) { 00079 case internal::UTF8_OK : 00080 for (octet_iterator it = sequence_start; it != start; ++it) 00081 *out++ = *it; 00082 break; 00083 case internal::NOT_ENOUGH_ROOM: 00084 throw not_enough_room(); 00085 case internal::INVALID_LEAD: 00086 append (replacement, out); 00087 ++start; 00088 break; 00089 case internal::INCOMPLETE_SEQUENCE: 00090 case internal::OVERLONG_SEQUENCE: 00091 case internal::INVALID_CODE_POINT: 00092 append (replacement, out); 00093 ++start; 00094 // just one replacement mark for the sequence 00095 while (internal::is_trail(*start) && start != end) 00096 ++start; 00097 break; 00098 } 00099 } 00100 return out; 00101 } 00102 00103 template <typename octet_iterator, typename output_iterator> 00104 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 00105 { 00106 static const uint32_t replacement_marker = internal::mask16(0xfffd); 00107 return replace_invalid(start, end, out, replacement_marker); 00108 } 00109 00110 template <typename octet_iterator> 00111 octet_iterator append(uint32_t cp, octet_iterator result) 00112 { 00113 if (!internal::is_code_point_valid(cp)) 00114 throw invalid_code_point(cp); 00115 00116 if (cp < 0x80) // one octet 00117 *(result++) = static_cast<uint8_t>(cp); 00118 else if (cp < 0x800) { // two octets 00119 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); 00120 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00121 } 00122 else if (cp < 0x10000) { // three octets 00123 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); 00124 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); 00125 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00126 } 00127 else { // four octets 00128 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); 00129 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); 00130 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); 00131 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00132 } 00133 return result; 00134 } 00135 00136 template <typename octet_iterator> 00137 uint32_t next(octet_iterator& it, octet_iterator end) 00138 { 00139 uint32_t cp = 0; 00140 internal::utf_error err_code = internal::validate_next(it, end, &cp); 00141 switch (err_code) { 00142 case internal::UTF8_OK : 00143 break; 00144 case internal::NOT_ENOUGH_ROOM : 00145 throw not_enough_room(); 00146 case internal::INVALID_LEAD : 00147 case internal::INCOMPLETE_SEQUENCE : 00148 case internal::OVERLONG_SEQUENCE : 00149 throw invalid_utf8(*it); 00150 case internal::INVALID_CODE_POINT : 00151 throw invalid_code_point(cp); 00152 } 00153 return cp; 00154 } 00155 00156 template <typename octet_iterator> 00157 uint32_t peek_next(octet_iterator it, octet_iterator end) 00158 { 00159 return next(it, end); 00160 } 00161 00162 template <typename octet_iterator> 00163 uint32_t prior(octet_iterator& it, octet_iterator start) 00164 { 00165 // can't do much if it == start 00166 if (it == start) 00167 throw not_enough_room(); 00168 00169 octet_iterator end = it; 00170 // Go back until we hit either a lead octet or start 00171 while (internal::is_trail(*(--it))) 00172 if (it == start) 00173 throw invalid_utf8(*it); // error - no lead byte in the sequence 00174 return peek_next(it, end); 00175 } 00176 00178 template <typename octet_iterator> 00179 uint32_t previous(octet_iterator& it, octet_iterator pass_start) 00180 { 00181 octet_iterator end = it; 00182 while (internal::is_trail(*(--it))) 00183 if (it == pass_start) 00184 throw invalid_utf8(*it); // error - no lead byte in the sequence 00185 octet_iterator temp = it; 00186 return next(temp, end); 00187 } 00188 00189 template <typename octet_iterator, typename distance_type> 00190 void advance (octet_iterator& it, distance_type n, octet_iterator end) 00191 { 00192 for (distance_type i = 0; i < n; ++i) 00193 next(it, end); 00194 } 00195 00196 template <typename octet_iterator> 00197 typename std::iterator_traits<octet_iterator>::difference_type 00198 distance (octet_iterator first, octet_iterator last) 00199 { 00200 typename std::iterator_traits<octet_iterator>::difference_type dist; 00201 for (dist = 0; first < last; ++dist) 00202 next(first, last); 00203 return dist; 00204 } 00205 00206 template <typename u16bit_iterator, typename octet_iterator> 00207 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 00208 { 00209 while (start != end) { 00210 uint32_t cp = internal::mask16(*start++); 00211 // Take care of surrogate pairs first 00212 if (internal::is_lead_surrogate(cp)) { 00213 if (start != end) { 00214 uint32_t trail_surrogate = internal::mask16(*start++); 00215 if (internal::is_trail_surrogate(trail_surrogate)) 00216 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 00217 else 00218 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); 00219 } 00220 else 00221 throw invalid_utf16(static_cast<uint16_t>(cp)); 00222 00223 } 00224 // Lone trail surrogate 00225 else if (internal::is_trail_surrogate(cp)) 00226 throw invalid_utf16(static_cast<uint16_t>(cp)); 00227 00228 result = append(cp, result); 00229 } 00230 return result; 00231 } 00232 00233 template <typename u16bit_iterator, typename octet_iterator> 00234 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 00235 { 00236 while (start != end) { 00237 uint32_t cp = next(start, end); 00238 if (cp > 0xffff) { //make a surrogate pair 00239 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); 00240 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 00241 } 00242 else 00243 *result++ = static_cast<uint16_t>(cp); 00244 } 00245 return result; 00246 } 00247 00248 template <typename octet_iterator, typename u32bit_iterator> 00249 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 00250 { 00251 while (start != end) 00252 result = append(*(start++), result); 00253 00254 return result; 00255 } 00256 00257 template <typename octet_iterator, typename u32bit_iterator> 00258 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 00259 { 00260 while (start != end) 00261 (*result++) = next(start, end); 00262 00263 return result; 00264 } 00265 00266 // The iterator class 00267 template <typename octet_iterator> 00268 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 00269 octet_iterator it; 00270 octet_iterator range_start; 00271 octet_iterator range_end; 00272 public: 00273 iterator () {}; 00274 explicit iterator (const octet_iterator& octet_it, 00275 const octet_iterator& range_start, 00276 const octet_iterator& range_end) : 00277 it(octet_it), range_start(range_start), range_end(range_end) 00278 { 00279 if (it < range_start || it > range_end) 00280 throw std::out_of_range("Invalid utf-8 iterator position"); 00281 } 00282 // the default "big three" are OK 00283 octet_iterator base () const { return it; } 00284 uint32_t operator * () const 00285 { 00286 octet_iterator temp = it; 00287 return next(temp, range_end); 00288 } 00289 bool operator == (const iterator& rhs) const 00290 { 00291 if (range_start != rhs.range_start || range_end != rhs.range_end) 00292 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 00293 return (it == rhs.it); 00294 } 00295 bool operator != (const iterator& rhs) const 00296 { 00297 return !(operator == (rhs)); 00298 } 00299 iterator& operator ++ () 00300 { 00301 next(it, range_end); 00302 return *this; 00303 } 00304 iterator operator ++ (int) 00305 { 00306 iterator temp = *this; 00307 next(it, range_end); 00308 return temp; 00309 } 00310 iterator& operator -- () 00311 { 00312 prior(it, range_start); 00313 return *this; 00314 } 00315 iterator operator -- (int) 00316 { 00317 iterator temp = *this; 00318 prior(it, range_start); 00319 return temp; 00320 } 00321 }; // class iterator 00322 00323 } // namespace utf8 00324 00325 #endif //header guard 00326 00327