RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
00001 /* id.cc 00002 Jeremy Barnes, 17 February 2012 00003 Copyright (c) 2012 Datacratic. All rights reserved. 00004 00005 */ 00006 00007 #include "id.h" 00008 #include "jml/arch/bit_range_ops.h" 00009 #include "jml/arch/format.h" 00010 #include "jml/arch/exception.h" 00011 #include "jml/db/persistent.h" 00012 #include "jml/utils/exc_assert.h" 00013 #include "soa/jsoncpp/value.h" 00014 00015 using namespace ML; 00016 using namespace std; 00017 00018 00019 namespace Datacratic { 00020 00021 00022 /*****************************************************************************/ 00023 /* ID */ 00024 /*****************************************************************************/ 00025 00026 static const signed char hexToDecLookups[128] = { 00027 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00028 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00029 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00030 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, 00031 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00032 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00033 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00034 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 00035 }; 00036 00037 JML_ALWAYS_INLINE int hexToDec(unsigned c) JML_CONST_FN; 00038 00039 JML_ALWAYS_INLINE int hexToDec(unsigned c) 00040 { 00041 //cerr << "c = " << (char)c 00042 // << " index " << (c & 0x7f) << " value = " 00043 // << (int)lookups[c & 0x7f] << endl; 00044 int mask = (c <= 0x7f); 00045 return hexToDecLookups[c & 0x7f] * mask - 1 + mask; 00046 } 00047 00048 // Not quite base64... these are re-arranged so that their ASCII order 00049 // corresponds to their integer value so they sort uniformly 00050 static const signed char base64ToDecLookups[128] = { 00051 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00052 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 00053 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, 1, 00054 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, 00055 00056 -1, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 00057 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1, -1, -1, -1, -1, 00058 -1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 00059 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, -1, -1, -1, -1, -1, 00060 }; 00061 00062 JML_ALWAYS_INLINE int base64ToDec(unsigned c) 00063 { 00064 int mask = (c <= 0x7f); 00065 return base64ToDecLookups[c & 0x7f] * mask - 1 + mask; 00066 } 00067 00068 inline int hexToDec3(int c) 00069 { 00070 int d = c & 0x1f; 00071 int i = (c & 0x60) >> 5; 00072 d += (i == 1) * -16; 00073 d += (i >= 2) * 9; 00074 bool h = __builtin_isxdigit(c); 00075 return h * d - (!h); 00076 } 00077 00078 inline int hexToDec2(int c) 00079 { 00080 int v; 00081 00082 if (c >= '0' && c <= '9') 00083 v = c - '0'; 00084 else if (c >= 'a' && c <= 'f') 00085 v = c + 10 - 'a'; 00086 else if (c >= 'A' && c <= 'F') 00087 v = c + 10 - 'A'; 00088 else 00089 v = -1; 00090 00091 return v; 00092 } 00093 00094 void 00095 Id:: 00096 parse(const std::string & value, Type type) 00097 { 00098 Id r; 00099 00100 auto finish = [&] () 00101 { 00102 //if (r.toString() != value) 00103 // throw ML::Exception("Id::parse() modified an Id: input " + value 00104 // + " output " + r.toString()); 00105 if (r.type != type && type != UNKNOWN) 00106 throw ML::Exception("Id::parse() changed type from %d to %d parsing %s", 00107 r.type, type, value.c_str()); 00108 00109 *this = r; 00110 }; 00111 00112 if ((type == UNKNOWN || type == NONE) && value.empty()) { 00113 r.type = NONE; 00114 r.val1 = r.val2 = 0; 00115 finish(); 00116 return; 00117 } 00118 00119 if ((type == UNKNOWN || type == NULLID) && value.length() == 4 && value == "null") { 00120 //throw ML::Exception("null id"); 00121 r.type = NULLID; 00122 r.val1 = r.val2 = 0; 00123 finish(); 00124 return; 00125 } 00126 00127 if ((type == UNKNOWN || type == BIGDEC) && (value.size() == 1 && value[0] == '0')) { 00128 r.type = BIGDEC; 00129 r.val = 0; 00130 finish(); 00131 return; 00132 } 00133 00134 while ((type == UNKNOWN ||type == UUID) && value.length() == 36) { 00135 // not really a while... 00136 // Try a uuid 00137 // AGID: --> 0828398c-5965-11e0-84c8-0026b937c8e1 00138 00139 if (value[8] != '-') break; 00140 if (value[13] != '-') break; 00141 if (value[18] != '-') break; 00142 if (value[23] != '-') break; 00143 00144 unsigned f1; 00145 short f2, f3, f4; 00146 unsigned long long f5; 00147 00148 const char * p = value.c_str(); 00149 bool failed = false; 00150 00151 auto scanRange = [&] (int start, int len) -> unsigned long long 00152 { 00153 unsigned long long val = 0; 00154 for (unsigned i = start; i != start + len; ++i) { 00155 int c = p[i]; 00156 int v = hexToDec(c); 00157 if (v == -1) { 00158 failed = true; 00159 return val; 00160 } 00161 val = (val << 4) + v; 00162 } 00163 return val; 00164 }; 00165 00166 f1 = scanRange(0, 8); 00167 f2 = scanRange(9, 4); 00168 f3 = scanRange(14, 4); 00169 if (failed) break; 00170 f4 = scanRange(19, 4); 00171 f5 = scanRange(24, 12); 00172 if (failed) break; 00173 00174 r.type = UUID; 00175 r.f1 = f1; r.f2 = f2; r.f3 = f3; r.f4 = f4; r.f5 = f5; 00176 //r.val1 = ((uint64_t)f1 << 32) | ((uint64_t)f2 << 16) | f3; 00177 //r.val2 = ((uint64_t)f4 << 48) | f5; 00178 finish(); 00179 return; 00180 } 00181 00182 if ((type == UNKNOWN || type == GOOG128) 00183 && value.length() == 26 && value[0] == 'C' && value[1] == 'A' 00184 && value[2] == 'E' && value[3] == 'S' && value[4] == 'E') { 00185 00186 // Google ID: --> CAESEAYra3NIxLT9C8twKrzqaA 00187 00188 __uint128_t res = 0; 00189 00190 auto b64Decode = [] (int c) -> int 00191 { 00192 if (c >= '0' && c <= '9') 00193 return c - '0'; 00194 else if (c >= 'A' && c <= 'Z') 00195 return 10 + c - 'A'; 00196 else if (c >= 'a' && c <= 'z') 00197 return 36 + c - 'a'; 00198 else if (c == '-') 00199 return 62; 00200 else if (c == '_') 00201 return 63; 00202 else return -1; 00203 }; 00204 00205 bool error = false; 00206 for (unsigned i = 5; i < 26 && !error; ++i) { 00207 int v = b64Decode(value[i]); 00208 if (v == -1) error = true; 00209 res = (res << 6) | v; 00210 } 00211 00212 if (!error) { 00213 r.type = GOOG128; 00214 r.val = res; 00215 finish(); 00216 return; 00217 } 00218 } 00219 00220 if ((type == UNKNOWN || type == BIGDEC) 00221 && value[0] != '0' && value.length() < 32 /* TODO: better condition */) { 00222 // Try a big integer 00223 //ANID: --> 7394206091425759590 00224 __uint128_t res = 0; 00225 bool error = false; 00226 for (unsigned i = 0; i < value.size(); ++i) { 00227 if (!isdigit(value[i])) error = true; 00228 res = 10 * res + value[i] - '0'; 00229 } 00230 00231 if (!error) { 00232 r.type = BIGDEC; 00233 r.val = res; 00234 finish(); 00235 return; 00236 } 00237 } 00238 00239 if ((type == UNKNOWN || type == BASE64_96) && value.length() == 16) { 00240 auto scanRange = [&] (const char * p, size_t l) -> int64_t 00241 { 00242 uint64_t res = 0; 00243 for (unsigned i = 0; i < l; ++i) { 00244 int c = base64ToDec(p[i]); 00245 if (c == -1) return -1; 00246 res = res << 6 | c; 00247 } 00248 return res; 00249 }; 00250 00251 int64_t high = scanRange(value.c_str(), 8); 00252 int64_t low = scanRange(value.c_str() + 8, 8); 00253 00254 if (low != -1 && high != -1) { 00255 __int128_t val = high; 00256 val <<= 48; 00257 val |= low; 00258 00259 r.type = BASE64_96; 00260 r.val = val; 00261 finish(); 00262 return; 00263 } 00264 } 00265 00266 // Fall back to string 00267 r.type = STR; 00268 r.len = value.size(); 00269 char * s = new char[r.len]; 00270 r.str = s; 00271 r.ownstr = true; 00272 std::copy(value.c_str(), value.c_str() + value.size(), s); 00273 finish(); 00274 return; 00275 } 00276 00277 const Id & 00278 Id:: 00279 compoundId1() const 00280 { 00281 ExcAssertEqual(type, COMPOUND2); 00282 return *cmp1; 00283 } 00284 00285 const Id & 00286 Id:: 00287 compoundId2() const 00288 { 00289 ExcAssertEqual(type, COMPOUND2); 00290 return *cmp2; 00291 } 00292 00293 00294 std::string 00295 Id:: 00296 toString() const 00297 { 00298 switch (type) { 00299 case NONE: 00300 return ""; 00301 case NULLID: 00302 return "null"; 00303 case UUID: 00304 // AGID: --> 0828398c-5965-11e0-84c8-0026b937c8e1 00305 return ML::format("%08lx-%04x-%04x-%04x-%012llx", 00306 (unsigned)f1, (unsigned)f2, (unsigned)f3, (unsigned)f4, 00307 (unsigned long long)f5); 00308 case GOOG128: { 00309 // Google ID: --> CAESEAYra3NIxLT9C8twKrzqaA 00310 string result = "CAESE "; 00311 00312 auto b64Encode = [] (unsigned i) -> int 00313 { 00314 if (i < 10) return i + '0'; 00315 if (i < 36) return i - 10 + 'A'; 00316 if (i < 62) return i - 36 + 'a'; 00317 if (i == 62) return '-'; 00318 if (i == 63) return '_'; 00319 throw ML::Exception("bad goog base64 char"); 00320 }; 00321 00322 __uint128_t v = val; 00323 for (unsigned i = 0; i < 21; ++i) { 00324 result[25 - i] = b64Encode(v & 63); v = v >> 6; 00325 } 00326 return result; 00327 } 00328 case BIGDEC: { 00329 string result; 00330 __uint128_t v = val; 00331 if (v == 0) return "0"; 00332 while (v) { 00333 int c = v % 10; 00334 v /= 10; 00335 result += c + '0'; 00336 } 00337 std::reverse(result.begin(), result.end()); 00338 return result; 00339 } 00340 case BASE64_96: { 00341 string result = " "; 00342 00343 auto b64Encode = [] (unsigned i) -> int 00344 { 00345 if (i == 0) return '+'; 00346 if (i == 1) return '/'; 00347 if (i < 12) return '0' + i - 2; 00348 if (i < 38) return 'A' + i - 12; 00349 if (i < 64) return 'a' + i - 38; 00350 throw ML::Exception("bad base64 char"); 00351 }; 00352 00353 __uint128_t v = val; 00354 for (unsigned i = 0; i < 16; ++i) { 00355 result[15 - i] = b64Encode(v & 63); v = v >> 6; 00356 } 00357 return result; 00358 } 00359 case COMPOUND2: 00360 return compoundId1().toString() + ":" + compoundId2().toString(); 00361 case STR: 00362 return std::string(str, str + len); 00363 default: 00364 throw ML::Exception("unknown ID type"); 00365 } 00366 } 00367 00368 bool 00369 Id:: 00370 complexEqual(const Id & other) const 00371 { 00372 if (type == STR) 00373 return len == other.len && (str == other.str || std::equal(str, str + len, other.str)); 00374 else if (type == COMPOUND2) { 00375 return compoundId1() == other.compoundId1() 00376 && compoundId2() == other.compoundId2(); 00377 } 00378 else throw ML::Exception("unknown Id type"); 00379 } 00380 00381 bool 00382 Id:: 00383 complexLess(const Id & other) const 00384 { 00385 if (type == STR) 00386 return std::lexicographical_compare(str, str + len, 00387 other.str, other.str + other.len); 00388 else if (type == COMPOUND2) { 00389 return ML::less_all(compoundId1(), other.compoundId1(), 00390 compoundId2(), other.compoundId2()); 00391 } 00392 else throw ML::Exception("unknown Id type"); 00393 } 00394 00395 uint64_t 00396 Id:: 00397 complexHash() const 00398 { 00399 if (type == STR) 00400 return CityHash64(str, len); 00401 else if (type == COMPOUND2) { 00402 return Hash128to64(make_pair(compoundId1().hash(), 00403 compoundId2().hash())); 00404 } 00405 //else if (type == CUSTOM) 00406 // return controlFn(CF_HASH, data); 00407 else throw ML::Exception("unknown Id type"); 00408 } 00409 00410 void 00411 Id:: 00412 complexDestroy() 00413 { 00414 if (type < STR) return; 00415 if (type == STR) { 00416 if (ownstr) delete[] str; 00417 str = 0; 00418 ownstr = false; 00419 } 00420 else if (type == COMPOUND2) { 00421 delete cmp1; 00422 delete cmp2; 00423 cmp1 = cmp2 = 0; 00424 } 00425 //else if (type == CUSTOM) 00426 // controlFn(CF_DESTROY, data); 00427 else throw ML::Exception("unknown Id type"); 00428 } 00429 00430 void 00431 Id:: 00432 complexFinishCopy() 00433 { 00434 if (type == STR) { 00435 if (!ownstr) return; 00436 const char * oldStr = str; 00437 char * s = new char[len]; 00438 str = s; 00439 std::copy(oldStr, oldStr + len, s); 00440 } 00441 else if (type == COMPOUND2) { 00442 //cerr << "cmp1 = " << cmp1 << " cmp2 = " << cmp2 << " type = " 00443 // << (int)type << endl; 00444 cmp1 = new Id(compoundId1()); 00445 cmp2 = new Id(compoundId2()); 00446 } 00447 //else if (type == CUSTOM) 00448 // data = (void *)controlFn(CF_COPY, data); 00449 else throw ML::Exception("unknown Id type"); 00450 } 00451 00452 void 00453 Id:: 00454 serialize(ML::DB::Store_Writer & store) const 00455 { 00456 store << (char)1 << (char)type; 00457 //cerr << "after header: " << store.offset() << endl; 00458 switch (type) { 00459 case NONE: break; 00460 case NULLID: break; 00461 case UUID: 00462 case GOOG128: 00463 case BIGDEC: 00464 store.save_binary(&val1, 8); 00465 store.save_binary(&val2, 8); 00466 break; 00467 case BASE64_96: 00468 store.save_binary(&val1, 8); 00469 store.save_binary(&val2, 4); 00470 break; 00471 case STR: 00472 store << string(str, str + len); 00473 break; 00474 case COMPOUND2: 00475 compoundId1().serialize(store); 00476 compoundId2().serialize(store); 00477 break; 00478 default: 00479 throw ML::Exception("unknown Id type"); 00480 } 00481 00482 //cerr << "after value: " << store.offset() << endl; 00483 } 00484 00485 void 00486 Id:: 00487 reconstitute(ML::DB::Store_Reader & store) 00488 { 00489 Id r; 00490 00491 char v, tp; 00492 store >> v; 00493 if (v < 0 || v > 1) 00494 throw ML::Exception("unknown Id version reconstituting"); 00495 store >> tp; 00496 r.type = tp; 00497 00498 // Fix up from earlier reconstitution version 00499 if (v == 0 && tp == 5) 00500 r.type = STR; 00501 00502 if (v == 0) { 00503 // old domain field; no longer used 00504 int d; 00505 store >> d; 00506 //r.domain = d; 00507 } 00508 00509 //cerr << "reading after header: " << store.offset() << endl; 00510 //cerr << "type = " << (int)r.type << endl; 00511 00512 switch (r.type) { 00513 case NONE: break; 00514 case NULLID: break; 00515 case UUID: 00516 case GOOG128: 00517 case BIGDEC: { 00518 store.load_binary(&r.val1, 8); 00519 store.load_binary(&r.val2, 8); 00520 break; 00521 } 00522 case BASE64_96: { 00523 store.load_binary(&r.val1, 8); 00524 store.load_binary(&r.val2, 4); 00525 break; 00526 } 00527 case STR: { 00528 std::string s; 00529 store >> s; 00530 r.len = s.size(); 00531 r.ownstr = true; 00532 char * s2 = new char[s.size()]; 00533 r.str = s2; 00534 std::copy(s.begin(), s.end(), s2); 00535 break; 00536 } 00537 case COMPOUND2: { 00538 unique_ptr<Id> id1(new Id()), id2(new Id()); 00539 store >> *id1 >> *id2; 00540 r.cmp1 = id1.release(); 00541 r.cmp2 = id2.release(); 00542 break; 00543 } 00544 default: 00545 throw ML::Exception("unknown Id type %d reconstituting", 00546 tp); 00547 } 00548 00549 //cerr << "reading after value: " << store.offset() << endl; 00550 //cerr << "reconstituted " << r << endl; 00551 00552 *this = std::move(r); 00553 } 00554 00555 Json::Value 00556 Id:: 00557 toJson() const 00558 { 00559 if (notNull()) 00560 return toString(); 00561 else return Json::Value(); 00562 } 00563 00564 Id 00565 Id:: 00566 fromJson(const Json::Value & val) 00567 { 00568 if (val.isInt()) 00569 return Id(val.asInt()); 00570 00571 else if (val.isUInt()) 00572 return Id(val.asUInt()); 00573 00574 else if (val.isNull()) 00575 return Id(); 00576 00577 else return Id(val.asString()); 00578 } 00579 00580 } // namespace Datacratic