RTBKit  0.9
Open-source framework to create real-time ad bidding systems.
soa/types/id.cc
00001 /* id.cc
00002    Jeremy Barnes, 17 February 2012
00003    Copyright (c) 2012 Datacratic.  All rights reserved.
00004 
00005 */
00006 
00007 #include "id.h"
00008 #include "jml/arch/bit_range_ops.h"
00009 #include "jml/arch/format.h"
00010 #include "jml/arch/exception.h"
00011 #include "jml/db/persistent.h"
00012 #include "jml/utils/exc_assert.h"
00013 #include "soa/jsoncpp/value.h"
00014 
00015 using namespace ML;
00016 using namespace std;
00017 
00018 
00019 namespace Datacratic {
00020 
00021 
00022 /*****************************************************************************/
00023 /* ID                                                                        */
00024 /*****************************************************************************/
00025 
00026 static const signed char hexToDecLookups[128] = {
00027     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00028     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00029     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00030      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
00031     -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00032     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00033     -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00034     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
00035 };
00036 
00037 JML_ALWAYS_INLINE int hexToDec(unsigned c) JML_CONST_FN;
00038 
00039 JML_ALWAYS_INLINE int hexToDec(unsigned c)
00040 {
00041     //cerr << "c = " << (char)c
00042     //     << " index " << (c & 0x7f) << " value = "
00043     //     << (int)lookups[c & 0x7f] << endl;
00044     int mask = (c <= 0x7f);
00045     return hexToDecLookups[c & 0x7f] * mask - 1 + mask;
00046 }
00047 
00048 // Not quite base64... these are re-arranged so that their ASCII order
00049 // corresponds to their integer value so they sort uniformly
00050 static const signed char base64ToDecLookups[128] = {
00051     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00052     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
00053     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1,  1,
00054      2,  3,  4,  5,  6,  7,  8,  9, 10, 11, -1, -1, -1, -1, -1, -1,
00055 
00056     -1, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
00057     27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1, -1, -1, -1, -1,
00058     -1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
00059     53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, -1, -1, -1, -1, -1,
00060 };
00061 
00062 JML_ALWAYS_INLINE int base64ToDec(unsigned c)
00063 {
00064     int mask = (c <= 0x7f);
00065     return base64ToDecLookups[c & 0x7f] * mask - 1 + mask;
00066 }
00067 
/** Arithmetic variant of the hex digit decoder.

    Derives the digit value from the bit layout of the ASCII code and then
    uses isxdigit to decide whether that value is meaningful.  Returns the
    digit value (0..15) for a hex digit and -1 otherwise.
*/
inline int hexToDec3(int c)
{
    // Bits 5-6 select the 32-code group: 1 holds '0'-'9', 2 holds 'A'-'F',
    // 3 holds 'a'-'f'.
    const int group = (c & 0x60) >> 5;

    // The low five bits are 0x10..0x19 for digits and 1..6 for letters.
    int digit = c & 0x1f;
    if (group == 1)
        digit -= 16;
    if (group >= 2)
        digit += 9;

    if (__builtin_isxdigit(c))
        return digit;
    return -1;
}
00077 
/** Straightforward, branching variant of the hex digit decoder.

    Returns 0..15 when c is one of '0'-'9', 'a'-'f' or 'A'-'F', and -1 for
    any other value.
*/
inline int hexToDec2(int c)
{
    if ('0' <= c && c <= '9')
        return c - '0';

    if ('a' <= c && c <= 'f')
        return c - 'a' + 10;

    if ('A' <= c && c <= 'F')
        return c - 'A' + 10;

    return -1;
}
00093 
00094 void
00095 Id::
00096 parse(const std::string & value, Type type)
00097 {
00098     Id r;
00099 
00100     auto finish = [&] ()
00101         {
00102             //if (r.toString() != value)
00103             //    throw ML::Exception("Id::parse() modified an Id: input " + value
00104             //                        + " output " + r.toString());
00105             if (r.type != type && type != UNKNOWN)
00106                 throw ML::Exception("Id::parse() changed type from %d to %d parsing %s",
00107                                     r.type, type, value.c_str());
00108 
00109             *this = r;
00110         };
00111     
00112     if ((type == UNKNOWN || type == NONE) && value.empty()) {
00113         r.type = NONE;
00114         r.val1 = r.val2 = 0;
00115         finish();
00116         return;
00117     }
00118 
00119     if ((type == UNKNOWN || type == NULLID) && value.length() == 4 && value == "null") {
00120         //throw ML::Exception("null id");
00121         r.type = NULLID;
00122         r.val1 = r.val2 = 0;
00123         finish();
00124         return;
00125     }
00126 
00127     if ((type == UNKNOWN || type == BIGDEC) && (value.size() == 1 && value[0] == '0')) {
00128         r.type = BIGDEC;
00129         r.val = 0;
00130         finish();
00131         return;
00132     }
00133 
00134     while ((type == UNKNOWN ||type == UUID) && value.length() == 36) {
00135         // not really a while...
00136         // Try a uuid
00137         // AGID: --> 0828398c-5965-11e0-84c8-0026b937c8e1
00138 
00139         if (value[8] != '-') break;
00140         if (value[13] != '-') break;
00141         if (value[18] != '-') break;
00142         if (value[23] != '-') break;
00143 
00144         unsigned f1;
00145         short f2, f3, f4;
00146         unsigned long long f5;
00147 
00148         const char * p = value.c_str();
00149         bool failed = false;
00150 
00151         auto scanRange = [&] (int start, int len) -> unsigned long long
00152             {
00153                 unsigned long long val = 0;
00154                 for (unsigned i = start;  i != start + len;  ++i) {
00155                     int c = p[i];
00156                     int v = hexToDec(c);
00157                     if (v == -1) {
00158                         failed = true;
00159                         return val;
00160                     }
00161                     val = (val << 4) + v;
00162                 }
00163                 return val;
00164             };
00165 
00166         f1 = scanRange(0, 8);
00167         f2 = scanRange(9, 4);
00168         f3 = scanRange(14, 4);
00169         if (failed) break;
00170         f4 = scanRange(19, 4);
00171         f5 = scanRange(24, 12);
00172         if (failed) break;
00173 
00174         r.type = UUID;
00175         r.f1 = f1;  r.f2 = f2;  r.f3 = f3;  r.f4 = f4;  r.f5 = f5;
00176         //r.val1 = ((uint64_t)f1 << 32) | ((uint64_t)f2 << 16) | f3;
00177         //r.val2 = ((uint64_t)f4 << 48) | f5;
00178         finish();
00179         return;
00180     }
00181 
00182     if ((type == UNKNOWN || type == GOOG128)
00183         && value.length() == 26 && value[0] == 'C' && value[1] == 'A'
00184         && value[2] == 'E' && value[3] == 'S' && value[4] == 'E') {
00185 
00186         // Google ID: --> CAESEAYra3NIxLT9C8twKrzqaA
00187 
00188         __uint128_t res = 0;
00189 
00190         auto b64Decode = [] (int c) -> int
00191             {
00192                 if (c >= '0' && c <= '9')
00193                     return c - '0';
00194                 else if (c >= 'A' && c <= 'Z')
00195                     return 10 + c - 'A';
00196                 else if (c >= 'a' && c <= 'z')
00197                     return 36 + c - 'a';
00198                 else if (c == '-')
00199                     return 62;
00200                 else if (c == '_')
00201                     return 63;
00202                 else return -1;
00203             };
00204 
00205         bool error = false;
00206         for (unsigned i = 5;  i < 26 && !error;  ++i) {
00207             int v = b64Decode(value[i]);
00208             if (v == -1) error = true;
00209             res = (res << 6) | v;
00210         }
00211 
00212         if (!error) {
00213             r.type = GOOG128;
00214             r.val = res;
00215             finish();
00216             return;
00217         }
00218     }
00219 
00220     if ((type == UNKNOWN || type == BIGDEC)
00221         && value[0] != '0' && value.length() < 32 /* TODO: better condition */) {
00222         // Try a big integer
00223         //ANID: --> 7394206091425759590
00224         __uint128_t res = 0;
00225         bool error = false;
00226         for (unsigned i = 0;  i < value.size();  ++i) {
00227             if (!isdigit(value[i])) error = true;
00228             res = 10 * res + value[i] - '0';
00229         }
00230 
00231         if (!error) {
00232             r.type = BIGDEC;
00233             r.val = res;
00234             finish();
00235             return;
00236         }
00237     }
00238 
00239     if ((type == UNKNOWN || type == BASE64_96) && value.length() == 16) {
00240         auto scanRange = [&] (const char * p, size_t l) -> int64_t
00241             {
00242                 uint64_t res = 0;
00243                 for (unsigned i = 0;  i < l;  ++i) {
00244                     int c = base64ToDec(p[i]);
00245                     if (c == -1) return -1;
00246                     res = res << 6 | c;
00247                 }
00248                 return res;
00249             };
00250         
00251         int64_t high = scanRange(value.c_str(), 8);
00252         int64_t low  = scanRange(value.c_str() + 8, 8);
00253 
00254         if (low != -1 && high != -1) {
00255             __int128_t val = high;
00256             val <<= 48;
00257             val |= low;
00258             
00259             r.type = BASE64_96;
00260             r.val = val;
00261             finish();
00262             return;
00263         }
00264     }   
00265 
00266     // Fall back to string
00267     r.type = STR;
00268     r.len = value.size();
00269     char * s = new char[r.len];
00270     r.str = s;
00271     r.ownstr = true;
00272     std::copy(value.c_str(), value.c_str() + value.size(), s);
00273     finish();
00274     return;
00275 }
00276 
00277 const Id &
00278 Id::
00279 compoundId1() const
00280 {
00281     ExcAssertEqual(type, COMPOUND2);
00282     return *cmp1;
00283 }
00284 
00285 const Id &
00286 Id::
00287 compoundId2() const
00288 {
00289     ExcAssertEqual(type, COMPOUND2);
00290     return *cmp2;
00291 }
00292     
00293     
00294 std::string
00295 Id::
00296 toString() const
00297 {
00298     switch (type) {
00299     case NONE:
00300         return "";
00301     case NULLID:
00302         return "null";
00303     case UUID:
00304         // AGID: --> 0828398c-5965-11e0-84c8-0026b937c8e1
00305         return ML::format("%08lx-%04x-%04x-%04x-%012llx",
00306                           (unsigned)f1, (unsigned)f2, (unsigned)f3, (unsigned)f4,
00307                           (unsigned long long)f5);
00308     case GOOG128: {
00309         // Google ID: --> CAESEAYra3NIxLT9C8twKrzqaA
00310         string result = "CAESE                     ";
00311 
00312         auto b64Encode = [] (unsigned i) -> int
00313             {
00314                 if (i < 10) return i + '0';
00315                 if (i < 36) return i - 10 + 'A';
00316                 if (i < 62) return i - 36 + 'a';
00317                 if (i == 62) return '-';
00318                 if (i == 63) return '_';
00319                 throw ML::Exception("bad goog base64 char");
00320             };
00321 
00322         __uint128_t v = val;
00323         for (unsigned i = 0;  i < 21;  ++i) {
00324             result[25 - i] = b64Encode(v & 63);  v = v >> 6;
00325         }
00326         return result;
00327     }
00328     case BIGDEC: {
00329         string result;
00330         __uint128_t v = val;
00331         if (v == 0) return "0";
00332         while (v) {
00333             int c = v % 10;
00334             v /= 10;
00335             result += c + '0';
00336         }
00337         std::reverse(result.begin(), result.end());
00338         return result;
00339     }
00340     case BASE64_96: {
00341         string result = "                ";
00342 
00343         auto b64Encode = [] (unsigned i) -> int
00344             {
00345                 if (i == 0) return '+';
00346                 if (i == 1) return '/';
00347                 if (i < 12) return '0' + i - 2;
00348                 if (i < 38) return 'A' + i - 12;
00349                 if (i < 64) return 'a' + i - 38;
00350                 throw ML::Exception("bad base64 char");
00351             };
00352 
00353         __uint128_t v = val;
00354         for (unsigned i = 0;  i < 16;  ++i) {
00355             result[15 - i] = b64Encode(v & 63);  v = v >> 6;
00356         }
00357         return result;
00358     }
00359     case COMPOUND2:
00360         return compoundId1().toString() + ":" + compoundId2().toString();
00361     case STR:
00362         return std::string(str, str + len);
00363     default:
00364         throw ML::Exception("unknown ID type");
00365     }
00366 }
00367 
00368 bool
00369 Id::
00370 complexEqual(const Id & other) const
00371 {
00372     if (type == STR)
00373         return len == other.len && (str == other.str || std::equal(str, str + len, other.str));
00374     else if (type == COMPOUND2) {
00375         return compoundId1() == other.compoundId1()
00376             && compoundId2() == other.compoundId2();
00377     }
00378     else throw ML::Exception("unknown Id type");
00379 }
00380 
00381 bool
00382 Id::
00383 complexLess(const Id & other) const
00384 {
00385     if (type == STR)
00386         return std::lexicographical_compare(str, str + len,
00387                                             other.str, other.str + other.len);
00388     else if (type == COMPOUND2) {
00389         return ML::less_all(compoundId1(), other.compoundId1(),
00390                             compoundId2(), other.compoundId2());
00391     }
00392     else throw ML::Exception("unknown Id type");
00393 }
00394 
00395 uint64_t
00396 Id::
00397 complexHash() const
00398 {
00399     if (type == STR)
00400         return CityHash64(str, len);
00401     else if (type == COMPOUND2) {
00402         return Hash128to64(make_pair(compoundId1().hash(),
00403                                      compoundId2().hash()));
00404     }
00405     //else if (type == CUSTOM)
00406     //    return controlFn(CF_HASH, data);
00407     else throw ML::Exception("unknown Id type");
00408 }
00409 
00410 void
00411 Id::
00412 complexDestroy()
00413 {
00414     if (type < STR) return;
00415     if (type == STR) {
00416         if (ownstr) delete[] str;
00417         str = 0;
00418         ownstr = false;
00419     }
00420     else if (type == COMPOUND2) {
00421         delete cmp1;
00422         delete cmp2;
00423         cmp1 = cmp2 = 0;
00424     }
00425     //else if (type == CUSTOM)
00426     //    controlFn(CF_DESTROY, data);
00427     else throw ML::Exception("unknown Id type");
00428 }
00429 
00430 void
00431 Id::
00432 complexFinishCopy()
00433 {
00434     if (type == STR) {
00435         if (!ownstr) return;
00436         const char * oldStr = str;
00437         char * s = new char[len];
00438         str = s;
00439         std::copy(oldStr, oldStr + len, s);
00440     }
00441     else if (type == COMPOUND2) {
00442         //cerr << "cmp1 = " << cmp1 << " cmp2 = " << cmp2 << " type = "
00443         //     << (int)type << endl;
00444         cmp1 = new Id(compoundId1());
00445         cmp2 = new Id(compoundId2());
00446     }
00447     //else if (type == CUSTOM)
00448     //    data = (void *)controlFn(CF_COPY, data);
00449     else throw ML::Exception("unknown Id type");
00450 }
00451     
00452 void
00453 Id::
00454 serialize(ML::DB::Store_Writer & store) const
00455 {
00456     store << (char)1 << (char)type;
00457     //cerr << "after header: " << store.offset() << endl;
00458     switch (type) {
00459     case NONE: break;
00460     case NULLID: break;
00461     case UUID:
00462     case GOOG128:
00463     case BIGDEC:
00464         store.save_binary(&val1, 8);
00465         store.save_binary(&val2, 8);
00466         break;
00467     case BASE64_96:
00468         store.save_binary(&val1, 8);
00469         store.save_binary(&val2, 4);
00470         break;
00471     case STR:
00472         store << string(str, str + len);
00473         break;
00474     case COMPOUND2:
00475         compoundId1().serialize(store);
00476         compoundId2().serialize(store);
00477         break;
00478     default:
00479         throw ML::Exception("unknown Id type");
00480     }
00481 
00482     //cerr << "after value: " << store.offset() << endl;
00483 }
00484 
00485 void
00486 Id::
00487 reconstitute(ML::DB::Store_Reader & store)
00488 {
00489     Id r;
00490 
00491     char v, tp;
00492     store >> v;
00493     if (v < 0 || v > 1)
00494         throw ML::Exception("unknown Id version reconstituting");
00495     store >> tp;
00496     r.type = tp;
00497 
00498     // Fix up from earlier reconstitution version
00499     if (v == 0 && tp == 5)
00500         r.type = STR;
00501 
00502     if (v == 0) {
00503         // old domain field; no longer used
00504         int d;
00505         store >> d;
00506         //r.domain = d;
00507     }
00508 
00509     //cerr << "reading after header: " << store.offset() << endl;
00510     //cerr << "type = " << (int)r.type << endl;
00511 
00512     switch (r.type) {
00513     case NONE: break;
00514     case NULLID: break;
00515     case UUID:
00516     case GOOG128:
00517     case BIGDEC: {
00518         store.load_binary(&r.val1, 8);
00519         store.load_binary(&r.val2, 8);
00520         break;
00521     }
00522     case BASE64_96: {
00523         store.load_binary(&r.val1, 8);
00524         store.load_binary(&r.val2, 4);
00525         break;
00526     }
00527     case STR: {
00528         std::string s;
00529         store >> s;
00530         r.len = s.size();
00531         r.ownstr = true;
00532         char * s2 = new char[s.size()];
00533         r.str = s2;
00534         std::copy(s.begin(), s.end(), s2);
00535         break;
00536     }
00537     case COMPOUND2: {
00538         unique_ptr<Id> id1(new Id()), id2(new Id());
00539         store >> *id1 >> *id2;
00540         r.cmp1 = id1.release();
00541         r.cmp2 = id2.release();
00542         break;
00543     }
00544     default:
00545         throw ML::Exception("unknown Id type %d reconstituting",
00546                             tp);
00547     }
00548 
00549     //cerr << "reading after value: " << store.offset() << endl;
00550     //cerr << "reconstituted " << r << endl;
00551 
00552     *this = std::move(r);
00553 }
00554 
00555 Json::Value
00556 Id::
00557 toJson() const
00558 {
00559     if (notNull())
00560         return toString();
00561     else return Json::Value();
00562 }
00563 
00564 Id
00565 Id::
00566 fromJson(const Json::Value & val)
00567 {
00568     if (val.isInt())
00569         return Id(val.asInt());
00570 
00571     else if (val.isUInt())
00572         return Id(val.asUInt());
00573 
00574     else if (val.isNull())
00575         return Id();
00576 
00577     else return Id(val.asString());
00578 }
00579 
00580 } // namespace Datacratic
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator