![]() |
RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
00001 /* segments.cc 00002 Jeremy Barnes, 12 March 2012 00003 Copyright (c) 2012 Datacratic. All rights reserved. 00004 00005 Implementation of segments. 00006 */ 00007 00008 #include "rtbkit/common/segments.h" 00009 #include <boost/function_output_iterator.hpp> 00010 #include "jml/arch/format.h" 00011 #include "jml/arch/exception.h" 00012 #include "jml/arch/backtrace.h" 00013 #include "jml/utils/exc_assert.h" 00014 #include "soa/types/value_description.h" 00015 #include "jml/db/persistent.h" 00016 #include <boost/make_shared.hpp> 00017 #include <boost/algorithm/string.hpp> 00018 00019 using namespace std; 00020 using namespace ML; 00021 using namespace ML::DB; 00022 using namespace Datacratic; 00023 00024 namespace RTBKIT { 00025 00026 00027 /*****************************************************************************/ 00028 /* SEGMENTS */ 00029 /*****************************************************************************/ 00030 00031 SegmentList:: 00032 SegmentList() 00033 { 00034 } 00035 00036 SegmentList:: 00037 SegmentList(const std::vector<string> & segs) 00038 { 00039 for (unsigned i = 0; i < segs.size(); ++i) 00040 add(segs[i]); 00041 sort(); 00042 } 00043 00044 SegmentList:: 00045 SegmentList(const std::vector<int> & segs) 00046 : ints(segs.begin(), segs.end()) 00047 { 00048 sort(); 00049 } 00050 00051 SegmentList:: 00052 SegmentList(const std::vector<std::pair<int, float> > & segs) 00053 { 00054 for (unsigned i = 0; i < segs.size(); ++i) 00055 add(segs[i].first, segs[i].second); 00056 sort(); 00057 } 00058 00059 bool 00060 SegmentList:: 00061 contains(int i) const 00062 { 00063 return std::binary_search(ints.begin(), ints.end(), i); 00064 } 00065 00066 bool 00067 SegmentList:: 00068 contains(const std::string & str) const 00069 { 00070 int i = parseSegmentNum(str); 00071 if (i == -1) 00072 return std::binary_search(strings.begin(), strings.end(), str); 00073 else return contains(i); 00074 } 00075 00076 #if 0 00077 float 00078 SegmentList:: 00079 weight(int i) const 00080 { 00081 } 00082 00083 float 00084 SegmentList:: 00085 weight(const std::string & str) const 00086 { 00087 } 00088 #endif 00089 00090 template<typename Seq1, typename Seq2> 00091 bool anyMatchesLookup(const Seq1 & seq1, const Seq2 & seq2) 00092 { 00093 auto it2 = seq2.begin(), end2 = seq2.end(); 00094 for (auto it1 = seq1.begin(), end1 = seq1.end(); 00095 it1 != end1; ++it1) 00096 if (std::binary_search(it2, end2, *it1)) return true; 00097 return false; 00098 } 00099 00100 template<typename Seq1, typename Seq2> 00101 bool anyMatches(const Seq1 & seq1, const Seq2 & seq2) 00102 { 00103 if (seq1.empty() || seq2.empty()) 00104 return false; 00105 else if (seq1.size() * 5 < seq2.size()) { 00106 // seq2 is much bigger... look up individually each element 00107 return anyMatchesLookup(seq1, seq2); 00108 } 00109 else if (seq2.size() * 5 < seq1.size()) { 00110 // seq1 is much bigger... look up individually each element 00111 return anyMatchesLookup(seq2, seq1); 00112 } 00113 else { 00114 // roughly equal sizes; jointly iterate 00115 auto it1 = seq1.begin(), end1 = seq1.end(); 00116 auto it2 = seq2.begin(), end2 = seq2.end(); 00117 00118 while (it1 != end1 && it2 != end2) { 00119 if (*it1 == *it2) return true; 00120 else if (*it1 < *it2) ++it1; 00121 else ++it2; 00122 } 00123 00124 return false; 00125 } 00126 } 00127 00128 bool 00129 SegmentList:: 00130 match(const SegmentList & other) const 00131 { 00132 return anyMatches(ints, other.ints) 00133 || anyMatches(strings, other.strings); 00134 } 00135 00136 bool 00137 SegmentList:: 00138 match(const std::vector<int> & other) const 00139 { 00140 return anyMatches(ints, other); 00141 } 00142 00143 bool 00144 SegmentList:: 00145 match(const std::vector<std::string> & other) const 00146 { 00147 return anyMatches(strings, other); 00148 } 00149 00150 size_t 00151 SegmentList:: 00152 size() const 00153 { 00154 return ints.size() + strings.size(); 00155 } 00156 00157 bool 00158 SegmentList:: 00159 empty() const 00160 { 00161 return ints.empty() && strings.empty(); 00162 } 00163 00164 SegmentList 00165 SegmentList:: 00166 createFromJson(const Json::Value & json) 00167 { 00168 SegmentList result; 00169 00170 if (!json.isArray()) 00171 throw Exception("augment must be an array of augmentations"); 00172 00173 for (unsigned i = 0; i < json.size(); ++i) { 00174 const Json::Value & val = json[i]; 00175 if (val.isArray()) { 00176 if (val.size() != 2) 00177 throw ML::Exception("can't create weighted segment from " 00178 + json.toString()); 00179 float weight = val[1].asDouble(); 00180 00181 if (val[0].isInt()) 00182 result.add(val[0].asInt(), weight); 00183 else if (val[0].isNumeric()) 00184 result.add(val[0].asDouble(), weight); 00185 else result.add(val[0].asString(), weight); 00186 } 00187 else if (val.isInt()) 00188 result.add(val.asInt()); 00189 else if (val.isNumeric()) 00190 result.add(val.asDouble()); 00191 else result.add(val.asString()); 00192 } 00193 00194 return result; 00195 } 00196 00197 Json::Value 00198 SegmentList:: 00199 toJson() const 00200 { 00201 Json::Value result; 00202 00203 if (weights.empty()) { 00204 if (strings.empty()) { 00205 for (unsigned i = 0; i < ints.size(); ++i) 00206 result[i] = ints[i]; 00207 } 00208 else { 00209 for (unsigned i = 0; i < ints.size(); ++i) 00210 result[i] = ML::format("%d", ints[i]); 00211 for (unsigned i = 0; i < strings.size(); ++i) 00212 result[i + ints.size()] = strings[i]; 00213 } 00214 } 00215 else { 00216 if (strings.empty()) { 00217 for (unsigned i = 0; i < ints.size(); ++i) { 00218 result[i][0] = ints[i]; 00219 result[i][1] = weights[i]; 00220 } 00221 } 00222 else { 00223 for (unsigned i = 0; i < ints.size(); ++i) { 00224 result[i][0] = ML::format("%d", ints[i]); 00225 result[i][1] = weights[i]; 00226 } 00227 for (unsigned i = 0; i < strings.size(); ++i) { 00228 result[i + ints.size()][0] = strings[i]; 00229 result[i + ints.size()][1] = weights[i + ints.size()]; 00230 } 00231 } 00232 } 00233 return result; 00234 } 00235 00236 std::string 00237 SegmentList:: 00238 toJsonStr() const 00239 { 00240 return boost::trim_copy(toJson().toString()); 00241 } 00242 00243 std::string 00244 SegmentList:: 00245 toString() const 00246 { 00247 return toJsonStr(); 00248 } 00249 00250 void 00251 SegmentList:: 00252 add(int i, float weight) 00253 { 00254 ints.push_back(i); 00255 if (weight != 1.0 || !weights.empty()) { 00256 if (weights.empty()) 00257 weights.resize(size() - 1, 1.0); 00258 weights.insert(weights.begin() + ints.size() - 1, weight); 00259 ExcAssertEqual(weights.size(), size()); 00260 } 00261 } 00262 00263 void 00264 SegmentList:: 00265 add(const std::string & str, float weight) 00266 { 00267 int i = parseSegmentNum(str); 00268 if (i == -1) { 00269 strings.push_back(str); 00270 if (weight != 1.0 || !weights.empty()) { 00271 if (weights.empty()) 00272 weights.resize(size() - 1, 1.0); 00273 weights.push_back(weight); 00274 ExcAssertEqual(weights.size(), size()); 00275 } 00276 } 00277 else add(i, weight); 00278 } 00279 00280 int 00281 SegmentList:: 00282 parseSegmentNum(const std::string & str) 00283 { 00284 if (str.empty()) return -1; 00285 00286 else if (str.length() == 1 && str[0] == '0') { 00287 return 0; 00288 } 00289 else if (str[0] != '0' && isdigit(str[0])) { 00290 char * endptr = const_cast<char *>(str.c_str() + str.length()); 00291 long i = strtol(str.c_str(), &endptr, 10); 00292 if (endptr == str.c_str() + str.length()) { 00293 if (i < 0) return -1; 00294 return i; 00295 } 00296 } 00297 00298 return -1; 00299 } 00300 00301 void 00302 SegmentList:: 00303 sort() 00304 { 00305 if (weights.empty()) { 00306 std::sort(ints.begin(), ints.end()); 00307 std::sort(strings.begin(), strings.end()); 00308 } 00309 else { 00310 ExcAssertEqual(weights.size(), size()); 00311 vector<pair<int, float> > isorted(ints.size()); 00312 for (unsigned i = 0; i < ints.size(); ++i) 00313 isorted[i] = make_pair(ints[i], weights[i]); 00314 std::sort(isorted.begin(), isorted.end()); 00315 00316 vector<pair<string, float> > ssorted(strings.size()); 00317 for (unsigned i = 0; i < strings.size(); ++i) 00318 ssorted[i] = make_pair(strings[i], weights[i + ints.size()]); 00319 std::sort(ssorted.begin(), ssorted.end()); 00320 00321 for (unsigned i = 0; i < ints.size(); ++i) { 00322 ints[i] = isorted[i].first; 00323 weights[i] = isorted[i].second; 00324 } 00325 for (unsigned i = 0; i < strings.size(); ++i) { 00326 strings[i] = ssorted[i].first; 00327 weights[i + ints.size()] = ssorted[i].second; 00328 } 00329 } 00330 } 00331 00332 void 00333 SegmentList:: 00334 serialize(ML::DB::Store_Writer & store) const 00335 { 00336 unsigned char version = 0; 00337 store << version << ints << strings << weights; 00338 } 00339 00340 void 00341 SegmentList:: 00342 reconstitute(ML::DB::Store_Reader & store) 00343 { 00344 unsigned char version; 00345 store >> version; 00346 if (version > 0) 00347 throw ML::Exception("unknown SegmentList version"); 00348 store >> ints >> strings >> weights; 00349 } 00350 00351 std::string 00352 SegmentList:: 00353 serializeToString() const 00354 { 00355 return ML::DB::serializeToString(*this); 00356 } 00357 00358 SegmentList 00359 SegmentList:: 00360 reconstituteFromString(const std::string & str) 00361 { 00362 return ML::DB::reconstituteFromString<SegmentList>(str); 00363 } 00364 00365 void 00366 SegmentList:: 00367 forEach(const std::function<void (int, string, float)> & onSegment) const 00368 { 00369 for (unsigned i = 0; i < ints.size(); ++i) 00370 onSegment(ints[i], ML::format("%d", ints[i]), 00371 weights.empty() ? 1.0 : weights[i]); 00372 for (unsigned i = 0; i < strings.size(); ++i) 00373 onSegment(-1, strings[i], 00374 weights.empty() ? 1.0 : weights[i + ints.size()]); 00375 } 00376 00377 00378 /*****************************************************************************/ 00379 /* SEGMENTS BY SOURCE */ 00380 /*****************************************************************************/ 00381 00382 SegmentsBySource:: 00383 SegmentsBySource() 00384 { 00385 } 00386 00387 SegmentsBySource:: 00388 SegmentsBySource(SegmentsBySourceBase && other) 00389 : SegmentsBySourceBase(other) 00390 { 00391 } 00392 00393 SegmentsBySource:: 00394 SegmentsBySource(const SegmentsBySourceBase & other) 00395 : SegmentsBySourceBase(other) 00396 { 00397 } 00398 00399 void 00400 SegmentsBySource:: 00401 sortAll() 00402 { 00403 for (auto it = begin(), end = this->end(); 00404 it != end; ++it) 00405 it->second->sort(); 00406 } 00407 00408 const SegmentList & 00409 SegmentsBySource:: 00410 get(const std::string & str) const 00411 { 00412 static const SegmentList NONE; 00413 00414 auto it = find(str); 00415 if (it == end()) return NONE; 00416 if (!it->second) 00417 throw ML::Exception("invalid segment list in segments"); 00418 return *it->second; 00419 } 00420 00421 void 00422 SegmentsBySource:: 00423 addSegment(const std::string & source, 00424 const std::shared_ptr<SegmentList> & segs) 00425 { 00426 if (!insert(make_pair(source, segs)).second) 00427 throw ML::Exception("attempt to add same segments twice"); 00428 } 00429 00430 void 00431 SegmentsBySource:: 00432 addInts(const std::string & source, 00433 const std::vector<int> & segs) 00434 { 00435 if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second) 00436 throw ML::Exception("attempt to add same segments twice"); 00437 } 00438 00439 void 00440 SegmentsBySource:: 00441 addStrings(const std::string & source, 00442 const std::vector<string> & segs) 00443 { 00444 if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second) 00445 throw ML::Exception("attempt to add same segments twice"); 00446 } 00447 00448 void 00449 SegmentsBySource:: 00450 addWeightedInts(const std::string & source, 00451 const std::vector<pair<int, float> > & segs) 00452 { 00453 if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second) 00454 throw ML::Exception("attempt to add same segments twice"); 00455 } 00456 00457 void 00458 SegmentsBySource:: 00459 add(const std::string & source, const std::string & segment, float weight) 00460 { 00461 auto & entry = (*this)[source]; 00462 if (!entry) entry.reset(new SegmentList()); 00463 entry->add(segment, weight); 00464 } 00465 00466 void 00467 SegmentsBySource:: 00468 add(const std::string & source, int segment, float weight) 00469 { 00470 auto & entry = (*this)[source]; 00471 if (!entry) entry.reset(new SegmentList()); 00472 entry->add(segment, weight); 00473 } 00474 00475 Json::Value 00476 SegmentsBySource:: 00477 toJson() const 00478 { 00479 Json::Value result; 00480 for (auto it = begin(), end = this->end(); it != end; ++it) 00481 result[it->first] = it->second->toJson(); 00482 return result; 00483 } 00484 00485 SegmentsBySource 00486 SegmentsBySource:: 00487 createFromJson(const Json::Value & json) 00488 { 00489 SegmentsBySource result; 00490 00491 for (auto it = json.begin(), end = json.end(); it != end; ++it) { 00492 auto segs = std::make_shared<SegmentList>(); 00493 *segs = SegmentList::createFromJson(*it); 00494 result.addSegment(it.memberName(), segs); 00495 } 00496 00497 return result; 00498 } 00499 00500 void 00501 SegmentsBySource:: 00502 serialize(ML::DB::Store_Writer & store) const 00503 { 00504 unsigned char version = 0; 00505 store << version; 00506 store << compact_size_t(size()); 00507 for (auto it = begin(), end = this->end(); it != end; ++it) { 00508 store << it->first; 00509 it->second->serialize(store); 00510 } 00511 } 00512 00513 void 00514 SegmentsBySource:: 00515 reconstitute(ML::DB::Store_Reader & store) 00516 { 00517 unsigned char version; 00518 store >> version; 00519 if (version != 0) 00520 throw ML::Exception("invalid version"); 00521 compact_size_t sz(store); 00522 00523 SegmentsBySourceBase newMe; 00524 00525 for (unsigned i = 0; i < sz; ++i) { 00526 string k; 00527 store >> k; 00528 auto l = std::make_shared<SegmentList>(); 00529 store >> *l; 00530 newMe[k] = l; 00531 } 00532 00533 swap(newMe); 00534 } 00535 00536 struct SegmentsBySourceValueDescription 00537 : public ValueDescriptionT<SegmentsBySource> { 00538 00539 virtual void parseJsonTyped(SegmentsBySource * val, 00540 JsonParsingContext & context) const 00541 { 00542 Json::Value v = context.expectJson(); 00543 //cerr << "got segments " << v << endl; 00544 *val = std::move(RTBKIT::SegmentsBySource::createFromJson(v)); 00545 } 00546 00547 virtual void printJsonTyped(const SegmentsBySource * val, 00548 JsonPrintingContext & context) const 00549 { 00550 context.writeJson(val->toJson()); 00551 } 00552 00553 virtual bool isDefaultTyped(const SegmentsBySource * val) const 00554 { 00555 return val->empty(); 00556 } 00557 }; 00558 00559 ValueDescriptionT<RTBKIT::SegmentsBySource> * 00560 getDefaultDescription(RTBKIT::SegmentsBySource *) 00561 { 00562 return new SegmentsBySourceValueDescription(); 00563 } 00564 00565 } // namespace RTBKIT 00566
1.7.6.1