RTBKit  0.9
Open-source framework to create real-time ad bidding systems.
common/segments.cc
00001 /* segments.cc
00002    Jeremy Barnes, 12 March 2012
00003    Copyright (c) 2012 Datacratic.  All rights reserved.
00004 
00005    Implementation of segments.
00006 */
00007 
#include "rtbkit/common/segments.h"
#include <boost/function_output_iterator.hpp>
#include "jml/arch/format.h"
#include "jml/arch/exception.h"
#include "jml/arch/backtrace.h"
#include "jml/utils/exc_assert.h"
#include "soa/types/value_description.h"
#include "jml/db/persistent.h"
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
#include <cerrno>
#include <climits>
00018 
00019 using namespace std;
00020 using namespace ML;
00021 using namespace ML::DB;
00022 using namespace Datacratic;
00023 
00024 namespace RTBKIT {
00025 
00026 
00027 /*****************************************************************************/
00028 /* SEGMENTS                                                                  */
00029 /*****************************************************************************/
00030 
00031 SegmentList::
00032 SegmentList()
00033 {
00034 }
00035 
00036 SegmentList::
00037 SegmentList(const std::vector<string> & segs)
00038 {
00039     for (unsigned i = 0;  i < segs.size();  ++i)
00040         add(segs[i]);
00041     sort();
00042 }
00043 
00044 SegmentList::
00045 SegmentList(const std::vector<int> & segs)
00046     : ints(segs.begin(), segs.end())
00047 {
00048     sort();
00049 }
00050 
00051 SegmentList::
00052 SegmentList(const std::vector<std::pair<int, float> > & segs)
00053 {
00054     for (unsigned i = 0;  i < segs.size();  ++i)
00055         add(segs[i].first, segs[i].second);
00056     sort();
00057 }
00058 
00059 bool
00060 SegmentList::
00061 contains(int i) const
00062 {
00063     return std::binary_search(ints.begin(), ints.end(), i);
00064 }
00065 
00066 bool
00067 SegmentList::
00068 contains(const std::string & str) const
00069 {
00070     int i = parseSegmentNum(str);
00071     if (i == -1)
00072         return std::binary_search(strings.begin(), strings.end(), str);
00073     else return contains(i);
00074 }
00075 
// Compiled out: the two weight() overloads below sit inside "#if 0" and have
// empty bodies, so they never reach the compiler.
00076 #if 0
00077 float
00078 SegmentList::
00079 weight(int i) const
00080 {
00081 }
00082 
00083 float
00084 SegmentList::
00085 weight(const std::string & str) const
00086 {
00087 }
00088 #endif
00089 
/** Return true if any element of seq1 occurs in seq2.  Each element of
    seq1 is binary-searched in seq2, so seq2 must be sorted; seq1 may be in
    any order.
*/
template<typename Seq1, typename Seq2>
bool anyMatchesLookup(const Seq1 & seq1, const Seq2 & seq2)
{
    const auto haystackBegin = seq2.begin();
    const auto haystackEnd = seq2.end();

    for (const auto & needle : seq1) {
        if (std::binary_search(haystackBegin, haystackEnd, needle))
            return true;
    }

    return false;
}
00099 
/** Return true if the two sorted sequences share at least one element.

    When one sequence is more than five times longer than the other, each
    element of the short one is binary-searched in the long one; otherwise
    the two are walked in step like a merge.
*/
template<typename Seq1, typename Seq2>
bool anyMatches(const Seq1 & seq1, const Seq2 & seq2)
{
    // Nothing can match against an empty sequence
    if (seq1.empty() || seq2.empty())
        return false;

    // seq2 is much bigger: look up each element of seq1 in it
    if (seq1.size() * 5 < seq2.size())
        return anyMatchesLookup(seq1, seq2);

    // seq1 is much bigger: look up each element of seq2 in it
    if (seq2.size() * 5 < seq1.size())
        return anyMatchesLookup(seq2, seq1);

    // Comparable sizes: advance whichever side holds the smaller element
    auto left = seq1.begin();
    const auto leftEnd = seq1.end();
    auto right = seq2.begin();
    const auto rightEnd = seq2.end();

    while (left != leftEnd && right != rightEnd) {
        if (*left == *right)
            return true;

        if (*left < *right)
            ++left;
        else
            ++right;
    }

    return false;
}
00127 
00128 bool
00129 SegmentList::
00130 match(const SegmentList & other) const
00131 {
00132     return anyMatches(ints, other.ints)
00133         || anyMatches(strings, other.strings);
00134 }
00135 
00136 bool
00137 SegmentList::
00138 match(const std::vector<int> & other) const
00139 {
00140     return anyMatches(ints, other);
00141 }
00142 
00143 bool
00144 SegmentList::
00145 match(const std::vector<std::string> & other) const
00146 {
00147     return anyMatches(strings, other);
00148 }
00149 
00150 size_t
00151 SegmentList::
00152 size() const
00153 {
00154     return ints.size() + strings.size();
00155 }
00156 
00157 bool
00158 SegmentList::
00159 empty() const
00160 {
00161     return ints.empty() && strings.empty();
00162 }
00163 
00164 SegmentList
00165 SegmentList::
00166 createFromJson(const Json::Value & json)
00167 {
00168     SegmentList result;
00169 
00170     if (!json.isArray())
00171         throw Exception("augment must be an array of augmentations");
00172 
00173     for (unsigned i = 0;  i < json.size();  ++i) {
00174         const Json::Value & val = json[i];
00175         if (val.isArray()) {
00176             if (val.size() != 2)
00177                 throw ML::Exception("can't create weighted segment from "
00178                                     + json.toString());
00179             float weight = val[1].asDouble();
00180             
00181             if (val[0].isInt())
00182                 result.add(val[0].asInt(), weight);
00183             else if (val[0].isNumeric())
00184                 result.add(val[0].asDouble(), weight);
00185             else result.add(val[0].asString(), weight);
00186         }
00187         else if (val.isInt())
00188             result.add(val.asInt());
00189         else if (val.isNumeric())
00190             result.add(val.asDouble());
00191         else result.add(val.asString());
00192     }
00193     
00194     return result;
00195 }
00196 
00197 Json::Value
00198 SegmentList::
00199 toJson() const
00200 {
00201     Json::Value result;
00202 
00203     if (weights.empty()) {
00204         if (strings.empty()) {
00205             for (unsigned i = 0;  i < ints.size();  ++i)
00206                 result[i] = ints[i];
00207         }
00208         else {
00209             for (unsigned i = 0;  i < ints.size();  ++i)
00210                 result[i] = ML::format("%d", ints[i]);
00211             for (unsigned i = 0;  i < strings.size();  ++i)
00212                 result[i + ints.size()] = strings[i];
00213         }
00214     }
00215     else {
00216         if (strings.empty()) {
00217             for (unsigned i = 0;  i < ints.size();  ++i) {
00218                 result[i][0] = ints[i];
00219                 result[i][1] = weights[i];
00220             }
00221         }
00222         else {
00223             for (unsigned i = 0;  i < ints.size();  ++i) {
00224                 result[i][0] = ML::format("%d", ints[i]);
00225                 result[i][1] = weights[i];
00226             }
00227             for (unsigned i = 0;  i < strings.size();  ++i) {
00228                 result[i + ints.size()][0] = strings[i];
00229                 result[i + ints.size()][1] = weights[i + ints.size()];
00230             }
00231         }
00232     }
00233     return result;
00234 }
00235 
00236 std::string
00237 SegmentList::
00238 toJsonStr() const
00239 {
00240     return boost::trim_copy(toJson().toString());
00241 }
00242 
00243 std::string
00244 SegmentList::
00245 toString() const
00246 {
00247     return toJsonStr();
00248 }
00249 
00250 void
00251 SegmentList::
00252 add(int i, float weight)
00253 {
00254     ints.push_back(i);
00255     if (weight != 1.0 || !weights.empty()) {
00256         if (weights.empty())
00257             weights.resize(size() - 1, 1.0);
00258         weights.insert(weights.begin() + ints.size() - 1, weight);
00259         ExcAssertEqual(weights.size(), size());
00260     }
00261 }
00262 
00263 void
00264 SegmentList::
00265 add(const std::string & str, float weight)
00266 {
00267     int i = parseSegmentNum(str);
00268     if (i == -1) {
00269         strings.push_back(str);
00270         if (weight != 1.0 || !weights.empty()) {
00271             if (weights.empty())
00272                 weights.resize(size() - 1, 1.0);
00273             weights.push_back(weight);
00274             ExcAssertEqual(weights.size(), size());
00275         }
00276     }
00277     else add(i, weight);
00278 }
00279 
00280 int
00281 SegmentList::
00282 parseSegmentNum(const std::string & str)
00283 {
00284     if (str.empty()) return -1;
00285 
00286     else if (str.length() == 1 && str[0] == '0') {
00287         return 0;
00288     }
00289     else if (str[0] != '0' && isdigit(str[0])) {
00290         char * endptr = const_cast<char *>(str.c_str() + str.length());
00291         long i = strtol(str.c_str(), &endptr, 10);
00292         if (endptr == str.c_str() + str.length()) {
00293             if (i < 0) return -1;
00294             return i;
00295         }
00296     }
00297 
00298     return -1;
00299 }
00300 
00301 void
00302 SegmentList::
00303 sort()
00304 {
00305     if (weights.empty()) {
00306         std::sort(ints.begin(), ints.end());
00307         std::sort(strings.begin(), strings.end());
00308     }
00309     else {
00310         ExcAssertEqual(weights.size(), size());
00311         vector<pair<int, float> > isorted(ints.size());
00312         for (unsigned i = 0;  i < ints.size();  ++i)
00313             isorted[i] = make_pair(ints[i], weights[i]);
00314         std::sort(isorted.begin(), isorted.end());
00315 
00316         vector<pair<string, float> > ssorted(strings.size());
00317         for (unsigned i = 0;  i < strings.size();  ++i)
00318             ssorted[i] = make_pair(strings[i], weights[i + ints.size()]);
00319         std::sort(ssorted.begin(), ssorted.end());
00320 
00321         for (unsigned i = 0;  i < ints.size(); ++i) {
00322             ints[i] = isorted[i].first;
00323             weights[i] = isorted[i].second;
00324         }
00325         for (unsigned i = 0;  i < strings.size(); ++i) {
00326             strings[i] = ssorted[i].first;
00327             weights[i + ints.size()] = ssorted[i].second;
00328         }
00329     }
00330 }
00331 
00332 void
00333 SegmentList::
00334 serialize(ML::DB::Store_Writer & store) const
00335 {
00336     unsigned char version = 0;
00337     store << version << ints << strings << weights;
00338 }
00339 
00340 void
00341 SegmentList::
00342 reconstitute(ML::DB::Store_Reader & store)
00343 {
00344     unsigned char version;
00345     store >> version;
00346     if (version > 0)
00347         throw ML::Exception("unknown SegmentList version");
00348     store >> ints >> strings >> weights;
00349 }
00350 
00351 std::string
00352 SegmentList::
00353 serializeToString() const
00354 {
00355     return ML::DB::serializeToString(*this);
00356 }
00357 
00358 SegmentList
00359 SegmentList::
00360 reconstituteFromString(const std::string & str)
00361 {
00362     return ML::DB::reconstituteFromString<SegmentList>(str);
00363 }
00364 
00365 void
00366 SegmentList::
00367 forEach(const std::function<void (int, string, float)> & onSegment) const
00368 {
00369     for (unsigned i = 0;  i < ints.size();  ++i)
00370         onSegment(ints[i], ML::format("%d", ints[i]),
00371                   weights.empty() ? 1.0 : weights[i]);
00372     for (unsigned i = 0;  i < strings.size();  ++i)
00373         onSegment(-1, strings[i],
00374                   weights.empty() ? 1.0 : weights[i + ints.size()]);
00375 }
00376 
00377 
00378 /*****************************************************************************/
00379 /* SEGMENTS BY SOURCE                                                        */
00380 /*****************************************************************************/
00381 
00382 SegmentsBySource::
00383 SegmentsBySource()
00384 {
00385 }
00386 
00387 SegmentsBySource::
00388 SegmentsBySource(SegmentsBySourceBase && other)
00389     : SegmentsBySourceBase(other)
00390 {
00391 }
00392 
00393 SegmentsBySource::
00394 SegmentsBySource(const SegmentsBySourceBase & other)
00395     : SegmentsBySourceBase(other)
00396 {
00397 }
00398 
00399 void
00400 SegmentsBySource::
00401 sortAll()
00402 {
00403     for (auto it = begin(), end = this->end();
00404          it != end;  ++it)
00405         it->second->sort();
00406 }
00407 
00408 const SegmentList &
00409 SegmentsBySource::
00410 get(const std::string & str) const
00411 {
00412     static const SegmentList NONE;
00413     
00414     auto it = find(str);
00415     if (it == end()) return NONE;
00416     if (!it->second)
00417         throw ML::Exception("invalid segment list in segments");
00418     return *it->second;
00419 }
00420 
00421 void
00422 SegmentsBySource::
00423 addSegment(const std::string & source,
00424            const std::shared_ptr<SegmentList> & segs)
00425 {
00426     if (!insert(make_pair(source, segs)).second)
00427         throw ML::Exception("attempt to add same segments twice");
00428 }
00429 
00430 void
00431 SegmentsBySource::
00432 addInts(const std::string & source,
00433         const std::vector<int> & segs)
00434 {
00435     if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second)
00436         throw ML::Exception("attempt to add same segments twice");
00437 }
00438 
00439 void
00440 SegmentsBySource::
00441 addStrings(const std::string & source,
00442            const std::vector<string> & segs)
00443 {
00444     if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second)
00445         throw ML::Exception("attempt to add same segments twice");
00446 }
00447 
00448 void
00449 SegmentsBySource::
00450 addWeightedInts(const std::string & source,
00451                 const std::vector<pair<int, float> > & segs)
00452 {
00453     if (!insert(make_pair(source, std::make_shared<SegmentList>(segs))).second)
00454         throw ML::Exception("attempt to add same segments twice");
00455 }
00456 
00457 void
00458 SegmentsBySource::
00459 add(const std::string & source, const std::string & segment, float weight)
00460 {
00461     auto & entry = (*this)[source];
00462     if (!entry) entry.reset(new SegmentList());
00463     entry->add(segment, weight);
00464 }
00465 
00466 void
00467 SegmentsBySource::
00468 add(const std::string & source, int segment, float weight)
00469 {
00470     auto & entry = (*this)[source];
00471     if (!entry) entry.reset(new SegmentList());
00472     entry->add(segment, weight);
00473 }
00474 
00475 Json::Value
00476 SegmentsBySource::
00477 toJson() const
00478 {
00479     Json::Value result;
00480     for (auto it = begin(), end = this->end();  it != end;  ++it)
00481         result[it->first] = it->second->toJson();
00482     return result;
00483 }
00484 
00485 SegmentsBySource
00486 SegmentsBySource::
00487 createFromJson(const Json::Value & json)
00488 {
00489     SegmentsBySource result;
00490 
00491     for (auto it = json.begin(), end = json.end(); it != end;  ++it) {
00492         auto segs = std::make_shared<SegmentList>();
00493         *segs = SegmentList::createFromJson(*it);
00494         result.addSegment(it.memberName(), segs);
00495     }
00496     
00497     return result;
00498 }
00499 
00500 void
00501 SegmentsBySource::
00502 serialize(ML::DB::Store_Writer & store) const
00503 {
00504     unsigned char version = 0;
00505     store << version;
00506     store << compact_size_t(size());
00507     for (auto it = begin(), end = this->end();  it != end;  ++it) {
00508         store << it->first;
00509         it->second->serialize(store);
00510     }
00511 }
00512 
00513 void
00514 SegmentsBySource::
00515 reconstitute(ML::DB::Store_Reader & store)
00516 {
00517     unsigned char version;
00518     store >> version;
00519     if (version != 0)
00520         throw ML::Exception("invalid version");
00521     compact_size_t sz(store);
00522     
00523     SegmentsBySourceBase newMe;
00524     
00525     for (unsigned i = 0;  i < sz;  ++i) {
00526         string k;
00527         store >> k;
00528         auto l = std::make_shared<SegmentList>();
00529         store >> *l;
00530         newMe[k] = l;
00531     }
00532     
00533     swap(newMe);
00534 }
00535 
00536 struct SegmentsBySourceValueDescription
00537     : public ValueDescriptionT<SegmentsBySource> {
00538 
00539     virtual void parseJsonTyped(SegmentsBySource * val,
00540                                 JsonParsingContext & context) const
00541     {
00542         Json::Value v = context.expectJson();
00543         //cerr << "got segments " << v << endl;
00544         *val = std::move(RTBKIT::SegmentsBySource::createFromJson(v));
00545     }
00546 
00547     virtual void printJsonTyped(const SegmentsBySource * val,
00548                                 JsonPrintingContext & context) const
00549     {
00550         context.writeJson(val->toJson());
00551     }
00552 
00553     virtual bool isDefaultTyped(const SegmentsBySource * val) const
00554     {
00555         return val->empty();
00556     }
00557 };
00558 
00559 ValueDescriptionT<RTBKIT::SegmentsBySource> *
00560 getDefaultDescription(RTBKIT::SegmentsBySource *)
00561 {
00562     return new SegmentsBySourceValueDescription();
00563 }
00564 
00565 } // namespace RTBKIT
00566 
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator