RTBKit  0.9
Open-source framework to create real-time ad bidding systems.
core/agent_configuration/include_exclude.h
00001 /* include_exclude.h                                               -*- C++ -*-
00002    Jeremy Barnes, 8 March 2012
00003    Copyright (c) 2012 Datacratic.  All rights reserved.
00004 
00005    Deals with lists of include/exclude items.
00006 */
00007 
00008 #ifndef __rtb_router__include_exclude_h__
00009 #define __rtb_router__include_exclude_h__
00010 
00011 #include "jml/arch/exception.h"
00012 #include "jml/utils/lightweight_hash.h"
00013 #include "soa/types/url.h"
00014 #include "soa/jsoncpp/value.h"
00015 #include <boost/regex.hpp>
00016 #include <boost/regex/icu.hpp>
00017 #include "soa/types/string.h"
00018 #include <vector>
00019 #include <set>
00020 #include <iostream>
00021 
00022 
00023 namespace RTBKIT {
00024 
00025 using namespace Datacratic;
00026 
00027 struct SegmentList;
00028 
/** Generic fallback matcher: two values of the same type match exactly when
    operator == says they are equal. */
template<typename T>
inline bool matches(const T & t1, const T & t2)
{
    const bool equal = (t1 == t2);
    return equal;
}
00034 
00035 inline bool matches(const boost::u32regex & rex, const Utf8String & val)
00036 {
00037     std::string raw(val.rawData(), val.rawLength());
00038     boost::match_results<std::string::const_iterator> matches;
00039     bool result = boost::u32regex_search(raw, matches, rex) ;
00040     return result;
00041 }
00042 
00043 inline bool matches(const boost::regex & rex, const std::string & val)
00044 {
00045     //using namespace std;
00046     //cerr << "matching " << val << " with rex " << rex.str() << endl;
00047     return boost::regex_search(val, rex);
00048 }
00049 #if 0
00050 inline bool matches(const std::string & str, const std::string & val)
00051 {
00052     return str == val;
00053 }
00054 
00055 inline bool matches(int i, int j)
00056 {
00057     return i == j;
00058 }
00059 #endif
00060 
00061 void jsonParse(const Json::Value & value, boost::regex & reg);
00062 void jsonParse(const Json::Value & value, boost::u32regex & reg);
00063 void jsonParse(const Json::Value & value, std::string & str);
00064 void jsonParse(const Json::Value & value, int & i);
00065 
00066 inline Json::Value jsonPrint(const boost::regex & rex)
00067 {
00068     return rex.str();
00069 }
00070 
00071 inline Json::Value jsonPrint(const boost::u32regex & rex)
00072 {
00073     std::vector<unsigned char> utf8result;
00074     std::basic_string<int,std::char_traits<int>, std::allocator<int> >
00075         unicodeStr = rex.str();
00076     utf8::utf32to8(unicodeStr.begin(),
00077                    unicodeStr.begin() + unicodeStr.length(),
00078                    std::back_inserter(utf8result));
00079     Utf8String utf8str(std::string(utf8result.begin(), utf8result.end()));
00080     return utf8str;
00081 }
00082 
00083 
00084 inline Json::Value jsonPrint(const std::string & str)
00085 {
00086     return str;
00087 }
00088 
00089 inline Json::Value jsonPrint(int i)
00090 {
00091     return i;
00092 }
00093 
00094 struct JsonPrint {
00095     template<typename T>
00096     Json::Value operator () (const T & t) const
00097     {
00098         return jsonPrint(t);
00099     }
00100 };
00101 
/** Hash a byte string with std::hash<std::string>, widened to 64 bits. */
inline uint64_t hashString(const std::string & str)
{
    std::hash<std::string> hasher;
    return hasher(str);
}
00108 
00109 inline uint64_t hashString(const Utf8String & str)
00110 {
00111     return std::hash<std::string>()(std::string(str.rawData(), str.rawLength()));
00112 }
00113 
/** Hash a wide string with std::hash<std::wstring>, widened to 64 bits. */
inline uint64_t hashString(const std::wstring & str)
{
    std::hash<std::wstring> hasher;
    return hasher(str);
}
00119 
00120 inline void createRegex(boost::u32regex & regex, const wchar_t * str)
00121 {
00122     regex = boost::make_u32regex(str);
00123 }
00124 
00125 inline void createRegex(boost::regex & regex, const std::string & str)
00126 {
00127     regex = boost::regex(str);
00128 }
00129 
00130 /*****************************************************************************/
00131 /* CACHED REGEX                                                              */
00132 /*****************************************************************************/
00133 
00134 template<typename Base, typename Str>
00135 struct CachedRegex {
00136     Base base;
00137     uint64_t hash;
00138     
00139     CachedRegex()
00140         : hash(0)
00141     {
00142     }
00143 
00144     template<typename InitStr>
00145     CachedRegex(const InitStr & val)
00146         : hash(hashString(val))
00147     {
00148         createRegex(base, val);
00149     }
00150 
00151     void jsonParse(const Json::Value & val)
00152     {
00153         RTBKIT::jsonParse(val, base);
00154         hash = std::hash<std::string>() (val.asString());
00155     }
00156 
00157     bool operator < (const CachedRegex & other) const
00158     {
00159         return base < other.base;
00160     }
00161 
00162     bool matches(const Str & str) const
00163     {
00164         return RTBKIT::matches(base, str);
00165     }
00166 
00167     bool matches(const Str & str, uint64_t strHash,
00168                  ML::Lightweight_Hash<uint64_t, int> & cache) const
00169     {
00170         uint64_t bucket = hash ^ (strHash >> 1);
00171         bucket += (bucket == 0);
00172         int & cached = cache[bucket];
00173         if (cached == 0)
00174             cached = RTBKIT::matches(base, str) + 1;
00175         return cached - 1;
00176     }
00177 };
00178 
00179 template<typename Base, typename Str>
00180 void jsonParse(const Json::Value & value, CachedRegex<Base, Str> & rex)
00181 {
00182     rex.jsonParse(value);
00183 }
00184 
00185 template<typename Base, typename Str>
00186 inline Json::Value jsonPrint(const CachedRegex<Base, Str> & rex)
00187 {
00188     return jsonPrint(rex.base);
00189 }
00190 
00191 template<typename Base, typename Str, typename Cache>
00192 inline bool matches(const CachedRegex<Base, Str> & rex,
00193                     const Str & str, uint64_t strHash,
00194                     Cache & cache)
00195 {
00196     if (strHash == 0)
00197         throw ML::Exception("zero string hash");
00198     return rex.matches(str, strHash, cache);
00199 }
00200 
00201 template<typename Base, typename Str>
00202 inline bool matches(const CachedRegex<Base, Str> & rex, const Str & str)
00203 {
00204     return rex.matches(str);
00205 }
00206 
00207 
00208 /*****************************************************************************/
00209 /* URL MATCHER                                                               */
00210 /*****************************************************************************/
00211 
00212 struct DomainMatcher {
00213     boost::regex rex;
00214     bool isLiteral;
00215     std::string str;
00216     uint64_t hash;
00217     
00218     DomainMatcher()
00219         : hash(0)
00220     {
00221     }
00222 
00223     DomainMatcher(const std::string & val)
00224         : str(val), hash(std::hash<std::string>() (val))
00225     {
00226         isLiteral = true;
00227         return;
00228         if (false
00229             && val.find('*') == std::string::npos
00230             && val.find('/') == std::string::npos
00231             && val.find('?') == std::string::npos
00232             && val.find(':') == std::string::npos
00233             && val.find('.') != std::string::npos) {
00234             isLiteral = true;
00235         }
00236         else {
00237             rex = boost::regex(val);
00238             isLiteral = false;
00239         }
00240     }
00241 
00242     void jsonParse(const Json::Value & val)
00243     {
00244         std::string s = val.asString();
00245         *this = DomainMatcher(s);
00246     }
00247 
00248     bool operator < (const DomainMatcher & other) const
00249     {
00250         return str < other.str;
00251     }
00252 
00253     bool matches(const Url & url) const
00254     {
00255         if (isLiteral)
00256             return url.domainMatches(str);
00257         else return RTBKIT::matches(rex, url.host());
00258     }
00259 
00260     bool matches(const Url & url, uint64_t urlHash,
00261                  ML::Lightweight_Hash<uint64_t, int> & cache) const
00262     {
00263         uint64_t bucket = hash ^ (urlHash >> 1);
00264         bucket += (bucket == 0);
00265         int & cached = cache[bucket];
00266         if (cached == 0)
00267             cached = matches(url) + 1;
00268         return cached - 1;
00269     }
00270 };
00271 
00272 inline void jsonParse(const Json::Value & value, DomainMatcher & rex)
00273 {
00274     rex.jsonParse(value);
00275 }
00276 
00277 inline Json::Value jsonPrint(const DomainMatcher & rex)
00278 {
00279     return rex.str;
00280 }
00281 
00282 template<typename Cache>
00283 inline bool matches(const DomainMatcher & rex, const Url & url,
00284                     uint64_t urlHash,
00285                     Cache & cache)
00286 {
00287     return rex.matches(url, urlHash, cache);
00288 }
00289 
00290 inline bool matches(const DomainMatcher & rex, const Url & url)
00291 {
00292     return rex.matches(url);
00293 }
00294 
00295 
00296 /*****************************************************************************/
00297 /* MATCHING AND PARSING FUNCTIONS                                            */
00298 /*****************************************************************************/
00299 
00300 template<typename T, typename Fn>
00301 Json::Value
00302 collectionToJson(const std::vector<T> & vec, Fn fn)
00303 {
00304     Json::Value result;
00305     for (unsigned i = 0;  i < vec.size();  ++i)
00306         result[i] = fn(vec[i]);
00307     return result;
00308 }
00309 
00310 template<typename T, typename Fn>
00311 Json::Value
00312 collectionToJson(const std::set<T> & s, Fn fn)
00313 {
00314     Json::Value result;
00315     unsigned i = 0;
00316     for (auto it = s.begin(), end = s.end();  it != end;  ++it, ++i)
00317         result[i] = fn(*it);
00318     return result;
00319 }
00320 
00321 template<typename Collection, typename Fn>
00322 Json::Value
00323 includeExcludeToJson(const Collection & include,
00324                      const Collection & exclude,
00325                      Fn fn)
00326 {
00327     Json::Value result;
00328     if (!include.empty())
00329         result["include"] = collectionToJson(include, fn);
00330     if (!exclude.empty())
00331         result["exclude"] = collectionToJson(exclude, fn);
00332     return result;
00333 }
00334 
/** Returns true when \p key matches at least one entry of \p values.  An
    empty \p values list returns \p matchIfEmpty instead. */
template<typename T, typename U>
bool matchesAny(const std::vector<T> & values, const U & key, bool matchIfEmpty)
{
    if (values.empty())
        return matchIfEmpty;

    for (auto it = values.begin(), end = values.end();  it != end;  ++it) {
        if (matches(*it, key))
            return true;
    }
    return false;
}
00348 
/** Cached variant of matchesAny(): returns true when \p key matches at least
    one entry of \p values, passing \p keyHash and \p cache through to each
    individual match.  An empty \p values list returns \p matchIfEmpty. */
template<typename T, typename U, typename Cache>
bool matchesAny(const std::vector<T> & values,
                const U & key, uint64_t keyHash,
                bool matchIfEmpty,
                Cache & cache)
{
    if (values.empty())
        return matchIfEmpty;

    for (auto it = values.begin(), end = values.end();  it != end;  ++it) {
        if (matches(*it, key, keyHash, cache))
            return true;
    }
    return false;
}
00365 
/** Returns true when any element of \p vec matches any entry of \p values.
    An empty \p values list returns \p matchIfEmpty.

    TODO: this is O(mn) but could be O(n+m) since they are sorted
*/
template<typename T, class Vec>
bool matchesAnyAny(const std::vector<T> & values, const Vec & vec,
                   bool matchIfEmpty)
{
    if (values.empty())
        return matchIfEmpty;

    auto candidate = vec.begin();
    const auto last = vec.end();
    while (candidate != last) {
        if (matchesAny(values, *candidate, matchIfEmpty))
            return true;
        ++candidate;
    }
    return false;
}
00377 
00378 bool matchesAnyAny(const std::vector<int> & values, const SegmentList & vals,
00379                    bool matchIfEmpty);
00380 
/** Result codes for an include/exclude check.  The numeric values are the
    ones the compiler would assign implicitly, written out explicitly. */
enum IncludeExcludeResult {
    IE_NO_DATA      = 0,
    IE_NOT_INCLUDED = 1,
    IE_EXCLUDED     = 2,
    IE_PASSED       = 3
};
00387 
/** A value passes when it matches the include list (an empty include list
    accepts everything) and does not match the exclude list (an empty exclude
    list rejects nothing).  The exclude list is only consulted once the
    include test has passed. */
template<typename U, typename IE>
bool isIncludedImpl(const U & value, const IE & include, const IE & exclude)
{
    return matchesAny(include, value, true)
        && !matchesAny(exclude, value, false);
}
00395 
/** Cached variant of isIncludedImpl(): same include-then-exclude rule, with
    \p hash and \p cache passed through to the individual matches.  The
    exclude list is only consulted once the include test has passed. */
template<typename U, typename IE, typename Cache>
bool isIncludedImpl(const U & value, uint64_t hash,
                    const IE & include, const IE & exclude,
                    Cache & cache)
{
    return matchesAny(include, value, hash, true, cache)
        && !matchesAny(exclude, value, hash, false, cache);
}
00405 
/** Collection form of the include/exclude rule: passes when some element of
    \p vec matches the include list (empty include accepts everything) and no
    element of \p vec matches the exclude list (empty exclude rejects
    nothing).  The exclude list is only consulted once the include test has
    passed. */
template<typename Vec, typename IE>
bool anyIsIncludedImpl(const Vec & vec, const IE & include, const IE & exclude)
{
    return matchesAnyAny(include, vec, true)
        && !matchesAnyAny(exclude, vec, false);
}
00413 
00414 
00415 
00416 /*****************************************************************************/
00417 /* INCLUDE EXCLUDE                                                           */
00418 /*****************************************************************************/
00419 
00420 template<typename T, typename IE = std::vector<T> >
00421 struct IncludeExclude {
00422     IE include;
00423     IE exclude;
00424 
00425     static IncludeExclude
00426     createFromJson(const Json::Value & val,
00427                    const std::string & name)
00428     {
00429         IncludeExclude result;
00430 
00431         for (auto jt = val.begin(), jend = val.end();  jt != jend;  ++jt) {
00432             if (jt.memberName() != "include"
00433                 && jt.memberName() != "exclude")
00434                 throw ML::Exception("filter %s has invalid key: %s",
00435                                     name.c_str(), jt.memberName().c_str());
00436             
00437             const Json::Value & val = *jt;
00438             for (unsigned i = 0;  i != val.size();  ++i) {
00439                 try {
00440                     T t;
00441                     jsonParse(val[i], t);
00442                     if (jt.memberName() == "include")
00443                         result.include.push_back(t);
00444                     else result.exclude.push_back(t);
00445                 } catch (...) {
00446                     throw ML::Exception("error parsing include/exclude %s in %s",
00447                                         val[i].toString().c_str(), name.c_str());
00448                 }
00449             }
00450         }
00451 
00452         std::sort(result.include.begin(), result.include.end());
00453         std::sort(result.exclude.begin(), result.exclude.end());
00454 
00455         return result;
00456     }
00457 
00458     void fromJson(const Json::Value & val, const std::string & name)
00459     {
00460         *this = createFromJson(val, name);
00461     }
00462 
00463     Json::Value toJson() const
00464     {
00465         Json::Value result = includeExcludeToJson(include, exclude, JsonPrint());
00466         return result;
00467     }
00468 
00469     bool empty() const { return include.empty() && exclude.empty(); }
00470 
00471     template<typename U>
00472     bool isIncluded(const U & value) const
00473     {
00474         return isIncludedImpl(value, include, exclude);
00475     }
00476     
00477     template<typename U, typename Cache>
00478     bool isIncluded(const U & value, uint64_t hash, Cache & cache) const
00479     {
00480         return isIncludedImpl(value, hash, include, exclude, cache);
00481     }
00482     
00483     template<typename Vec>
00484     bool anyIsIncluded(const Vec & vec) const
00485     {
00486         return anyIsIncludedImpl(vec, include, exclude);
00487     }
00488 };
00489 
00490 extern template class IncludeExclude<std::string>;
00491 extern template class IncludeExclude<boost::regex>;
00492 extern template class IncludeExclude<boost::u32regex>;
00493 extern template class IncludeExclude<int>;
00494 
00495 } // namespace RTBKIT
00496 
00497 #endif /* __rtb_router__include_exclude_h__ */
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator