![]() |
RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
00001 /* include_exclude.h -*- C++ -*- 00002 Jeremy Barnes, 8 March 2012 00003 Copyright (c) 2012 Datacratic. All rights reserved. 00004 00005 Deals with lists of include/exclude items. 00006 */ 00007 00008 #ifndef __rtb_router__include_exclude_h__ 00009 #define __rtb_router__include_exclude_h__ 00010 00011 #include "jml/arch/exception.h" 00012 #include "jml/utils/lightweight_hash.h" 00013 #include "soa/types/url.h" 00014 #include "soa/jsoncpp/value.h" 00015 #include <boost/regex.hpp> 00016 #include <boost/regex/icu.hpp> 00017 #include "soa/types/string.h" 00018 #include <vector> 00019 #include <set> 00020 #include <iostream> 00021 00022 00023 namespace RTBKIT { 00024 00025 using namespace Datacratic; 00026 00027 struct SegmentList; 00028 00029 template<typename T> 00030 inline bool matches(const T & t1, const T & t2) 00031 { 00032 return t1 == t2; 00033 } 00034 00035 inline bool matches(const boost::u32regex & rex, const Utf8String & val) 00036 { 00037 std::string raw(val.rawData(), val.rawLength()); 00038 boost::match_results<std::string::const_iterator> matches; 00039 bool result = boost::u32regex_search(raw, matches, rex) ; 00040 return result; 00041 } 00042 00043 inline bool matches(const boost::regex & rex, const std::string & val) 00044 { 00045 //using namespace std; 00046 //cerr << "matching " << val << " with rex " << rex.str() << endl; 00047 return boost::regex_search(val, rex); 00048 } 00049 #if 0 00050 inline bool matches(const std::string & str, const std::string & val) 00051 { 00052 return str == val; 00053 } 00054 00055 inline bool matches(int i, int j) 00056 { 00057 return i == j; 00058 } 00059 #endif 00060 00061 void jsonParse(const Json::Value & value, boost::regex & reg); 00062 void jsonParse(const Json::Value & value, boost::u32regex & reg); 00063 void jsonParse(const Json::Value & value, std::string & str); 00064 void jsonParse(const Json::Value & value, int & i); 00065 00066 inline Json::Value jsonPrint(const boost::regex & rex) 00067 { 00068 return rex.str(); 00069 } 00070 00071 inline Json::Value jsonPrint(const boost::u32regex & rex) 00072 { 00073 std::vector<unsigned char> utf8result; 00074 std::basic_string<int,std::char_traits<int>, std::allocator<int> > 00075 unicodeStr = rex.str(); 00076 utf8::utf32to8(unicodeStr.begin(), 00077 unicodeStr.begin() + unicodeStr.length(), 00078 std::back_inserter(utf8result)); 00079 Utf8String utf8str(std::string(utf8result.begin(), utf8result.end())); 00080 return utf8str; 00081 } 00082 00083 00084 inline Json::Value jsonPrint(const std::string & str) 00085 { 00086 return str; 00087 } 00088 00089 inline Json::Value jsonPrint(int i) 00090 { 00091 return i; 00092 } 00093 00094 struct JsonPrint { 00095 template<typename T> 00096 Json::Value operator () (const T & t) const 00097 { 00098 return jsonPrint(t); 00099 } 00100 }; 00101 00102 inline uint64_t hashString(const std::string & str) 00103 { 00104 uint64_t res = std::hash<std::string>()(str); 00105 //cerr << "hashString of " << str << " returned " << res << endl; 00106 return res; 00107 } 00108 00109 inline uint64_t hashString(const Utf8String & str) 00110 { 00111 return std::hash<std::string>()(std::string(str.rawData(), str.rawLength())); 00112 } 00113 00114 inline uint64_t hashString(const std::wstring & str) 00115 { 00116 uint64_t res = std::hash<std::wstring>()(str); 00117 return res; 00118 } 00119 00120 inline void createRegex(boost::u32regex & regex, const wchar_t * str) 00121 { 00122 regex = boost::make_u32regex(str); 00123 } 00124 00125 inline void createRegex(boost::regex & regex, const std::string & str) 00126 { 00127 regex = boost::regex(str); 00128 } 00129 00130 /*****************************************************************************/ 00131 /* CACHED REGEX */ 00132 /*****************************************************************************/ 00133 00134 template<typename Base, typename Str> 00135 struct CachedRegex { 00136 Base base; 00137 uint64_t hash; 00138 00139 CachedRegex() 00140 : hash(0) 00141 { 00142 } 00143 00144 template<typename InitStr> 00145 CachedRegex(const InitStr & val) 00146 : hash(hashString(val)) 00147 { 00148 createRegex(base, val); 00149 } 00150 00151 void jsonParse(const Json::Value & val) 00152 { 00153 RTBKIT::jsonParse(val, base); 00154 hash = std::hash<std::string>() (val.asString()); 00155 } 00156 00157 bool operator < (const CachedRegex & other) const 00158 { 00159 return base < other.base; 00160 } 00161 00162 bool matches(const Str & str) const 00163 { 00164 return RTBKIT::matches(base, str); 00165 } 00166 00167 bool matches(const Str & str, uint64_t strHash, 00168 ML::Lightweight_Hash<uint64_t, int> & cache) const 00169 { 00170 uint64_t bucket = hash ^ (strHash >> 1); 00171 bucket += (bucket == 0); 00172 int & cached = cache[bucket]; 00173 if (cached == 0) 00174 cached = RTBKIT::matches(base, str) + 1; 00175 return cached - 1; 00176 } 00177 }; 00178 00179 template<typename Base, typename Str> 00180 void jsonParse(const Json::Value & value, CachedRegex<Base, Str> & rex) 00181 { 00182 rex.jsonParse(value); 00183 } 00184 00185 template<typename Base, typename Str> 00186 inline Json::Value jsonPrint(const CachedRegex<Base, Str> & rex) 00187 { 00188 return jsonPrint(rex.base); 00189 } 00190 00191 template<typename Base, typename Str, typename Cache> 00192 inline bool matches(const CachedRegex<Base, Str> & rex, 00193 const Str & str, uint64_t strHash, 00194 Cache & cache) 00195 { 00196 if (strHash == 0) 00197 throw ML::Exception("zero string hash"); 00198 return rex.matches(str, strHash, cache); 00199 } 00200 00201 template<typename Base, typename Str> 00202 inline bool matches(const CachedRegex<Base, Str> & rex, const Str & str) 00203 { 00204 return rex.matches(str); 00205 } 00206 00207 00208 /*****************************************************************************/ 00209 /* URL MATCHER */ 00210 /*****************************************************************************/ 00211 00212 struct DomainMatcher { 00213 boost::regex rex; 00214 bool isLiteral; 00215 std::string str; 00216 uint64_t hash; 00217 00218 DomainMatcher() 00219 : hash(0) 00220 { 00221 } 00222 00223 DomainMatcher(const std::string & val) 00224 : str(val), hash(std::hash<std::string>() (val)) 00225 { 00226 isLiteral = true; 00227 return; 00228 if (false 00229 && val.find('*') == std::string::npos 00230 && val.find('/') == std::string::npos 00231 && val.find('?') == std::string::npos 00232 && val.find(':') == std::string::npos 00233 && val.find('.') != std::string::npos) { 00234 isLiteral = true; 00235 } 00236 else { 00237 rex = boost::regex(val); 00238 isLiteral = false; 00239 } 00240 } 00241 00242 void jsonParse(const Json::Value & val) 00243 { 00244 std::string s = val.asString(); 00245 *this = DomainMatcher(s); 00246 } 00247 00248 bool operator < (const DomainMatcher & other) const 00249 { 00250 return str < other.str; 00251 } 00252 00253 bool matches(const Url & url) const 00254 { 00255 if (isLiteral) 00256 return url.domainMatches(str); 00257 else return RTBKIT::matches(rex, url.host()); 00258 } 00259 00260 bool matches(const Url & url, uint64_t urlHash, 00261 ML::Lightweight_Hash<uint64_t, int> & cache) const 00262 { 00263 uint64_t bucket = hash ^ (urlHash >> 1); 00264 bucket += (bucket == 0); 00265 int & cached = cache[bucket]; 00266 if (cached == 0) 00267 cached = matches(url) + 1; 00268 return cached - 1; 00269 } 00270 }; 00271 00272 inline void jsonParse(const Json::Value & value, DomainMatcher & rex) 00273 { 00274 rex.jsonParse(value); 00275 } 00276 00277 inline Json::Value jsonPrint(const DomainMatcher & rex) 00278 { 00279 return rex.str; 00280 } 00281 00282 template<typename Cache> 00283 inline bool matches(const DomainMatcher & rex, const Url & url, 00284 uint64_t urlHash, 00285 Cache & cache) 00286 { 00287 return rex.matches(url, urlHash, cache); 00288 } 00289 00290 inline bool matches(const DomainMatcher & rex, const Url & url) 00291 { 00292 return rex.matches(url); 00293 } 00294 00295 00296 /*****************************************************************************/ 00297 /* MATCHING AND PARSING FUNCTIONS */ 00298 /*****************************************************************************/ 00299 00300 template<typename T, typename Fn> 00301 Json::Value 00302 collectionToJson(const std::vector<T> & vec, Fn fn) 00303 { 00304 Json::Value result; 00305 for (unsigned i = 0; i < vec.size(); ++i) 00306 result[i] = fn(vec[i]); 00307 return result; 00308 } 00309 00310 template<typename T, typename Fn> 00311 Json::Value 00312 collectionToJson(const std::set<T> & s, Fn fn) 00313 { 00314 Json::Value result; 00315 unsigned i = 0; 00316 for (auto it = s.begin(), end = s.end(); it != end; ++it, ++i) 00317 result[i] = fn(*it); 00318 return result; 00319 } 00320 00321 template<typename Collection, typename Fn> 00322 Json::Value 00323 includeExcludeToJson(const Collection & include, 00324 const Collection & exclude, 00325 Fn fn) 00326 { 00327 Json::Value result; 00328 if (!include.empty()) 00329 result["include"] = collectionToJson(include, fn); 00330 if (!exclude.empty()) 00331 result["exclude"] = collectionToJson(exclude, fn); 00332 return result; 00333 } 00334 00335 template<typename T, typename U> 00336 bool matchesAny(const std::vector<T> & values, const U & key, bool matchIfEmpty) 00337 { 00338 if (values.empty()) 00339 { 00340 return matchIfEmpty; 00341 } 00342 for (unsigned i = 0; i < values.size(); ++i) 00343 { 00344 if (matches(values[i], key)) return true; 00345 } 00346 return false; 00347 } 00348 00349 template<typename T, typename U, typename Cache> 00350 bool matchesAny(const std::vector<T> & values, 00351 const U & key, uint64_t keyHash, 00352 bool matchIfEmpty, 00353 Cache & cache) 00354 { 00355 if (values.empty()) 00356 { 00357 return matchIfEmpty; 00358 } 00359 for (unsigned i = 0; i < values.size(); ++i) 00360 { 00361 if (matches(values[i], key, keyHash, cache)) return true; 00362 } 00363 return false; 00364 } 00365 00366 // TODO: this is O(mn) but could be O(n+m) since they are sorted 00367 template<typename T, class Vec> 00368 bool matchesAnyAny(const std::vector<T> & values, const Vec & vec, 00369 bool matchIfEmpty) 00370 { 00371 if (values.empty()) return matchIfEmpty; 00372 00373 for (auto it = vec.begin(), end = vec.end(); it != end; ++it) 00374 if (matchesAny(values, *it, matchIfEmpty)) return true; 00375 return false; 00376 } 00377 00378 bool matchesAnyAny(const std::vector<int> & values, const SegmentList & vals, 00379 bool matchIfEmpty); 00380 00381 enum IncludeExcludeResult { 00382 IE_NO_DATA, 00383 IE_NOT_INCLUDED, 00384 IE_EXCLUDED, 00385 IE_PASSED 00386 }; 00387 00388 template<typename U, typename IE> 00389 bool isIncludedImpl(const U & value, const IE & include, const IE & exclude) 00390 { 00391 if (!matchesAny(include, value, true)) return false; 00392 if (matchesAny(exclude, value, false)) return false; 00393 return true; 00394 } 00395 00396 template<typename U, typename IE, typename Cache> 00397 bool isIncludedImpl(const U & value, uint64_t hash, 00398 const IE & include, const IE & exclude, 00399 Cache & cache) 00400 { 00401 if (!matchesAny(include, value, hash, true, cache)) return false; 00402 if (matchesAny(exclude, value, hash, false, cache)) return false; 00403 return true; 00404 } 00405 00406 template<typename Vec, typename IE> 00407 bool anyIsIncludedImpl(const Vec & vec, const IE & include, const IE & exclude) 00408 { 00409 if (!matchesAnyAny(include, vec, true)) return false; 00410 if (matchesAnyAny(exclude, vec, false)) return false; 00411 return true; 00412 } 00413 00414 00415 00416 /*****************************************************************************/ 00417 /* INCLUDE EXCLUDE */ 00418 /*****************************************************************************/ 00419 00420 template<typename T, typename IE = std::vector<T> > 00421 struct IncludeExclude { 00422 IE include; 00423 IE exclude; 00424 00425 static IncludeExclude 00426 createFromJson(const Json::Value & val, 00427 const std::string & name) 00428 { 00429 IncludeExclude result; 00430 00431 for (auto jt = val.begin(), jend = val.end(); jt != jend; ++jt) { 00432 if (jt.memberName() != "include" 00433 && jt.memberName() != "exclude") 00434 throw ML::Exception("filter %s has invalid key: %s", 00435 name.c_str(), jt.memberName().c_str()); 00436 00437 const Json::Value & val = *jt; 00438 for (unsigned i = 0; i != val.size(); ++i) { 00439 try { 00440 T t; 00441 jsonParse(val[i], t); 00442 if (jt.memberName() == "include") 00443 result.include.push_back(t); 00444 else result.exclude.push_back(t); 00445 } catch (...) { 00446 throw ML::Exception("error parsing include/exclude %s in %s", 00447 val[i].toString().c_str(), name.c_str()); 00448 } 00449 } 00450 } 00451 00452 std::sort(result.include.begin(), result.include.end()); 00453 std::sort(result.exclude.begin(), result.exclude.end()); 00454 00455 return result; 00456 } 00457 00458 void fromJson(const Json::Value & val, const std::string & name) 00459 { 00460 *this = createFromJson(val, name); 00461 } 00462 00463 Json::Value toJson() const 00464 { 00465 Json::Value result = includeExcludeToJson(include, exclude, JsonPrint()); 00466 return result; 00467 } 00468 00469 bool empty() const { return include.empty() && exclude.empty(); } 00470 00471 template<typename U> 00472 bool isIncluded(const U & value) const 00473 { 00474 return isIncludedImpl(value, include, exclude); 00475 } 00476 00477 template<typename U, typename Cache> 00478 bool isIncluded(const U & value, uint64_t hash, Cache & cache) const 00479 { 00480 return isIncludedImpl(value, hash, include, exclude, cache); 00481 } 00482 00483 template<typename Vec> 00484 bool anyIsIncluded(const Vec & vec) const 00485 { 00486 return anyIsIncludedImpl(vec, include, exclude); 00487 } 00488 }; 00489 00490 extern template class IncludeExclude<std::string>; 00491 extern template class IncludeExclude<boost::regex>; 00492 extern template class IncludeExclude<boost::u32regex>; 00493 extern template class IncludeExclude<int>; 00494 00495 } // namespace RTBKIT 00496 00497 #endif /* __rtb_router__include_exclude_h__ */
1.7.6.1