RTBKit
0.9
Open-source framework to create real-time ad bidding systems.
|
00001 /* string_test.cc 00002 Copyright (c) 2012 Datacratic. All rights reserved. 00003 */ 00004 00005 #define BOOST_TEST_MAIN 00006 #define BOOST_TEST_DYN_LINK 00007 #include "soa/types/string.h" 00008 #include <boost/test/unit_test.hpp> 00009 #include <boost/regex/icu.hpp> 00010 #include <boost/regex.hpp> 00011 #include "soa/jsoncpp/json.h" 00012 #include "jml/arch/format.h" 00013 00014 using namespace std; 00015 using namespace ML; 00016 using namespace Datacratic; 00017 00018 00019 BOOST_AUTO_TEST_CASE( test_print_format ) 00020 { 00021 std::string raw = "saint-jérôme"; 00022 // Test 1 - Iterate through the raw string with normal iterators we should not find 'é' 00023 unsigned numAccentedChars = 0; 00024 for(string::const_iterator it = raw.begin() ; it != raw.end(); ++it) 00025 { 00026 if (*it == L'é' || *it == L'ô') 00027 numAccentedChars++; 00028 } 00029 BOOST_CHECK_EQUAL(numAccentedChars, 0); 00030 Utf8String utf8(raw); 00031 // Now iterate through the utf8 string 00032 for (Utf8String::const_iterator it = utf8.begin(); it != utf8.end(); ++it) 00033 { 00034 if (*it == L'é' || *it == L'ô') 00035 numAccentedChars++; 00036 } 00037 BOOST_CHECK_EQUAL(numAccentedChars, 2); 00038 // Now add another string to it 00039 std::string raw2 = "saint-jérôme2"; 00040 utf8+=raw2; 00041 numAccentedChars=0; 00042 // Now iterate through the utf8 string 00043 for (Utf8String::const_iterator it = utf8.begin(); it != utf8.end(); ++it) 00044 { 00045 if (*it == L'é' || *it == L'ô') 00046 numAccentedChars++; 00047 } 00048 BOOST_CHECK_EQUAL(numAccentedChars, 4); 00049 string theString(utf8.rawData(), utf8.rawLength()); 00050 size_t found = raw.find(L'é') ; 00051 BOOST_CHECK_EQUAL(found, string::npos); 00052 // We do a normal regex first 00053 boost::regex reg("é"); 00054 std::string raw4 = "saint-jérôme"; 00055 BOOST_CHECK_EQUAL( boost::regex_search(raw4, reg), true); 00056 // Please see Saint-j\xC3A9r\xC3B4me for UTF-8 character table 00057 boost::u32regex withHex = boost::make_u32regex("saint-j\xc3\xa9r\xc3\xb4me"); 00058 boost::u32regex withoutHex = boost::make_u32regex(L"[a-z]*-jérôme"); 00059 boost::match_results<std::string::const_iterator> matches; 00060 BOOST_CHECK_EQUAL(boost::u32regex_search(raw, matches, withoutHex), true); 00061 if (boost::u32regex_search(raw, matches, withoutHex)) 00062 { 00063 for (boost::match_results< std::string::const_iterator >::const_iterator i = matches.begin(); i != matches.end(); ++i) 00064 { 00065 if (i->matched) std::cout << "matches : [" << i->str() << "]\n"; 00066 else std::cout << "doesn't match : [" << i->str() << "]\n"; 00067 } 00068 } 00069 else 00070 { 00071 cerr << "did not get a match without hex" << endl; 00072 } 00073 BOOST_CHECK_EQUAL(boost::u32regex_search(raw, matches, withHex), true); 00074 }