tests/api_unicode.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2006,2007,2008 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
#include <config.h>

#include "api_unicode.h"

#include <xapian.h>

#include "apitest.h"
#include "testutils.h"

#include <cctype>
#include <cstring>
00031 
00032 using namespace std;
00033 
// One row of the utf8iterator1 table: two byte strings which are expected to
// produce the same sequence of values when walked with Xapian::Utf8Iterator.
struct testcase {
    const char * a;
    const char * b;
};

// Input pairs for utf8iterator1.  The table is terminated by an entry whose
// `a` member is null.
static const testcase testcases[] = {
    // Sanity check!  Plain ASCII compared against itself.
    { "abcd", "abcd" },
    // A lone 0x80 byte in the middle of the string, paired with the two-byte
    // UTF-8 sequence \xc2\x80.
    { "a\x80""bcd", "a\xc2\x80""bcd" },
    // A lone 0xa0 byte at the end of the string, paired with \xc2\xa0.
    { "a\xa0", "a\xc2\xa0" },
    // End-of-table marker.
    { 0, 0 }
};
00044 
00045 // Test handling of invalid UTF-8 is as desired.
00046 DEFINE_TESTCASE(utf8iterator1,!backend) {
00047     const testcase * p;
00048     for (p = testcases; p->a; ++p) {
00049         tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
00050         size_t a_len = strlen(p->a);
00051         Xapian::Utf8Iterator a(p->a, a_len);
00052 
00053         size_t b_len = strlen(p->b);
00054         Xapian::Utf8Iterator b(p->b, b_len);
00055 
00056         while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
00057             TEST_EQUAL(*a, *b);
00058             ++a;
00059             ++b;
00060         }
00061 
00062         // Test that we don't reach the end of one before the other.
00063         TEST(a == Xapian::Utf8Iterator());
00064         TEST(b == Xapian::Utf8Iterator());
00065     }
00066     return true;
00067 }
00068 
// One row of the utf8iterator2 table: a byte string `a` and the single
// value `n` which Xapian::Utf8Iterator is expected to yield for it.
struct testcase2 {
    const char * a;
    unsigned long n;
};

// Inputs for utf8iterator2.  The table is terminated by an entry whose `a`
// member is null.
static const testcase2 testcases2[] = {
    // ASCII 'a'.
    { "a", 0x61 },
    // Lone bytes 0x80 and 0xa0: expected to yield the byte's own value.
    { "\x80", 0x80 },
    { "\xa0", 0xA0 },
    // The two-byte sequences which yield those same two values.
    { "\xc2\x80", 0x80 },
    { "\xc2\xa0", 0xA0 },
    // A four-byte sequence (a value above 0xFFFF).
    { "\xf0\xa8\xa8\x8f", 0x28A0F },
    // End-of-table marker.
    { 0, 0 }
};
00083 
00084 // Test decoding of UTF-8.
00085 DEFINE_TESTCASE(utf8iterator2,!backend) {
00086     const testcase2 * p;
00087     for (p = testcases2; p->a; ++p) {
00088         Xapian::Utf8Iterator a(p->a, strlen(p->a));
00089 
00090         TEST(a != Xapian::Utf8Iterator());
00091         TEST_EQUAL(*a, p->n);
00092         TEST(++a == Xapian::Utf8Iterator());
00093     }
00094     return true;
00095 }
00096 
00097 // Test Unicode categorisation.
00098 DEFINE_TESTCASE(unicode1,!backend) {
00099     using namespace Xapian;
00100     TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER);
00101     TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER);
00102     TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL);
00103     TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL);
00104     // U+0242 was added in Unicode 5.0.0.
00105     TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER);
00106     TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED);
00107     // Test characters outside BMP.
00108     TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER);
00109     TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE);
00110     TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED);
00111     // Test some invalid Unicode values.
00112     TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED);
00113     TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED);
00114     return true;
00115 }
00116 
00117 DEFINE_TESTCASE(caseconvert1,!backend) {
00118     using namespace Xapian;
00119     for (unsigned ch = 0; ch < 128; ++ch) {
00120         if (isupper((char)ch)) {
00121             TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower((char)ch)));
00122         } else {
00123             TEST_EQUAL(Unicode::tolower(ch), ch);
00124         }
00125         if (islower((char)ch)) {
00126             TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper((char)ch)));
00127         } else {
00128             TEST_EQUAL(Unicode::toupper(ch), ch);
00129         }
00130     }
00131 
00132     // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
00133     TEST_EQUAL(Unicode::tolower(0x242), 0x242);
00134     TEST_EQUAL(Unicode::toupper(0x242), 0x241);
00135     TEST_EQUAL(Unicode::toupper(0x241), 0x241);
00136     TEST_EQUAL(Unicode::tolower(0x241), 0x242);
00137 
00138     // Pound currency symbol:
00139     TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
00140     TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
00141     // Unassigned:
00142     TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
00143     TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
00144     // Test characters outside BMP.
00145     TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
00146     TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
00147     TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
00148     TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
00149     TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
00150     TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
00151     // Test some invalid Unicode values.
00152     TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
00153     TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
00154     TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
00155     TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
00156 
00157     return true;
00158 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.