00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <stdlib.h>
00026
00027 #include <string>
00028 #include <fstream>
00029 #include <iostream>
00030
00031 #include <xapian/stem.h>
00032 #include "testsuite.h"
00033
00034 using namespace std;
00035
00036 static const int JUNKSIZE = 2 * 1048576;
00037
00038 static string language;
00039
00040 static Xapian::Stem stemmer;
00041
00042 static string srcdir;
00043
00044 static int seed;
00045
00046
00047 static bool
00048 test_stemrandom()
00049 {
00050 static const char wordchars[] =
00051 "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
00052
00053
00054 if (getenv("OM_STEMTEST_SKIP_RANDOM"))
00055 SKIP_TEST("OM_STEMTEST_SKIP_RANDOM set");
00056
00057 tout << "Stemming random text... (seed " << seed << ")" << endl;
00058 srand(seed);
00059
00060 string word;
00061 int stemmed_size = 0;
00062 for (int c = JUNKSIZE; c; --c) {
00063 char ch = wordchars[(rand() >> 8) % sizeof wordchars];
00064 if (ch) {
00065 word += ch;
00066 continue;
00067 }
00068 stemmed_size += stemmer(word).length();
00069 word.resize(0);
00070 }
00071 stemmed_size += stemmer(word).length();
00072 tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00073 << endl;
00074
00075 if (stemmed_size > JUNKSIZE * 101 / 100) {
00076 FAIL_TEST("Stemmed data is significantly bigger than input: "
00077 << stemmed_size << " vs. " << JUNKSIZE);
00078 }
00079 if (stemmed_size < JUNKSIZE / 2) {
00080 FAIL_TEST("Stemmed data is significantly smaller than input: "
00081 << stemmed_size << " vs. " << JUNKSIZE);
00082 }
00083 return true;
00084 }
00085
00086
00087 static bool
00088 test_stemjunk()
00089 {
00090
00091 if (getenv("OM_STEMTEST_SKIP_RANDOM"))
00092 SKIP_TEST("OM_STEMTEST_SKIP_RANDOM set");
00093
00094 tout << "Stemming random junk... (seed " << seed << ")" << endl;
00095 srand(seed);
00096
00097 string word;
00098 int stemmed_size = 0;
00099 for (int c = JUNKSIZE; c; --c) {
00100 char ch = rand() >> 8;
00101 if (ch) {
00102 word += ch;
00103 continue;
00104 }
00105 stemmed_size += stemmer(word).length();
00106 word.resize(0);
00107 }
00108 stemmed_size += stemmer(word).length();
00109 tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00110 << endl;
00111
00112 if (stemmed_size > JUNKSIZE * 101 / 100) {
00113 FAIL_TEST("Stemmed data is significantly bigger than input ("
00114 << stemmed_size << " vs. " << JUNKSIZE);
00115 }
00116 if (stemmed_size < JUNKSIZE / 2) {
00117 FAIL_TEST("Stemmed data is significantly smaller than input ("
00118 << stemmed_size << " vs. " << JUNKSIZE);
00119 }
00120 return true;
00121 }
00122
00123 static bool
00124 test_stemdict()
00125 {
00126 string dir = srcdir + "/../../xapian-data/stemming/";
00127
00128 ifstream txt((dir + language + ".voc").c_str());
00129 if (!txt.is_open()) {
00130 SKIP_TEST(language + ".voc not found");
00131 }
00132
00133 ifstream st((dir + language + ".st").c_str());
00134 if (!st.is_open()) {
00135 txt.close();
00136 SKIP_TEST(language + ".st not found");
00137 }
00138
00139 int wordcount = 0;
00140
00141 tout << "Testing " << language << " with fixed dictionary..." << endl;
00142
00143 string word, stem, expect;
00144 while (!txt.eof() && !st.eof()) {
00145 getline(txt, word);
00146 getline(st, expect);
00147
00148 stem = stemmer(word);
00149
00150 TEST_EQUAL(stem, expect);
00151 ++wordcount;
00152 }
00153 txt.close();
00154 st.close();
00155
00156 return true;
00157 }
00158
00159
00160
00161
00162
00164 test_desc tests[] = {
00165 {"stemrandom", test_stemrandom},
00166 {"stemjunk", test_stemjunk},
00167 {"stemdict", test_stemdict},
00168 {0, 0}
00169 };
00170
00171 int main(int argc, char **argv)
00172 {
00173 string langs, seed_str;
00174
00175 char * val;
00176
00177 val = getenv("OM_STEMTEST_LANGUAGES");
00178 if (val && *val)
00179 langs = val;
00180 else
00181 langs = Xapian::Stem::get_available_languages();
00182 test_driver::add_command_line_option("languages", 'l', &langs);
00183
00184 seed = 42;
00185
00186
00187 val = getenv("OM_STEMTEST_SEED");
00188 if (val && *val) {
00189 seed_str = val;
00190 } else {
00191
00192
00193 }
00194 test_driver::add_command_line_option("seed", 's', &seed_str);
00195
00196 test_driver::parse_command_line(argc, argv);
00197 srcdir = test_driver::get_srcdir();
00198 int result = 0;
00199
00200 if (!seed_str.empty()) seed = atoi(seed_str.c_str());
00201 cout << "The random seed is " << seed << endl;
00202 cout << "Please report the seed when reporting a test failure." << endl;
00203
00204 string::size_type b = 0;
00205 while (b != langs.size()) {
00206 string::size_type a = b;
00207 while (b < langs.size() && langs[b] != ' ') ++b;
00208 language = langs.substr(a, b - a);
00209 while (b < langs.size() && langs[b] == ' ') ++b;
00210 cout << "Running tests with " << language << " stemmer..." << endl;
00211 stemmer = Xapian::Stem(language);
00212 result = max(result, test_driver::run(tests));
00213 }
00214 return result;
00215 }