tests/stemtest.cc

Go to the documentation of this file.
00001 /* stemtest.cc
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <stdlib.h>
00026 
00027 #include <string>
00028 #include <fstream>
00029 #include <iostream>
00030 
00031 #include <xapian/stem.h>
00032 #include "testsuite.h"
00033 
00034 using namespace std;
00035 
00036 static const int JUNKSIZE = 2 * 1048576;
00037 
00038 static string language;
00039 
00040 static Xapian::Stem stemmer;
00041 
00042 static string srcdir;
00043 
00044 static int seed;
00045 
00046 // run stemmers on random text
00047 static bool
00048 test_stemrandom()
00049 {
00050     static const char wordchars[] =
00051         "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
00052 
00053     // FIXME:1.1: remove check for OM_STEMTEST_SKIP_RANDOM
00054     if (getenv("OM_STEMTEST_SKIP_RANDOM"))
00055         SKIP_TEST("OM_STEMTEST_SKIP_RANDOM set");
00056 
00057     tout << "Stemming random text... (seed " << seed << ")" << endl;
00058     srand(seed);
00059 
00060     string word;
00061     int stemmed_size = 0;
00062     for (int c = JUNKSIZE; c; --c) {
00063         char ch = wordchars[(rand() >> 8) % sizeof wordchars];
00064         if (ch) {
00065             word += ch;
00066             continue;
00067         }
00068         stemmed_size += stemmer(word).length();
00069         word.resize(0);
00070     }
00071     stemmed_size += stemmer(word).length();
00072     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00073          << endl;
00074 
00075     if (stemmed_size > JUNKSIZE * 101 / 100) {
00076         FAIL_TEST("Stemmed data is significantly bigger than input: "
00077                   << stemmed_size << " vs. " << JUNKSIZE);
00078     }
00079     if (stemmed_size < JUNKSIZE / 2) {
00080         FAIL_TEST("Stemmed data is significantly smaller than input: "
00081                   << stemmed_size << " vs. " << JUNKSIZE);
00082     }
00083     return true;
00084 }
00085 
00086 // run stemmers on random junk
00087 static bool
00088 test_stemjunk()
00089 {
00090     // FIXME:1.1: remove check for OM_STEMTEST_SKIP_RANDOM
00091     if (getenv("OM_STEMTEST_SKIP_RANDOM"))
00092         SKIP_TEST("OM_STEMTEST_SKIP_RANDOM set");
00093 
00094     tout << "Stemming random junk... (seed " << seed << ")" << endl;
00095     srand(seed);
00096 
00097     string word;
00098     int stemmed_size = 0;
00099     for (int c = JUNKSIZE; c; --c) {
00100         char ch = rand() >> 8;
00101         if (ch) {
00102             word += ch;
00103             continue;
00104         }
00105         stemmed_size += stemmer(word).length();
00106         word.resize(0);
00107     }
00108     stemmed_size += stemmer(word).length();
00109     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
00110          << endl;
00111 
00112     if (stemmed_size > JUNKSIZE * 101 / 100) {
00113         FAIL_TEST("Stemmed data is significantly bigger than input ("
00114                   << stemmed_size << " vs. " << JUNKSIZE);
00115     }
00116     if (stemmed_size < JUNKSIZE / 2) {
00117         FAIL_TEST("Stemmed data is significantly smaller than input ("
00118                   << stemmed_size << " vs. " << JUNKSIZE);
00119     }
00120     return true;
00121 }
00122 
00123 static bool
00124 test_stemdict()
00125 {
00126     string dir = srcdir + "/../../xapian-data/stemming/";
00127 
00128     ifstream txt((dir + language + ".voc").c_str());
00129     if (!txt.is_open()) {
00130         SKIP_TEST(language + ".voc not found");
00131     }
00132 
00133     ifstream st((dir + language + ".st").c_str());
00134     if (!st.is_open()) {
00135         txt.close();
00136         SKIP_TEST(language + ".st not found");
00137     }
00138 
00139     int wordcount = 0;
00140 
00141     tout << "Testing " << language << " with fixed dictionary..." << endl;
00142 
00143     string word, stem, expect;
00144     while (!txt.eof() && !st.eof()) {
00145         getline(txt, word);
00146         getline(st, expect);
00147 
00148         stem = stemmer(word);
00149 
00150         TEST_EQUAL(stem, expect);
00151         ++wordcount;
00152     }
00153     txt.close();
00154     st.close();
00155 
00156     return true;
00157 }
00158 
00159 // ##################################################################
00160 // # End of actual tests                                            #
00161 // ##################################################################
00162 
00164 test_desc tests[] = {
00165     {"stemrandom",              test_stemrandom},
00166     {"stemjunk",                test_stemjunk},
00167     {"stemdict",                test_stemdict},
00168     {0, 0}
00169 };
00170 
00171 int main(int argc, char **argv)
00172 {
00173     string langs, seed_str;
00174     // Backward compatibility
00175     char * val;
00176     // FIXME:1.1: remove check for OM_STEMTEST_LANGUAGES
00177     val = getenv("OM_STEMTEST_LANGUAGES");
00178     if (val && *val)
00179         langs = val;
00180     else
00181         langs = Xapian::Stem::get_available_languages();
00182     test_driver::add_command_line_option("languages", 'l', &langs);
00183 
00184     seed = 42;
00185     // FIXME:1.1: remove check for OM_STEMTEST_SEED
00186     // Backward compatibility
00187     val = getenv("OM_STEMTEST_SEED");
00188     if (val && *val) {
00189         seed_str = val;
00190     } else {
00191         // FIXME hash hostname like stemtest.pl did???
00192         //$seed = unpack("%32L*", `hostname`);
00193     }
00194     test_driver::add_command_line_option("seed", 's', &seed_str);
00195 
00196     test_driver::parse_command_line(argc, argv);
00197     srcdir = test_driver::get_srcdir();
00198     int result = 0;
00199 
00200     if (!seed_str.empty()) seed = atoi(seed_str.c_str());
00201     cout << "The random seed is " << seed << endl;
00202     cout << "Please report the seed when reporting a test failure." << endl;
00203 
00204     string::size_type b = 0;
00205     while (b != langs.size()) {
00206         string::size_type a = b;
00207         while (b < langs.size() && langs[b] != ' ') ++b;
00208         language = langs.substr(a, b - a);
00209         while (b < langs.size() && langs[b] == ' ') ++b;
00210         cout << "Running tests with " << language << " stemmer..." << endl;
00211         stemmer = Xapian::Stem(language);
00212         result = max(result, test_driver::run(tests));
00213     }
00214     return result;
00215 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.