tests/api_spelling.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007,2008 Olly Betts
00005  * Copyright (C) 2007 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00020  */
00021 
00022 #include <config.h>
00023 
00024 #include "api_spelling.h"
00025 
00026 #include <xapian.h>
00027 
00028 #include "apitest.h"
00029 #include "testsuite.h"
00030 #include "testutils.h"
00031 
00032 #include <string>
00033 
00034 using namespace std;
00035 
00036 // Test basic spelling correction features: frequency preference, single-edit errors on 3- and 2-character words, and the maximum edit distance.
00037 DEFINE_TESTCASE(spell1, spelling) {
00038     Xapian::WritableDatabase db = get_writable_database();
00039 
00040     // Check that the more frequent term is chosen ("hello" and "cell" are both one edit from "hell").
00041     db.add_spelling("hello");
00042     db.add_spelling("cell", 2);  // added with frequency 2
00043     TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");  // checked before flush()
00044     db.flush();
00045     Xapian::Database dbr(get_writable_database_as_database());  // NOTE(review): presumably the same database reopened read-only - confirm
00046     TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");  // same answer expected after flush() via the writable handle
00047     TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "cell");  // and via the second handle
00048 
00049     // Check suggestions for single edit errors to "zig".
00050     db.add_spelling("zig");
00051     // Transpositions:
00052     TEST_EQUAL(db.get_spelling_suggestion("izg"), "zig");
00053     TEST_EQUAL(db.get_spelling_suggestion("zgi"), "zig");
00054     // Substitutions:
00055     TEST_EQUAL(db.get_spelling_suggestion("sig"), "zig");
00056     TEST_EQUAL(db.get_spelling_suggestion("zog"), "zig");
00057     TEST_EQUAL(db.get_spelling_suggestion("zif"), "zig");
00058     // Deletions:
00059     TEST_EQUAL(db.get_spelling_suggestion("ig"), "zig");
00060     TEST_EQUAL(db.get_spelling_suggestion("zg"), "zig");
00061     TEST_EQUAL(db.get_spelling_suggestion("zi"), "zig");
00062     // Insertions:
00063     TEST_EQUAL(db.get_spelling_suggestion("azig"), "zig");
00064     TEST_EQUAL(db.get_spelling_suggestion("zaig"), "zig");
00065     TEST_EQUAL(db.get_spelling_suggestion("ziag"), "zig");
00066     TEST_EQUAL(db.get_spelling_suggestion("ziga"), "zig");
00067 
00068     // Check suggestions for single edit errors to "ch".
00069     db.add_spelling("ch");
00070     // Transpositions:
00071     TEST_EQUAL(db.get_spelling_suggestion("hc"), "ch");
00072     // Substitutions - we don't handle these for two character words:
00073     TEST_EQUAL(db.get_spelling_suggestion("qh"), "");
00074     TEST_EQUAL(db.get_spelling_suggestion("cq"), "");
00075     // Deletions would leave a single character, and we don't handle those.
00076     TEST_EQUAL(db.get_spelling_suggestion("c"), "");
00077     TEST_EQUAL(db.get_spelling_suggestion("h"), "");
00078     // Insertions:
00079     TEST_EQUAL(db.get_spelling_suggestion("qch"), "ch");
00080     TEST_EQUAL(db.get_spelling_suggestion("cqh"), "ch");
00081     TEST_EQUAL(db.get_spelling_suggestion("chq"), "ch");
00082 
00083     // Check assorted cases (one or two edits away from "hello" or "cell"):
00084     TEST_EQUAL(db.get_spelling_suggestion("shello"), "hello");
00085     TEST_EQUAL(db.get_spelling_suggestion("hellot"), "hello");
00086     TEST_EQUAL(db.get_spelling_suggestion("acell"), "cell");
00087     TEST_EQUAL(db.get_spelling_suggestion("cella"), "cell");
00088     TEST_EQUAL(db.get_spelling_suggestion("acella"), "cell");
00089     TEST_EQUAL(db.get_spelling_suggestion("helo"), "hello");
00090     TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
00091     TEST_EQUAL(db.get_spelling_suggestion("helol"), "hello");
00092     TEST_EQUAL(db.get_spelling_suggestion("clel"), "cell");
00093     TEST_EQUAL(db.get_spelling_suggestion("ecll"), "cell");
00094     TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");  // NOTE(review): repeats the "cll" check made a few lines above
00095 
00096     // Check that edit distance 3 isn't found by default (each input is three edits from its target):
00097     TEST_EQUAL(db.get_spelling_suggestion("shelolx"), "");
00098     TEST_EQUAL(db.get_spelling_suggestion("celling"), "");
00099     TEST_EQUAL(db.get_spelling_suggestion("dellin"), "");
00100 
00101     // Check that edit distance 3 is found if specified (second argument is the maximum edit distance):
00102     TEST_EQUAL(db.get_spelling_suggestion("shelolx", 3), "hello");
00103     TEST_EQUAL(db.get_spelling_suggestion("celling", 3), "cell");
00104     TEST_EQUAL(db.get_spelling_suggestion("dellin", 3), "cell");
00105     return true;
00106 }
00107 
00108 // Test spelling correction for Unicode: edits are expected to be counted per UTF-8 character, not per byte.
00109 DEFINE_TESTCASE(spell2, spelling) {
00110     Xapian::WritableDatabase db = get_writable_database();
00111 
00112     // Check that a UTF-8 sequence counts as a single character.
00113     db.add_spelling("h\xc3\xb6hle");  // \xc3\xb6 is the UTF-8 encoding of U+00F6
00114     db.add_spelling("ascii");
00115     TEST_EQUAL(db.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");  // one substitution, maximum distance 1
00116     TEST_EQUAL(db.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");  // one deletion, maximum distance 1
00117     TEST_EQUAL(db.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");  // \xf0\xa8\xa8\x8f is one 4-byte character (U+28A0F)
00118     TEST_EQUAL(db.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");  // default maximum distance
00119     TEST_EQUAL(db.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");  // two substituted 2-byte characters (\xc3\xb7 is U+00F7)
00120     TEST_EQUAL(db.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");  // two inserted 2-byte characters
00121     db.flush();
00122     Xapian::Database dbr(get_writable_database_as_database());  // the same six checks are repeated below through this second handle
00123     TEST_EQUAL(dbr.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
00124     TEST_EQUAL(dbr.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
00125     TEST_EQUAL(dbr.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
00126     TEST_EQUAL(dbr.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
00127     TEST_EQUAL(dbr.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
00128     TEST_EQUAL(dbr.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
00129 
00130     return true;
00131 }
00132 
00133 // Test spelling correction and the spelling iterator when two databases are combined in one Xapian::Database.
00134 DEFINE_TESTCASE(spell3, spelling) {
00135     Xapian::WritableDatabase db1 = get_writable_database();
00136     // We can't just call get_writable_database() since it would delete db1
00137     // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
00138     // changes to db1 are committed.
00139     Xapian::WritableDatabase db2 = get_named_writable_database("spell3", "");
00140 
00141     db1.add_spelling("hello");
00142     db1.add_spelling("cell", 2);
00143     db2.add_spelling("hello", 2);  // "hello" is in both databases: frequency 1 in db1, 2 in db2
00144     db2.add_spelling("helo");
00145 
00146     Xapian::Database db;  // combined view over db1 and db2
00147     db.add_database(db1);
00148     db.add_database(db2);
00149 
00150     TEST_EQUAL(db.get_spelling_suggestion("hello"), "");  // "hello" is itself a stored spelling; no suggestion is expected
00151     TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");  // combined: "hello" has frequency 3, "cell" has 2
00152     TEST_EQUAL(db1.get_spelling_suggestion("hell"), "cell");  // db1 alone: "cell" (2) outranks "hello" (1)
00153     TEST_EQUAL(db2.get_spelling_suggestion("hell"), "hello");  // db2 alone: "hello" (2) outranks "helo" (1)
00154 
00155 
00156     // Test spelling iterator (entries are expected in sorted order, each with its frequency)
00157     Xapian::TermIterator i(db1.spellings_begin());
00158     TEST_EQUAL(*i, "cell");
00159     TEST_EQUAL(i.get_termfreq(), 2);
00160     ++i;
00161     TEST_EQUAL(*i, "hello");
00162     TEST_EQUAL(i.get_termfreq(), 1);
00163     ++i;
00164     TEST(i == db1.spellings_end());
00165 
00166     i = db2.spellings_begin();
00167     TEST_EQUAL(*i, "hello");
00168     TEST_EQUAL(i.get_termfreq(), 2);
00169     ++i;
00170     TEST_EQUAL(*i, "helo");
00171     TEST_EQUAL(i.get_termfreq(), 1);
00172     ++i;
00173     TEST(i == db2.spellings_end());
00174 
00175     i = db.spellings_begin();  // combined database: entries from db1 and db2 merged
00176     TEST_EQUAL(*i, "cell");
00177     TEST_EQUAL(i.get_termfreq(), 2);
00178     ++i;
00179     TEST_EQUAL(*i, "hello");
00180     TEST_EQUAL(i.get_termfreq(), 3);  // 1 from db1 plus 2 from db2
00181     ++i;
00182     TEST_EQUAL(*i, "helo");
00183     TEST_EQUAL(i.get_termfreq(), 1);
00184     ++i;
00185     TEST(i == db.spellings_end());
00186 
00187     return true;
00188 }
00189 
00190 // Regression test - check that appending works correctly (a spelling added after an earlier flush() must not disturb the lookup).
00191 DEFINE_TESTCASE(spell4, spelling) {
00192     Xapian::WritableDatabase db = get_writable_database();
00193 
00194     db.add_spelling("check");
00195     db.add_spelling("pecks", 2);
00196     db.flush();
00197     db.add_spelling("becky");  // added after the first flush(), i.e. appended to already-flushed spelling data
00198     db.flush();
00199 
00200     TEST_EQUAL(db.get_spelling_suggestion("jeck", 2), "pecks");  // all three spellings are two edits from "jeck"; "pecks" has the highest frequency
00201 
00202     return true;
00203 }
00204 
00205 // Regression test - used to segfault with some input values (here: multi-byte UTF-8 input with maximum edit distance 3).
00206 DEFINE_TESTCASE(spell5, spelling) {
00207     const char * target = "\xe4\xb8\x80\xe4\xba\x9b";  // two 3-byte UTF-8 characters: U+4E00 U+4E9B
00208 
00209     Xapian::WritableDatabase db = get_writable_database();
00210     db.add_spelling(target);
00211     db.flush();
00212 
00213     string s = db.get_spelling_suggestion("\xe4\xb8\x8d", 3);  // query is one 3-byte character (U+4E0D)
00214     TEST_EQUAL(s, target);
00215 
00216     return true;
00217 }
00218 
00219 // Test that, of two stored spellings each one edit from the input, the one with the higher frequency is suggested.
00220 DEFINE_TESTCASE(spell6, spelling) {
00221     Xapian::WritableDatabase db = get_writable_database();
00222 
00223     // Check that the more frequent term is chosen.
00224     db.add_spelling("hello", 2);
00225     db.add_spelling("sell", 3);  // higher frequency than "hello", so "sell" is the expected suggestion
00226     TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");  // checked before flush()
00227     db.flush();
00228     Xapian::Database dbr(get_writable_database_as_database());  // NOTE(review): presumably the same database reopened read-only - confirm
00229     TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");  // same answer expected after flush() via the writable handle
00230     TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "sell");  // and via the second handle
00231 
00232     return true;
00233 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.