net/serialise.cc

Go to the documentation of this file.
00001 /* @file serialise.cc
00002  * @brief functions to convert Xapian objects to strings and back
00003  */
00004 /* Copyright (C) 2006,2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/document.h>
00024 #include <xapian/error.h>
00025 #include <xapian/positioniterator.h>
00026 #include <xapian/termiterator.h>
00027 #include <xapian/valueiterator.h>
00028 
00029 #include "omassert.h"
00030 #include "omenquireinternal.h"
00031 #include "serialise.h"
00032 #include "serialise-double.h"
00033 #include "stats.h"
00034 #include "utils.h"
00035 
00036 #include <string>
00037 #include <string.h>
00038 
00039 using namespace std;
00040 
/** Encode an unsigned length as a compact, variable-length byte string.
 *
 *  Values below 255 are stored in a single byte.  Anything larger is stored
 *  as a 0xff marker byte followed by (len - 255) split into 7-bit groups,
 *  least significant group first; the last group has its top bit set to
 *  mark the end of the encoding.
 *
 *  @param len  The value to encode.
 *  @return     The encoded bytes.
 */
std::string
encode_length(size_t len)
{
    // Short form: the value itself fits in one byte.
    if (len < 255) {
        return std::string(1, static_cast<char>(static_cast<unsigned char>(len)));
    }

    // Long form: marker byte, then the excess over 255 in 7-bit groups.
    std::string encoded(1, '\xff');
    size_t remaining = len - 255;
    for (;;) {
        const unsigned char low7 = static_cast<unsigned char>(remaining & 0x7f);
        remaining >>= 7;
        if (remaining == 0) {
            // Final group: flag it with the top bit and stop.
            encoded += static_cast<char>(low7 | 0x80);
            return encoded;
        }
        encoded += static_cast<char>(low7);
    }
}
00062 
00063 size_t
00064 decode_length(const char ** p, const char *end, bool check_remaining)
00065 {
00066     if (*p == end) {
00067         throw Xapian::NetworkError("Bad encoded length: no data");
00068     }
00069 
00070     size_t len = static_cast<unsigned char>(*(*p)++);
00071     if (len == 0xff) {
00072         len = 0;
00073         unsigned char ch;
00074         int shift = 0;
00075         do {
00076             if (*p == end || shift > 28)
00077                 throw Xapian::NetworkError("Bad encoded length: insufficient data");
00078             ch = *(*p)++;
00079             len |= size_t(ch & 0x7f) << shift;
00080             shift += 7;
00081         } while ((ch & 0x80) == 0);
00082         len += 255;
00083     }
00084     if (check_remaining && len > size_t(end - *p)) {
00085         throw Xapian::NetworkError("Bad encoded length: length greater than data");
00086     }
00087     return len;
00088 }
00089 
00090 string
00091 serialise_error(const Xapian::Error &e)
00092 {
00093     string result;
00094     result += encode_length(strlen(e.get_type()));
00095     result += e.get_type();
00096     result += encode_length(e.get_context().length());
00097     result += e.get_context();
00098     result += encode_length(e.get_msg().length());
00099     result += e.get_msg();
00100     // The "error string" goes last so we don't need to store its length.
00101     const char * err = e.get_error_string();
00102     if (err) result += err;
00103     return result;
00104 }
00105 
00106 void
00107 unserialise_error(const string &serialised_error, const string &prefix,
00108                   const string &new_context)
00109 {
00110     // Use c_str() so last string is nul-terminated.
00111     const char * p = serialised_error.c_str();
00112     const char * end = p + serialised_error.size();
00113     size_t len;
00114     len = decode_length(&p, end, true);
00115     if (len == 7 && memcmp(p, "UNKNOWN", 7) == 0) {
00116         throw Xapian::InternalError("UNKNOWN");
00117     }
00118     string type(p, len);
00119     p += len;
00120 
00121     len = decode_length(&p, end, true);
00122     string context(p, len);
00123     p += len;
00124 
00125     len = decode_length(&p, end, true);
00126     string msg(prefix);
00127     msg.append(p, len);
00128     p += len;
00129 
00130     const char * error_string = (p == end) ? NULL : p;
00131 
00132     if (!context.empty() && !new_context.empty()) {
00133         msg += "; context was: ";
00134         msg += context;
00135         context = new_context;
00136     }
00137 
00138 #include <xapian/errordispatch.h>
00139 
00140     msg = "Unknown remote exception type " + type + ": " + msg;
00141     throw Xapian::InternalError(msg, context);
00142 }
00143 
00144 string serialise_stats(const Stats &stats)
00145 {
00146     string result;
00147 
00148     result += encode_length(stats.collection_size);
00149     result += encode_length(stats.rset_size);
00150     result += serialise_double(stats.average_length);
00151 
00152     map<string, Xapian::doccount>::const_iterator i;
00153 
00154     result += encode_length(stats.termfreq.size());
00155     for (i = stats.termfreq.begin(); i != stats.termfreq.end(); ++i) {
00156         result += encode_length(i->first.size());
00157         result += i->first;
00158         result += encode_length(i->second);
00159     }
00160 
00161     for (i = stats.reltermfreq.begin(); i != stats.reltermfreq.end(); ++i) {
00162         result += encode_length(i->first.size());
00163         result += i->first;
00164         result += encode_length(i->second);
00165     }
00166 
00167     return result;
00168 }
00169 
00170 Stats
00171 unserialise_stats(const string &s)
00172 {
00173     const char * p = s.c_str();
00174     const char * p_end = p + s.size();
00175 
00176     Stats stat;
00177 
00178     stat.collection_size = decode_length(&p, p_end, false);
00179     stat.rset_size = decode_length(&p, p_end, false);
00180     stat.average_length = unserialise_double(&p, p_end);
00181 
00182     size_t n = decode_length(&p, p_end, false);
00183     while (n--) {
00184         size_t len = decode_length(&p, p_end, true);
00185         string term(p, len);
00186         p += len;
00187         stat.termfreq.insert(make_pair(term, decode_length(&p, p_end, false)));
00188     }
00189 
00190     while (p != p_end) {
00191         size_t len = decode_length(&p, p_end, true);
00192         string term(p, len);
00193         p += len;
00194         stat.reltermfreq.insert(make_pair(term, decode_length(&p, p_end, false)));
00195     }
00196 
00197     return stat;
00198 }
00199 
00200 string
00201 serialise_mset_pre_30_5(const Xapian::MSet &mset)
00202 {
00203     string result;
00204 
00205     result += encode_length(mset.get_firstitem());
00206     result += encode_length(mset.get_matches_lower_bound());
00207     result += encode_length(mset.get_matches_estimated());
00208     result += encode_length(mset.get_matches_upper_bound());
00209     result += serialise_double(mset.get_max_possible());
00210     result += serialise_double(mset.get_max_attained());
00211     result += encode_length(mset.size());
00212     for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
00213         result += serialise_double(i.get_weight());
00214         result += encode_length(*i);
00215         result += encode_length(i.get_collapse_key().size());
00216         result += i.get_collapse_key();
00217         result += encode_length(i.get_collapse_count());
00218     }
00219 
00220     const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts
00221         = mset.internal->termfreqandwts;
00222 
00223     map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j;
00224     for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) {
00225         result += encode_length(j->first.size());
00226         result += j->first;
00227         result += encode_length(j->second.termfreq);
00228         result += serialise_double(j->second.termweight);
00229     }
00230 
00231     return result;
00232 }
00233 
00234 string
00235 serialise_mset(const Xapian::MSet &mset)
00236 {
00237     string result;
00238 
00239     result += encode_length(mset.get_firstitem());
00240     result += encode_length(mset.get_matches_lower_bound());
00241     result += encode_length(mset.get_matches_estimated());
00242     result += encode_length(mset.get_matches_upper_bound());
00243     result += serialise_double(mset.get_max_possible());
00244     result += serialise_double(mset.get_max_attained());
00245 
00246     result += serialise_double(mset.internal->percent_factor);
00247 
00248     result += encode_length(mset.size());
00249     for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); ++i) {
00250         result += serialise_double(i.get_weight());
00251         result += encode_length(*i);
00252         result += encode_length(i.get_collapse_key().size());
00253         result += i.get_collapse_key();
00254         result += encode_length(i.get_collapse_count());
00255     }
00256 
00257     const map<string, Xapian::MSet::Internal::TermFreqAndWeight> &termfreqandwts
00258         = mset.internal->termfreqandwts;
00259 
00260     map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator j;
00261     for (j = termfreqandwts.begin(); j != termfreqandwts.end(); ++j) {
00262         result += encode_length(j->first.size());
00263         result += j->first;
00264         result += encode_length(j->second.termfreq);
00265         result += serialise_double(j->second.termweight);
00266     }
00267 
00268     return result;
00269 }
00270 
00271 Xapian::MSet
00272 unserialise_mset(const string &s)
00273 {
00274     const char * p = s.data();
00275     const char * p_end = p + s.size();
00276 
00277     Xapian::doccount firstitem = decode_length(&p, p_end, false);
00278     Xapian::doccount matches_lower_bound = decode_length(&p, p_end, false);
00279     Xapian::doccount matches_estimated = decode_length(&p, p_end, false);
00280     Xapian::doccount matches_upper_bound = decode_length(&p, p_end, false);
00281     Xapian::weight max_possible = unserialise_double(&p, p_end);
00282     Xapian::weight max_attained = unserialise_double(&p, p_end);
00283 
00284     double percent_factor = unserialise_double(&p, p_end);
00285 
00286     vector<Xapian::Internal::MSetItem> items;
00287     size_t msize = decode_length(&p, p_end, false);
00288     while (msize-- > 0) {
00289         Xapian::weight wt = unserialise_double(&p, p_end);
00290         Xapian::docid did = decode_length(&p, p_end, false);
00291         size_t len = decode_length(&p, p_end, true);
00292         string key(p, len);
00293         p += len;
00294         items.push_back(Xapian::Internal::MSetItem(wt, did, key,
00295                                                    decode_length(&p, p_end, false)));
00296     }
00297 
00298     map<string, Xapian::MSet::Internal::TermFreqAndWeight> terminfo;
00299     while (p != p_end) {
00300         Xapian::MSet::Internal::TermFreqAndWeight tfaw;
00301         size_t len = decode_length(&p, p_end, true);
00302         string term(p, len);
00303         p += len;
00304         tfaw.termfreq = decode_length(&p, p_end, false);
00305         tfaw.termweight = unserialise_double(&p, p_end);
00306         terminfo.insert(make_pair(term, tfaw));
00307     }
00308 
00309     return Xapian::MSet(new Xapian::MSet::Internal(
00310                                        firstitem,
00311                                        matches_upper_bound,
00312                                        matches_lower_bound,
00313                                        matches_estimated,
00314                                        max_possible, max_attained,
00315                                        items, terminfo, percent_factor));
00316 }
00317 
00318 string
00319 serialise_rset(const Xapian::RSet &rset)
00320 {
00321     const set<Xapian::docid> & items = rset.internal->get_items();
00322     string result;
00323     set<Xapian::docid>::const_iterator i;
00324     Xapian::docid lastdid = 0;
00325     for (i = items.begin(); i != items.end(); ++i) {
00326         Xapian::docid did = *i;
00327         result += encode_length(did - lastdid - 1);
00328         lastdid = did;
00329     }
00330     return result;
00331 }
00332 
00333 Xapian::RSet
00334 unserialise_rset(const string &s)
00335 {
00336     Xapian::RSet rset;
00337 
00338     const char * p = s.data();
00339     const char * p_end = p + s.size();
00340 
00341     Xapian::docid did = 0;
00342     while (p != p_end) {
00343         did += decode_length(&p, p_end, false) + 1;
00344         rset.add_document(did);
00345     }
00346 
00347     return rset;
00348 }
00349 
00350 string
00351 serialise_document(const Xapian::Document &doc)
00352 {
00353     string result;
00354 
00355     size_t n = doc.values_count();
00356     result += encode_length(doc.values_count());
00357     Xapian::ValueIterator value;
00358     for (value = doc.values_begin(); value != doc.values_end(); ++value) {
00359         result += encode_length(value.get_valueno());
00360         result += encode_length((*value).size());
00361         result += *value;
00362         --n;
00363     }
00364     Assert(n == 0);
00365 
00366     result += encode_length(doc.termlist_count());
00367     Xapian::TermIterator term;
00368     n = doc.termlist_count();
00369     for (term = doc.termlist_begin(); term != doc.termlist_end(); ++term) {
00370         result += encode_length((*term).size());
00371         result += *term;
00372         result += encode_length(term.get_wdf());
00373 
00374         result += encode_length(term.positionlist_count());
00375         Xapian::PositionIterator pos;
00376         Xapian::termpos oldpos = 0;
00377         size_t x = term.positionlist_count();
00378         for (pos = term.positionlist_begin(); pos != term.positionlist_end(); ++pos) {
00379             Xapian::termpos diff = *pos - oldpos;
00380             string delta = encode_length(diff);
00381             result += delta;
00382             oldpos = *pos;
00383             --x;
00384         }
00385         Assert(x == 0);
00386         --n;
00387     }
00388     Assert(n == 0);
00389 
00390     result += doc.get_data();
00391     return result;
00392 }
00393 
00394 Xapian::Document
00395 unserialise_document(const string &s)
00396 {
00397     Xapian::Document doc;
00398     const char * p = s.data();
00399     const char * p_end = p + s.size();
00400 
00401     size_t n_values = decode_length(&p, p_end, false);
00402     while (n_values--) {
00403         Xapian::valueno valno = decode_length(&p, p_end, false);
00404         size_t len = decode_length(&p, p_end, true);
00405         doc.add_value(valno, string(p, len));
00406         p += len;
00407     }
00408 
00409     size_t n_terms = decode_length(&p, p_end, false);
00410     while (n_terms--) {
00411         size_t len = decode_length(&p, p_end, true);
00412         string term(p, len);
00413         p += len;
00414 
00415         // Set all the wdf using add_term, then pass wdf_inc 0 to add_posting.
00416         Xapian::termcount wdf = decode_length(&p, p_end, false);
00417         doc.add_term(term, wdf);
00418 
00419         size_t n_pos = decode_length(&p, p_end, false);
00420         Xapian::termpos pos = 0;
00421         while (n_pos--) {
00422             pos += decode_length(&p, p_end, false);
00423             doc.add_posting(term, pos, 0);
00424         }
00425     }
00426 
00427     doc.set_data(string(p, p_end - p));
00428     return doc;
00429 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.