api/sortable-serialise.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/queryparser.h>
00024 
00025 #include <float.h>
00026 #include <math.h>
00027 
00028 #include <string>
00029 
00030 #include "omassert.h"
00031 
00032 using namespace std;
00033 
00034 #if FLT_RADIX != 2
00035 # error Code currently assumes FLT_RADIX == 2
00036 #endif
00037 
00038 #ifdef _MSC_VER
00039 // Disable warning about negating an unsigned type, which we do deliberately.
00040 # pragma warning(disable:4146)
00041 #endif
00042 
00043 string
00044 Xapian::sortable_serialise(double value)
00045 {
00046     double mantissa;
00047     int exponent;
00048 
00049     // Negative infinity.
00050     if (value < -DBL_MAX) return string();
00051 
00052     mantissa = frexp(value, &exponent);
00053 
00054     /* Deal with zero specially.
00055      *
00056      * IEEE representation of doubles uses 11 bits for the exponent, with a
00057      * bias of 1023.  We bias this by subtracting 8, and non-IEEE
00058      * representations may allow higher exponents, so allow exponents down to
00059      * -2039 - if smaller exponents are possible anywhere, we underflow such
00060      *  numbers to 0.
00061      */
00062     if (mantissa == 0.0 || exponent < -2039) return "\x80";
00063 
00064     bool negative = (mantissa < 0);
00065     if (negative) mantissa = -mantissa;
00066 
00067     // Infinity, or extremely large non-IEEE representation.
00068     if (value > DBL_MAX || exponent > 2055) {
00069         if (negative) {
00070             // This can only happen with a non-IEEE representation, because
00071             // we've already tested for value < -DBL_MAX
00072             return string();
00073         } else {
00074             return string(9, '\xff');
00075         }
00076     }
00077 
00078     // Encoding:
00079     //
00080     // [ 7 | 6 | 5 | 4 3 2 1 0]
00081     //   Sm  Se  Le
00082     //
00083     // Sm stores the sign of the mantissa: 1 = positive or zero, 0 = negative.
00084     // Se stores the sign of the exponent: Sm for positive/zero, !Sm for neg.
00085     // Le stores the length of the exponent: !Se for 3 bits, Se for 11 bits.
00086     unsigned char next = (negative ? 0 : 0xe0);
00087 
00088     // Bias the exponent by 8 so that more small integers get short encodings.
00089     exponent -= 8;
00090     bool exponent_negative = (exponent < 0);
00091     if (exponent_negative) {
00092         exponent = -exponent;
00093         next ^= 0x60;
00094     }
00095 
00096     string result;
00097 
00098     /* We store the exponent in 3 or 11 bits.  If the number is negative, we
00099      * flip all the bits of the exponent, since larger negative numbers should
00100      * sort first.
00101      *
00102      * If the exponent is negative, we flip the bits of the exponent, since
00103      * larger negative exponents should sort first (unless the number is
00104      * negative, in which case they should sort later).
00105      */
00106     Assert(exponent >= 0);
00107     if (exponent < 8) {
00108         next ^= 0x20;
00109         next |= (exponent << 2);
00110         if (negative ^ exponent_negative) next ^= 0x1c;
00111     } else {
00112         Assert((exponent >> 11) == 0);
00113         // Put the top 5 bits of the exponent into the lower 5 bits of the
00114         // first byte:
00115         next |= char(exponent >> 6);
00116         if (negative ^ exponent_negative) next ^= 0x1f;
00117         result += next;
00118         // And the lower 6 bits of the exponent go into the upper 6 bits
00119         // of the second byte:
00120         next = char(exponent) << 2;
00121         if (negative ^ exponent_negative) next ^= 0xfc;
00122     }
00123 
00124     // Convert the 52 (or 53) bits of the mantissa into two 32-bit words.
00125     mantissa *= 1 << (negative ? 26 : 27);
00126     unsigned word1 = (unsigned)mantissa;
00127     mantissa -= word1;
00128     unsigned word2 = (unsigned)(mantissa * 4294967296.0); // 1<<32
00129     // If the number is positive, the first bit will always be set because 0.5
00130     // <= mantissa < 1, unless mantissa is zero, which we handle specially
00131     // above).  If the number is negative, we negate the mantissa instead of
00132     // flipping all the bits, so in the case of 0.5, the first bit isn't set
00133     // so we need to store it explicitly.  But for the cost of one extra
00134     // leading bit, we can save several trailing 0xff bytes in lots of common
00135     // cases.
00136     Assert(negative || (word1 & (1<<26)));
00137     if (negative) {
00138         // We negate the mantissa for negative numbers, so that the sort order
00139         // is reversed (since larger negative numbers should come first).
00140         word1 = -word1;
00141         if (word2 != 0) ++word1;
00142         word2 = -word2;
00143     }
00144 
00145     word1 &= 0x03ffffff;
00146     next |= (word1 >> 24);
00147     result += next;
00148     result.push_back(word1 >> 16);
00149     result.push_back(word1 >> 8);
00150     result.push_back(word1);
00151 
00152     result.push_back(word2 >> 24);
00153     result.push_back(word2 >> 16);
00154     result.push_back(word2 >> 8);
00155     result.push_back(word2);
00156 
00157     // Finally, we can chop off any trailing zero bytes.
00158     size_t len = result.size();
00159     while (len > 0 && result[len - 1] == '\0') {
00160         --len;
00161     }
00162     result.resize(len);
00163 
00164     return result;
00165 }
00166 
00169 static inline unsigned char
00170 numfromstr(const std::string & str, std::string::size_type pos)
00171 {
00172     return (pos < str.size()) ? static_cast<unsigned char>(str[pos]) : 0;
00173 }
00174 
00175 double
00176 Xapian::sortable_unserialise(const std::string & value)
00177 {
00178     // Zero.
00179     if (value == "\x80") return 0.0;
00180 
00181     // Positive infinity.
00182     if (value == string(9, '\xff')) {
00183 #ifdef INFINITY
00184         // INFINITY is C99.  Oddly, it's of type "float" so sanity check in
00185         // case it doesn't cast to double as infinity (apparently some
00186         // implementations have this problem).
00187         if (double(INFINITY) > HUGE_VAL) return INFINITY;
00188 #endif
00189         return HUGE_VAL;
00190     }
00191 
00192     // Negative infinity.
00193     if (value.empty()) {
00194 #ifdef INFINITY
00195         if (double(INFINITY) > HUGE_VAL) return -INFINITY;
00196 #endif
00197         return -HUGE_VAL;
00198     }
00199 
00200     unsigned char first = numfromstr(value, 0);
00201     size_t i = 0;
00202 
00203     first ^= (first & 0xc0) >> 1;
00204     bool negative = !(first & 0x80);
00205     bool exponent_negative = (first & 0x40);
00206     bool explen = !(first & 0x20);
00207     int exponent = first & 0x1f;
00208     if (!explen) {
00209         exponent >>= 2;
00210         if (negative ^ exponent_negative) exponent ^= 0x07;
00211     } else {
00212         first = numfromstr(value, ++i);
00213         exponent <<= 6;
00214         exponent |= (first >> 2);
00215         if (negative ^ exponent_negative) exponent ^= 0x07ff;
00216     }
00217 
00218     unsigned word1;
00219     word1 = (unsigned(first & 0x03) << 24);
00220     word1 |= numfromstr(value, ++i) << 16;
00221     word1 |= numfromstr(value, ++i) << 8;
00222     word1 |= numfromstr(value, ++i);
00223 
00224     unsigned word2 = 0;
00225     if (i < value.size()) {
00226         word2 = numfromstr(value, ++i) << 24;
00227         word2 |= numfromstr(value, ++i) << 16;
00228         word2 |= numfromstr(value, ++i) << 8;
00229         word2 |= numfromstr(value, ++i);
00230     }
00231 
00232     if (negative) {
00233         word1 = -word1;
00234         if (word2 != 0) ++word1;
00235         word2 = -word2;
00236         Assert((word1 & 0xf0000000) != 0);
00237         word1 &= 0x03ffffff;
00238     }
00239     if (!negative) word1 |= 1<<26;
00240 
00241     double mantissa = 0;
00242     if (word2) mantissa = word2 / 4294967296.0; // 1<<32
00243     mantissa += word1;
00244     mantissa /= 1 << (negative ? 26 : 27);
00245 
00246     if (exponent_negative) exponent = -exponent;
00247     exponent += 8;
00248 
00249     if (negative) mantissa = -mantissa;
00250 
00251     return ldexp(mantissa, exponent);
00252 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.