Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
softfloat.c
Go to the documentation of this file.
1 /*
2 ===============================================================================
3 
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
6 
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page
15 http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt
16 
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22 
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
27 
28 ===============================================================================
29 */
30 
31 #include <asm/div64.h>
32 
33 #include "fpa11.h"
34 //#include "milieu.h"
35 //#include "softfloat.h"
36 
37 /*
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
41 desired.)
42 -------------------------------------------------------------------------------
43 */
44 #include "softfloat-macros"
45 
46 /*
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
53 specific.
54 -------------------------------------------------------------------------------
55 */
56 #include "softfloat-specialize"
57 
58 /*
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
69 */
70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
71 {
72  int8 roundingMode;
73  flag roundNearestEven;
74  int8 roundIncrement, roundBits;
75  int32 z;
76 
77  roundingMode = roundData->mode;
78  roundNearestEven = ( roundingMode == float_round_nearest_even );
79  roundIncrement = 0x40;
80  if ( ! roundNearestEven ) {
81  if ( roundingMode == float_round_to_zero ) {
82  roundIncrement = 0;
83  }
84  else {
85  roundIncrement = 0x7F;
86  if ( zSign ) {
87  if ( roundingMode == float_round_up ) roundIncrement = 0;
88  }
89  else {
90  if ( roundingMode == float_round_down ) roundIncrement = 0;
91  }
92  }
93  }
94  roundBits = absZ & 0x7F;
95  absZ = ( absZ + roundIncrement )>>7;
96  absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
97  z = absZ;
98  if ( zSign ) z = - z;
99  if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100  roundData->exception |= float_flag_invalid;
101  return zSign ? 0x80000000 : 0x7FFFFFFF;
102  }
103  if ( roundBits ) roundData->exception |= float_flag_inexact;
104  return z;
105 
106 }
107 
108 /*
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
112 */
114 {
115 
116  return a & 0x007FFFFF;
117 
118 }
119 
120 /*
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
124 */
126 {
127 
128  return ( a>>23 ) & 0xFF;
129 
130 }
131 
132 /*
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
136 */
137 #if 0 /* in softfloat.h */
139 {
140 
141  return a>>31;
142 
143 }
144 #endif
145 
146 /*
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'. The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
153 */
154 static void
155  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
156 {
157  int8 shiftCount;
158 
159  shiftCount = countLeadingZeros32( aSig ) - 8;
160  *zSigPtr = aSig<<shiftCount;
161  *zExpPtr = 1 - shiftCount;
162 
163 }
164 
165 /*
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result. After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result. This means that any integer portion of `zSig'
171 will be added into the exponent. Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
174 significand.
175 -------------------------------------------------------------------------------
176 */
178 {
179 #if 0
180  float32 f;
181  __asm__("@ packFloat32 \n\
182  mov %0, %1, asl #31 \n\
183  orr %0, %2, asl #23 \n\
184  orr %0, %3"
185  : /* no outputs */
186  : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
187  : "cc");
188  return f;
189 #else
190  return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
191 #endif
192 }
193 
194 /*
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly. If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned. If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207  The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location. This shifted
209 significand must be normalized or smaller. If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding. In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
216 */
217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
218 {
219  int8 roundingMode;
220  flag roundNearestEven;
221  int8 roundIncrement, roundBits;
222  flag isTiny;
223 
224  roundingMode = roundData->mode;
225  roundNearestEven = ( roundingMode == float_round_nearest_even );
226  roundIncrement = 0x40;
227  if ( ! roundNearestEven ) {
228  if ( roundingMode == float_round_to_zero ) {
229  roundIncrement = 0;
230  }
231  else {
232  roundIncrement = 0x7F;
233  if ( zSign ) {
234  if ( roundingMode == float_round_up ) roundIncrement = 0;
235  }
236  else {
237  if ( roundingMode == float_round_down ) roundIncrement = 0;
238  }
239  }
240  }
241  roundBits = zSig & 0x7F;
242  if ( 0xFD <= (bits16) zExp ) {
243  if ( ( 0xFD < zExp )
244  || ( ( zExp == 0xFD )
245  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
246  ) {
248  return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
249  }
250  if ( zExp < 0 ) {
251  isTiny =
253  || ( zExp < -1 )
254  || ( zSig + roundIncrement < 0x80000000 );
255  shift32RightJamming( zSig, - zExp, &zSig );
256  zExp = 0;
257  roundBits = zSig & 0x7F;
258  if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
259  }
260  }
261  if ( roundBits ) roundData->exception |= float_flag_inexact;
262  zSig = ( zSig + roundIncrement )>>7;
263  zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
264  if ( zSig == 0 ) zExp = 0;
265  return packFloat32( zSign, zExp, zSig );
266 
267 }
268 
269 /*
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
276 point exponent.
277 -------------------------------------------------------------------------------
278 */
279 static float32
280  normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
281 {
282  int8 shiftCount;
283 
284  shiftCount = countLeadingZeros32( zSig ) - 1;
285  return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
286 
287 }
288 
289 /*
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
293 */
295 {
296 
297  return a & LIT64( 0x000FFFFFFFFFFFFF );
298 
299 }
300 
301 /*
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
305 */
307 {
308 
309  return ( a>>52 ) & 0x7FF;
310 
311 }
312 
313 /*
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
317 */
318 #if 0 /* in softfloat.h */
320 {
321 
322  return a>>63;
323 
324 }
325 #endif
326 
327 /*
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'. The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
334 */
335 static void
336  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
337 {
338  int8 shiftCount;
339 
340  shiftCount = countLeadingZeros64( aSig ) - 11;
341  *zSigPtr = aSig<<shiftCount;
342  *zExpPtr = 1 - shiftCount;
343 
344 }
345 
346 /*
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result. After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result. This means that any integer portion of `zSig'
352 will be added into the exponent. Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
355 significand.
356 -------------------------------------------------------------------------------
357 */
359 {
360 
361  return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
362 
363 }
364 
365 /*
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly. If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned. If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378  The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location. This shifted
380 significand must be normalized or smaller. If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding. In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
387 */
388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
389 {
390  int8 roundingMode;
391  flag roundNearestEven;
392  int16 roundIncrement, roundBits;
393  flag isTiny;
394 
395  roundingMode = roundData->mode;
396  roundNearestEven = ( roundingMode == float_round_nearest_even );
397  roundIncrement = 0x200;
398  if ( ! roundNearestEven ) {
399  if ( roundingMode == float_round_to_zero ) {
400  roundIncrement = 0;
401  }
402  else {
403  roundIncrement = 0x3FF;
404  if ( zSign ) {
405  if ( roundingMode == float_round_up ) roundIncrement = 0;
406  }
407  else {
408  if ( roundingMode == float_round_down ) roundIncrement = 0;
409  }
410  }
411  }
412  roundBits = zSig & 0x3FF;
413  if ( 0x7FD <= (bits16) zExp ) {
414  if ( ( 0x7FD < zExp )
415  || ( ( zExp == 0x7FD )
416  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
417  ) {
418  //register int lr = __builtin_return_address(0);
419  //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
421  return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
422  }
423  if ( zExp < 0 ) {
424  isTiny =
426  || ( zExp < -1 )
427  || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
428  shift64RightJamming( zSig, - zExp, &zSig );
429  zExp = 0;
430  roundBits = zSig & 0x3FF;
431  if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
432  }
433  }
434  if ( roundBits ) roundData->exception |= float_flag_inexact;
435  zSig = ( zSig + roundIncrement )>>10;
436  zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
437  if ( zSig == 0 ) zExp = 0;
438  return packFloat64( zSign, zExp, zSig );
439 
440 }
441 
442 /*
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
449 point exponent.
450 -------------------------------------------------------------------------------
451 */
452 static float64
453  normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
454 {
455  int8 shiftCount;
456 
457  shiftCount = countLeadingZeros64( zSig ) - 1;
458  return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
459 
460 }
461 
462 #ifdef FLOATX80
463 
464 /*
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
467 value `a'.
468 -------------------------------------------------------------------------------
469 */
470 INLINE bits64 extractFloatx80Frac( floatx80 a )
471 {
472 
473  return a.low;
474 
475 }
476 
477 /*
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
480 value `a'.
481 -------------------------------------------------------------------------------
482 */
483 INLINE int32 extractFloatx80Exp( floatx80 a )
484 {
485 
486  return a.high & 0x7FFF;
487 
488 }
489 
490 /*
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
493 `a'.
494 -------------------------------------------------------------------------------
495 */
496 INLINE flag extractFloatx80Sign( floatx80 a )
497 {
498 
499  return a.high>>15;
500 
501 }
502 
503 /*
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'. The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
510 */
511 static void
512  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
513 {
514  int8 shiftCount;
515 
516  shiftCount = countLeadingZeros64( aSig );
517  *zSigPtr = aSig<<shiftCount;
518  *zExpPtr = 1 - shiftCount;
519 
520 }
521 
522 /*
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
527 */
528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
529 {
530  floatx80 z;
531 
532  z.low = zSig;
533  z.high = ( ( (bits16) zSign )<<15 ) + zExp;
534  z.__padding = 0;
535  return z;
536 
537 }
538 
539 /*
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 and extended significand formed by the concatenation of `zSig0' and `zSig1',
543 and returns the proper extended double-precision floating-point value
544 corresponding to the abstract input. Ordinarily, the abstract value is
545 rounded and packed into the extended double-precision format, with the
546 inexact exception raised if the abstract input cannot be represented
547 exactly. If the abstract value is too large, however, the overflow and
548 inexact exceptions are raised and an infinity or maximal finite value is
549 returned. If the abstract value is too small, the input value is rounded to
550 a subnormal number, and the underflow and inexact exceptions are raised if
551 the abstract input cannot be represented exactly as a subnormal extended
552 double-precision floating-point number.
553  If `roundingPrecision' is 32 or 64, the result is rounded to the same
554 number of bits as single or double precision, respectively. Otherwise, the
555 result is rounded to the full precision of the extended double-precision
556 format.
557  The input significand must be normalized or smaller. If the input
558 significand is not normalized, `zExp' must be 0; in that case, the result
559 returned is a subnormal number, and it must not require rounding. The
560 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
561 Floating-point Arithmetic.
562 -------------------------------------------------------------------------------
563 */
564 static floatx80
565  roundAndPackFloatx80(
566  struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
567  )
568 {
569  int8 roundingMode, roundingPrecision;
570  flag roundNearestEven, increment, isTiny;
571  int64 roundIncrement, roundMask, roundBits;
572 
573  roundingMode = roundData->mode;
574  roundingPrecision = roundData->precision;
575  roundNearestEven = ( roundingMode == float_round_nearest_even );
576  if ( roundingPrecision == 80 ) goto precision80;
577  if ( roundingPrecision == 64 ) {
578  roundIncrement = LIT64( 0x0000000000000400 );
579  roundMask = LIT64( 0x00000000000007FF );
580  }
581  else if ( roundingPrecision == 32 ) {
582  roundIncrement = LIT64( 0x0000008000000000 );
583  roundMask = LIT64( 0x000000FFFFFFFFFF );
584  }
585  else {
586  goto precision80;
587  }
588  zSig0 |= ( zSig1 != 0 );
589  if ( ! roundNearestEven ) {
590  if ( roundingMode == float_round_to_zero ) {
591  roundIncrement = 0;
592  }
593  else {
594  roundIncrement = roundMask;
595  if ( zSign ) {
596  if ( roundingMode == float_round_up ) roundIncrement = 0;
597  }
598  else {
599  if ( roundingMode == float_round_down ) roundIncrement = 0;
600  }
601  }
602  }
603  roundBits = zSig0 & roundMask;
604  if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
605  if ( ( 0x7FFE < zExp )
606  || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
607  ) {
608  goto overflow;
609  }
610  if ( zExp <= 0 ) {
611  isTiny =
613  || ( zExp < 0 )
614  || ( zSig0 <= zSig0 + roundIncrement );
615  shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
616  zExp = 0;
617  roundBits = zSig0 & roundMask;
618  if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
619  if ( roundBits ) roundData->exception |= float_flag_inexact;
620  zSig0 += roundIncrement;
621  if ( (sbits64) zSig0 < 0 ) zExp = 1;
622  roundIncrement = roundMask + 1;
623  if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
624  roundMask |= roundIncrement;
625  }
626  zSig0 &= ~ roundMask;
627  return packFloatx80( zSign, zExp, zSig0 );
628  }
629  }
630  if ( roundBits ) roundData->exception |= float_flag_inexact;
631  zSig0 += roundIncrement;
632  if ( zSig0 < roundIncrement ) {
633  ++zExp;
634  zSig0 = LIT64( 0x8000000000000000 );
635  }
636  roundIncrement = roundMask + 1;
637  if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
638  roundMask |= roundIncrement;
639  }
640  zSig0 &= ~ roundMask;
641  if ( zSig0 == 0 ) zExp = 0;
642  return packFloatx80( zSign, zExp, zSig0 );
643  precision80:
644  increment = ( (sbits64) zSig1 < 0 );
645  if ( ! roundNearestEven ) {
646  if ( roundingMode == float_round_to_zero ) {
647  increment = 0;
648  }
649  else {
650  if ( zSign ) {
651  increment = ( roundingMode == float_round_down ) && zSig1;
652  }
653  else {
654  increment = ( roundingMode == float_round_up ) && zSig1;
655  }
656  }
657  }
658  if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
659  if ( ( 0x7FFE < zExp )
660  || ( ( zExp == 0x7FFE )
661  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
662  && increment
663  )
664  ) {
665  roundMask = 0;
666  overflow:
668  if ( ( roundingMode == float_round_to_zero )
669  || ( zSign && ( roundingMode == float_round_up ) )
670  || ( ! zSign && ( roundingMode == float_round_down ) )
671  ) {
672  return packFloatx80( zSign, 0x7FFE, ~ roundMask );
673  }
674  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
675  }
676  if ( zExp <= 0 ) {
677  isTiny =
679  || ( zExp < 0 )
680  || ! increment
681  || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
682  shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
683  zExp = 0;
684  if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
685  if ( zSig1 ) roundData->exception |= float_flag_inexact;
686  if ( roundNearestEven ) {
687  increment = ( (sbits64) zSig1 < 0 );
688  }
689  else {
690  if ( zSign ) {
691  increment = ( roundingMode == float_round_down ) && zSig1;
692  }
693  else {
694  increment = ( roundingMode == float_round_up ) && zSig1;
695  }
696  }
697  if ( increment ) {
698  ++zSig0;
699  zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
700  if ( (sbits64) zSig0 < 0 ) zExp = 1;
701  }
702  return packFloatx80( zSign, zExp, zSig0 );
703  }
704  }
705  if ( zSig1 ) roundData->exception |= float_flag_inexact;
706  if ( increment ) {
707  ++zSig0;
708  if ( zSig0 == 0 ) {
709  ++zExp;
710  zSig0 = LIT64( 0x8000000000000000 );
711  }
712  else {
713  zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
714  }
715  }
716  else {
717  if ( zSig0 == 0 ) zExp = 0;
718  }
719 
720  return packFloatx80( zSign, zExp, zSig0 );
721 }
722 
723 /*
724 -------------------------------------------------------------------------------
725 Takes an abstract floating-point value having sign `zSign', exponent
726 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
727 and returns the proper extended double-precision floating-point value
728 corresponding to the abstract input. This routine is just like
729 `roundAndPackFloatx80' except that the input significand does not have to be
730 normalized.
731 -------------------------------------------------------------------------------
732 */
733 static floatx80
734  normalizeRoundAndPackFloatx80(
735  struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
736  )
737 {
738  int8 shiftCount;
739 
740  if ( zSig0 == 0 ) {
741  zSig0 = zSig1;
742  zSig1 = 0;
743  zExp -= 64;
744  }
745  shiftCount = countLeadingZeros64( zSig0 );
746  shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
747  zExp -= shiftCount;
748  return
749  roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
750 
751 }
752 
753 #endif
754 
755 /*
756 -------------------------------------------------------------------------------
757 Returns the result of converting the 32-bit two's complement integer `a' to
758 the single-precision floating-point format. The conversion is performed
759 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
760 -------------------------------------------------------------------------------
761 */
763 {
764  flag zSign;
765 
766  if ( a == 0 ) return 0;
767  if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
768  zSign = ( a < 0 );
769  return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
770 
771 }
772 
773 /*
774 -------------------------------------------------------------------------------
775 Returns the result of converting the 32-bit two's complement integer `a' to
776 the double-precision floating-point format. The conversion is performed
777 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
778 -------------------------------------------------------------------------------
779 */
781 {
782  flag aSign;
783  uint32 absA;
784  int8 shiftCount;
785  bits64 zSig;
786 
787  if ( a == 0 ) return 0;
788  aSign = ( a < 0 );
789  absA = aSign ? - a : a;
790  shiftCount = countLeadingZeros32( absA ) + 21;
791  zSig = absA;
792  return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
793 
794 }
795 
796 #ifdef FLOATX80
797 
798 /*
799 -------------------------------------------------------------------------------
800 Returns the result of converting the 32-bit two's complement integer `a'
801 to the extended double-precision floating-point format. The conversion
802 is performed according to the IEC/IEEE Standard for Binary Floating-point
803 Arithmetic.
804 -------------------------------------------------------------------------------
805 */
806 floatx80 int32_to_floatx80( int32 a )
807 {
808  flag zSign;
809  uint32 absA;
810  int8 shiftCount;
811  bits64 zSig;
812 
813  if ( a == 0 ) return packFloatx80( 0, 0, 0 );
814  zSign = ( a < 0 );
815  absA = zSign ? - a : a;
816  shiftCount = countLeadingZeros32( absA ) + 32;
817  zSig = absA;
818  return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
819 
820 }
821 
822 #endif
823 
824 /*
825 -------------------------------------------------------------------------------
826 Returns the result of converting the single-precision floating-point value
827 `a' to the 32-bit two's complement integer format. The conversion is
828 performed according to the IEC/IEEE Standard for Binary Floating-point
829 Arithmetic---which means in particular that the conversion is rounded
830 according to the current rounding mode. If `a' is a NaN, the largest
831 positive integer is returned. Otherwise, if the conversion overflows, the
832 largest integer with the same sign as `a' is returned.
833 -------------------------------------------------------------------------------
834 */
836 {
837  flag aSign;
838  int16 aExp, shiftCount;
839  bits32 aSig;
840  bits64 zSig;
841 
842  aSig = extractFloat32Frac( a );
843  aExp = extractFloat32Exp( a );
844  aSign = extractFloat32Sign( a );
845  if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
846  if ( aExp ) aSig |= 0x00800000;
847  shiftCount = 0xAF - aExp;
848  zSig = aSig;
849  zSig <<= 32;
850  if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
851  return roundAndPackInt32( roundData, aSign, zSig );
852 
853 }
854 
855 /*
856 -------------------------------------------------------------------------------
857 Returns the result of converting the single-precision floating-point value
858 `a' to the 32-bit two's complement integer format. The conversion is
859 performed according to the IEC/IEEE Standard for Binary Floating-point
860 Arithmetic, except that the conversion is always rounded toward zero. If
861 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
862 conversion overflows, the largest integer with the same sign as `a' is
863 returned.
864 -------------------------------------------------------------------------------
865 */
867 {
868  flag aSign;
869  int16 aExp, shiftCount;
870  bits32 aSig;
871  int32 z;
872 
873  aSig = extractFloat32Frac( a );
874  aExp = extractFloat32Exp( a );
875  aSign = extractFloat32Sign( a );
876  shiftCount = aExp - 0x9E;
877  if ( 0 <= shiftCount ) {
878  if ( a == 0xCF000000 ) return 0x80000000;
880  if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
881  return 0x80000000;
882  }
883  else if ( aExp <= 0x7E ) {
884  if ( aExp | aSig ) float_raise( float_flag_inexact );
885  return 0;
886  }
887  aSig = ( aSig | 0x00800000 )<<8;
888  z = aSig>>( - shiftCount );
889  if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
891  }
892  return aSign ? - z : z;
893 
894 }
895 
896 /*
897 -------------------------------------------------------------------------------
898 Returns the result of converting the single-precision floating-point value
899 `a' to the double-precision floating-point format. The conversion is
900 performed according to the IEC/IEEE Standard for Binary Floating-point
901 Arithmetic.
902 -------------------------------------------------------------------------------
903 */
905 {
906  flag aSign;
907  int16 aExp;
908  bits32 aSig;
909 
910  aSig = extractFloat32Frac( a );
911  aExp = extractFloat32Exp( a );
912  aSign = extractFloat32Sign( a );
913  if ( aExp == 0xFF ) {
914  if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
915  return packFloat64( aSign, 0x7FF, 0 );
916  }
917  if ( aExp == 0 ) {
918  if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
919  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
920  --aExp;
921  }
922  return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
923 
924 }
925 
926 #ifdef FLOATX80
927 
928 /*
929 -------------------------------------------------------------------------------
930 Returns the result of converting the single-precision floating-point value
931 `a' to the extended double-precision floating-point format. The conversion
932 is performed according to the IEC/IEEE Standard for Binary Floating-point
933 Arithmetic.
934 -------------------------------------------------------------------------------
935 */
936 floatx80 float32_to_floatx80( float32 a )
937 {
938  flag aSign;
939  int16 aExp;
940  bits32 aSig;
941 
942  aSig = extractFloat32Frac( a );
943  aExp = extractFloat32Exp( a );
944  aSign = extractFloat32Sign( a );
945  if ( aExp == 0xFF ) {
946  if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
947  return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
948  }
949  if ( aExp == 0 ) {
950  if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
951  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
952  }
953  aSig |= 0x00800000;
954  return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
955 
956 }
957 
958 #endif
959 
960 /*
961 -------------------------------------------------------------------------------
962 Rounds the single-precision floating-point value `a' to an integer, and
963 returns the result as a single-precision floating-point value. The
964 operation is performed according to the IEC/IEEE Standard for Binary
965 Floating-point Arithmetic.
966 -------------------------------------------------------------------------------
967 */
969 {
970  flag aSign;
971  int16 aExp;
972  bits32 lastBitMask, roundBitsMask;
973  int8 roundingMode;
974  float32 z;
975 
976  aExp = extractFloat32Exp( a );
977  if ( 0x96 <= aExp ) {
978  if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
979  return propagateFloat32NaN( a, a );
980  }
981  return a;
982  }
983  roundingMode = roundData->mode;
984  if ( aExp <= 0x7E ) {
985  if ( (bits32) ( a<<1 ) == 0 ) return a;
986  roundData->exception |= float_flag_inexact;
987  aSign = extractFloat32Sign( a );
988  switch ( roundingMode ) {
990  if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
991  return packFloat32( aSign, 0x7F, 0 );
992  }
993  break;
994  case float_round_down:
995  return aSign ? 0xBF800000 : 0;
996  case float_round_up:
997  return aSign ? 0x80000000 : 0x3F800000;
998  }
999  return packFloat32( aSign, 0, 0 );
1000  }
1001  lastBitMask = 1;
1002  lastBitMask <<= 0x96 - aExp;
1003  roundBitsMask = lastBitMask - 1;
1004  z = a;
1005  if ( roundingMode == float_round_nearest_even ) {
1006  z += lastBitMask>>1;
1007  if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1008  }
1009  else if ( roundingMode != float_round_to_zero ) {
1010  if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1011  z += roundBitsMask;
1012  }
1013  }
1014  z &= ~ roundBitsMask;
1015  if ( z != a ) roundData->exception |= float_flag_inexact;
1016  return z;
1017 
1018 }
1019 
1020 /*
1021 -------------------------------------------------------------------------------
1022 Returns the result of adding the absolute values of the single-precision
1023 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1024 before being returned. `zSign' is ignored if the result is a NaN. The
1025 addition is performed according to the IEC/IEEE Standard for Binary
1026 Floating-point Arithmetic.
1027 -------------------------------------------------------------------------------
1028 */
1029 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1030 {
1031  int16 aExp, bExp, zExp;
1032  bits32 aSig, bSig, zSig;
1033  int16 expDiff;
1034 
1035  aSig = extractFloat32Frac( a );
1036  aExp = extractFloat32Exp( a );
1037  bSig = extractFloat32Frac( b );
1038  bExp = extractFloat32Exp( b );
1039  expDiff = aExp - bExp;
1040  aSig <<= 6;
1041  bSig <<= 6;
1042  if ( 0 < expDiff ) {
1043  if ( aExp == 0xFF ) {
1044  if ( aSig ) return propagateFloat32NaN( a, b );
1045  return a;
1046  }
1047  if ( bExp == 0 ) {
1048  --expDiff;
1049  }
1050  else {
1051  bSig |= 0x20000000;
1052  }
1053  shift32RightJamming( bSig, expDiff, &bSig );
1054  zExp = aExp;
1055  }
1056  else if ( expDiff < 0 ) {
1057  if ( bExp == 0xFF ) {
1058  if ( bSig ) return propagateFloat32NaN( a, b );
1059  return packFloat32( zSign, 0xFF, 0 );
1060  }
1061  if ( aExp == 0 ) {
1062  ++expDiff;
1063  }
1064  else {
1065  aSig |= 0x20000000;
1066  }
1067  shift32RightJamming( aSig, - expDiff, &aSig );
1068  zExp = bExp;
1069  }
1070  else {
1071  if ( aExp == 0xFF ) {
1072  if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1073  return a;
1074  }
1075  if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1076  zSig = 0x40000000 + aSig + bSig;
1077  zExp = aExp;
1078  goto roundAndPack;
1079  }
1080  aSig |= 0x20000000;
1081  zSig = ( aSig + bSig )<<1;
1082  --zExp;
1083  if ( (sbits32) zSig < 0 ) {
1084  zSig = aSig + bSig;
1085  ++zExp;
1086  }
1087  roundAndPack:
1088  return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1089 
1090 }
1091 
1092 /*
1093 -------------------------------------------------------------------------------
1094 Returns the result of subtracting the absolute values of the single-
1095 precision floating-point values `a' and `b'. If `zSign' is true, the
1096 difference is negated before being returned. `zSign' is ignored if the
1097 result is a NaN. The subtraction is performed according to the IEC/IEEE
1098 Standard for Binary Floating-point Arithmetic.
1099 -------------------------------------------------------------------------------
1100 */
1101 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1102 {
1103  int16 aExp, bExp, zExp;
1104  bits32 aSig, bSig, zSig;
1105  int16 expDiff;
1106 
1107  aSig = extractFloat32Frac( a );
1108  aExp = extractFloat32Exp( a );
1109  bSig = extractFloat32Frac( b );
1110  bExp = extractFloat32Exp( b );
1111  expDiff = aExp - bExp;
1112  aSig <<= 7;
1113  bSig <<= 7;
1114  if ( 0 < expDiff ) goto aExpBigger;
1115  if ( expDiff < 0 ) goto bExpBigger;
1116  if ( aExp == 0xFF ) {
1117  if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1118  roundData->exception |= float_flag_invalid;
1119  return float32_default_nan;
1120  }
1121  if ( aExp == 0 ) {
1122  aExp = 1;
1123  bExp = 1;
1124  }
1125  if ( bSig < aSig ) goto aBigger;
1126  if ( aSig < bSig ) goto bBigger;
1127  return packFloat32( roundData->mode == float_round_down, 0, 0 );
1128  bExpBigger:
1129  if ( bExp == 0xFF ) {
1130  if ( bSig ) return propagateFloat32NaN( a, b );
1131  return packFloat32( zSign ^ 1, 0xFF, 0 );
1132  }
1133  if ( aExp == 0 ) {
1134  ++expDiff;
1135  }
1136  else {
1137  aSig |= 0x40000000;
1138  }
1139  shift32RightJamming( aSig, - expDiff, &aSig );
1140  bSig |= 0x40000000;
1141  bBigger:
1142  zSig = bSig - aSig;
1143  zExp = bExp;
1144  zSign ^= 1;
1145  goto normalizeRoundAndPack;
1146  aExpBigger:
1147  if ( aExp == 0xFF ) {
1148  if ( aSig ) return propagateFloat32NaN( a, b );
1149  return a;
1150  }
1151  if ( bExp == 0 ) {
1152  --expDiff;
1153  }
1154  else {
1155  bSig |= 0x40000000;
1156  }
1157  shift32RightJamming( bSig, expDiff, &bSig );
1158  aSig |= 0x40000000;
1159  aBigger:
1160  zSig = aSig - bSig;
1161  zExp = aExp;
1162  normalizeRoundAndPack:
1163  --zExp;
1164  return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1165 
1166 }
1167 
1168 /*
1169 -------------------------------------------------------------------------------
1170 Returns the result of adding the single-precision floating-point values `a'
1171 and `b'. The operation is performed according to the IEC/IEEE Standard for
1172 Binary Floating-point Arithmetic.
1173 -------------------------------------------------------------------------------
1174 */
1175 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1176 {
1177  flag aSign, bSign;
1178 
1179  aSign = extractFloat32Sign( a );
1180  bSign = extractFloat32Sign( b );
1181  if ( aSign == bSign ) {
1182  return addFloat32Sigs( roundData, a, b, aSign );
1183  }
1184  else {
1185  return subFloat32Sigs( roundData, a, b, aSign );
1186  }
1187 
1188 }
1189 
1190 /*
1191 -------------------------------------------------------------------------------
1192 Returns the result of subtracting the single-precision floating-point values
1193 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1194 for Binary Floating-point Arithmetic.
1195 -------------------------------------------------------------------------------
1196 */
1197 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1198 {
1199  flag aSign, bSign;
1200 
1201  aSign = extractFloat32Sign( a );
1202  bSign = extractFloat32Sign( b );
1203  if ( aSign == bSign ) {
1204  return subFloat32Sigs( roundData, a, b, aSign );
1205  }
1206  else {
1207  return addFloat32Sigs( roundData, a, b, aSign );
1208  }
1209 
1210 }
1211 
1212 /*
1213 -------------------------------------------------------------------------------
1214 Returns the result of multiplying the single-precision floating-point values
1215 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1216 for Binary Floating-point Arithmetic.
1217 -------------------------------------------------------------------------------
1218 */
1219 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1220 {
1221  flag aSign, bSign, zSign;
1222  int16 aExp, bExp, zExp;
1223  bits32 aSig, bSig;
1224  bits64 zSig64;
1225  bits32 zSig;
1226 
1227  aSig = extractFloat32Frac( a );
1228  aExp = extractFloat32Exp( a );
1229  aSign = extractFloat32Sign( a );
1230  bSig = extractFloat32Frac( b );
1231  bExp = extractFloat32Exp( b );
1232  bSign = extractFloat32Sign( b );
1233  zSign = aSign ^ bSign;
1234  if ( aExp == 0xFF ) {
1235  if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1236  return propagateFloat32NaN( a, b );
1237  }
1238  if ( ( bExp | bSig ) == 0 ) {
1239  roundData->exception |= float_flag_invalid;
1240  return float32_default_nan;
1241  }
1242  return packFloat32( zSign, 0xFF, 0 );
1243  }
1244  if ( bExp == 0xFF ) {
1245  if ( bSig ) return propagateFloat32NaN( a, b );
1246  if ( ( aExp | aSig ) == 0 ) {
1247  roundData->exception |= float_flag_invalid;
1248  return float32_default_nan;
1249  }
1250  return packFloat32( zSign, 0xFF, 0 );
1251  }
1252  if ( aExp == 0 ) {
1253  if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1254  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1255  }
1256  if ( bExp == 0 ) {
1257  if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1258  normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1259  }
1260  zExp = aExp + bExp - 0x7F;
1261  aSig = ( aSig | 0x00800000 )<<7;
1262  bSig = ( bSig | 0x00800000 )<<8;
1263  shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1264  zSig = zSig64;
1265  if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1266  zSig <<= 1;
1267  --zExp;
1268  }
1269  return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1270 
1271 }
1272 
1273 /*
1274 -------------------------------------------------------------------------------
1275 Returns the result of dividing the single-precision floating-point value `a'
1276 by the corresponding value `b'. The operation is performed according to the
1277 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1278 -------------------------------------------------------------------------------
1279 */
1280 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1281 {
1282  flag aSign, bSign, zSign;
1283  int16 aExp, bExp, zExp;
1284  bits32 aSig, bSig, zSig;
1285 
1286  aSig = extractFloat32Frac( a );
1287  aExp = extractFloat32Exp( a );
1288  aSign = extractFloat32Sign( a );
1289  bSig = extractFloat32Frac( b );
1290  bExp = extractFloat32Exp( b );
1291  bSign = extractFloat32Sign( b );
1292  zSign = aSign ^ bSign;
1293  if ( aExp == 0xFF ) {
1294  if ( aSig ) return propagateFloat32NaN( a, b );
1295  if ( bExp == 0xFF ) {
1296  if ( bSig ) return propagateFloat32NaN( a, b );
1297  roundData->exception |= float_flag_invalid;
1298  return float32_default_nan;
1299  }
1300  return packFloat32( zSign, 0xFF, 0 );
1301  }
1302  if ( bExp == 0xFF ) {
1303  if ( bSig ) return propagateFloat32NaN( a, b );
1304  return packFloat32( zSign, 0, 0 );
1305  }
1306  if ( bExp == 0 ) {
1307  if ( bSig == 0 ) {
1308  if ( ( aExp | aSig ) == 0 ) {
1309  roundData->exception |= float_flag_invalid;
1310  return float32_default_nan;
1311  }
1312  roundData->exception |= float_flag_divbyzero;
1313  return packFloat32( zSign, 0xFF, 0 );
1314  }
1315  normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1316  }
1317  if ( aExp == 0 ) {
1318  if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1319  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1320  }
1321  zExp = aExp - bExp + 0x7D;
1322  aSig = ( aSig | 0x00800000 )<<7;
1323  bSig = ( bSig | 0x00800000 )<<8;
1324  if ( bSig <= ( aSig + aSig ) ) {
1325  aSig >>= 1;
1326  ++zExp;
1327  }
1328  {
1329  bits64 tmp = ( (bits64) aSig )<<32;
1330  do_div( tmp, bSig );
1331  zSig = tmp;
1332  }
1333  if ( ( zSig & 0x3F ) == 0 ) {
1334  zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1335  }
1336  return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1337 
1338 }
1339 
1340 /*
1341 -------------------------------------------------------------------------------
1342 Returns the remainder of the single-precision floating-point value `a'
1343 with respect to the corresponding value `b'. The operation is performed
1344 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1345 -------------------------------------------------------------------------------
1346 */
1347 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1348 {
1349  flag aSign, bSign, zSign;
1350  int16 aExp, bExp, expDiff;
1351  bits32 aSig, bSig;
1352  bits32 q;
1353  bits64 aSig64, bSig64, q64;
1354  bits32 alternateASig;
1355  sbits32 sigMean;
1356 
1357  aSig = extractFloat32Frac( a );
1358  aExp = extractFloat32Exp( a );
1359  aSign = extractFloat32Sign( a );
1360  bSig = extractFloat32Frac( b );
1361  bExp = extractFloat32Exp( b );
1362  bSign = extractFloat32Sign( b );
1363  if ( aExp == 0xFF ) {
1364  if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1365  return propagateFloat32NaN( a, b );
1366  }
1367  roundData->exception |= float_flag_invalid;
1368  return float32_default_nan;
1369  }
1370  if ( bExp == 0xFF ) {
1371  if ( bSig ) return propagateFloat32NaN( a, b );
1372  return a;
1373  }
1374  if ( bExp == 0 ) {
1375  if ( bSig == 0 ) {
1376  roundData->exception |= float_flag_invalid;
1377  return float32_default_nan;
1378  }
1379  normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1380  }
1381  if ( aExp == 0 ) {
1382  if ( aSig == 0 ) return a;
1383  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1384  }
1385  expDiff = aExp - bExp;
1386  aSig |= 0x00800000;
1387  bSig |= 0x00800000;
1388  if ( expDiff < 32 ) {
1389  aSig <<= 8;
1390  bSig <<= 8;
1391  if ( expDiff < 0 ) {
1392  if ( expDiff < -1 ) return a;
1393  aSig >>= 1;
1394  }
1395  q = ( bSig <= aSig );
1396  if ( q ) aSig -= bSig;
1397  if ( 0 < expDiff ) {
1398  bits64 tmp = ( (bits64) aSig )<<32;
1399  do_div( tmp, bSig );
1400  q = tmp;
1401  q >>= 32 - expDiff;
1402  bSig >>= 2;
1403  aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1404  }
1405  else {
1406  aSig >>= 2;
1407  bSig >>= 2;
1408  }
1409  }
1410  else {
1411  if ( bSig <= aSig ) aSig -= bSig;
1412  aSig64 = ( (bits64) aSig )<<40;
1413  bSig64 = ( (bits64) bSig )<<40;
1414  expDiff -= 64;
1415  while ( 0 < expDiff ) {
1416  q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417  q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418  aSig64 = - ( ( bSig * q64 )<<38 );
1419  expDiff -= 62;
1420  }
1421  expDiff += 64;
1422  q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423  q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424  q = q64>>( 64 - expDiff );
1425  bSig <<= 6;
1426  aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1427  }
1428  do {
1429  alternateASig = aSig;
1430  ++q;
1431  aSig -= bSig;
1432  } while ( 0 <= (sbits32) aSig );
1433  sigMean = aSig + alternateASig;
1434  if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435  aSig = alternateASig;
1436  }
1437  zSign = ( (sbits32) aSig < 0 );
1438  if ( zSign ) aSig = - aSig;
1439  return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1440 
1441 }
1442 
1443 /*
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1446 The operation is performed according to the IEC/IEEE Standard for Binary
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1449 */
1450 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1451 {
1452  flag aSign;
1453  int16 aExp, zExp;
1454  bits32 aSig, zSig;
1455  bits64 rem, term;
1456 
1457  aSig = extractFloat32Frac( a );
1458  aExp = extractFloat32Exp( a );
1459  aSign = extractFloat32Sign( a );
1460  if ( aExp == 0xFF ) {
1461  if ( aSig ) return propagateFloat32NaN( a, 0 );
1462  if ( ! aSign ) return a;
1463  roundData->exception |= float_flag_invalid;
1464  return float32_default_nan;
1465  }
1466  if ( aSign ) {
1467  if ( ( aExp | aSig ) == 0 ) return a;
1468  roundData->exception |= float_flag_invalid;
1469  return float32_default_nan;
1470  }
1471  if ( aExp == 0 ) {
1472  if ( aSig == 0 ) return 0;
1473  normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474  }
1475  zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476  aSig = ( aSig | 0x00800000 )<<8;
1477  zSig = estimateSqrt32( aExp, aSig ) + 2;
1478  if ( ( zSig & 0x7F ) <= 5 ) {
1479  if ( zSig < 2 ) {
1480  zSig = 0xFFFFFFFF;
1481  }
1482  else {
1483  aSig >>= aExp & 1;
1484  term = ( (bits64) zSig ) * zSig;
1485  rem = ( ( (bits64) aSig )<<32 ) - term;
1486  while ( (sbits64) rem < 0 ) {
1487  --zSig;
1488  rem += ( ( (bits64) zSig )<<1 ) | 1;
1489  }
1490  zSig |= ( rem != 0 );
1491  }
1492  }
1493  shift32RightJamming( zSig, 1, &zSig );
1494  return roundAndPackFloat32( roundData, 0, zExp, zSig );
1495 
1496 }
1497 
1498 /*
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1501 corresponding value `b', and 0 otherwise. The comparison is performed
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1504 */
1506 {
1507 
1508  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510  ) {
1513  }
1514  return 0;
1515  }
1516  return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517 
1518 }
1519 
1520 /*
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1523 equal to the corresponding value `b', and 0 otherwise. The comparison is
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1525 Arithmetic.
1526 -------------------------------------------------------------------------------
1527 */
1529 {
1530  flag aSign, bSign;
1531 
1532  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534  ) {
1536  return 0;
1537  }
1538  aSign = extractFloat32Sign( a );
1539  bSign = extractFloat32Sign( b );
1540  if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541  return ( a == b ) || ( aSign ^ ( a < b ) );
1542 
1543 }
1544 
1545 /*
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1548 the corresponding value `b', and 0 otherwise. The comparison is performed
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1551 */
1553 {
1554  flag aSign, bSign;
1555 
1556  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558  ) {
1560  return 0;
1561  }
1562  aSign = extractFloat32Sign( a );
1563  bSign = extractFloat32Sign( b );
1564  if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565  return ( a != b ) && ( aSign ^ ( a < b ) );
1566 
1567 }
1568 
1569 /*
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1572 corresponding value `b', and 0 otherwise. The invalid exception is raised
1573 if either operand is a NaN. Otherwise, the comparison is performed
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1576 */
1578 {
1579 
1580  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582  ) {
1584  return 0;
1585  }
1586  return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587 
1588 }
1589 
1590 /*
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1593 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1594 cause an exception. Otherwise, the comparison is performed according to the
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1597 */
1599 {
1600  flag aSign, bSign;
1601  //int16 aExp, bExp;
1602 
1603  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605  ) {
1606  /* Do nothing, even if NaN as we're quiet */
1607  return 0;
1608  }
1609  aSign = extractFloat32Sign( a );
1610  bSign = extractFloat32Sign( b );
1611  if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1612  return ( a == b ) || ( aSign ^ ( a < b ) );
1613 
1614 }
1615 
1616 /*
1617 -------------------------------------------------------------------------------
1618 Returns 1 if the single-precision floating-point value `a' is less than
1619 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1620 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1621 Standard for Binary Floating-point Arithmetic.
1622 -------------------------------------------------------------------------------
1623 */
1625 {
1626  flag aSign, bSign;
1627 
1628  if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1629  || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1630  ) {
1631  /* Do nothing, even if NaN as we're quiet */
1632  return 0;
1633  }
1634  aSign = extractFloat32Sign( a );
1635  bSign = extractFloat32Sign( b );
1636  if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637  return ( a != b ) && ( aSign ^ ( a < b ) );
1638 
1639 }
1640 
1641 /*
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode. If `a' is a NaN, the largest
1648 positive integer is returned. Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1651 */
1653 {
1654  flag aSign;
1655  int16 aExp, shiftCount;
1656  bits64 aSig;
1657 
1658  aSig = extractFloat64Frac( a );
1659  aExp = extractFloat64Exp( a );
1660  aSign = extractFloat64Sign( a );
1661  if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662  if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663  shiftCount = 0x42C - aExp;
1664  if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665  return roundAndPackInt32( roundData, aSign, aSig );
1666 
1667 }
1668 
1669 /*
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format. The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero. If
1675 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1677 returned.
1678 -------------------------------------------------------------------------------
1679 */
1681 {
1682  flag aSign;
1683  int16 aExp, shiftCount;
1684  bits64 aSig, savedASig;
1685  int32 z;
1686 
1687  aSig = extractFloat64Frac( a );
1688  aExp = extractFloat64Exp( a );
1689  aSign = extractFloat64Sign( a );
1690  shiftCount = 0x433 - aExp;
1691  if ( shiftCount < 21 ) {
1692  if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1693  goto invalid;
1694  }
1695  else if ( 52 < shiftCount ) {
1696  if ( aExp || aSig ) float_raise( float_flag_inexact );
1697  return 0;
1698  }
1699  aSig |= LIT64( 0x0010000000000000 );
1700  savedASig = aSig;
1701  aSig >>= shiftCount;
1702  z = aSig;
1703  if ( aSign ) z = - z;
1704  if ( ( z < 0 ) ^ aSign ) {
1705  invalid:
1707  return aSign ? 0x80000000 : 0x7FFFFFFF;
1708  }
1709  if ( ( aSig<<shiftCount ) != savedASig ) {
1711  }
1712  return z;
1713 
1714 }
1715 
1716 /*
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format. The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode. If `a' is a NaN, the largest
1723 positive integer is returned. Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1726 */
1728 {
1729  flag aSign;
1730  int16 aExp, shiftCount;
1731  bits64 aSig;
1732 
1733  aSig = extractFloat64Frac( a );
1734  aExp = extractFloat64Exp( a );
1735  aSign = 0; //extractFloat64Sign( a );
1736  //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737  if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738  shiftCount = 0x42C - aExp;
1739  if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740  return roundAndPackInt32( roundData, aSign, aSig );
1741 }
1742 
1743 /*
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format. The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero. If
1749 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1752 */
1754 {
1755  flag aSign;
1756  int16 aExp, shiftCount;
1757  bits64 aSig, savedASig;
1758  int32 z;
1759 
1760  aSig = extractFloat64Frac( a );
1761  aExp = extractFloat64Exp( a );
1762  aSign = extractFloat64Sign( a );
1763  shiftCount = 0x433 - aExp;
1764  if ( shiftCount < 21 ) {
1765  if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1766  goto invalid;
1767  }
1768  else if ( 52 < shiftCount ) {
1769  if ( aExp || aSig ) float_raise( float_flag_inexact );
1770  return 0;
1771  }
1772  aSig |= LIT64( 0x0010000000000000 );
1773  savedASig = aSig;
1774  aSig >>= shiftCount;
1775  z = aSig;
1776  if ( aSign ) z = - z;
1777  if ( ( z < 0 ) ^ aSign ) {
1778  invalid:
1780  return aSign ? 0x80000000 : 0x7FFFFFFF;
1781  }
1782  if ( ( aSig<<shiftCount ) != savedASig ) {
1784  }
1785  return z;
1786 }
1787 
1788 /*
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format. The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1793 Arithmetic.
1794 -------------------------------------------------------------------------------
1795 */
1797 {
1798  flag aSign;
1799  int16 aExp;
1800  bits64 aSig;
1801  bits32 zSig;
1802 
1803  aSig = extractFloat64Frac( a );
1804  aExp = extractFloat64Exp( a );
1805  aSign = extractFloat64Sign( a );
1806  if ( aExp == 0x7FF ) {
1807  if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808  return packFloat32( aSign, 0xFF, 0 );
1809  }
1810  shift64RightJamming( aSig, 22, &aSig );
1811  zSig = aSig;
1812  if ( aExp || zSig ) {
1813  zSig |= 0x40000000;
1814  aExp -= 0x381;
1815  }
1816  return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1817 
1818 }
1819 
1820 #ifdef FLOATX80
1821 
1822 /*
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format. The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1827 Arithmetic.
1828 -------------------------------------------------------------------------------
1829 */
1830 floatx80 float64_to_floatx80( float64 a )
1831 {
1832  flag aSign;
1833  int16 aExp;
1834  bits64 aSig;
1835 
1836  aSig = extractFloat64Frac( a );
1837  aExp = extractFloat64Exp( a );
1838  aSign = extractFloat64Sign( a );
1839  if ( aExp == 0x7FF ) {
1840  if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841  return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1842  }
1843  if ( aExp == 0 ) {
1844  if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845  normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1846  }
1847  return
1848  packFloatx80(
1849  aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1850 
1851 }
1852 
1853 #endif
1854 
1855 /*
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value. The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1862 */
1864 {
1865  flag aSign;
1866  int16 aExp;
1867  bits64 lastBitMask, roundBitsMask;
1868  int8 roundingMode;
1869  float64 z;
1870 
1871  aExp = extractFloat64Exp( a );
1872  if ( 0x433 <= aExp ) {
1873  if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874  return propagateFloat64NaN( a, a );
1875  }
1876  return a;
1877  }
1878  if ( aExp <= 0x3FE ) {
1879  if ( (bits64) ( a<<1 ) == 0 ) return a;
1880  roundData->exception |= float_flag_inexact;
1881  aSign = extractFloat64Sign( a );
1882  switch ( roundData->mode ) {
1884  if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885  return packFloat64( aSign, 0x3FF, 0 );
1886  }
1887  break;
1888  case float_round_down:
1889  return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890  case float_round_up:
1891  return
1892  aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893  }
1894  return packFloat64( aSign, 0, 0 );
1895  }
1896  lastBitMask = 1;
1897  lastBitMask <<= 0x433 - aExp;
1898  roundBitsMask = lastBitMask - 1;
1899  z = a;
1900  roundingMode = roundData->mode;
1901  if ( roundingMode == float_round_nearest_even ) {
1902  z += lastBitMask>>1;
1903  if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1904  }
1905  else if ( roundingMode != float_round_to_zero ) {
1906  if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1907  z += roundBitsMask;
1908  }
1909  }
1910  z &= ~ roundBitsMask;
1911  if ( z != a ) roundData->exception |= float_flag_inexact;
1912  return z;
1913 
1914 }
1915 
1916 /*
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1920 before being returned. `zSign' is ignored if the result is a NaN. The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1924 */
1925 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1926 {
1927  int16 aExp, bExp, zExp;
1928  bits64 aSig, bSig, zSig;
1929  int16 expDiff;
1930 
1931  aSig = extractFloat64Frac( a );
1932  aExp = extractFloat64Exp( a );
1933  bSig = extractFloat64Frac( b );
1934  bExp = extractFloat64Exp( b );
1935  expDiff = aExp - bExp;
1936  aSig <<= 9;
1937  bSig <<= 9;
1938  if ( 0 < expDiff ) {
1939  if ( aExp == 0x7FF ) {
1940  if ( aSig ) return propagateFloat64NaN( a, b );
1941  return a;
1942  }
1943  if ( bExp == 0 ) {
1944  --expDiff;
1945  }
1946  else {
1947  bSig |= LIT64( 0x2000000000000000 );
1948  }
1949  shift64RightJamming( bSig, expDiff, &bSig );
1950  zExp = aExp;
1951  }
1952  else if ( expDiff < 0 ) {
1953  if ( bExp == 0x7FF ) {
1954  if ( bSig ) return propagateFloat64NaN( a, b );
1955  return packFloat64( zSign, 0x7FF, 0 );
1956  }
1957  if ( aExp == 0 ) {
1958  ++expDiff;
1959  }
1960  else {
1961  aSig |= LIT64( 0x2000000000000000 );
1962  }
1963  shift64RightJamming( aSig, - expDiff, &aSig );
1964  zExp = bExp;
1965  }
1966  else {
1967  if ( aExp == 0x7FF ) {
1968  if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1969  return a;
1970  }
1971  if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972  zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1973  zExp = aExp;
1974  goto roundAndPack;
1975  }
1976  aSig |= LIT64( 0x2000000000000000 );
1977  zSig = ( aSig + bSig )<<1;
1978  --zExp;
1979  if ( (sbits64) zSig < 0 ) {
1980  zSig = aSig + bSig;
1981  ++zExp;
1982  }
1983  roundAndPack:
1984  return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1985 
1986 }
1987 
1988 /*
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'. If `zSign' is true, the
1992 difference is negated before being returned. `zSign' is ignored if the
1993 result is a NaN. The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1996 */
1997 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1998 {
1999  int16 aExp, bExp, zExp;
2000  bits64 aSig, bSig, zSig;
2001  int16 expDiff;
2002 
2003  aSig = extractFloat64Frac( a );
2004  aExp = extractFloat64Exp( a );
2005  bSig = extractFloat64Frac( b );
2006  bExp = extractFloat64Exp( b );
2007  expDiff = aExp - bExp;
2008  aSig <<= 10;
2009  bSig <<= 10;
2010  if ( 0 < expDiff ) goto aExpBigger;
2011  if ( expDiff < 0 ) goto bExpBigger;
2012  if ( aExp == 0x7FF ) {
2013  if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014  roundData->exception |= float_flag_invalid;
2015  return float64_default_nan;
2016  }
2017  if ( aExp == 0 ) {
2018  aExp = 1;
2019  bExp = 1;
2020  }
2021  if ( bSig < aSig ) goto aBigger;
2022  if ( aSig < bSig ) goto bBigger;
2023  return packFloat64( roundData->mode == float_round_down, 0, 0 );
2024  bExpBigger:
2025  if ( bExp == 0x7FF ) {
2026  if ( bSig ) return propagateFloat64NaN( a, b );
2027  return packFloat64( zSign ^ 1, 0x7FF, 0 );
2028  }
2029  if ( aExp == 0 ) {
2030  ++expDiff;
2031  }
2032  else {
2033  aSig |= LIT64( 0x4000000000000000 );
2034  }
2035  shift64RightJamming( aSig, - expDiff, &aSig );
2036  bSig |= LIT64( 0x4000000000000000 );
2037  bBigger:
2038  zSig = bSig - aSig;
2039  zExp = bExp;
2040  zSign ^= 1;
2041  goto normalizeRoundAndPack;
2042  aExpBigger:
2043  if ( aExp == 0x7FF ) {
2044  if ( aSig ) return propagateFloat64NaN( a, b );
2045  return a;
2046  }
2047  if ( bExp == 0 ) {
2048  --expDiff;
2049  }
2050  else {
2051  bSig |= LIT64( 0x4000000000000000 );
2052  }
2053  shift64RightJamming( bSig, expDiff, &bSig );
2054  aSig |= LIT64( 0x4000000000000000 );
2055  aBigger:
2056  zSig = aSig - bSig;
2057  zExp = aExp;
2058  normalizeRoundAndPack:
2059  --zExp;
2060  return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2061 
2062 }
2063 
2064 /*
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'. The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2070 */
2071 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2072 {
2073  flag aSign, bSign;
2074 
2075  aSign = extractFloat64Sign( a );
2076  bSign = extractFloat64Sign( b );
2077  if ( aSign == bSign ) {
2078  return addFloat64Sigs( roundData, a, b, aSign );
2079  }
2080  else {
2081  return subFloat64Sigs( roundData, a, b, aSign );
2082  }
2083 
2084 }
2085 
2086 /*
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2092 */
2093 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2094 {
2095  flag aSign, bSign;
2096 
2097  aSign = extractFloat64Sign( a );
2098  bSign = extractFloat64Sign( b );
2099  if ( aSign == bSign ) {
2100  return subFloat64Sigs( roundData, a, b, aSign );
2101  }
2102  else {
2103  return addFloat64Sigs( roundData, a, b, aSign );
2104  }
2105 
2106 }
2107 
2108 /*
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2114 */
2115 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2116 {
2117  flag aSign, bSign, zSign;
2118  int16 aExp, bExp, zExp;
2119  bits64 aSig, bSig, zSig0, zSig1;
2120 
2121  aSig = extractFloat64Frac( a );
2122  aExp = extractFloat64Exp( a );
2123  aSign = extractFloat64Sign( a );
2124  bSig = extractFloat64Frac( b );
2125  bExp = extractFloat64Exp( b );
2126  bSign = extractFloat64Sign( b );
2127  zSign = aSign ^ bSign;
2128  if ( aExp == 0x7FF ) {
2129  if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130  return propagateFloat64NaN( a, b );
2131  }
2132  if ( ( bExp | bSig ) == 0 ) {
2133  roundData->exception |= float_flag_invalid;
2134  return float64_default_nan;
2135  }
2136  return packFloat64( zSign, 0x7FF, 0 );
2137  }
2138  if ( bExp == 0x7FF ) {
2139  if ( bSig ) return propagateFloat64NaN( a, b );
2140  if ( ( aExp | aSig ) == 0 ) {
2141  roundData->exception |= float_flag_invalid;
2142  return float64_default_nan;
2143  }
2144  return packFloat64( zSign, 0x7FF, 0 );
2145  }
2146  if ( aExp == 0 ) {
2147  if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148  normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2149  }
2150  if ( bExp == 0 ) {
2151  if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152  normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2153  }
2154  zExp = aExp + bExp - 0x3FF;
2155  aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156  bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157  mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158  zSig0 |= ( zSig1 != 0 );
2159  if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2160  zSig0 <<= 1;
2161  --zExp;
2162  }
2163  return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2164 
2165 }
2166 
2167 /*
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'. The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2173 */
2174 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2175 {
2176  flag aSign, bSign, zSign;
2177  int16 aExp, bExp, zExp;
2178  bits64 aSig, bSig, zSig;
2179  bits64 rem0, rem1;
2180  bits64 term0, term1;
2181 
2182  aSig = extractFloat64Frac( a );
2183  aExp = extractFloat64Exp( a );
2184  aSign = extractFloat64Sign( a );
2185  bSig = extractFloat64Frac( b );
2186  bExp = extractFloat64Exp( b );
2187  bSign = extractFloat64Sign( b );
2188  zSign = aSign ^ bSign;
2189  if ( aExp == 0x7FF ) {
2190  if ( aSig ) return propagateFloat64NaN( a, b );
2191  if ( bExp == 0x7FF ) {
2192  if ( bSig ) return propagateFloat64NaN( a, b );
2193  roundData->exception |= float_flag_invalid;
2194  return float64_default_nan;
2195  }
2196  return packFloat64( zSign, 0x7FF, 0 );
2197  }
2198  if ( bExp == 0x7FF ) {
2199  if ( bSig ) return propagateFloat64NaN( a, b );
2200  return packFloat64( zSign, 0, 0 );
2201  }
2202  if ( bExp == 0 ) {
2203  if ( bSig == 0 ) {
2204  if ( ( aExp | aSig ) == 0 ) {
2205  roundData->exception |= float_flag_invalid;
2206  return float64_default_nan;
2207  }
2208  roundData->exception |= float_flag_divbyzero;
2209  return packFloat64( zSign, 0x7FF, 0 );
2210  }
2211  normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2212  }
2213  if ( aExp == 0 ) {
2214  if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215  normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2216  }
2217  zExp = aExp - bExp + 0x3FD;
2218  aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219  bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220  if ( bSig <= ( aSig + aSig ) ) {
2221  aSig >>= 1;
2222  ++zExp;
2223  }
2224  zSig = estimateDiv128To64( aSig, 0, bSig );
2225  if ( ( zSig & 0x1FF ) <= 2 ) {
2226  mul64To128( bSig, zSig, &term0, &term1 );
2227  sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228  while ( (sbits64) rem0 < 0 ) {
2229  --zSig;
2230  add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2231  }
2232  zSig |= ( rem1 != 0 );
2233  }
2234  return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2235 
2236 }
2237 
2238 /*
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'. The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2244 */
2245 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2246 {
2247  flag aSign, bSign, zSign;
2248  int16 aExp, bExp, expDiff;
2249  bits64 aSig, bSig;
2250  bits64 q, alternateASig;
2251  sbits64 sigMean;
2252 
2253  aSig = extractFloat64Frac( a );
2254  aExp = extractFloat64Exp( a );
2255  aSign = extractFloat64Sign( a );
2256  bSig = extractFloat64Frac( b );
2257  bExp = extractFloat64Exp( b );
2258  bSign = extractFloat64Sign( b );
2259  if ( aExp == 0x7FF ) {
2260  if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261  return propagateFloat64NaN( a, b );
2262  }
2263  roundData->exception |= float_flag_invalid;
2264  return float64_default_nan;
2265  }
2266  if ( bExp == 0x7FF ) {
2267  if ( bSig ) return propagateFloat64NaN( a, b );
2268  return a;
2269  }
2270  if ( bExp == 0 ) {
2271  if ( bSig == 0 ) {
2272  roundData->exception |= float_flag_invalid;
2273  return float64_default_nan;
2274  }
2275  normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2276  }
2277  if ( aExp == 0 ) {
2278  if ( aSig == 0 ) return a;
2279  normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2280  }
2281  expDiff = aExp - bExp;
2282  aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283  bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284  if ( expDiff < 0 ) {
2285  if ( expDiff < -1 ) return a;
2286  aSig >>= 1;
2287  }
2288  q = ( bSig <= aSig );
2289  if ( q ) aSig -= bSig;
2290  expDiff -= 64;
2291  while ( 0 < expDiff ) {
2292  q = estimateDiv128To64( aSig, 0, bSig );
2293  q = ( 2 < q ) ? q - 2 : 0;
2294  aSig = - ( ( bSig>>2 ) * q );
2295  expDiff -= 62;
2296  }
2297  expDiff += 64;
2298  if ( 0 < expDiff ) {
2299  q = estimateDiv128To64( aSig, 0, bSig );
2300  q = ( 2 < q ) ? q - 2 : 0;
2301  q >>= 64 - expDiff;
2302  bSig >>= 2;
2303  aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2304  }
2305  else {
2306  aSig >>= 2;
2307  bSig >>= 2;
2308  }
2309  do {
2310  alternateASig = aSig;
2311  ++q;
2312  aSig -= bSig;
2313  } while ( 0 <= (sbits64) aSig );
2314  sigMean = aSig + alternateASig;
2315  if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316  aSig = alternateASig;
2317  }
2318  zSign = ( (sbits64) aSig < 0 );
2319  if ( zSign ) aSig = - aSig;
2320  return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2321 
2322 }
2323 
2324 /*
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2330 */
2331 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2332 {
2333  flag aSign;
2334  int16 aExp, zExp;
2335  bits64 aSig, zSig;
2336  bits64 rem0, rem1, term0, term1; //, shiftedRem;
2337  //float64 z;
2338 
2339  aSig = extractFloat64Frac( a );
2340  aExp = extractFloat64Exp( a );
2341  aSign = extractFloat64Sign( a );
2342  if ( aExp == 0x7FF ) {
2343  if ( aSig ) return propagateFloat64NaN( a, a );
2344  if ( ! aSign ) return a;
2345  roundData->exception |= float_flag_invalid;
2346  return float64_default_nan;
2347  }
2348  if ( aSign ) {
2349  if ( ( aExp | aSig ) == 0 ) return a;
2350  roundData->exception |= float_flag_invalid;
2351  return float64_default_nan;
2352  }
2353  if ( aExp == 0 ) {
2354  if ( aSig == 0 ) return 0;
2355  normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2356  }
2357  zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358  aSig |= LIT64( 0x0010000000000000 );
2359  zSig = estimateSqrt32( aExp, aSig>>21 );
2360  zSig <<= 31;
2361  aSig <<= 9 - ( aExp & 1 );
2362  zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363  if ( ( zSig & 0x3FF ) <= 5 ) {
2364  if ( zSig < 2 ) {
2365  zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2366  }
2367  else {
2368  aSig <<= 2;
2369  mul64To128( zSig, zSig, &term0, &term1 );
2370  sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371  while ( (sbits64) rem0 < 0 ) {
2372  --zSig;
2373  shortShift128Left( 0, zSig, 1, &term0, &term1 );
2374  term1 |= 1;
2375  add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2376  }
2377  zSig |= ( ( rem0 | rem1 ) != 0 );
2378  }
2379  }
2380  shift64RightJamming( zSig, 1, &zSig );
2381  return roundAndPackFloat64( roundData, 0, zExp, zSig );
2382 
2383 }
2384 
2385 /*
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise. The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2391 */
2393 {
2394 
2395  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397  ) {
2400  }
2401  return 0;
2402  }
2403  return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2404 
2405 }
2406 
2407 /*
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise. The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2412 Arithmetic.
2413 -------------------------------------------------------------------------------
2414 */
2416 {
2417  flag aSign, bSign;
2418 
2419  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421  ) {
2423  return 0;
2424  }
2425  aSign = extractFloat64Sign( a );
2426  bSign = extractFloat64Sign( b );
2427  if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428  return ( a == b ) || ( aSign ^ ( a < b ) );
2429 
2430 }
2431 
2432 /*
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise. The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2438 */
2440 {
2441  flag aSign, bSign;
2442 
2443  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445  ) {
2447  return 0;
2448  }
2449  aSign = extractFloat64Sign( a );
2450  bSign = extractFloat64Sign( b );
2451  if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452  return ( a != b ) && ( aSign ^ ( a < b ) );
2453 
2454 }
2455 
2456 /*
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise. The invalid exception is raised
2460 if either operand is a NaN. Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2463 */
2465 {
2466 
2467  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469  ) {
2471  return 0;
2472  }
2473  return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2474 
2475 }
2476 
2477 /*
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2481 cause an exception. Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2484 */
2486 {
2487  flag aSign, bSign;
2488  //int16 aExp, bExp;
2489 
2490  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492  ) {
2493  /* Do nothing, even if NaN as we're quiet */
2494  return 0;
2495  }
2496  aSign = extractFloat64Sign( a );
2497  bSign = extractFloat64Sign( b );
2498  if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2499  return ( a == b ) || ( aSign ^ ( a < b ) );
2500 
2501 }
2502 
2503 /*
2504 -------------------------------------------------------------------------------
2505 Returns 1 if the double-precision floating-point value `a' is less than
2506 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2507 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2508 Standard for Binary Floating-point Arithmetic.
2509 -------------------------------------------------------------------------------
2510 */
2512 {
2513  flag aSign, bSign;
2514 
2515  if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2516  || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2517  ) {
2518  /* Do nothing, even if NaN as we're quiet */
2519  return 0;
2520  }
2521  aSign = extractFloat64Sign( a );
2522  bSign = extractFloat64Sign( b );
2523  if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2524  return ( a != b ) && ( aSign ^ ( a < b ) );
2525 
2526 }
2527 
2528 #ifdef FLOATX80
2529 
2530 /*
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format. The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic---which means in particular that the conversion
2536 is rounded according to the current rounding mode. If `a' is a NaN, the
2537 largest positive integer is returned. Otherwise, if the conversion
2538 overflows, the largest integer with the same sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2540 */
2541 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2542 {
2543  flag aSign;
2544  int32 aExp, shiftCount;
2545  bits64 aSig;
2546 
2547  aSig = extractFloatx80Frac( a );
2548  aExp = extractFloatx80Exp( a );
2549  aSign = extractFloatx80Sign( a );
2550  if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2551  shiftCount = 0x4037 - aExp;
2552  if ( shiftCount <= 0 ) shiftCount = 1;
2553  shift64RightJamming( aSig, shiftCount, &aSig );
2554  return roundAndPackInt32( roundData, aSign, aSig );
2555 
2556 }
2557 
2558 /*
2559 -------------------------------------------------------------------------------
2560 Returns the result of converting the extended double-precision floating-
2561 point value `a' to the 32-bit two's complement integer format. The
2562 conversion is performed according to the IEC/IEEE Standard for Binary
2563 Floating-point Arithmetic, except that the conversion is always rounded
2564 toward zero. If `a' is a NaN, the largest positive integer is returned.
2565 Otherwise, if the conversion overflows, the largest integer with the same
2566 sign as `a' is returned.
2567 -------------------------------------------------------------------------------
2568 */
2569 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2570 {
2571  flag aSign;
2572  int32 aExp, shiftCount;
2573  bits64 aSig, savedASig;
2574  int32 z;
2575 
2576  aSig = extractFloatx80Frac( a );
2577  aExp = extractFloatx80Exp( a );
2578  aSign = extractFloatx80Sign( a );
2579  shiftCount = 0x403E - aExp;
2580  if ( shiftCount < 32 ) {
2581  if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2582  goto invalid;
2583  }
2584  else if ( 63 < shiftCount ) {
2585  if ( aExp || aSig ) float_raise( float_flag_inexact );
2586  return 0;
2587  }
2588  savedASig = aSig;
2589  aSig >>= shiftCount;
2590  z = aSig;
2591  if ( aSign ) z = - z;
2592  if ( ( z < 0 ) ^ aSign ) {
2593  invalid:
2595  return aSign ? 0x80000000 : 0x7FFFFFFF;
2596  }
2597  if ( ( aSig<<shiftCount ) != savedASig ) {
2599  }
2600  return z;
2601 
2602 }
2603 
2604 /*
2605 -------------------------------------------------------------------------------
2606 Returns the result of converting the extended double-precision floating-
2607 point value `a' to the single-precision floating-point format. The
2608 conversion is performed according to the IEC/IEEE Standard for Binary
2609 Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2611 */
2612 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2613 {
2614  flag aSign;
2615  int32 aExp;
2616  bits64 aSig;
2617 
2618  aSig = extractFloatx80Frac( a );
2619  aExp = extractFloatx80Exp( a );
2620  aSign = extractFloatx80Sign( a );
2621  if ( aExp == 0x7FFF ) {
2622  if ( (bits64) ( aSig<<1 ) ) {
2623  return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2624  }
2625  return packFloat32( aSign, 0xFF, 0 );
2626  }
2627  shift64RightJamming( aSig, 33, &aSig );
2628  if ( aExp || aSig ) aExp -= 0x3F81;
2629  return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2630 
2631 }
2632 
2633 /*
2634 -------------------------------------------------------------------------------
2635 Returns the result of converting the extended double-precision floating-
2636 point value `a' to the double-precision floating-point format. The
2637 conversion is performed according to the IEC/IEEE Standard for Binary
2638 Floating-point Arithmetic.
2639 -------------------------------------------------------------------------------
2640 */
2641 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2642 {
2643  flag aSign;
2644  int32 aExp;
2645  bits64 aSig, zSig;
2646 
2647  aSig = extractFloatx80Frac( a );
2648  aExp = extractFloatx80Exp( a );
2649  aSign = extractFloatx80Sign( a );
2650  if ( aExp == 0x7FFF ) {
2651  if ( (bits64) ( aSig<<1 ) ) {
2652  return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2653  }
2654  return packFloat64( aSign, 0x7FF, 0 );
2655  }
2656  shift64RightJamming( aSig, 1, &zSig );
2657  if ( aExp || aSig ) aExp -= 0x3C01;
2658  return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2659 
2660 }
2661 
2662 /*
2663 -------------------------------------------------------------------------------
2664 Rounds the extended double-precision floating-point value `a' to an integer,
2665 and returns the result as an extended quadruple-precision floating-point
2666 value. The operation is performed according to the IEC/IEEE Standard for
2667 Binary Floating-point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
2670 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2671 {
2672  flag aSign;
2673  int32 aExp;
2674  bits64 lastBitMask, roundBitsMask;
2675  int8 roundingMode;
2676  floatx80 z;
2677 
2678  aExp = extractFloatx80Exp( a );
2679  if ( 0x403E <= aExp ) {
2680  if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2681  return propagateFloatx80NaN( a, a );
2682  }
2683  return a;
2684  }
2685  if ( aExp <= 0x3FFE ) {
2686  if ( ( aExp == 0 )
2687  && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2688  return a;
2689  }
2690  roundData->exception |= float_flag_inexact;
2691  aSign = extractFloatx80Sign( a );
2692  switch ( roundData->mode ) {
2694  if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2695  ) {
2696  return
2697  packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2698  }
2699  break;
2700  case float_round_down:
2701  return
2702  aSign ?
2703  packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2704  : packFloatx80( 0, 0, 0 );
2705  case float_round_up:
2706  return
2707  aSign ? packFloatx80( 1, 0, 0 )
2708  : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2709  }
2710  return packFloatx80( aSign, 0, 0 );
2711  }
2712  lastBitMask = 1;
2713  lastBitMask <<= 0x403E - aExp;
2714  roundBitsMask = lastBitMask - 1;
2715  z = a;
2716  roundingMode = roundData->mode;
2717  if ( roundingMode == float_round_nearest_even ) {
2718  z.low += lastBitMask>>1;
2719  if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2720  }
2721  else if ( roundingMode != float_round_to_zero ) {
2722  if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2723  z.low += roundBitsMask;
2724  }
2725  }
2726  z.low &= ~ roundBitsMask;
2727  if ( z.low == 0 ) {
2728  ++z.high;
2729  z.low = LIT64( 0x8000000000000000 );
2730  }
2731  if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2732  return z;
2733 
2734 }
2735 
2736 /*
2737 -------------------------------------------------------------------------------
2738 Returns the result of adding the absolute values of the extended double-
2739 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2740 negated before being returned. `zSign' is ignored if the result is a NaN.
2741 The addition is performed according to the IEC/IEEE Standard for Binary
2742 Floating-point Arithmetic.
2743 -------------------------------------------------------------------------------
2744 */
2745 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2746 {
2747  int32 aExp, bExp, zExp;
2748  bits64 aSig, bSig, zSig0, zSig1;
2749  int32 expDiff;
2750 
2751  aSig = extractFloatx80Frac( a );
2752  aExp = extractFloatx80Exp( a );
2753  bSig = extractFloatx80Frac( b );
2754  bExp = extractFloatx80Exp( b );
2755  expDiff = aExp - bExp;
2756  if ( 0 < expDiff ) {
2757  if ( aExp == 0x7FFF ) {
2758  if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2759  return a;
2760  }
2761  if ( bExp == 0 ) --expDiff;
2762  shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2763  zExp = aExp;
2764  }
2765  else if ( expDiff < 0 ) {
2766  if ( bExp == 0x7FFF ) {
2767  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2768  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2769  }
2770  if ( aExp == 0 ) ++expDiff;
2771  shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2772  zExp = bExp;
2773  }
2774  else {
2775  if ( aExp == 0x7FFF ) {
2776  if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2777  return propagateFloatx80NaN( a, b );
2778  }
2779  return a;
2780  }
2781  zSig1 = 0;
2782  zSig0 = aSig + bSig;
2783  if ( aExp == 0 ) {
2784  normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2785  goto roundAndPack;
2786  }
2787  zExp = aExp;
2788  goto shiftRight1;
2789  }
2790 
2791  zSig0 = aSig + bSig;
2792 
2793  if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2794  shiftRight1:
2795  shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2796  zSig0 |= LIT64( 0x8000000000000000 );
2797  ++zExp;
2798  roundAndPack:
2799  return
2800  roundAndPackFloatx80(
2801  roundData, zSign, zExp, zSig0, zSig1 );
2802 
2803 }
2804 
2805 /*
2806 -------------------------------------------------------------------------------
2807 Returns the result of subtracting the absolute values of the extended
2808 double-precision floating-point values `a' and `b'. If `zSign' is true,
2809 the difference is negated before being returned. `zSign' is ignored if the
2810 result is a NaN. The subtraction is performed according to the IEC/IEEE
2811 Standard for Binary Floating-point Arithmetic.
2812 -------------------------------------------------------------------------------
2813 */
2814 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2815 {
2816  int32 aExp, bExp, zExp;
2817  bits64 aSig, bSig, zSig0, zSig1;
2818  int32 expDiff;
2819  floatx80 z;
2820 
2821  aSig = extractFloatx80Frac( a );
2822  aExp = extractFloatx80Exp( a );
2823  bSig = extractFloatx80Frac( b );
2824  bExp = extractFloatx80Exp( b );
2825  expDiff = aExp - bExp;
2826  if ( 0 < expDiff ) goto aExpBigger;
2827  if ( expDiff < 0 ) goto bExpBigger;
2828  if ( aExp == 0x7FFF ) {
2829  if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2830  return propagateFloatx80NaN( a, b );
2831  }
2832  roundData->exception |= float_flag_invalid;
2833  z.low = floatx80_default_nan_low;
2834  z.high = floatx80_default_nan_high;
2835  z.__padding = 0;
2836  return z;
2837  }
2838  if ( aExp == 0 ) {
2839  aExp = 1;
2840  bExp = 1;
2841  }
2842  zSig1 = 0;
2843  if ( bSig < aSig ) goto aBigger;
2844  if ( aSig < bSig ) goto bBigger;
2845  return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2846  bExpBigger:
2847  if ( bExp == 0x7FFF ) {
2848  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2849  return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2850  }
2851  if ( aExp == 0 ) ++expDiff;
2852  shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2853  bBigger:
2854  sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2855  zExp = bExp;
2856  zSign ^= 1;
2857  goto normalizeRoundAndPack;
2858  aExpBigger:
2859  if ( aExp == 0x7FFF ) {
2860  if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2861  return a;
2862  }
2863  if ( bExp == 0 ) --expDiff;
2864  shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2865  aBigger:
2866  sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2867  zExp = aExp;
2868  normalizeRoundAndPack:
2869  return
2870  normalizeRoundAndPackFloatx80(
2871  roundData, zSign, zExp, zSig0, zSig1 );
2872 
2873 }
2874 
2875 /*
2876 -------------------------------------------------------------------------------
2877 Returns the result of adding the extended double-precision floating-point
2878 values `a' and `b'. The operation is performed according to the IEC/IEEE
2879 Standard for Binary Floating-point Arithmetic.
2880 -------------------------------------------------------------------------------
2881 */
2882 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2883 {
2884  flag aSign, bSign;
2885 
2886  aSign = extractFloatx80Sign( a );
2887  bSign = extractFloatx80Sign( b );
2888  if ( aSign == bSign ) {
2889  return addFloatx80Sigs( roundData, a, b, aSign );
2890  }
2891  else {
2892  return subFloatx80Sigs( roundData, a, b, aSign );
2893  }
2894 
2895 }
2896 
2897 /*
2898 -------------------------------------------------------------------------------
2899 Returns the result of subtracting the extended double-precision floating-
2900 point values `a' and `b'. The operation is performed according to the
2901 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2902 -------------------------------------------------------------------------------
2903 */
2904 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2905 {
2906  flag aSign, bSign;
2907 
2908  aSign = extractFloatx80Sign( a );
2909  bSign = extractFloatx80Sign( b );
2910  if ( aSign == bSign ) {
2911  return subFloatx80Sigs( roundData, a, b, aSign );
2912  }
2913  else {
2914  return addFloatx80Sigs( roundData, a, b, aSign );
2915  }
2916 
2917 }
2918 
2919 /*
2920 -------------------------------------------------------------------------------
2921 Returns the result of multiplying the extended double-precision floating-
2922 point values `a' and `b'. The operation is performed according to the
2923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2924 -------------------------------------------------------------------------------
2925 */
2926 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2927 {
2928  flag aSign, bSign, zSign;
2929  int32 aExp, bExp, zExp;
2930  bits64 aSig, bSig, zSig0, zSig1;
2931  floatx80 z;
2932 
2933  aSig = extractFloatx80Frac( a );
2934  aExp = extractFloatx80Exp( a );
2935  aSign = extractFloatx80Sign( a );
2936  bSig = extractFloatx80Frac( b );
2937  bExp = extractFloatx80Exp( b );
2938  bSign = extractFloatx80Sign( b );
2939  zSign = aSign ^ bSign;
2940  if ( aExp == 0x7FFF ) {
2941  if ( (bits64) ( aSig<<1 )
2942  || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2943  return propagateFloatx80NaN( a, b );
2944  }
2945  if ( ( bExp | bSig ) == 0 ) goto invalid;
2946  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2947  }
2948  if ( bExp == 0x7FFF ) {
2949  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2950  if ( ( aExp | aSig ) == 0 ) {
2951  invalid:
2952  roundData->exception |= float_flag_invalid;
2953  z.low = floatx80_default_nan_low;
2954  z.high = floatx80_default_nan_high;
2955  z.__padding = 0;
2956  return z;
2957  }
2958  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2959  }
2960  if ( aExp == 0 ) {
2961  if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2962  normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2963  }
2964  if ( bExp == 0 ) {
2965  if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2966  normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2967  }
2968  zExp = aExp + bExp - 0x3FFE;
2969  mul64To128( aSig, bSig, &zSig0, &zSig1 );
2970  if ( 0 < (sbits64) zSig0 ) {
2971  shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2972  --zExp;
2973  }
2974  return
2975  roundAndPackFloatx80(
2976  roundData, zSign, zExp, zSig0, zSig1 );
2977 
2978 }
2979 
2980 /*
2981 -------------------------------------------------------------------------------
2982 Returns the result of dividing the extended double-precision floating-point
2983 value `a' by the corresponding value `b'. The operation is performed
2984 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2985 -------------------------------------------------------------------------------
2986 */
2987 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2988 {
2989  flag aSign, bSign, zSign;
2990  int32 aExp, bExp, zExp;
2991  bits64 aSig, bSig, zSig0, zSig1;
2992  bits64 rem0, rem1, rem2, term0, term1, term2;
2993  floatx80 z;
2994 
2995  aSig = extractFloatx80Frac( a );
2996  aExp = extractFloatx80Exp( a );
2997  aSign = extractFloatx80Sign( a );
2998  bSig = extractFloatx80Frac( b );
2999  bExp = extractFloatx80Exp( b );
3000  bSign = extractFloatx80Sign( b );
3001  zSign = aSign ^ bSign;
3002  if ( aExp == 0x7FFF ) {
3003  if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3004  if ( bExp == 0x7FFF ) {
3005  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006  goto invalid;
3007  }
3008  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3009  }
3010  if ( bExp == 0x7FFF ) {
3011  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012  return packFloatx80( zSign, 0, 0 );
3013  }
3014  if ( bExp == 0 ) {
3015  if ( bSig == 0 ) {
3016  if ( ( aExp | aSig ) == 0 ) {
3017  invalid:
3018  roundData->exception |= float_flag_invalid;
3019  z.low = floatx80_default_nan_low;
3020  z.high = floatx80_default_nan_high;
3021  z.__padding = 0;
3022  return z;
3023  }
3024  roundData->exception |= float_flag_divbyzero;
3025  return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3026  }
3027  normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3028  }
3029  if ( aExp == 0 ) {
3030  if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3031  normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3032  }
3033  zExp = aExp - bExp + 0x3FFE;
3034  rem1 = 0;
3035  if ( bSig <= aSig ) {
3036  shift128Right( aSig, 0, 1, &aSig, &rem1 );
3037  ++zExp;
3038  }
3039  zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3040  mul64To128( bSig, zSig0, &term0, &term1 );
3041  sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3042  while ( (sbits64) rem0 < 0 ) {
3043  --zSig0;
3044  add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3045  }
3046  zSig1 = estimateDiv128To64( rem1, 0, bSig );
3047  if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3048  mul64To128( bSig, zSig1, &term1, &term2 );
3049  sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3050  while ( (sbits64) rem1 < 0 ) {
3051  --zSig1;
3052  add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3053  }
3054  zSig1 |= ( ( rem1 | rem2 ) != 0 );
3055  }
3056  return
3057  roundAndPackFloatx80(
3058  roundData, zSign, zExp, zSig0, zSig1 );
3059 
3060 }
3061 
3062 /*
3063 -------------------------------------------------------------------------------
3064 Returns the remainder of the extended double-precision floating-point value
3065 `a' with respect to the corresponding value `b'. The operation is performed
3066 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3067 -------------------------------------------------------------------------------
3068 */
3069 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3070 {
3071  flag aSign, bSign, zSign;
3072  int32 aExp, bExp, expDiff;
3073  bits64 aSig0, aSig1, bSig;
3074  bits64 q, term0, term1, alternateASig0, alternateASig1;
3075  floatx80 z;
3076 
3077  aSig0 = extractFloatx80Frac( a );
3078  aExp = extractFloatx80Exp( a );
3079  aSign = extractFloatx80Sign( a );
3080  bSig = extractFloatx80Frac( b );
3081  bExp = extractFloatx80Exp( b );
3082  bSign = extractFloatx80Sign( b );
3083  if ( aExp == 0x7FFF ) {
3084  if ( (bits64) ( aSig0<<1 )
3085  || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3086  return propagateFloatx80NaN( a, b );
3087  }
3088  goto invalid;
3089  }
3090  if ( bExp == 0x7FFF ) {
3091  if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3092  return a;
3093  }
3094  if ( bExp == 0 ) {
3095  if ( bSig == 0 ) {
3096  invalid:
3097  roundData->exception |= float_flag_invalid;
3098  z.low = floatx80_default_nan_low;
3099  z.high = floatx80_default_nan_high;
3100  z.__padding = 0;
3101  return z;
3102  }
3103  normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3104  }
3105  if ( aExp == 0 ) {
3106  if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107  normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3108  }
3109  bSig |= LIT64( 0x8000000000000000 );
3110  zSign = aSign;
3111  expDiff = aExp - bExp;
3112  aSig1 = 0;
3113  if ( expDiff < 0 ) {
3114  if ( expDiff < -1 ) return a;
3115  shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3116  expDiff = 0;
3117  }
3118  q = ( bSig <= aSig0 );
3119  if ( q ) aSig0 -= bSig;
3120  expDiff -= 64;
3121  while ( 0 < expDiff ) {
3122  q = estimateDiv128To64( aSig0, aSig1, bSig );
3123  q = ( 2 < q ) ? q - 2 : 0;
3124  mul64To128( bSig, q, &term0, &term1 );
3125  sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126  shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3127  expDiff -= 62;
3128  }
3129  expDiff += 64;
3130  if ( 0 < expDiff ) {
3131  q = estimateDiv128To64( aSig0, aSig1, bSig );
3132  q = ( 2 < q ) ? q - 2 : 0;
3133  q >>= 64 - expDiff;
3134  mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135  sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136  shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137  while ( le128( term0, term1, aSig0, aSig1 ) ) {
3138  ++q;
3139  sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140  }
3141  }
3142  else {
3143  term1 = 0;
3144  term0 = bSig;
3145  }
3146  sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147  if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148  || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3149  && ( q & 1 ) )
3150  ) {
3151  aSig0 = alternateASig0;
3152  aSig1 = alternateASig1;
3153  zSign = ! zSign;
3154  }
3155 
3156  return
3157  normalizeRoundAndPackFloatx80(
3158  roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3159 
3160 }
3161 
3162 /*
3163 -------------------------------------------------------------------------------
3164 Returns the square root of the extended double-precision floating-point
3165 value `a'. The operation is performed according to the IEC/IEEE Standard
3166 for Binary Floating-point Arithmetic.
3167 -------------------------------------------------------------------------------
3168 */
3169 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3170 {
3171  flag aSign;
3172  int32 aExp, zExp;
3173  bits64 aSig0, aSig1, zSig0, zSig1;
3174  bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3175  bits64 shiftedRem0, shiftedRem1;
3176  floatx80 z;
3177 
3178  aSig0 = extractFloatx80Frac( a );
3179  aExp = extractFloatx80Exp( a );
3180  aSign = extractFloatx80Sign( a );
3181  if ( aExp == 0x7FFF ) {
3182  if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3183  if ( ! aSign ) return a;
3184  goto invalid;
3185  }
3186  if ( aSign ) {
3187  if ( ( aExp | aSig0 ) == 0 ) return a;
3188  invalid:
3189  roundData->exception |= float_flag_invalid;
3190  z.low = floatx80_default_nan_low;
3191  z.high = floatx80_default_nan_high;
3192  z.__padding = 0;
3193  return z;
3194  }
3195  if ( aExp == 0 ) {
3196  if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3197  normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3198  }
3199  zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3200  zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3201  zSig0 <<= 31;
3202  aSig1 = 0;
3203  shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3204  zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3205  if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3206  shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3207  mul64To128( zSig0, zSig0, &term0, &term1 );
3208  sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3209  while ( (sbits64) rem0 < 0 ) {
3210  --zSig0;
3211  shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3212  term1 |= 1;
3213  add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3214  }
3215  shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3216  zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3217  if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3218  if ( zSig1 == 0 ) zSig1 = 1;
3219  mul64To128( zSig0, zSig1, &term1, &term2 );
3220  shortShift128Left( term1, term2, 1, &term1, &term2 );
3221  sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3222  mul64To128( zSig1, zSig1, &term2, &term3 );
3223  sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3224  while ( (sbits64) rem1 < 0 ) {
3225  --zSig1;
3226  shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3227  term3 |= 1;
3228  add192(
3229  rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3230  }
3231  zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3232  }
3233  return
3234  roundAndPackFloatx80(
3235  roundData, 0, zExp, zSig0, zSig1 );
3236 
3237 }
3238 
3239 /*
3240 -------------------------------------------------------------------------------
3241 Returns 1 if the extended double-precision floating-point value `a' is
3242 equal to the corresponding value `b', and 0 otherwise. The comparison is
3243 performed according to the IEC/IEEE Standard for Binary Floating-point
3244 Arithmetic.
3245 -------------------------------------------------------------------------------
3246 */
3247 flag floatx80_eq( floatx80 a, floatx80 b )
3248 {
3249 
3250  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3251  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3252  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3253  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3254  ) {
3255  if ( floatx80_is_signaling_nan( a )
3256  || floatx80_is_signaling_nan( b ) ) {
3258  }
3259  return 0;
3260  }
3261  return
3262  ( a.low == b.low )
3263  && ( ( a.high == b.high )
3264  || ( ( a.low == 0 )
3265  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3266  );
3267 
3268 }
3269 
3270 /*
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the extended double-precision floating-point value `a' is
3273 less than or equal to the corresponding value `b', and 0 otherwise. The
3274 comparison is performed according to the IEC/IEEE Standard for Binary
3275 Floating-point Arithmetic.
3276 -------------------------------------------------------------------------------
3277 */
3278 flag floatx80_le( floatx80 a, floatx80 b )
3279 {
3280  flag aSign, bSign;
3281 
3282  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3283  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3284  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3285  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3286  ) {
3288  return 0;
3289  }
3290  aSign = extractFloatx80Sign( a );
3291  bSign = extractFloatx80Sign( b );
3292  if ( aSign != bSign ) {
3293  return
3294  aSign
3295  || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3296  == 0 );
3297  }
3298  return
3299  aSign ? le128( b.high, b.low, a.high, a.low )
3300  : le128( a.high, a.low, b.high, b.low );
3301 
3302 }
3303 
3304 /*
3305 -------------------------------------------------------------------------------
3306 Returns 1 if the extended double-precision floating-point value `a' is
3307 less than the corresponding value `b', and 0 otherwise. The comparison
3308 is performed according to the IEC/IEEE Standard for Binary Floating-point
3309 Arithmetic.
3310 -------------------------------------------------------------------------------
3311 */
3312 flag floatx80_lt( floatx80 a, floatx80 b )
3313 {
3314  flag aSign, bSign;
3315 
3316  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3317  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3318  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3319  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3320  ) {
3322  return 0;
3323  }
3324  aSign = extractFloatx80Sign( a );
3325  bSign = extractFloatx80Sign( b );
3326  if ( aSign != bSign ) {
3327  return
3328  aSign
3329  && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3330  != 0 );
3331  }
3332  return
3333  aSign ? lt128( b.high, b.low, a.high, a.low )
3334  : lt128( a.high, a.low, b.high, b.low );
3335 
3336 }
3337 
3338 /*
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is equal
3341 to the corresponding value `b', and 0 otherwise. The invalid exception is
3342 raised if either operand is a NaN. Otherwise, the comparison is performed
3343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3345 */
3346 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347 {
3348 
3349  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3350  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3351  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3352  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3353  ) {
3355  return 0;
3356  }
3357  return
3358  ( a.low == b.low )
3359  && ( ( a.high == b.high )
3360  || ( ( a.low == 0 )
3361  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3362  );
3363 
3364 }
3365 
3366 /*
3367 -------------------------------------------------------------------------------
3368 Returns 1 if the extended double-precision floating-point value `a' is less
3369 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3370 do not cause an exception. Otherwise, the comparison is performed according
3371 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3372 -------------------------------------------------------------------------------
3373 */
3374 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3375 {
3376  flag aSign, bSign;
3377 
3378  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3379  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3380  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3381  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3382  ) {
3383  /* Do nothing, even if NaN as we're quiet */
3384  return 0;
3385  }
3386  aSign = extractFloatx80Sign( a );
3387  bSign = extractFloatx80Sign( b );
3388  if ( aSign != bSign ) {
3389  return
3390  aSign
3391  || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3392  == 0 );
3393  }
3394  return
3395  aSign ? le128( b.high, b.low, a.high, a.low )
3396  : le128( a.high, a.low, b.high, b.low );
3397 
3398 }
3399 
3400 /*
3401 -------------------------------------------------------------------------------
3402 Returns 1 if the extended double-precision floating-point value `a' is less
3403 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3404 an exception. Otherwise, the comparison is performed according to the
3405 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3406 -------------------------------------------------------------------------------
3407 */
3408 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3409 {
3410  flag aSign, bSign;
3411 
3412  if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3413  && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3414  || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3415  && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3416  ) {
3417  /* Do nothing, even if NaN as we're quiet */
3418  return 0;
3419  }
3420  aSign = extractFloatx80Sign( a );
3421  bSign = extractFloatx80Sign( b );
3422  if ( aSign != bSign ) {
3423  return
3424  aSign
3425  && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3426  != 0 );
3427  }
3428  return
3429  aSign ? lt128( b.high, b.low, a.high, a.low )
3430  : lt128( a.high, a.low, b.high, b.low );
3431 
3432 }
3433 
3434 #endif
3435