arch/arm26/nwfpe/softfloat.c

   1 /*
   2 ===============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
   5 Arithmetic Package, Release 2.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  15 arithmetic/softfloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  22
  23 Derivative works are acceptable, even for commercial purposes, so long as
  24 (1) they include prominent notice that the work is derivative, and (2) they
  25 include prominent notice akin to these three paragraphs for those parts of
  26 this code that are retained.
  27
  28 ===============================================================================
  29 */
  30
  31 #include "fpa11.h"
  32 #include "milieu.h"
  33 #include "softfloat.h"
  34
  35 /*
  36 -------------------------------------------------------------------------------
  37 Floating-point rounding mode, extended double-precision rounding precision,
  38 and exception flags.
  39 -------------------------------------------------------------------------------
  40 */
  41 int8 float_rounding_mode = float_round_nearest_even;
  42 int8 floatx80_rounding_precision = 80;
  43 int8 float_exception_flags;
  44
  45 /*
  46 -------------------------------------------------------------------------------
  47 Primitive arithmetic functions, including multi-word arithmetic, and
  48 division and square root approximations.  (Can be specialized to target if
  49 desired.)
  50 -------------------------------------------------------------------------------
  51 */
  52 #include "softfloat-macros"
  53
  54 /*
  55 -------------------------------------------------------------------------------
  56 Functions and definitions to determine:  (1) whether tininess for underflow
  57 is detected before or after rounding by default, (2) what (if anything)
  58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
  59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  60 are propagated from function inputs to output.  These details are target-
  61 specific.
  62 -------------------------------------------------------------------------------
  63 */
  64 #include "softfloat-specialize"
  65
  66 /*
  67 -------------------------------------------------------------------------------
  68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  69 and 7, and returns the properly rounded 32-bit integer corresponding to the
  70 input.  If `zSign' is nonzero, the input is negated before being converted
  71 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
  72 input is simply rounded to an integer, with the inexact exception raised if
  73 the input cannot be represented exactly as an integer.  If the fixed-point
  74 input is too large, however, the invalid exception is raised and the largest
  75 positive or negative integer is returned.
  76 -------------------------------------------------------------------------------
  77 */
  78 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  79 {
  80     int8 roundingMode;
  81     flag roundNearestEven;
  82     int8 roundIncrement, roundBits;
  83     int32 z;
  84
  85     roundingMode = float_rounding_mode;
  86     roundNearestEven = ( roundingMode == float_round_nearest_even );
  87     roundIncrement = 0x40;
  88     if ( ! roundNearestEven ) {
  89         if ( roundingMode == float_round_to_zero ) {
  90             roundIncrement = 0;
  91         }
  92         else {
  93             roundIncrement = 0x7F;
  94             if ( zSign ) {
  95                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  96             }
  97             else {
  98                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  99             }
 100         }
 101     }
 102     roundBits = absZ & 0x7F;
 103     absZ = ( absZ + roundIncrement )>>7;
 104     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 105     z = absZ;
 106     if ( zSign ) z = - z;
 107     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 108         float_exception_flags |= float_flag_invalid;
 109         return zSign ? 0x80000000 : 0x7FFFFFFF;
 110     }
 111     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 112     return z;
 113
 114 }
 115
 116 /*
 117 -------------------------------------------------------------------------------
 118 Returns the fraction bits of the single-precision floating-point value `a'.
 119 -------------------------------------------------------------------------------
 120 */
 121 INLINE bits32 extractFloat32Frac( float32 a )
 122 {
 123
 124     return a & 0x007FFFFF;
 125
 126 }
 127
 128 /*
 129 -------------------------------------------------------------------------------
 130 Returns the exponent bits of the single-precision floating-point value `a'.
 131 -------------------------------------------------------------------------------
 132 */
 133 INLINE int16 extractFloat32Exp( float32 a )
 134 {
 135
 136     return ( a>>23 ) & 0xFF;
 137
 138 }
 139
 140 /*
 141 -------------------------------------------------------------------------------
 142 Returns the sign bit of the single-precision floating-point value `a'.
 143 -------------------------------------------------------------------------------
 144 */
 145 INLINE flag extractFloat32Sign( float32 a )
 146 {
 147
 148     return a>>31;
 149
 150 }
 151
 152 /*
 153 -------------------------------------------------------------------------------
 154 Normalizes the subnormal single-precision floating-point value represented
 155 by the denormalized significand `aSig'.  The normalized exponent and
 156 significand are stored at the locations pointed to by `zExpPtr' and
 157 `zSigPtr', respectively.
 158 -------------------------------------------------------------------------------
 159 */
 160 static void
 161  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 162 {
 163     int8 shiftCount;
 164
 165     shiftCount = countLeadingZeros32( aSig ) - 8;
 166     *zSigPtr = aSig<<shiftCount;
 167     *zExpPtr = 1 - shiftCount;
 168
 169 }
 170
 171 /*
 172 -------------------------------------------------------------------------------
 173 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 174 single-precision floating-point value, returning the result.  After being
 175 shifted into the proper positions, the three fields are simply added
 176 together to form the result.  This means that any integer portion of `zSig'
 177 will be added into the exponent.  Since a properly normalized significand
 178 will have an integer portion equal to 1, the `zExp' input should be 1 less
 179 than the desired result exponent whenever `zSig' is a complete, normalized
 180 significand.
 181 -------------------------------------------------------------------------------
 182 */
 183 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 184 {
 185 #if 0
 186    float32 f;
 187    __asm__("@ packFloat32;              \n\
 188             mov %0, %1, asl #31;        \n\
 189             orr %0, %2, asl #23;        \n\
 190             orr %0, %3"
 191             : /* no outputs */
 192             : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
 193             : "cc");
 194    return f;
 195 #else
 196     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 197 #endif
 198 }
 199
 200 /*
 201 -------------------------------------------------------------------------------
 202 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 203 and significand `zSig', and returns the proper single-precision floating-
 204 point value corresponding to the abstract input.  Ordinarily, the abstract
 205 value is simply rounded and packed into the single-precision format, with
 206 the inexact exception raised if the abstract input cannot be represented
 207 exactly.  If the abstract value is too large, however, the overflow and
 208 inexact exceptions are raised and an infinity or maximal finite value is
 209 returned.  If the abstract value is too small, the input value is rounded to
 210 a subnormal number, and the underflow and inexact exceptions are raised if
 211 the abstract input cannot be represented exactly as a subnormal single-
 212 precision floating-point number.
 213     The input significand `zSig' has its binary point between bits 30
 214 and 29, which is 7 bits to the left of the usual location.  This shifted
 215 significand must be normalized or smaller.  If `zSig' is not normalized,
 216 `zExp' must be 0; in that case, the result returned is a subnormal number,
 217 and it must not require rounding.  In the usual case that `zSig' is
 218 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 219 The handling of underflow and overflow follows the IEC/IEEE Standard for
 220 Binary Floating-point Arithmetic.
 221 -------------------------------------------------------------------------------
 222 */
 223 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 224 {
 225     int8 roundingMode;
 226     flag roundNearestEven;
 227     int8 roundIncrement, roundBits;
 228     flag isTiny;
 229
 230     roundingMode = float_rounding_mode;
 231     roundNearestEven = ( roundingMode == float_round_nearest_even );
 232     roundIncrement = 0x40;
 233     if ( ! roundNearestEven ) {
 234         if ( roundingMode == float_round_to_zero ) {
 235             roundIncrement = 0;
 236         }
 237         else {
 238             roundIncrement = 0x7F;
 239             if ( zSign ) {
 240                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 241             }
 242             else {
 243                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 244             }
 245         }
 246     }
 247     roundBits = zSig & 0x7F;
 248     if ( 0xFD <= (bits16) zExp ) {
 249         if (    ( 0xFD < zExp )
 250              || (    ( zExp == 0xFD )
 251                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 252            ) {
 253             float_raise( float_flag_overflow | float_flag_inexact );
 254             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 255         }
 256         if ( zExp < 0 ) {
 257             isTiny =
 258                    ( float_detect_tininess == float_tininess_before_rounding )
 259                 || ( zExp < -1 )
 260                 || ( zSig + roundIncrement < 0x80000000 );
 261             shift32RightJamming( zSig, - zExp, &zSig );
 262             zExp = 0;
 263             roundBits = zSig & 0x7F;
 264             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 265         }
 266     }
 267     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 268     zSig = ( zSig + roundIncrement )>>7;
 269     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 270     if ( zSig == 0 ) zExp = 0;
 271     return packFloat32( zSign, zExp, zSig );
 272
 273 }
 274
 275 /*
 276 -------------------------------------------------------------------------------
 277 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 278 and significand `zSig', and returns the proper single-precision floating-
 279 point value corresponding to the abstract input.  This routine is just like
 280 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
 281 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 282 point exponent.
 283 -------------------------------------------------------------------------------
 284 */
 285 static float32
 286  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 287 {
 288     int8 shiftCount;
 289
 290     shiftCount = countLeadingZeros32( zSig ) - 1;
 291     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 292
 293 }
 294
 295 /*
 296 -------------------------------------------------------------------------------
 297 Returns the fraction bits of the double-precision floating-point value `a'.
 298 -------------------------------------------------------------------------------
 299 */
 300 INLINE bits64 extractFloat64Frac( float64 a )
 301 {
 302
 303     return a & LIT64( 0x000FFFFFFFFFFFFF );
 304
 305 }
 306
 307 /*
 308 -------------------------------------------------------------------------------
 309 Returns the exponent bits of the double-precision floating-point value `a'.
 310 -------------------------------------------------------------------------------
 311 */
 312 INLINE int16 extractFloat64Exp( float64 a )
 313 {
 314
 315     return ( a>>52 ) & 0x7FF;
 316
 317 }
 318
 319 /*
 320 -------------------------------------------------------------------------------
 321 Returns the sign bit of the double-precision floating-point value `a'.
 322 -------------------------------------------------------------------------------
 323 */
 324 INLINE flag extractFloat64Sign( float64 a )
 325 {
 326
 327     return a>>63;
 328
 329 }
 330
 331 /*
 332 -------------------------------------------------------------------------------
 333 Normalizes the subnormal double-precision floating-point value represented
 334 by the denormalized significand `aSig'.  The normalized exponent and
 335 significand are stored at the locations pointed to by `zExpPtr' and
 336 `zSigPtr', respectively.
 337 -------------------------------------------------------------------------------
 338 */
 339 static void
 340  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 341 {
 342     int8 shiftCount;
 343
 344     shiftCount = countLeadingZeros64( aSig ) - 11;
 345     *zSigPtr = aSig<<shiftCount;
 346     *zExpPtr = 1 - shiftCount;
 347
 348 }
 349
 350 /*
 351 -------------------------------------------------------------------------------
 352 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 353 double-precision floating-point value, returning the result.  After being
 354 shifted into the proper positions, the three fields are simply added
 355 together to form the result.  This means that any integer portion of `zSig'
 356 will be added into the exponent.  Since a properly normalized significand
 357 will have an integer portion equal to 1, the `zExp' input should be 1 less
 358 than the desired result exponent whenever `zSig' is a complete, normalized
 359 significand.
 360 -------------------------------------------------------------------------------
 361 */
 362 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 363 {
 364
 365     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 366
 367 }
 368
 369 /*
 370 -------------------------------------------------------------------------------
 371 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 372 and significand `zSig', and returns the proper double-precision floating-
 373 point value corresponding to the abstract input.  Ordinarily, the abstract
 374 value is simply rounded and packed into the double-precision format, with
 375 the inexact exception raised if the abstract input cannot be represented
 376 exactly.  If the abstract value is too large, however, the overflow and
 377 inexact exceptions are raised and an infinity or maximal finite value is
 378 returned.  If the abstract value is too small, the input value is rounded to
 379 a subnormal number, and the underflow and inexact exceptions are raised if
 380 the abstract input cannot be represented exactly as a subnormal double-
 381 precision floating-point number.
 382     The input significand `zSig' has its binary point between bits 62
 383 and 61, which is 10 bits to the left of the usual location.  This shifted
 384 significand must be normalized or smaller.  If `zSig' is not normalized,
 385 `zExp' must be 0; in that case, the result returned is a subnormal number,
 386 and it must not require rounding.  In the usual case that `zSig' is
 387 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 388 The handling of underflow and overflow follows the IEC/IEEE Standard for
 389 Binary Floating-point Arithmetic.
 390 -------------------------------------------------------------------------------
 391 */
 392 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 393 {
 394     int8 roundingMode;
 395     flag roundNearestEven;
 396     int16 roundIncrement, roundBits;
 397     flag isTiny;
 398
 399     roundingMode = float_rounding_mode;
 400     roundNearestEven = ( roundingMode == float_round_nearest_even );
 401     roundIncrement = 0x200;
 402     if ( ! roundNearestEven ) {
 403         if ( roundingMode == float_round_to_zero ) {
 404             roundIncrement = 0;
 405         }
 406         else {
 407             roundIncrement = 0x3FF;
 408             if ( zSign ) {
 409                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 410             }
 411             else {
 412                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 413             }
 414         }
 415     }
 416     roundBits = zSig & 0x3FF;
 417     if ( 0x7FD <= (bits16) zExp ) {
 418         if (    ( 0x7FD < zExp )
 419              || (    ( zExp == 0x7FD )
 420                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 421            ) {
 422             //register int lr = __builtin_return_address(0);
 423             //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
 424             float_raise( float_flag_overflow | float_flag_inexact );
 425             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 426         }
 427         if ( zExp < 0 ) {
 428             isTiny =
 429                    ( float_detect_tininess == float_tininess_before_rounding )
 430                 || ( zExp < -1 )
 431                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 432             shift64RightJamming( zSig, - zExp, &zSig );
 433             zExp = 0;
 434             roundBits = zSig & 0x3FF;
 435             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 436         }
 437     }
 438     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 439     zSig = ( zSig + roundIncrement )>>10;
 440     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 441     if ( zSig == 0 ) zExp = 0;
 442     return packFloat64( zSign, zExp, zSig );
 443
 444 }
 445
 446 /*
 447 -------------------------------------------------------------------------------
 448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 449 and significand `zSig', and returns the proper double-precision floating-
 450 point value corresponding to the abstract input.  This routine is just like
 451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
 452 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 453 point exponent.
 454 -------------------------------------------------------------------------------
 455 */
 456 static float64
 457  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 458 {
 459     int8 shiftCount;
 460
 461     shiftCount = countLeadingZeros64( zSig ) - 1;
 462     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 463
 464 }
 465
 466 #ifdef FLOATX80
 467
 468 /*
 469 -------------------------------------------------------------------------------
 470 Returns the fraction bits of the extended double-precision floating-point
 471 value `a'.
 472 -------------------------------------------------------------------------------
 473 */
 474 INLINE bits64 extractFloatx80Frac( floatx80 a )
 475 {
 476
 477     return a.low;
 478
 479 }
 480
 481 /*
 482 -------------------------------------------------------------------------------
 483 Returns the exponent bits of the extended double-precision floating-point
 484 value `a'.
 485 -------------------------------------------------------------------------------
 486 */
 487 INLINE int32 extractFloatx80Exp( floatx80 a )
 488 {
 489
 490     return a.high & 0x7FFF;
 491
 492 }
 493
 494 /*
 495 -------------------------------------------------------------------------------
 496 Returns the sign bit of the extended double-precision floating-point value
 497 `a'.
 498 -------------------------------------------------------------------------------
 499 */
 500 INLINE flag extractFloatx80Sign( floatx80 a )
 501 {
 502
 503     return a.high>>15;
 504
 505 }
 506
 507 /*
 508 -------------------------------------------------------------------------------
 509 Normalizes the subnormal extended double-precision floating-point value
 510 represented by the denormalized significand `aSig'.  The normalized exponent
 511 and significand are stored at the locations pointed to by `zExpPtr' and
 512 `zSigPtr', respectively.
 513 -------------------------------------------------------------------------------
 514 */
 515 static void
 516  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 517 {
 518     int8 shiftCount;
 519
 520     shiftCount = countLeadingZeros64( aSig );
 521     *zSigPtr = aSig<<shiftCount;
 522     *zExpPtr = 1 - shiftCount;
 523
 524 }
 525
 526 /*
 527 -------------------------------------------------------------------------------
 528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 529 extended double-precision floating-point value, returning the result.
 530 -------------------------------------------------------------------------------
 531 */
 532 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 533 {
 534     floatx80 z;
 535
 536     z.low = zSig;
 537     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 538     return z;
 539
 540 }
 541
 542 /*
 543 -------------------------------------------------------------------------------
 544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
 546 and returns the proper extended double-precision floating-point value
 547 corresponding to the abstract input.  Ordinarily, the abstract value is
 548 rounded and packed into the extended double-precision format, with the
 549 inexact exception raised if the abstract input cannot be represented
 550 exactly.  If the abstract value is too large, however, the overflow and
 551 inexact exceptions are raised and an infinity or maximal finite value is
 552 returned.  If the abstract value is too small, the input value is rounded to
 553 a subnormal number, and the underflow and inexact exceptions are raised if
 554 the abstract input cannot be represented exactly as a subnormal extended
 555 double-precision floating-point number.
 556     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 557 number of bits as single or double precision, respectively.  Otherwise, the
 558 result is rounded to the full precision of the extended double-precision
 559 format.
 560     The input significand must be normalized or smaller.  If the input
 561 significand is not normalized, `zExp' must be 0; in that case, the result
 562 returned is a subnormal number, and it must not require rounding.  The
 563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 564 Floating-point Arithmetic.
 565 -------------------------------------------------------------------------------
 566 */
 567 static floatx80
 568  roundAndPackFloatx80(
 569      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 570  )
 571 {
 572     int8 roundingMode;
 573     flag roundNearestEven, increment, isTiny;
 574     int64 roundIncrement, roundMask, roundBits;
 575
 576     roundingMode = float_rounding_mode;
 577     roundNearestEven = ( roundingMode == float_round_nearest_even );
 578     if ( roundingPrecision == 80 ) goto precision80;
 579     if ( roundingPrecision == 64 ) {
 580         roundIncrement = LIT64( 0x0000000000000400 );
 581         roundMask = LIT64( 0x00000000000007FF );
 582     }
 583     else if ( roundingPrecision == 32 ) {
 584         roundIncrement = LIT64( 0x0000008000000000 );
 585         roundMask = LIT64( 0x000000FFFFFFFFFF );
 586     }
 587     else {
 588         goto precision80;
 589     }
 590     zSig0 |= ( zSig1 != 0 );
 591     if ( ! roundNearestEven ) {
 592         if ( roundingMode == float_round_to_zero ) {
 593             roundIncrement = 0;
 594         }
 595         else {
 596             roundIncrement = roundMask;
 597             if ( zSign ) {
 598                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 599             }
 600             else {
 601                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 602             }
 603         }
 604     }
 605     roundBits = zSig0 & roundMask;
 606     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 607         if (    ( 0x7FFE < zExp )
 608              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 609            ) {
 610             goto overflow;
 611         }
 612         if ( zExp <= 0 ) {
 613             isTiny =
 614                    ( float_detect_tininess == float_tininess_before_rounding )
 615                 || ( zExp < 0 )
 616                 || ( zSig0 <= zSig0 + roundIncrement );
 617             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 618             zExp = 0;
 619             roundBits = zSig0 & roundMask;
 620             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 621             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 622             zSig0 += roundIncrement;
 623             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 624             roundIncrement = roundMask + 1;
 625             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 626                 roundMask |= roundIncrement;
 627             }
 628             zSig0 &= ~ roundMask;
 629             return packFloatx80( zSign, zExp, zSig0 );
 630         }
 631     }
 632     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 633     zSig0 += roundIncrement;
 634     if ( zSig0 < roundIncrement ) {
 635         ++zExp;
 636         zSig0 = LIT64( 0x8000000000000000 );
 637     }
 638     roundIncrement = roundMask + 1;
 639     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 640         roundMask |= roundIncrement;
 641     }
 642     zSig0 &= ~ roundMask;
 643     if ( zSig0 == 0 ) zExp = 0;
 644     return packFloatx80( zSign, zExp, zSig0 );
 645  precision80:
 646     increment = ( (sbits64) zSig1 < 0 );
 647     if ( ! roundNearestEven ) {
 648         if ( roundingMode == float_round_to_zero ) {
 649             increment = 0;
 650         }
 651         else {
 652             if ( zSign ) {
 653                 increment = ( roundingMode == float_round_down ) && zSig1;
 654             }
 655             else {
 656                 increment = ( roundingMode == float_round_up ) && zSig1;
 657             }
 658         }
 659     }
 660     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 661         if (    ( 0x7FFE < zExp )
 662              || (    ( zExp == 0x7FFE )
 663                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 664                   && increment
 665                 )
 666            ) {
 667             roundMask = 0;
 668  overflow:
 669             float_raise( float_flag_overflow | float_flag_inexact );
 670             if (    ( roundingMode == float_round_to_zero )
 671                  || ( zSign && ( roundingMode == float_round_up ) )
 672                  || ( ! zSign && ( roundingMode == float_round_down ) )
 673                ) {
 674                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 675             }
 676             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 677         }
 678         if ( zExp <= 0 ) {
 679             isTiny =
 680                    ( float_detect_tininess == float_tininess_before_rounding )
 681                 || ( zExp < 0 )
 682                 || ! increment
 683                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 684             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 685             zExp = 0;
 686             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 687             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 688             if ( roundNearestEven ) {
 689                 increment = ( (sbits64) zSig1 < 0 );
 690             }
 691             else {
 692                 if ( zSign ) {
 693                     increment = ( roundingMode == float_round_down ) && zSig1;
 694                 }
 695                 else {
 696                     increment = ( roundingMode == float_round_up ) && zSig1;
 697                 }
 698             }
 699             if ( increment ) {
 700                 ++zSig0;
 701                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 702                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 703             }
 704             return packFloatx80( zSign, zExp, zSig0 );
 705         }
 706     }
 707     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 708     if ( increment ) {
 709         ++zSig0;
 710         if ( zSig0 == 0 ) {
 711             ++zExp;
 712             zSig0 = LIT64( 0x8000000000000000 );
 713         }
 714         else {
 715             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
 716         }
 717     }
 718     else {
 719         if ( zSig0 == 0 ) zExp = 0;
 720     }
 721
 722     return packFloatx80( zSign, zExp, zSig0 );
 723 }
 724
 725 /*
 726 -------------------------------------------------------------------------------
 727 Takes an abstract floating-point value having sign `zSign', exponent
 728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 729 and returns the proper extended double-precision floating-point value
 730 corresponding to the abstract input.  This routine is just like
 731 `roundAndPackFloatx80' except that the input significand does not have to be
 732 normalized.
 733 -------------------------------------------------------------------------------
 734 */
 735 static floatx80
 736  normalizeRoundAndPackFloatx80(
 737      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 738  )
 739 {
 740     int8 shiftCount;
 741
 742     if ( zSig0 == 0 ) {
 743         zSig0 = zSig1;
 744         zSig1 = 0;
 745         zExp -= 64;
 746     }
 747     shiftCount = countLeadingZeros64( zSig0 );
 748     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 749     zExp -= shiftCount;
 750     return
 751         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 752
 753 }
 754
 755 #endif
 756
 757 /*
 758 -------------------------------------------------------------------------------
 759 Returns the result of converting the 32-bit two's complement integer `a' to
 760 the single-precision floating-point format.  The conversion is performed
 761 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 762 -------------------------------------------------------------------------------
 763 */
 764 float32 int32_to_float32( int32 a )
 765 {
 766     flag zSign;
 767
 768     if ( a == 0 ) return 0;
 769     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
 770     zSign = ( a < 0 );
 771     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
 772
 773 }
 774
 775 /*
 776 -------------------------------------------------------------------------------
 777 Returns the result of converting the 32-bit two's complement integer `a' to
 778 the double-precision floating-point format.  The conversion is performed
 779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
 780 -------------------------------------------------------------------------------
 781 */
 782 float64 int32_to_float64( int32 a )
 783 {
 784     flag aSign;
 785     uint32 absA;
 786     int8 shiftCount;
 787     bits64 zSig;
 788
 789     if ( a == 0 ) return 0;
 790     aSign = ( a < 0 );
 791     absA = aSign ? - a : a;
 792     shiftCount = countLeadingZeros32( absA ) + 21;
 793     zSig = absA;
 794     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
 795
 796 }
 797
 798 #ifdef FLOATX80
 799
 800 /*
 801 -------------------------------------------------------------------------------
 802 Returns the result of converting the 32-bit two's complement integer `a'
 803 to the extended double-precision floating-point format.  The conversion
 804 is performed according to the IEC/IEEE Standard for Binary Floating-point
 805 Arithmetic.
 806 -------------------------------------------------------------------------------
 807 */
 808 floatx80 int32_to_floatx80( int32 a )
 809 {
 810     flag zSign;
 811     uint32 absA;
 812     int8 shiftCount;
 813     bits64 zSig;
 814
 815     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 816     zSign = ( a < 0 );
 817     absA = zSign ? - a : a;
 818     shiftCount = countLeadingZeros32( absA ) + 32;
 819     zSig = absA;
 820     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 821
 822 }
 823
 824 #endif
 825
 826 /*
 827 -------------------------------------------------------------------------------
 828 Returns the result of converting the single-precision floating-point value
 829 `a' to the 32-bit two's complement integer format.  The conversion is
 830 performed according to the IEC/IEEE Standard for Binary Floating-point
 831 Arithmetic---which means in particular that the conversion is rounded
 832 according to the current rounding mode.  If `a' is a NaN, the largest
 833 positive integer is returned.  Otherwise, if the conversion overflows, the
 834 largest integer with the same sign as `a' is returned.
 835 -------------------------------------------------------------------------------
 836 */
 837 int32 float32_to_int32( float32 a )
 838 {
 839     flag aSign;
 840     int16 aExp, shiftCount;
 841     bits32 aSig;
 842     bits64 zSig;
 843
 844     aSig = extractFloat32Frac( a );
 845     aExp = extractFloat32Exp( a );
 846     aSign = extractFloat32Sign( a );
 847     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
 848     if ( aExp ) aSig |= 0x00800000;
 849     shiftCount = 0xAF - aExp;
 850     zSig = aSig;
 851     zSig <<= 32;
 852     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
 853     return roundAndPackInt32( aSign, zSig );
 854
 855 }
 856
 857 /*
 858 -------------------------------------------------------------------------------
 859 Returns the result of converting the single-precision floating-point value
 860 `a' to the 32-bit two's complement integer format.  The conversion is
 861 performed according to the IEC/IEEE Standard for Binary Floating-point
 862 Arithmetic, except that the conversion is always rounded toward zero.  If
 863 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
 864 conversion overflows, the largest integer with the same sign as `a' is
 865 returned.
 866 -------------------------------------------------------------------------------
 867 */
 868 int32 float32_to_int32_round_to_zero( float32 a )
 869 {
 870     flag aSign;
 871     int16 aExp, shiftCount;
 872     bits32 aSig;
 873     int32 z;
 874
 875     aSig = extractFloat32Frac( a );
 876     aExp = extractFloat32Exp( a );
 877     aSign = extractFloat32Sign( a );
 878     shiftCount = aExp - 0x9E;
 879     if ( 0 <= shiftCount ) {
 880         if ( a == 0xCF000000 ) return 0x80000000;
 881         float_raise( float_flag_invalid );
 882         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
 883         return 0x80000000;
 884     }
 885     else if ( aExp <= 0x7E ) {
 886         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
 887         return 0;
 888     }
 889     aSig = ( aSig | 0x00800000 )<<8;
 890     z = aSig>>( - shiftCount );
 891     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
 892         float_exception_flags |= float_flag_inexact;
 893     }
 894     return aSign ? - z : z;
 895
 896 }
 897
 898 /*
 899 -------------------------------------------------------------------------------
 900 Returns the result of converting the single-precision floating-point value
 901 `a' to the double-precision floating-point format.  The conversion is
 902 performed according to the IEC/IEEE Standard for Binary Floating-point
 903 Arithmetic.
 904 -------------------------------------------------------------------------------
 905 */
 906 float64 float32_to_float64( float32 a )
 907 {
 908     flag aSign;
 909     int16 aExp;
 910     bits32 aSig;
 911
 912     aSig = extractFloat32Frac( a );
 913     aExp = extractFloat32Exp( a );
 914     aSign = extractFloat32Sign( a );
 915     if ( aExp == 0xFF ) {
 916         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
 917         return packFloat64( aSign, 0x7FF, 0 );
 918     }
 919     if ( aExp == 0 ) {
 920         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
 921         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 922         --aExp;
 923     }
 924     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
 925
 926 }
 927
 928 #ifdef FLOATX80
 929
 930 /*
 931 -------------------------------------------------------------------------------
 932 Returns the result of converting the single-precision floating-point value
 933 `a' to the extended double-precision floating-point format.  The conversion
 934 is performed according to the IEC/IEEE Standard for Binary Floating-point
 935 Arithmetic.
 936 -------------------------------------------------------------------------------
 937 */
 938 floatx80 float32_to_floatx80( float32 a )
 939 {
 940     flag aSign;
 941     int16 aExp;
 942     bits32 aSig;
 943
 944     aSig = extractFloat32Frac( a );
 945     aExp = extractFloat32Exp( a );
 946     aSign = extractFloat32Sign( a );
 947     if ( aExp == 0xFF ) {
 948         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 949         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 950     }
 951     if ( aExp == 0 ) {
 952         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 953         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 954     }
 955     aSig |= 0x00800000;
 956     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 957
 958 }
 959
 960 #endif
 961
 962 /*
 963 -------------------------------------------------------------------------------
 964 Rounds the single-precision floating-point value `a' to an integer, and
 965 returns the result as a single-precision floating-point value.  The
 966 operation is performed according to the IEC/IEEE Standard for Binary
 967 Floating-point Arithmetic.
 968 -------------------------------------------------------------------------------
 969 */
 970 float32 float32_round_to_int( float32 a )
 971 {
 972     flag aSign;
 973     int16 aExp;
 974     bits32 lastBitMask, roundBitsMask;
 975     int8 roundingMode;
 976     float32 z;
 977
 978     aExp = extractFloat32Exp( a );
 979     if ( 0x96 <= aExp ) {
 980         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
 981             return propagateFloat32NaN( a, a );
 982         }
 983         return a;
 984     }
 985     if ( aExp <= 0x7E ) {
 986         if ( (bits32) ( a<<1 ) == 0 ) return a;
 987         float_exception_flags |= float_flag_inexact;
 988         aSign = extractFloat32Sign( a );
 989         switch ( float_rounding_mode ) {
 990          case float_round_nearest_even:
 991             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
 992                 return packFloat32( aSign, 0x7F, 0 );
 993             }
 994             break;
 995          case float_round_down:
 996             return aSign ? 0xBF800000 : 0;
 997          case float_round_up:
 998             return aSign ? 0x80000000 : 0x3F800000;
 999         }
1000         return packFloat32( aSign, 0, 0 );
1001     }
1002     lastBitMask = 1;
1003     lastBitMask <<= 0x96 - aExp;
1004     roundBitsMask = lastBitMask - 1;
1005     z = a;
1006     roundingMode = float_rounding_mode;
1007     if ( roundingMode == float_round_nearest_even ) {
1008         z += lastBitMask>>1;
1009         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1010     }
1011     else if ( roundingMode != float_round_to_zero ) {
1012         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1013             z += roundBitsMask;
1014         }
1015     }
1016     z &= ~ roundBitsMask;
1017     if ( z != a ) float_exception_flags |= float_flag_inexact;
1018     return z;
1019
1020 }
1021
1022 /*
1023 -------------------------------------------------------------------------------
1024 Returns the result of adding the absolute values of the single-precision
1025 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1026 before being returned.  `zSign' is ignored if the result is a NaN.  The
1027 addition is performed according to the IEC/IEEE Standard for Binary
1028 Floating-point Arithmetic.
1029 -------------------------------------------------------------------------------
1030 */
1031 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1032 {
1033     int16 aExp, bExp, zExp;
1034     bits32 aSig, bSig, zSig;
1035     int16 expDiff;
1036
1037     aSig = extractFloat32Frac( a );
1038     aExp = extractFloat32Exp( a );
1039     bSig = extractFloat32Frac( b );
1040     bExp = extractFloat32Exp( b );
1041     expDiff = aExp - bExp;
1042     aSig <<= 6;
1043     bSig <<= 6;
1044     if ( 0 < expDiff ) {
1045         if ( aExp == 0xFF ) {
1046             if ( aSig ) return propagateFloat32NaN( a, b );
1047             return a;
1048         }
1049         if ( bExp == 0 ) {
1050             --expDiff;
1051         }
1052         else {
1053             bSig |= 0x20000000;
1054         }
1055         shift32RightJamming( bSig, expDiff, &bSig );
1056         zExp = aExp;
1057     }
1058     else if ( expDiff < 0 ) {
1059         if ( bExp == 0xFF ) {
1060             if ( bSig ) return propagateFloat32NaN( a, b );
1061             return packFloat32( zSign, 0xFF, 0 );
1062         }
1063         if ( aExp == 0 ) {
1064             ++expDiff;
1065         }
1066         else {
1067             aSig |= 0x20000000;
1068         }
1069         shift32RightJamming( aSig, - expDiff, &aSig );
1070         zExp = bExp;
1071     }
1072     else {
1073         if ( aExp == 0xFF ) {
1074             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1075             return a;
1076         }
1077         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1078         zSig = 0x40000000 + aSig + bSig;
1079         zExp = aExp;
1080         goto roundAndPack;
1081     }
1082     aSig |= 0x20000000;
1083     zSig = ( aSig + bSig )<<1;
1084     --zExp;
1085     if ( (sbits32) zSig < 0 ) {
1086         zSig = aSig + bSig;
1087         ++zExp;
1088     }
1089  roundAndPack:
1090     return roundAndPackFloat32( zSign, zExp, zSig );
1091
1092 }
1093
1094 /*
1095 -------------------------------------------------------------------------------
1096 Returns the result of subtracting the absolute values of the single-
1097 precision floating-point values `a' and `b'.  If `zSign' is true, the
1098 difference is negated before being returned.  `zSign' is ignored if the
1099 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1100 Standard for Binary Floating-point Arithmetic.
1101 -------------------------------------------------------------------------------
1102 */
1103 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1104 {
1105     int16 aExp, bExp, zExp;
1106     bits32 aSig, bSig, zSig;
1107     int16 expDiff;
1108
1109     aSig = extractFloat32Frac( a );
1110     aExp = extractFloat32Exp( a );
1111     bSig = extractFloat32Frac( b );
1112     bExp = extractFloat32Exp( b );
1113     expDiff = aExp - bExp;
1114     aSig <<= 7;
1115     bSig <<= 7;
1116     if ( 0 < expDiff ) goto aExpBigger;
1117     if ( expDiff < 0 ) goto bExpBigger;
1118     if ( aExp == 0xFF ) {
1119         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1120         float_raise( float_flag_invalid );
1121         return float32_default_nan;
1122     }
1123     if ( aExp == 0 ) {
1124         aExp = 1;
1125         bExp = 1;
1126     }
1127     if ( bSig < aSig ) goto aBigger;
1128     if ( aSig < bSig ) goto bBigger;
1129     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1130  bExpBigger:
1131     if ( bExp == 0xFF ) {
1132         if ( bSig ) return propagateFloat32NaN( a, b );
1133         return packFloat32( zSign ^ 1, 0xFF, 0 );
1134     }
1135     if ( aExp == 0 ) {
1136         ++expDiff;
1137     }
1138     else {
1139         aSig |= 0x40000000;
1140     }
1141     shift32RightJamming( aSig, - expDiff, &aSig );
1142     bSig |= 0x40000000;
1143  bBigger:
1144     zSig = bSig - aSig;
1145     zExp = bExp;
1146     zSign ^= 1;
1147     goto normalizeRoundAndPack;
1148  aExpBigger:
1149     if ( aExp == 0xFF ) {
1150         if ( aSig ) return propagateFloat32NaN( a, b );
1151         return a;
1152     }
1153     if ( bExp == 0 ) {
1154         --expDiff;
1155     }
1156     else {
1157         bSig |= 0x40000000;
1158     }
1159     shift32RightJamming( bSig, expDiff, &bSig );
1160     aSig |= 0x40000000;
1161  aBigger:
1162     zSig = aSig - bSig;
1163     zExp = aExp;
1164  normalizeRoundAndPack:
1165     --zExp;
1166     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1167
1168 }
1169
1170 /*
1171 -------------------------------------------------------------------------------
1172 Returns the result of adding the single-precision floating-point values `a'
1173 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1174 Binary Floating-point Arithmetic.
1175 -------------------------------------------------------------------------------
1176 */
1177 float32 float32_add( float32 a, float32 b )
1178 {
1179     flag aSign, bSign;
1180
1181     aSign = extractFloat32Sign( a );
1182     bSign = extractFloat32Sign( b );
1183     if ( aSign == bSign ) {
1184         return addFloat32Sigs( a, b, aSign );
1185     }
1186     else {
1187         return subFloat32Sigs( a, b, aSign );
1188     }
1189
1190 }
1191
1192 /*
1193 -------------------------------------------------------------------------------
1194 Returns the result of subtracting the single-precision floating-point values
1195 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1196 for Binary Floating-point Arithmetic.
1197 -------------------------------------------------------------------------------
1198 */
1199 float32 float32_sub( float32 a, float32 b )
1200 {
1201     flag aSign, bSign;
1202
1203     aSign = extractFloat32Sign( a );
1204     bSign = extractFloat32Sign( b );
1205     if ( aSign == bSign ) {
1206         return subFloat32Sigs( a, b, aSign );
1207     }
1208     else {
1209         return addFloat32Sigs( a, b, aSign );
1210     }
1211
1212 }
1213
1214 /*
1215 -------------------------------------------------------------------------------
1216 Returns the result of multiplying the single-precision floating-point values
1217 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1218 for Binary Floating-point Arithmetic.
1219 -------------------------------------------------------------------------------
1220 */
1221 float32 float32_mul( float32 a, float32 b )
1222 {
1223     flag aSign, bSign, zSign;
1224     int16 aExp, bExp, zExp;
1225     bits32 aSig, bSig;
1226     bits64 zSig64;
1227     bits32 zSig;
1228
1229     aSig = extractFloat32Frac( a );
1230     aExp = extractFloat32Exp( a );
1231     aSign = extractFloat32Sign( a );
1232     bSig = extractFloat32Frac( b );
1233     bExp = extractFloat32Exp( b );
1234     bSign = extractFloat32Sign( b );
1235     zSign = aSign ^ bSign;
1236     if ( aExp == 0xFF ) {
1237         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1238             return propagateFloat32NaN( a, b );
1239         }
1240         if ( ( bExp | bSig ) == 0 ) {
1241             float_raise( float_flag_invalid );
1242             return float32_default_nan;
1243         }
1244         return packFloat32( zSign, 0xFF, 0 );
1245     }
1246     if ( bExp == 0xFF ) {
1247         if ( bSig ) return propagateFloat32NaN( a, b );
1248         if ( ( aExp | aSig ) == 0 ) {
1249             float_raise( float_flag_invalid );
1250             return float32_default_nan;
1251         }
1252         return packFloat32( zSign, 0xFF, 0 );
1253     }
1254     if ( aExp == 0 ) {
1255         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1256         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1257     }
1258     if ( bExp == 0 ) {
1259         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1260         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1261     }
1262     zExp = aExp + bExp - 0x7F;
1263     aSig = ( aSig | 0x00800000 )<<7;
1264     bSig = ( bSig | 0x00800000 )<<8;
1265     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1266     zSig = zSig64;
1267     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1268         zSig <<= 1;
1269         --zExp;
1270     }
1271     return roundAndPackFloat32( zSign, zExp, zSig );
1272
1273 }
1274
1275 /*
1276 -------------------------------------------------------------------------------
1277 Returns the result of dividing the single-precision floating-point value `a'
1278 by the corresponding value `b'.  The operation is performed according to the
1279 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1280 -------------------------------------------------------------------------------
1281 */
1282 float32 float32_div( float32 a, float32 b )
1283 {
1284     flag aSign, bSign, zSign;
1285     int16 aExp, bExp, zExp;
1286     bits32 aSig, bSig, zSig;
1287
1288     aSig = extractFloat32Frac( a );
1289     aExp = extractFloat32Exp( a );
1290     aSign = extractFloat32Sign( a );
1291     bSig = extractFloat32Frac( b );
1292     bExp = extractFloat32Exp( b );
1293     bSign = extractFloat32Sign( b );
1294     zSign = aSign ^ bSign;
1295     if ( aExp == 0xFF ) {
1296         if ( aSig ) return propagateFloat32NaN( a, b );
1297         if ( bExp == 0xFF ) {
1298             if ( bSig ) return propagateFloat32NaN( a, b );
1299             float_raise( float_flag_invalid );
1300             return float32_default_nan;
1301         }
1302         return packFloat32( zSign, 0xFF, 0 );
1303     }
1304     if ( bExp == 0xFF ) {
1305         if ( bSig ) return propagateFloat32NaN( a, b );
1306         return packFloat32( zSign, 0, 0 );
1307     }
1308     if ( bExp == 0 ) {
1309         if ( bSig == 0 ) {
1310             if ( ( aExp | aSig ) == 0 ) {
1311                 float_raise( float_flag_invalid );
1312                 return float32_default_nan;
1313             }
1314             float_raise( float_flag_divbyzero );
1315             return packFloat32( zSign, 0xFF, 0 );
1316         }
1317         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1318     }
1319     if ( aExp == 0 ) {
1320         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1321         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1322     }
1323     zExp = aExp - bExp + 0x7D;
1324     aSig = ( aSig | 0x00800000 )<<7;
1325     bSig = ( bSig | 0x00800000 )<<8;
1326     if ( bSig <= ( aSig + aSig ) ) {
1327         aSig >>= 1;
1328         ++zExp;
1329     }
1330     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1331     if ( ( zSig & 0x3F ) == 0 ) {
1332         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1333     }
1334     return roundAndPackFloat32( zSign, zExp, zSig );
1335
1336 }
1337
1338 /*
1339 -------------------------------------------------------------------------------
1340 Returns the remainder of the single-precision floating-point value `a'
1341 with respect to the corresponding value `b'.  The operation is performed
1342 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1343 -------------------------------------------------------------------------------
1344 */
1345 float32 float32_rem( float32 a, float32 b )
1346 {
1347     flag aSign, bSign, zSign;
1348     int16 aExp, bExp, expDiff;
1349     bits32 aSig, bSig;
1350     bits32 q;
1351     bits64 aSig64, bSig64, q64;
1352     bits32 alternateASig;
1353     sbits32 sigMean;
1354
1355     aSig = extractFloat32Frac( a );
1356     aExp = extractFloat32Exp( a );
1357     aSign = extractFloat32Sign( a );
1358     bSig = extractFloat32Frac( b );
1359     bExp = extractFloat32Exp( b );
1360     bSign = extractFloat32Sign( b );
1361     if ( aExp == 0xFF ) {
1362         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1363             return propagateFloat32NaN( a, b );
1364         }
1365         float_raise( float_flag_invalid );
1366         return float32_default_nan;
1367     }
1368     if ( bExp == 0xFF ) {
1369         if ( bSig ) return propagateFloat32NaN( a, b );
1370         return a;
1371     }
1372     if ( bExp == 0 ) {
1373         if ( bSig == 0 ) {
1374             float_raise( float_flag_invalid );
1375             return float32_default_nan;
1376         }
1377         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1378     }
1379     if ( aExp == 0 ) {
1380         if ( aSig == 0 ) return a;
1381         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1382     }
1383     expDiff = aExp - bExp;
1384     aSig |= 0x00800000;
1385     bSig |= 0x00800000;
1386     if ( expDiff < 32 ) {
1387         aSig <<= 8;
1388         bSig <<= 8;
1389         if ( expDiff < 0 ) {
1390             if ( expDiff < -1 ) return a;
1391             aSig >>= 1;
1392         }
1393         q = ( bSig <= aSig );
1394         if ( q ) aSig -= bSig;
1395         if ( 0 < expDiff ) {
1396             q = ( ( (bits64) aSig )<<32 ) / bSig;
1397             q >>= 32 - expDiff;
1398             bSig >>= 2;
1399             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1400         }
1401         else {
1402             aSig >>= 2;
1403             bSig >>= 2;
1404         }
1405     }
1406     else {
1407         if ( bSig <= aSig ) aSig -= bSig;
1408         aSig64 = ( (bits64) aSig )<<40;
1409         bSig64 = ( (bits64) bSig )<<40;
1410         expDiff -= 64;
1411         while ( 0 < expDiff ) {
1412             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1413             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1414             aSig64 = - ( ( bSig * q64 )<<38 );
1415             expDiff -= 62;
1416         }
1417         expDiff += 64;
1418         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1419         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1420         q = q64>>( 64 - expDiff );
1421         bSig <<= 6;
1422         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1423     }
1424     do {
1425         alternateASig = aSig;
1426         ++q;
1427         aSig -= bSig;
1428     } while ( 0 <= (sbits32) aSig );
1429     sigMean = aSig + alternateASig;
1430     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1431         aSig = alternateASig;
1432     }
1433     zSign = ( (sbits32) aSig < 0 );
1434     if ( zSign ) aSig = - aSig;
1435     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1436
1437 }
1438
1439 /*
1440 -------------------------------------------------------------------------------
1441 Returns the square root of the single-precision floating-point value `a'.
1442 The operation is performed according to the IEC/IEEE Standard for Binary
1443 Floating-point Arithmetic.
1444 -------------------------------------------------------------------------------
1445 */
1446 float32 float32_sqrt( float32 a )
1447 {
1448     flag aSign;
1449     int16 aExp, zExp;
1450     bits32 aSig, zSig;
1451     bits64 rem, term;
1452
1453     aSig = extractFloat32Frac( a );
1454     aExp = extractFloat32Exp( a );
1455     aSign = extractFloat32Sign( a );
1456     if ( aExp == 0xFF ) {
1457         if ( aSig ) return propagateFloat32NaN( a, 0 );
1458         if ( ! aSign ) return a;
1459         float_raise( float_flag_invalid );
1460         return float32_default_nan;
1461     }
1462     if ( aSign ) {
1463         if ( ( aExp | aSig ) == 0 ) return a;
1464         float_raise( float_flag_invalid );
1465         return float32_default_nan;
1466     }
1467     if ( aExp == 0 ) {
1468         if ( aSig == 0 ) return 0;
1469         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1470     }
1471     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1472     aSig = ( aSig | 0x00800000 )<<8;
1473     zSig = estimateSqrt32( aExp, aSig ) + 2;
1474     if ( ( zSig & 0x7F ) <= 5 ) {
1475         if ( zSig < 2 ) {
1476             zSig = 0xFFFFFFFF;
1477         }
1478         else {
1479             aSig >>= aExp & 1;
1480             term = ( (bits64) zSig ) * zSig;
1481             rem = ( ( (bits64) aSig )<<32 ) - term;
1482             while ( (sbits64) rem < 0 ) {
1483                 --zSig;
1484                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1485             }
1486             zSig |= ( rem != 0 );
1487         }
1488     }
1489     shift32RightJamming( zSig, 1, &zSig );
1490     return roundAndPackFloat32( 0, zExp, zSig );
1491
1492 }
1493
1494 /*
1495 -------------------------------------------------------------------------------
1496 Returns 1 if the single-precision floating-point value `a' is equal to the
1497 corresponding value `b', and 0 otherwise.  The comparison is performed
1498 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1499 -------------------------------------------------------------------------------
1500 */
1501 flag float32_eq( float32 a, float32 b )
1502 {
1503
1504     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1506        ) {
1507         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1508             float_raise( float_flag_invalid );
1509         }
1510         return 0;
1511     }
1512     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1513
1514 }
1515
1516 /*
1517 -------------------------------------------------------------------------------
1518 Returns 1 if the single-precision floating-point value `a' is less than or
1519 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1520 performed according to the IEC/IEEE Standard for Binary Floating-point
1521 Arithmetic.
1522 -------------------------------------------------------------------------------
1523 */
1524 flag float32_le( float32 a, float32 b )
1525 {
1526     flag aSign, bSign;
1527
1528     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1530        ) {
1531         float_raise( float_flag_invalid );
1532         return 0;
1533     }
1534     aSign = extractFloat32Sign( a );
1535     bSign = extractFloat32Sign( b );
1536     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1537     return ( a == b ) || ( aSign ^ ( a < b ) );
1538
1539 }
1540
1541 /*
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is less than
1544 the corresponding value `b', and 0 otherwise.  The comparison is performed
1545 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1546 -------------------------------------------------------------------------------
1547 */
1548 flag float32_lt( float32 a, float32 b )
1549 {
1550     flag aSign, bSign;
1551
1552     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1554        ) {
1555         float_raise( float_flag_invalid );
1556         return 0;
1557     }
1558     aSign = extractFloat32Sign( a );
1559     bSign = extractFloat32Sign( b );
1560     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1561     return ( a != b ) && ( aSign ^ ( a < b ) );
1562
1563 }
1564
1565 /*
1566 -------------------------------------------------------------------------------
1567 Returns 1 if the single-precision floating-point value `a' is equal to the
1568 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1569 if either operand is a NaN.  Otherwise, the comparison is performed
1570 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1571 -------------------------------------------------------------------------------
1572 */
1573 flag float32_eq_signaling( float32 a, float32 b )
1574 {
1575
1576     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1577          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1578        ) {
1579         float_raise( float_flag_invalid );
1580         return 0;
1581     }
1582     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1583
1584 }
1585
1586 /*
1587 -------------------------------------------------------------------------------
1588 Returns 1 if the single-precision floating-point value `a' is less than or
1589 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1590 cause an exception.  Otherwise, the comparison is performed according to the
1591 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1592 -------------------------------------------------------------------------------
1593 */
1594 flag float32_le_quiet( float32 a, float32 b )
1595 {
1596     flag aSign, bSign;
1597     //int16 aExp, bExp;
1598
1599     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1600          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1601        ) {
1602         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1603             float_raise( float_flag_invalid );
1604         }
1605         return 0;
1606     }
1607     aSign = extractFloat32Sign( a );
1608     bSign = extractFloat32Sign( b );
1609     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1610     return ( a == b ) || ( aSign ^ ( a < b ) );
1611
1612 }
1613
1614 /*
1615 -------------------------------------------------------------------------------
1616 Returns 1 if the single-precision floating-point value `a' is less than
1617 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1618 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1619 Standard for Binary Floating-point Arithmetic.
1620 -------------------------------------------------------------------------------
1621 */
1622 flag float32_lt_quiet( float32 a, float32 b )
1623 {
1624     flag aSign, bSign;
1625
1626     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1627          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1628        ) {
1629         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1630             float_raise( float_flag_invalid );
1631         }
1632         return 0;
1633     }
1634     aSign = extractFloat32Sign( a );
1635     bSign = extractFloat32Sign( b );
1636     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637     return ( a != b ) && ( aSign ^ ( a < b ) );
1638
1639 }
1640
1641 /*
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format.  The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode.  If `a' is a NaN, the largest
1648 positive integer is returned.  Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1651 */
1652 int32 float64_to_int32( float64 a )
1653 {
1654     flag aSign;
1655     int16 aExp, shiftCount;
1656     bits64 aSig;
1657
1658     aSig = extractFloat64Frac( a );
1659     aExp = extractFloat64Exp( a );
1660     aSign = extractFloat64Sign( a );
1661     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663     shiftCount = 0x42C - aExp;
1664     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665     return roundAndPackInt32( aSign, aSig );
1666
1667 }
1668
1669 /*
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format.  The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero.  If
1675 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1677 returned.
1678 -------------------------------------------------------------------------------
1679 */
1680 int32 float64_to_int32_round_to_zero( float64 a )
1681 {
1682     flag aSign;
1683     int16 aExp, shiftCount;
1684     bits64 aSig, savedASig;
1685     int32 z;
1686
1687     aSig = extractFloat64Frac( a );
1688     aExp = extractFloat64Exp( a );
1689     aSign = extractFloat64Sign( a );
1690     shiftCount = 0x433 - aExp;
1691     if ( shiftCount < 21 ) {
1692         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1693         goto invalid;
1694     }
1695     else if ( 52 < shiftCount ) {
1696         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1697         return 0;
1698     }
1699     aSig |= LIT64( 0x0010000000000000 );
1700     savedASig = aSig;
1701     aSig >>= shiftCount;
1702     z = aSig;
1703     if ( aSign ) z = - z;
1704     if ( ( z < 0 ) ^ aSign ) {
1705  invalid:
1706         float_exception_flags |= float_flag_invalid;
1707         return aSign ? 0x80000000 : 0x7FFFFFFF;
1708     }
1709     if ( ( aSig<<shiftCount ) != savedASig ) {
1710         float_exception_flags |= float_flag_inexact;
1711     }
1712     return z;
1713
1714 }
1715
1716 /*
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format.  The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode.  If `a' is a NaN, the largest
1723 positive integer is returned.  Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1726 */
1727 int32 float64_to_uint32( float64 a )
1728 {
1729     flag aSign;
1730     int16 aExp, shiftCount;
1731     bits64 aSig;
1732
1733     aSig = extractFloat64Frac( a );
1734     aExp = extractFloat64Exp( a );
1735     aSign = 0; //extractFloat64Sign( a );
1736     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738     shiftCount = 0x42C - aExp;
1739     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740     return roundAndPackInt32( aSign, aSig );
1741 }
1742
1743 /*
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format.  The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero.  If
1749 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1752 */
1753 int32 float64_to_uint32_round_to_zero( float64 a )
1754 {
1755     flag aSign;
1756     int16 aExp, shiftCount;
1757     bits64 aSig, savedASig;
1758     int32 z;
1759
1760     aSig = extractFloat64Frac( a );
1761     aExp = extractFloat64Exp( a );
1762     aSign = extractFloat64Sign( a );
1763     shiftCount = 0x433 - aExp;
1764     if ( shiftCount < 21 ) {
1765         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1766         goto invalid;
1767     }
1768     else if ( 52 < shiftCount ) {
1769         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1770         return 0;
1771     }
1772     aSig |= LIT64( 0x0010000000000000 );
1773     savedASig = aSig;
1774     aSig >>= shiftCount;
1775     z = aSig;
1776     if ( aSign ) z = - z;
1777     if ( ( z < 0 ) ^ aSign ) {
1778  invalid:
1779         float_exception_flags |= float_flag_invalid;
1780         return aSign ? 0x80000000 : 0x7FFFFFFF;
1781     }
1782     if ( ( aSig<<shiftCount ) != savedASig ) {
1783         float_exception_flags |= float_flag_inexact;
1784     }
1785     return z;
1786 }
1787
1788 /*
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format.  The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1793 Arithmetic.
1794 -------------------------------------------------------------------------------
1795 */
1796 float32 float64_to_float32( float64 a )
1797 {
1798     flag aSign;
1799     int16 aExp;
1800     bits64 aSig;
1801     bits32 zSig;
1802
1803     aSig = extractFloat64Frac( a );
1804     aExp = extractFloat64Exp( a );
1805     aSign = extractFloat64Sign( a );
1806     if ( aExp == 0x7FF ) {
1807         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808         return packFloat32( aSign, 0xFF, 0 );
1809     }
1810     shift64RightJamming( aSig, 22, &aSig );
1811     zSig = aSig;
1812     if ( aExp || zSig ) {
1813         zSig |= 0x40000000;
1814         aExp -= 0x381;
1815     }
1816     return roundAndPackFloat32( aSign, aExp, zSig );
1817
1818 }
1819
1820 #ifdef FLOATX80
1821
1822 /*
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format.  The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1827 Arithmetic.
1828 -------------------------------------------------------------------------------
1829 */
1830 floatx80 float64_to_floatx80( float64 a )
1831 {
1832     flag aSign;
1833     int16 aExp;
1834     bits64 aSig;
1835
1836     aSig = extractFloat64Frac( a );
1837     aExp = extractFloat64Exp( a );
1838     aSign = extractFloat64Sign( a );
1839     if ( aExp == 0x7FF ) {
1840         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1842     }
1843     if ( aExp == 0 ) {
1844         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1846     }
1847     return
1848         packFloatx80(
1849             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1850
1851 }
1852
1853 #endif
1854
1855 /*
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value.  The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1862 */
1863 float64 float64_round_to_int( float64 a )
1864 {
1865     flag aSign;
1866     int16 aExp;
1867     bits64 lastBitMask, roundBitsMask;
1868     int8 roundingMode;
1869     float64 z;
1870
1871     aExp = extractFloat64Exp( a );
1872     if ( 0x433 <= aExp ) {
1873         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874             return propagateFloat64NaN( a, a );
1875         }
1876         return a;
1877     }
1878     if ( aExp <= 0x3FE ) {
1879         if ( (bits64) ( a<<1 ) == 0 ) return a;
1880         float_exception_flags |= float_flag_inexact;
1881         aSign = extractFloat64Sign( a );
1882         switch ( float_rounding_mode ) {
1883          case float_round_nearest_even:
1884             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885                 return packFloat64( aSign, 0x3FF, 0 );
1886             }
1887             break;
1888          case float_round_down:
1889             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890          case float_round_up:
1891             return
1892             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893         }
1894         return packFloat64( aSign, 0, 0 );
1895     }
1896     lastBitMask = 1;
1897     lastBitMask <<= 0x433 - aExp;
1898     roundBitsMask = lastBitMask - 1;
1899     z = a;
1900     roundingMode = float_rounding_mode;
1901     if ( roundingMode == float_round_nearest_even ) {
1902         z += lastBitMask>>1;
1903         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1904     }
1905     else if ( roundingMode != float_round_to_zero ) {
1906         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1907             z += roundBitsMask;
1908         }
1909     }
1910     z &= ~ roundBitsMask;
1911     if ( z != a ) float_exception_flags |= float_flag_inexact;
1912     return z;
1913
1914 }
1915
1916 /*
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1920 before being returned.  `zSign' is ignored if the result is a NaN.  The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1924 */
1925 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1926 {
1927     int16 aExp, bExp, zExp;
1928     bits64 aSig, bSig, zSig;
1929     int16 expDiff;
1930
1931     aSig = extractFloat64Frac( a );
1932     aExp = extractFloat64Exp( a );
1933     bSig = extractFloat64Frac( b );
1934     bExp = extractFloat64Exp( b );
1935     expDiff = aExp - bExp;
1936     aSig <<= 9;
1937     bSig <<= 9;
1938     if ( 0 < expDiff ) {
1939         if ( aExp == 0x7FF ) {
1940             if ( aSig ) return propagateFloat64NaN( a, b );
1941             return a;
1942         }
1943         if ( bExp == 0 ) {
1944             --expDiff;
1945         }
1946         else {
1947             bSig |= LIT64( 0x2000000000000000 );
1948         }
1949         shift64RightJamming( bSig, expDiff, &bSig );
1950         zExp = aExp;
1951     }
1952     else if ( expDiff < 0 ) {
1953         if ( bExp == 0x7FF ) {
1954             if ( bSig ) return propagateFloat64NaN( a, b );
1955             return packFloat64( zSign, 0x7FF, 0 );
1956         }
1957         if ( aExp == 0 ) {
1958             ++expDiff;
1959         }
1960         else {
1961             aSig |= LIT64( 0x2000000000000000 );
1962         }
1963         shift64RightJamming( aSig, - expDiff, &aSig );
1964         zExp = bExp;
1965     }
1966     else {
1967         if ( aExp == 0x7FF ) {
1968             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1969             return a;
1970         }
1971         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1973         zExp = aExp;
1974         goto roundAndPack;
1975     }
1976     aSig |= LIT64( 0x2000000000000000 );
1977     zSig = ( aSig + bSig )<<1;
1978     --zExp;
1979     if ( (sbits64) zSig < 0 ) {
1980         zSig = aSig + bSig;
1981         ++zExp;
1982     }
1983  roundAndPack:
1984     return roundAndPackFloat64( zSign, zExp, zSig );
1985
1986 }
1987
1988 /*
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'.  If `zSign' is true, the
1992 difference is negated before being returned.  `zSign' is ignored if the
1993 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1996 */
1997 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1998 {
1999     int16 aExp, bExp, zExp;
2000     bits64 aSig, bSig, zSig;
2001     int16 expDiff;
2002
2003     aSig = extractFloat64Frac( a );
2004     aExp = extractFloat64Exp( a );
2005     bSig = extractFloat64Frac( b );
2006     bExp = extractFloat64Exp( b );
2007     expDiff = aExp - bExp;
2008     aSig <<= 10;
2009     bSig <<= 10;
2010     if ( 0 < expDiff ) goto aExpBigger;
2011     if ( expDiff < 0 ) goto bExpBigger;
2012     if ( aExp == 0x7FF ) {
2013         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014         float_raise( float_flag_invalid );
2015         return float64_default_nan;
2016     }
2017     if ( aExp == 0 ) {
2018         aExp = 1;
2019         bExp = 1;
2020     }
2021     if ( bSig < aSig ) goto aBigger;
2022     if ( aSig < bSig ) goto bBigger;
2023     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2024  bExpBigger:
2025     if ( bExp == 0x7FF ) {
2026         if ( bSig ) return propagateFloat64NaN( a, b );
2027         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2028     }
2029     if ( aExp == 0 ) {
2030         ++expDiff;
2031     }
2032     else {
2033         aSig |= LIT64( 0x4000000000000000 );
2034     }
2035     shift64RightJamming( aSig, - expDiff, &aSig );
2036     bSig |= LIT64( 0x4000000000000000 );
2037  bBigger:
2038     zSig = bSig - aSig;
2039     zExp = bExp;
2040     zSign ^= 1;
2041     goto normalizeRoundAndPack;
2042  aExpBigger:
2043     if ( aExp == 0x7FF ) {
2044         if ( aSig ) return propagateFloat64NaN( a, b );
2045         return a;
2046     }
2047     if ( bExp == 0 ) {
2048         --expDiff;
2049     }
2050     else {
2051         bSig |= LIT64( 0x4000000000000000 );
2052     }
2053     shift64RightJamming( bSig, expDiff, &bSig );
2054     aSig |= LIT64( 0x4000000000000000 );
2055  aBigger:
2056     zSig = aSig - bSig;
2057     zExp = aExp;
2058  normalizeRoundAndPack:
2059     --zExp;
2060     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2061
2062 }
2063
2064 /*
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2070 */
2071 float64 float64_add( float64 a, float64 b )
2072 {
2073     flag aSign, bSign;
2074
2075     aSign = extractFloat64Sign( a );
2076     bSign = extractFloat64Sign( b );
2077     if ( aSign == bSign ) {
2078         return addFloat64Sigs( a, b, aSign );
2079     }
2080     else {
2081         return subFloat64Sigs( a, b, aSign );
2082     }
2083
2084 }
2085
2086 /*
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2092 */
2093 float64 float64_sub( float64 a, float64 b )
2094 {
2095     flag aSign, bSign;
2096
2097     aSign = extractFloat64Sign( a );
2098     bSign = extractFloat64Sign( b );
2099     if ( aSign == bSign ) {
2100         return subFloat64Sigs( a, b, aSign );
2101     }
2102     else {
2103         return addFloat64Sigs( a, b, aSign );
2104     }
2105
2106 }
2107
2108 /*
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2114 */
2115 float64 float64_mul( float64 a, float64 b )
2116 {
2117     flag aSign, bSign, zSign;
2118     int16 aExp, bExp, zExp;
2119     bits64 aSig, bSig, zSig0, zSig1;
2120
2121     aSig = extractFloat64Frac( a );
2122     aExp = extractFloat64Exp( a );
2123     aSign = extractFloat64Sign( a );
2124     bSig = extractFloat64Frac( b );
2125     bExp = extractFloat64Exp( b );
2126     bSign = extractFloat64Sign( b );
2127     zSign = aSign ^ bSign;
2128     if ( aExp == 0x7FF ) {
2129         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130             return propagateFloat64NaN( a, b );
2131         }
2132         if ( ( bExp | bSig ) == 0 ) {
2133             float_raise( float_flag_invalid );
2134             return float64_default_nan;
2135         }
2136         return packFloat64( zSign, 0x7FF, 0 );
2137     }
2138     if ( bExp == 0x7FF ) {
2139         if ( bSig ) return propagateFloat64NaN( a, b );
2140         if ( ( aExp | aSig ) == 0 ) {
2141             float_raise( float_flag_invalid );
2142             return float64_default_nan;
2143         }
2144         return packFloat64( zSign, 0x7FF, 0 );
2145     }
2146     if ( aExp == 0 ) {
2147         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2149     }
2150     if ( bExp == 0 ) {
2151         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2153     }
2154     zExp = aExp + bExp - 0x3FF;
2155     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158     zSig0 |= ( zSig1 != 0 );
2159     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2160         zSig0 <<= 1;
2161         --zExp;
2162     }
2163     return roundAndPackFloat64( zSign, zExp, zSig0 );
2164
2165 }
2166
2167 /*
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'.  The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2173 */
2174 float64 float64_div( float64 a, float64 b )
2175 {
2176     flag aSign, bSign, zSign;
2177     int16 aExp, bExp, zExp;
2178     bits64 aSig, bSig, zSig;
2179     bits64 rem0, rem1;
2180     bits64 term0, term1;
2181
2182     aSig = extractFloat64Frac( a );
2183     aExp = extractFloat64Exp( a );
2184     aSign = extractFloat64Sign( a );
2185     bSig = extractFloat64Frac( b );
2186     bExp = extractFloat64Exp( b );
2187     bSign = extractFloat64Sign( b );
2188     zSign = aSign ^ bSign;
2189     if ( aExp == 0x7FF ) {
2190         if ( aSig ) return propagateFloat64NaN( a, b );
2191         if ( bExp == 0x7FF ) {
2192             if ( bSig ) return propagateFloat64NaN( a, b );
2193             float_raise( float_flag_invalid );
2194             return float64_default_nan;
2195         }
2196         return packFloat64( zSign, 0x7FF, 0 );
2197     }
2198     if ( bExp == 0x7FF ) {
2199         if ( bSig ) return propagateFloat64NaN( a, b );
2200         return packFloat64( zSign, 0, 0 );
2201     }
2202     if ( bExp == 0 ) {
2203         if ( bSig == 0 ) {
2204             if ( ( aExp | aSig ) == 0 ) {
2205                 float_raise( float_flag_invalid );
2206                 return float64_default_nan;
2207             }
2208             float_raise( float_flag_divbyzero );
2209             return packFloat64( zSign, 0x7FF, 0 );
2210         }
2211         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2212     }
2213     if ( aExp == 0 ) {
2214         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2216     }
2217     zExp = aExp - bExp + 0x3FD;
2218     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220     if ( bSig <= ( aSig + aSig ) ) {
2221         aSig >>= 1;
2222         ++zExp;
2223     }
2224     zSig = estimateDiv128To64( aSig, 0, bSig );
2225     if ( ( zSig & 0x1FF ) <= 2 ) {
2226         mul64To128( bSig, zSig, &term0, &term1 );
2227         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228         while ( (sbits64) rem0 < 0 ) {
2229             --zSig;
2230             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2231         }
2232         zSig |= ( rem1 != 0 );
2233     }
2234     return roundAndPackFloat64( zSign, zExp, zSig );
2235
2236 }
2237
2238 /*
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'.  The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2244 */
2245 float64 float64_rem( float64 a, float64 b )
2246 {
2247     flag aSign, bSign, zSign;
2248     int16 aExp, bExp, expDiff;
2249     bits64 aSig, bSig;
2250     bits64 q, alternateASig;
2251     sbits64 sigMean;
2252
2253     aSig = extractFloat64Frac( a );
2254     aExp = extractFloat64Exp( a );
2255     aSign = extractFloat64Sign( a );
2256     bSig = extractFloat64Frac( b );
2257     bExp = extractFloat64Exp( b );
2258     bSign = extractFloat64Sign( b );
2259     if ( aExp == 0x7FF ) {
2260         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261             return propagateFloat64NaN( a, b );
2262         }
2263         float_raise( float_flag_invalid );
2264         return float64_default_nan;
2265     }
2266     if ( bExp == 0x7FF ) {
2267         if ( bSig ) return propagateFloat64NaN( a, b );
2268         return a;
2269     }
2270     if ( bExp == 0 ) {
2271         if ( bSig == 0 ) {
2272             float_raise( float_flag_invalid );
2273             return float64_default_nan;
2274         }
2275         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2276     }
2277     if ( aExp == 0 ) {
2278         if ( aSig == 0 ) return a;
2279         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2280     }
2281     expDiff = aExp - bExp;
2282     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284     if ( expDiff < 0 ) {
2285         if ( expDiff < -1 ) return a;
2286         aSig >>= 1;
2287     }
2288     q = ( bSig <= aSig );
2289     if ( q ) aSig -= bSig;
2290     expDiff -= 64;
2291     while ( 0 < expDiff ) {
2292         q = estimateDiv128To64( aSig, 0, bSig );
2293         q = ( 2 < q ) ? q - 2 : 0;
2294         aSig = - ( ( bSig>>2 ) * q );
2295         expDiff -= 62;
2296     }
2297     expDiff += 64;
2298     if ( 0 < expDiff ) {
2299         q = estimateDiv128To64( aSig, 0, bSig );
2300         q = ( 2 < q ) ? q - 2 : 0;
2301         q >>= 64 - expDiff;
2302         bSig >>= 2;
2303         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2304     }
2305     else {
2306         aSig >>= 2;
2307         bSig >>= 2;
2308     }
2309     do {
2310         alternateASig = aSig;
2311         ++q;
2312         aSig -= bSig;
2313     } while ( 0 <= (sbits64) aSig );
2314     sigMean = aSig + alternateASig;
2315     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316         aSig = alternateASig;
2317     }
2318     zSign = ( (sbits64) aSig < 0 );
2319     if ( zSign ) aSig = - aSig;
2320     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2321
2322 }
2323
2324 /*
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2330 */
2331 float64 float64_sqrt( float64 a )
2332 {
2333     flag aSign;
2334     int16 aExp, zExp;
2335     bits64 aSig, zSig;
2336     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2337     //float64 z;
2338
2339     aSig = extractFloat64Frac( a );
2340     aExp = extractFloat64Exp( a );
2341     aSign = extractFloat64Sign( a );
2342     if ( aExp == 0x7FF ) {
2343         if ( aSig ) return propagateFloat64NaN( a, a );
2344         if ( ! aSign ) return a;
2345         float_raise( float_flag_invalid );
2346         return float64_default_nan;
2347     }
2348     if ( aSign ) {
2349         if ( ( aExp | aSig ) == 0 ) return a;
2350         float_raise( float_flag_invalid );
2351         return float64_default_nan;
2352     }
2353     if ( aExp == 0 ) {
2354         if ( aSig == 0 ) return 0;
2355         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2356     }
2357     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358     aSig |= LIT64( 0x0010000000000000 );
2359     zSig = estimateSqrt32( aExp, aSig>>21 );
2360     zSig <<= 31;
2361     aSig <<= 9 - ( aExp & 1 );
2362     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363     if ( ( zSig & 0x3FF ) <= 5 ) {
2364         if ( zSig < 2 ) {
2365             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2366         }
2367         else {
2368             aSig <<= 2;
2369             mul64To128( zSig, zSig, &term0, &term1 );
2370             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371             while ( (sbits64) rem0 < 0 ) {
2372                 --zSig;
2373                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2374                 term1 |= 1;
2375                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2376             }
2377             zSig |= ( ( rem0 | rem1 ) != 0 );
2378         }
2379     }
2380     shift64RightJamming( zSig, 1, &zSig );
2381     return roundAndPackFloat64( 0, zExp, zSig );
2382
2383 }
2384
2385 /*
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise.  The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2391 */
2392 flag float64_eq( float64 a, float64 b )
2393 {
2394
2395     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397        ) {
2398         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399             float_raise( float_flag_invalid );
2400         }
2401         return 0;
2402     }
2403     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2404
2405 }
2406
2407 /*
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2412 Arithmetic.
2413 -------------------------------------------------------------------------------
2414 */
2415 flag float64_le( float64 a, float64 b )
2416 {
2417     flag aSign, bSign;
2418
2419     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421        ) {
2422         float_raise( float_flag_invalid );
2423         return 0;
2424     }
2425     aSign = extractFloat64Sign( a );
2426     bSign = extractFloat64Sign( b );
2427     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428     return ( a == b ) || ( aSign ^ ( a < b ) );
2429
2430 }
2431
2432 /*
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise.  The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2438 */
2439 flag float64_lt( float64 a, float64 b )
2440 {
2441     flag aSign, bSign;
2442
2443     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445        ) {
2446         float_raise( float_flag_invalid );
2447         return 0;
2448     }
2449     aSign = extractFloat64Sign( a );
2450     bSign = extractFloat64Sign( b );
2451     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452     return ( a != b ) && ( aSign ^ ( a < b ) );
2453
2454 }
2455
2456 /*
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2460 if either operand is a NaN.  Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2463 */
2464 flag float64_eq_signaling( float64 a, float64 b )
2465 {
2466
2467     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469        ) {
2470         float_raise( float_flag_invalid );
2471         return 0;
2472     }
2473     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2474
2475 }
2476
2477 /*
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2481 cause an exception.  Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2484 */
2485 flag float64_le_quiet( float64 a, float64 b )
2486 {
2487     flag aSign, bSign;
2488     //int16 aExp, bExp;
2489
2490     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492        ) {
2493         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2494             float_raise( float_flag_invalid );
2495         }
2496         return 0;
2497     }
2498     aSign = extractFloat64Sign( a );
2499     bSign = extractFloat64Sign( b );
2500     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2501     return ( a == b ) || ( aSign ^ ( a < b ) );
2502
2503 }
2504
2505 /*
2506 -------------------------------------------------------------------------------
2507 Returns 1 if the double-precision floating-point value `a' is less than
2508 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2509 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2510 Standard for Binary Floating-point Arithmetic.
2511 -------------------------------------------------------------------------------
2512 */
2513 flag float64_lt_quiet( float64 a, float64 b )
2514 {
2515     flag aSign, bSign;
2516
2517     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2518          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2519        ) {
2520         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2521             float_raise( float_flag_invalid );
2522         }
2523         return 0;
2524     }
2525     aSign = extractFloat64Sign( a );
2526     bSign = extractFloat64Sign( b );
2527     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2528     return ( a != b ) && ( aSign ^ ( a < b ) );
2529
2530 }
2531
2532 #ifdef FLOATX80
2533
2534 /*
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the extended double-precision floating-
2537 point value `a' to the 32-bit two's complement integer format.  The
2538 conversion is performed according to the IEC/IEEE Standard for Binary
2539 Floating-point Arithmetic---which means in particular that the conversion
2540 is rounded according to the current rounding mode.  If `a' is a NaN, the
2541 largest positive integer is returned.  Otherwise, if the conversion
2542 overflows, the largest integer with the same sign as `a' is returned.
2543 -------------------------------------------------------------------------------
2544 */
2545 int32 floatx80_to_int32( floatx80 a )
2546 {
2547     flag aSign;
2548     int32 aExp, shiftCount;
2549     bits64 aSig;
2550
2551     aSig = extractFloatx80Frac( a );
2552     aExp = extractFloatx80Exp( a );
2553     aSign = extractFloatx80Sign( a );
2554     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2555     shiftCount = 0x4037 - aExp;
2556     if ( shiftCount <= 0 ) shiftCount = 1;
2557     shift64RightJamming( aSig, shiftCount, &aSig );
2558     return roundAndPackInt32( aSign, aSig );
2559
2560 }
2561
2562 /*
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the extended double-precision floating-
2565 point value `a' to the 32-bit two's complement integer format.  The
2566 conversion is performed according to the IEC/IEEE Standard for Binary
2567 Floating-point Arithmetic, except that the conversion is always rounded
2568 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2569 Otherwise, if the conversion overflows, the largest integer with the same
2570 sign as `a' is returned.
2571 -------------------------------------------------------------------------------
2572 */
2573 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2574 {
2575     flag aSign;
2576     int32 aExp, shiftCount;
2577     bits64 aSig, savedASig;
2578     int32 z;
2579
2580     aSig = extractFloatx80Frac( a );
2581     aExp = extractFloatx80Exp( a );
2582     aSign = extractFloatx80Sign( a );
2583     shiftCount = 0x403E - aExp;
2584     if ( shiftCount < 32 ) {
2585         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2586         goto invalid;
2587     }
2588     else if ( 63 < shiftCount ) {
2589         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2590         return 0;
2591     }
2592     savedASig = aSig;
2593     aSig >>= shiftCount;
2594     z = aSig;
2595     if ( aSign ) z = - z;
2596     if ( ( z < 0 ) ^ aSign ) {
2597  invalid:
2598         float_exception_flags |= float_flag_invalid;
2599         return aSign ? 0x80000000 : 0x7FFFFFFF;
2600     }
2601     if ( ( aSig<<shiftCount ) != savedASig ) {
2602         float_exception_flags |= float_flag_inexact;
2603     }
2604     return z;
2605
2606 }
2607
2608 /*
2609 -------------------------------------------------------------------------------
2610 Returns the result of converting the extended double-precision floating-
2611 point value `a' to the single-precision floating-point format.  The
2612 conversion is performed according to the IEC/IEEE Standard for Binary
2613 Floating-point Arithmetic.
2614 -------------------------------------------------------------------------------
2615 */
2616 float32 floatx80_to_float32( floatx80 a )
2617 {
2618     flag aSign;
2619     int32 aExp;
2620     bits64 aSig;
2621
2622     aSig = extractFloatx80Frac( a );
2623     aExp = extractFloatx80Exp( a );
2624     aSign = extractFloatx80Sign( a );
2625     if ( aExp == 0x7FFF ) {
2626         if ( (bits64) ( aSig<<1 ) ) {
2627             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2628         }
2629         return packFloat32( aSign, 0xFF, 0 );
2630     }
2631     shift64RightJamming( aSig, 33, &aSig );
2632     if ( aExp || aSig ) aExp -= 0x3F81;
2633     return roundAndPackFloat32( aSign, aExp, aSig );
2634
2635 }
2636
2637 /*
2638 -------------------------------------------------------------------------------
2639 Returns the result of converting the extended double-precision floating-
2640 point value `a' to the double-precision floating-point format.  The
2641 conversion is performed according to the IEC/IEEE Standard for Binary
2642 Floating-point Arithmetic.
2643 -------------------------------------------------------------------------------
2644 */
2645 float64 floatx80_to_float64( floatx80 a )
2646 {
2647     flag aSign;
2648     int32 aExp;
2649     bits64 aSig, zSig;
2650
2651     aSig = extractFloatx80Frac( a );
2652     aExp = extractFloatx80Exp( a );
2653     aSign = extractFloatx80Sign( a );
2654     if ( aExp == 0x7FFF ) {
2655         if ( (bits64) ( aSig<<1 ) ) {
2656             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2657         }
2658         return packFloat64( aSign, 0x7FF, 0 );
2659     }
2660     shift64RightJamming( aSig, 1, &zSig );
2661     if ( aExp || aSig ) aExp -= 0x3C01;
2662     return roundAndPackFloat64( aSign, aExp, zSig );
2663
2664 }
2665
2666 /*
2667 -------------------------------------------------------------------------------
2668 Rounds the extended double-precision floating-point value `a' to an integer,
2669 and returns the result as an extended quadruple-precision floating-point
2670 value.  The operation is performed according to the IEC/IEEE Standard for
2671 Binary Floating-point Arithmetic.
2672 -------------------------------------------------------------------------------
2673 */
2674 floatx80 floatx80_round_to_int( floatx80 a )
2675 {
2676     flag aSign;
2677     int32 aExp;
2678     bits64 lastBitMask, roundBitsMask;
2679     int8 roundingMode;
2680     floatx80 z;
2681
2682     aExp = extractFloatx80Exp( a );
2683     if ( 0x403E <= aExp ) {
2684         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2685             return propagateFloatx80NaN( a, a );
2686         }
2687         return a;
2688     }
2689     if ( aExp <= 0x3FFE ) {
2690         if (    ( aExp == 0 )
2691              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2692             return a;
2693         }
2694         float_exception_flags |= float_flag_inexact;
2695         aSign = extractFloatx80Sign( a );
2696         switch ( float_rounding_mode ) {
2697          case float_round_nearest_even:
2698             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2699                ) {
2700                 return
2701                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2702             }
2703             break;
2704          case float_round_down:
2705             return
2706                   aSign ?
2707                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2708                 : packFloatx80( 0, 0, 0 );
2709          case float_round_up:
2710             return
2711                   aSign ? packFloatx80( 1, 0, 0 )
2712                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2713         }
2714         return packFloatx80( aSign, 0, 0 );
2715     }
2716     lastBitMask = 1;
2717     lastBitMask <<= 0x403E - aExp;
2718     roundBitsMask = lastBitMask - 1;
2719     z = a;
2720     roundingMode = float_rounding_mode;
2721     if ( roundingMode == float_round_nearest_even ) {
2722         z.low += lastBitMask>>1;
2723         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2724     }
2725     else if ( roundingMode != float_round_to_zero ) {
2726         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2727             z.low += roundBitsMask;
2728         }
2729     }
2730     z.low &= ~ roundBitsMask;
2731     if ( z.low == 0 ) {
2732         ++z.high;
2733         z.low = LIT64( 0x8000000000000000 );
2734     }
2735     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2736     return z;
2737
2738 }
2739
2740 /*
2741 -------------------------------------------------------------------------------
2742 Returns the result of adding the absolute values of the extended double-
2743 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2744 negated before being returned.  `zSign' is ignored if the result is a NaN.
2745 The addition is performed according to the IEC/IEEE Standard for Binary
2746 Floating-point Arithmetic.
2747 -------------------------------------------------------------------------------
2748 */
2749 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2750 {
2751     int32 aExp, bExp, zExp;
2752     bits64 aSig, bSig, zSig0, zSig1;
2753     int32 expDiff;
2754
2755     aSig = extractFloatx80Frac( a );
2756     aExp = extractFloatx80Exp( a );
2757     bSig = extractFloatx80Frac( b );
2758     bExp = extractFloatx80Exp( b );
2759     expDiff = aExp - bExp;
2760     if ( 0 < expDiff ) {
2761         if ( aExp == 0x7FFF ) {
2762             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2763             return a;
2764         }
2765         if ( bExp == 0 ) --expDiff;
2766         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2767         zExp = aExp;
2768     }
2769     else if ( expDiff < 0 ) {
2770         if ( bExp == 0x7FFF ) {
2771             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2772             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2773         }
2774         if ( aExp == 0 ) ++expDiff;
2775         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2776         zExp = bExp;
2777     }
2778     else {
2779         if ( aExp == 0x7FFF ) {
2780             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2781                 return propagateFloatx80NaN( a, b );
2782             }
2783             return a;
2784         }
2785         zSig1 = 0;
2786         zSig0 = aSig + bSig;
2787         if ( aExp == 0 ) {
2788             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2789             goto roundAndPack;
2790         }
2791         zExp = aExp;
2792         goto shiftRight1;
2793     }
2794
2795     zSig0 = aSig + bSig;
2796
2797     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2798  shiftRight1:
2799     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2800     zSig0 |= LIT64( 0x8000000000000000 );
2801     ++zExp;
2802  roundAndPack:
2803     return
2804         roundAndPackFloatx80(
2805             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2806
2807 }
2808
2809 /*
2810 -------------------------------------------------------------------------------
2811 Returns the result of subtracting the absolute values of the extended
2812 double-precision floating-point values `a' and `b'.  If `zSign' is true,
2813 the difference is negated before being returned.  `zSign' is ignored if the
2814 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2815 Standard for Binary Floating-point Arithmetic.
2816 -------------------------------------------------------------------------------
2817 */
2818 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2819 {
2820     int32 aExp, bExp, zExp;
2821     bits64 aSig, bSig, zSig0, zSig1;
2822     int32 expDiff;
2823     floatx80 z;
2824
2825     aSig = extractFloatx80Frac( a );
2826     aExp = extractFloatx80Exp( a );
2827     bSig = extractFloatx80Frac( b );
2828     bExp = extractFloatx80Exp( b );
2829     expDiff = aExp - bExp;
2830     if ( 0 < expDiff ) goto aExpBigger;
2831     if ( expDiff < 0 ) goto bExpBigger;
2832     if ( aExp == 0x7FFF ) {
2833         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2834             return propagateFloatx80NaN( a, b );
2835         }
2836         float_raise( float_flag_invalid );
2837         z.low = floatx80_default_nan_low;
2838         z.high = floatx80_default_nan_high;
2839         return z;
2840     }
2841     if ( aExp == 0 ) {
2842         aExp = 1;
2843         bExp = 1;
2844     }
2845     zSig1 = 0;
2846     if ( bSig < aSig ) goto aBigger;
2847     if ( aSig < bSig ) goto bBigger;
2848     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2849  bExpBigger:
2850     if ( bExp == 0x7FFF ) {
2851         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2852         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2853     }
2854     if ( aExp == 0 ) ++expDiff;
2855     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2856  bBigger:
2857     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2858     zExp = bExp;
2859     zSign ^= 1;
2860     goto normalizeRoundAndPack;
2861  aExpBigger:
2862     if ( aExp == 0x7FFF ) {
2863         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2864         return a;
2865     }
2866     if ( bExp == 0 ) --expDiff;
2867     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2868  aBigger:
2869     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2870     zExp = aExp;
2871  normalizeRoundAndPack:
2872     return
2873         normalizeRoundAndPackFloatx80(
2874             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2875
2876 }
2877
2878 /*
2879 -------------------------------------------------------------------------------
2880 Returns the result of adding the extended double-precision floating-point
2881 values `a' and `b'.  The operation is performed according to the IEC/IEEE
2882 Standard for Binary Floating-point Arithmetic.
2883 -------------------------------------------------------------------------------
2884 */
2885 floatx80 floatx80_add( floatx80 a, floatx80 b )
2886 {
2887     flag aSign, bSign;
2888
2889     aSign = extractFloatx80Sign( a );
2890     bSign = extractFloatx80Sign( b );
2891     if ( aSign == bSign ) {
2892         return addFloatx80Sigs( a, b, aSign );
2893     }
2894     else {
2895         return subFloatx80Sigs( a, b, aSign );
2896     }
2897
2898 }
2899
2900 /*
2901 -------------------------------------------------------------------------------
2902 Returns the result of subtracting the extended double-precision floating-
2903 point values `a' and `b'.  The operation is performed according to the
2904 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2905 -------------------------------------------------------------------------------
2906 */
2907 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2908 {
2909     flag aSign, bSign;
2910
2911     aSign = extractFloatx80Sign( a );
2912     bSign = extractFloatx80Sign( b );
2913     if ( aSign == bSign ) {
2914         return subFloatx80Sigs( a, b, aSign );
2915     }
2916     else {
2917         return addFloatx80Sigs( a, b, aSign );
2918     }
2919
2920 }
2921
2922 /*
2923 -------------------------------------------------------------------------------
2924 Returns the result of multiplying the extended double-precision floating-
2925 point values `a' and `b'.  The operation is performed according to the
2926 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2927 -------------------------------------------------------------------------------
2928 */
2929 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2930 {
2931     flag aSign, bSign, zSign;
2932     int32 aExp, bExp, zExp;
2933     bits64 aSig, bSig, zSig0, zSig1;
2934     floatx80 z;
2935
2936     aSig = extractFloatx80Frac( a );
2937     aExp = extractFloatx80Exp( a );
2938     aSign = extractFloatx80Sign( a );
2939     bSig = extractFloatx80Frac( b );
2940     bExp = extractFloatx80Exp( b );
2941     bSign = extractFloatx80Sign( b );
2942     zSign = aSign ^ bSign;
2943     if ( aExp == 0x7FFF ) {
2944         if (    (bits64) ( aSig<<1 )
2945              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2946             return propagateFloatx80NaN( a, b );
2947         }
2948         if ( ( bExp | bSig ) == 0 ) goto invalid;
2949         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2950     }
2951     if ( bExp == 0x7FFF ) {
2952         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2953         if ( ( aExp | aSig ) == 0 ) {
2954  invalid:
2955             float_raise( float_flag_invalid );
2956             z.low = floatx80_default_nan_low;
2957             z.high = floatx80_default_nan_high;
2958             return z;
2959         }
2960         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2961     }
2962     if ( aExp == 0 ) {
2963         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2964         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2965     }
2966     if ( bExp == 0 ) {
2967         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2968         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2969     }
2970     zExp = aExp + bExp - 0x3FFE;
2971     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2972     if ( 0 < (sbits64) zSig0 ) {
2973         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2974         --zExp;
2975     }
2976     return
2977         roundAndPackFloatx80(
2978             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2979
2980 }
2981
2982 /*
2983 -------------------------------------------------------------------------------
2984 Returns the result of dividing the extended double-precision floating-point
2985 value `a' by the corresponding value `b'.  The operation is performed
2986 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2987 -------------------------------------------------------------------------------
2988 */
2989 floatx80 floatx80_div( floatx80 a, floatx80 b )
2990 {
2991     flag aSign, bSign, zSign;
2992     int32 aExp, bExp, zExp;
2993     bits64 aSig, bSig, zSig0, zSig1;
2994     bits64 rem0, rem1, rem2, term0, term1, term2;
2995     floatx80 z;
2996
2997     aSig = extractFloatx80Frac( a );
2998     aExp = extractFloatx80Exp( a );
2999     aSign = extractFloatx80Sign( a );
3000     bSig = extractFloatx80Frac( b );
3001     bExp = extractFloatx80Exp( b );
3002     bSign = extractFloatx80Sign( b );
3003     zSign = aSign ^ bSign;
3004     if ( aExp == 0x7FFF ) {
3005         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006         if ( bExp == 0x7FFF ) {
3007             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3008             goto invalid;
3009         }
3010         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3011     }
3012     if ( bExp == 0x7FFF ) {
3013         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3014         return packFloatx80( zSign, 0, 0 );
3015     }
3016     if ( bExp == 0 ) {
3017         if ( bSig == 0 ) {
3018             if ( ( aExp | aSig ) == 0 ) {
3019  invalid:
3020                 float_raise( float_flag_invalid );
3021                 z.low = floatx80_default_nan_low;
3022                 z.high = floatx80_default_nan_high;
3023                 return z;
3024             }
3025             float_raise( float_flag_divbyzero );
3026             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3027         }
3028         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3029     }
3030     if ( aExp == 0 ) {
3031         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3032         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3033     }
3034     zExp = aExp - bExp + 0x3FFE;
3035     rem1 = 0;
3036     if ( bSig <= aSig ) {
3037         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3038         ++zExp;
3039     }
3040     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3041     mul64To128( bSig, zSig0, &term0, &term1 );
3042     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3043     while ( (sbits64) rem0 < 0 ) {
3044         --zSig0;
3045         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3046     }
3047     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3048     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3049         mul64To128( bSig, zSig1, &term1, &term2 );
3050         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3051         while ( (sbits64) rem1 < 0 ) {
3052             --zSig1;
3053             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3054         }
3055         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3056     }
3057     return
3058         roundAndPackFloatx80(
3059             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3060
3061 }
3062
3063 /*
3064 -------------------------------------------------------------------------------
3065 Returns the remainder of the extended double-precision floating-point value
3066 `a' with respect to the corresponding value `b'.  The operation is performed
3067 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3069 */
3070 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3071 {
3072     flag aSign, bSign, zSign;
3073     int32 aExp, bExp, expDiff;
3074     bits64 aSig0, aSig1, bSig;
3075     bits64 q, term0, term1, alternateASig0, alternateASig1;
3076     floatx80 z;
3077
3078     aSig0 = extractFloatx80Frac( a );
3079     aExp = extractFloatx80Exp( a );
3080     aSign = extractFloatx80Sign( a );
3081     bSig = extractFloatx80Frac( b );
3082     bExp = extractFloatx80Exp( b );
3083     bSign = extractFloatx80Sign( b );
3084     if ( aExp == 0x7FFF ) {
3085         if (    (bits64) ( aSig0<<1 )
3086              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3087             return propagateFloatx80NaN( a, b );
3088         }
3089         goto invalid;
3090     }
3091     if ( bExp == 0x7FFF ) {
3092         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3093         return a;
3094     }
3095     if ( bExp == 0 ) {
3096         if ( bSig == 0 ) {
3097  invalid:
3098             float_raise( float_flag_invalid );
3099             z.low = floatx80_default_nan_low;
3100             z.high = floatx80_default_nan_high;
3101             return z;
3102         }
3103         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3104     }
3105     if ( aExp == 0 ) {
3106         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3108     }
3109     bSig |= LIT64( 0x8000000000000000 );
3110     zSign = aSign;
3111     expDiff = aExp - bExp;
3112     aSig1 = 0;
3113     if ( expDiff < 0 ) {
3114         if ( expDiff < -1 ) return a;
3115         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3116         expDiff = 0;
3117     }
3118     q = ( bSig <= aSig0 );
3119     if ( q ) aSig0 -= bSig;
3120     expDiff -= 64;
3121     while ( 0 < expDiff ) {
3122         q = estimateDiv128To64( aSig0, aSig1, bSig );
3123         q = ( 2 < q ) ? q - 2 : 0;
3124         mul64To128( bSig, q, &term0, &term1 );
3125         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3127         expDiff -= 62;
3128     }
3129     expDiff += 64;
3130     if ( 0 < expDiff ) {
3131         q = estimateDiv128To64( aSig0, aSig1, bSig );
3132         q = ( 2 < q ) ? q - 2 : 0;
3133         q >>= 64 - expDiff;
3134         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3138             ++q;
3139             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140         }
3141     }
3142     else {
3143         term1 = 0;
3144         term0 = bSig;
3145     }
3146     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3149               && ( q & 1 ) )
3150        ) {
3151         aSig0 = alternateASig0;
3152         aSig1 = alternateASig1;
3153         zSign = ! zSign;
3154     }
3155     return
3156         normalizeRoundAndPackFloatx80(
3157             80, zSign, bExp + expDiff, aSig0, aSig1 );
3158
3159 }
3160
3161 /*
3162 -------------------------------------------------------------------------------
3163 Returns the square root of the extended double-precision floating-point
3164 value `a'.  The operation is performed according to the IEC/IEEE Standard
3165 for Binary Floating-point Arithmetic.
3166 -------------------------------------------------------------------------------
3167 */
3168 floatx80 floatx80_sqrt( floatx80 a )
3169 {
3170     flag aSign;
3171     int32 aExp, zExp;
3172     bits64 aSig0, aSig1, zSig0, zSig1;
3173     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3174     bits64 shiftedRem0, shiftedRem1;
3175     floatx80 z;
3176
3177     aSig0 = extractFloatx80Frac( a );
3178     aExp = extractFloatx80Exp( a );
3179     aSign = extractFloatx80Sign( a );
3180     if ( aExp == 0x7FFF ) {
3181         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3182         if ( ! aSign ) return a;
3183         goto invalid;
3184     }
3185     if ( aSign ) {
3186         if ( ( aExp | aSig0 ) == 0 ) return a;
3187  invalid:
3188         float_raise( float_flag_invalid );
3189         z.low = floatx80_default_nan_low;
3190         z.high = floatx80_default_nan_high;
3191         return z;
3192     }
3193     if ( aExp == 0 ) {
3194         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3195         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3196     }
3197     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3198     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3199     zSig0 <<= 31;
3200     aSig1 = 0;
3201     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3202     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3203     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3204     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3205     mul64To128( zSig0, zSig0, &term0, &term1 );
3206     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3207     while ( (sbits64) rem0 < 0 ) {
3208         --zSig0;
3209         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3210         term1 |= 1;
3211         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3212     }
3213     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3214     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3215     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3216         if ( zSig1 == 0 ) zSig1 = 1;
3217         mul64To128( zSig0, zSig1, &term1, &term2 );
3218         shortShift128Left( term1, term2, 1, &term1, &term2 );
3219         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3220         mul64To128( zSig1, zSig1, &term2, &term3 );
3221         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3222         while ( (sbits64) rem1 < 0 ) {
3223             --zSig1;
3224             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3225             term3 |= 1;
3226             add192(
3227                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3228         }
3229         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3230     }
3231     return
3232         roundAndPackFloatx80(
3233             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3234
3235 }
3236
3237 /*
3238 -------------------------------------------------------------------------------
3239 Returns 1 if the extended double-precision floating-point value `a' is
3240 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3241 performed according to the IEC/IEEE Standard for Binary Floating-point
3242 Arithmetic.
3243 -------------------------------------------------------------------------------
3244 */
3245 flag floatx80_eq( floatx80 a, floatx80 b )
3246 {
3247
3248     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3249               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3250          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3251               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3252        ) {
3253         if (    floatx80_is_signaling_nan( a )
3254              || floatx80_is_signaling_nan( b ) ) {
3255             float_raise( float_flag_invalid );
3256         }
3257         return 0;
3258     }
3259     return
3260            ( a.low == b.low )
3261         && (    ( a.high == b.high )
3262              || (    ( a.low == 0 )
3263                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3264            );
3265
3266 }
3267
3268 /*
3269 -------------------------------------------------------------------------------
3270 Returns 1 if the extended double-precision floating-point value `a' is
3271 less than or equal to the corresponding value `b', and 0 otherwise.  The
3272 comparison is performed according to the IEC/IEEE Standard for Binary
3273 Floating-point Arithmetic.
3274 -------------------------------------------------------------------------------
3275 */
3276 flag floatx80_le( floatx80 a, floatx80 b )
3277 {
3278     flag aSign, bSign;
3279
3280     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3281               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3282          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3283               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3284        ) {
3285         float_raise( float_flag_invalid );
3286         return 0;
3287     }
3288     aSign = extractFloatx80Sign( a );
3289     bSign = extractFloatx80Sign( b );
3290     if ( aSign != bSign ) {
3291         return
3292                aSign
3293             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3294                  == 0 );
3295     }
3296     return
3297           aSign ? le128( b.high, b.low, a.high, a.low )
3298         : le128( a.high, a.low, b.high, b.low );
3299
3300 }
3301
3302 /*
3303 -------------------------------------------------------------------------------
3304 Returns 1 if the extended double-precision floating-point value `a' is
3305 less than the corresponding value `b', and 0 otherwise.  The comparison
3306 is performed according to the IEC/IEEE Standard for Binary Floating-point
3307 Arithmetic.
3308 -------------------------------------------------------------------------------
3309 */
3310 flag floatx80_lt( floatx80 a, floatx80 b )
3311 {
3312     flag aSign, bSign;
3313
3314     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3315               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3316          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3317               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3318        ) {
3319         float_raise( float_flag_invalid );
3320         return 0;
3321     }
3322     aSign = extractFloatx80Sign( a );
3323     bSign = extractFloatx80Sign( b );
3324     if ( aSign != bSign ) {
3325         return
3326                aSign
3327             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3328                  != 0 );
3329     }
3330     return
3331           aSign ? lt128( b.high, b.low, a.high, a.low )
3332         : lt128( a.high, a.low, b.high, b.low );
3333
3334 }
3335
3336 /*
3337 -------------------------------------------------------------------------------
3338 Returns 1 if the extended double-precision floating-point value `a' is equal
3339 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3340 raised if either operand is a NaN.  Otherwise, the comparison is performed
3341 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3342 -------------------------------------------------------------------------------
3343 */
3344 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3345 {
3346
3347     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3348               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3349          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3350               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3351        ) {
3352         float_raise( float_flag_invalid );
3353         return 0;
3354     }
3355     return
3356            ( a.low == b.low )
3357         && (    ( a.high == b.high )
3358              || (    ( a.low == 0 )
3359                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3360            );
3361
3362 }
3363
3364 /*
3365 -------------------------------------------------------------------------------
3366 Returns 1 if the extended double-precision floating-point value `a' is less
3367 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3368 do not cause an exception.  Otherwise, the comparison is performed according
3369 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3370 -------------------------------------------------------------------------------
3371 */
3372 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3373 {
3374     flag aSign, bSign;
3375
3376     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3377               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3378          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3379               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3380        ) {
3381         if (    floatx80_is_signaling_nan( a )
3382              || floatx80_is_signaling_nan( b ) ) {
3383             float_raise( float_flag_invalid );
3384         }
3385         return 0;
3386     }
3387     aSign = extractFloatx80Sign( a );
3388     bSign = extractFloatx80Sign( b );
3389     if ( aSign != bSign ) {
3390         return
3391                aSign
3392             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3393                  == 0 );
3394     }
3395     return
3396           aSign ? le128( b.high, b.low, a.high, a.low )
3397         : le128( a.high, a.low, b.high, b.low );
3398
3399 }
3400
3401 /*
3402 -------------------------------------------------------------------------------
3403 Returns 1 if the extended double-precision floating-point value `a' is less
3404 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3405 an exception.  Otherwise, the comparison is performed according to the
3406 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3407 -------------------------------------------------------------------------------
3408 */
3409 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3410 {
3411     flag aSign, bSign;
3412
3413     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3414               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3415          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3416               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3417        ) {
3418         if (    floatx80_is_signaling_nan( a )
3419              || floatx80_is_signaling_nan( b ) ) {
3420             float_raise( float_flag_invalid );
3421         }
3422         return 0;
3423     }
3424     aSign = extractFloatx80Sign( a );
3425     bSign = extractFloatx80Sign( b );
3426     if ( aSign != bSign ) {
3427         return
3428                aSign
3429             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3430                  != 0 );
3431     }
3432     return
3433           aSign ? lt128( b.high, b.low, a.high, a.low )
3434         : lt128( a.high, a.low, b.high, b.low );
3435
3436 }
3437
3438 #endif
3439