Super Fast Floating Point Multiply

Anonymous · ‎Jan 08, 2016

A faster not completely accurate version for multiplying two single numbers:

I've written a very fast floating point multiplier for another project that may be able to significantly speed up 3d printer software. This routine hacks the internal format of normalized IEEE 754 single precision floating point numbers.

As most of you probably know, you can take a fixed point integer, and shift left << 1 to multiply by 2, or shift right >> 1 to divide by 2.

This technique will break down one of the multiplacants bit by bit, and add the other to itself using a fixed point addition to perform the multiplication. For example, ff you want to multiple x by 5, that is the same as 4*x + 1*x; 4*x is x << 2.

DISCLAIMER: This is a hack... it is not completely accurate (only 5 decimal digits or so). It does not work with denormalized numbers, the numbers smaller than 1 e -38 or so and does not currently track, respect or report underflow, overflow, nan or infinity.

float fFastMult(float f, float g) {

      uint32 *pf = (uint32*) &f;
      uint32 *pg = (uint32*) &g;

      uint32 ufExp = *pf & SINGLEEXPBITS;
      if (!ufExp) return(0);                // Not competely accurate for denormalized numbers
      // if (ufExp == SINGLEEXPBITS) { ... // Detect and handle inf, nan, etc.

      uint32 ugExp = *pg & SINGLEEXPBITS;
      if (!ugExp) return(0);                // Not completely accurate
for denormalized numbers

      uint32 uSign = *pf ^ *pg;

      uSign &= 0x80000000;

      *pf &= SINGLEMANTISSABITS;    // Get only mantissa
      *pf |= SINGLELEADINGBIT;        // Set top assumed bit of Mantissa
      uint32 uBits = *pf;

      *pg &= 0x7FFFFF;            // Note, we will be adding to f, so we automatically get the first assigned bit for free.

      while (*pg & 0x00ffffff) {
          if (*pg & 0x00800000) {    // Note, never true for first time through the loop since we didn't set the assumed ugMantissa bit.
              *pf += uBits;
          }
          *pg <<= 1;
          uBits >>= 1;
      }

      while (*pf & 0xFF000000) {
          *pf >>= 1;
          ufExp += FASTDOUBLE;
      }

      ufExp += ugExp - 0x3F800000;

      //if (ufExp & 0x807FFFFF) {...   // A chance to test for overflow errors.

      *pf &= 0x007FFFFF;        // Clear the assumed top bit of the mantissa
      *pf |= ufExp | uSign;     // And we have our answer... sign, exponent, mantissa

      return(*pf);
}

Please note... additional speed gains can be made by rewriting this code specifically for an 8-bit processor. For example:

      while (*pg & 0x00ffffff) {
      ...
      *pg <<= 1;
      }

can be rewritten:

      while (*pg.LSB & 0xff) {
          if (*pg.BYTE3 & 0x80) {
      ...
      *pg <<= 1; // Long shift
      }

      while (*p16g.LSB & 0xff) {
          if (*p16g.BYTE2 & 0x80) {
      ...
      *p16g <<= 1;    // word shift
      }

      while (*p8g & 0xff) {
          if (*p8g & 0x80) {
      ...
      *p8g <<= 1; // Byte shift
      }

Note... it is best that the 2nd parameter, g, is a simple number. For example, fFastMult(2.33, 2.0) is much faster than fFastMult(2.0, 2.33).

----

And a version that uses a 16bit x 16bit hardware multiply if it is available:

#define FASTDOUBLE 0x00800000
#define FASTMULT4 0x01000000
#define FASTHALF 0xff8000000
#define FASTDIV4 0xff000000 // Note the signed
#define SINGLESIGNBIT 0x80000000
#define SINGLEEXPBITS 0x7f800000
#define SINGLEMANTISSABITS 0x007fffff
#define SINGLELEADINGBIT 0x00800000

// Currently Set for Little Endian
typedef union {
    float f;
    uint32 u;
    int32 i;

    struct {
        uint16 LSW;
        uint16 MSW;
    } w __attribute__ ((packed));

    struct {
        uint16 LSW;
        uint16 MSW;
    } ws __attribute__ ((packed));

    struct {
        uint8 LSB;
        uint8 LSBH;
        uint8 MSBL;
        uint8 MSB;
    } b;

    struct {
        int8 LSB;
        int8 LSBH;
        int8 MSBL;
        int8 MSB;
    } c;
} floatint; // __attribute__ ((packed));

float fFastMult(float f, float g) {

      floatint *pf = (floatint *) &f;
      floatint *pg = (floatint *) &g;

      uint32 uTemp = SINGLEEXPBITS;             // Intended to be loaded
into a register for fast use.

uint32 ufExp = (*pf).u & uTemp; // SINGLEEXPBITS;
if (!ufExp) return(0); // Denormalized numbers
==> 0

// if (ufExp == SINGLEEXPBITS) { ... // Detect and handle
inf, nan, etc.

      uint32 ugExp = (*pg).u & uTemp; //SINGLEEXPBITS;
      if (!ugExp) return(0);                    // Denormalized numbers
==> 0

      uTemp = FASTDOUBLE;

      ufExp += ugExp - 0x3f800000;
      ufExp |= (((*pf).u ^ (*pg).u) & 0x80000000);      // uSign = *pf ^
*pg

      (*pg).u &= SINGLEMANTISSABITS;        // Note, we will be adding
to f, so we automatically get the first assigned bit for free.
      (*pg).u |= SINGLELEADINGBIT;            // Set top assumed bit of
Mantissa

      (*pf).u &= SINGLEMANTISSABITS;        // Get only mantissa
      (*pf).u |= SINGLELEADINGBIT;            // Set top assumed bit of
Mantissa

      // This is the full multibyte multiply.
      //uint32 uLowByteMult = (*pf).b.LSB * (*pg).b.LSB;
      //uLowByteMult >>= 8;
      //uLowByteMult += (((*pf).b.LSBH * (*pg).b.LSB) + ((*pf).b.LSB *
(*pg).b.LSBH));
      //uLowByteMult >>= 8;                 // Only the *CARRY* bit
matters... We're skipping it!
      //uLowByteMult += (((*pf).b.MSBL * (*pg).b.LSB) + ((*pf).b.LSB *
(*pg).b.MSBL));
      uint32 uLowByteMult = (((*pf).b.MSBL * (*pg).b.LSB) + ((*pf).b.LSB
* (*pg).b.MSBL));

      (*pg).u >>= 8;
      (*pf).u >>= 8;

      // Note, it is possible here to detect if the 2nd bit of both *pf and *pg are set.
      // in this case, 2 overflow bits are guaranteed allowing the following to work:
      // *pf or *pg >> 7, the other >> 8;
      // *pf *= *pg;   // The assumed mantissa bit overflows, negating the need for an & (bitwise and) operation
      // Mantissa (without the LEADING BIT) = *pf >> 9
      // Requires an additional *ufExp += FASTDOUBLE;

      // Could do this as a long long (32 x 32 => 64 bit) multiply, but this is MUCH faster for on hardware that
      // supports 16b x 16b multiply
      (*pf).u *= (*pg).u;
      (*pf).u += uLowByteMult;

      // Note... Top two bits are overflow.
      // We need to check them, and shift the

      if ((*pf).u & 0x80000000) {           // Check for overflow.
          ufExp += uTemp; // FASTDOUBLE;
          (*pf).u >>= 1;
      }
      (*pf).u <<= 2;                        // Shift result to remove top bits (Same as & 0x7fff...)
      (*pf).u >>= 9;

      (*pf).u |= ufExp;                     // And we have our answer...
sign, exponent, mantissa
      return((*pf).f);
}