1 Reply Latest reply on Jan 11, 2016 10:22 PM by jobi

    Super Fast Floating Point Multiply

    cypress_1418426

      A faster not completely accurate version for multiplying two single numbers:

         

      I've written a very fast floating point multiplier for another project that may be able to significantly speed up 3d printer software. This routine hacks the internal format of normalized IEEE 754 single precision floating point numbers.

         

      As most of you probably know, you can take a fixed point integer, and shift left << 1 to multiply by 2, or shift right >> 1 to divide by 2.

         

      This technique will break down one of the multiplacants bit by bit, and add the other to itself using a fixed point addition to perform the multiplication. For example, ff you want to multiple x by 5, that is the same as 4*x + 1*x; 4*x is x << 2.

         

      DISCLAIMER:  This is a hack...  it is not completely accurate (only 5 decimal digits or so). It does not work with denormalized numbers, the numbers smaller than 1 e -38 or so and does not currently track, respect or report underflow, overflow, nan or infinity.

         

       
      float fFastMult(float f, float g) {
       
            uint32 *pf = (uint32*) &f;
            uint32 *pg = (uint32*) &g;
       
            uint32 ufExp = *pf & SINGLEEXPBITS;
            if (!ufExp) return(0);                // Not competely accurate for denormalized numbers
            // if (ufExp == SINGLEEXPBITS) { ...  // Detect and handle inf, nan, etc.
       
            uint32 ugExp = *pg & SINGLEEXPBITS;
            if (!ugExp) return(0);                // Not completely accurate
      for denormalized numbers
       
            uint32 uSign = *pf ^ *pg;
       
            uSign &= 0x80000000;
       
            *pf &= SINGLEMANTISSABITS;    // Get only mantissa
            *pf |= SINGLELEADINGBIT;        // Set top assumed bit of Mantissa
            uint32 uBits = *pf;
       
            *pg &= 0x7FFFFF;            // Note, we will be adding to f, so we automatically get the first assigned bit for free.
       
            while (*pg & 0x00ffffff) {
                if (*pg & 0x00800000) {    // Note, never true for first time through the loop since we didn't set the assumed ugMantissa bit.
                    *pf += uBits;
                }
                *pg <<= 1;
                uBits >>= 1;
            }
       
       
            while (*pf & 0xFF000000) {
                *pf >>= 1;
                ufExp += FASTDOUBLE;
            }
       
            ufExp += ugExp - 0x3F800000;
       
            //if (ufExp & 0x807FFFFF) {...   // A chance to test for overflow errors.
       
            *pf &= 0x007FFFFF;        // Clear the assumed top bit of the mantissa
            *pf |= ufExp | uSign;     // And we have our answer... sign, exponent, mantissa
       
            return(*pf);
       }

         


      Please note...  additional speed gains can be made by rewriting this code specifically for an 8-bit processor. For example:

         


            while (*pg & 0x00ffffff) {
            ...
            *pg <<= 1;
            }

         

      can be rewritten:

         

            while (*pg.LSB & 0xff) {
                if (*pg.BYTE3 & 0x80) {
            ...
            *pg <<= 1;  // Long shift
            }

         

            <byte shift down *pg to *p16g, a uint16 rather than a uint32>

         

            while (*p16g.LSB & 0xff) {
                if (*p16g.BYTE2 & 0x80) {
            ...
            *p16g <<= 1;    // word shift
            }

         

            <byte shift down *p16g to *p8g, a uint8 rather than a uint16>

         

            while (*p8g & 0xff) {
                if (*p8g & 0x80) {
            ...
            *p8g <<= 1; // Byte shift
            }

         


      Note...  it is best that the 2nd parameter, g, is a simple number. For example, fFastMult(2.33, 2.0) is much faster than fFastMult(2.0, 2.33).

         

       

         

      ----

         

       

         

      And a version that uses a 16bit x 16bit hardware multiply if it is available:

         


      #define FASTDOUBLE 0x00800000
      #define FASTMULT4 0x01000000
      #define FASTHALF 0xff8000000
      #define FASTDIV4 0xff000000             // Note the signed
      #define SINGLESIGNBIT 0x80000000
      #define SINGLEEXPBITS 0x7f800000
      #define SINGLEMANTISSABITS 0x007fffff
      #define SINGLELEADINGBIT 0x00800000

         

       

         

      //  Currently Set for Little Endian
      typedef union {
          float f;
          uint32 u;
          int32 i;
         
          struct {
              uint16 LSW;
              uint16 MSW;
          } w __attribute__ ((packed));

         

          struct {
              uint16 LSW;
              uint16 MSW;
          } ws __attribute__ ((packed));

         

          struct {
              uint8 LSB;
              uint8 LSBH;
              uint8 MSBL;
              uint8 MSB;
          } b;
         
          struct {
              int8 LSB;
              int8 LSBH;
              int8 MSBL;
              int8 MSB;
          } c;
      } floatint; // __attribute__ ((packed));

         

       

         

      float fFastMult(float f, float g) {

         

            floatint *pf = (floatint *) &f;
            floatint *pg = (floatint *) &g;
         
            uint32 uTemp = SINGLEEXPBITS;             // Intended to be loaded
      into a register for fast use.

         

            uint32 ufExp = (*pf).u & uTemp; // SINGLEEXPBITS;
            if (!ufExp) return(0);                    // Denormalized numbers
      ==> 0

         

            // if (ufExp == SINGLEEXPBITS) { ...      // Detect and handle
      inf, nan, etc.

         

            uint32 ugExp = (*pg).u & uTemp; //SINGLEEXPBITS;
            if (!ugExp) return(0);                    // Denormalized numbers
      ==> 0
         
            uTemp = FASTDOUBLE;

         

            ufExp += ugExp - 0x3f800000;
            ufExp |= (((*pf).u ^ (*pg).u) & 0x80000000);      // uSign = *pf ^
      *pg
         
            (*pg).u &= SINGLEMANTISSABITS;        // Note, we will be adding
      to f, so we automatically get the first assigned bit for free.
            (*pg).u |= SINGLELEADINGBIT;            // Set top assumed bit of
      Mantissa

         

            (*pf).u &= SINGLEMANTISSABITS;        // Get only mantissa
            (*pf).u |= SINGLELEADINGBIT;            // Set top assumed bit of
      Mantissa
         
            // This is the full multibyte multiply.
            //uint32 uLowByteMult = (*pf).b.LSB * (*pg).b.LSB;
            //uLowByteMult >>= 8;
            //uLowByteMult += (((*pf).b.LSBH * (*pg).b.LSB) + ((*pf).b.LSB *
      (*pg).b.LSBH));
            //uLowByteMult >>= 8;                 // Only the *CARRY* bit
      matters...  We're skipping it!
            //uLowByteMult += (((*pf).b.MSBL * (*pg).b.LSB) + ((*pf).b.LSB *
      (*pg).b.MSBL));
            uint32 uLowByteMult = (((*pf).b.MSBL * (*pg).b.LSB) + ((*pf).b.LSB
      * (*pg).b.MSBL));

         

            (*pg).u >>= 8;
            (*pf).u >>= 8;                       
         
            // Note, it is possible here to detect if the 2nd bit of both *pf and *pg are set.
            // in this case, 2 overflow bits are guaranteed allowing the following to work:
            // *pf or *pg >> 7,  the other >> 8;
            // *pf *= *pg;   //  The assumed mantissa bit overflows, negating the need for an & (bitwise and) operation
            // Mantissa (without the LEADING BIT) = *pf >> 9
            // Requires an additional *ufExp += FASTDOUBLE;
         
         
            // Could do this as a long long (32 x 32 => 64 bit) multiply, but this is MUCH faster for on hardware that
            // supports 16b x 16b multiply
            (*pf).u *= (*pg).u;
            (*pf).u += uLowByteMult;
         
            // Note...  Top two bits are overflow.
            // We need to check them, and shift the
          
            if ((*pf).u & 0x80000000) {           // Check for overflow.
                ufExp += uTemp; // FASTDOUBLE;
                (*pf).u >>= 1;
            }
            (*pf).u <<= 2;                        // Shift result to remove  top bits  (Same as & 0x7fff...)
            (*pf).u >>= 9;
         
            (*pf).u |= ufExp;                     // And we have our answer...
      sign, exponent, mantissa
            return((*pf).f);
       }