#include "RepRapFirmware.h" #include "DriveMovement.h" // The remaining functions are speed-critical, so use full optimisation #pragma GCC optimize ("O3") // Fast 62-bit integer square root function (thanks dmould) uint32_t isqrt64(uint64_t num) { uint32_t numHigh = (uint32_t)(num >> 32); uint32_t res; if ((numHigh & (3 << 30)) != 0) { // Input out of range - probably negative, so return -1 res = 0xFFFFFFFF; } else { res = 0; #define iter64a(N) \ { \ res <<= 1; \ uint32_t temp = (res | 1) << (N); \ if (numHigh >= temp) \ { \ numHigh -= temp; \ res |= 2; \ } \ } // We need to do 15 iterations (not 16 because we have eliminated the top 2 bits) iter64a(28) iter64a(26) iter64a(24) iter64a(22) iter64a(20) iter64a(18) iter64a(16) iter64a(14) iter64a(12) iter64a(10) iter64a(8) iter64a(6) iter64a(4) iter64a(2) iter64a(0) // resHigh is twice the square root of the msw, in the range 0..2^17-1 uint64_t numAll = ((uint64_t)numHigh << 32) | (uint32_t)num; #define iter64b(N) \ { \ res <<= 1; \ uint64_t temp = ((uint64_t)res | 1) << (N); \ if (numAll >= temp) \ { \ numAll -= temp; \ res |= 2; \ } \ } // We need to do 16 iterations. // After the last iteration, numAll may be between 0 and (1 + 2 * res) inclusive. // So to take square roots of numbers up to 64 bits, we need to do all these iterations using 64 bit maths. // If we restricted the input to e.g. 48 bits, then we could do some of the final iterations using 32-bit maths. iter64b(30) iter64b(28) iter64b(26) iter64b(24) iter64b(22) iter64b(20) iter64b(18) iter64b(16) iter64b(14) iter64b(12) iter64b(10) iter64b(8) iter64b(6) iter64b(4) iter64b(2) iter64b(0) res >>= 1; #undef iter64a #undef iter64b } return res; } #if 0 // Fast 64-bit integer square root function uint32_t isqrt2(uint64_t num) { uint32_t numHigh = (uint32_t)(num >> 32); uint32_t resHigh = 0; #define iter64a(N) \ { \ uint32_t temp = resHigh + (1 << (N)); \ if (numHigh >= temp << (N)) \ { \ numHigh -= temp << (N); \ resHigh |= 2 << (N); \ } \ } // We need to do 16 iterations iter64a(15); iter64a(14); iter64a(13); iter64a(12); iter64a(11); iter64a(10); iter64a(9); iter64a(8); iter64a(7); iter64a(6); iter64a(5); iter64a(4); iter64a(3); iter64a(2); iter64a(1); iter64a(0); // resHigh is twice the square root of the msw, in the range 0..2^17-1 uint64_t res = (uint64_t)resHigh << 16; uint64_t numAll = ((uint64_t)numHigh << 32) | (uint32_t)num; #define iter64b(N) \ { \ uint64_t temp = res | (1 << (N)); \ if (numAll >= temp << (N)) \ { \ numAll -= temp << (N); \ res |= 2 << (N); \ } \ } // We need to do 16 iterations. // After the last iteration, numAll may be between 0 and (1 + 2 * res) inclusive. // So to take square roots of numbers up to 64 bits, we need to do all these iterations using 64 bit maths. // If we restricted the input to e.g. 48 bits, then we could do some of the final iterations using 32-bit maths. iter64b(15) iter64b(14) iter64b(13) iter64b(12) iter64b(11) iter64b(10) iter64b(9) iter64b(8) iter64b(7) iter64b(6) iter64b(5) iter64b(4) iter64b(3) iter64b(2) iter64b(1) iter64b(0) return (uint32_t)(res >> 1); #undef iter64a #undef iter64b #undef iter32 } uint32_t sqSum1 = 0, sqSum2 = 0, sqCount = 0, sqErrors = 0, lastRes1 = 0, lastRes2 = 0; uint64_t lastNum = 0; /* static */ uint32_t isqrt64(uint64_t num) { irqflags_t flags = cpu_irq_save(); uint32_t t2 = Platform::GetInterruptClocks(); uint32_t res1 = isqrt1(num); uint32_t t3 = Platform::GetInterruptClocks(); uint32_t res2 = isqrt2(num); uint32_t t4 = Platform::GetInterruptClocks(); cpu_irq_restore(flags); sqSum1 += (t3 - t2); sqSum2 += (t4 - t3); ++sqCount; if (res1 != res2) { ++sqErrors; lastNum = num; lastRes1 = res1; lastRes2 = res2; } return res2; } #endif // End