Diffstat (limited to 'lib/math/div64.c')
-rw-r--r--   lib/math/div64.c | 185
1 file changed, 124 insertions, 61 deletions
diff --git a/lib/math/div64.c b/lib/math/div64.c
index bf77b9843175..d1e92ea24fce 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -177,94 +177,157 @@ EXPORT_SYMBOL(div64_s64);
  * Iterative div/mod for use when dividend is not expected to be much
  * bigger than divisor.
  */
+#ifndef iter_div_u64_rem
 u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 {
 	return __iter_div_u64_rem(dividend, divisor, remainder);
 }
 EXPORT_SYMBOL(iter_div_u64_rem);
+#endif
 
-#ifndef mul_u64_u64_div_u64
-u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
-{
-	if (ilog2(a) + ilog2(b) <= 62)
-		return div64_u64(a * b, c);
+#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)
 
-#if defined(__SIZEOF_INT128__)
+#define mul_add(a, b, c)	add_u64_u32(mul_u32_u32(a, b), c)
 
+#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
 	/* native 64x64=128 bits multiplication */
-	u128 prod = (u128)a * b;
-	u64 n_lo = prod, n_hi = prod >> 64;
+	u128 prod = (u128)a * b + c;
 
+	*p_lo = prod;
+	return prod >> 64;
+}
 #else
-
-	/* perform a 64x64=128 bits multiplication manually */
-	u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32;
+static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
+{
+	/* perform a 64x64=128 bits multiplication in 32bit chunks */
 	u64 x, y, z;
 
-	x = (u64)a_lo * b_lo;
-	y = (u64)a_lo * b_hi + (u32)(x >> 32);
-	z = (u64)a_hi * b_hi + (u32)(y >> 32);
-	y = (u64)a_hi * b_lo + (u32)y;
-	z += (u32)(y >> 32);
-	x = (y << 32) + (u32)x;
-
-	u64 n_lo = x, n_hi = z;
+	/* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
+	x = mul_add(a, b, c);
+	y = mul_add(a, b >> 32, c >> 32);
+	y = add_u64_u32(y, x >> 32);
+	z = mul_add(a >> 32, b >> 32, y >> 32);
+	y = mul_add(a >> 32, b, y);
+	*p_lo = (y << 32) + (u32)x;
+	return add_u64_u32(z, y >> 32);
+}
+#endif
 
+#ifndef BITS_PER_ITER
+#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16)
 #endif
 
-	/* make sure c is not zero, trigger runtime exception otherwise */
-	if (unlikely(c == 0)) {
-		unsigned long zero = 0;
+#if BITS_PER_ITER == 32
+#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c)
+#define add_u64_long(a, b) ((a) + (b))
+#else
+#undef BITS_PER_ITER
+#define BITS_PER_ITER 16
+static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c)
+{
+	u64 n_lo = mul_add(a, b, c);
+	u64 n_med = mul_add(a >> 32, b, c >> 32);
 
-		OPTIMIZER_HIDE_VAR(zero);
-		return ~0UL/zero;
-	}
+	n_med = add_u64_u32(n_med, n_lo >> 32);
+	*p_lo = n_med << 32 | (u32)n_lo;
+	return n_med >> 32;
+}
+
+#define add_u64_long(a, b) add_u64_u32(a, b)
+#endif
+
+u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
+{
+	unsigned long d_msig, q_digit;
+	unsigned int reps, d_z_hi;
+	u64 quotient, n_lo, n_hi;
+	u32 overflow;
 
-	int shift = __builtin_ctzll(c);
+	n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);
 
-	/* try reducing the fraction in case the dividend becomes <= 64 bits */
-	if ((n_hi >> shift) == 0) {
-		u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo;
+	if (!n_hi)
+		return div64_u64(n_lo, d);
 
-		return div64_u64(n, c >> shift);
-		/*
-		 * The remainder value if needed would be:
-		 *	res = div64_u64_rem(n, c >> shift, &rem);
-		 *	rem = (rem << shift) + (n_lo - (n << shift));
-		 */
-	}
+	if (unlikely(n_hi >= d)) {
+		/* trigger runtime exception if divisor is zero */
+		if (d == 0) {
+			unsigned long zero = 0;
 
-	if (n_hi >= c) {
+			OPTIMIZER_HIDE_VAR(zero);
+			return ~0UL/zero;
+		}
 		/* overflow: result is unrepresentable in a u64 */
-		return -1;
+		return ~0ULL;
 	}
 
-	/* Do the full 128 by 64 bits division */
-
-	shift = __builtin_clzll(c);
-	c <<= shift;
+	/* Left align the divisor, shifting the dividend to match */
+	d_z_hi = __builtin_clzll(d);
+	if (d_z_hi) {
+		d <<= d_z_hi;
+		n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi);
+		n_lo <<= d_z_hi;
+	}
 
-	int p = 64 + shift;
-	u64 res = 0;
-	bool carry;
+	reps = 64 / BITS_PER_ITER;
+	/* Optimise loop count for small dividends */
+	if (!(u32)(n_hi >> 32)) {
+		reps -= 32 / BITS_PER_ITER;
+		n_hi = n_hi << 32 | n_lo >> 32;
+		n_lo <<= 32;
+	}
+#if BITS_PER_ITER == 16
+	if (!(u32)(n_hi >> 48)) {
+		reps--;
+		n_hi = add_u64_u32(n_hi << 16, n_lo >> 48);
+		n_lo <<= 16;
+	}
+#endif
 
-	do {
-		carry = n_hi >> 63;
-		shift = carry ? 1 : __builtin_clzll(n_hi);
-		if (p < shift)
-			break;
-		p -= shift;
-		n_hi <<= shift;
-		n_hi |= n_lo >> (64 - shift);
-		n_lo <<= shift;
-		if (carry || (n_hi >= c)) {
-			n_hi -= c;
-			res |= 1ULL << p;
+	/* Invert the dividend so we can use add instead of subtract. */
+	n_lo = ~n_lo;
+	n_hi = ~n_hi;
+
+	/*
+	 * Get the most significant BITS_PER_ITER bits of the divisor.
+	 * This is used to get a low 'guestimate' of the quotient digit.
+	 */
+	d_msig = (d >> (64 - BITS_PER_ITER)) + 1;
+
+	/*
+	 * Now do a 'long division' with BITS_PER_ITER bit 'digits'.
+	 * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits.
+	 * The worst case is dividing ~0 by 0x8000 which requires two subtracts.
+	 */
+	quotient = 0;
+	while (reps--) {
+		q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig;
+		/* Shift 'n' left to align with the product q_digit * d */
+		overflow = n_hi >> (64 - BITS_PER_ITER);
+		n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER));
+		n_lo <<= BITS_PER_ITER;
+		/* Add product to negated divisor */
+		overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
+		/* Adjust for the q_digit 'guestimate' being low */
+		while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) {
+			q_digit++;
+			n_hi += d;
+			overflow += n_hi < d;
 		}
-	} while (n_hi);
-	/* The remainder value if needed would be  n_hi << p */
+		quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit);
+	}
 
-	return res;
+	/*
+	 * The above only ensures the remainder doesn't overflow,
	 * it can still be possible to add (aka subtract) another copy
+	 * of the divisor.
+	 */
+	if ((n_hi + d) > n_hi)
+		quotient++;
+	return quotient;
 }
-EXPORT_SYMBOL(mul_u64_u64_div_u64);
+#if !defined(test_mul_u64_add_u64_div_u64)
+EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
+#endif
 #endif
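Note on the arithmetic in the patch: the rewritten multiply helpers rely on the identity (2^32 - 1)^2 + 2*(2^32 - 1) == 2^64 - 1 (the "(x-1)(x-1) + 2(x-1) == x.x - 1" comment), so a 32x32-bit product plus two u32 carries can never overflow a u64, and the new mul_u64_add_u64_div_u64(a, b, c, d) computes (a * b + c) / d. The sketch below is a standalone host-C check of both points, not kernel code: it assumes a GCC/Clang toolchain with unsigned __int128 as the reference, and the operand values are arbitrary illustrations chosen so the quotient fits in 64 bits.

/* Standalone check, not kernel code: assumes GCC/Clang with __int128. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* (2^32-1)*(2^32-1) + (2^32-1) + (2^32-1) == 2^64 - 1, so the
	 * mul_add() pattern (32x32 product plus two u32 values) cannot
	 * overflow a u64.  'worst' prints as 0xffffffffffffffff. */
	uint64_t m = 0xffffffffu;
	uint64_t worst = m * m + m + m;
	printf("mul_add worst case: %#llx\n", (unsigned long long)worst);

	/* Reference semantics of mul_u64_add_u64_div_u64(a, b, c, d):
	 * (a * b + c) / d, computed here with a native 128-bit type.
	 * Values are arbitrary but chosen so the high 64 bits of a*b+c
	 * stay below d, i.e. the quotient fits in a u64. */
	uint64_t a = 0x123456789abcdef0ull, b = 0xfedcba9876543210ull;
	uint64_t c = 0x55aa55aa55aa55aaull, d = 0xf000000000000000ull;
	unsigned __int128 n = (unsigned __int128)a * b + c;
	printf("expected quotient:  %#llx\n", (unsigned long long)(n / d));

	/* With c == 0 this reduces to the old mul_u64_u64_div_u64(a, b, d). */
	printf("a * b / d:          %#llx\n",
	       (unsigned long long)(((unsigned __int128)a * b) / d));
	return 0;
}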
