#include "crc_common.h"

// exclude broken/missing arm_acle.h
#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
# undef __ARM_FEATURE_CRYPTO
# endif
# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
# undef __ARM_FEATURE_CRYPTO
# endif
#endif
#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
# if !__has_include(<arm_acle.h>)
# undef __ARM_FEATURE_CRYPTO
# endif
#endif

// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang nor GCC appears to support it on AArch32
#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
#include <arm_neon.h>
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>

#ifdef _M_ARM64
// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
unsigned __int64 rbit64(unsigned __int64 x) {
    x = _byteswap_uint64(x);
    x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
    x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
    x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
    return x;
}
// ...whilst this seems to work best for 32-bit RBIT
unsigned __int32 rbit32(unsigned __int32 x) {
    unsigned __int64 r = rbit64(x);
    return r >> 32; // the reversed 32 bits land in the upper half of the zero-extended 64-bit value
}
#else
#define rbit32 _arm_rbit
#endif
#else
#include <arm_acle.h>
#define rbit32 __rbit
#define rbit64 __rbitll
#endif
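// rbit32/rbit64 map to the ARM RBIT instruction (full bit reversal); crc32_shift_pmull below uses
// them to move between the reflected bit order of the CRC-32 convention and the natural
// polynomial bit order used for the PMULL arithmetic.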


// MSVC doesn't have poly64/poly128 types, so always use uint64 instead

#ifdef __aarch64__
# if defined(__GNUC__) || defined(__clang__)
static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
    uint64x2_t result;
    __asm__ ("pmull %0.1q,%1.1d,%2.1d"
        : "=w"(result)
        : "w"(a), "w"(b)
        : /* No clobbers */);
    return result;
}
static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
    uint64x2_t result;
    __asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
        : "=w"(result)
        : "w"(a), "w"(b)
        : /* No clobbers */);
    return result;
}
# elif defined(_MSC_VER) && !defined(__clang__)
# define pmull_low vmull_p64
# define pmull_high vmull_high_p64
# else
# define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
# define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
# endif
#else
# if defined(_MSC_VER) && !defined(__clang__)
# define pmull_low vmull_p64
# define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
# else
# define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
# define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
# endif
#endif

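// Carry-less multiply of two CRC-32 values modulo the CRC polynomial. CRC-32 values use
// reflected bit order, and PMULL of two bit-reflected operands yields the bit-reflected product
// offset down by one bit, so the product is doubled (p+p / vadd_u64) to realign it. The word
// holding the high-degree coefficients is then reduced with the CRC32W instruction (which
// multiplies by x^32 mod P in the reflected domain) and XORed with the word holding the
// low-degree coefficients, which needs no reduction.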
uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
    uint64x1_t prod = vget_low_u64(pmull_low(
        vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
        vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
    ));
    #ifdef __aarch64__
    uint64_t p = vget_lane_u64(prod, 0);
    return __crc32w(0, p+p) ^ (p >> 31);
    #else
    prod = vadd_u64(prod, prod);
    uint32x2_t prod32 = vreinterpret_u32_u64(prod);
    return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
    #endif
}


const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
    0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
    0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
    0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
    0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
};
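// crc_power_rev[k] is x^(2^k) mod P in natural bit order (bit i = coefficient of x^i): entries
// 0-4 are the unreduced powers x^1..x^16, entry 5 is x^32 mod P (the CRC-32 polynomial
// 0x04C11DB7), and entry 6 (0x490D678D, i.e. x^64 mod P) is also the folding constant used in
// crc32_shift_pmull_mulred below.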


// Multiply the running 64-bit polynomial by a (<=32-bit) power of x, folding the part of the
// product at or above x^64 back down via x^64 mod P so the accumulator stays within 64 bits;
// full reduction to a 32-bit CRC is deferred to the end of crc32_shift_pmull
static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
    uint64x2_t r = pmull_low(a, b);
    uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
    return veor_u64(vget_low_u64(r), vget_low_u64(h));
}

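// Compute crc * x^n mod P, i.e. the CRC advanced past n zero bits. The CRC is first bit-reversed
// into natural polynomial order; the low 5 bits of n are applied as a plain shift, and each
// remaining set bit of n selects a precomputed power x^(2^k) from crc_power_rev. Two accumulators
// (res and res2) form independent multiply chains, merged at the end by one final PMULL and
// reduced to 32 bits with the hardware CRC instructions.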
uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
    crc1 = rbit32(crc1);

    uint64x1_t res;
    #ifdef __aarch64__
    uint64_t crc = (uint64_t)crc1 << (n & 31);
    res = vset_lane_u64(crc, vdup_n_u64(0), 0);
    #else
    res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
    res = vshl_u64(res, vdup_n_u64(n&31));
    #endif
    n &= ~31;

    if(n) {
        #define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
        uint64x1_t res2 = LOAD_NEXT_POWER;
        n &= n-1;

        if(n) {
            // first multiply doesn't need reduction
            res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
            n &= n-1;

            while(n) {
                res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
                n &= n-1;

                if(n) {
                    res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
                    n &= n-1;
                }
            }
        }
        #undef LOAD_NEXT_POWER

        // merge two results
        uint64x2_t prod = pmull_low(res, res2);
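        // reduce the 128-bit product to a 32-bit CRC: bit-reverse each 64-bit lane back into the
        // reflected convention, then fold it down with the CRC32 instructions (which handle the
        // modular reduction)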
        // weirdly, vrbitq_u8 is missing in ARM32 MSVC
        prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
        #ifdef __aarch64__
        crc = __crc32d(0, vgetq_lane_u64(prod, 1));
        uint64_t rem = vgetq_lane_u64(prod, 0);
        crc = __crc32w(rem, crc) ^ (rem >> 32);
        #else
        uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
        uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
        crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
        crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
        #endif
        return crc;
    } else {
        #ifdef __aarch64__
        crc = rbit64(crc);
        crc = __crc32w(0, crc) ^ (crc >> 32);
        return crc;
        #else
        uint32x2_t r = vreinterpret_u32_u64(res);
        return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
        #endif
    }
}
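// Illustrative (hypothetical) use: CRCs of concatenated buffers can be combined zlib-style with
// the shift primitive above, e.g. for standard CRC-32 values crcA/crcB of buffers A and B, with
// lenB being B's length in bytes:
//   uint32_t combined = crc32_shift_pmull(crcA, lenB * 8) ^ crcB;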


void crc_pmull_set_funcs() {
    _crc32_multiply = &crc32_multiply_pmull;
    _crc32_shift = &crc32_shift_pmull;
    _crc32_isa &= ISA_FEATURE_PMULL;
}

#else
void crc_pmull_set_funcs() {}
#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */