Skip to content

Commit de6122a

Browse files
committed
Add ARM PMULL CRC32 calc functions
1 parent 55422f6 commit de6122a

File tree

5 files changed

+267
-24
lines changed

5 files changed

+267
-24
lines changed

binding.gyp

+37-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878
"targets": [
7979
{
8080
"target_name": "yencode",
81-
"dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_rvv", "yencode_zbkc"],
81+
"dependencies": ["yencode_sse2", "yencode_ssse3", "yencode_clmul", "yencode_clmul256", "yencode_avx", "yencode_avx2", "yencode_vbmi2", "yencode_neon", "yencode_armcrc", "yencode_pmull", "yencode_rvv", "yencode_zbkc"],
8282
"sources": [
8383
"src/yencode.cc",
8484
"src/platform.cc",
@@ -416,6 +416,42 @@
416416
}]
417417
]
418418
},
419+
{
420+
"target_name": "yencode_pmull",
421+
"type": "static_library",
422+
"sources": [
423+
"src/crc_arm_pmull.cc"
424+
],
425+
"cflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
426+
"cxxflags!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
427+
"xcode_settings": {
428+
"OTHER_CFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"],
429+
"OTHER_CXXFLAGS!": ["-fno-omit-frame-pointer", "-fno-tree-vrp", "-fno-strict-aliasing"]
430+
},
431+
"msvs_settings": {"VCCLCompilerTool": {"BufferSecurityCheck": "false"}},
432+
"conditions": [
433+
['target_arch in "arm arm64"', {
434+
"cflags!": ["-march=native"],
435+
"cxxflags!": ["-march=native"],
436+
"cflags": ["-march=armv8-a+crc+crypto"],
437+
"cxxflags": ["-march=armv8-a+crc+crypto"],
438+
"xcode_settings": {
439+
"OTHER_CFLAGS!": ["-march=native"],
440+
"OTHER_CXXFLAGS!": ["-march=native"],
441+
"OTHER_CFLAGS": ["-march=armv8-a+crc+crypto"],
442+
"OTHER_CXXFLAGS": ["-march=armv8-a+crc+crypto"],
443+
}
444+
}],
445+
['OS!="win" and target_arch=="arm"', {
446+
"cflags": ["-mfpu=neon","-fno-lto"],
447+
"cxxflags": ["-mfpu=neon","-fno-lto"],
448+
"xcode_settings": {
449+
"OTHER_CFLAGS": ["-mfpu=neon","-fno-lto"],
450+
"OTHER_CXXFLAGS": ["-mfpu=neon","-fno-lto"]
451+
}
452+
}]
453+
]
454+
},
419455
{
420456
"target_name": "yencode_zbkc",
421457
"type": "static_library",

src/common.h

+1
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ enum YEncDecIsaLevel {
240240
enum YEncDecIsaLevel {
241241
ISA_GENERIC = 0,
242242
ISA_FEATURE_CRC = 8,
243+
ISA_FEATURE_PMULL = 0x40,
243244
ISA_LEVEL_NEON = 0x1000
244245
};
245246
#elif defined(__riscv)

src/crc.cc

+39-22
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ extern "C" {
194194
void crc_clmul_set_funcs();
195195
void crc_clmul256_set_funcs();
196196
void crc_arm_set_funcs();
197+
void crc_pmull_set_funcs();
197198
void crc_riscv_set_funcs();
198199

199200
#ifdef PLATFORM_X86
@@ -246,31 +247,47 @@ void crc_init() {
246247
#endif
247248
#ifdef PLATFORM_ARM
248249
# ifdef __APPLE__
249-
int supported = 0;
250-
size_t len = sizeof(supported);
251-
if(sysctlbyname("hw.optional.armv8_crc32", &supported, &len, NULL, 0))
252-
supported = 0;
253-
# endif
254-
if(
255-
# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
256-
getauxval(AT_HWCAP2) & HWCAP2_CRC32
257-
# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
258-
getauxval(AT_HWCAP) & HWCAP_CRC32
259-
# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
260-
android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32
261-
# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
262-
android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32
263-
# elif defined(_WIN32)
264-
IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)
265-
# elif defined(__APPLE__)
266-
supported
267-
# elif defined(__ARM_FEATURE_CRC32)
268-
true /* assume available if compiled as such */
250+
int supports_crc = 0;
251+
int supports_pmull = 0;
252+
size_t len = sizeof(supports_crc);
253+
if(sysctlbyname("hw.optional.armv8_crc32", &supports_crc, &len, NULL, 0))
254+
supports_crc = 0;
255+
if(sysctlbyname("hw.optional.arm.FEAT_PMULL", &supports_pmull, &len, NULL, 0))
256+
supports_pmull = 0;
269257
# else
270-
false
258+
bool supports_crc = false;
259+
bool supports_pmull = false;
260+
# if defined(AT_HWCAP2) && defined(HWCAP2_CRC32)
261+
supports_crc = getauxval(AT_HWCAP2) & HWCAP2_CRC32;
262+
# elif defined(AT_HWCAP) && defined(HWCAP_CRC32)
263+
supports_crc = getauxval(AT_HWCAP) & HWCAP_CRC32;
264+
# elif defined(ANDROID_CPU_FAMILY_ARM) && defined(__aarch64__)
265+
supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32;
266+
supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL;
267+
# elif defined(ANDROID_CPU_FAMILY_ARM) /* aarch32 */
268+
supports_crc = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_CRC32;
269+
supports_pmull = android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_PMULL;
270+
# elif defined(_WIN32)
271+
supports_crc = IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
272+
supports_pmull = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
273+
# else
274+
#ifdef __ARM_FEATURE_CRC32
275+
supports_crc = true; /* assume available if compiled as such */
276+
#endif
277+
#ifdef __ARM_FEATURE_CRYPTO
278+
supports_pmull = true;
279+
#endif
280+
# endif
281+
# if defined(AT_HWCAP2) && defined(HWCAP2_PMULL)
282+
supports_pmull = getauxval(AT_HWCAP2) & HWCAP2_PMULL;
283+
# elif defined(AT_HWCAP) && defined(HWCAP_PMULL)
284+
supports_pmull = getauxval(AT_HWCAP) & HWCAP_PMULL;
285+
# endif
271286
# endif
272-
) {
287+
288+
if(supports_crc) {
273289
crc_arm_set_funcs();
290+
if(supports_pmull) crc_pmull_set_funcs();
274291
}
275292
#endif
276293
#ifdef __riscv

src/crc.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ static inline uint32_t crc32_powmod(uint64_t n) {
2828
unsigned carry = __builtin_uadd_overflow(n >> 32, n, &res);
2929
res += carry;
3030
return res;
31-
#elif defined(_MSC_VER)
31+
#elif defined(_MSC_VER) && defined(PLATFORM_X86)
3232
unsigned res;
3333
unsigned char carry = _addcarry_u32(0, n >> 32, n, &res);
3434
_addcarry_u32(carry, res, 0, &res);

src/crc_arm_pmull.cc

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#include "crc_common.h"
2+
3+
// exclude broken/missing arm_acle.h
4+
#if defined(__ARM_FEATURE_CRYPTO) && defined(HEDLEY_GCC_VERSION)
5+
# if !defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,1,1)
6+
# undef __ARM_FEATURE_CRYPTO
7+
# endif
8+
# if defined(__aarch64__) && HEDLEY_GCC_VERSION_CHECK(9,4,0) && !HEDLEY_GCC_VERSION_CHECK(9,5,0)
9+
# undef __ARM_FEATURE_CRYPTO
10+
# endif
11+
#endif
12+
#if defined(__ARM_FEATURE_CRYPTO) && defined(__has_include)
13+
# if !__has_include(<arm_acle.h>)
14+
# undef __ARM_FEATURE_CRYPTO
15+
# endif
16+
#endif
17+
18+
// ARM's intrinsics guide seems to suggest that vmull_p64 is available on A32, but neither Clang/GCC seem to support it on AArch32
19+
#if (defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) && defined(__aarch64__)) || (defined(_M_ARM64) && !defined(__clang__))
20+
#include <arm_neon.h>
21+
#if defined(_MSC_VER) && !defined(__clang__)
22+
#include <intrin.h>
23+
24+
#ifdef _M_ARM64
25+
// MSVC may detect this pattern: https://devblogs.microsoft.com/cppblog/a-tour-of-4-msvc-backend-improvements/#byteswap-identification
26+
unsigned __int64 rbit64(unsigned __int64 x) {
27+
x = _byteswap_uint64(x);
28+
x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
29+
x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
30+
x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
31+
return x;
32+
}
33+
// ...whilst this seems to work best for 32-bit RBIT
34+
unsigned __int32 rbit32(unsigned __int32 x) {
35+
unsigned __int64 r = rbit64(x);
36+
return r >> 32;
37+
}
38+
#else
39+
#define rbit32 _arm_rbit
40+
#endif
41+
#else
42+
#include <arm_acle.h>
43+
#define rbit32 __rbit
44+
#define rbit64 __rbitll
45+
#endif
46+
47+
48+
// MSVC doesn't have poly64/poly128 types, so always use uint64 instead
49+
50+
#ifdef __aarch64__
51+
# if defined(__GNUC__) || defined(__clang__)
52+
static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_low(uint64x1_t a, uint64x1_t b) {
53+
uint64x2_t result;
54+
__asm__ ("pmull %0.1q,%1.1d,%2.1d"
55+
: "=w"(result)
56+
: "w"(a), "w"(b)
57+
: /* No clobbers */);
58+
return result;
59+
}
60+
static HEDLEY_ALWAYS_INLINE uint64x2_t pmull_high(uint64x2_t a, uint64x2_t b) {
61+
uint64x2_t result;
62+
__asm__ ("pmull2 %0.1q,%1.2d,%2.2d"
63+
: "=w"(result)
64+
: "w"(a), "w"(b)
65+
: /* No clobbers */);
66+
return result;
67+
}
68+
# elif defined(_MSC_VER) && !defined(__clang__)
69+
# define pmull_low vmull_p64
70+
# define pmull_high vmull_high_p64
71+
# else
72+
# define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(vreinterpret_p64_u64(x), vreinterpret_p64_u64(y)))
73+
# define pmull_high(x, y) vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(x), vreinterpretq_p64_u64(y)))
74+
# endif
75+
#else
76+
# if defined(_MSC_VER) && !defined(__clang__)
77+
# define pmull_low vmull_p64
78+
# define pmull_high(x, y) vmull_p64(vget_high_u64(x), vget_high_u64(y))
79+
# else
80+
# define pmull_low(x, y) vreinterpretq_u64_p128(vmull_p64(x, y))
81+
# define pmull_high(x, y) vreinterpretq_u64_p128(vmull_p64(vget_high_p64(vreinterpretq_p64_u64(x)), vget_high_p64(vreinterpretq_p64_u64(y))))
82+
# endif
83+
#endif
84+
85+
86+
uint32_t crc32_multiply_pmull(uint32_t a, uint32_t b) {
87+
uint64x1_t prod = vget_low_u64(pmull_low(
88+
vreinterpret_u64_u32(vset_lane_u32(a, vdup_n_u32(0), 0)),
89+
vreinterpret_u64_u32(vset_lane_u32(b, vdup_n_u32(0), 0))
90+
));
91+
#ifdef __aarch64__
92+
uint64_t p = vget_lane_u64(prod, 0);
93+
return __crc32w(0, p+p) ^ (p >> 31);
94+
#else
95+
prod = vadd_u64(prod, prod);
96+
uint32x2_t prod32 = vreinterpret_u32_u64(prod);
97+
return __crc32w(0, vget_lane_u32(prod32, 0)) ^ vget_lane_u32(prod32, 1);
98+
#endif
99+
}
100+
101+
102+
103+
const uint32_t crc_power_rev[32] = { // bit-reversed crc_power
104+
0x00000002, 0x00000004, 0x00000010, 0x00000100, 0x00010000, 0x04c11db7, 0x490d678d, 0xe8a45605,
105+
0x75be46b7, 0xe6228b11, 0x567fddeb, 0x88fe2237, 0x0e857e71, 0x7001e426, 0x075de2b2, 0xf12a7f90,
106+
0xf0b4a1c1, 0x58f46c0c, 0xc3395ade, 0x96837f8c, 0x544037f9, 0x23b7b136, 0xb2e16ba8, 0x725e7bfa,
107+
0xec709b5d, 0xf77a7274, 0x2845d572, 0x034e2515, 0x79695942, 0x540cb128, 0x0b65d023, 0x3c344723
108+
};
109+
110+
111+
static HEDLEY_ALWAYS_INLINE uint64x1_t crc32_shift_pmull_mulred(uint64x1_t a, uint64x1_t b) {
112+
uint64x2_t r = pmull_low(a, b);
113+
uint64x2_t h = pmull_high(r, vdupq_n_u64(0x490d678d));
114+
return veor_u64(vget_low_u64(r), vget_low_u64(h));
115+
}
116+
117+
118+
uint32_t crc32_shift_pmull(uint32_t crc1, uint32_t n) {
119+
crc1 = rbit32(crc1);
120+
121+
uint64x1_t res;
122+
#ifdef __aarch64__
123+
uint64_t crc = (uint64_t)crc1 << (n & 31);
124+
res = vset_lane_u64(crc, vdup_n_u64(0), 0);
125+
#else
126+
res = vreinterpret_u64_u32(vset_lane_u32(crc1, vdup_n_u32(0), 0));
127+
res = vshl_u64(res, vdup_n_u64(n&31));
128+
#endif
129+
n &= ~31;
130+
131+
if(n) {
132+
#define LOAD_NEXT_POWER vreinterpret_u64_u32(vset_lane_u32(crc_power_rev[ctz32(n)], vdup_n_u32(0), 0))
133+
uint64x1_t res2 = LOAD_NEXT_POWER;
134+
n &= n-1;
135+
136+
if(n) {
137+
// first multiply doesn't need reduction
138+
res2 = vget_low_u64(pmull_low(res2, LOAD_NEXT_POWER));
139+
n &= n-1;
140+
141+
while(n) {
142+
res = crc32_shift_pmull_mulred(res, LOAD_NEXT_POWER);
143+
n &= n-1;
144+
145+
if(n) {
146+
res2 = crc32_shift_pmull_mulred(res2, LOAD_NEXT_POWER);
147+
n &= n-1;
148+
}
149+
}
150+
}
151+
#undef LOAD_NEXT_POWER
152+
153+
// merge two results
154+
uint64x2_t prod = pmull_low(res, res2);
155+
// weirdly, vrbitq_u8 is missing in ARM32 MSVC
156+
prod = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(prod))));
157+
#ifdef __aarch64__
158+
crc = __crc32d(0, vgetq_lane_u64(prod, 1));
159+
uint64_t rem = vgetq_lane_u64(prod, 0);
160+
crc = __crc32w(rem, crc) ^ (rem >> 32);
161+
#else
162+
uint32x4_t prod32 = vreinterpretq_u32_u64(prod);
163+
uint32_t crc = __crc32w(0, vgetq_lane_u32(prod32, 2));
164+
crc = __crc32w(vgetq_lane_u32(prod32, 3), crc);
165+
crc = __crc32w(vgetq_lane_u32(prod32, 0), crc) ^ vgetq_lane_u32(prod32, 1);
166+
#endif
167+
return crc;
168+
} else {
169+
#ifdef __aarch64__
170+
crc = rbit64(crc);
171+
crc = __crc32w(0, crc) ^ (crc >> 32);
172+
return crc;
173+
#else
174+
uint32x2_t r = vreinterpret_u32_u64(res);
175+
return __crc32w(0, rbit32(vget_lane_u32(r, 1))) ^ rbit32(vget_lane_u32(r, 0));
176+
#endif
177+
}
178+
}
179+
180+
181+
void crc_pmull_set_funcs() {
182+
_crc32_multiply = &crc32_multiply_pmull;
183+
_crc32_shift = &crc32_shift_pmull;
184+
_crc32_isa &= ISA_FEATURE_PMULL;
185+
}
186+
187+
#else
188+
void crc_pmull_set_funcs() {}
189+
#endif /* defined(__ARM_FEATURE_CRYPTO) && defined(__ARM_FEATURE_CRC32) */

0 commit comments

Comments
 (0)