@@ -42,7 +42,7 @@ static inline Zmm make_zmm(const Xmm &v) {
42
42
return Zmm (v.getIdx ());
43
43
}
44
44
45
- void jit_avx512_core_amx_copy_kern ::transpose (int s, const Ymm &dst1,
45
+ void jit_avx512_core_amx_copy_kern_t ::transpose (int s, const Ymm &dst1,
46
46
const Ymm &dst2, const Ymm &src1, const Ymm &src2) {
47
47
switch (s) {
48
48
case 32 :
@@ -91,8 +91,9 @@ void jit_avx512_core_amx_copy_kern::transpose(int s, const Ymm &dst1,
91
91
}
92
92
}
93
93
94
- void jit_avx512_core_amx_copy_kern::amxtrans8 (const Ymm &dst1, const Ymm &dst2,
95
- const Ymm &src1, const Ymm &src2, const Ymm &src3, const Ymm &src4) {
94
+ void jit_avx512_core_amx_copy_kern_t::amxtrans8 (const Ymm &dst1,
95
+ const Ymm &dst2, const Ymm &src1, const Ymm &src2, const Ymm &src3,
96
+ const Ymm &src4) {
96
97
vpunpcklbw (dst1, src1, src2);
97
98
vpunpckhbw (dst2, src1, src2);
98
99
vpunpcklbw (src1, src3, src4);
@@ -107,7 +108,7 @@ void jit_avx512_core_amx_copy_kern::amxtrans8(const Ymm &dst1, const Ymm &dst2,
107
108
vshufi32x4 (src4, dst1, dst2, 0x03 );
108
109
}
109
110
110
- void jit_avx512_core_amx_copy_kern ::amxtrans16 (
111
+ void jit_avx512_core_amx_copy_kern_t ::amxtrans16 (
111
112
const Ymm &dst1, const Ymm &dst2, const Ymm &src1, const Ymm &src2) {
112
113
vpunpcklwd (dst1, src1, src2);
113
114
vpunpckhwd (dst2, src1, src2);
@@ -117,7 +118,7 @@ void jit_avx512_core_amx_copy_kern::amxtrans16(
117
118
vshufi32x4 (src2, src2, src2, 0xd8 );
118
119
}
119
120
120
- void jit_avx512_core_amx_copy_kern ::load (
121
+ void jit_avx512_core_amx_copy_kern_t ::load (
121
122
const Xmm &dst, const Address &src, bool corner) {
122
123
if (!corner && isize_ == 1 )
123
124
vmovdqu8 (dst, src);
@@ -129,14 +130,15 @@ void jit_avx512_core_amx_copy_kern::load(
129
130
vmovdqu16 (dst | k1 | T_z, src);
130
131
}
131
132
132
- void jit_avx512_core_amx_copy_kern::store (const Address &dst, const Xmm &src) {
133
+ void jit_avx512_core_amx_copy_kern_t::store (
134
+ const Address &dst, const Xmm &src) {
133
135
if (size_ == 1 )
134
136
vmovdqu8 (dst, src);
135
137
else
136
138
vmovdqu16 (dst, src);
137
139
}
138
140
139
- void jit_avx512_core_amx_copy_kern ::kernel_AN (
141
+ void jit_avx512_core_amx_copy_kern_t ::kernel_AN (
140
142
int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
141
143
// Transpose data.
142
144
int u[] = {32 , 16 , 8 , 4 };
@@ -170,7 +172,7 @@ void jit_avx512_core_amx_copy_kern::kernel_AN(
170
172
}
171
173
}
172
174
173
- void jit_avx512_core_amx_copy_kern ::kernel_BN (
175
+ void jit_avx512_core_amx_copy_kern_t ::kernel_BN (
174
176
int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
175
177
// Store data.
176
178
for (int i = 0 ; i < 16 ; i++)
@@ -179,7 +181,7 @@ void jit_avx512_core_amx_copy_kern::kernel_BN(
179
181
src_[i]);
180
182
}
181
183
182
- void jit_avx512_core_amx_copy_kern ::kernel_AT (
184
+ void jit_avx512_core_amx_copy_kern_t ::kernel_AT (
183
185
int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
184
186
Ymm v[16 ];
185
187
@@ -258,7 +260,7 @@ void jit_avx512_core_amx_copy_kern::kernel_AT(
258
260
}
259
261
}
260
262
261
- void jit_avx512_core_amx_copy_kern ::kernel_BT (
263
+ void jit_avx512_core_amx_copy_kern_t ::kernel_BT (
262
264
int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
263
265
// Transpose data.
264
266
int u[] = {16 , 8 , 4 , 2 , 1 };
@@ -297,7 +299,7 @@ void jit_avx512_core_amx_copy_kern::kernel_BT(
297
299
L (store_end);
298
300
}
299
301
300
- void jit_avx512_core_amx_copy_kern ::kernel (
302
+ void jit_avx512_core_amx_copy_kern_t ::kernel (
301
303
int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
302
304
303
305
// Load matrix.
@@ -326,7 +328,7 @@ void jit_avx512_core_amx_copy_kern::kernel(
326
328
kernel_BT (unroll_x, unroll_y, step, A, B, corner);
327
329
}
328
330
329
- void jit_avx512_core_amx_copy_kern ::copy_m (int unroll_m, int unroll_n) {
331
+ void jit_avx512_core_amx_copy_kern_t ::copy_m (int unroll_m, int unroll_n) {
330
332
if (is_trans_) {
331
333
mov (B1_, B_);
332
334
add (B_, unroll_m * unroll_n * size_);
@@ -378,7 +380,7 @@ void jit_avx512_core_amx_copy_kern::copy_m(int unroll_m, int unroll_n) {
378
380
L_aligned (kernel_tail_end);
379
381
}
380
382
381
- void jit_avx512_core_amx_copy_kern ::copy_ns (int unroll_n, Label &epilogue) {
383
+ void jit_avx512_core_amx_copy_kern_t ::copy_ns (int unroll_n, Label &epilogue) {
382
384
if (unroll_n > 0 ) {
383
385
copy_ns (unroll_n - 1 , epilogue);
384
386
@@ -393,7 +395,7 @@ void jit_avx512_core_amx_copy_kern::copy_ns(int unroll_n, Label &epilogue) {
393
395
}
394
396
}
395
397
396
- void jit_avx512_core_amx_copy_kern ::copy_n (int unroll_n, Label &epilogue) {
398
+ void jit_avx512_core_amx_copy_kern_t ::copy_n (int unroll_n, Label &epilogue) {
397
399
398
400
Label copy_m_loop, copy_m_end;
399
401
@@ -422,7 +424,7 @@ void jit_avx512_core_amx_copy_kern::copy_n(int unroll_n, Label &epilogue) {
422
424
copy_ns (unroll_n - 1 , epilogue);
423
425
}
424
426
425
- void jit_avx512_core_amx_copy_kern ::generate () {
427
+ void jit_avx512_core_amx_copy_kern_t ::generate () {
426
428
// Prologue
427
429
preamble ();
428
430
sub (rsp, stack_alloc_size_);
@@ -494,7 +496,7 @@ void jit_avx512_core_amx_copy_kern::generate() {
494
496
postamble ();
495
497
}
496
498
497
- jit_avx512_core_amx_copy_kern::jit_avx512_core_amx_copy_kern (
499
+ jit_avx512_core_amx_copy_kern_t::jit_avx512_core_amx_copy_kern_t (
498
500
bool is_a, bool is_trans, int isize)
499
501
: jit_generator_t (jit_name())
500
502
, is_a_(is_a)
0 commit comments