@@ -8,6 +8,13 @@ namespace ov {
8
8
namespace intel_cpu {
9
9
namespace aarch64 {
10
10
11
+ // In aarch64, conversion between f16 and i16/u16 can be done with a single instruction. The supported
12
+ // conversion precisions are f32, i32, f16, i8 (byte), u8 (byte). If we introduce an intermediate
13
+ // precision i16/u16 (dbyte) in the following graph, then the conversion between each pair of
14
+ // neighbors in this graph will be done with a single instruction.
15
+ // f16 - f32 - i32 - dbyte - byte
16
+ // |                   |
17
+ // - - - - - - - - - - -
11
18
template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
12
19
void cvt_f16_to_f32 (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs) {
13
20
using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
@@ -41,37 +48,83 @@ void cvt_i32_to_f32(dnnl::impl::cpu::aarch64::jit_generator* h, const std::vecto
41
48
}
42
49
43
50
// Emits a single narrowing instruction that converts the four 32-bit integer lanes (s4) of the
// source vector register into four 16-bit lanes (h4) of the destination register.
// In this revision the function is renamed from cvt_i32_to_byte to cvt_i32_to_dbyte and the
// second narrowing step (h8 -> b8) is removed, so the result stays at 16 bits per lane.
//   h            - JIT generator the instruction is emitted into.
//   in_idxs[0]   - index of the source vector register.
//   out_idxs[0]  - index of the destination vector register.
//   is_signed    - with saturation: sqxtn (signed) vs. uqxtn (unsigned); unused otherwise.
//   is_saturated - true: saturating narrow; false: plain xtn, which keeps only the low half
//                  of every lane.
template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
44
- void cvt_i32_to_byte (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
51
+ void cvt_i32_to_dbyte (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
45
52
bool is_signed, bool is_saturated) {
46
53
using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
47
54
TReg src = TReg (in_idxs[0 ]);
48
55
TReg dst = TReg (out_idxs[0 ]);
49
56
if (is_saturated) {
50
57
if (is_signed) {
51
58
h->sqxtn (dst.h4 , src.s4 );
52
- h->sqxtn (dst.b8 , dst.h8 );
53
59
} else {
54
60
h->uqxtn (dst.h4 , src.s4 );
55
- h->uqxtn (dst.b8 , dst.h8 );
56
61
}
57
62
} else {
58
63
// Non-saturating path: the same instruction serves signed and unsigned data.
h->xtn (dst.h4 , src.s4 );
59
- h->xtn (dst.b8 , dst.h8 );
60
64
}
61
65
}
62
66
63
67
// Emits a single widening instruction that converts four 16-bit integer lanes (h4) of the
// source vector register into four 32-bit lanes (s4) of the destination register.
// In this revision the function is renamed from cvt_byte_to_i32 to cvt_dbyte_to_i32; the
// byte -> dbyte widening step now lives in cvt_byte_to_dbyte.
//   h           - JIT generator the instruction is emitted into.
//   in_idxs[0]  - index of the source vector register.
//   out_idxs[0] - index of the destination vector register.
//   is_signed   - true: sxtl (sign-extend); false: uxtl (zero-extend).
template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
64
- void cvt_byte_to_i32 (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
68
+ void cvt_dbyte_to_i32 (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
65
69
bool is_signed) {
66
70
using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
67
71
TReg src = TReg (in_idxs[0 ]);
68
72
TReg dst = TReg (out_idxs[0 ]);
73
+ if (is_signed) {
74
+ h->sxtl (dst.s4 , src.h4 );
75
+ } else {
76
+ h->uxtl (dst.s4 , src.h4 );
77
+ }
78
+ }
79
+
80
// Emits a single fcvtzs (floating-point to signed integer, rounding toward zero) on the
// half-word view of the source and destination vector registers.
//   h           - JIT generator the instruction is emitted into.
//   in_idxs[0]  - index of the source vector register.
//   out_idxs[0] - index of the destination vector register.
// NOTE(review): there is no is_signed parameter, so the signed conversion is always used;
// confirm how callers handle an unsigned 16-bit target.
// NOTE(review): the operands are spelled `.h`, while the integer helpers in this file name an
// explicit lane arrangement (h4/h8) - confirm this is the intended operand form.
+ template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
81
+ void cvt_f16_to_dbyte (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs) {
82
+ using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
83
+ TReg src = TReg (in_idxs[0 ]);
84
+ TReg dst = TReg (out_idxs[0 ]);
85
+ h->fcvtzs (dst.h , src.h );
86
+ }
87
+
88
// Emits a single integer-to-floating-point conversion on the half-word view of the source
// and destination vector registers.
//   h           - JIT generator the instruction is emitted into.
//   in_idxs[0]  - index of the source vector register.
//   out_idxs[0] - index of the destination vector register.
//   is_signed   - true: scvtf (source treated as signed); false: ucvtf (source treated as
//                 unsigned).
+ template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
89
+ void cvt_dbyte_to_f16 (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
90
+ bool is_signed) {
91
+ using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
92
+ TReg src = TReg (in_idxs[0 ]);
93
+ TReg dst = TReg (out_idxs[0 ]);
94
+ if (is_signed) {
95
+ h->scvtf (dst.h , src.h );
96
+ } else {
97
+ h->ucvtf (dst.h , src.h );
98
+ }
99
+ }
100
+
101
// Emits a single narrowing instruction that converts eight 16-bit integer lanes (h8) of the
// source vector register into eight 8-bit lanes (b8) of the destination register.
//   h            - JIT generator the instruction is emitted into.
//   in_idxs[0]   - index of the source vector register.
//   out_idxs[0]  - index of the destination vector register.
//   is_signed    - with saturation: sqxtn (signed) vs. uqxtn (unsigned); unused otherwise.
//   is_saturated - true: saturating narrow; false: plain xtn, which keeps only the low half
//                  of every lane.
+ template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
102
+ void cvt_dbyte_to_byte (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
103
+ bool is_signed, bool is_saturated) {
104
+ using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
105
+ TReg src = TReg (in_idxs[0 ]);
106
+ TReg dst = TReg (out_idxs[0 ]);
107
+ if (is_saturated) {
108
+ if (is_signed) {
109
+ h->sqxtn (dst.b8 , src.h8 );
110
+ } else {
111
+ h->uqxtn (dst.b8 , src.h8 );
112
+ }
113
+ } else {
114
+ h->xtn (dst.b8 , src.h8 );
115
+ }
116
+ }
117
+
118
// Emits a single widening instruction that converts eight 8-bit integer lanes (b8) of the
// source vector register into eight 16-bit lanes (h8) of the destination register.
// The second widening step (h4 -> s4) that the previous cvt_byte_to_i32 performed here is
// removed in this revision; cvt_dbyte_to_i32 covers it.
//   h           - JIT generator the instruction is emitted into.
//   in_idxs[0]  - index of the source vector register.
//   out_idxs[0] - index of the destination vector register.
//   is_signed   - true: sxtl (sign-extend); false: uxtl (zero-extend).
+ template <dnnl::impl::cpu::aarch64::cpu_isa_t isa>
119
+ void cvt_byte_to_dbyte (dnnl::impl::cpu::aarch64::jit_generator* h, const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
120
+ bool is_signed) {
121
+ using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits<isa>::TReg;
122
+ TReg src = TReg (in_idxs[0 ]);
123
+ TReg dst = TReg (out_idxs[0 ]);
69
124
if (is_signed) {
70
125
h->sxtl (dst.h8 , src.b8 );
71
- h->sxtl (dst.s4 , dst.h4 );
72
126
} else {
73
127
h->uxtl (dst.h8 , src.b8 );
74
- h->uxtl (dst.s4 , dst.h4 );
75
128
}
76
129
}
77
130
@@ -87,13 +140,28 @@ template void cvt_f32_to_i32<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::a
87
140
// Explicit instantiations of the conversion helpers for the asimd ISA.
// This revision replaces the cvt_i32_to_byte / cvt_byte_to_i32 instantiations with one
// instantiation per single-step conversion: i32 <-> dbyte, f16 <-> dbyte and dbyte <-> byte.
// Note: the saturation flag is spelled `is_saturation` here but `is_saturated` in the
// definitions; parameter names in an explicit instantiation do not have to match.
template void cvt_i32_to_f32<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
88
141
const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs);
89
142
90
- template void cvt_i32_to_byte<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
91
- const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
92
- bool is_signed, bool is_saturation);
143
+ template void cvt_i32_to_dbyte<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
144
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
145
+ bool is_signed, bool is_saturation);
146
+
147
+ template void cvt_dbyte_to_i32<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
148
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
149
+ bool is_signed);
150
+
151
+ template void cvt_f16_to_dbyte<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
152
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs);
153
+
154
+ template void cvt_dbyte_to_f16<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
155
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
156
+ bool is_signed);
157
+
158
+ template void cvt_dbyte_to_byte<dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
159
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
160
+ bool is_signed, bool is_saturation);
93
161
94
- template void cvt_byte_to_i32 <dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
95
- const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
96
- bool is_signed);
162
+ template void cvt_byte_to_dbyte <dnnl::impl::cpu::aarch64::asimd>(dnnl::impl::cpu::aarch64::jit_generator* h,
163
+ const std::vector<size_t > &in_idxs, const std::vector<size_t > &out_idxs,
164
+ bool is_signed);
97
165
98
166
} // namespace aarch64
99
167
} // namespace intel_cpu
0 commit comments