@@ -200,6 +200,34 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible {
200
200
}
201
201
}
202
202
203
+ inline void push (const std::vector<Xbyak::Xmm> &xmms) {
204
+ std::vector<std::function<void ()>> deferred_movs{};
205
+ size_t offset = 0 ;
206
+ for (size_t i = 0 ; i < xmms.size (); ++i) {
207
+ const auto & xmm = xmms[i];
208
+ if (xmm.isXMM ()) {
209
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
210
+ uni_vmovdqu (ptr[rsp + offset], xmm);
211
+ });
212
+ offset += xmm_len;
213
+ } else if (xmm.isYMM ()) {
214
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
215
+ uni_vmovdqu (ptr[rsp + offset], Xbyak::Ymm{xmm.getIdx ()});
216
+ });
217
+ offset += ymm_len;
218
+ } else if (xmm.isZMM ()) {
219
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
220
+ uni_vmovdqu (ptr[rsp + offset], Xbyak::Zmm{xmm.getIdx ()});
221
+ });
222
+ offset += zmm_len;
223
+ }
224
+ }
225
+ sub (rsp, offset);
226
+ for (const auto & def_mov : deferred_movs) {
227
+ def_mov ();
228
+ }
229
+ }
230
+
203
231
inline void pop (const Xbyak::Xmm &xmm) {
204
232
if (xmm.isXMM ()) {
205
233
uni_vmovdqu (xmm, ptr[rsp]);
@@ -213,6 +241,34 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible {
213
241
}
214
242
}
215
243
244
+ inline void pop (const std::vector<Xbyak::Xmm> &xmms) {
245
+ std::vector<std::function<void ()>> deferred_movs{};
246
+ size_t offset = 0 ;
247
+ for (size_t i = 0 ; i < xmms.size (); ++i) {
248
+ const auto & xmm = xmms[i];
249
+ if (xmm.isXMM ()) {
250
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
251
+ uni_vmovdqu (xmm, ptr[rsp + offset]);
252
+ });
253
+ offset += xmm_len;
254
+ } else if (xmm.isYMM ()) {
255
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
256
+ uni_vmovdqu (Xbyak::Ymm{xmm.getIdx ()}, ptr[rsp + offset]);
257
+ });
258
+ offset += ymm_len;
259
+ } else if (xmm.isZMM ()) {
260
+ deferred_movs.emplace_back ([this , offset, &xmm]() {
261
+ uni_vmovdqu (Xbyak::Zmm{xmm.getIdx ()}, ptr[rsp + offset]);
262
+ });
263
+ offset += zmm_len;
264
+ }
265
+ }
266
+ for (const auto & def_mov : deferred_movs) {
267
+ def_mov ();
268
+ }
269
+ add (rsp, offset);
270
+ }
271
+
216
272
void preamble () {
217
273
if (xmm_to_preserve) {
218
274
sub (rsp, xmm_to_preserve * xmm_len);
0 commit comments