46
46
#include " graph/backend/dnnl/fusion_info.hpp"
47
47
#include " graph/backend/dnnl/internal_attrs.hpp"
48
48
49
+ #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
50
+ && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
51
+
52
+ #include " gpu/intel/compute/compute_engine.hpp"
53
+ #include " gpu/intel/compute/compute_stream.hpp"
54
+ #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
55
+ #include " gpu/intel/ocl/stream.hpp"
56
+ #endif
57
+
58
+ #ifdef DNNL_WITH_SYCL
59
+ #include " gpu/intel/sycl/stream.hpp"
60
+ #endif
61
+
62
+ #endif
49
63
namespace dnnl {
50
64
namespace impl {
51
65
namespace graph {
@@ -2467,19 +2481,17 @@ struct groupnorm_executable_t : public op_executable_t {
2467
2481
dnnl::group_normalization_forward prim_;
2468
2482
};
2469
2483
2484
// genindex_executable_t below uses the Intel GPU compute helpers
// (`compute::...`, `sycl::stream_t`) unqualified. Their headers are only
// included by this file for Intel GPU builds, so this guard carries the same
// runtime + vendor condition as that include block and as every use site.
#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
        && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
using namespace dnnl::impl::gpu::intel;
// Number of D<i>/S<i> (dimension/stride) build-time constants that the
// genindex constructor defines for the "gen_index" kernel.
#define MAX_NDIMS 6
#endif
2470
2488
struct genindex_executable_t : public op_executable_t {
2471
2489
DECLARE_ARG_INDICES_GETTER;
2472
2490
2473
2491
genindex_executable_t (std::shared_ptr<op_t > &op,
2474
2492
const dnnl::engine &p_engine, fusion_info_mgr_t &mgr,
2475
2493
pd_cache_t &pd_cache) {
2476
- if (p_engine.get_kind () == engine::kind::gpu) {
2477
- assertm (false ,
2478
- " genindex opexcutable is unimplemented "
2479
- " under SYCL and OCL "
2480
- " runtime!" );
2481
- throw std::runtime_error (" Unimplement" );
2482
- }
2494
+
2483
2495
using ltw = logical_tensor_wrapper_t ;
2484
2496
const auto &input_lt = op->get_input_value (0 )->get_logical_tensor ();
2485
2497
nelems_ = ltw (input_lt).nelems ();
@@ -2490,6 +2502,26 @@ struct genindex_executable_t : public op_executable_t {
2490
2502
output_dims_[i] = output_lt.dims [i];
2491
2503
output_strides_[i] = output_lt.layout .strides [i];
2492
2504
}
2505
+ #if DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE \
2506
+ && DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
2507
+ if (p_engine.get_kind () == engine::kind::gpu) {
2508
+ compute::kernel_ctx_t kernel_ctx;
2509
+ kernel_ctx.define_int (" NDIMS" , ndims_);
2510
+ for (int d = 0 ; d < MAX_NDIMS; ++d) {
2511
+ dim_t dim = (d < ndims_) ? output_dims_[d] : 1 ;
2512
+ dim_t stride = (d < ndims_) ? output_strides_[d] : 0 ;
2513
+ kernel_ctx.define_int (dnnl::impl::utils::format (" D%d" , d), dim);
2514
+ kernel_ctx.define_int (
2515
+ dnnl::impl::utils::format (" S%d" , d), stride);
2516
+ }
2517
+ auto *compute_engine
2518
+ = dnnl::impl::utils::downcast<compute::compute_engine_t *>(
2519
+ p_engine.get ());
2520
+ std::vector<compute::kernel_t > kernels (1 );
2521
+ compute_engine->create_kernels (&kernels, {" gen_index" }, kernel_ctx);
2522
+ kernel_ = kernels[0 ];
2523
+ }
2524
+ #endif
2493
2525
}
2494
2526
2495
2527
void execute (const stream &stream,
@@ -2498,26 +2530,97 @@ struct genindex_executable_t : public op_executable_t {
2498
2530
#ifdef DNNL_WITH_SYCL
2499
2531
::sycl::event execute_sycl (const stream &stream,
2500
2532
const std::unordered_map<int , memory> &args,
2501
- const std::vector<::sycl::event> &deps) const override {
2502
- execute (stream, args);
2503
- return {};
2533
+ const std::vector<::sycl::event> &deps = {}) const override {
2534
+ #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
2535
+ && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
2536
+ auto compute_stream
2537
+ = dnnl::impl::utils::downcast<compute::compute_stream_t *>(
2538
+ stream.get ());
2539
+ compute::range_t gws = {static_cast <size_t >(nelems_)};
2540
+ auto nd_range = compute::nd_range_t (gws);
2541
+ compute::kernel_arg_list_t arg_list;
2542
+ const auto &dst = *(args.at (DNNL_ARG_DST).get ()->memory_storage ());
2543
+ arg_list.set (0 , dst);
2544
+ arg_list.set (1 , axis_);
2545
+ auto *sycl_stream
2546
+ = dnnl::impl::utils::downcast<sycl::stream_t *>(compute_stream);
2547
+ sycl_stream->before_exec_hook ();
2548
+ if (!deps.empty ()) sycl_stream->sycl_ctx ().set_deps (deps);
2549
+
2550
+ kernel_.parallel_for (*compute_stream, nd_range, arg_list,
2551
+ sycl_stream->sycl_ctx ().get_deps (),
2552
+ sycl_stream->sycl_ctx ().get_deps ());
2553
+ auto return_event = sycl_stream->get_output_event ();
2554
+
2555
+ sycl_stream->after_exec_hook ();
2556
+ return return_event;
2557
+ #else
2558
+ assertm (false ,
2559
+ " genindex opexcutable is only implemented for intel vendor "
2560
+ " under SYCL runtime " );
2561
+ throw std::runtime_error (" Unimplement" );
2562
+ #endif
2504
2563
}
2505
2564
#endif
2506
2565
2507
2566
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
2508
2567
cl_event execute_ocl (const stream &stream,
2509
2568
const std::unordered_map<int , memory> &args,
2510
- const std::vector<cl_event> &deps) const override {
2569
+ const std::vector<cl_event> &deps = {}) const override {
2570
+ #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
2571
+ auto compute_stream
2572
+ = dnnl::impl::utils::downcast<compute::compute_stream_t *>(
2573
+ stream.get ());
2574
+
2575
+ compute::range_t gws = {static_cast <size_t >(nelems_)};
2576
+
2577
+ auto nd_range = compute::nd_range_t (gws);
2578
+ compute::kernel_arg_list_t arg_list;
2579
+ const auto &dst = *(args.at (DNNL_ARG_DST).get ()->memory_storage ());
2580
+ arg_list.set (0 , dst);
2581
+ arg_list.set (1 , axis_);
2582
+ auto *ocl_stream
2583
+ = dnnl::impl::utils::downcast<gpu::intel::ocl::stream_t *>(
2584
+ compute_stream);
2585
+
2586
+ ocl_stream->before_exec_hook ();
2587
+
2588
+ if (!deps.empty ()) {
2589
+ std::vector<xpu::ocl::wrapper_t <cl_event>> events (deps.size ());
2590
+ for (size_t i = 0 ; i < deps.size (); i++)
2591
+ events[i] = xpu::ocl::wrapper_t <cl_event>(deps[i], true );
2592
+ ocl_stream->ocl_ctx ().set_deps (events);
2593
+ }
2594
+
2595
+ kernel_.parallel_for (*compute_stream, nd_range, arg_list,
2596
+ compute_stream->ctx ().get_deps (),
2597
+ compute_stream->ctx ().get_deps ());
2598
+
2599
+ cl_event return_event = nullptr ;
2600
+ if ((ocl_stream->flags () & stream_flags::in_order) == 0 ) {
2601
+ auto last = ocl_stream->get_output_event ();
2602
+ return_event = last.release ();
2603
+ }
2604
+
2605
+ ocl_stream->after_exec_hook ();
2606
+ return return_event;
2607
+ #else
2511
2608
assertm (false ,
2512
- " genindex op excutable is unimplemented "
2513
- " under OCL runtime!" );
2514
- return {};
2609
+ " genindex opexcutable is only implemented for intel vendor "
2610
+ " under OCL runtime " );
2611
+ throw std::runtime_error (" Unimplement" );
2612
+ #endif
2515
2613
}
2516
2614
#endif
2517
2615
2518
2616
private:
2519
2617
int axis_, nelems_, ndims_;
2520
2618
dims_t output_dims_, output_strides_;
2619
+
2620
+ #if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
2621
+ && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
2622
+ compute::kernel_t kernel_;
2623
+ #endif
2521
2624
};
2522
2625
2523
2626
} // namespace dnnl_impl
0 commit comments