[Op][PT FE] Enable ISTFT for PyTorch Frontend (#28743)

mitruska · mmikolajcz · web-flow · commit d6147c2f95e5 · 2025-02-28T18:08:58.000Z
### Details:

 - Enable ISTFT for the PyTorch Frontend
 - Adjust shape_infer for the case where frame_size is odd (it is rounded
   down to an even value)
  

 

### Tickets:
 - 159383

---------

Co-authored-by: Mateusz Mikolajczyk <mateusz.mikolajczyk@intel.com>
diff --git a/src/core/reference/src/op/istft.cpp b/src/core/reference/src/op/istft.cpp
@@ -33,7 +33,8 @@ void istft(const float* in_data,
     const auto num_frames = data_shape[frames_axis];
 
     const auto signal_length = (num_frames - 1) * frame_step + frame_size;
-    const int64_t final_signal_length = length > 0 ? length : (center ? (signal_length - frame_size) : signal_length);
+    const int64_t final_signal_length =
+        length > 0 ? length : (center ? (signal_length - (frame_size & ~1)) : signal_length);
     std::fill(final_result, final_result + batch_size * final_signal_length, 0.f);
 
     std::vector<float> mid_result(batch_size * signal_length, 0.f);
diff --git a/src/core/shape_inference/include/istft_shape_inference.hpp b/src/core/shape_inference/include/istft_shape_inference.hpp
@@ -111,9 +111,9 @@ std::vector<TRShape> shape_infer(const ISTFT* op,
 
         const int64_t frames_axis = 1 + (is_data_3D ? 0 : 1);
         const TDim& num_frames_dim = data_shape[frames_axis];
-        TDim signal_length = (num_frames_dim - 1) * frame_step_val;
-        if (!op->get_center()) {
-            signal_length += frame_size_val;
+        TDim signal_length = (num_frames_dim - 1) * frame_step_val + frame_size_val;
+        if (op->get_center()) {
+            signal_length = signal_length - (frame_size_val & ~1);
         }
         output_shapes[0][0] = std::move(signal_length);
     }
diff --git a/src/core/tests/type_prop/istft.cpp b/src/core/tests/type_prop/istft.cpp
@@ -135,7 +135,7 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple(PartialShape{1, 48}, PartialShape{16}, 16, 16, false, PartialShape{1, 9, 3, 2}),
         std::make_tuple(PartialShape{2, 48}, PartialShape{8}, 16, 4, false, PartialShape{2, 9, 9, 2}),
         std::make_tuple(PartialShape{2, 9}, PartialShape{5}, 9, 100, false, PartialShape{2, 5, 1, 2}),
-        std::make_tuple(PartialShape{2, 0}, PartialShape{5}, 9, 100, true, PartialShape{2, 5, 1, 2}),
+        std::make_tuple(PartialShape{2, 1}, PartialShape{5}, 9, 100, true, PartialShape{2, 5, 1, 2}),
         std::make_tuple(PartialShape{4, 47},
                         PartialShape{7},
                         11,
@@ -151,7 +151,7 @@ INSTANTIATE_TEST_SUITE_P(
                         3,
                         false,
                         PartialShape{{2, 4}, 6, {1, -1}, 2}),
-        std::make_tuple(PartialShape{{2, 4}, {-1, -1}},
+        std::make_tuple(PartialShape{{2, 4}, {1, -1}},
                         PartialShape{7},
                         11,
                         3,
diff --git a/src/frontends/pytorch/src/op/istft.cpp b/src/frontends/pytorch/src/op/istft.cpp
@@ -0,0 +1,106 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/op/istft.hpp"
+
+#include "openvino/frontend/complex_type_mark.hpp"
+#include "openvino/frontend/pytorch/node_context.hpp"
+#include "openvino/op/broadcast.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/convert_like.hpp"
+#include "openvino/op/divide.hpp"
+#include "openvino/op/unsqueeze.hpp"
+#include "utils.hpp"
+
+namespace ov {
+namespace frontend {
+namespace pytorch {
+namespace op {
+
+using namespace ov::op;
+
+OutputVector translate_istft(const NodeContext& context) {  // Translates aten::istft into an ov::op::v16::ISTFT node.
+    // aten::istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool
+    // center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False)
+    num_inputs_check(context, 2, 10, true);
+
+    auto input = context.get_input(0);
+    auto complex_type_mark = as_type_ptr<ComplexTypeMark>(input.get_node_shared_ptr());
+    if (complex_type_mark) {  // Unwrap the complex marker and operate on the value it wraps.
+        input = complex_type_mark->input_value(0);
+    }
+
+    auto n_fft = context.get_input(1);
+
+    ov::Output<ov::Node> hop_length;
+    if (!context.input_is_none(2)) {
+        hop_length = context.get_input(2);
+    } else {
+        // Default: floor(n_fft / 4); the constant 4 is first converted to n_fft's element type.
+        const auto four = context.mark_node(std::make_shared<ov::op::v0::Constant>(ov::element::i32, Shape{}, 4));
+        const auto four_cast = context.mark_node(std::make_shared<ov::op::v1::ConvertLike>(four, n_fft));
+        hop_length = context.mark_node(std::make_shared<ov::op::v1::Divide>(n_fft, four_cast));
+    }
+
+    ov::Output<ov::Node> win_length;
+    if (!context.input_is_none(3)) {
+        win_length = context.get_input(3);
+    } else {  // win_length=None: fall back to n_fft.
+        win_length = n_fft;
+    }
+
+    ov::Output<ov::Node> window;
+    if (!context.input_is_none(4)) {
+        window = context.get_input(4);
+    } else {  // window=None: build a window of ones, typed like the input, with win_length elements.
+        const auto one = context.mark_node(std::make_shared<ov::op::v0::Constant>(ov::element::i32, Shape{}, 1));
+        const auto one_cast = context.mark_node(std::make_shared<ov::op::v1::ConvertLike>(one, input));
+        const auto zero = context.mark_node(std::make_shared<ov::op::v0::Constant>(ov::element::i32, Shape{1}, 0));
+        const auto win_length_cast =
+            context.mark_node(std::make_shared<ov::op::v0::Convert>(win_length, ov::element::i64));
+        const auto win_len_vec = context.mark_node(std::make_shared<ov::op::v0::Unsqueeze>(win_length_cast, zero));
+        window = context.mark_node(std::make_shared<ov::op::v3::Broadcast>(one_cast, win_len_vec));
+    }
+
+    bool center = true;
+    if (!context.input_is_none(5)) {
+        center = context.const_input<bool>(5);
+    }
+
+    bool normalized = false;
+    if (!context.input_is_none(6)) {
+        normalized = context.const_input<bool>(6);
+    }
+
+    bool onesided = true;  // Value used when the onesided input is None.
+    if (!context.input_is_none(7)) {
+        onesided = context.const_input<bool>(7);
+    }
+    PYTORCH_OP_CONVERSION_CHECK(onesided, "aten::istft conversion is currently supported with onesided=True only.");
+
+    bool return_complex = false;
+    if (!context.input_is_none(9)) {
+        return_complex = context.const_input<bool>(9);
+    }
+
+    // Build the ISTFT node; input 8 (length) selects between the two constructor overloads.
+    ov::Output<ov::Node> istft;
+    if (context.input_is_none(8)) {  // No explicit length: construct ISTFT without a signal_length input.
+        istft = context.mark_node(std::make_shared<v16::ISTFT>(input, window, n_fft, hop_length, center, normalized));
+    } else {
+        auto signal_length = context.get_input(8);
+        istft = context.mark_node(
+            std::make_shared<v16::ISTFT>(input, window, n_fft, hop_length, signal_length, center, normalized));
+    }
+
+    if (return_complex) {  // Wrap the result in ComplexTypeMark, carrying the ISTFT output element type.
+        return {context.mark_node(std::make_shared<ComplexTypeMark>(istft, istft.get_element_type()))};
+    } else {
+        return {istft};
+    }
+};
+}  // namespace op
+}  // namespace pytorch
+}  // namespace frontend
+}  // namespace ov
diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp
@@ -125,6 +125,7 @@ OP_CONVERTER(translate_index_select);
 OP_CONVERTER(translate_instance_norm);
 OP_CONVERTER(translate_int);
 OP_CONVERTER(translate_inverse);
+OP_CONVERTER(translate_istft);
 OP_CONVERTER(translate_is_nonzero);
 OP_CONVERTER(translate_layer_norm);
 OP_CONVERTER(translate_len);
@@ -523,6 +524,7 @@ const std::unordered_map<std::string, CreatorFunction> get_supported_ops_ts() {
         {"aten::Int", op::translate_int},
         {"aten::IntImplicit", op::translate_int},
         {"aten::is_grad_enabled", op::return_false_scalar},
+        {"aten::istft", op::translate_istft},
         {"aten::is_nonzero", op::translate_is_nonzero},
         {"aten::isfinite", op::translate_1to1_match_1_inputs<opset10::IsFinite>},
         {"aten::isinf", op::translate_1to1_match_1_inputs<opset10::IsInf>},
diff --git a/tests/layer_tests/pytorch_tests/test_istft.py b/tests/layer_tests/pytorch_tests/test_istft.py

Original file line number	Diff line number	Diff line change
`@@ -111,9 +111,9 @@ std::vector<TRShape> shape_infer(const ISTFT* op,`
`111`	`111`
`112`	`112`	`const int64_t frames_axis = 1 + (is_data_3D ? 0 : 1);`
`113`	`113`	`const TDim& num_frames_dim = data_shape[frames_axis];`
`114`		`- TDim signal_length = (num_frames_dim - 1) * frame_step_val;`
`115`		`- if (!op->get_center()) {`
`116`		`- signal_length += frame_size_val;`
	`114`	`+ TDim signal_length = (num_frames_dim - 1) * frame_step_val + frame_size_val;`
	`115`	`+ if (op->get_center()) {`
	`116`	`+ signal_length = signal_length - (frame_size_val & ~1);`
`117`	`117`	`}`
`118`	`118`	`output_shapes[0][0] = std::move(signal_length);`
`119`	`119`	`}`