Skip to content

Commit df03b04

Browse files
authored
[Snippets] Refactored work with Buffers (openvinotoolkit#19644)
[Snippets] BufferManager is not derived from PassPipeline now [Snippets] Added MemorySolver support [Snippets] Made as static class [Snippets] Added one-level inplace support [Snippets] Added optimization bits [Snippets] Small cosmetic fixes [Snippets] Renamed to BufferSolver [Snippets] Refactored [Snippets] Fixed IdendifyBuffers [Snippets] Add inplace multi + identify buffers [Snippets] Made common pass [Snippets] Added PassPipeline::get_pass<>() [Snippets] Added comments, briefs, refactored smth [Snippets] Fixed win build [Snippets] Not allow to have the same Buffer ID for multi level Buffers [Snippets] Moved CleanupRepeatedPtrShifts to common pioeline [Snippets] Made IdentifyBuffers::ShiftPtrParams [Snippets] Fixed window sliding mode [Snippets] Refactored nested clusters [Snippets] Adde normalized buffer regs [Snippets] Not allowed to have the same ID for nested Buffers in IdentifyBuffers [Snippets] Fixed DefineBufferClusters::are_buffer_neighbours::find [Snippets] Removed useless method from InitLoops [Snippets] Fixed CC build [Snippets] Applied Ivan comments [Snippets] Applied Ivan comment: refactored pass classes [Snippets] Applied Vladislav comments [Snippets] Applied Ivan comments 2 [Runtime] Moved MemorySolver to API2.0 [Snippets] Created common buffer allocation pass AllocateBuffers [Snippets][Tests] Added InplaceEltwise unit test [Snippets] fixed NormalizeBufferIDs [Snippets][CPU] Fixed BrgemmBlocking lowered pass: move wsp for AMX to brgemm [Snippets][CPU][Tests] Covered AMX MHA buffer allocation by unit tests
1 parent 6ab5ef7 commit df03b04

29 files changed

+1689
-267
lines changed

src/common/snippets/include/snippets/lowered/linear_ir.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ class Config {
2929
// Minimal advised work amount that should be processed during one call of the executable produced by Subgraph::generate
3030
// Set by a backend, should be large enough to compensate for the kernel call overheads
3131
size_t m_min_kernel_work_amount = 256;
32+
// True if the Buffer scratchpad size of LinearIR will be optimized (all possible optimizations will be activated)
33+
// False if all Buffers will have unique IDs and offsets in the Linear IR
34+
bool m_are_buffers_optimized = true;
3235
};
3336

3437
/* The control flow of Snippets is built on Linear Intermediate Representation (Linear IR).

src/common/snippets/include/snippets/lowered/pass/allocate_buffers.hpp

+29-16
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#pragma once
66

77
#include "pass.hpp"
8-
#include "snippets/snippets_isa.hpp"
98

109
namespace ov {
1110
namespace snippets {
@@ -14,26 +13,40 @@ namespace pass {
1413

1514
/**
1615
* @interface AllocateBuffers
17-
* @brief The pass calculates common size of buffer scratchpad and propagates Buffer offsets to connected MemoryAccess operations.
18-
* Notes:
19-
* - The pass implicitly regulates InPlace processing for some Buffers when it's possible.
20-
* The pass don't allocate new memory for InPlace Buffers, we propagate the same offsets for them.
21-
* - The pass should be splitted into two passes: ProcessInplace (markup of Buffers which can use the same memory)
22-
* and AllocateBuffer (allocate memory for Buffers using MemorySolver which can optimally reuse memory).
16+
* @brief The pass allocates common memory for all Buffers.
17+
* There are two modes: default and optimized allocation. Default allocation (non-optimized) mode sets unique offsets and ID to Buffers.
18+
* Optimized mode allocates memory for Buffer ops using the following optimizations:
19+
* - MemorySolver: helps to solve issue of optimal memory allocation;
20+
* - InPlace: Loop or MemoryAccess ops read from the memory and store data to the same memory if possible
21+
* - Reusing Buffer IDs: Buffers have the same IDs (gpr) in cases when Buffers aren't connected or have the same data ptr shifts
22+
* Note: All buffers are related to each other and represent common buffer scratchpad of Subgraph.
23+
* The buffer scratchpad has one general data pointer. Each buffer has offset relative to the data pointer of buffer scratchpad.
2324
* @ingroup snippets
2425
*/
25-
26-
class AllocateBuffers : public Pass {
26+
class AllocateBuffers: public Pass {
2727
public:
2828
OPENVINO_RTTI("AllocateBuffers", "Pass")
29-
bool run(lowered::LinearIR& linear_ir) override;
30-
31-
size_t get_scratchpad_size() const { return m_buffer_scratchpad_size; }
32-
29+
AllocateBuffers(size_t& buffer_scratchpad_size, bool is_optimized = true);
30+
31+
/**
32+
* @brief Apply the pass to the Linear IR
33+
* @param linear_ir the target Linear IR
34+
* @return status of the pass
35+
*/
36+
bool run(LinearIR& linear_ir) override;
37+
38+
/**
39+
* @brief Sets the offset of the Buffer op and propagates it to the connected memory access ops
40+
* @param buffer_expr expression with Buffer op
41+
* @param offset offset in common buffer scratchpad
42+
*/
43+
static void set_buffer_offset(const ExpressionPtr& buffer_expr, const size_t offset);
44+
45+
using BufferCluster = std::set<ExpressionPtr>;
46+
using BufferClusters = std::vector<BufferCluster>;
3347
private:
34-
static void propagate_offset(const LinearIR& linear_ir, const ExpressionPtr& buffer_expr, size_t offset);
35-
36-
size_t m_buffer_scratchpad_size = 0;
48+
size_t& m_buffer_scratchpad_size;
49+
bool m_is_optimized_mode = true;
3750
};
3851

3952
} // namespace pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "pass.hpp"
8+
9+
#include "allocate_buffers.hpp"
10+
11+
namespace ov {
12+
namespace snippets {
13+
namespace lowered {
14+
namespace pass {
15+
16+
/**
17+
* @interface DefineBufferClusters
18+
* @brief The pass defines buffer clusters. The buffers from one cluster share the
19+
* same memory (have the same offset relative to the data pointer of buffer scratchpad).
20+
* - If a MemoryAccess op or Loop can read from and write to the same memory (inplace behavior), the Buffers should be in one cluster.
21+
* - If a Buffer is inside a Loop which reads from or writes to other Buffers, this Buffer can emulate `window` sliding.
22+
* It means that Buffer inside can reuse memory of Buffers outside in bounds of full Loop work.
23+
* Demonstration:
24+
* |-----------------------------------------------------|
25+
* | |------------| |------------| | InnerLoops have work amount 128
26+
* Buffer0 [3x128]-> | | InnerLoop0 | -> Buffer1 [3x128] -> | InnerLoop1 | | -> Buffer2 [3x128] OuterLoop has work amount 3
27+
* | |------------| OuterLoop |------------| |
28+
* |-----------------------------------------------------|
29+
* Buffer1 can reuse memory [128] of Buffer0 or Buffer2 in each iteration of OuterLoop
30+
* Note: The pass requires expression enumeration and buffer identification (for nested Buffers inplace).
31+
* These passes should be executed separately before this pass!
32+
* @ingroup snippets
33+
*/
34+
class DefineBufferClusters : public Pass {
35+
public:
36+
OPENVINO_RTTI("DefineBufferClusters", "Pass")
37+
38+
DefineBufferClusters(AllocateBuffers::BufferClusters& clusters) : m_clusters(clusters) {}
39+
40+
/**
41+
* @brief Apply the pass to the Linear IR
42+
* @param linear_ir the target Linear IR
43+
* @return status of the pass
44+
*/
45+
bool run(lowered::LinearIR& linear_ir) override;
46+
47+
private:
48+
using BufferPorts = std::unordered_map<ExpressionPtr, std::set<size_t>>;
49+
/**
50+
* @brief Finds Buffer cluster in set of clusters which contains the target expression with Buffer
51+
* @param target target expression with Buffer op
52+
* @return vector iterator which refers to the found cluster
53+
*/
54+
AllocateBuffers::BufferClusters::iterator find_cluster_by_expr(const ExpressionPtr& target);
55+
/**
56+
* @brief Returns True if Buffer is a direct source for the target expr (there are no other Loops between the Buffer and the target expr)
57+
* @param buffer_expr expression with assumed Buffer op
58+
* @param target_expr expression with target op - LoopEnd or MemoryAccess op
59+
* @return boolean value
60+
*/
61+
bool is_direct_buffer(const ExpressionPtr& buffer_expr, const ExpressionPtr& target_expr) const;
62+
/**
63+
* @brief Creates a new buffer cluster if buffer_expr is missing from the clusters. If buffer_expr is already in the clusters, does nothing
64+
* @param buffer_expr expression with Buffer op
65+
*/
66+
void create_new_cluster(const ExpressionPtr& buffer_expr);
67+
/**
68+
* @brief Returns the common ID of the cluster if all Buffers inside have the same Buffer ID. Otherwise returns the default value SIZE_MAX
69+
* that means that Buffers in cluster have different IDs.
70+
* @param cluster set of Buffer expressions - cluster
71+
* @return common buffer ID or SIZE_MAX - size value
72+
*/
73+
size_t get_cluster_buffer_id(const AllocateBuffers::BufferCluster& cluster) const;
74+
75+
/**
76+
* @brief Analyzes Loop: if Loop has Buffer ops on inputs and outputs, Loop can read and write from/to the same memory.
77+
* @param expr_it iterator of Linear IR which refers to the expression with LoopEnd
78+
*/
79+
void parse_loop(const LinearIR::constExprIt& expr_it);
80+
/**
81+
* @brief Analyzes full MemoryAccess op: if the op has Buffer ops on I/O, the op can read and write from/to the same memory.
82+
* @param expr expression with full MemoryAccess op
83+
*/
84+
void parse_memory_access_op(const ExpressionPtr& expr);
85+
/**
86+
* @brief Gets input buffers of Loop
87+
* @param loop_expr expression with LoopEnd op
88+
* @return unordered map [Expression -> set of input ports] which represents input Buffers of Loop
89+
*/
90+
BufferPorts get_input_buffers(const ExpressionPtr& loop_expr) const;
91+
/**
92+
* @brief Gets output buffers of Loop
93+
* @param loop_expr expression with LoopEnd op
94+
* @return unordered map [Expression -> set of output ports] which represents output Buffers of Loop
95+
*/
96+
BufferPorts get_output_buffers(const ExpressionPtr& loop_expr) const;
97+
/**
98+
* @brief Analyzes nested Loops: unite nested buffer clusters if they can reproduce `window` sliding
99+
* @param input_buffers unordered map [Expression -> set of input ports] which represents input Buffers of Loop
100+
* @param output_buffers unordered map [Expression -> set of output ports (one)] which represents output Buffers of Loop
101+
* @param outer_loop_end_expr_it iterator of Linear IR which refers to the expression with outer LoopEnd
102+
*/
103+
void parse_nested_loops(const BufferPorts& input_buffers, const BufferPorts& output_buffers, const LinearIR::constExprIt& outer_loop_end_expr_it);
104+
/**
105+
* @brief Finds the last connected Loop to the target Buffer and returns the corresponding finalization offset
106+
* @param buffer_expr expression with Buffer op
107+
* @return finalization offset - int64_t value
108+
*/
109+
int64_t get_buffer_finalization_offset(const ExpressionPtr& buffer_expr) const;
110+
/**
111+
* @brief Check if two Buffer expressions are connected to the same Loop. Set common LoopEnd as `loop` parameter and
112+
* indexes of Loop ports `up_idx` and `down_idx` if Buffers are really neighbours
113+
* @param up expression with upper Buffer op
114+
* @param down expression with lower Buffer op
115+
* @param loop expression with common LoopEnd op
116+
* @param up_idx the reference to port index of upper Buffer op to the Loop
117+
* @param down_idx the reference to port index of lower Buffer op to the Loop
118+
* @return Return True if the Buffers are connected to the same Loop
119+
*/
120+
static bool are_buffer_neighbours(const ExpressionPtr& up, const ExpressionPtr& down, ExpressionPtr& loop, size_t& up_idx, size_t& down_idx);
121+
/**
122+
* @brief Unite clusters
123+
* @param inner_cluster_it iterator to inner cluster - buffer cluster is in the loop
124+
* @param outer_cluster buffer cluster with Buffers outside the Loop
125+
* @param outer_buffer target Buffer from outer_cluster
126+
* @param is_outer_up true if the outer Buffer is located above the inner Buffers in the Linear IR
127+
* @return Return True if clusters have been united
128+
*/
129+
bool unite_nested_clusters(const AllocateBuffers::BufferClusters::iterator& inner_cluster_it, AllocateBuffers::BufferCluster& outer_cluster,
130+
const ExpressionPtr& outer_buffer, bool is_outer_up);
131+
132+
AllocateBuffers::BufferClusters& m_clusters;
133+
};
134+
135+
} // namespace pass
136+
} // namespace lowered
137+
} // namespace snippets
138+
} // namespace ov
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "pass.hpp"
8+
9+
namespace ov {
10+
namespace snippets {
11+
namespace lowered {
12+
namespace pass {
13+
14+
/**
15+
* @interface EnumerateExpressions
16+
* @brief The pass enumerates expression by execution order
17+
* @ingroup snippets
18+
*/
19+
class EnumerateExpressions : public Pass {
20+
public:
21+
OPENVINO_RTTI("EnumerateExpressions", "Pass")
22+
bool run(LinearIR& linear_ir) override;
23+
};
24+
25+
} // namespace pass
26+
} // namespace lowered
27+
} // namespace snippets
28+
} // namespace ov

src/common/snippets/include/snippets/lowered/pass/identify_buffers.hpp

+71-6
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@
66

77
#include "pass.hpp"
88

9-
#include "snippets/op/buffer.hpp"
10-
119
namespace ov {
1210
namespace snippets {
1311
namespace lowered {
@@ -22,7 +20,8 @@ namespace pass {
2220
* - Loops, Brgemm (the same other ops) - are "edges" between Buffers (hub of edges).
2321
* The buffers are connected to the same Loop - are adjacent in graph sense bounds.
2422
* - The vertices (buffers) are adjacent if they are connected to the same Loop and
25-
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes;
23+
* their data pointers cannot be proportionally incremented in Loops: different ptr increments or data sizes -
24+
* or one of the Buffers is inside some Loop but the other Buffer is not;
2625
* - Firstly, create adjacency matrix using the definition above;
2726
* - Secondly, assign the same color to non-adjacent vertices of graph (buffers), and use different colors otherwise.
2827
* Note: should be called before ResetBuffer() pass to have correct offsets
@@ -33,13 +32,79 @@ class IdentifyBuffers: public Pass {
3332
OPENVINO_RTTI("IdentifyBuffers", "Pass")
3433
IdentifyBuffers() = default;
3534

35+
/**
36+
* @brief Apply the pass to the Linear IR
37+
* @param linear_ir the target Linear IR
38+
* @return status of the pass
39+
*/
3640
bool run(LinearIR& linear_ir) override;
3741

42+
struct ShiftPtrParams {
43+
ShiftPtrParams() = default;
44+
ShiftPtrParams(int64_t ds, int64_t pi, int64_t fo) : data_size(ds), ptr_increment(pi), finalization_offset(fo) {}
45+
int64_t data_size = 0;
46+
int64_t ptr_increment = 0;
47+
int64_t finalization_offset = 0;
48+
49+
friend bool operator==(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
50+
friend bool operator!=(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
51+
};
52+
53+
/**
54+
* @brief Check if two Buffers can reuse ID by ShiftPtrParams < data_size, ptr_increment, finalization_offset >
55+
* @param lhs Data pointer shift params for first Buffer
56+
* @param rhs Data pointer shift params for second Buffer
57+
* @return Returns True if params are valid for reusing. Otherwise returns False
58+
*/
59+
static bool can_reuse_id(const ShiftPtrParams& lhs, const ShiftPtrParams& rhs);
60+
3861
private:
39-
using BufferSet = std::vector<std::shared_ptr<op::Buffer>>;
62+
using BufferPool = std::vector<ExpressionPtr>;
4063

41-
std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferSet& buffers) const;
42-
std::map<size_t, BufferSet> coloring(BufferSet& buffers, std::vector<bool>& adj);
64+
/**
65+
* @brief Get Buffer Index in Buffer set
66+
* @param target the target Buffer expression
67+
* @param pool set of Buffers from the Linear IR
68+
* @return index of target Buffer expression in set
69+
*/
70+
static size_t get_buffer_idx(const ExpressionPtr& target, const BufferPool& pool);
71+
/**
72+
* @brief Create adjacency matrix for Buffer system. See comment in the method for more details.
73+
* @param linear_ir the target Linear IR
74+
* @param pool set of Buffers from the Linear IR
75+
* @return adjacency matrix where True value means that Buffers are adjacent and cannot have the same ID
76+
*/
77+
static std::vector<bool> create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool);
78+
/**
79+
* @brief Algorithm of Graph coloring where vertices are Buffers
80+
* @param buffers set of Buffers from the Linear IR
81+
* @param adj adjacency matrix
82+
* @return map [color id -> Buffer set]
83+
*/
84+
static std::map<size_t, BufferPool> coloring(BufferPool& buffers, std::vector<bool>& adj);
85+
/**
86+
* @brief Update the adjacency matrix:
87+
* - If Buffers are from the same Loops and connected to the same Loop and
88+
* their ptr shift params for this Loop are not proportional, the Buffers are adjacent - set value True in the matrix;
89+
* - If one of the Buffers is inside a Loop while the other Buffer is connected to this Loop and has non-zero data shift params,
90+
* the Buffers are adjacent - set value True in the matrix;
91+
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
92+
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
93+
* @param buffers set of Buffers from the Linear IR
94+
* @param adj Target adjacency matrix
95+
*/
96+
static void update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
97+
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs,
98+
const BufferPool& buffers,
99+
std::vector<bool>& adj);
100+
/**
101+
* @brief Check if two Buffers are adjacent and cannot have the same ID
102+
* @param lhs Pair where first value is Expression with first Buffer and second value is data pointer shift params for it
103+
* @param rhs Pair where first value is Expression with second Buffer and second value is data pointer shift params for it
104+
* @return Returns True if they are adjacent, otherwise returns False
105+
*/
106+
static bool are_adjacent(const std::pair<ExpressionPtr, ShiftPtrParams>& lhs,
107+
const std::pair<ExpressionPtr, ShiftPtrParams>& rhs);
43108
};
44109

45110
} // namespace pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (C) 2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "pass.hpp"
8+
9+
namespace ov {
10+
namespace snippets {
11+
namespace lowered {
12+
namespace pass {
13+
14+
/**
15+
* @interface InitBuffersDefault
16+
* @brief The pass initializes Buffer expressions in LinearIR in the default (non-optimized) mode: sets unique offsets and IDs to Buffers.
17+
* @ingroup snippets
18+
*/
19+
20+
class InitBuffersDefault : public Pass {
21+
public:
22+
OPENVINO_RTTI("InitBuffersDefault", "Pass")
23+
24+
InitBuffersDefault(size_t& buffer_scratchpad_size) : m_buffer_scratchpad_size(buffer_scratchpad_size) {
25+
m_buffer_scratchpad_size = 0;
26+
}
27+
/**
28+
* @brief Apply the pass to the Linear IR
29+
* @param linear_ir the target Linear IR
30+
* @return status of the pass
31+
*/
32+
bool run(lowered::LinearIR& linear_ir) override;
33+
34+
private:
35+
size_t& m_buffer_scratchpad_size;
36+
};
37+
38+
} // namespace pass
39+
} // namespace lowered
40+
} // namespace snippets
41+
} // namespace ov

src/common/snippets/include/snippets/lowered/pass/init_loops.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ namespace pass {
1515

1616
/**
1717
* @interface InitLoops
18-
* @brief The pass initialize scheduling information in LoopInfo
18+
* @brief The pass initializes scheduling information in LoopInfo
1919
* @ingroup snippets
2020
*/
2121
class InitLoops : public Pass {

0 commit comments

Comments
 (0)