Skip to content

Parallel tc to ttgt #100

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions frontends/numpy-scipy/cometpy/MLIRGen/lowering.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ def translate_and_exec_llvm_with_jit(llvm_in,scf_lower_flags, func_name, inputs,
# 2. Call mlir-translate to convert llvm to llvmir
# 3. Call clang to generate library
# p = subprocess.run(to_llvm_command, input=llvm_in.encode('utf-8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
print(llvm_in)
p = subprocess.run(to_llvm_command +' 2>&1 | '+ translate_mlir_command +' | ' + gcc_command , input=llvm_in.encode('utf-8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
if(p.returncode != 0):
cleanup()
Expand Down Expand Up @@ -561,8 +562,8 @@ def lower_dialect_with_jit(ta_dialect_rep, target: str, out_dims, compile_with_f
mlir_lower_flags += "--opt-fusion"
compile_with_flags = compile_with_flags.replace("--opt-fusion","")
compile_with_flags = compile_with_flags.replace("--opt-comp-workspace","")
if "-opt-matmul-tiling" not in compile_with_flags:
mlir_lower_flags += " --convert-to-loops "
# if "-opt-matmul-tiling" not in compile_with_flags:
mlir_lower_flags += " --convert-to-loops "
mlir_lower_flags =" "+compile_with_flags + mlir_lower_flags
else:
mlir_lower_flags = " --convert-ta-to-it --convert-to-loops "
Expand Down
54 changes: 26 additions & 28 deletions lib/Dialect/TensorAlgebra/Transforms/LinalgTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,27 @@

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/SmallVector.h"
#include <optional>

// suppress all warnings coming from inclusion of blis.h in source tree
#ifdef __clang__
Expand Down Expand Up @@ -299,7 +305,7 @@ namespace
get_level3_blocksizes(&mc, &kc, &nc, &mr, &nr, sizeof(double));

addPatternForTiling(ctx, tilingPatterns, "__with_tiling__", "__L2__with_tiling__", {mc, nc, kc}, false, {1, 2, 0});
addPatternForTiling(ctx, tilingPatterns, "__L2__with_tiling__", "__micro_kernel__", {mr, nr, kc}, false, {1, 0, 2});
addPatternForTiling(ctx, tilingPatterns, "__L2__with_tiling__", "__micro_kernel__", {mr, nr, kc}, true, {1, 0, 2});

if (failed(applyPatternsAndFoldGreedily(getOperation(),
std::move(tilingPatterns))))
Expand Down Expand Up @@ -739,40 +745,32 @@ struct OptDenseTranspose : public ConversionPattern
currentOrder.push_back(i);
}

SmallVector<mlir::Value, 6> in_ivs;
SmallVector<mlir::Value, 6> out_ivs;
SmallVector<OpFoldResult, 6> in_ivs;
SmallVector<OpFoldResult, 6> out_ivs;
in_ivs.resize(optimalOrder.size());
out_ivs.resize(outputIndices[0].size());
mlir::Value carried_val = output;
SmallVector<AffineForOp, 6> loops;

OpFoldResult one = rewriter.createOrFold<ConstantIndexOp>(loc, 1);
SmallVector<OpFoldResult, 4> ubs;
for (unsigned i = 0; i < optimalOrder.size(); i++)
{
int64_t upperBound = inputType.getDimSize(optimalOrder[i]);
if (upperBound == ShapedType::kDynamic)
{
assert(false && "TODO: This dimension is a dynamic size");
}

/// create for loops
auto loop = rewriter.create<AffineForOp>(loc, 0, upperBound, 1, carried_val);
loops.push_back(loop);
rewriter.setInsertionPointToStart(loop.getBody());
in_ivs[optimalOrder[i]] = loop.getInductionVar();
out_ivs[optimalOrder[outputIndices[0][i]]] = loop.getInductionVar();
carried_val = loop.getRegionIterArgs().front();
Value upperBound = rewriter.create<tensor::DimOp>(loc, input, optimalOrder[i]);
ubs.push_back(upperBound);
}

auto load_rhs = rewriter.create<tensor::ExtractOp>(loc, input, in_ivs);
auto store_lhs = rewriter.create<tensor::InsertOp>(loc, load_rhs, carried_val, out_ivs);
rewriter.create<AffineYieldOp>(loc, store_lhs.getResult());
for(int64_t i = loops.size()-2; i>=0 ; i--)

SmallVector<OpFoldResult, 4> ones(optimalOrder.size(), one);
auto forAll = rewriter.create<scf::ForallOp>(loc, ubs, output, std::nullopt);
rewriter.setInsertionPointToStart(forAll.getBody());
auto ivs = forAll.getLoopInductionVars();
for(size_t i = 0; i< forAll.getLoopInductionVars()->size(); i++)
{
rewriter.setInsertionPointToEnd(loops[i].getBody());
rewriter.create<AffineYieldOp>(loc, loops[i+1].getResult(0));
in_ivs[optimalOrder[i]] = forAll.getLoopInductionVars()->data()[i];
out_ivs[optimalOrder[outputIndices[0][i]]] = forAll.getLoopInductionVars()->data()[i];
}
auto extracts = rewriter.create<tensor::ExtractSliceOp>(loc, input, in_ivs, ones, ones);
rewriter.setInsertionPointToEnd(forAll.getTerminator().getBody());
rewriter.create<tensor::ParallelInsertSliceOp>(loc, extracts, forAll.getRegionIterArgs().front(), out_ivs, ones, ones);

rewriter.replaceAllUsesWith(op->getResult(0), loops[0].getResult(0));
rewriter.replaceAllUsesWith(op->getResult(0), forAll->getResult(0));
rewriter.eraseOp(op);

//module.dump();
Expand All @@ -796,7 +794,7 @@ namespace
comet_debug() << "OptDenseTransposePass : public PassWrapper<OptDenseTransposePass, FunctionPass>\n";
func::FuncOp func = getOperation();
ConversionTarget target(getContext());
target.addLegalDialect<TADialect, ArithDialect, AffineDialect, tensor::TensorDialect>();
target.addLegalDialect<TADialect, ArithDialect, scf::SCFDialect, AffineDialect, tensor::TensorDialect>();
RewritePatternSet patterns(&getContext());
patterns.insert<OptDenseTranspose>(&getContext(), tile_size, seperate_tiles);

Expand Down
2 changes: 1 addition & 1 deletion test/integration/opts/ccsd_t1_21_ttgt_all_opts.ta
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RUN: comet-opt -opt-matmul-tiling -opt-matmul-mkernel -opt-dense-transpose --convert-tc-to-ttgt --convert-to-llvm %s &> ccsd_t1_21_ttgt_all.llvm
# RUN: mlir-cpu-runner ccsd_t1_21_ttgt_all.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
# RUN: mlir-cpu-runner ccsd_t1_21_ttgt_all.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext,%mlir_utility_library_dir/libomp%shlibext | FileCheck %s

def main() {
#IndexLabel Declarations
Expand Down
8 changes: 4 additions & 4 deletions test/integration/opts/ccsd_t1_21_ttgt_tiling.ta
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# RUN: comet-opt --opt-matmul-tiling --convert-tc-to-ttgt --convert-to-llvm %s &> ccsd_t1_21_ttgt_tiling.llvm
# RUN: mlir-cpu-runner ccsd_t1_21_ttgt_tiling.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
# RUN: mlir-cpu-runner ccsd_t1_21_ttgt_tiling.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext,%mlir_utility_library_dir/libomp%shlibext | FileCheck %s

def main() {
#IndexLabel Declarations
IndexLabel [i, c] = [2];
IndexLabel [m, n, a] = [4];
IndexLabel [i, c] = [16];
IndexLabel [m, n, a] = [32];

Tensor<double> v([i, c, m, n], {Dense});
Tensor<double> t2([m, n, c, a], {Dense});
Expand All @@ -21,4 +21,4 @@ def main() {

# Print the result for verification.
# CHECK: data =
# CHECK-NEXT: 250.24,250.24,250.24,250.24,250.24,250.24,250.24,250.24,
# CHECK-NEXT: 128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,128123,
2 changes: 1 addition & 1 deletion test/integration/opts/ccsd_t1_4_ttgt_bestperm.ta
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RUN: comet-opt --convert-tc-to-ttgt --convert-to-llvm %s &> ccsd_t1_4_ttgt_bestperm.llvm
# RUN: mlir-cpu-runner ccsd_t1_4_ttgt_bestperm.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
# RUN: mlir-cpu-runner ccsd_t1_4_ttgt_bestperm.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext,%mlir_utility_library_dir/libomp%shlibext | FileCheck %s

def main() {
#IndexLabel Declarations
Expand Down
2 changes: 1 addition & 1 deletion test/integration/opts/opt_dense_transpose.ta
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RUN: comet-opt -opt-dense-transpose --convert-ta-to-it --convert-to-loops --convert-to-llvm %s &> opt_dense_transpose.llvm
# RUN: mlir-cpu-runner opt_dense_transpose.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext | FileCheck %s
# RUN: mlir-cpu-runner opt_dense_transpose.llvm -O3 -e main -entry-point-result=void -shared-libs=%comet_utility_library_dir/libcomet_runner_utils%shlibext,%mlir_utility_library_dir/libomp%shlibext | FileCheck %s

#TODO(gkestor): read dense input from file

Expand Down
Loading