From 83cfad792cdec7468a3ac74e78d6eaaf50a0c594 Mon Sep 17 00:00:00 2001
From: Marco Feder <marco.feder@sissa.it>
Date: Mon, 7 Oct 2024 23:33:01 +0200
Subject: [PATCH 1/3] Expand agglomerated multigrid to simplex meshes

---
 examples/simplex_agglomerated_multigrid.cc | 675 +++++++++++++++++++++
 include/poly_utils.h                       | 612 ++++++++++++++++++-
 include/utils.h                            |  12 +-
 source/multigrid_amg.cc                    |  24 +-
 4 files changed, 1311 insertions(+), 12 deletions(-)
 create mode 100644 examples/simplex_agglomerated_multigrid.cc
diff --git a/examples/simplex_agglomerated_multigrid.cc b/examples/simplex_agglomerated_multigrid.cc
new file mode 100644
index 00000000..3f9795f1
--- /dev/null
+++ b/examples/simplex_agglomerated_multigrid.cc
@@ -0,0 +1,675 @@
+// -----------------------------------------------------------------------------
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception OR LGPL-2.1-or-later
+// Copyright ( ) XXXX - YYYY by the deal.II authors
+//
+// This file is part of the deal.II library.
+//
+// Detailed license information governing the source code and contributions
+// can be found in LICENSE.md and CONTRIBUTING.md at the top level directory.
+//
+// -----------------------------------------------------------------------------
+
+
+// Agglomerated multigrid with simplex meshes.
+
+#include <deal.II/base/conditional_ostream.h>
+#include <deal.II/base/function_lib.h>
+#include <deal.II/base/mpi.h>
+#include <deal.II/base/utilities.h>
+
+#include <deal.II/distributed/fully_distributed_tria.h>
+#include <deal.II/distributed/tria.h>
+
+#include <deal.II/fe/mapping_fe.h>
+
+#include <deal.II/grid/grid_generator.h>
+#include <deal.II/grid/grid_in.h>
+
+#include <deal.II/lac/la_parallel_vector.h>
+#include <deal.II/lac/linear_operator_tools.h>
+#include <deal.II/lac/precondition.h>
+#include <deal.II/lac/sparse_direct.h>
+#include <deal.II/lac/trilinos_precondition.h>
+#include <deal.II/lac/trilinos_solver.h>
+#include <deal.II/lac/trilinos_sparse_matrix.h>
+#include <deal.II/lac/trilinos_vector.h>
+
+#include <deal.II/multigrid/mg_coarse.h>
+#include <deal.II/multigrid/mg_matrix.h>
+#include <deal.II/multigrid/mg_smoother.h>
+#include <deal.II/multigrid/mg_tools.h>
+#include <deal.II/multigrid/multigrid.h>
+
+#include <deal.II/numerics/data_out.h>
+#include <deal.II/numerics/vector_tools_integrate_difference.templates.h>
+#include <deal.II/numerics/vector_tools_interpolate.h>
+
+#include <agglomeration_handler.h>
+#include <agglomerator.h>
+#include <multigrid_amg.h>
+#include <poly_utils.h>
+#include <utils.h>
+
+using namespace dealii;
+
+static constexpr unsigned int degree_finite_element = 1;
+static constexpr unsigned int n_components          = 1;
+static constexpr bool         CHECK_AMG             = true;
+
+#define GEOMETRIC_APPROACH true
+
+enum class GridType
+{
+  grid_generator, // hyper_cube or hyper_ball
+  unstructured    // square generated with gmsh, unstructured
+};
+
+
+
+template <typename VectorType, typename MatrixType, typename SolverType>
+class MGCoarseDirect : public MGCoarseGridBase<VectorType>
+{
+public:
+  // using  = typename
+  // LinearAlgebra::distributed::Vector<MatrixType::value_type>;
+  MGCoarseDirect(const MatrixType &matrix)
+  {
+    coarse_matrix = &matrix;
+    direct_solver.initialize(*coarse_matrix);
+  }
+
+  void
+  initialize(const MatrixType &matrix)
+  {}
+
+  virtual void
+  operator()(const unsigned int, VectorType &dst, const VectorType &src) const
+  {
+    if constexpr (std::is_same_v<SolverType, SparseDirectUMFPACK>)
+      direct_solver.vmult(dst, src);
+    else if constexpr (std::is_same_v<SolverType,
+                                      TrilinosWrappers::SolverDirect>)
+      const_cast<SolverType *>(&direct_solver)->solve(*coarse_matrix, dst, src);
+    else
+      AssertThrow(false, ExcNotImplemented());
+  }
+
+  SolverType        direct_solver;
+  const MatrixType *coarse_matrix;
+};
+
+
+
+/**
+ * The actual class aimed at solving the differential problem.
+ */
+template <int dim>
+class AgglomeratedMultigridSimplex
+{
+public:
+  AgglomeratedMultigridSimplex(const GridType    &grid_type,
+                               const unsigned int degree,
+                               const unsigned int starting_level,
+                               const MPI_Comm     comm);
+  void
+  run();
+
+private:
+  void
+  make_fine_grid(const unsigned int);
+  void
+  agglomerate_and_compute_level_matrices();
+
+  const MPI_Comm                                 comm;
+  const unsigned int                             n_ranks;
+  FE_SimplexDGP<dim>                             fe_dg;
+  const GridType                                &grid_type;
+  parallel::fullydistributed::Triangulation<dim> tria_pft;
+  DoFHandler<dim>                                dof_handler;
+  ConditionalOStream                             pcout;
+  unsigned int                                   starting_level;
+  const MappingFE<dim>                           mapping;
+
+
+
+  std::vector<std::unique_ptr<AgglomerationHandler<dim>>>
+    agglomeration_handlers;
+
+  std::vector<TrilinosWrappers::SparseMatrix> injection_matrices_two_level;
+  std::unique_ptr<AgglomerationHandler<dim>>  agglomeration_handler_coarse;
+};
+
+
+
+template <int dim>
+AgglomeratedMultigridSimplex<dim>::AgglomeratedMultigridSimplex(
+  const GridType    &grid_type_,
+  const unsigned int degree,
+  const unsigned int starting_tree_level,
+  const MPI_Comm     communicator)
+  : comm(communicator)
+  , n_ranks(Utilities::MPI::n_mpi_processes(comm))
+  , fe_dg(degree)
+  , grid_type(grid_type_)
+  , tria_pft(comm)
+  , dof_handler(tria_pft)
+  , pcout(std::cout, (Utilities::MPI::this_mpi_process(comm) == 0))
+  , mapping(FE_SimplexP<dim>{1})
+{
+  pcout << "Running with " << n_ranks << " MPI ranks." << std::endl;
+  pcout << "Grid type:";
+  starting_level = starting_tree_level;
+  grid_type == GridType::grid_generator ?
+    pcout << " Structured mesh" << std::endl :
+    pcout << " Unstructured mesh" << std::endl;
+}
+
+
+
+template <int dim>
+void
+AgglomeratedMultigridSimplex<dim>::make_fine_grid(
+  const unsigned int n_global_refinements)
+{
+  Triangulation<dim> tria;
+
+  if (grid_type == GridType::unstructured)
+    {
+      GridIn<dim> grid_in;
+      grid_in.attach_triangulation(tria);
+
+      if constexpr (dim == 2)
+        {
+          std::ifstream gmsh_file(
+            "../../meshes/square_simplex_coarser.msh"); // unstructured square
+                                                        // made by triangles
+          grid_in.read_msh(gmsh_file);
+          tria.refine_global(n_global_refinements + 2);
+        }
+      else
+        {
+          // To decide
+          DEAL_II_NOT_IMPLEMENTED();
+        }
+    }
+  else if (grid_type == GridType::grid_generator)
+    {
+      Triangulation<dim> tria_hex;
+      GridGenerator::hyper_cube(tria_hex, 0., 1.);
+      tria_hex.refine_global(n_global_refinements);
+      GridGenerator::convert_hypercube_to_simplex_mesh(tria_hex, tria);
+    }
+  else
+    {
+      Assert(false, ExcInternalError());
+    }
+
+  pcout << "Total number of fine cells: " << tria.n_active_cells() << std::endl;
+
+  // Partition serial triangulation:
+  GridTools::partition_triangulation(n_ranks, tria);
+
+  // Create building blocks:
+  const TriangulationDescription::Description<dim, dim> description =
+    TriangulationDescription::Utilities::create_description_from_triangulation(
+      tria, comm);
+
+  tria_pft.create_triangulation(description);
+}
+
+
+
+template <int dim>
+void
+AgglomeratedMultigridSimplex<dim>::agglomerate_and_compute_level_matrices()
+{
+  using VectorType = TrilinosWrappers::MPI::Vector;
+  GridTools::Cache<dim> cached_tria(tria_pft, mapping);
+
+  // Define matrix free operator
+  AffineConstraints constraints;
+  constraints.close();
+  dof_handler.distribute_dofs(fe_dg);
+
+  // Start building R-tree
+  namespace bgi = boost::geometry::index;
+  static constexpr unsigned int max_elem_per_node =
+    PolyUtils::constexpr_pow(2, dim); // 2^dim
+  std::vector<std::pair<BoundingBox<dim>,
+                        typename Triangulation<dim>::active_cell_iterator>>
+               boxes(tria_pft.n_locally_owned_active_cells());
+  unsigned int i = 0;
+  for (const auto &cell : tria_pft.active_cell_iterators())
+    if (cell->is_locally_owned())
+      boxes[i++] = std::make_pair(mapping.get_bounding_box(cell), cell);
+
+  auto tree = pack_rtree<bgi::rstar<max_elem_per_node>>(boxes);
+  Assert(n_levels(tree) >= 2, ExcMessage("At least two levels are needed."));
+  pcout << "Total number of available levels: " << n_levels(tree) << std::endl;
+
+  agglomeration_handler_coarse =
+    std::make_unique<AgglomerationHandler<dim>>(cached_tria);
+
+
+  pcout << "Starting level: " << starting_level << std::endl;
+  const unsigned int total_tree_levels = n_levels(tree) - starting_level + 1;
+
+  agglomeration_handlers.resize(total_tree_levels);
+
+
+  // Loop through the available levels and set AgglomerationHandlers up.
+  for (unsigned int extraction_level = starting_level;
+       extraction_level <= n_levels(tree);
+       ++extraction_level)
+    {
+      agglomeration_handlers[extraction_level - starting_level] =
+        std::make_unique<AgglomerationHandler<dim>>(cached_tria);
+      CellsAgglomerator<dim, decltype(tree)> agglomerator{tree,
+                                                          extraction_level};
+      const auto agglomerates = agglomerator.extract_agglomerates();
+      agglomeration_handlers[extraction_level - starting_level]
+        ->connect_hierarchy(agglomerator);
+
+      // Flag elements for agglomeration
+      unsigned int agglo_index = 0;
+      for (unsigned int i = 0; i < agglomerates.size(); ++i)
+        {
+          const auto &agglo = agglomerates[i]; // i-th agglomerate
+          for (const auto &el : agglo)
+            {
+              el->set_material_id(agglo_index);
+            }
+          ++agglo_index;
+        }
+
+      const unsigned int n_local_agglomerates = agglo_index;
+      unsigned int       total_agglomerates =
+        Utilities::MPI::sum(n_local_agglomerates, comm);
+      pcout << "Total agglomerates per (tree) level: " << extraction_level
+            << ": " << total_agglomerates << std::endl;
+
+      // Now, perform agglomeration within each locally owned partition
+      std::vector<
+        std::vector<typename Triangulation<dim>::active_cell_iterator>>
+        cells_per_subdomain(n_local_agglomerates);
+      for (const auto &cell : tria_pft.active_cell_iterators())
+        if (cell->is_locally_owned())
+          cells_per_subdomain[cell->material_id()].push_back(cell);
+
+      // For every subdomain, agglomerate elements together
+      for (std::size_t i = 0; i < cells_per_subdomain.size(); ++i)
+        agglomeration_handlers[extraction_level - starting_level]
+          ->define_agglomerate(cells_per_subdomain[i]);
+
+      agglomeration_handlers[extraction_level - starting_level]
+        ->initialize_fe_values(QGaussSimplex<dim>(degree_finite_element + 1),
+                               update_values | update_gradients |
+                                 update_JxW_values | update_quadrature_points,
+                               QGaussSimplex<dim - 1>(degree_finite_element +
+                                                      1),
+                               update_JxW_values);
+      agglomeration_handlers[extraction_level - starting_level]
+        ->distribute_agglomerated_dofs(fe_dg);
+    }
+
+  // Compute two-level transfers between agglomeration handlers
+  pcout << "Fill injection matrices between agglomerated levels" << std::endl;
+  injection_matrices_two_level.resize(total_tree_levels);
+  pcout << "Number of injection matrices: "
+        << injection_matrices_two_level.size() << std::endl;
+  for (unsigned int l = 1; l < total_tree_levels; ++l)
+    {
+      pcout << "from level " << l - 1 << " to level " << l << std::endl;
+      SparsityPattern sparsity;
+      Utils::fill_injection_matrix(*agglomeration_handlers[l - 1],
+                                   *agglomeration_handlers[l],
+                                   sparsity,
+                                   injection_matrices_two_level[l - 1]);
+    }
+  pcout << "Computed two-level matrices between agglomerated levels"
+        << std::endl;
+
+
+  // Define transfer between levels.
+  std::vector<TrilinosWrappers::SparseMatrix *> transfer_matrices(
+    total_tree_levels);
+  for (unsigned int l = 0; l < total_tree_levels - 1; ++l)
+    transfer_matrices[l] = &injection_matrices_two_level[l];
+
+
+  // Last matrix, fill it by hand
+  // add last two-level (which is an embedding)
+  PolyUtils::fill_interpolation_matrix(*agglomeration_handlers.back(),
+                                       injection_matrices_two_level.back());
+  transfer_matrices[total_tree_levels - 1] =
+    &injection_matrices_two_level.back();
+
+  pcout << injection_matrices_two_level.back().m() << " and "
+        << injection_matrices_two_level.back().n() << std::endl;
+
+
+  TrilinosWrappers::MPI::Vector system_rhs;
+  const IndexSet &locally_owned_dofs = dof_handler.locally_owned_dofs();
+  system_rhs.reinit(locally_owned_dofs, comm);
+
+
+  MGLevelObject<std::unique_ptr<TrilinosWrappers::SparseMatrix>>
+    multigrid_matrices(0, total_tree_levels);
+
+  multigrid_matrices[multigrid_matrices.max_level()] =
+    std::make_unique<TrilinosWrappers::SparseMatrix>();
+
+  PolyUtils::assemble_dg_matrix_on_standard_mesh(
+    *multigrid_matrices[multigrid_matrices.max_level()],
+    system_rhs,
+    mapping,
+    fe_dg,
+    dof_handler);
+  pcout << "Built finest operator" << std::endl;
+
+#ifdef ALGEBRAIC_APPROACH
+  AmgProjector<dim, TrilinosWrappers::SparseMatrix, double> amg_projector(
+    injection_matrices_two_level);
+  pcout << "Initialized projector" << std::endl;
+  amg_projector.compute_level_matrices(multigrid_matrices);
+  pcout << "Projected using transfer_matrices:" << std::endl;
+
+#elif GEOMETRIC_APPROACH
+  const unsigned int max_level = multigrid_matrices.max_level();
+  // assemble level matrices on each level
+  for (unsigned int level = max_level;
+       level-- > multigrid_matrices.min_level();)
+    {
+      multigrid_matrices[level] =
+        std::make_unique<TrilinosWrappers::SparseMatrix>();
+      pcout << "Assemble matrix on level " << level << std::endl;
+      PolyUtils::assemble_dg_matrix(*multigrid_matrices[level],
+                                    fe_dg,
+                                    *agglomeration_handlers[level]);
+    }
+
+
+
+#endif
+
+  pcout << "Check dimensions of level operators" << std::endl;
+  for (unsigned int l = 0; l < total_tree_levels + 1; ++l)
+    pcout << "Number of rows and cols operator " << l << ":("
+          << multigrid_matrices[l]->m() << "," << multigrid_matrices[l]->n()
+          << ")" << std::endl;
+
+
+  // Setup multigrid
+
+
+  // Multigrid matrices
+  using LevelMatrixType = TrilinosWrappers::SparseMatrix;
+  mg::Matrix<VectorType> mg_matrix(multigrid_matrices);
+
+  using SmootherType = PreconditionChebyshev<LevelMatrixType, VectorType>;
+  mg::SmootherRelaxation<SmootherType, VectorType>     mg_smoother;
+  MGLevelObject<typename SmootherType::AdditionalData> smoother_data;
+  smoother_data.resize(0, total_tree_levels + 1);
+
+  // Fine level
+  std::vector<TrilinosWrappers::MPI::Vector> diag_inverses(total_tree_levels +
+                                                           1);
+  diag_inverses[total_tree_levels].reinit(dof_handler.locally_owned_dofs(),
+                                          comm);
+
+  // Set exact diagonal for each operator
+  for (unsigned int i =
+         multigrid_matrices[total_tree_levels]->local_range().first;
+       i < multigrid_matrices[total_tree_levels]->local_range().second;
+       ++i)
+    diag_inverses[total_tree_levels][i] =
+      1. / multigrid_matrices[total_tree_levels]->diag_element(i);
+
+  smoother_data[total_tree_levels].preconditioner =
+    std::make_shared<DiagonalMatrix<VectorType>>(
+      diag_inverses[total_tree_levels]);
+  pcout << "Fine smoother data: done" << std::endl;
+
+  pcout << "Start defining smoothers data" << std::endl;
+
+  for (unsigned int l = 0; l < total_tree_levels; ++l)
+    {
+      pcout << "l = " << l << std::endl;
+      diag_inverses[l].reinit(
+        agglomeration_handlers[l]->agglo_dh.locally_owned_dofs(), comm);
+
+      // Set exact diagonal for each operator
+      for (unsigned int i = multigrid_matrices[l]->local_range().first;
+           i < multigrid_matrices[l]->local_range().second;
+           ++i)
+        diag_inverses[l][i] = 1. / multigrid_matrices[l]->diag_element(i);
+
+      smoother_data[l].preconditioner =
+        std::make_shared<DiagonalMatrix<VectorType>>(diag_inverses[l]);
+    }
+
+  pcout << "Smoothers data initialized" << std::endl;
+
+  for (unsigned int level = 0; level < total_tree_levels + 1; ++level)
+    {
+      if (level > 0)
+        {
+          smoother_data[level].smoothing_range     = 20.; // 15.;
+          smoother_data[level].degree              = 5;   // 5;
+          smoother_data[level].eig_cg_n_iterations = 20;
+        }
+      else
+        {
+          smoother_data[0].smoothing_range = 1e-3;
+          smoother_data[0].degree = 3; // numbers::invalid_unsigned_int;
+          smoother_data[0].eig_cg_n_iterations = dof_handler.n_dofs();
+          smoother_data[0].eig_cg_n_iterations = multigrid_matrices[0]->m();
+        }
+    }
+
+  mg_smoother.set_steps(5);
+  mg_smoother.initialize(multigrid_matrices, smoother_data);
+
+  pcout << "Smoothers initialized" << std::endl;
+
+  // Define coarse grid solver
+  const unsigned int min_level = 0;
+  Utils::MGCoarseDirect<VectorType,
+                        TrilinosWrappers::SparseMatrix,
+                        TrilinosWrappers::SolverDirect>
+    mg_coarse(*multigrid_matrices[min_level]);
+
+  // Transfers
+  MGLevelObject<TrilinosWrappers::SparseMatrix *> mg_level_transfers(
+    0, total_tree_levels);
+  for (unsigned int l = 0; l < total_tree_levels; ++l)
+    mg_level_transfers[l] = transfer_matrices[l];
+
+
+  std::vector<DoFHandler<dim> *> dof_handlers(total_tree_levels + 1);
+  for (unsigned int l = 0; l < dof_handlers.size() - 1; ++l)
+    dof_handlers[l] = &agglomeration_handlers[l]->agglo_dh;
+  dof_handlers[dof_handlers.size() - 1] = &dof_handler; // fine
+
+  unsigned int lev = 0;
+  for (const auto &dh : dof_handlers)
+    pcout << "Number of DoFs in level " << lev++ << ": " << dh->n_dofs()
+          << std::endl;
+
+  MGTransferAgglomeration<dim, VectorType> mg_transfer(mg_level_transfers,
+                                                       dof_handlers);
+  pcout << "MG transfers initialized" << std::endl;
+
+  // Define multigrid object and convert to preconditioner.
+  Multigrid<VectorType> mg(mg_matrix,
+                           mg_coarse,
+                           mg_transfer,
+                           mg_smoother,
+                           mg_smoother,
+                           min_level,
+                           numbers::invalid_unsigned_int,
+                           Multigrid<VectorType>::v_cycle);
+
+  PreconditionMG<dim, VectorType, MGTransferAgglomeration<dim, VectorType>>
+    preconditioner(dof_handler, mg, mg_transfer);
+
+
+  // Finally, solve.
+
+  VectorType solution;
+  solution.reinit(system_rhs.locally_owned_elements());
+
+  ReductionControl     solver_control(10000, 1e-12, 1e-9);
+  SolverCG<VectorType> cg(solver_control);
+  double               start, stop;
+  pcout << "Start solver" << std::endl;
+  start = MPI_Wtime();
+  cg.solve(*multigrid_matrices[multigrid_matrices.max_level()],
+           solution,
+           system_rhs,
+           preconditioner);
+  stop = MPI_Wtime();
+  pcout << "Agglo AMG elapsed time: " << stop - start << "[s]" << std::endl;
+
+  pcout << "Initial value: " << solver_control.initial_value() << std::endl;
+  pcout << "Converged in " << solver_control.last_step()
+        << " iterations with value " << solver_control.last_value()
+        << std::endl;
+
+
+
+  [[maybe_unused]] auto output_results = [&]() -> void {
+    pcout << "Output results" << std::endl;
+    DataOut<dim> data_out;
+    data_out.attach_dof_handler(dof_handler);
+    data_out.add_data_vector(solution,
+                             "interpolated_solution",
+                             DataOut<dim>::type_dof_data);
+
+    Vector<float> subdomain(tria_pft.n_active_cells());
+
+    for (unsigned int i = 0; i < subdomain.size(); ++i)
+      subdomain(i) = tria_pft.locally_owned_subdomain();
+
+    data_out.add_data_vector(subdomain, "subdomain");
+
+    Vector<float> agglo_idx(tria_pft.n_active_cells());
+    for (const auto &cell : tria_pft.active_cell_iterators())
+      {
+        if (cell->is_locally_owned())
+          agglo_idx[cell->active_cell_index()] = cell->material_id();
+      }
+    data_out.add_data_vector(agglo_idx,
+                             "agglo_idx",
+                             DataOut<dim>::type_cell_data);
+
+    data_out.build_patches(mapping);
+    const std::string filename =
+      ("agglo_mg_simplex." +
+       Utilities::int_to_string(tria_pft.locally_owned_subdomain(), 4));
+    std::ofstream output((filename + ".vtu").c_str());
+    data_out.write_vtu(output);
+
+    {
+      std::vector<std::string> filenames;
+      for (unsigned int i = 0; i < Utilities::MPI::n_mpi_processes(comm); i++)
+        {
+          filenames.push_back("agglo_mg_simplex." +
+                              Utilities::int_to_string(i, 4) + ".vtu");
+        }
+      std::ofstream master_output("agglo_mg_simplex.pvtu");
+      data_out.write_pvtu_record(master_output, filenames);
+    }
+  };
+
+  if (dof_handler.n_dofs() < 3e6)
+    output_results();
+
+
+
+  if constexpr (CHECK_AMG == true)
+    {
+      if (starting_level == 2)
+        {
+          pcout << "Classical way" << std::endl;
+          TrilinosWrappers::PreconditionAMG                 prec_amg;
+          TrilinosWrappers::PreconditionAMG::AdditionalData amg_data;
+          amg_data.aggregation_threshold = 1e-3;
+          amg_data.smoother_type         = "Chebyshev";
+          amg_data.smoother_sweeps       = 5;
+          amg_data.output_details        = true;
+          if (degree_finite_element > 1)
+            amg_data.higher_order_elements = true;
+          pcout << "Initialized AMG prec matrix" << std::endl;
+          prec_amg.initialize(*multigrid_matrices[max_level], amg_data);
+
+          solution = 0.;
+          SolverCG<VectorType> cg_check(solver_control);
+          double               start_cg, stop_cg;
+          start_cg = MPI_Wtime();
+          cg_check.solve(*multigrid_matrices[max_level],
+                         solution,
+                         system_rhs,
+                         prec_amg); // with id
+          stop_cg = MPI_Wtime();
+
+          pcout << "CG+AMG elapsed time: " << stop_cg - start_cg << "[s]"
+                << std::endl;
+
+          pcout << "Initial value: " << solver_control.initial_value()
+                << std::endl;
+          pcout << "Converged (CG+AMG) in " << solver_control.last_step()
+                << " iterations with value " << solver_control.last_value()
+                << std::endl;
+
+
+          if (dof_handler.n_dofs() < 3e6)
+            {
+              DataOut<dim> data_out;
+              data_out.attach_dof_handler(dof_handler);
+              data_out.add_data_vector(solution,
+                                       "interpolated_solution",
+                                       DataOut<dim>::type_dof_data);
+
+              const std::string filename = "check_multigrid_mf_amg.vtu";
+              std::ofstream     output(filename);
+              data_out.build_patches(mapping);
+              data_out.write_vtu(output);
+            }
+        }
+    }
+}
+
+
+
+template <int dim>
+void
+AgglomeratedMultigridSimplex<dim>::run()
+{
+  make_fine_grid(3);
+  agglomerate_and_compute_level_matrices();
+  pcout << std::endl;
+}
+
+
+
+int
+main(int argc, char *argv[])
+{
+  Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
+  const MPI_Comm                   comm = MPI_COMM_WORLD;
+  static constexpr unsigned int    dim  = 2;
+
+  if (Utilities::MPI::this_mpi_process(comm) == 0)
+    std::cout << "Degree: " << degree_finite_element << std::endl;
+
+  for (unsigned int starting_level = 1; starting_level < 3; ++starting_level)
+    {
+      AgglomeratedMultigridSimplex<dim> problem(GridType::grid_generator,
+                                                degree_finite_element,
+                                                starting_level,
+                                                comm);
+      problem.run();
+    }
+}
\ No newline at end of file
diff --git a/include/poly_utils.h b/include/poly_utils.h
index f14c5869..58f50a87 100644
--- a/include/poly_utils.h
+++ b/include/poly_utils.h
@@ -1178,10 +1178,10 @@ namespace dealii::PolyUtils
 
   /**
    * Construct the interpolation matrix from the DG space defined the
-   * polytopic elements
-   * defined in @p agglomeration_handler to the DG space defined on the DoFHandler associated
-   * to standard shapes. The interpolation matrix is assumed to be
-   * default-constructed and is filled inside this function.
+   * polytopic elements defined in @p agglomeration_handler to the DG space
+   * defined on the @p DoFHandler associated to standard shapes. The
+   * interpolation matrix is assumed to be default-constructed and is filled
+   * inside this function.
    */
   template <int dim, int spacedim, typename MatrixType>
   void
@@ -1204,6 +1204,7 @@ namespace dealii::PolyUtils
 
     DoFHandler<dim, spacedim> *output_dh =
       const_cast<DoFHandler<dim, spacedim> *>(&agglomeration_handler.output_dh);
+    const Mapping<dim, spacedim> &mapping = agglomeration_handler.get_mapping();
     const FiniteElement<dim, spacedim> &fe = agglomeration_handler.get_fe();
     const Triangulation<dim, spacedim> &tria =
       agglomeration_handler.get_triangulation();
@@ -1229,7 +1230,8 @@ namespace dealii::PolyUtils
     std::vector<types::global_dof_index> output_dof_indices(fe.dofs_per_cell);
 
     Quadrature<dim>         quad(fe.get_unit_support_points());
-    FEValues<dim, spacedim> output_fe_values(fe,
+    FEValues<dim, spacedim> output_fe_values(mapping,
+                                             fe,
                                              quad,
                                              update_quadrature_points);
 
@@ -1577,6 +1579,606 @@ namespace dealii::PolyUtils
   }
 
 
+
+  /**
+   * Utility to compute jump terms when the interface is locally owned, i.e.
+   * both elements are locally owned.
+   */
+  template <int dim>
+  void
+  assemble_local_jumps_and_averages(FullMatrix<double>      &M11,
+                                    FullMatrix<double>      &M12,
+                                    FullMatrix<double>      &M21,
+                                    FullMatrix<double>      &M22,
+                                    const FEValuesBase<dim> &fe_faces0,
+                                    const FEValuesBase<dim> &fe_faces1,
+                                    const double             penalty_constant,
+                                    const double             h_f)
+  {
+    const std::vector<Tensor<1, dim>> &normals = fe_faces0.get_normal_vectors();
+    const unsigned int                 dofs_per_cell =
+      M11.m(); // size of local matrices equals the #DoFs
+    for (unsigned int q_index : fe_faces0.quadrature_point_indices())
+      {
+        const Tensor<1, dim> &normal = normals[q_index];
+        for (unsigned int i = 0; i < dofs_per_cell; ++i)
+          {
+            for (unsigned int j = 0; j < dofs_per_cell; ++j)
+              {
+                M11(i, j) += (-0.5 * fe_faces0.shape_grad(i, q_index) * normal *
+                                fe_faces0.shape_value(j, q_index) -
+                              0.5 * fe_faces0.shape_grad(j, q_index) * normal *
+                                fe_faces0.shape_value(i, q_index) +
+                              (penalty_constant / h_f) *
+                                fe_faces0.shape_value(i, q_index) *
+                                fe_faces0.shape_value(j, q_index)) *
+                             fe_faces0.JxW(q_index);
+                M12(i, j) += (0.5 * fe_faces0.shape_grad(i, q_index) * normal *
+                                fe_faces1.shape_value(j, q_index) -
+                              0.5 * fe_faces1.shape_grad(j, q_index) * normal *
+                                fe_faces0.shape_value(i, q_index) -
+                              (penalty_constant / h_f) *
+                                fe_faces0.shape_value(i, q_index) *
+                                fe_faces1.shape_value(j, q_index)) *
+                             fe_faces1.JxW(q_index);
+                M21(i, j) += (-0.5 * fe_faces1.shape_grad(i, q_index) * normal *
+                                fe_faces0.shape_value(j, q_index) +
+                              0.5 * fe_faces0.shape_grad(j, q_index) * normal *
+                                fe_faces1.shape_value(i, q_index) -
+                              (penalty_constant / h_f) *
+                                fe_faces1.shape_value(i, q_index) *
+                                fe_faces0.shape_value(j, q_index)) *
+                             fe_faces1.JxW(q_index);
+                M22(i, j) += (0.5 * fe_faces1.shape_grad(i, q_index) * normal *
+                                fe_faces1.shape_value(j, q_index) +
+                              0.5 * fe_faces1.shape_grad(j, q_index) * normal *
+                                fe_faces1.shape_value(i, q_index) +
+                              (penalty_constant / h_f) *
+                                fe_faces1.shape_value(i, q_index) *
+                                fe_faces1.shape_value(j, q_index)) *
+                             fe_faces1.JxW(q_index);
+              }
+          }
+      }
+  }
+  /**
+   * Same as above, but for a ghosted neighbor.
+   */
+  template <int dim>
+  void
+  assemble_local_jumps_and_averages_ghost(
+    FullMatrix<double>                             &M11,
+    FullMatrix<double>                             &M12,
+    FullMatrix<double>                             &M21,
+    FullMatrix<double>                             &M22,
+    const FEValuesBase<dim>                        &fe_faces0,
+    const std::vector<std::vector<double>>         &recv_values,
+    const std::vector<std::vector<Tensor<1, dim>>> &recv_gradients,
+    const std::vector<double>                      &recv_jxws,
+    const double                                    penalty_constant,
+    const double                                    h_f)
+  {
+    Assert(
+      (recv_values.size() > 0 && recv_gradients.size() && recv_jxws.size()),
+      ExcMessage(
+        "Not possible to assemble jumps and averages at a ghosted interface."));
+    const unsigned int                 dofs_per_cell = M11.m();
+    const std::vector<Tensor<1, dim>> &normals = fe_faces0.get_normal_vectors();
+    for (unsigned int q_index : fe_faces0.quadrature_point_indices())
+      {
+        const Tensor<1, dim> &normal = normals[q_index];
+        for (unsigned int i = 0; i < dofs_per_cell; ++i)
+          {
+            for (unsigned int j = 0; j < dofs_per_cell; ++j)
+              {
+                M11(i, j) += (-0.5 * fe_faces0.shape_grad(i, q_index) * normal *
+                                fe_faces0.shape_value(j, q_index) -
+                              0.5 * fe_faces0.shape_grad(j, q_index) * normal *
+                                fe_faces0.shape_value(i, q_index) +
+                              (penalty_constant / h_f) *
+                                fe_faces0.shape_value(i, q_index) *
+                                fe_faces0.shape_value(j, q_index)) *
+                             fe_faces0.JxW(q_index);
+                M12(i, j) += (0.5 * fe_faces0.shape_grad(i, q_index) * normal *
+                                recv_values[j][q_index] -
+                              0.5 * recv_gradients[j][q_index] * normal *
+                                fe_faces0.shape_value(i, q_index) -
+                              (penalty_constant / h_f) *
+                                fe_faces0.shape_value(i, q_index) *
+                                recv_values[j][q_index]) *
+                             recv_jxws[q_index];
+                M21(i, j) +=
+                  (-0.5 * recv_gradients[i][q_index] * normal *
+                     fe_faces0.shape_value(j, q_index) +
+                   0.5 * fe_faces0.shape_grad(j, q_index) * normal *
+                     recv_values[i][q_index] -
+                   (penalty_constant / h_f) * recv_values[i][q_index] *
+                     fe_faces0.shape_value(j, q_index)) *
+                  recv_jxws[q_index];
+                M22(i, j) +=
+                  (0.5 * recv_gradients[i][q_index] * normal *
+                     recv_values[j][q_index] +
+                   0.5 * recv_gradients[j][q_index] * normal *
+                     recv_values[i][q_index] +
+                   (penalty_constant / h_f) * recv_values[i][q_index] *
+                     recv_values[j][q_index]) *
+                  recv_jxws[q_index];
+              }
+          }
+      }
+  }
+
+
+  /**
+   * Utility function to assemble the SIPDG Laplace matrix.
+   * @note Supported matrix types are Trilinos types and native SparseMatrix
+   * objects provided by deal.II.
+   */
+  template <int dim, typename MatrixType>
+  void
+  assemble_dg_matrix(MatrixType                      &system_matrix,
+                     const FiniteElement<dim>        &fe_dg,
+                     const AgglomerationHandler<dim> &ah)
+  {
+    static_assert(
+      (std::is_same_v<MatrixType, TrilinosWrappers::SparseMatrix> ||
+       std::is_same_v<MatrixType,
+                      SparseMatrix<typename MatrixType::value_type>>));
+
+    Assert((dynamic_cast<const FE_DGQ<dim> *>(&fe_dg) ||
+            dynamic_cast<const FE_DGP<dim> *>(&fe_dg) ||
+            dynamic_cast<const FE_SimplexDGP<dim> *>(&fe_dg)),
+           ExcMessage("FE type not supported."));
+
+    AffineConstraints constraints;
+    constraints.close();
+    const double penalty_constant =
+      10 * (fe_dg.degree + dim) * (fe_dg.degree + 1);
+    TrilinosWrappers::SparsityPattern dsp;
+    const_cast<AgglomerationHandler<dim> &>(ah)
+      .create_agglomeration_sparsity_pattern(dsp);
+    system_matrix.reinit(dsp);
+    const unsigned int dofs_per_cell = fe_dg.n_dofs_per_cell();
+    FullMatrix<double> cell_matrix(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M11(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M12(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M21(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M22(dofs_per_cell, dofs_per_cell);
+    std::vector<types::global_dof_index> local_dof_indices(dofs_per_cell);
+    std::vector<types::global_dof_index> local_dof_indices_neighbor(
+      dofs_per_cell);
+
+    for (const auto &polytope : ah.polytope_iterators())
+      {
+        if (polytope->is_locally_owned())
+          {
+            cell_matrix              = 0.;
+            const auto &agglo_values = ah.reinit(polytope);
+            for (unsigned int q_index : agglo_values.quadrature_point_indices())
+              {
+                for (unsigned int i = 0; i < dofs_per_cell; ++i)
+                  {
+                    for (unsigned int j = 0; j < dofs_per_cell; ++j)
+                      {
+                        cell_matrix(i, j) +=
+                          agglo_values.shape_grad(i, q_index) *
+                          agglo_values.shape_grad(j, q_index) *
+                          agglo_values.JxW(q_index);
+                      }
+                  }
+              }
+            // get volumetric DoFs
+            polytope->get_dof_indices(local_dof_indices);
+            // Assemble face terms
+            unsigned int n_faces = polytope->n_faces();
+            const double h_f     = polytope->diameter();
+            for (unsigned int f = 0; f < n_faces; ++f)
+              {
+                if (polytope->at_boundary(f))
+                  {
+                    // Get normal vectors seen from each agglomeration.
+                    const auto &fe_face = ah.reinit(polytope, f);
+                    const auto &normals = fe_face.get_normal_vectors();
+                    for (unsigned int q_index :
+                         fe_face.quadrature_point_indices())
+                      {
+                        const Tensor<1, dim> &normal = normals[q_index];
+                        for (unsigned int i = 0; i < dofs_per_cell; ++i)
+                          {
+                            for (unsigned int j = 0; j < dofs_per_cell; ++j)
+                              {
+                                cell_matrix(i, j) +=
+                                  (-fe_face.shape_value(i, q_index) *
+                                     fe_face.shape_grad(j, q_index) * normal -
+                                   fe_face.shape_grad(i, q_index) * normal *
+                                     fe_face.shape_value(j, q_index) +
+                                   (penalty_constant / h_f) *
+                                     fe_face.shape_value(i, q_index) *
+                                     fe_face.shape_value(j, q_index)) *
+                                  fe_face.JxW(q_index);
+                              }
+                          }
+                      }
+                  }
+                else
+                  {
+                    const auto &neigh_polytope = polytope->neighbor(f);
+                    if (polytope->id() < neigh_polytope->id())
+                      {
+                        unsigned int nofn =
+                          polytope->neighbor_of_agglomerated_neighbor(f);
+                        Assert(neigh_polytope->neighbor(nofn)->id() ==
+                                 polytope->id(),
+                               ExcMessage("Mismatch."));
+                        const auto &fe_faces = ah.reinit_interface(
+                          polytope, neigh_polytope, f, nofn);
+                        const auto &fe_faces0 = fe_faces.first;
+                        if (neigh_polytope->is_locally_owned())
+                          {
+                            // use both fevalues
+                            const auto &fe_faces1 = fe_faces.second;
+                            M11                   = 0.;
+                            M12                   = 0.;
+                            M21                   = 0.;
+                            M22                   = 0.;
+                            assemble_local_jumps_and_averages(M11,
+                                                              M12,
+                                                              M21,
+                                                              M22,
+                                                              fe_faces0,
+                                                              fe_faces1,
+                                                              penalty_constant,
+                                                              h_f);
+                            // distribute DoFs accordingly
+                            // fluxes
+                            neigh_polytope->get_dof_indices(
+                              local_dof_indices_neighbor);
+                            constraints.distribute_local_to_global(
+                              M11, local_dof_indices, system_matrix);
+                            constraints.distribute_local_to_global(
+                              M12,
+                              local_dof_indices,
+                              local_dof_indices_neighbor,
+                              system_matrix);
+                            constraints.distribute_local_to_global(
+                              M21,
+                              local_dof_indices_neighbor,
+                              local_dof_indices,
+                              system_matrix);
+                            constraints.distribute_local_to_global(
+                              M22, local_dof_indices_neighbor, system_matrix);
+                          }
+                        else
+                          {
+                            // neigh polytope is ghosted, so retrieve necessary
+                            // metadata.
+                            types::subdomain_id neigh_rank =
+                              neigh_polytope->subdomain_id();
+                            const auto &recv_jxws =
+                              ah.recv_jxws.at(neigh_rank)
+                                .at({neigh_polytope->id(), nofn});
+                            const auto &recv_values =
+                              ah.recv_values.at(neigh_rank)
+                                .at({neigh_polytope->id(), nofn});
+                            const auto &recv_gradients =
+                              ah.recv_gradients.at(neigh_rank)
+                                .at({neigh_polytope->id(), nofn});
+                            M11 = 0.;
+                            M12 = 0.;
+                            M21 = 0.;
+                            M22 = 0.;
+                            // there's no FEFaceValues on the other side (it's
+                            // ghosted), so we just pass the actual data we have
+                            // recevied from the neighboring ghosted polytope
+                            assemble_local_jumps_and_averages_ghost(
+                              M11,
+                              M12,
+                              M21,
+                              M22,
+                              fe_faces0,
+                              recv_values,
+                              recv_gradients,
+                              recv_jxws,
+                              penalty_constant,
+                              h_f);
+                            // distribute DoFs accordingly
+                            // fluxes
+                            neigh_polytope->get_dof_indices(
+                              local_dof_indices_neighbor);
+                            constraints.distribute_local_to_global(
+                              M11, local_dof_indices, system_matrix);
+                            constraints.distribute_local_to_global(
+                              M12,
+                              local_dof_indices,
+                              local_dof_indices_neighbor,
+                              system_matrix);
+                            constraints.distribute_local_to_global(
+                              M21,
+                              local_dof_indices_neighbor,
+                              local_dof_indices,
+                              system_matrix);
+                            constraints.distribute_local_to_global(
+                              M22, local_dof_indices_neighbor, system_matrix);
+                          } // ghosted polytope case
+                      }     // only once
+                  }         // internal face
+              }             // face loop
+            constraints.distribute_local_to_global(cell_matrix,
+                                                   local_dof_indices,
+                                                   system_matrix);
+          } // locally owned polytopes
+      }
+    system_matrix.compress(VectorOperation::add);
+  }
+
+
+
+  /**
+   * Compute SIPDG matrix as well as rhs vector.
+   * @note Hardcoded for $f=1$ and simplex elements.
+   * TODO: Pass Function object for boundary conditions and forcing term.
+   */
+  template <int dim, typename MatrixType, typename VectorType>
+  void
+  assemble_dg_matrix_on_standard_mesh(MatrixType               &system_matrix,
+                                      VectorType               &system_rhs,
+                                      const Mapping<dim>       &mapping,
+                                      const FiniteElement<dim> &fe_dg,
+                                      const DoFHandler<dim>    &dof_handler)
+  {
+    static_assert(
+      (std::is_same_v<MatrixType, TrilinosWrappers::SparseMatrix> ||
+       std::is_same_v<MatrixType,
+                      SparseMatrix<typename MatrixType::value_type>>));
+
+    Assert((dynamic_cast<const FE_SimplexDGP<dim> *>(&fe_dg) != nullptr),
+           DEAL_II_NOT_IMPLEMENTED());
+
+    Assert(dof_handler.get_triangulation().all_reference_cells_are_simplex(),
+           ExcNotImplemented());
+
+    const double penalty_constant = .5 * fe_dg.degree * (fe_dg.degree + 1);
+    AffineConstraints<typename MatrixType::value_type> constraints;
+    constraints.close();
+
+    const IndexSet &locally_owned_dofs = dof_handler.locally_owned_dofs();
+    const IndexSet  locally_relevant_dofs =
+      DoFTools::extract_locally_relevant_dofs(dof_handler);
+
+    DynamicSparsityPattern dsp(locally_relevant_dofs);
+    DoFTools::make_flux_sparsity_pattern(dof_handler, dsp);
+    SparsityTools::distribute_sparsity_pattern(
+      dsp,
+      dof_handler.locally_owned_dofs(),
+      dof_handler.get_mpi_communicator(),
+      locally_relevant_dofs);
+
+    system_matrix.reinit(locally_owned_dofs,
+                         locally_owned_dofs,
+                         dsp,
+                         dof_handler.get_communicator());
+
+    system_rhs.reinit(locally_owned_dofs, dof_handler.get_communicator());
+
+    const unsigned int quadrature_degree = fe_dg.degree + 1;
+    FEFaceValues<dim>  fe_faces0(mapping,
+                                fe_dg,
+                                QGaussSimplex<dim - 1>(quadrature_degree),
+                                update_values | update_JxW_values |
+                                  update_gradients | update_quadrature_points |
+                                  update_normal_vectors);
+
+
+    FEValues<dim> fe_values(mapping,
+                            fe_dg,
+                            QGaussSimplex<dim>(quadrature_degree),
+                            update_values | update_JxW_values |
+                              update_gradients | update_quadrature_points);
+
+    FEFaceValues<dim>  fe_faces1(mapping,
+                                fe_dg,
+                                QGaussSimplex<dim - 1>(quadrature_degree),
+                                update_values | update_JxW_values |
+                                  update_gradients | update_quadrature_points |
+                                  update_normal_vectors);
+    const unsigned int dofs_per_cell = fe_dg.n_dofs_per_cell();
+
+    FullMatrix<double> cell_matrix(dofs_per_cell, dofs_per_cell);
+    Vector<double>     cell_rhs(dofs_per_cell);
+
+    FullMatrix<double> M11(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M12(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M21(dofs_per_cell, dofs_per_cell);
+    FullMatrix<double> M22(dofs_per_cell, dofs_per_cell);
+
+    std::vector<types::global_dof_index> local_dof_indices(dofs_per_cell);
+
+    // Loop over standard deal.II cells
+    for (const auto &cell : dof_handler.active_cell_iterators())
+      {
+        if (cell->is_locally_owned())
+          {
+            cell_matrix = 0.;
+            cell_rhs    = 0.;
+
+            fe_values.reinit(cell);
+
+            // const auto &q_points = fe_values.get_quadrature_points();
+            // const unsigned int n_qpoints = q_points.size();
+
+            for (unsigned int q_index : fe_values.quadrature_point_indices())
+              {
+                for (unsigned int i = 0; i < dofs_per_cell; ++i)
+                  {
+                    for (unsigned int j = 0; j < dofs_per_cell; ++j)
+                      {
+                        cell_matrix(i, j) += fe_values.shape_grad(i, q_index) *
+                                             fe_values.shape_grad(j, q_index) *
+                                             fe_values.JxW(q_index);
+                      }
+                    cell_rhs(i) +=
+                      fe_values.shape_value(i, q_index) * 1. *
+                      fe_values.JxW(q_index); // TODO: pass functional
+                  }
+              }
+
+            // distribute volumetric DoFs
+            cell->get_dof_indices(local_dof_indices);
+            double hf = 0.;
+            for (const auto f : cell->face_indices())
+              {
+                const double extent1 =
+                  cell->measure() / cell->face(f)->measure();
+
+                if (cell->face(f)->at_boundary())
+                  {
+                    hf = (1. / extent1 + 1. / extent1);
+                    fe_faces0.reinit(cell, f);
+
+                    const auto &normals = fe_faces0.get_normal_vectors();
+                    for (unsigned int q_index :
+                         fe_faces0.quadrature_point_indices())
+                      {
+                        for (unsigned int i = 0; i < dofs_per_cell; ++i)
+                          {
+                            for (unsigned int j = 0; j < dofs_per_cell; ++j)
+                              {
+                                cell_matrix(i, j) +=
+                                  (-fe_faces0.shape_value(i, q_index) *
+                                     fe_faces0.shape_grad(j, q_index) *
+                                     normals[q_index] -
+                                   fe_faces0.shape_grad(i, q_index) *
+                                     normals[q_index] *
+                                     fe_faces0.shape_value(j, q_index) +
+                                   (penalty_constant * hf) *
+                                     fe_faces0.shape_value(i, q_index) *
+                                     fe_faces0.shape_value(j, q_index)) *
+                                  fe_faces0.JxW(q_index);
+                              }
+                            cell_rhs(i) +=
+                              0.; // TODO: add bdary conditions functional
+                          }
+                      }
+                  }
+                else
+                  {
+                    const auto &neigh_cell = cell->neighbor(f);
+                    if (cell->global_active_cell_index() <
+                        neigh_cell->global_active_cell_index())
+                      {
+                        const double extent2 =
+                          neigh_cell->measure() /
+                          neigh_cell->face(cell->neighbor_of_neighbor(f))
+                            ->measure();
+                        hf = (1. / extent1 + 1. / extent2);
+                        fe_faces0.reinit(cell, f);
+                        fe_faces1.reinit(neigh_cell,
+                                         cell->neighbor_of_neighbor(f));
+
+                        std::vector<types::global_dof_index>
+                          local_dof_indices_neighbor(dofs_per_cell);
+
+                        M11 = 0.;
+                        M12 = 0.;
+                        M21 = 0.;
+                        M22 = 0.;
+
+                        const auto &normals = fe_faces0.get_normal_vectors();
+                        // M11
+                        for (unsigned int q_index :
+                             fe_faces0.quadrature_point_indices())
+                          {
+                            for (unsigned int i = 0; i < dofs_per_cell; ++i)
+                              {
+                                for (unsigned int j = 0; j < dofs_per_cell; ++j)
+                                  {
+                                    M11(i, j) +=
+                                      (-0.5 * fe_faces0.shape_grad(i, q_index) *
+                                         normals[q_index] *
+                                         fe_faces0.shape_value(j, q_index) -
+                                       0.5 * fe_faces0.shape_grad(j, q_index) *
+                                         normals[q_index] *
+                                         fe_faces0.shape_value(i, q_index) +
+                                       (penalty_constant * hf) *
+                                         fe_faces0.shape_value(i, q_index) *
+                                         fe_faces0.shape_value(j, q_index)) *
+                                      fe_faces0.JxW(q_index);
+
+                                    M12(i, j) +=
+                                      (0.5 * fe_faces0.shape_grad(i, q_index) *
+                                         normals[q_index] *
+                                         fe_faces1.shape_value(j, q_index) -
+                                       0.5 * fe_faces1.shape_grad(j, q_index) *
+                                         normals[q_index] *
+                                         fe_faces0.shape_value(i, q_index) -
+                                       (penalty_constant * hf) *
+                                         fe_faces0.shape_value(i, q_index) *
+                                         fe_faces1.shape_value(j, q_index)) *
+                                      fe_faces1.JxW(q_index);
+
+                                    // A10
+                                    M21(i, j) +=
+                                      (-0.5 * fe_faces1.shape_grad(i, q_index) *
+                                         normals[q_index] *
+                                         fe_faces0.shape_value(j, q_index) +
+                                       0.5 * fe_faces0.shape_grad(j, q_index) *
+                                         normals[q_index] *
+                                         fe_faces1.shape_value(i, q_index) -
+                                       (penalty_constant * hf) *
+                                         fe_faces1.shape_value(i, q_index) *
+                                         fe_faces0.shape_value(j, q_index)) *
+                                      fe_faces1.JxW(q_index);
+
+                                    // A11
+                                    M22(i, j) +=
+                                      (0.5 * fe_faces1.shape_grad(i, q_index) *
+                                         normals[q_index] *
+                                         fe_faces1.shape_value(j, q_index) +
+                                       0.5 * fe_faces1.shape_grad(j, q_index) *
+                                         normals[q_index] *
+                                         fe_faces1.shape_value(i, q_index) +
+                                       (penalty_constant * hf) *
+                                         fe_faces1.shape_value(i, q_index) *
+                                         fe_faces1.shape_value(j, q_index)) *
+                                      fe_faces1.JxW(q_index);
+                                  }
+                              }
+                          }
+
+                        // distribute DoFs accordingly
+
+                        neigh_cell->get_dof_indices(local_dof_indices_neighbor);
+
+                        constraints.distribute_local_to_global(
+                          M11, local_dof_indices, system_matrix);
+                        constraints.distribute_local_to_global(
+                          M12,
+                          local_dof_indices,
+                          local_dof_indices_neighbor,
+                          system_matrix);
+                        constraints.distribute_local_to_global(
+                          M21,
+                          local_dof_indices_neighbor,
+                          local_dof_indices,
+                          system_matrix);
+                        constraints.distribute_local_to_global(
+                          M22, local_dof_indices_neighbor, system_matrix);
+
+                      } // check idx neighbors
+                  }     // over faces
+              }
+            constraints.distribute_local_to_global(cell_matrix,
+                                                   cell_rhs,
+                                                   local_dof_indices,
+                                                   system_matrix,
+                                                   system_rhs);
+          }
+      }
+    system_matrix.compress(VectorOperation::add);
+    system_rhs.compress(VectorOperation::add);
+  }
+
+
 } // namespace dealii::PolyUtils
 
 #endif
diff --git a/include/utils.h b/include/utils.h
index 67576303..ecc9f40c 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -123,10 +123,11 @@ namespace Utils
     const DoFHandler<dim, spacedim> &coarse_agglo_dh = coarse_ah.agglo_dh;
     const DoFHandler<dim, spacedim> &fine_agglo_dh   = fine_ah.agglo_dh;
 
-    const FiniteElement<dim, spacedim> &fe   = coarse_ah.get_fe();
-    const Triangulation<dim, spacedim> &tria = coarse_ah.get_triangulation();
-    const auto &fine_bboxes                  = fine_ah.get_local_bboxes();
-    const auto &coarse_bboxes                = coarse_ah.get_local_bboxes();
+    const Mapping<dim, spacedim>       &mapping = fine_ah.get_mapping();
+    const FiniteElement<dim, spacedim> &fe      = coarse_ah.get_fe();
+    const Triangulation<dim, spacedim> &tria    = coarse_ah.get_triangulation();
+    const auto &fine_bboxes                     = fine_ah.get_local_bboxes();
+    const auto &coarse_bboxes                   = coarse_ah.get_local_bboxes();
 
     const IndexSet &locally_owned_dofs_fine =
       fine_agglo_dh.locally_owned_dofs();
@@ -158,7 +159,8 @@ namespace Utils
     const std::vector<Point<dim>> &unit_support_points =
       fe.get_unit_support_points();
     Quadrature<dim>         quad(unit_support_points);
-    FEValues<dim, spacedim> output_fe_values(fe,
+    FEValues<dim, spacedim> output_fe_values(mapping,
+                                             fe,
                                              quad,
                                              update_quadrature_points);
 
diff --git a/source/multigrid_amg.cc b/source/multigrid_amg.cc
index 0126bd78..62e1a822 100644
--- a/source/multigrid_amg.cc
+++ b/source/multigrid_amg.cc
@@ -125,7 +125,15 @@ namespace dealii
         dst[level].reinit(dof_handlers[level]->locally_owned_dofs(),
                           dof_handlers[level]->get_communicator());
       }
-    dst[dst.max_level()].copy_locally_owned_data_from(src);
+
+    if constexpr (std::is_same_v<VectorType, TrilinosWrappers::MPI::Vector>)
+      dst[dst.max_level()] = src;
+    else if constexpr (std::is_same_v<
+                         VectorType,
+                         LinearAlgebra::distributed::Vector<double>>)
+      dst[dst.max_level()].copy_locally_owned_data_from(src);
+    else
+      DEAL_II_NOT_IMPLEMENTED();
   }
 
 
@@ -138,7 +146,14 @@ namespace dealii
     const MGLevelObject<VectorType> &src) const
   {
     (void)dof_handler;
-    dst.copy_locally_owned_data_from(src[src.max_level()]);
+    if constexpr (std::is_same_v<VectorType, TrilinosWrappers::MPI::Vector>)
+      dst = src[src.max_level()];
+    else if constexpr (std::is_same_v<
+                         VectorType,
+                         LinearAlgebra::distributed::Vector<double>>)
+      dst.copy_locally_owned_data_from(src[src.max_level()]);
+    else
+      DEAL_II_NOT_IMPLEMENTED();
   }
 
 
@@ -157,4 +172,9 @@ namespace dealii
   template class MGTransferAgglomeration<
     3,
     LinearAlgebra::distributed::Vector<double>>;
+
+  // Trilinos vectors instantiations
+  template class MGTransferAgglomeration<1, TrilinosWrappers::MPI::Vector>;
+  template class MGTransferAgglomeration<2, TrilinosWrappers::MPI::Vector>;
+  template class MGTransferAgglomeration<3, TrilinosWrappers::MPI::Vector>;
 } // namespace dealii

From ae127c01a5ec8bfff41e09d717d9a47764a62f81 Mon Sep 17 00:00:00 2001
From: Marco Feder <marco.feder@sissa.it>
Date: Tue, 8 Oct 2024 01:24:46 +0200
Subject: [PATCH 2/3] Fix assert

---
 include/poly_utils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/poly_utils.h b/include/poly_utils.h
index 58f50a87..35dac5bd 100644
--- a/include/poly_utils.h
+++ b/include/poly_utils.h
@@ -1932,7 +1932,8 @@ namespace dealii::PolyUtils
                       SparseMatrix<typename MatrixType::value_type>>));
 
     Assert((dynamic_cast<const FE_SimplexDGP<dim> *>(&fe_dg) != nullptr),
-           DEAL_II_NOT_IMPLEMENTED());
+           ExcNotImplemented(
+             "Implemented only for simplex meshes for the time being."));
 
     Assert(dof_handler.get_triangulation().all_reference_cells_are_simplex(),
            ExcNotImplemented());

From 679c2bc06b97f429419c82d1001dac0c3fee007a Mon Sep 17 00:00:00 2001
From: Marco Feder <marco.feder@sissa.it>
Date: Tue, 8 Oct 2024 13:38:12 +0200
Subject: [PATCH 3/3] Use LA::d::V parallel vectors

---
 examples/simplex_agglomerated_multigrid.cc | 23 ++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/examples/simplex_agglomerated_multigrid.cc b/examples/simplex_agglomerated_multigrid.cc
index 3f9795f1..d6761a52 100644
--- a/examples/simplex_agglomerated_multigrid.cc
+++ b/examples/simplex_agglomerated_multigrid.cc
@@ -71,8 +71,6 @@ template <typename VectorType, typename MatrixType, typename SolverType>
 class MGCoarseDirect : public MGCoarseGridBase<VectorType>
 {
 public:
-  // using  = typename
-  // LinearAlgebra::distributed::Vector<MatrixType::value_type>;
   MGCoarseDirect(const MatrixType &matrix)
   {
     coarse_matrix = &matrix;
@@ -224,7 +222,8 @@ template <int dim>
 void
 AgglomeratedMultigridSimplex<dim>::agglomerate_and_compute_level_matrices()
 {
-  using VectorType = TrilinosWrappers::MPI::Vector;
+  using VectorType = LinearAlgebra::distributed::Vector<
+    double>; // TrilinosWrappers::MPI::Vector;
   GridTools::Cache<dim> cached_tria(tria_pft, mapping);
 
   // Define matrix free operator
@@ -349,7 +348,7 @@ AgglomeratedMultigridSimplex<dim>::agglomerate_and_compute_level_matrices()
         << injection_matrices_two_level.back().n() << std::endl;
 
 
-  TrilinosWrappers::MPI::Vector system_rhs;
+  VectorType      system_rhs;
   const IndexSet &locally_owned_dofs = dof_handler.locally_owned_dofs();
   system_rhs.reinit(locally_owned_dofs, comm);
 
@@ -413,10 +412,8 @@ AgglomeratedMultigridSimplex<dim>::agglomerate_and_compute_level_matrices()
   smoother_data.resize(0, total_tree_levels + 1);
 
   // Fine level
-  std::vector<TrilinosWrappers::MPI::Vector> diag_inverses(total_tree_levels +
-                                                           1);
-  diag_inverses[total_tree_levels].reinit(dof_handler.locally_owned_dofs(),
-                                          comm);
+  std::vector<VectorType> diag_inverses(total_tree_levels + 1);
+  diag_inverses[total_tree_levels].reinit(locally_owned_dofs, comm);
 
   // Set exact diagonal for each operator
   for (unsigned int i =
@@ -518,7 +515,13 @@ AgglomeratedMultigridSimplex<dim>::agglomerate_and_compute_level_matrices()
   // Finally, solve.
 
   VectorType solution;
-  solution.reinit(system_rhs.locally_owned_elements());
+  if constexpr (std::is_same_v<VectorType,
+                               LinearAlgebra::distributed::Vector<double>>)
+    solution.reinit(locally_owned_dofs, comm);
+  else if constexpr (std::is_same_v<VectorType, TrilinosWrappers::MPI::Vector>)
+    solution.reinit(system_rhs.locally_owned_elements(), comm);
+  else
+    DEAL_II_NOT_IMPLEMENTED();
 
   ReductionControl     solver_control(10000, 1e-12, 1e-9);
   SolverCG<VectorType> cg(solver_control);
@@ -647,7 +650,7 @@ template <int dim>
 void
 AgglomeratedMultigridSimplex<dim>::run()
 {
-  make_fine_grid(3);
+  make_fine_grid(4);
   agglomerate_and_compute_level_matrices();
   pcout << std::endl;
 }