diff --git a/tests/benchdnn/graph/bench_graph.cpp b/tests/benchdnn/graph/bench_graph.cpp
index 441bd29ea8b..3224f254995 100644
--- a/tests/benchdnn/graph/bench_graph.cpp
+++ b/tests/benchdnn/graph/bench_graph.cpp
@@ -58,17 +58,6 @@ void check_correctness(const settings_t &s) {
     }
 }
 
-int verify_input(const settings_t &s) {
-    if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) {
-        // TODO: update graph driver doc page once the limitation is removed.
-        BENCHDNN_PRINT(0, "%s\n",
-                "Error: graph driver doesn't support "
-                "--mode-modifier=M/--mode=F.");
-        return FAIL;
-    }
-    return OK;
-}
-
 int bench(int argc, char **argv) {
     driver_name = "graph";
     using namespace parser;
@@ -88,7 +77,6 @@ int bench(int argc, char **argv) {
         if (!parsed_options) {
             if (!parse_input_file(s.json_file, argv[0]))
                 catch_unknown_options(argv[0]);
-            SAFE(verify_input(s), WARN);
             check_correctness(s);
             flush_temp_memory();
         }
diff --git a/tests/benchdnn/graph/custom_driver.cpp b/tests/benchdnn/graph/custom_driver.cpp
index d2307106b59..7a0540dab77 100644
--- a/tests/benchdnn/graph/custom_driver.cpp
+++ b/tests/benchdnn/graph/custom_driver.cpp
@@ -293,10 +293,20 @@ void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
 
 int fill_mem(dnn_mem_t &mem_dt, dnn_mem_t &mem_fp, int f_min, int f_max) {
 
+    const auto dt = mem_dt.dt();
+    if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
+            && !is_integral_dt(dt)) {
+        // Use data filled by benchdnn for `no_ref_memory`, except some
+        // customized operations in Graph API which expect the input
+        // values to indicate indexing information, especially for integral
+        // inputs. Hence we need to limit the input values to the
+        // provided range.
+        return OK;
+    }
+
     const auto nelems = mem_fp.nelems();
     if (nelems == 0) return OK;
 
-    const auto dt = mem_dt.dt();
     f_min = (dt == dnnl_u8 && f_min < 0) ? 0 : f_min;
     const int64_t n_chunks = 16;
     const int64_t chunk_size = div_up(nelems, n_chunks);
@@ -339,8 +349,6 @@ void init_memory_args(dnn_mem_map_t &mem_map, const prb_t *prb,
 
 int init_ref_memory_args(dnn_mem_map_t &ref_mem_map, dnn_mem_map_t &mem_map,
         const prb_t *prb, res_t *res) {
-    if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;
-
     switch (prb->alg) {
         case GENINDEX:
             SAFE(::custom::genindex::init_ref_memory_args(
diff --git a/tests/benchdnn/graph/graph.cpp b/tests/benchdnn/graph/graph.cpp
index 3860bbffb7c..71a99c31a50 100644
--- a/tests/benchdnn/graph/graph.cpp
+++ b/tests/benchdnn/graph/graph.cpp
@@ -202,6 +202,10 @@ int find_logical_tensor(size_t lt_id, const graph::op_ref_list_t &ops,
 int map_unmap_partition_mem(graph::partition_mem_map_t &partition_mem_map,
         const std::vector<dnnl::graph::logical_tensor> &lts,
         const int &map_flag, res_t *res) {
+
+    // Do not map or unmap the reference primitive memories for `no_ref_memory`
+    if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;
+
     // In case one logical tensor is used for multiple inputs, record the
     // processed logical tensor ids to avoid duplicate processing
     std::unordered_set<size_t> processed_ids;
@@ -253,7 +257,6 @@ int make_input_tensors(std::vector<dnnl::graph::tensor> &input_ts,
         }
 
         // generate tensor for graph path
-
         const auto iter = partition_mem_map.find(lt_id);
         if (iter != partition_mem_map.end()) {
             const auto &graph_mem = iter->second;
@@ -663,10 +666,12 @@ int doit(const prb_t *prb, res_t *res) {
         std::vector<dnnl::graph::tensor> output_ts(outputs.size());
 
         ref_partition_t ref_partition(dg, partitions[i], inputs, outputs);
+
         // Construct memory for both perf & corr modes
-        SAFE(ref_partition.init_ref(
-                     graph_in_ports, partition_mem_map_v[i], res),
-                WARN);
+        SAFE(ref_partition.init_ref(graph_in_ports, res), WARN);
+        if (res->state == SKIPPED) return OK;
+
+        SAFE(ref_partition.init_graph_mem(partition_mem_map_v[i], res), WARN);
         if (res->state == SKIPPED) return OK;
 
         if (has_bench_mode_bit(mode_bit_t::corr)) {
@@ -683,15 +688,12 @@ int doit(const prb_t *prb, res_t *res) {
         }
 
         // unmap memory from host to device
-        map_unmap_partition_mem(partition_mem_map_v[i], inputs, UNMAP, res);
-        map_unmap_partition_mem(partition_mem_map_v[i], outputs, UNMAP, res);
-        if (res->state == FAIL) {
-            BENCHDNN_PRINT(0,
-                    "FAIL: Fail to unmap memories to host for partition "
-                    "%zu.\n",
-                    i);
-            return FAIL;
-        }
+        SAFE(map_unmap_partition_mem(
+                     partition_mem_map_v[i], inputs, UNMAP, res),
+                WARN);
+        SAFE(map_unmap_partition_mem(
+                     partition_mem_map_v[i], outputs, UNMAP, res),
+                WARN);
 
         const op_ref_list_t &op_list = ref_partition.get_partition_ops();
         const auto &inplace_ports
@@ -731,8 +733,10 @@ int doit(const prb_t *prb, res_t *res) {
         graph_mem_mgr.stop_graph_mem_check();
 
         // map memory from device back to host
-        map_unmap_partition_mem(partition_mem_map_v[i], inputs, MAP, res);
-        map_unmap_partition_mem(partition_mem_map_v[i], outputs, MAP, res);
+        SAFE(map_unmap_partition_mem(partition_mem_map_v[i], inputs, MAP, res),
+                WARN);
+        SAFE(map_unmap_partition_mem(partition_mem_map_v[i], outputs, MAP, res),
+                WARN);
 
         // If the device is out-of-memory due to graph path execution, skip the
         // case.
diff --git a/tests/benchdnn/graph/graph_memory.cpp b/tests/benchdnn/graph/graph_memory.cpp
index c8fb95b5a72..05c9578983f 100644
--- a/tests/benchdnn/graph/graph_memory.cpp
+++ b/tests/benchdnn/graph/graph_memory.cpp
@@ -45,23 +45,10 @@ size_t get_benchdnn_device_limit() {
 // Constructs memories for all inputs and outputs needed for comparison.
 dnn_graph_mem_t::dnn_graph_mem_t(const dnn_mem_t &mem,
         const deserialized_lt &lt, const bool is_op_input,
-        const bool is_fake_output)
+        const bool use_graph_layout)
     : graph_dims_(lt.shape_), graph_strides_(lt.stride_) {
-    const auto &prim_dt = mem.dt();
-    // Conversion from graph types to dnnl types + boolean to u8.
-    const auto &graph_dt = convert_dt(lt.get_data_type());
-
-    // Get memory tag of primitive memory
-    int ndims = mem.ndims();
-    dims_t strides(mem.strides(), mem.strides() + ndims);
-    std::string mtag = strides2memory_tag(ndims, strides);
-
     const auto &g_eng = get_graph_engine().operator const dnnl::engine &();
 
-    // We create memory for graph path in two steps:
-    // 1. Create memory objects.
-    // 2. Do memory copy if needed.
-    //
     // For inputs, graph path needs data from reference path,
     // and the data movement requires both memories have the same
     // shape, so the tag of graph path is used to create the memory.
@@ -70,42 +57,77 @@ dnn_graph_mem_t::dnn_graph_mem_t(const dnn_mem_t &mem,
     // otherwise use shape & tag from ref path side
 
     // Create memory for graph path
+    const auto &graph_dt = convert_dt(lt.get_data_type());
     const auto data_type = static_cast<dnnl::memory::data_type>(graph_dt);
-    if (is_op_input) {
-        if (graph_dims_.empty()) graph_dims_.push_back(1);
-        if (graph_strides_.empty()) graph_strides_.push_back(1);
 
-        // create graph memory
+    if (graph_dims_.empty()) {
+        // As graph strides are deduced from graph dims, they should be in
+        // compliance with each other.
+        assert(graph_strides_.empty());
+
+        graph_dims_.push_back(1);
+        graph_strides_.push_back(1);
+    }
+
+    if (is_op_input) {
+        // Create graph memory with memory description from graph path.
         dnnl::memory::desc md(graph_dims_, data_type, graph_strides_);
         mem_ = dnn_mem_t(md.get(), g_eng.get());
-
-        const auto prim_to_graph_memcpy = [](dnn_mem_t &graph_mem,
-                                                  const dnn_mem_t &prim_mem) {
-            const void *prim_data_handle = static_cast<const void *>(prim_mem);
-            void *graph_data_handle = graph_mem.get_mapped_pointer<void>();
-            std::memcpy(graph_data_handle, prim_data_handle, graph_mem.size());
-        };
-
-        if (prim_dt != graph_dt) {
-            // Call a reorder (for data conversion) when reference memory
-            // doesn't coincide with the graph memory...
-            dnn_mem_t c_mem(ndims, mem.dims(), graph_dt, mtag, g_eng.get());
-            SAFE_V(c_mem.reorder(mem));
-            prim_to_graph_memcpy(mem_, c_mem);
-        } else {
-            // ... otherwise, perform a plain memcpy.
-            prim_to_graph_memcpy(mem_, mem);
-        }
     } else {
-        if (is_fake_output) {
+        if (use_graph_layout) {
+            // For some cases such as fake outputs and no reference memory
+            // mode, which means the output does not have a corresponding
+            // argument in primitives, we need to create them with memory
+            // description from graph path.
             dnnl::memory::desc md(graph_dims_, data_type, graph_strides_);
             mem_ = dnn_mem_t(md.get(), g_eng.get());
+
         } else {
+            // Use information from the reference memory descriptor to create
+            // memories. As we need to reorder output from both paths to abx
+            // for comparison, the memory tag of graph path output should align
+            // with the reference path.
+
+            // Get memory tag of primitive memory
+            int ndims = mem.ndims();
+            dims_t strides(mem.strides(), mem.strides() + ndims);
+            std::string mtag = strides2memory_tag(ndims, strides);
+
             mem_ = dnn_mem_t(mem.md_, graph_dt, mtag, g_eng.get());
         }
     }
 }
 
+// Copies `mem` into `mem_`, reordering first when data types differ. Returns
+// OK on success, FAIL when the byte sizes of the two memories do not match.
+int dnn_graph_mem_t::fill_mem_with_data(const dnn_mem_t &mem) {
+    if (mem.size() != mem_.size()) return FAIL;
+
+    const auto &src_dt = mem.dt();
+    const auto &dst_dt = mem_.dt();
+    int ndims = mem.ndims();
+    dims_t strides(mem.strides(), mem.strides() + ndims);
+    std::string mtag = strides2memory_tag(ndims, strides);
+    const auto &g_eng = get_graph_engine().operator const dnnl::engine &();
+
+    const auto prim_to_graph_memcpy = [](dnn_mem_t &graph_mem,
+                                              const dnn_mem_t &prim_mem) {
+        const void *prim_data_handle = static_cast<const void *>(prim_mem);
+        void *graph_data_handle = graph_mem.get_mapped_pointer<void>();
+        std::memcpy(graph_data_handle, prim_data_handle, graph_mem.size());
+    };
+
+    if (src_dt != dst_dt) {
+        // Convert into a temporary of the destination type, then copy it.
+        dnn_mem_t c_mem(ndims, mem.dims(), dst_dt, mtag, g_eng.get());
+        SAFE_V(c_mem.reorder(mem));
+        prim_to_graph_memcpy(mem_, c_mem);
+    } else {
+        prim_to_graph_memcpy(mem_, mem);
+    }
+    return OK;
+}
+
 dnnl::graph::tensor dnn_graph_mem_t::make_graph_tensor(
         const deserialized_lt &lt) const {
     void *data_handle;
diff --git a/tests/benchdnn/graph/graph_memory.hpp b/tests/benchdnn/graph/graph_memory.hpp
index 35fcc94e715..e9221fbe76b 100644
--- a/tests/benchdnn/graph/graph_memory.hpp
+++ b/tests/benchdnn/graph/graph_memory.hpp
@@ -156,12 +156,15 @@ struct dnn_graph_mem_t {
     //
     // The constructor accepts three boolean parameters:
     // 1. is_op_input: whether the logical tensor is an input of an op
-    // 2. is_fake_output: for fake outputs, the driver cannot create memory
-    // objects based on primitive memory for them, but construct memory
-    // from graph shape. The default value is false.
+    // 2. use_graph_layout: for fake outputs and mode without reference
+    // memories, the driver cannot create memory objects based on primitive
+    // memory for them, but construct memory from graph shape. The default
+    // value is false.
     //
     dnn_graph_mem_t(const dnn_mem_t &mem, const deserialized_lt &lt,
-            const bool is_op_input, const bool is_fake_output = false);
+            const bool is_op_input, const bool use_graph_layout = false);
+
+    int fill_mem_with_data(const dnn_mem_t &mem);
 
     dnnl::graph::tensor make_graph_tensor(const deserialized_lt &lt) const;
 
diff --git a/tests/benchdnn/graph/ref_partition.cpp b/tests/benchdnn/graph/ref_partition.cpp
index 5d42c4e7204..a87199e7a03 100644
--- a/tests/benchdnn/graph/ref_partition.cpp
+++ b/tests/benchdnn/graph/ref_partition.cpp
@@ -70,8 +70,14 @@ ref_partition_t::ref_partition_t(const deserialized_graph_t &dg,
     }
 };
 
-int ref_partition_t::init_ref(const std::vector<size_t> &graph_in_ports,
-        partition_mem_map_t &partition_mem_map, res_t *res) {
+int ref_partition_t::init_ref(
+        const std::vector<size_t> &graph_in_ports, res_t *res) {
+
+    // Do not create reference primitives or fill data with pre-designed
+    // strategies for `no_ref_memory`.
+    if (!has_bench_mode_bit(mode_bit_t::corr)
+            && has_bench_mode_modifier(mode_modifier_t::no_ref_memory))
+        return OK;
 
     for (const auto &par_op_ref : partition_ops_ref_) {
         // res should be independent from op to op
@@ -150,30 +156,47 @@ int ref_partition_t::init_ref(const std::vector<size_t> &graph_in_ports,
         SAFE_V(data_displacer.displace_input_data(
                 entry.first, const_cast<dnn_mem_t &>(entry.second), res));
     }
+    return OK;
+}
+
+int ref_partition_t::init_graph_mem(
+        partition_mem_map_t &partition_mem_map, res_t *res) {
 
     // init graph input/oputput memory from lt_id_2_mems_
     for (const auto &id : partition_in_ids_) {
-        if (lt_id_2_mems_.find(id) == lt_id_2_mems_.end()) {
+        partition_mem_map.emplace(id,
+                dnn_graph_mem_t({}, lt_id_2_lt_.at(id), /*is_op_input=*/true));
+        if (lt_id_2_mems_.find(id) != lt_id_2_mems_.end()) {
+            SAFE(partition_mem_map.at(id).fill_mem_with_data(
+                         lt_id_2_mems_.at(id)),
+                    WARN);
+        } else if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) {
             BENCHDNN_PRINT(0, "Fail: cannot find memory for %zu\n", id);
             res->state = FAILED;
             return FAIL;
         }
-        partition_mem_map.emplace(id,
-                dnn_graph_mem_t(
-                        lt_id_2_mems_.at(id), lt_id_2_lt_.at(id), true));
     }
+
     for (const auto &id : partition_out_ids_) {
-        if (fake_lt_ids_.find(id) != fake_lt_ids_.end()) {
-            partition_mem_map.emplace(
-                    id, dnn_graph_mem_t({}, lt_id_2_lt_.at(id), false, true));
-        } else if (lt_id_2_mems_.find(id) == lt_id_2_mems_.end()) {
+
+        if (fake_lt_ids_.find(id) != fake_lt_ids_.end()
+                || has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) {
+            partition_mem_map.emplace(id,
+                    dnn_graph_mem_t({}, lt_id_2_lt_.at(id),
+                            /*is_op_input=*/false, /*use_graph_layout=*/true));
+        } else if (lt_id_2_mems_.find(id) != lt_id_2_mems_.end()) {
+            // For output memories of graph, they need to be in compliance with
+            // the reference memories regarding the shapes and memory tags, as
+            // the memories of both paths will be reordered to abx for
+            // comparison.
+            partition_mem_map.emplace(id,
+                    dnn_graph_mem_t(lt_id_2_mems_.at(id), lt_id_2_lt_.at(id),
+                            /*is_op_input=*/false));
+        } else {
             BENCHDNN_PRINT(0, "Fail: cannot find memory for %zu\n", id);
             res->state = FAILED;
             return FAIL;
-        } else
-            partition_mem_map.emplace(id,
-                    dnn_graph_mem_t(
-                            lt_id_2_mems_.at(id), lt_id_2_lt_.at(id), false));
+        }
     }
 
     return OK;
diff --git a/tests/benchdnn/graph/ref_partition.hpp b/tests/benchdnn/graph/ref_partition.hpp
index 7fe726792ad..e74d24f5ab3 100644
--- a/tests/benchdnn/graph/ref_partition.hpp
+++ b/tests/benchdnn/graph/ref_partition.hpp
@@ -40,8 +40,10 @@ class ref_partition_t {
             const std::vector<dnnl::graph::logical_tensor> &outs);
 
     // prepare memories in both paths, one by one ref primitive
-    int init_ref(const std::vector<size_t> &graph_ports,
-            partition_mem_map_t &partition_mem_map, res_t *res);
+    int init_ref(const std::vector<size_t> &graph_ports, res_t *res);
+
+    int init_graph_mem(partition_mem_map_t &partition_mem_map, res_t *res);
+
     // run partition in ref path, one by one ref primitive
     void exec_ops(res_t *res);
 
diff --git a/tests/benchdnn/graph/setting_handler.cpp b/tests/benchdnn/graph/setting_handler.cpp
index 7ce35dbf6b6..21b1d272a24 100644
--- a/tests/benchdnn/graph/setting_handler.cpp
+++ b/tests/benchdnn/graph/setting_handler.cpp
@@ -1627,7 +1627,6 @@ bool get_reduction_prb_vdims(
     }
 
     prb_vdims.vdims = {src_dims, dst_dims};
-    prb_vdims.dst_dims = src_dims;
     prb_vdims.ndims = static_cast<int>(src_dims.size());
     return true;
 }