Draft
Changes from all commits (20 commits)
9325207
Fetch main branch of CCCL, using cudax
caugonnet Dec 12, 2024
37823b0
Merge branch 'branch-25.02' into cudastf
caugonnet Jan 9, 2025
012dd38
Merge branch 'branch-25.02' into cudastf
caugonnet Jan 10, 2025
ae3036b
thrust::binary_function was deprecated in CCCL 2.6 and removed in CCC…
caugonnet Jan 10, 2025
1d0d307
verify-copyright updates
caugonnet Jan 10, 2025
959bbc1
Include a thrust header that was missing to use thrust::max
caugonnet Jan 17, 2025
b76174d
Operators such as key_group_id_less_t are templated by functors which we
caugonnet Jan 17, 2025
86f4e00
CUDASTF needs -lcuda
caugonnet Jan 18, 2025
0c8497f
Start to reintroduce STF constructs in PRIMS algorithms
caugonnet Jan 18, 2025
0eb223a
Use argument elision with logical_token
caugonnet Jan 18, 2025
0f6e2ae
Merge branch 'rapidsai:branch-25.02' into cudastf
caugonnet Jan 21, 2025
22b54cc
More work (in progress) to enable only parts of the algorithms
caugonnet Jan 22, 2025
f960650
Save WIP for page rank with STF
caugonnet Jan 22, 2025
2f1e0c5
use async resources saved in raft handles to initialize the stream ctx
caugonnet Jan 22, 2025
dc863de
Merge branch 'branch-25.04' into cudastf
caugonnet Mar 26, 2025
0f0d53f
revert previous change broken by the previous merge
caugonnet Mar 26, 2025
f5f0c5e
Merge branch 'rapidsai:branch-25.04' into cudastf
caugonnet Mar 30, 2025
d2031bf
revert some outdated cmake changes
caugonnet Mar 30, 2025
fee82d2
Workaround a warning that we cannot capture structured binding before…
caugonnet Mar 31, 2025
3e62f73
Update token API
caugonnet Apr 14, 2025
10 changes: 10 additions & 0 deletions cpp/CMakeLists.txt
@@ -58,6 +58,15 @@ option(USE_RAFT_STATIC "Build raft as a static library" OFF)
option(CUGRAPH_COMPILE_RAFT_LIB "Compile the raft library instead of using it header-only" ON)
option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF)

option(BUILD_CUGRAPH_COMPONENTS_ALGORITHMS "Enable components algorithms" ON)
option(BUILD_CUGRAPH_SAMPLING_ALGORITHMS "Enable sampling algorithms" ON)
option(BUILD_CUGRAPH_CENTRALITY_ALGORITHMS "Enable centrality algorithms" ON)
option(BUILD_CUGRAPH_COMMUNITY_ALGORITHMS "Enable community algorithms" ON)
option(BUILD_CUGRAPH_TRAVERSAL_ALGORITHMS "Enable traversal algorithms" ON)
option(BUILD_CUGRAPH_TREE_ALGORITHMS "Enable tree algorithms" ON)
option(BUILD_CUGRAPH_LINK_ANALYSIS_ALGORITHMS "Enable link analysis algorithms" ON)
option(BUILD_CUGRAPH_LINK_PREDICTION_ALGORITHMS "Enable link prediction algorithms" ON)

message(VERBOSE "CUGRAPH: CUDA_STATIC_RUNTIME=${CUDA_STATIC_RUNTIME}")

################################################################################
@@ -494,6 +503,7 @@ target_link_libraries(cugraph
rmm::rmm
raft::raft
$<BUILD_LOCAL_INTERFACE:CUDA::toolkit>
cuda
PRIVATE
${COMPILED_RAFT_LIB}
cuco::cuco
9 changes: 9 additions & 0 deletions cpp/cmake/thirdparty/cccl_override.json
@@ -0,0 +1,9 @@
{
"packages": {
"cccl": {
"version": "2.8.0",
"git_url": "https://github.com/NVIDIA/cccl.git",
"git_tag": "main"
}
}
}
9 changes: 8 additions & 1 deletion cpp/cmake/thirdparty/get_cccl.cmake
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -15,6 +15,13 @@
# This function finds CCCL and sets any additional necessary environment variables.
function(find_and_configure_cccl)
include(${rapids-cmake-dir}/cpm/cccl.cmake)
include(${rapids-cmake-dir}/cpm/package_override.cmake)

rapids_cpm_package_override("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/cccl_override.json")
Author comment (caugonnet): This may be something we surround with a CUGRAPH_USE_STF option, for example.

# Enable cudax namespace install
set(CCCL_ENABLE_UNSTABLE ON)

rapids_cpm_cccl(BUILD_EXPORT_SET cugraph-exports INSTALL_EXPORT_SET cugraph-exports)
endfunction()

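Note: CCCL_ENABLE_UNSTABLE is what exposes the experimental cudax headers that the PRIMS changes below include (<cuda/experimental/stf.cuh>). For reference, a minimal standalone CUDASTF program, adapted from the upstream AXPY sample and not part of this PR, looks roughly like this:

// Illustrative only: shows what the cudax headers provide once
// CCCL_ENABLE_UNSTABLE is ON; nothing here is cuGraph code.
#include <cuda/experimental/stf.cuh>

#include <cmath>

using namespace cuda::experimental::stf;

__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
  int nthreads = gridDim.x * blockDim.x;
  for (int i = tid; i < x.size(); i += nthreads) {
    y(i) += a * x(i);
  }
}

int main()
{
  stream_ctx ctx;  // the same context type the PRIMS changes use

  const size_t N = 16;
  double X[N], Y[N];
  for (size_t i = 0; i < N; i++) {
    X[i] = std::sin(double(i));
    Y[i] = std::cos(double(i));
  }

  auto lX = ctx.logical_data(X);  // STF manages host/device copies of X and Y
  auto lY = ctx.logical_data(Y);

  // Read X, read-write Y; STF provides the stream and the device views.
  ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) {
    axpy<<<16, 128, 0, s>>>(2.0, dX, dY);
  };

  ctx.finalize();  // wait for completion and write results back to X and Y
  return 0;
}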
10 changes: 10 additions & 0 deletions cpp/src/prims/detail/extract_transform_if_v_frontier_e.cuh
@@ -59,6 +59,11 @@
#include <utility>
#include <vector>

#include <cuda/experimental/stf.cuh>
#include <raft/core/resource/custom_resource.hpp>

using namespace cuda::experimental::stf;

namespace cugraph {

namespace detail {
@@ -761,6 +766,9 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle,
constexpr bool try_bitmap = GraphViewType::is_multi_gpu && std::is_same_v<key_t, vertex_t> &&
KeyBucketType::is_sorted_unique;

async_resources_handle& cudastf_handle = *raft::resource::get_custom_resource<async_resources_handle>(handle);
stream_ctx cudastf_ctx(handle.get_stream(), cudastf_handle);

if (do_expensive_check) {
auto frontier_vertex_first =
thrust_tuple_get_or_identity<decltype(frontier.begin()), 0>(frontier.begin());
@@ -1658,6 +1666,8 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle,
if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }
}

cudastf_ctx.finalize();

return std::make_tuple(std::move(key_buffer), std::move(value_buffer));
}

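In this file the STF context is only opened and finalized for now; the tasks in between are still being reintroduced (see the "Start to reintroduce STF constructs in PRIMS algorithms" and "More work (in progress)" commits). The pattern the diff establishes is: fetch the async_resources_handle stored on the raft handle as a custom resource, build a stream_ctx on the handle's stream, submit token-guarded tasks, and finalize so all work joins back on that stream before returning. A sketch of that shape, using a made-up fill_with_stf helper rather than the real frontier logic:

// Illustration only (not part of the diff): the shape of the STF wrapping this
// PR introduces around a PRIMS algorithm. An async_resources_handle is assumed
// to have been attached to the raft handle by the caller (see the "use async
// resources saved in raft handles" commit).
#include <cuda/experimental/stf.cuh>
#include <raft/core/handle.hpp>
#include <raft/core/resource/custom_resource.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>

using namespace cuda::experimental::stf;

template <typename T>
void fill_with_stf(raft::handle_t const& handle, T* out, size_t n, T value)
{
  auto& stf_resources = *raft::resource::get_custom_resource<async_resources_handle>(handle);
  stream_ctx ctx(handle.get_stream(), stf_resources);  // context chained to the handle's stream

  auto out_token = ctx.token();                        // token standing in for the output buffer
  ctx.task(out_token.write())->*[=](cudaStream_t s) {  // write dependency on that token
    thrust::fill(rmm::exec_policy_nosync(s), out, out + n, value);
  };

  ctx.finalize();  // all submitted tasks complete before we return to the caller
}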
87 changes: 63 additions & 24 deletions cpp/src/prims/detail/per_v_transform_reduce_e.cuh
@@ -64,6 +64,11 @@
#include <type_traits>
#include <utility>

#include <cuda/experimental/stf.cuh>
#include <raft/core/resource/custom_resource.hpp>

using namespace cuda::experimental::stf;

namespace cugraph {

namespace detail {
@@ -1164,6 +1169,15 @@ void per_v_transform_reduce_e_edge_partition(
std::optional<raft::host_span<size_t const>> key_segment_offsets,
std::optional<raft::host_span<size_t const>> const& edge_partition_stream_pool_indices)
{
async_resources_handle& cudastf_handle = *raft::resource::get_custom_resource<async_resources_handle>(handle);
stream_ctx cudastf_ctx(handle.get_stream(), cudastf_handle);

token output_tokens[4];
for (size_t i = 0; i < 4; i++)
{
output_tokens[i] = cudastf_ctx.token();
}

constexpr bool use_input_key = !std::is_same_v<OptionalKeyIterator, void*>;

using vertex_t = typename GraphViewType::vertex_type;
@@ -1187,10 +1201,13 @@

if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit
// every vertex in the hypersparse segment
thrust::fill(rmm::exec_policy_nosync(exec_stream),
output_buffer + (*key_segment_offsets)[3],
output_buffer + (*key_segment_offsets)[4],
major_init);
// TODO task write output_token[3]
cudastf_ctx.task(output_tokens[3].write())->*[=](cudaStream_t stream) {
thrust::fill(rmm::exec_policy_nosync(stream),
output_buffer + (*key_segment_offsets)[3],
output_buffer + (*key_segment_offsets)[4],
major_init);
};
}

auto segment_size = use_input_key
@@ -1200,8 +1217,9 @@
raft::grid_1d_thread_t update_grid(segment_size,
detail::per_v_transform_reduce_e_kernel_block_size,
handle.get_device_properties().maxGridSize[0]);
size_t token_idx = 0;
auto segment_output_buffer = output_buffer;
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; }
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; token_idx +=3; }
auto segment_key_first = edge_partition_key_first;
auto segment_key_last = edge_partition_key_last;
if constexpr (use_input_key) {
@@ -1212,20 +1230,22 @@
assert(segment_key_first == nullptr);
assert(segment_key_last == nullptr);
}
detail::per_v_transform_reduce_e_hypersparse<update_major, GraphViewType>
<<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
edge_partition,
segment_key_first,
segment_key_last,
edge_partition_src_value_input,
edge_partition_dst_value_input,
edge_partition_e_value_input,
edge_partition_e_mask,
segment_output_buffer,
e_op,
major_init,
reduce_op,
pred_op);
cudastf_ctx.task(output_tokens[token_idx].rw())->*[=](cudaStream_t stream) {
detail::per_v_transform_reduce_e_hypersparse<update_major, GraphViewType>
<<<update_grid.num_blocks, update_grid.block_size, 0, stream>>>(
edge_partition,
segment_key_first,
segment_key_last,
edge_partition_src_value_input,
edge_partition_dst_value_input,
edge_partition_e_value_input,
edge_partition_e_mask,
segment_output_buffer,
e_op,
major_init,
reduce_op,
pred_op);
};
}
}
if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) {
@@ -1236,8 +1256,9 @@
raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2],
detail::per_v_transform_reduce_e_kernel_block_size,
handle.get_device_properties().maxGridSize[0]);
size_t token_idx = 0;
auto segment_output_buffer = output_buffer;
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; }
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; token_idx += 2; }
std::optional<segment_key_iterator_t>
segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor
// is a deleted function, segment_key_first should always have a value
@@ -1247,8 +1268,10 @@
segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first());
}
*segment_key_first += (*key_segment_offsets)[2];

cudastf_ctx.task(output_tokens[token_idx].rw())->*[=](cudaStream_t stream) {
detail::per_v_transform_reduce_e_low_degree<update_major, GraphViewType>
<<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
<<<update_grid.num_blocks, update_grid.block_size, 0, stream>>>(
edge_partition,
*segment_key_first,
*segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]),
@@ -1261,6 +1284,7 @@
major_init,
reduce_op,
pred_op);
};
}
if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) {
auto exec_stream = edge_partition_stream_pool_indices
@@ -1270,8 +1294,9 @@
raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1],
detail::per_v_transform_reduce_e_kernel_block_size,
handle.get_device_properties().maxGridSize[0]);
size_t token_idx = 0;
auto segment_output_buffer = output_buffer;
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; }
if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; token_idx += 1;}
std::optional<segment_key_iterator_t>
segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor
// is a deleted function, segment_key_first should always have a value
@@ -1281,8 +1306,10 @@
segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first());
}
*segment_key_first += (*key_segment_offsets)[1];

cudastf_ctx.task(output_tokens[token_idx].rw())->*[=](cudaStream_t stream) {
detail::per_v_transform_reduce_e_mid_degree<update_major, GraphViewType>
<<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
<<<update_grid.num_blocks, update_grid.block_size, 0, stream>>>(
edge_partition,
*segment_key_first,
*segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]),
@@ -1296,6 +1323,7 @@
major_identity_element,
reduce_op,
pred_op);
};
}
if ((*key_segment_offsets)[1] > 0) {
auto exec_stream = edge_partition_stream_pool_indices
@@ -1316,8 +1344,9 @@
} else {
segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first());
}
cudastf_ctx.task(output_tokens[0].rw())->*[=](cudaStream_t stream) {
detail::per_v_transform_reduce_e_high_degree<update_major, GraphViewType>
<<<update_grid.num_blocks, update_grid.block_size, 0, exec_stream>>>(
<<<update_grid.num_blocks, update_grid.block_size, 0, stream>>>(
edge_partition,
*segment_key_first,
*segment_key_first + (*key_segment_offsets)[1],
@@ -1331,6 +1360,7 @@
major_identity_element,
reduce_op,
pred_op);
};
}
} else {
auto exec_stream = edge_partition_stream_pool_indices
@@ -1374,6 +1404,8 @@ void per_v_transform_reduce_e_edge_partition(
pred_op);
}
}

cudastf_ctx.finalize();
}

template <bool incoming, // iterate over incoming edges (incoming == true) or outgoing edges
@@ -3106,6 +3138,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
}
if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }

// TODO BEGIN
//stream_ctx stf_ctx(handle.get_stream());

for (size_t j = 0; j < loop_count; ++j) {
if (process_local_edges[j]) {
auto partition_idx = i + j;
@@ -3278,6 +3313,10 @@
}
}
}

//stf_ctx.finalize();

// TODO END
if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); }

if constexpr (GraphViewType::is_multi_gpu && update_major) {
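The per_v_transform_reduce_e_edge_partition changes above replace direct kernel launches on stream-pool streams with STF tasks guarded by four tokens, one per key segment of the output buffer (token_idx picks the segment a launch updates). Declaring write()/rw() accesses on the same token serializes the initializing fill and the kernel that updates that segment, while tasks on different tokens remain independent and are free to overlap. A condensed, illustrative sketch of the scheme; per_segment_updates and the commented-out segment kernels are placeholders, not code from this PR:

// Condensed sketch of the token-based dependency scheme used in the diff;
// the segment kernels are stand-ins for the per-degree-segment kernels above.
#include <cuda/experimental/stf.cuh>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>

using namespace cuda::experimental::stf;

void per_segment_updates(stream_ctx& ctx, double* output_buffer, size_t const* seg_offsets)
{
  token seg_tokens[2];
  for (auto& t : seg_tokens) { t = ctx.token(); }  // one token per output segment

  // Initializing segment 1 declares a write dependency on its token.
  ctx.task(seg_tokens[1].write())->*[=](cudaStream_t s) {
    thrust::fill(rmm::exec_policy_nosync(s),
                 output_buffer + seg_offsets[1],
                 output_buffer + seg_offsets[2],
                 0.0);
  };

  // A later update of segment 1 declares rw() on the same token, so STF
  // orders it after the fill.
  ctx.task(seg_tokens[1].rw())->*[=](cudaStream_t s) {
    // segment1_kernel<<<grid, block, 0, s>>>(output_buffer + seg_offsets[1], ...);
  };

  // Work on segment 0 uses a different token, so it has no dependency on the
  // segment-1 tasks and may run concurrently on another internal stream.
  ctx.task(seg_tokens[0].rw())->*[=](cudaStream_t s) {
    // segment0_kernel<<<grid, block, 0, s>>>(output_buffer + seg_offsets[0], ...);
  };
}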