Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cuda_bindings/benchmarks/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ __pycache__/

# Override root .gitignore *.cpp rule (which targets Cython-generated files)
!benchmarks/cpp/*.cpp

results-python.json
results-cpp.json
22 changes: 13 additions & 9 deletions cuda_bindings/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
To run the benchmarks combine the environment and task:

```bash

# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench

# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks (environment is irrelevant here)
# Run the C++ benchmarks
pixi run -e wheel bench-cpp
```

## pyperf JSON
Both runners automatically save results to JSON files in the benchmarks
directory: `results-python.json` and `results-cpp.json`.

The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
The results are written to a JSON file in the format expected by pyperf.
## Output JSON and analysis

The C++ benchmarks also generate a valid JSON file, in the same format.
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
Both Python and C++ results are saved in pyperf-compatible JSON format,
which can be analyzed with pyperf commands:

```
pixi run -e wheel bench-cpp -0 cpp.json
```bash
# Show results and statistics
pixi run -e wheel -- python -m pyperf stats results-python.json
pixi run -e wheel -- python -m pyperf stats results-cpp.json

pixi run -e wheel pyperf stats cpp.json
# Compare C++ vs Python results
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
```
62 changes: 62 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

# ensure_context() comes from the local runner package; presumably it creates
# and activates a CUDA context for this process — TODO confirm. The handle is
# kept at module scope so bench_ctx_set_current can re-bind it repeatedly.
CTX = ensure_context()

# The binding returns a (status, value) tuple; the status code is deliberately
# ignored here — benchmark setup assumes a working device 0.
_, DEVICE = cuda.cuDeviceGet(0)
# Attribute queried by bench_device_get_attribute (compute capability major).
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR


def bench_ctx_get_current(loops: int) -> float:
    """Time `loops` back-to-back cuCtxGetCurrent calls.

    Returns total elapsed wall-clock seconds.
    """
    # Hoist the attribute lookup out of the timed region so the loop
    # measures only the driver-call overhead.
    get_current = cuda.cuCtxGetCurrent

    start = time.perf_counter()
    for _ in range(loops):
        get_current()
    return time.perf_counter() - start


def bench_ctx_set_current(loops: int) -> float:
    """Time `loops` cuCtxSetCurrent calls re-binding the module's context.

    Returns total elapsed wall-clock seconds.
    """
    # Locals are bound before the clock starts so the loop measures only
    # the driver call, not attribute/global lookups.
    _cuCtxSetCurrent = cuda.cuCtxSetCurrent
    _ctx = CTX

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuCtxSetCurrent(_ctx)
    return time.perf_counter() - t0


def bench_ctx_get_device(loops: int) -> float:
    """Return elapsed seconds for `loops` cuCtxGetDevice calls."""
    # Bind once up front; the timed loop should see only the driver call.
    get_device = cuda.cuCtxGetDevice

    start = time.perf_counter()
    for _ in range(loops):
        get_device()
    return time.perf_counter() - start


def bench_device_get(loops: int) -> float:
    """Return elapsed seconds for `loops` cuDeviceGet(0) calls."""
    # Local binding keeps attribute-lookup cost out of the measurement.
    device_get = cuda.cuDeviceGet

    start = time.perf_counter()
    for _ in range(loops):
        device_get(0)
    return time.perf_counter() - start


def bench_device_get_attribute(loops: int) -> float:
    """Time `loops` cuDeviceGetAttribute queries of compute capability major.

    Returns total elapsed wall-clock seconds.
    """
    # Locals are bound before the clock starts so the loop measures only
    # the driver call.
    _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute
    _attr = ATTRIBUTE
    _dev = DEVICE

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuDeviceGetAttribute(_attr, _dev)
    return time.perf_counter() - t0
62 changes: 62 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Module-level stream and event reused by the benchmarks below. The returned
# status codes are deliberately ignored; setup assumes a healthy device.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)

# Record once and drain the stream so EVENT has already completed before the
# query/synchronize benchmarks run — they then measure the no-wait fast path.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

# Flags reused by bench_event_create_destroy on every iteration.
EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value


def bench_event_create_destroy(loops: int) -> float:
    """Return elapsed seconds for `loops` event create+destroy pairs."""
    # Bind everything the loop touches to locals before starting the clock.
    create = cuda.cuEventCreate
    destroy = cuda.cuEventDestroy
    flags = EVENT_FLAGS

    start = time.perf_counter()
    for _ in range(loops):
        _, event = create(flags)
        destroy(event)
    return time.perf_counter() - start


def bench_event_record(loops: int) -> float:
    """Time `loops` cuEventRecord calls on the shared event and stream.

    Returns total elapsed wall-clock seconds.
    """
    # Locals are bound outside the timed region so the loop measures only
    # the driver call.
    _cuEventRecord = cuda.cuEventRecord
    _event = EVENT
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventRecord(_event, _stream)
    return time.perf_counter() - t0


def bench_event_query(loops: int) -> float:
    """Return elapsed seconds for `loops` cuEventQuery calls on EVENT."""
    # Local binding keeps attribute lookups out of the measurement.
    query = cuda.cuEventQuery
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        query(event)
    return time.perf_counter() - start


def bench_event_synchronize(loops: int) -> float:
    """Time `loops` cuEventSynchronize calls on the already-completed event.

    Returns total elapsed wall-clock seconds.
    """
    # Locals are bound outside the timed region so the loop measures only
    # the driver call.
    _cuEventSynchronize = cuda.cuEventSynchronize
    _event = EVENT

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuEventSynchronize(_event)
    return time.perf_counter() - t0
103 changes: 103 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import ctypes
import time

from runner.runtime import alloc_persistent, compile_and_load, ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Compile kernels
# CUDA C source for three kernels of increasing argument count. The REPn
# macros expand to comma-separated `int*` parameters, so small_kernel_16_args
# ends up with exactly 16 pointer arguments (1 + 1 + 2 + 4 + 8).
KERNEL_SOURCE = """\
extern "C" __global__ void empty_kernel() { return; }
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }

#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)

extern "C" __global__
void small_kernel_16_args(
    ITEM_PARAM(F, int*)
    REP1(A, int*)
    REP2(A, int*)
    REP4(A, int*)
    REP8(A, int*))
{ *F = 0; }
"""

# compile_and_load comes from the local runner package; presumably it compiles
# the source with NVRTC and returns a loaded module handle — TODO confirm.
MODULE = compile_and_load(KERNEL_SOURCE)

# Get kernel handles
# Status codes are deliberately ignored; setup assumes the compile succeeded.
_err, EMPTY_KERNEL = cuda.cuModuleGetFunction(MODULE, b"empty_kernel")
_err, SMALL_KERNEL = cuda.cuModuleGetFunction(MODULE, b"small_kernel")
_err, KERNEL_16_ARGS = cuda.cuModuleGetFunction(MODULE, b"small_kernel_16_args")

# Create a non-blocking stream for launches
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)

# Allocate device memory for kernel arguments
FLOAT_PTR = alloc_persistent(ctypes.sizeof(ctypes.c_float))
INT_PTRS = [alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)]

# Pre-pack ctypes params for the pre-packed benchmark
# _val_ps must stay alive at module scope: PACKED_16 stores the *addresses* of
# these c_void_p objects, so each PACKED_16[i] acts as a void** pointing at a
# device-pointer value — the kernelParams layout the driver expects.
_val_ps = [ctypes.c_void_p(int(p)) for p in INT_PTRS]
PACKED_16 = (ctypes.c_void_p * 16)()
for _i in range(16):
    PACKED_16[_i] = ctypes.addressof(_val_ps[_i])


def bench_launch_empty_kernel(loops: int) -> float:
    """Return elapsed seconds for `loops` launches of the no-op kernel."""
    # Locals are bound before the clock starts so the loop measures launch
    # overhead only. Trailing 0, 0 = no kernel params and no extra options.
    launch = cuda.cuLaunchKernel
    kernel = EMPTY_KERNEL
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
    return time.perf_counter() - start


def bench_launch_small_kernel(loops: int) -> float:
    """Time `loops` launches of the one-argument kernel.

    The argument is passed as an (args, types) tuple each iteration, so the
    binding's per-launch argument packing is included in the measurement.
    Returns total elapsed wall-clock seconds.
    """
    _cuLaunchKernel = cuda.cuLaunchKernel
    _kernel = SMALL_KERNEL
    _stream = STREAM
    # A None arg type presumably tells the binding to treat the value as a
    # raw device pointer — TODO confirm against cuda.bindings docs.
    _args = (FLOAT_PTR,)
    _arg_types = (None,)

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
    return time.perf_counter() - t0


def bench_launch_16_args(loops: int) -> float:
    """Return elapsed seconds for `loops` launches of the 16-pointer kernel.

    Arguments go through the binding's (args, types) packing on every call,
    so per-launch packing cost is part of the measurement.
    """
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    arg_values = tuple(INT_PTRS)
    arg_types = (None,) * 16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args_pre_packed(loops: int) -> float:
    """Time `loops` launches of the 16-pointer kernel with pre-packed args.

    PACKED_16 is a module-level (c_void_p * 16) array built once at import
    time, so this benchmark excludes per-launch argument packing — compare
    with bench_launch_16_args to isolate the packing cost.
    Returns total elapsed wall-clock seconds.
    """
    _cuLaunchKernel = cuda.cuLaunchKernel
    _kernel = KERNEL_16_ARGS
    _stream = STREAM
    _packed = PACKED_16

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
    return time.perf_counter() - t0
45 changes: 45 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

ensure_context()

# Shared non-blocking stream reused by the query/synchronize benchmarks.
# The status code is deliberately ignored; setup assumes a healthy device.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)


def bench_stream_create_destroy(loops: int) -> float:
    """Return elapsed seconds for `loops` stream create+destroy pairs."""
    # Bind everything the loop touches to locals before starting the clock.
    create = cuda.cuStreamCreate
    destroy = cuda.cuStreamDestroy
    flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

    start = time.perf_counter()
    for _ in range(loops):
        _, stream = create(flags)
        destroy(stream)
    return time.perf_counter() - start


def bench_stream_query(loops: int) -> float:
    """Return elapsed seconds for `loops` cuStreamQuery calls on STREAM."""
    # Local binding keeps attribute lookups out of the measurement.
    query = cuda.cuStreamQuery
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        query(stream)
    return time.perf_counter() - start


def bench_stream_synchronize(loops: int) -> float:
    """Time `loops` cuStreamSynchronize calls on the idle shared stream.

    Returns total elapsed wall-clock seconds.
    """
    # Locals are bound outside the timed region so the loop measures only
    # the driver call.
    _cuStreamSynchronize = cuda.cuStreamSynchronize
    _stream = STREAM

    t0 = time.perf_counter()
    for _ in range(loops):
        _cuStreamSynchronize(_stream)
    return time.perf_counter() - t0
49 changes: 46 additions & 3 deletions cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ find_library(
"${CONDA_PREFIX_HINT}/lib/stubs"
)

# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks)
find_path(
NVRTC_INCLUDE_DIR
nvrtc.h
HINTS
"${CUDA_HOME_HINT}/include"
"${CONDA_PREFIX_HINT}/targets/x86_64-linux/include"
"${CONDA_PREFIX_HINT}/include"
)

find_library(
NVRTC_LIBRARY
NAMES nvrtc
HINTS
"${CUDA_HOME_HINT}/lib64"
"${CUDA_HOME_HINT}/lib"
"${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib"
"${CONDA_PREFIX_HINT}/lib"
)

if(NOT CUDA_DRIVER_INCLUDE_DIR)
message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.")
endif()
Expand All @@ -43,6 +63,29 @@ if(NOT CUDA_DRIVER_LIBRARY)
message(FATAL_ERROR "Could not find libcuda. Ensure the NVIDIA driver is installed.")
endif()

add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp)
target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}")
target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}")
# Helper: add a benchmark that only needs the driver API.
# Creates an executable target "<name>_cpp" from "<name>.cpp" and links it
# against the CUDA driver include dir and library located above.
function(add_driver_benchmark name)
add_executable(${name}_cpp ${name}.cpp)
target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}")
target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}")
endfunction()

# Helper: add a benchmark that needs driver API + NVRTC.
# Same as add_driver_benchmark, but additionally compiles against nvrtc.h and
# links libnvrtc (callers must ensure NVRTC_INCLUDE_DIR/NVRTC_LIBRARY are set).
function(add_nvrtc_benchmark name)
add_executable(${name}_cpp ${name}.cpp)
target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}")
target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}")
endfunction()

# Driver-only benchmarks
add_driver_benchmark(bench_pointer_attributes)
add_driver_benchmark(bench_ctx_device)
add_driver_benchmark(bench_stream)
add_driver_benchmark(bench_event)

# NVRTC benchmarks (require nvrtc for kernel compilation)
if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY)
add_nvrtc_benchmark(bench_launch)
else()
# Non-fatal: driver-only benchmarks above still build without NVRTC.
message(WARNING "NVRTC not found — skipping bench_launch. Install cuda-nvrtc-dev.")
endif()
Loading
Loading