From a843f75dc5dbcc17ea9091839b530c125ab47614 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 1 Apr 2026 11:46:01 -0500 Subject: [PATCH 1/6] Add bench_ctx_device and fix JSON output --- cuda_bindings/benchmarks/.gitignore | 4 + .../benchmarks/benchmarks/bench_ctx_device.py | 20 ++ .../benchmarks/benchmarks/cpp/CMakeLists.txt | 4 + .../benchmarks/cpp/bench_ctx_device.cpp | 65 +++++++ cuda_bindings/benchmarks/pixi.toml | 2 +- cuda_bindings/benchmarks/run_cpp.py | 8 + cuda_bindings/benchmarks/runner/cpp.py | 181 ++++++++++++++++++ cuda_bindings/benchmarks/runner/main.py | 42 +++- 8 files changed, 323 insertions(+), 3 deletions(-) create mode 100644 cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py create mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp create mode 100644 cuda_bindings/benchmarks/run_cpp.py create mode 100644 cuda_bindings/benchmarks/runner/cpp.py diff --git a/cuda_bindings/benchmarks/.gitignore b/cuda_bindings/benchmarks/.gitignore index 68ec043c85..cb2aee641d 100644 --- a/cuda_bindings/benchmarks/.gitignore +++ b/cuda_bindings/benchmarks/.gitignore @@ -11,3 +11,7 @@ __pycache__/ # Override root .gitignore *.cpp rule (which targets Cython-generated files) !benchmarks/cpp/*.cpp + + +results-python.json +results-cpp.json diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py new file mode 100644 index 0000000000..3825fe55a8 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + + +def bench_ctx_get_current(loops: int) -> float: + _cuCtxGetCurrent = cuda.cuCtxGetCurrent + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetCurrent() + return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt index d0b1758062..5058643b93 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt +++ b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt @@ -46,3 +46,7 @@ endif() add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp) target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") + +add_executable(bench_ctx_device_cpp bench_ctx_device.cpp) +target_include_directories(bench_ctx_device_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") +target_link_libraries(bench_ctx_device_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp new file mode 100644 index 0000000000..026e735c86 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + if (options.benchmark_name.empty()) { + options.benchmark_name = "cpp.ctx_device.ctx_get_current"; + } + + // Setup: init CUDA and create a context + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUcontext current_ctx = nullptr; + + // Run benchmark + auto results = bench::run_benchmark(options, [&]() { + check_cu( + cuCtxGetCurrent(¤t_ctx), + "cuCtxGetCurrent failed" + ); + }); + + // Sanity check: the call actually returned our context + if (current_ctx != ctx) { + std::cerr << "unexpected: cuCtxGetCurrent returned a different context\n"; + } + + // Cleanup + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + // Output + bench::print_summary(options.benchmark_name, results); + + if (!options.output_path.empty()) { + bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results); + } + + return 0; +} diff --git a/cuda_bindings/benchmarks/pixi.toml b/cuda_bindings/benchmarks/pixi.toml index b900158f5e..ff0f98a478 100644 --- a/cuda_bindings/benchmarks/pixi.toml +++ b/cuda_bindings/benchmarks/pixi.toml @@ -76,7 +76,7 @@ cmd = ["cmake", "--build", "$PIXI_PROJECT_ROOT/.build/cpp"] depends-on = [{ task = "bench-cpp-configure" }] [target.linux.tasks.bench-cpp] -cmd = ["$PIXI_PROJECT_ROOT/.build/cpp/bench_pointer_attributes_cpp"] +cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] depends-on = [{ task = "bench-cpp-build" }] [target.linux.tasks.lint] diff --git a/cuda_bindings/benchmarks/run_cpp.py b/cuda_bindings/benchmarks/run_cpp.py new file mode 100644 index 0000000000..96e50cb890 --- /dev/null +++ b/cuda_bindings/benchmarks/run_cpp.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from runner.cpp import main + +if __name__ == "__main__": + main() diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/cuda_bindings/benchmarks/runner/cpp.py new file mode 100644 index 0000000000..87848145fc --- /dev/null +++ b/cuda_bindings/benchmarks/runner/cpp.py @@ -0,0 +1,181 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +BUILD_DIR = PROJECT_ROOT / ".build" / "cpp" +DEFAULT_OUTPUT = PROJECT_ROOT / "results-cpp.json" + +BINARY_PREFIX = "bench_" +BINARY_SUFFIX = "_cpp" + + +def discover_binaries() -> dict[str, Path]: + """Discover C++ benchmark binaries in the build directory """ + if not BUILD_DIR.is_dir(): + return {} + + registry: dict[str, Path] = {} + for path in sorted(BUILD_DIR.iterdir()): + if not path.is_file() or not path.name.startswith(BINARY_PREFIX): + continue + if not path.name.endswith(BINARY_SUFFIX): + continue + name = path.name.removeprefix(BINARY_PREFIX).removesuffix(BINARY_SUFFIX) + registry[name] = path + return registry + + +def strip_output_args(argv: list[str]) -> list[str]: + cleaned: list[str] = [] + skip_next = False + for arg in argv: + if skip_next: + skip_next = False + continue + if arg in ("-o", "--output"): + skip_next = True + continue + if arg.startswith("-o=") or arg.startswith("--output="): + continue + cleaned.append(arg) + return cleaned + + +def merge_pyperf_json(individual_files: list[Path], output_path: Path) -> int: + """Merge individual pyperf JSON files into a single BenchmarkSuite file. 
+ + Each C++ binary produces a file with structure: + {"version": "1.0", "metadata": {...}, "benchmarks": [{...}]} + + We merge them by collecting all benchmark entries into one file. + """ + all_benchmarks = [] + + for path in individual_files: + with open(path) as f: + data = json.load(f) + + file_metadata = data.get("metadata", {}) + bench_name = file_metadata.get("name", "") + loops = file_metadata.get("loops") + unit = file_metadata.get("unit", "second") + + for bench in data.get("benchmarks", []): + for run in bench.get("runs", []): + run_meta = run.setdefault("metadata", {}) + if bench_name: + run_meta.setdefault("name", bench_name) + if loops is not None: + run_meta.setdefault("loops", loops) + run_meta.setdefault("unit", unit) + + all_benchmarks.append(bench) + + merged = { + "version": "1.0", + "benchmarks": all_benchmarks, + } + + with open(output_path, "w") as f: + json.dump(merged, f) + + return len(all_benchmarks) + + +def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: + parser = argparse.ArgumentParser( + description="Run C++ CUDA benchmarks", + add_help=False, + ) + parser.add_argument( + "--benchmark", + action="append", + default=[], + help="Benchmark name to run (e.g. 'ctx_device'). Repeat for multiple. 
Defaults to all.", + ) + parser.add_argument( + "--list", + action="store_true", + help="Print discovered benchmark names and exit.", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=DEFAULT_OUTPUT, + help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", + ) + parsed, remaining = parser.parse_known_args(argv) + return parsed, remaining + + +def main() -> None: + parsed, remaining_argv = parse_args(sys.argv[1:]) + + registry = discover_binaries() + if not registry: + print( + f"No C++ benchmark binaries found in {BUILD_DIR}.\n" + "Run 'pixi run bench-cpp-build' first.", + file=sys.stderr, + ) + sys.exit(1) + + if parsed.list: + for name in sorted(registry): + print(name) + return + + if parsed.benchmark: + missing = sorted(set(parsed.benchmark) - set(registry)) + if missing: + known = ", ".join(sorted(registry)) + unknown = ", ".join(missing) + print( + f"Unknown benchmark(s): {unknown}. Known benchmarks: {known}", + file=sys.stderr, + ) + sys.exit(1) + names = parsed.benchmark + else: + names = sorted(registry) + + # Strip any --output args to avoid conflicts with our output handling + passthrough_argv = strip_output_args(remaining_argv) + + output_path = parsed.output.resolve() + failed = False + individual_files: list[Path] = [] + + with tempfile.TemporaryDirectory(prefix="cuda_bench_cpp_") as tmpdir: + tmpdir_path = Path(tmpdir) + + for name in names: + binary = registry[name] + tmp_json = tmpdir_path / f"{name}.json" + cmd = [str(binary), "-o", str(tmp_json), *passthrough_argv] + result = subprocess.run(cmd) + if result.returncode != 0: + print(f"FAILED: {name} (exit code {result.returncode})", file=sys.stderr) + failed = True + elif tmp_json.exists(): + individual_files.append(tmp_json) + + if individual_files: + count = merge_pyperf_json(individual_files, output_path) + print(f"\nResults saved to {output_path} ({count} benchmark(s))") + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git 
a/cuda_bindings/benchmarks/runner/main.py b/cuda_bindings/benchmarks/runner/main.py index f544a29f73..01b6b80de7 100644 --- a/cuda_bindings/benchmarks/runner/main.py +++ b/cuda_bindings/benchmarks/runner/main.py @@ -12,7 +12,9 @@ import pyperf -BENCH_DIR = Path(__file__).resolve().parent.parent / "benchmarks" +PROJECT_ROOT = Path(__file__).resolve().parent.parent +BENCH_DIR = PROJECT_ROOT / "benchmarks" +DEFAULT_OUTPUT = PROJECT_ROOT / "results-python.json" def load_module(module_path: Path) -> ModuleType: @@ -54,6 +56,22 @@ def discover_benchmarks() -> dict[str, Callable[[int], float]]: return registry +def strip_pyperf_output_args(argv: list[str]) -> list[str]: + cleaned: list[str] = [] + skip_next = False + for i, arg in enumerate(argv): + if skip_next: + skip_next = False + continue + if arg in ("-o", "--output", "--append"): + skip_next = True + continue + if arg.startswith("-o=") or arg.startswith("--output=") or arg.startswith("--append="): + continue + cleaned.append(arg) + return cleaned + + def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: parser = argparse.ArgumentParser(add_help=False) parser.add_argument( @@ -67,13 +85,19 @@ def parse_args(argv: list[str]) -> tuple[argparse.Namespace, list[str]]: action="store_true", help="Print discovered benchmark IDs and exit.", ) + parser.add_argument( + "-o", + "--output", + type=Path, + default=DEFAULT_OUTPUT, + help=f"JSON output file path (default: {DEFAULT_OUTPUT.name})", + ) parsed, remaining = parser.parse_known_args(argv) return parsed, remaining def main() -> None: parsed, remaining_argv = parse_args(sys.argv[1:]) - sys.argv = [sys.argv[0], *remaining_argv] registry = discover_benchmarks() if not registry: @@ -94,10 +118,24 @@ def main() -> None: else: benchmark_ids = sorted(registry) + # Strip any --output args to avoid conflicts with our output handling + output_path = parsed.output.resolve() + remaining_argv = strip_pyperf_output_args(remaining_argv) + is_worker = "--worker" in 
remaining_argv + + # Delete the file so this run starts fresh + if not is_worker: + output_path.unlink(missing_ok=True) + + sys.argv = [sys.argv[0], "--append", str(output_path), *remaining_argv] + runner = pyperf.Runner() for bench_id in benchmark_ids: runner.bench_time_func(bench_id, registry[bench_id]) + if not is_worker: + print(f"\nResults saved to {output_path}") + if __name__ == "__main__": main() From 780b435f8565665861293baa69a9b321dc803e06 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 1 Apr 2026 13:05:36 -0500 Subject: [PATCH 2/6] Remove prefix so we can compare benchmarks --- cuda_bindings/benchmarks/README.md | 22 +++++++++++-------- .../benchmarks/cpp/bench_ctx_device.cpp | 2 +- .../cpp/bench_pointer_attributes.cpp | 2 +- cuda_bindings/benchmarks/runner/main.py | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/cuda_bindings/benchmarks/README.md b/cuda_bindings/benchmarks/README.md index c2529bdb19..bc59aa06e7 100644 --- a/cuda_bindings/benchmarks/README.md +++ b/cuda_bindings/benchmarks/README.md @@ -32,26 +32,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune To run the benchmarks combine the environment and task: ```bash - # Run the Python benchmarks in the wheel environment pixi run -e wheel bench # Run the Python benchmarks in the source environment pixi run -e source bench -# Run the C++ benchmarks (environment is irrelavant here) +# Run the C++ benchmarks pixi run -e wheel bench-cpp ``` -## pyperf JSON +Both runners automatically save results to JSON files in the benchmarks +directory: `results-python.json` and `results-cpp.json`. -The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). -The results are written to a JSON file in the format expected by pyperf. +## Output JSON and analysis -The C++ benchmarks also generate a valid JSON file, in the same format. +The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/). 
+Both Python and C++ results are saved in pyperf-compatible JSON format, +which can be analyzed with pyperf commands: -``` -pixi run -e wheel bench-cpp -0 cpp.json +```bash +# Show results and statistics +pixi run -e wheel -- python -m pyperf stats results-python.json +pixi run -e wheel -- python -m pyperf stats results-cpp.json -pixi run -e wheel pyperf stats cpp.json +# Compare C++ vs Python results +pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json ``` diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp index 026e735c86..fb5540d557 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp @@ -23,7 +23,7 @@ static void check_cu(CUresult status, const char* message) { int main(int argc, char** argv) { bench::Options options = bench::parse_args(argc, argv); if (options.benchmark_name.empty()) { - options.benchmark_name = "cpp.ctx_device.ctx_get_current"; + options.benchmark_name = "ctx_device.ctx_get_current"; } // Setup: init CUDA and create a context diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp index f1cf63d1bd..547a48b0f1 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp @@ -23,7 +23,7 @@ static void check_cu(CUresult status, const char* message) { int main(int argc, char** argv) { bench::Options options = bench::parse_args(argc, argv); if (options.benchmark_name.empty()) { - options.benchmark_name = "cpp.pointer_attributes.pointer_get_attribute"; + options.benchmark_name = "pointer_attributes.pointer_get_attribute"; } // Setup: init CUDA, allocate memory diff --git a/cuda_bindings/benchmarks/runner/main.py b/cuda_bindings/benchmarks/runner/main.py 
index 01b6b80de7..c65b0b5641 100644 --- a/cuda_bindings/benchmarks/runner/main.py +++ b/cuda_bindings/benchmarks/runner/main.py @@ -30,7 +30,7 @@ def load_module(module_path: Path) -> ModuleType: def benchmark_id(module_name: str, function_name: str) -> str: module_suffix = module_name.removeprefix("bench_") suffix = function_name.removeprefix("bench_") - return f"bindings.{module_suffix}.{suffix}" + return f"{module_suffix}.{suffix}" def discover_benchmarks() -> dict[str, Callable[[int], float]]: From 90b5e0b84b7c1e79394ccd39ea0a33ace8d1d330 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 1 Apr 2026 13:42:45 -0500 Subject: [PATCH 3/6] Add bench_event and bench_stream and compare script for a summary table --- .../benchmarks/benchmarks/bench_ctx_device.py | 45 ++++++- .../benchmarks/benchmarks/bench_event.py | 62 +++++++++ .../benchmarks/benchmarks/bench_stream.py | 45 +++++++ cuda_bindings/benchmarks/compare.py | 118 ++++++++++++++++++ cuda_bindings/benchmarks/pixi.toml | 4 + 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 cuda_bindings/benchmarks/benchmarks/bench_event.py create mode 100644 cuda_bindings/benchmarks/benchmarks/bench_stream.py create mode 100644 cuda_bindings/benchmarks/compare.py diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py index 3825fe55a8..3984689eed 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py +++ b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py @@ -8,7 +8,10 @@ from cuda.bindings import driver as cuda -ensure_context() +CTX = ensure_context() + +_, DEVICE = cuda.cuDeviceGet(0) +ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR def bench_ctx_get_current(loops: int) -> float: @@ -18,3 +21,43 @@ def bench_ctx_get_current(loops: int) -> float: for _ in range(loops): _cuCtxGetCurrent() return time.perf_counter() - t0 + + +def bench_ctx_set_current(loops: int) -> float: + 
_cuCtxSetCurrent = cuda.cuCtxSetCurrent + _ctx = CTX + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxSetCurrent(_ctx) + return time.perf_counter() - t0 + + +def bench_ctx_get_device(loops: int) -> float: + _cuCtxGetDevice = cuda.cuCtxGetDevice + + t0 = time.perf_counter() + for _ in range(loops): + _cuCtxGetDevice() + return time.perf_counter() - t0 + + +def bench_device_get(loops: int) -> float: + _cuDeviceGet = cuda.cuDeviceGet + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGet(0) + return time.perf_counter() - t0 + + +def bench_device_get_attribute(loops: int) -> float: + _cuDeviceGetAttribute = cuda.cuDeviceGetAttribute + _attr = ATTRIBUTE + _dev = DEVICE + + t0 = time.perf_counter() + for _ in range(loops): + _cuDeviceGetAttribute(_attr, _dev) + return time.perf_counter() - t0 + diff --git a/cuda_bindings/benchmarks/benchmarks/bench_event.py b/cuda_bindings/benchmarks/benchmarks/bench_event.py new file mode 100644 index 0000000000..e8e319115d --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/bench_event.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) +_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value) + +cuda.cuEventRecord(EVENT, STREAM) +cuda.cuStreamSynchronize(STREAM) + +EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value + + +def bench_event_create_destroy(loops: int) -> float: + _cuEventCreate = cuda.cuEventCreate + _cuEventDestroy = cuda.cuEventDestroy + _flags = EVENT_FLAGS + + t0 = time.perf_counter() + for _ in range(loops): + _, e = _cuEventCreate(_flags) + _cuEventDestroy(e) + return time.perf_counter() - t0 + + +def bench_event_record(loops: int) -> float: + _cuEventRecord = cuda.cuEventRecord + _event = EVENT + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventRecord(_event, _stream) + return time.perf_counter() - t0 + + +def bench_event_query(loops: int) -> float: + _cuEventQuery = cuda.cuEventQuery + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventQuery(_event) + return time.perf_counter() - t0 + + +def bench_event_synchronize(loops: int) -> float: + _cuEventSynchronize = cuda.cuEventSynchronize + _event = EVENT + + t0 = time.perf_counter() + for _ in range(loops): + _cuEventSynchronize(_event) + return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/bench_stream.py b/cuda_bindings/benchmarks/benchmarks/bench_stream.py new file mode 100644 index 0000000000..d816099ed5 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/bench_stream.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import time + +from runner.runtime import ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + + +def bench_stream_create_destroy(loops: int) -> float: + _cuStreamCreate = cuda.cuStreamCreate + _cuStreamDestroy = cuda.cuStreamDestroy + _flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value + + t0 = time.perf_counter() + for _ in range(loops): + _, s = _cuStreamCreate(_flags) + _cuStreamDestroy(s) + return time.perf_counter() - t0 + + +def bench_stream_query(loops: int) -> float: + _cuStreamQuery = cuda.cuStreamQuery + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamQuery(_stream) + return time.perf_counter() - t0 + + +def bench_stream_synchronize(loops: int) -> float: + _cuStreamSynchronize = cuda.cuStreamSynchronize + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuStreamSynchronize(_stream) + return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/compare.py b/cuda_bindings/benchmarks/compare.py new file mode 100644 index 0000000000..6a3e94f344 --- /dev/null +++ b/cuda_bindings/benchmarks/compare.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +"""Compare Python and C++ benchmark results in a summary table.""" + +import argparse +import json +import statistics +import sys +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent +DEFAULT_PYTHON = PROJECT_ROOT / "results-python.json" +DEFAULT_CPP = PROJECT_ROOT / "results-cpp.json" + + +def load_benchmarks(path: Path) -> dict[str, list[float]]: + """Load a pyperf JSON file and return {name: [values]}.""" + with open(path) as f: + data = json.load(f) + + results: dict[str, list[float]] = {} + for bench in data.get("benchmarks", []): + name = bench.get("metadata", {}).get("name", "") + if not name: + # Try to find name in run metadata + for run in bench.get("runs", []): + name = run.get("metadata", {}).get("name", "") + if name: + break + values = [] + for run in bench.get("runs", []): + values.extend(run.get("values", [])) + if name and values: + results[name] = values + return results + + +def fmt_ns(seconds: float) -> str: + ns = seconds * 1e9 + if ns >= 1000: + return f"{ns / 1000:.2f} us" + return f"{ns:.0f} ns" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compare Python vs C++ benchmark results") + parser.add_argument( + "--python", + type=Path, + default=DEFAULT_PYTHON, + help=f"Python results JSON (default: {DEFAULT_PYTHON.name})", + ) + parser.add_argument( + "--cpp", + type=Path, + default=DEFAULT_CPP, + help=f"C++ results JSON (default: {DEFAULT_CPP.name})", + ) + args = parser.parse_args() + + if not args.python.exists(): + print(f"Python results not found: {args.python}", file=sys.stderr) + print("Run: pixi run -e wheel bench", file=sys.stderr) + sys.exit(1) + + py_benchmarks = load_benchmarks(args.python) + cpp_benchmarks = load_benchmarks(args.cpp) if args.cpp.exists() else {} + + if not py_benchmarks: + print("No benchmarks found in Python results.", file=sys.stderr) + sys.exit(1) + + # Column widths + all_names = sorted(set(py_benchmarks) | 
set(cpp_benchmarks)) + name_width = max(len(n) for n in all_names) + name_width = max(name_width, len("Benchmark")) + + # Header + if cpp_benchmarks: + header = f"{'Benchmark':<{name_width}} {'C++ (mean)':>12} {'Python (mean)':>14} {'Overhead':>10}" + sep = "-" * len(header) + print(sep) + print(header) + print(sep) + else: + header = f"{'Benchmark':<{name_width}} {'Python (mean)':>14}" + sep = "-" * len(header) + print(sep) + print(header) + print(sep) + + for name in all_names: + py_vals = py_benchmarks.get(name) + cpp_vals = cpp_benchmarks.get(name) + + py_str = fmt_ns(statistics.mean(py_vals)) if py_vals else "-" + cpp_str = fmt_ns(statistics.mean(cpp_vals)) if cpp_vals else "-" + + if py_vals and cpp_vals: + py_mean = statistics.mean(py_vals) + cpp_mean = statistics.mean(cpp_vals) + overhead_ns = (py_mean - cpp_mean) * 1e9 + overhead_str = f"+{overhead_ns:.0f} ns" + else: + overhead_str = "-" + + if cpp_benchmarks: + print(f"{name:<{name_width}} {cpp_str:>12} {py_str:>14} {overhead_str:>10}") + else: + print(f"{name:<{name_width}} {py_str:>14}") + + print(sep) + + +if __name__ == "__main__": + main() diff --git a/cuda_bindings/benchmarks/pixi.toml b/cuda_bindings/benchmarks/pixi.toml index ff0f98a478..6a7850d759 100644 --- a/cuda_bindings/benchmarks/pixi.toml +++ b/cuda_bindings/benchmarks/pixi.toml @@ -29,6 +29,7 @@ cmake = "*" ninja = "*" cxx-compiler = "*" cuda-cudart-dev = "*" +cuda-nvrtc-dev = "*" [feature.cpp-bench.target.linux-64.dependencies] cuda-crt-dev_linux-64 = "*" @@ -79,5 +80,8 @@ depends-on = [{ task = "bench-cpp-configure" }] cmd = ["python", "$PIXI_PROJECT_ROOT/run_cpp.py"] depends-on = [{ task = "bench-cpp-build" }] +[target.linux.tasks.bench-compare] +cmd = ["python", "$PIXI_PROJECT_ROOT/compare.py"] + [target.linux.tasks.lint] cmd = ["pre-commit", "run", "--all-files"] From 8126ab7122399e485f532eeae02b8e37eaf4dedd Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 1 Apr 2026 13:43:09 -0500 Subject: [PATCH 4/6] Add bench_event and 
bench_stream and compare script for a summary table --- .../benchmarks/benchmarks/cpp/CMakeLists.txt | 51 +++++++++-- .../benchmarks/cpp/bench_ctx_device.cpp | 62 ++++++++----- .../benchmarks/benchmarks/cpp/bench_event.cpp | 88 +++++++++++++++++++ .../cpp/bench_pointer_attributes.cpp | 34 +++---- .../benchmarks/cpp/bench_stream.cpp | 74 ++++++++++++++++ .../benchmarks/cpp/bench_support.hpp | 84 ++++++++++++++++++ 6 files changed, 345 insertions(+), 48 deletions(-) create mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp create mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt index 5058643b93..b4285834aa 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt +++ b/cuda_bindings/benchmarks/benchmarks/cpp/CMakeLists.txt @@ -35,6 +35,26 @@ find_library( "${CONDA_PREFIX_HINT}/lib/stubs" ) +# Find nvrtc.h and libnvrtc (for runtime compilation benchmarks) +find_path( + NVRTC_INCLUDE_DIR + nvrtc.h + HINTS + "${CUDA_HOME_HINT}/include" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/include" + "${CONDA_PREFIX_HINT}/include" +) + +find_library( + NVRTC_LIBRARY + NAMES nvrtc + HINTS + "${CUDA_HOME_HINT}/lib64" + "${CUDA_HOME_HINT}/lib" + "${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib" + "${CONDA_PREFIX_HINT}/lib" +) + if(NOT CUDA_DRIVER_INCLUDE_DIR) message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.") endif() @@ -43,10 +63,29 @@ if(NOT CUDA_DRIVER_LIBRARY) message(FATAL_ERROR "Could not find libcuda. 
Ensure the NVIDIA driver is installed.") endif() -add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp) -target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") -target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") +# Helper: add a benchmark that only needs the driver API +function(add_driver_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") +endfunction() -add_executable(bench_ctx_device_cpp bench_ctx_device.cpp) -target_include_directories(bench_ctx_device_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}") -target_link_libraries(bench_ctx_device_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}") +# Helper: add a benchmark that needs driver API + NVRTC +function(add_nvrtc_benchmark name) + add_executable(${name}_cpp ${name}.cpp) + target_include_directories(${name}_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}" "${NVRTC_INCLUDE_DIR}") + target_link_libraries(${name}_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}" "${NVRTC_LIBRARY}") +endfunction() + +# Driver-only benchmarks +add_driver_benchmark(bench_pointer_attributes) +add_driver_benchmark(bench_ctx_device) +add_driver_benchmark(bench_stream) +add_driver_benchmark(bench_event) + +# NVRTC benchmarks (require nvrtc for kernel compilation) +if(NVRTC_INCLUDE_DIR AND NVRTC_LIBRARY) + add_nvrtc_benchmark(bench_launch) +else() + message(WARNING "NVRTC not found — skipping bench_launch. 
Install cuda-nvrtc-dev.") +endif() diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp index fb5540d557..052df9cc1d 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_ctx_device.cpp @@ -22,9 +22,6 @@ static void check_cu(CUresult status, const char* message) { int main(int argc, char** argv) { bench::Options options = bench::parse_args(argc, argv); - if (options.benchmark_name.empty()) { - options.benchmark_name = "ctx_device.ctx_get_current"; - } // Setup: init CUDA and create a context check_cu(cuInit(0), "cuInit failed"); @@ -36,30 +33,55 @@ int main(int argc, char** argv) { CUctxCreateParams ctxParams = {}; check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); - CUcontext current_ctx = nullptr; + bench::BenchmarkSuite suite(options); - // Run benchmark - auto results = bench::run_benchmark(options, [&]() { - check_cu( - cuCtxGetCurrent(¤t_ctx), - "cuCtxGetCurrent failed" - ); - }); + // --- ctx_get_current --- + { + CUcontext current_ctx = nullptr; + suite.run("ctx_device.ctx_get_current", [&]() { + check_cu(cuCtxGetCurrent(¤t_ctx), "cuCtxGetCurrent failed"); + }); + } - // Sanity check: the call actually returned our context - if (current_ctx != ctx) { - std::cerr << "unexpected: cuCtxGetCurrent returned a different context\n"; + // --- ctx_set_current --- + { + suite.run("ctx_device.ctx_set_current", [&]() { + check_cu(cuCtxSetCurrent(ctx), "cuCtxSetCurrent failed"); + }); } - // Cleanup - check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + // --- ctx_get_device --- + { + CUdevice dev; + suite.run("ctx_device.ctx_get_device", [&]() { + check_cu(cuCtxGetDevice(&dev), "cuCtxGetDevice failed"); + }); + } - // Output - bench::print_summary(options.benchmark_name, results); + // --- device_get --- + { + CUdevice dev; + suite.run("ctx_device.device_get", [&]() { + 
check_cu(cuDeviceGet(&dev, 0), "cuDeviceGet failed"); + }); + } - if (!options.output_path.empty()) { - bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results); + // --- device_get_attribute --- + { + int value = 0; + suite.run("ctx_device.device_get_attribute", [&]() { + check_cu( + cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed" + ); + }); } + // Cleanup + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + // Write all results + suite.write(); + return 0; } diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp new file mode 100644 index 0000000000..d6ea025e45 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_event.cpp @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? 
error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Persistent event for query/synchronize/record benchmarks + CUevent event; + check_cu(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + + // Record and sync so the event starts in a completed state + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + + bench::BenchmarkSuite suite(options); + + // --- event_create_destroy --- + { + CUevent e; + suite.run("event.event_create_destroy", [&]() { + check_cu(cuEventCreate(&e, CU_EVENT_DISABLE_TIMING), "cuEventCreate failed"); + check_cu(cuEventDestroy(e), "cuEventDestroy failed"); + }); + } + + // --- event_record --- + { + suite.run("event.event_record", [&]() { + check_cu(cuEventRecord(event, stream), "cuEventRecord failed"); + }); + } + + // --- event_query --- + { + suite.run("event.event_query", [&]() { + // Returns CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if not + cuEventQuery(event); + }); + } + + // --- event_synchronize --- + { + suite.run("event.event_synchronize", [&]() { + check_cu(cuEventSynchronize(event), "cuEventSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuEventDestroy(event), "cuEventDestroy failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp 
b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp index 547a48b0f1..4d9afc6566 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_pointer_attributes.cpp @@ -22,9 +22,6 @@ static void check_cu(CUresult status, const char* message) { int main(int argc, char** argv) { bench::Options options = bench::parse_args(argc, argv); - if (options.benchmark_name.empty()) { - options.benchmark_name = "pointer_attributes.pointer_get_attribute"; - } // Setup: init CUDA, allocate memory check_cu(cuInit(0), "cuInit failed"); @@ -39,31 +36,24 @@ int main(int argc, char** argv) { CUdeviceptr ptr; check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed"); - unsigned int memory_type = 0; - - // Run benchmark - auto results = bench::run_benchmark(options, [&]() { - check_cu( - cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), - "cuPointerGetAttribute failed" - ); - }); - - // Sanity check: the call actually did something - if (memory_type == 0) { - std::cerr << "unexpected memory_type=0\n"; + bench::BenchmarkSuite suite(options); + + // --- pointer_get_attribute --- + { + unsigned int memory_type = 0; + suite.run("pointer_attributes.pointer_get_attribute", [&]() { + check_cu( + cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr), + "cuPointerGetAttribute failed" + ); + }); } // Cleanup check_cu(cuMemFree(ptr), "cuMemFree failed"); check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); - // Output - bench::print_summary(options.benchmark_name, results); - - if (!options.output_path.empty()) { - bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results); - } + suite.write(); return 0; } diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp new file mode 100644 index 0000000000..702e86aef0 --- /dev/null +++ 
b/cuda_bindings/benchmarks/benchmarks/cpp/bench_stream.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "bench_support.hpp" + +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; + std::exit(1); + } +} + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + // Persistent stream for query/synchronize benchmarks + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + bench::BenchmarkSuite suite(options); + + // --- stream_create_destroy --- + { + CUstream s; + suite.run("stream.stream_create_destroy", [&]() { + check_cu(cuStreamCreate(&s, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + check_cu(cuStreamDestroy(s), "cuStreamDestroy failed"); + }); + } + + // --- stream_query --- + { + suite.run("stream.stream_query", [&]() { + // cuStreamQuery returns CUDA_SUCCESS if stream is idle, + // CUDA_ERROR_NOT_READY if busy — both are valid here. 
+ cuStreamQuery(stream); + }); + } + + // --- stream_synchronize --- + { + suite.run("stream.stream_synchronize", [&]() { + check_cu(cuStreamSynchronize(stream), "cuStreamSynchronize failed"); + }); + } + + // Cleanup + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp b/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp index 10bcd4d231..837c15a9d1 100644 --- a/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_support.hpp @@ -222,4 +222,88 @@ inline void write_pyperf_json( out << "]}]}\n"; } +// A collected benchmark entry: name, loops, and run results +struct BenchmarkEntry { + std::string name; + std::uint64_t loops; + std::vector results; +}; + +// Collect multiple benchmarks from a single binary and write them all +// to one pyperf-compatible JSON file. +class BenchmarkSuite { +public: + explicit BenchmarkSuite(Options options) : options_(std::move(options)) {} + + // Run a benchmark and record it. The name is used as the benchmark ID. + template + void run(const std::string& name, Fn&& fn) { + auto results = run_benchmark(options_, std::forward(fn)); + print_summary(name, results); + entries_.push_back({name, options_.loops, std::move(results)}); + } + + // Write all collected benchmarks to the output file (if -o was given). 
+ void write() const { + if (options_.output_path.empty() || entries_.empty()) + return; + write_multi_pyperf_json(options_.output_path, entries_); + } + +private: + Options options_; + std::vector entries_; + + static void write_multi_pyperf_json( + const std::string& output_path, + const std::vector& entries + ) { + std::ofstream out(output_path); + if (!out) { + std::cerr << "Failed to open output file: " << output_path << '\n'; + std::exit(3); + } + + out << std::setprecision(17); + out << "{\"version\": \"1.0\", \"benchmarks\": ["; + + for (std::size_t e = 0; e < entries.size(); ++e) { + const auto& entry = entries[e]; + if (e > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"name\": " << json_str(entry.name) << ", "; + out << "\"loops\": " << entry.loops << ", "; + out << "\"unit\": \"second\""; + out << "}, \"runs\": ["; + + for (std::size_t r = 0; r < entry.results.size(); ++r) { + const auto& run = entry.results[r]; + if (r > 0) out << ", "; + + out << "{\"metadata\": {"; + out << "\"date\": " << json_str(run.date) << ", "; + out << "\"duration\": " << run.duration_sec; + out << "}, "; + + out << "\"warmups\": ["; + for (std::size_t w = 0; w < run.warmup_values.size(); ++w) { + if (w > 0) out << ", "; + out << "[" << entry.loops << ", " << run.warmup_values[w] << "]"; + } + out << "], "; + + out << "\"values\": ["; + for (std::size_t v = 0; v < run.values.size(); ++v) { + if (v > 0) out << ", "; + out << run.values[v]; + } + out << "]}"; + } + out << "]}"; + } + out << "]}\n"; + } +}; + } // namespace bench From a3f0678e0628440a65be445c10a6433b751a922a Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Wed, 1 Apr 2026 13:48:07 -0500 Subject: [PATCH 5/6] Add Launch benchmarks --- cuda_bindings/benchmarks/.gitignore | 1 - .../benchmarks/benchmarks/bench_launch.py | 107 ++++++++++ .../benchmarks/cpp/bench_launch.cpp | 187 ++++++++++++++++++ cuda_bindings/benchmarks/pixi.lock | 30 +++ cuda_bindings/benchmarks/runner/runtime.py | 52 ++++- 5 files 
changed, 375 insertions(+), 2 deletions(-) create mode 100644 cuda_bindings/benchmarks/benchmarks/bench_launch.py create mode 100644 cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp diff --git a/cuda_bindings/benchmarks/.gitignore b/cuda_bindings/benchmarks/.gitignore index cb2aee641d..b795782a32 100644 --- a/cuda_bindings/benchmarks/.gitignore +++ b/cuda_bindings/benchmarks/.gitignore @@ -12,6 +12,5 @@ __pycache__/ # Override root .gitignore *.cpp rule (which targets Cython-generated files) !benchmarks/cpp/*.cpp - results-python.json results-cpp.json diff --git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/cuda_bindings/benchmarks/benchmarks/bench_launch.py new file mode 100644 index 0000000000..e246db8b69 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/bench_launch.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import time + +from runner.runtime import alloc_persistent, compile_and_load, ensure_context + +from cuda.bindings import driver as cuda + +ensure_context() + +# Compile kernels +KERNEL_SOURCE = """\ +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } +""" + +MODULE = compile_and_load(KERNEL_SOURCE) + +# Get kernel handles +_err, EMPTY_KERNEL = cuda.cuModuleGetFunction(MODULE, b"empty_kernel") +_err, SMALL_KERNEL = cuda.cuModuleGetFunction(MODULE, b"small_kernel") +_err, KERNEL_16_ARGS = cuda.cuModuleGetFunction(MODULE, 
b"small_kernel_16_args") + +# Create a non-blocking stream for launches +_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value) + +# Allocate device memory for kernel arguments +FLOAT_PTR = alloc_persistent(ctypes.sizeof(ctypes.c_float)) +INT_PTRS = [alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16)] + +# Pre-pack ctypes params for the pre-packed benchmark +_val_ps = [ctypes.c_void_p(int(p)) for p in INT_PTRS] +PACKED_16 = (ctypes.c_void_p * 16)() +for _i in range(16): + PACKED_16[_i] = ctypes.addressof(_val_ps[_i]) + + +def bench_launch_empty_kernel(loops: int) -> float: + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = EMPTY_KERNEL + _stream = STREAM + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, 0, 0) + return time.perf_counter() - t0 + + +def bench_launch_small_kernel(loops: int) -> float: + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = SMALL_KERNEL + _stream = STREAM + _args = (FLOAT_PTR,) + _arg_types = (None,) + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel( + _kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0 + ) + return time.perf_counter() - t0 + + +def bench_launch_16_args(loops: int) -> float: + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _args = tuple(INT_PTRS) + _arg_types = tuple([None] * 16) + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel( + _kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0 + ) + return time.perf_counter() - t0 + + +def bench_launch_16_args_pre_packed(loops: int) -> float: + _cuLaunchKernel = cuda.cuLaunchKernel + _kernel = KERNEL_16_ARGS + _stream = STREAM + _packed = PACKED_16 + + t0 = time.perf_counter() + for _ in range(loops): + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0) + return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp 
b/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp new file mode 100644 index 0000000000..55d6fa6592 --- /dev/null +++ b/cuda_bindings/benchmarks/benchmarks/cpp/bench_launch.cpp @@ -0,0 +1,187 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "bench_support.hpp" + +#include +#include +#include +#include +#include + + +static void check_cu(CUresult status, const char* message) { + if (status != CUDA_SUCCESS) { + const char* error_name = nullptr; + cuGetErrorName(status, &error_name); + std::cerr << message << ": " << (error_name ? error_name : "unknown") << '\n'; + std::exit(1); + } +} + +static void check_nvrtc(nvrtcResult status, const char* message) { + if (status != NVRTC_SUCCESS) { + std::cerr << message << ": " << nvrtcGetErrorString(status) << '\n'; + std::exit(1); + } +} + +static CUmodule compile_and_load(const char* source, CUdevice device) { + int major = 0, minor = 0; + check_cu(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), + "cuDeviceGetAttribute failed"); + check_cu(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), + "cuDeviceGetAttribute failed"); + + nvrtcProgram prog; + check_nvrtc(nvrtcCreateProgram(&prog, source, "benchmark_kernel.cu", 0, nullptr, nullptr), + "nvrtcCreateProgram failed"); + + std::string arch = "--gpu-architecture=sm_" + std::to_string(major) + std::to_string(minor); + const char* opts[] = {"--fmad=false", arch.c_str()}; + nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, opts); + + // Print log on failure + if (compile_result != NVRTC_SUCCESS) { + size_t log_size = 0; + nvrtcGetProgramLogSize(prog, &log_size); + std::vector log(log_size); + nvrtcGetProgramLog(prog, log.data()); + std::cerr << "NVRTC compile failed:\n" << log.data() << '\n'; + std::exit(1); + } + + size_t cubin_size = 0; + 
check_nvrtc(nvrtcGetCUBINSize(prog, &cubin_size), "nvrtcGetCUBINSize failed"); + std::vector cubin(cubin_size); + check_nvrtc(nvrtcGetCUBIN(prog, cubin.data()), "nvrtcGetCUBIN failed"); + nvrtcDestroyProgram(&prog); + + CUmodule module; + check_cu(cuModuleLoadData(&module, cubin.data()), "cuModuleLoadData failed"); + return module; +} + + +static const char* KERNEL_SOURCE = R"( +extern "C" __global__ void empty_kernel() { return; } +extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; } + +#define ITEM_PARAM(x, T) T x +#define REP1(x, T) , ITEM_PARAM(x, T) +#define REP2(x, T) REP1(x##0, T) REP1(x##1, T) +#define REP4(x, T) REP2(x##0, T) REP2(x##1, T) +#define REP8(x, T) REP4(x##0, T) REP4(x##1, T) +#define REP16(x, T) REP8(x##0, T) REP8(x##1, T) + +extern "C" __global__ +void small_kernel_16_args( + ITEM_PARAM(F, int*) + REP1(A, int*) + REP2(A, int*) + REP4(A, int*) + REP8(A, int*)) +{ *F = 0; } +)"; + + +int main(int argc, char** argv) { + bench::Options options = bench::parse_args(argc, argv); + + // Setup + check_cu(cuInit(0), "cuInit failed"); + + CUdevice device; + check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed"); + + CUcontext ctx; + CUctxCreateParams ctxParams = {}; + check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed"); + + CUmodule module = compile_and_load(KERNEL_SOURCE, device); + + CUfunction empty_kernel, small_kernel, kernel_16_args; + check_cu(cuModuleGetFunction(&empty_kernel, module, "empty_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&small_kernel, module, "small_kernel"), "GetFunction failed"); + check_cu(cuModuleGetFunction(&kernel_16_args, module, "small_kernel_16_args"), "GetFunction failed"); + + CUstream stream; + check_cu(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), "cuStreamCreate failed"); + + // Allocate device memory for arguments + CUdeviceptr float_ptr; + check_cu(cuMemAlloc(&float_ptr, sizeof(float)), "cuMemAlloc failed"); + + CUdeviceptr int_ptrs[16]; + for (int i = 0; i < 
16; ++i) { + check_cu(cuMemAlloc(&int_ptrs[i], sizeof(int)), "cuMemAlloc failed"); + } + + // Pre-pack kernel params for the pre-packed benchmark + void* packed_16[16]; + for (int i = 0; i < 16; ++i) { + packed_16[i] = &int_ptrs[i]; + } + + bench::BenchmarkSuite suite(options); + + // --- launch_empty_kernel --- + { + suite.run("launch.launch_empty_kernel", [&]() { + check_cu( + cuLaunchKernel(empty_kernel, 1, 1, 1, 1, 1, 1, 0, stream, nullptr, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // --- launch_small_kernel --- + { + void* params[] = {&float_ptr}; + suite.run("launch.launch_small_kernel", [&]() { + check_cu( + cuLaunchKernel(small_kernel, 1, 1, 1, 1, 1, 1, 0, stream, params, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // --- launch_16_args --- + { + suite.run("launch.launch_16_args", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // --- launch_16_args_pre_packed (same as above for C++ — no packing overhead) --- + // In C++ the params are always pre-packed, so this is identical to launch_16_args. + // We include it for naming parity with the Python benchmark. 
+ { + suite.run("launch.launch_16_args_pre_packed", [&]() { + check_cu( + cuLaunchKernel(kernel_16_args, 1, 1, 1, 1, 1, 1, 0, stream, packed_16, nullptr), + "cuLaunchKernel failed" + ); + }); + } + + // Cleanup + for (int i = 0; i < 16; ++i) { + check_cu(cuMemFree(int_ptrs[i]), "cuMemFree failed"); + } + check_cu(cuMemFree(float_ptr), "cuMemFree failed"); + check_cu(cuStreamDestroy(stream), "cuStreamDestroy failed"); + check_cu(cuModuleUnload(module), "cuModuleUnload failed"); + check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed"); + + suite.write(); + + return 0; +} diff --git a/cuda_bindings/benchmarks/pixi.lock b/cuda_bindings/benchmarks/pixi.lock index 3bc7dbfd59..4eb0c8f117 100644 --- a/cuda_bindings/benchmarks/pixi.lock +++ b/cuda_bindings/benchmarks/pixi.lock @@ -39,6 +39,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.2.51-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.2.51-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda @@ -164,6 +165,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-cudart_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-driver-dev_linux-64-13.1.80-h376f20c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-13.1.115-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.1.115-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.1-h2ff5cdb_3.conda @@ -648,6 +650,34 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 35736655 timestamp: 1773100338749 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.1.115-hecca717_0.conda + sha256: 2c929c592ca1909e3944edec62b77403d256156a4010bfa17fb0b948d33e54d3 + md5: 1096fce4abad7dd975ce6d9953fceb6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.1.115 hecca717_0 + - cuda-version >=13.1,<13.2.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.1.115 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 35845 + timestamp: 1768273073971 +- conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvrtc-dev-13.2.51-hecca717_0.conda + sha256: be60eb4e84ff4846b27b323eca402b075f52caf6c138ebb06268fbaa26ef1879 + md5: 83535200a9e77165d5291b4ac82ebf6a + depends: + - __glibc >=2.17,<3.0.a0 + - cuda-nvrtc 13.2.51 hecca717_0 + - cuda-version >=13.2,<13.3.0a0 + - libgcc >=14 + - libstdcxx >=14 + constrains: + - cuda-nvrtc-static >=13.2.51 + license: LicenseRef-NVIDIA-End-User-License-Agreement + size: 36305 + timestamp: 1773100458841 - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-13.2.51-h69a702a_0.conda sha256: d0111ba8fa12b96d38989d2016ecec0c11410c0e566d839ed54f3925591efb0b md5: 03cd3639b8e13623c7b91b1cb0136402 diff --git a/cuda_bindings/benchmarks/runner/runtime.py b/cuda_bindings/benchmarks/runner/runtime.py index d7b6a7bf86..37166ff14c 100644 --- a/cuda_bindings/benchmarks/runner/runtime.py +++ b/cuda_bindings/benchmarks/runner/runtime.py @@ -5,9 +5,12 @@ import atexit from cuda.bindings import driver as cuda +from cuda.bindings import nvrtc _ctx = None +_device = None _persistent_ptrs: list[int] = 
[] +_modules: list = [] def assert_drv(err) -> None: @@ -16,7 +19,7 @@ def assert_drv(err) -> None: def ensure_context() -> int: - global _ctx + global _ctx, _device if _ctx is not None: return _ctx @@ -25,6 +28,7 @@ def ensure_context() -> int: err, device = cuda.cuDeviceGet(0) assert_drv(err) + _device = device err, ctx = cuda.cuCtxCreate(None, 0, device) assert_drv(err) @@ -40,6 +44,47 @@ def alloc_persistent(size: int) -> int: return ptr +def compile_and_load(kernel_source: str) -> int: + """Compile CUDA C source and returns the CUmodule handle """ + ensure_context() + + err, major = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, _device + ) + assert_drv(err) + err, minor = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, _device + ) + assert_drv(err) + + err, prog = nvrtc.nvrtcCreateProgram( + kernel_source.encode(), b"benchmark_kernel.cu", 0, [], [] + ) + assert_drv(err) + + arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode() + (err,) = nvrtc.nvrtcCompileProgram(prog, 2, [b"--fmad=false", arch_flag]) + + # check for compile errors + err_log, log_size = nvrtc.nvrtcGetProgramLogSize(prog) + assert_drv(err_log) + log = b" " * log_size + (err_log,) = nvrtc.nvrtcGetProgramLog(prog, log) + assert_drv(err_log) + assert_drv(err) + + err, cubin_size = nvrtc.nvrtcGetCUBINSize(prog) + assert_drv(err) + cubin = b" " * cubin_size + (err,) = nvrtc.nvrtcGetCUBIN(prog, cubin) + assert_drv(err) + + err, module = cuda.cuModuleLoadData(cubin) + assert_drv(err) + _modules.append(module) + return module + + def cleanup() -> None: global _ctx for ptr in reversed(_persistent_ptrs): @@ -47,6 +92,11 @@ def cleanup() -> None: assert_drv(err) _persistent_ptrs.clear() + for module in reversed(_modules): + (err,) = cuda.cuModuleUnload(module) + assert_drv(err) + _modules.clear() + if _ctx is None: return (err,) = cuda.cuCtxDestroy(_ctx) From 
e4762ed0d2af53e6c62e019e7f75ded593b96522 Mon Sep 17 00:00:00 2001 From: Daniel Rodriguez Date: Fri, 3 Apr 2026 11:53:35 -0500 Subject: [PATCH 6/6] Lint --- .../benchmarks/benchmarks/bench_ctx_device.py | 1 - .../benchmarks/benchmarks/bench_launch.py | 8 ++----- cuda_bindings/benchmarks/pixi.lock | 21 ++++++++++++++++--- cuda_bindings/benchmarks/runner/cpp.py | 5 ++--- cuda_bindings/benchmarks/runner/runtime.py | 6 ++---- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py index 3984689eed..1c82cd4046 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py +++ b/cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py @@ -60,4 +60,3 @@ def bench_device_get_attribute(loops: int) -> float: for _ in range(loops): _cuDeviceGetAttribute(_attr, _dev) return time.perf_counter() - t0 - diff --git a/cuda_bindings/benchmarks/benchmarks/bench_launch.py b/cuda_bindings/benchmarks/benchmarks/bench_launch.py index e246db8b69..f8e3122008 100644 --- a/cuda_bindings/benchmarks/benchmarks/bench_launch.py +++ b/cuda_bindings/benchmarks/benchmarks/bench_launch.py @@ -74,9 +74,7 @@ def bench_launch_small_kernel(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel( - _kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0 - ) + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 @@ -89,9 +87,7 @@ def bench_launch_16_args(loops: int) -> float: t0 = time.perf_counter() for _ in range(loops): - _cuLaunchKernel( - _kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0 - ) + _cuLaunchKernel(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0) return time.perf_counter() - t0 diff --git a/cuda_bindings/benchmarks/pixi.lock b/cuda_bindings/benchmarks/pixi.lock index 4eb0c8f117..c610db2f45 100644 --- a/cuda_bindings/benchmarks/pixi.lock +++ 
b/cuda_bindings/benchmarks/pixi.lock @@ -44,7 +44,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-nvvm-dev_linux-64-13.2.51-ha770c72_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-impl-13.2.51-h4bc722e_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-nvvm-tools-13.2.51-h4bc722e_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-version-13.2-he2cc418_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/cxx-compiler-1.11.0-hfcd1e18_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/distlib-0.4.0-pyhd8ed1ab_0.conda @@ -132,6 +131,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda - conda: .. + - conda: ../../cuda_pathfinder wheel: channels: - url: https://conda.anaconda.org/conda-forge/ @@ -408,14 +408,15 @@ packages: timestamp: 1771378159534 - conda: .. 
name: cuda-bindings - version: 13.1.0 + version: 13.2.0 build: hb0f4dca_0 subdir: linux-64 variants: target_platform: linux-64 depends: - python - - cuda-pathfinder >=1.1,<2 + - cuda-version + - cuda-pathfinder - libnvjitlink - cuda-nvrtc - cuda-nvrtc >=13.2.51,<14.0a0 @@ -428,6 +429,9 @@ packages: - libstdcxx >=15 - python_abi 3.14.* *_cp314 license: LicenseRef-NVIDIA-SOFTWARE-LICENSE + sources: + cuda-pathfinder: + path: ../cuda_pathfinder - conda: https://conda.anaconda.org/conda-forge/linux-64/cuda-bindings-13.1.0-py314ha160325_1.conda sha256: aecfbbc9a687e5daba66b896613a00c617e3eadc21a31b19e53e8e642e83d7a7 md5: 3bd3abdf71e1b8c53310195677bf00be @@ -726,6 +730,17 @@ packages: license: LicenseRef-NVIDIA-End-User-License-Agreement size: 25988523 timestamp: 1773115248060 +- conda: ../../cuda_pathfinder + name: cuda-pathfinder + version: 1.3.4a0 + build: pyh4616a5c_0 + subdir: noarch + variants: + target_platform: noarch + depends: + - python >=3.10 + - python * + license: Apache-2.0 - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-pathfinder-1.4.0-pyhc364b38_0.conda sha256: edf16fdfbcce5bbb445118fd8d070dda8afe36b4b437a94f472fde153bc38151 md5: 2d13e524da66b60e6e7d5c6585729ea8 diff --git a/cuda_bindings/benchmarks/runner/cpp.py b/cuda_bindings/benchmarks/runner/cpp.py index 87848145fc..b148c2a628 100644 --- a/cuda_bindings/benchmarks/runner/cpp.py +++ b/cuda_bindings/benchmarks/runner/cpp.py @@ -18,7 +18,7 @@ def discover_binaries() -> dict[str, Path]: - """Discover C++ benchmark binaries in the build directory """ + """Discover C++ benchmark binaries in the build directory""" if not BUILD_DIR.is_dir(): return {} @@ -123,8 +123,7 @@ def main() -> None: registry = discover_binaries() if not registry: print( - f"No C++ benchmark binaries found in {BUILD_DIR}.\n" - "Run 'pixi run bench-cpp-build' first.", + f"No C++ benchmark binaries found in {BUILD_DIR}.\nRun 'pixi run bench-cpp-build' first.", file=sys.stderr, ) sys.exit(1) diff --git 
a/cuda_bindings/benchmarks/runner/runtime.py b/cuda_bindings/benchmarks/runner/runtime.py index 37166ff14c..c985adb2e2 100644 --- a/cuda_bindings/benchmarks/runner/runtime.py +++ b/cuda_bindings/benchmarks/runner/runtime.py @@ -45,7 +45,7 @@ def alloc_persistent(size: int) -> int: def compile_and_load(kernel_source: str) -> int: - """Compile CUDA C source and returns the CUmodule handle """ + """Compile CUDA C source and return the CUmodule handle""" ensure_context() err, major = cuda.cuDeviceGetAttribute( @@ -57,9 +57,7 @@ def compile_and_load(kernel_source: str) -> int: ) assert_drv(err) - err, prog = nvrtc.nvrtcCreateProgram( - kernel_source.encode(), b"benchmark_kernel.cu", 0, [], [] - ) + err, prog = nvrtc.nvrtcCreateProgram(kernel_source.encode(), b"benchmark_kernel.cu", 0, [], []) assert_drv(err) arch_flag = f"--gpu-architecture=sm_{major}{minor}".encode()