
AssertionError: M == 1 #506

Open
fatteneder opened this issue Aug 28, 2023 · 9 comments
fatteneder commented Aug 28, 2023

Firstly, many thanks for this amazing package!

I managed to boil it down to the following MWE:

using LoopVectorization


function testy(N, MDM, rhs, fx)

    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N,N))
    v_fx  = reshape(view(fx, rng),  (N,N))

    x = 1.0
    @turbo for j = 1:N
      for i = 1:N
        rhsij = 0.0
        for k = 1:N
          # rhsij += MDM[i,k] * v_fx[k+(N-1)*j]
          rhsij += MDM[i,k] * v_fx[k,j]
        end
        rhsij *= x
        v_rhs[i,j] = rhsij
      end
    end

end


N = 8
MDM   = randn(N,N)
rhs = randn(N*N)
fx  = randn(N*N)

testy(N, MDM, rhs, fx)

gives

julia> include("mwe_lv3.jl")
ERROR: LoadError: AssertionError: M == 1
Stacktrace:
 [1] #s221#49
   @ ~/.julia/packages/LoopVectorization/xHfLl/src/codegen/lower_compute.jl:338 [inlined]
 [2] var"#s221#49"(F::Any, M::Any, K::Any, D::Any, S::Any, ::Any, f::Any, default::Any, #unused#::Type, #unused#::Any, vargs::Any)
   @ LoopVectorization ./none:0
 [3] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any})
   @ Core ./boot.jl:602
 [4] macro expansion
   @ ~/.julia/packages/LoopVectorization/xHfLl/src/reconstruct_loopset.jl:1107 [inlined]
 [5] _turbo_!(::Val{(false, 0, 0, 0, false, 4, 32, 15, 64, 0x0000000000000001, 1, true)}, ::Val{(:numericconstant, Symbol("###0.0###5###"), LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000003, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0001, 0x0000), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000023, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0002, 0x0001), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000031, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0003, 0x0002), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000231, 0x00000000000000000000000000000003, 0x00000000000000000000000000000000, 0x00000000000000000000000200030001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0001, 0x0000), :LoopVectorization, :identity, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000003, 0x00000000000000000000000000000000, 0x00000000000000000000000000000004, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0001, 0x0000), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, 
LoopVectorization.OperationStruct(0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0004, 0x0000), :numericconstant, Symbol("###reduction##zero###11###"), LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0005, 0x0000), :LoopVectorization, :mul_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000060007, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0005, 0x0000), :LoopVectorization, :reduced_prod, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000080005, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0001, 0x0000), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000009, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memstore, 0x0006, 0x0003))}, ::Val{(LoopVectorization.ArrayRefStruct{:MDM, Symbol("##vptr##_MDM")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000203, 0x00000000000000000000000000000000, 
0x00000000000000000000000000000101), LoopVectorization.ArrayRefStruct{:v_fx, Symbol("##vptr##_v_fx")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000301, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101), LoopVectorization.ArrayRefStruct{:v_rhs, Symbol("##vptr##_v_rhs")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000201, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101))}, ::Val{(0, (), (6,), (), (), ((1, LoopVectorization.HardFloat),), ((7, 2.0),))}, ::Val{(:j, :i, :k)}, ::Val{Tuple{Tuple{CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}, CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}, CloseOpenIntervals.CloseOpen{Static.StaticInt{0}, Int64}}, Tuple{LayoutPointers.GroupedStridedPointers{Tuple{Ptr{Float64}, Ptr{Float64}, Ptr{Float64}}, (1, 1, 1), (0, 0, 0), ((1, 2), (1, 2), (1, 2)), ((1, 2), (3, 4), (5, 6)), Tuple{Static.StaticInt{8}, Int64, Static.StaticInt{8}, Int64, Static.StaticInt{8}, Int64}, NTuple{6, Static.StaticInt{0}}}, Float64}}}, ::Int64, ::Int64, ::Int64, ::Ptr{Float64}, ::Ptr{Float64}, ::Ptr{Float64}, ::Int64, ::Int64, ::Int64, ::Float64)
   @ LoopVectorization ~/.julia/packages/LoopVectorization/xHfLl/src/reconstruct_loopset.jl:1107
 [6] testy(N::Int64, MDM::Matrix{Float64}, rhs::Vector{Float64}, fx::Vector{Float64})
   @ Main .../mwe_lv3.jl:12
 [7] top-level scope
   @ .../mwe_lv3.jl:32
 [8] include(fname::String)
   @ Base.MainInclude ./client.jl:478
 [9] top-level scope
   @ REPL[44]:1
in expression starting at .../mwe_lv3.jl:32

Changing one of the following makes it work though

  • remove @turbo (obviously)
  • move @turbo onto the i loop
  • lower N to 7 or smaller
  • replace the line in the innermost loop with rhsij += MDM[i,k] * v_fx[k+(N-1)*j]
  • remove the line rhsij *= x from within the i loop
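For checking any of these workarounds, a plain-Julia baseline without @turbo is useful; this is a sketch with a hypothetical name testy_ref, mirroring the MWE's indexing:

```julia
# Hypothetical reference version of the MWE without @turbo, useful for
# verifying that a workaround still computes v_rhs = x * (MDM * v_fx).
function testy_ref(N, MDM, rhs, fx)
    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N, N))
    v_fx  = reshape(view(fx, rng),  (N, N))

    x = 1.0
    for j = 1:N, i = 1:N
        rhsij = 0.0
        for k = 1:N
            rhsij += MDM[i, k] * v_fx[k, j]
        end
        v_rhs[i, j] = rhsij * x
    end
    return nothing
end

N = 8
MDM = randn(N, N)
fx  = randn(N * N)
rhs = randn(N * N)
testy_ref(N, MDM, rhs, fx)
# the loop nest is equivalent to a matrix product, so compare against it
@assert reshape(rhs, (N, N)) ≈ MDM * reshape(fx, (N, N))
```

Comparing a @turbo variant against this baseline (e.g. with ≈) catches silently wrong results, not just crashes.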

My setup is

julia> versioninfo()
Julia Version 1.9.3
Commit bed2cd540a1 (2023-08-24 14:43 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 12 × Intel(R) Core(TM) i7-5820K CPU @ 3.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, haswell)
  Threads: 1 on 12 virtual cores

and using LoopVectorization v0.12.165.

@chriselrod
Member

My suggested workaround is

function testy(N, MDM, rhs, fx)

    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N,N))
    v_fx  = reshape(view(fx, rng),  (N,N))

    x = 1.0
    @turbo for j = 1:N
      for i = 1:N
        rhsij = 0.0
        for k = 1:N
          # rhsij += MDM[i,k] * v_fx[k+(N-1)*j]
          rhsij += MDM[i,k] * v_fx[k,j]
        end
        v_rhs[i,j] = rhsij * x
      end
    end
end

I haven't tested it, but I assume doing rhsij * x instead will fix it.
Similarly, defining a new variable should work too:

function testy(N, MDM, rhs, fx)

    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N,N))
    v_fx  = reshape(view(fx, rng),  (N,N))

    x = 1.0
    @turbo for j = 1:N
      for i = 1:N
        rhsij = 0.0
        for k = 1:N
          # rhsij += MDM[i,k] * v_fx[k+(N-1)*j]
          rhsij += MDM[i,k] * v_fx[k,j]
        end
        rhsijx = rhsij * x
        v_rhs[i,j] = rhsijx
      end
    end
end

@fatteneder
Author

Indeed, both versions work.

Is this a user problem, a current limitation of LV, or just a bug?

@chriselrod
Copy link
Member

chriselrod commented Aug 29, 2023

I'd say it is just a bug, but I am personally unlikely to spend any time fixing it.
My goal is still to replace LoopVectorization.jl eventually with a library that doesn't have this bug, but it is a slow process.

@fatteneder
Author

fatteneder commented Aug 29, 2023

It's fine with me, since your suggested workaround does the job quite well.

Btw, I encountered another issue when extending the above MWE with a second inner loop. Consider:

using LoopVectorization


function testy(N, MDM, rhs, fx)

    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N,N))
    v_fx  = reshape(view(fx, rng),  (N,N))

    x = 1.0
    @turbo for j = 1:N
      for i = 1:N
        rhsij = 0.0
        for k = 1:N
          rhsij += MDM[i,k] * v_fx[k,j]
        end
        for l = 1:N # changing this to k gives silently wrong results
          rhsij += MDM[i,l] * v_fx[l,j]
        end
        v_rhs[i,j] = rhsij * x
      end
    end

end


N = 20

MDM = reshape(Float64.(collect(1:N^2)), (N,N))
rhs = deepcopy(MDM)
fx  = deepcopy(MDM)

testy(N, MDM, rhs, fx)
display(sum(rhs))

Depending on whether you use k as the index of both inner loops, or k and l, the result differs.
I think the problem is independent of whether the bodies of the two loops agree, because in my real application the loop bodies are different, and the code started working again after introducing the l.

I wonder if this falls under this limitation from the README?

We expect that any time you use the @turbo macro with a given block of code that you:
...
4. Are not using multiple loops at the same level in nested loops.

@chriselrod
Member

chriselrod commented Aug 29, 2023

Btw I encountered another issue when extending the above MWE by a second inner loop. Consider

LoopVectorization doesn't support that. LoopModels will.

LoopVectorization.jl requires a loop chain from outer to inner, i.e. no trees.
LoopModels supports trees and forests.

@fatteneder
Author

So I shouldn't rely on the code to work, even though with different indices k, l it gives me the correct results (for the values I tested)?

@chriselrod
Member

So I shouldn't rely on the code to work, even though with different indices k, l it gives me the correct results (for the values I tested)?

I am surprised it happened to work at all. Maybe if you try different (larger) sizes, it'll give wrong results?
Without checking, I'd have naively expected it to nest the 4 loops. It really should throw on that.

@fatteneder
Author

Maybe if you try different (larger) sizes, it'll give wrong results?

Indeed, for N >= 25 it starts to give wrong results compared to an implementation without @turbo. Interesting.

Luckily, in my application I can split the extra loop off into a separate mul! and still get some benefit from using @turbo.
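That split might look roughly like this (a sketch only, untested against the real application; testy_split is a hypothetical name, the k loop is written plainly where @turbo would go, and the second reduction becomes a 5-arg mul!):

```julia
using LinearAlgebra: mul!

# Sketch: the first reduction stays an explicit loop (a @turbo candidate),
# the second is split off into a 5-arg mul!, which computes
# v_rhs = x * MDM * v_fx + 1.0 * v_rhs in place.
function testy_split(N, MDM, rhs, fx)
    NN = N^2
    rng = 1:NN
    v_rhs = reshape(view(rhs, rng), (N, N))
    v_fx  = reshape(view(fx, rng),  (N, N))

    x = 1.0
    for j = 1:N, i = 1:N          # former k loop
        rhsij = 0.0
        for k = 1:N
            rhsij += MDM[i, k] * v_fx[k, j]
        end
        v_rhs[i, j] = rhsij * x
    end
    mul!(v_rhs, MDM, v_fx, x, 1.0)  # former l loop, now a single mul!
    return nothing
end
```

Since both reductions compute x * MDM * v_fx, the result should equal 2 * MDM * v_fx for x = 1.0, which gives a quick correctness check against the two-inner-loop version.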

@chriselrod
Member

Indeed, for N >= 25 it starts to give wrong results compared to an implementation without @turbo, interesting.

I was thinking it nested all the loops, i.e. it made a chain of loops 4 deep instead of a tree with 2 leaves.
It managed to produce correct results anyway because it unrolled aggressively enough that, for a low enough trip count, at least one loop didn't repeat, so you happened to get correct answers.
