odin-lang/Odin Issue #4502: global is misaligned

Repo Home

Issues

2024-11-19 20:57:29 bolhak

Odin: dev-2024-11:e6475fec4

OS: Windows 11 Professional (version: 23H2), build 22631.4460

CPU: AMD Ryzen 9 7950X3D 16-Core Processor

RAM: 64701 MiB

Backend: LLVM 18.1.8

Context

I started using Odin about a year ago, and I love it! I can't stop using it, it's so much better than C/C++. I use it whenever I can.

So far, I haven’t had many issues, but today my luck ran out when I tried to compile my code with AVX instructions. After a bit of debugging, I found that some of my global data was misaligned.

I’ve reduced the code to a small example to reproduce the problem.

Expected Behavior

For the global data to be correctly aligned.

Current Behavior

The global data is misaligned, and the AVX builds are blowing up in my face. :(

Steps to Reproduce

Run the code. Add or remove padding fields to make the struct misaligned.

package main

import "core:fmt"
import "core:math/rand"

// Reproduction struct: four f32 padding fields followed by a 4x4 f32 matrix.
// The commented-out fields are extra padding the reporter toggles while
// reproducing the problem (see "Steps to Reproduce").
Globals :: struct
{
    padding04: f32,
    padding08: f32,
    padding12: f32,
    padding16: f32,
    // padding20: f32,
    // padding24: f32,
    // padding28: f32,
    // padding32: f32,
    mat: matrix[4, 4]f32, // left-hand operand of the matrix multiply in main
}
// Package-level instance; main prints its address and raw contents.
g: Globals = {} // <- struct is misaligned (seems to be stuck on 16 byte alignment)

// compiled with core-avx2
// Reproduction driver: fills g.mat with constants and a local matrix with
// random values, prints size/alignment information and the raw contents of g,
// then multiplies the two matrices (the step the reporter sees crash).
main :: proc()
{
    // Constant values make g.mat easy to spot in the raw float dump below.
    g.mat = {
        2, -1.0, -1.0, -1.0,
        -1.0, 2, -1.0, -1.0,
        -1.0, -1.0, 2, -1.0,
        -1.0, -1.0, -1.0, 2,
    } 

    // Local right-hand operand, filled element by element.
    mat: matrix[4, 4]f32
    for i in 0..<4 {
        for j in 0..<4 {
            //g.mat[i, j] = rand.float32_uniform(1.0, 10.0) fails to compile 
            mat[i, j] = rand.float32_uniform(1.0, 10.0)
        }
    }

    // Size and alignment of the matrix type and of the enclosing struct.
    fmt.println(size_of(matrix[4, 4]f32))
    fmt.println(align_of(matrix[4, 4]f32))
    fmt.println(size_of(Globals))
    fmt.println(align_of(Globals))

    // View the global as a flat run of f32 values so its address and raw
    // contents (padding fields followed by the matrix) can be printed.
    struct_memory_as_floats := cast([^]f32)&g

    fmt.println("struct address =", rawptr(struct_memory_as_floats)) 
    fmt.println(struct_memory_as_floats[:size_of(Globals) / size_of(f32)])

    fmt.println(g.mat * mat) // CRASH: vmovaps ymm, m256    <- not 32 byte aligned
}

![bugs](https://github.com/user-attachments/assets/81a875ac-0e58-469a-a6ee-50de9d934b30)

Comments (2)

2024-11-19 21:57:48 Kelimion

It doesn't crash for me (Win 10 Pro, AMD Ryzen 9 5950X).

But the last line does print "random" output every time you run it.

[0, 0, 0, 0, 0, 0, 0, 0, 2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2]
matrix[-6.1015005, -2.5927868, -5.497138, -4.8243656; 5.1538019, -19.57756, -7.1953697, -14.878971; -3.7303448, 1.4992466, -2.9034424, -2.4826412; -19.605844, -5.652091, -12.6513329, -3.7133465]

[0, 0, 0, 0, 0, 0, 0, 0, 2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2, -1, -1, -1, -1, 2]
matrix[-13.136705, 1.1955423, -3.4499664, -10.6140099; -6.9107647, 1.10276508, 8.6884727, 1.2853293; 0.67079926, -9.7492828, -10.117503, -4.272027; 1.8718166, -10.0977879, -10.544214, -1.89500809]

etc.

2024-11-19 22:06:01 Kelimion

I did find a code gen bug by slightly changing the code:

package main

import "core:fmt"
import "core:math/rand"

// Variant of the reporter's struct with an explicit #align(128) directive:
// four f32 padding fields followed by a 4x4 f32 matrix. The commented-out
// fields are the optional extra padding from the original reproduction.
Globals :: struct #align(128) {
	padding04: f32,
	padding08: f32,
	padding12: f32,
	padding16: f32,
	// padding20: f32,
	// padding24: f32,
	// padding28: f32,
	// padding32: f32,
	mat: matrix[4, 4]f32, // written element-wise and multiplied in main
}
// Package-level instance; main prints its address and raw contents.
g := Globals{} // <- struct is misaligned (seems to be stuck on 16 byte alignment)

// compiled with core-avx2
// Variant driver: same as the original reproduction except that the loop
// writes into the global g.mat instead of the local matrix. That indexed
// store is the statement reported as failing LLVM code generation.
main :: proc() {
	// Constant initial values for the global matrix.
	g.mat = {
		2, -1.0, -1.0, -1.0,
		-1.0, 2, -1.0, -1.0,
		-1.0, -1.0, 2, -1.0,
		-1.0, -1.0, -1.0, 2,
	}

	// Local right-hand operand; never written in this variant because the
	// assignment to it below is commented out.
	mat: matrix[4, 4]f32
	for i in 0..<4 {
		for j in 0..<4 {
			g.mat[i, j] = rand.float32_uniform(1.0, 10.0) // fails to compile
			// mat[i, j] = 1 // rand.float32_uniform(1.0, 10.0)
		}
	}

	// Size and alignment of the matrix type and of the enclosing struct.
	fmt.println(size_of(matrix[4, 4]f32))
	fmt.println(align_of(matrix[4, 4]f32))
	fmt.println(size_of(Globals))
	fmt.println(align_of(Globals))

	// View the global as a flat run of f32 values so its address and raw
	// contents can be printed.
	struct_memory_as_floats := cast([^]f32)&g

	fmt.println("struct address =", rawptr(struct_memory_as_floats))
	fmt.println(struct_memory_as_floats[:size_of(Globals) / size_of(f32)])

	fmt.println(g.mat * mat) // CRASH: vmovaps ymm, m256    <- not 32 byte aligned
}

Which results in:

LLVM CODE GEN FAILED FOR PROCEDURE: main.main
define void @main.main(ptr noalias nocapture nonnull %__.context_ptr) {
decls:
  %mat = alloca [16 x float], align 64
  %i = alloca i64, align 8
  %0 = alloca i64, align 8
  %i1 = alloca i64, align 8
  %j = alloca i64, align 8
  %1 = alloca i64, align 8
  %j4 = alloca i64, align 8
  %2 = alloca { ptr, i64 }, align 8
  %3 = alloca [32 x i8], align 16
  %4 = alloca i64, align 8
  %5 = alloca %..any, align 8
  %6 = alloca i64, align 8
  %7 = alloca %..any, align 8
  %8 = alloca i64, align 8
  %9 = alloca %..any, align 8
  %10 = alloca i64, align 8
  %11 = alloca %..any, align 8
  %struct_memory_as_floats = alloca ptr, align 8
  %12 = alloca %..string, align 8
  %13 = alloca %..any, align 8
  %14 = alloca %..any, align 8
  %15 = alloca { ptr, i64 }, align 8
  %16 = alloca %..any, align 8
  %17 = alloca [16 x float], align 64
  %18 = alloca %..any, align 8
  br label %entry

entry:                                            ; preds = %decls
  store [16 x float] [float 2.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 2.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 2.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 2.000000e+00], ptr getelementptr inbounds (%main.Globals, ptr @main.g, i64 0, i32 5), align 4
  call void @llvm.memset.p0.i64(ptr %mat, i8 0, i64 64, i1 false)
  store i64 0, ptr %i, align 8
  store i64 0, ptr %0, align 8
  br label %for.interval.loop

for.interval.loop:                                ; preds = %for.interval.post5, %entry
  %19 = load i64, ptr %i, align 8
  %20 = icmp slt i64 %19, 4
  br i1 %20, label %for.interval.body, label %for.interval.done6

for.interval.body:                                ; preds = %for.interval.loop
  %21 = load i64, ptr %i, align 8
  %22 = load i64, ptr %0, align 8
  store i64 %21, ptr %i1, align 8
  store i64 0, ptr %j, align 8
  store i64 0, ptr %1, align 8
  br label %for.interval.loop2

for.interval.loop2:                               ; preds = %for.interval.post, %for.interval.body
  %23 = load i64, ptr %j, align 8
  %24 = icmp slt i64 %23, 4
  br i1 %24, label %for.interval.body3, label %for.interval.done

for.interval.body3:                               ; preds = %for.interval.loop2
  %25 = load i64, ptr %j, align 8
  %26 = load i64, ptr %1, align 8
  store i64 %25, ptr %j4, align 8
  %27 = load i64, ptr %i1, align 8
  %28 = load i64, ptr %j4, align 8
  %29 = mul i64 %28, 4
  %30 = add i64 %27, %29
  call void @runtime.matrix_bounds_check_error(ptr @"ggv$3e", i32 31, i32 10, i64 %27, i64 %28, i64 4, i64 4)
  %31 = getelementptr inbounds %runtime.Context, ptr %__.context_ptr, i32 0, i32 4
  %32 = load %runtime.Random_Generator, ptr %31, align 8
  %33 = call float @rand.float32_range(float 1.000000e+00, float 1.000000e+01, ptr %31, ptr %__.context_ptr)
  store float %33, ptr getelementptr (%main.Globals, ptr @main.g, i64 0, i32 5, i64 %30), align 4
  br label %for.interval.post

for.interval.post:                                ; preds = %for.interval.body3
  %34 = load i64, ptr %j, align 8
  %35 = add i64 %34, 1
  store i64 %35, ptr %j, align 8
  %36 = load i64, ptr %1, align 8
  %37 = add i64 %36, 1
  store i64 %37, ptr %1, align 8
  br label %for.interval.loop2

for.interval.done:                                ; preds = %for.interval.loop2
  br label %for.interval.post5

for.interval.post5:                               ; preds = %for.interval.done
  %38 = load i64, ptr %i, align 8
  %39 = add i64 %38, 1
  store i64 %39, ptr %i, align 8
  %40 = load i64, ptr %0, align 8
  %41 = add i64 %40, 1
  store i64 %41, ptr %0, align 8
  br label %for.interval.loop

for.interval.done6:                               ; preds = %for.interval.loop
  call void @llvm.memset.inline.p0.i64(ptr %2, i8 0, i64 16, i1 false)
  call void @llvm.memset.inline.p0.i64(ptr %3, i8 0, i64 32, i1 false)
  %42 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  store i64 64, ptr %4, align 8
  %43 = getelementptr inbounds %..any, ptr %5, i32 0, i32 0
  %44 = getelementptr inbounds %..any, ptr %5, i32 0, i32 1
  store ptr %4, ptr %43, align 8
  store i64 4683743612465315844, ptr %44, align 8
  %45 = load %..any, ptr %5, align 8
  store %..any %45, ptr %42, align 8
  %46 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %47 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %46, ptr %47, align 8
  %48 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %48, align 8
  %49 = load { ptr, i64 }, ptr %2, align 8
  %50 = call i64 @fmt.println(ptr %2, ptr @"ggv$46", i1 zeroext true, ptr %__.context_ptr)
  %51 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  store i64 32, ptr %6, align 8
  %52 = getelementptr inbounds %..any, ptr %7, i32 0, i32 0
  %53 = getelementptr inbounds %..any, ptr %7, i32 0, i32 1
  store ptr %6, ptr %52, align 8
  store i64 4683743612465315844, ptr %53, align 8
  %54 = load %..any, ptr %7, align 8
  store %..any %54, ptr %51, align 8
  %55 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %56 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %55, ptr %56, align 8
  %57 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %57, align 8
  %58 = load { ptr, i64 }, ptr %2, align 8
  %59 = call i64 @fmt.println(ptr %2, ptr @"ggv$48", i1 zeroext true, ptr %__.context_ptr)
  %60 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  store i64 128, ptr %8, align 8
  %61 = getelementptr inbounds %..any, ptr %9, i32 0, i32 0
  %62 = getelementptr inbounds %..any, ptr %9, i32 0, i32 1
  store ptr %8, ptr %61, align 8
  store i64 4683743612465315844, ptr %62, align 8
  %63 = load %..any, ptr %9, align 8
  store %..any %63, ptr %60, align 8
  %64 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %65 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %64, ptr %65, align 8
  %66 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %66, align 8
  %67 = load { ptr, i64 }, ptr %2, align 8
  %68 = call i64 @fmt.println(ptr %2, ptr @"ggv$4c", i1 zeroext true, ptr %__.context_ptr)
  %69 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  store i64 128, ptr %10, align 8
  %70 = getelementptr inbounds %..any, ptr %11, i32 0, i32 0
  %71 = getelementptr inbounds %..any, ptr %11, i32 0, i32 1
  store ptr %10, ptr %70, align 8
  store i64 4683743612465315844, ptr %71, align 8
  %72 = load %..any, ptr %11, align 8
  store %..any %72, ptr %69, align 8
  %73 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %74 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %73, ptr %74, align 8
  %75 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %75, align 8
  %76 = load { ptr, i64 }, ptr %2, align 8
  %77 = call i64 @fmt.println(ptr %2, ptr @"ggv$4f", i1 zeroext true, ptr %__.context_ptr)
  store ptr @main.g, ptr %struct_memory_as_floats, align 8
  %78 = load ptr, ptr %struct_memory_as_floats, align 8
  %79 = getelementptr [2 x %..any], ptr %3, i64 0, i64 0
  store %..string { ptr @"csbs$bbd", i64 16 }, ptr %12, align 8
  %80 = getelementptr inbounds %..any, ptr %13, i32 0, i32 0
  %81 = getelementptr inbounds %..any, ptr %13, i32 0, i32 1
  store ptr %12, ptr %80, align 8
  store i64 432345564227567625, ptr %81, align 8
  %82 = load %..any, ptr %13, align 8
  store %..any %82, ptr %79, align 8
  %83 = getelementptr [2 x %..any], ptr %3, i64 0, i64 1
  call void @llvm.memset.inline.p0.i64(ptr %14, i8 0, i64 16, i1 false)
  %84 = getelementptr inbounds %..any, ptr %14, i32 0, i32 0
  %85 = getelementptr inbounds %..any, ptr %14, i32 0, i32 1
  store ptr %struct_memory_as_floats, ptr %84, align 8
  store i64 720575940379279361, ptr %85, align 8
  %86 = load %..any, ptr %14, align 8
  store %..any %86, ptr %83, align 8
  %87 = getelementptr [2 x %..any], ptr %3, i64 0, i64 0
  %88 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %87, ptr %88, align 8
  %89 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 2, ptr %89, align 8
  %90 = load { ptr, i64 }, ptr %2, align 8
  %91 = call i64 @fmt.println(ptr %2, ptr @"ggv$5c", i1 zeroext true, ptr %__.context_ptr)
  %92 = load ptr, ptr %struct_memory_as_floats, align 8
  call void @runtime.multi_pointer_slice_expr_error(ptr @"ggv$60", i32 44, i32 37, i64 0, i64 32)
  %93 = getelementptr float, ptr %92, i64 0
  %94 = getelementptr inbounds { ptr, i64 }, ptr %15, i32 0, i32 0
  %95 = getelementptr inbounds { ptr, i64 }, ptr %15, i32 0, i32 1
  store ptr %93, ptr %94, align 8
  store i64 32, ptr %95, align 8
  %96 = load { ptr, i64 }, ptr %15, align 8
  %97 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  call void @llvm.memset.inline.p0.i64(ptr %16, i8 0, i64 16, i1 false)
  %98 = getelementptr inbounds %..any, ptr %16, i32 0, i32 0
  %99 = getelementptr inbounds %..any, ptr %16, i32 0, i32 1
  store ptr %15, ptr %98, align 8
  store i64 1152921504606846981, ptr %99, align 8
  %100 = load %..any, ptr %16, align 8
  store %..any %100, ptr %97, align 8
  %101 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %102 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %101, ptr %102, align 8
  %103 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %103, align 8
  %104 = load { ptr, i64 }, ptr %2, align 8
  %105 = call i64 @fmt.println(ptr %2, ptr @"ggv$67", i1 zeroext true, ptr %__.context_ptr)
  %106 = load [16 x float], ptr getelementptr inbounds (%main.Globals, ptr @main.g, i64 0, i32 5), align 4
  %107 = load [16 x float], ptr %mat, align 4
  %108 = load <16 x float>, ptr getelementptr inbounds (%main.Globals, ptr @main.g, i64 0, i32 5), align 32
  %109 = load <16 x float>, ptr %mat, align 32
  %110 = shufflevector <16 x float> %108, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %111 = shufflevector <16 x float> %108, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %112 = shufflevector <16 x float> %108, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %113 = shufflevector <16 x float> %108, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %114 = shufflevector <16 x float> %109, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %115 = shufflevector <16 x float> %109, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %116 = shufflevector <16 x float> %109, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %117 = shufflevector <16 x float> %109, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  call void @llvm.memset.p0.i64(ptr %17, i8 0, i64 64, i1 false)
  %118 = fmul <4 x float> %110, %114
  %119 = shufflevector <4 x float> %118, <4 x float> undef, <2 x i32> 
  %120 = shufflevector <4 x float> %118, <4 x float> undef, <2 x i32> 
  %121 = fadd <2 x float> %119, %120
  %122 = shufflevector <2 x float> %121, <2 x float> undef, <1 x i32> zeroinitializer
  %123 = shufflevector <2 x float> %121, <2 x float> undef, <1 x i32> 
  %124 = fadd <1 x float> %122, %123
  %125 = extractelement <1 x float> %124, i32 0
  %126 = getelementptr [16 x float], ptr %17, i64 0, i64 0
  store float %125, ptr %126, align 4
  %127 = fmul <4 x float> %110, %115
  %128 = shufflevector <4 x float> %127, <4 x float> undef, <2 x i32> 
  %129 = shufflevector <4 x float> %127, <4 x float> undef, <2 x i32> 
  %130 = fadd <2 x float> %128, %129
  %131 = shufflevector <2 x float> %130, <2 x float> undef, <1 x i32> zeroinitializer
  %132 = shufflevector <2 x float> %130, <2 x float> undef, <1 x i32> 
  %133 = fadd <1 x float> %131, %132
  %134 = extractelement <1 x float> %133, i32 0
  %135 = getelementptr [16 x float], ptr %17, i64 0, i64 4
  store float %134, ptr %135, align 4
  %136 = fmul <4 x float> %110, %116
  %137 = shufflevector <4 x float> %136, <4 x float> undef, <2 x i32> 
  %138 = shufflevector <4 x float> %136, <4 x float> undef, <2 x i32> 
  %139 = fadd <2 x float> %137, %138
  %140 = shufflevector <2 x float> %139, <2 x float> undef, <1 x i32> zeroinitializer
  %141 = shufflevector <2 x float> %139, <2 x float> undef, <1 x i32> 
  %142 = fadd <1 x float> %140, %141
  %143 = extractelement <1 x float> %142, i32 0
  %144 = getelementptr [16 x float], ptr %17, i64 0, i64 8
  store float %143, ptr %144, align 4
  %145 = fmul <4 x float> %110, %117
  %146 = shufflevector <4 x float> %145, <4 x float> undef, <2 x i32> 
  %147 = shufflevector <4 x float> %145, <4 x float> undef, <2 x i32> 
  %148 = fadd <2 x float> %146, %147
  %149 = shufflevector <2 x float> %148, <2 x float> undef, <1 x i32> zeroinitializer
  %150 = shufflevector <2 x float> %148, <2 x float> undef, <1 x i32> 
  %151 = fadd <1 x float> %149, %150
  %152 = extractelement <1 x float> %151, i32 0
  %153 = getelementptr [16 x float], ptr %17, i64 0, i64 12
  store float %152, ptr %153, align 4
  %154 = fmul <4 x float> %111, %114
  %155 = shufflevector <4 x float> %154, <4 x float> undef, <2 x i32> 
  %156 = shufflevector <4 x float> %154, <4 x float> undef, <2 x i32> 
  %157 = fadd <2 x float> %155, %156
  %158 = shufflevector <2 x float> %157, <2 x float> undef, <1 x i32> zeroinitializer
  %159 = shufflevector <2 x float> %157, <2 x float> undef, <1 x i32> 
  %160 = fadd <1 x float> %158, %159
  %161 = extractelement <1 x float> %160, i32 0
  %162 = getelementptr [16 x float], ptr %17, i64 0, i64 1
  store float %161, ptr %162, align 4
  %163 = fmul <4 x float> %111, %115
  %164 = shufflevector <4 x float> %163, <4 x float> undef, <2 x i32> 
  %165 = shufflevector <4 x float> %163, <4 x float> undef, <2 x i32> 
  %166 = fadd <2 x float> %164, %165
  %167 = shufflevector <2 x float> %166, <2 x float> undef, <1 x i32> zeroinitializer
  %168 = shufflevector <2 x float> %166, <2 x float> undef, <1 x i32> 
  %169 = fadd <1 x float> %167, %168
  %170 = extractelement <1 x float> %169, i32 0
  %171 = getelementptr [16 x float], ptr %17, i64 0, i64 5
  store float %170, ptr %171, align 4
  %172 = fmul <4 x float> %111, %116
  %173 = shufflevector <4 x float> %172, <4 x float> undef, <2 x i32> 
  %174 = shufflevector <4 x float> %172, <4 x float> undef, <2 x i32> 
  %175 = fadd <2 x float> %173, %174
  %176 = shufflevector <2 x float> %175, <2 x float> undef, <1 x i32> zeroinitializer
  %177 = shufflevector <2 x float> %175, <2 x float> undef, <1 x i32> 
  %178 = fadd <1 x float> %176, %177
  %179 = extractelement <1 x float> %178, i32 0
  %180 = getelementptr [16 x float], ptr %17, i64 0, i64 9
  store float %179, ptr %180, align 4
  %181 = fmul <4 x float> %111, %117
  %182 = shufflevector <4 x float> %181, <4 x float> undef, <2 x i32> 
  %183 = shufflevector <4 x float> %181, <4 x float> undef, <2 x i32> 
  %184 = fadd <2 x float> %182, %183
  %185 = shufflevector <2 x float> %184, <2 x float> undef, <1 x i32> zeroinitializer
  %186 = shufflevector <2 x float> %184, <2 x float> undef, <1 x i32> 
  %187 = fadd <1 x float> %185, %186
  %188 = extractelement <1 x float> %187, i32 0
  %189 = getelementptr [16 x float], ptr %17, i64 0, i64 13
  store float %188, ptr %189, align 4
  %190 = fmul <4 x float> %112, %114
  %191 = shufflevector <4 x float> %190, <4 x float> undef, <2 x i32> 
  %192 = shufflevector <4 x float> %190, <4 x float> undef, <2 x i32> 
  %193 = fadd <2 x float> %191, %192
  %194 = shufflevector <2 x float> %193, <2 x float> undef, <1 x i32> zeroinitializer
  %195 = shufflevector <2 x float> %193, <2 x float> undef, <1 x i32> 
  %196 = fadd <1 x float> %194, %195
  %197 = extractelement <1 x float> %196, i32 0
  %198 = getelementptr [16 x float], ptr %17, i64 0, i64 2
  store float %197, ptr %198, align 4
  %199 = fmul <4 x float> %112, %115
  %200 = shufflevector <4 x float> %199, <4 x float> undef, <2 x i32> 
  %201 = shufflevector <4 x float> %199, <4 x float> undef, <2 x i32> 
  %202 = fadd <2 x float> %200, %201
  %203 = shufflevector <2 x float> %202, <2 x float> undef, <1 x i32> zeroinitializer
  %204 = shufflevector <2 x float> %202, <2 x float> undef, <1 x i32> 
  %205 = fadd <1 x float> %203, %204
  %206 = extractelement <1 x float> %205, i32 0
  %207 = getelementptr [16 x float], ptr %17, i64 0, i64 6
  store float %206, ptr %207, align 4
  %208 = fmul <4 x float> %112, %116
  %209 = shufflevector <4 x float> %208, <4 x float> undef, <2 x i32> 
  %210 = shufflevector <4 x float> %208, <4 x float> undef, <2 x i32> 
  %211 = fadd <2 x float> %209, %210
  %212 = shufflevector <2 x float> %211, <2 x float> undef, <1 x i32> zeroinitializer
  %213 = shufflevector <2 x float> %211, <2 x float> undef, <1 x i32> 
  %214 = fadd <1 x float> %212, %213
  %215 = extractelement <1 x float> %214, i32 0
  %216 = getelementptr [16 x float], ptr %17, i64 0, i64 10
  store float %215, ptr %216, align 4
  %217 = fmul <4 x float> %112, %117
  %218 = shufflevector <4 x float> %217, <4 x float> undef, <2 x i32> 
  %219 = shufflevector <4 x float> %217, <4 x float> undef, <2 x i32> 
  %220 = fadd <2 x float> %218, %219
  %221 = shufflevector <2 x float> %220, <2 x float> undef, <1 x i32> zeroinitializer
  %222 = shufflevector <2 x float> %220, <2 x float> undef, <1 x i32> 
  %223 = fadd <1 x float> %221, %222
  %224 = extractelement <1 x float> %223, i32 0
  %225 = getelementptr [16 x float], ptr %17, i64 0, i64 14
  store float %224, ptr %225, align 4
  %226 = fmul <4 x float> %113, %114
  %227 = shufflevector <4 x float> %226, <4 x float> undef, <2 x i32> 
  %228 = shufflevector <4 x float> %226, <4 x float> undef, <2 x i32> 
  %229 = fadd <2 x float> %227, %228
  %230 = shufflevector <2 x float> %229, <2 x float> undef, <1 x i32> zeroinitializer
  %231 = shufflevector <2 x float> %229, <2 x float> undef, <1 x i32> 
  %232 = fadd <1 x float> %230, %231
  %233 = extractelement <1 x float> %232, i32 0
  %234 = getelementptr [16 x float], ptr %17, i64 0, i64 3
  store float %233, ptr %234, align 4
  %235 = fmul <4 x float> %113, %115
  %236 = shufflevector <4 x float> %235, <4 x float> undef, <2 x i32> 
  %237 = shufflevector <4 x float> %235, <4 x float> undef, <2 x i32> 
  %238 = fadd <2 x float> %236, %237
  %239 = shufflevector <2 x float> %238, <2 x float> undef, <1 x i32> zeroinitializer
  %240 = shufflevector <2 x float> %238, <2 x float> undef, <1 x i32> 
  %241 = fadd <1 x float> %239, %240
  %242 = extractelement <1 x float> %241, i32 0
  %243 = getelementptr [16 x float], ptr %17, i64 0, i64 7
  store float %242, ptr %243, align 4
  %244 = fmul <4 x float> %113, %116
  %245 = shufflevector <4 x float> %244, <4 x float> undef, <2 x i32> 
  %246 = shufflevector <4 x float> %244, <4 x float> undef, <2 x i32> 
  %247 = fadd <2 x float> %245, %246
  %248 = shufflevector <2 x float> %247, <2 x float> undef, <1 x i32> zeroinitializer
  %249 = shufflevector <2 x float> %247, <2 x float> undef, <1 x i32> 
  %250 = fadd <1 x float> %248, %249
  %251 = extractelement <1 x float> %250, i32 0
  %252 = getelementptr [16 x float], ptr %17, i64 0, i64 11
  store float %251, ptr %252, align 4
  %253 = fmul <4 x float> %113, %117
  %254 = shufflevector <4 x float> %253, <4 x float> undef, <2 x i32> 
  %255 = shufflevector <4 x float> %253, <4 x float> undef, <2 x i32> 
  %256 = fadd <2 x float> %254, %255
  %257 = shufflevector <2 x float> %256, <2 x float> undef, <1 x i32> zeroinitializer
  %258 = shufflevector <2 x float> %256, <2 x float> undef, <1 x i32> 
  %259 = fadd <1 x float> %257, %258
  %260 = extractelement <1 x float> %259, i32 0
  %261 = getelementptr [16 x float], ptr %17, i64 0, i64 15
  store float %260, ptr %261, align 4
  %262 = load [16 x float], ptr %17, align 4
  %263 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  call void @llvm.memset.inline.p0.i64(ptr %18, i8 0, i64 16, i1 false)
  %264 = getelementptr inbounds %..any, ptr %18, i32 0, i32 0
  %265 = getelementptr inbounds %..any, ptr %18, i32 0, i32 1
  store ptr %17, ptr %264, align 8
  store i64 1729382256910270472, ptr %265, align 8
  %266 = load %..any, ptr %18, align 8
  store %..any %266, ptr %263, align 8
  %267 = getelementptr [1 x %..any], ptr %3, i64 0, i64 0
  %268 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 0
  store ptr %267, ptr %268, align 8
  %269 = getelementptr inbounds { ptr, i64 }, ptr %2, i32 0, i32 1
  store i64 1, ptr %269, align 8
  %270 = load { ptr, i64 }, ptr %2, align 8
  %271 = call i64 @fmt.println(ptr %2, ptr @"ggv$6f", i1 zeroext true, ptr %__.context_ptr)
  ret void
}




Use of instruction is not an instruction!
  %30 = add i64 %27, %29