# Copyright 2025 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import numpy as np

array = np.array
float32 = np.float32


# Pasted from the test output (see export_back_compat_test_util.py module docstring)
data_2025_04_22 = dict(
    testdata_version=1,
    platform='cuda',
    custom_call_targets=['mosaic_gpu'],
    serialized_date=datetime.date(2025, 4, 22),
    inputs=(array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
       110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
       121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131.,
       132., 133., 134., 135., 136., 137., 138., 139., 140., 141., 142.,
       143., 144., 145., 146., 147., 148., 149., 150., 151., 152., 153.,
       154., 155., 156., 157., 158., 159., 160., 161., 162., 163., 164.,
       165., 166., 167., 168., 169., 170., 171., 172., 173., 174., 175.,
       176., 177., 178., 179., 180., 181., 182., 183., 184., 185., 186.,
       187., 188., 189., 190., 191., 192., 193., 194., 195., 196., 197.,
       198., 199., 200., 201., 202., 203., 204., 205., 206., 207., 208.,
       209., 210., 211., 212., 213., 214., 215., 216., 217., 218., 219.,
       220., 221., 222., 223., 224., 225., 226., 227., 228., 229., 230.,
       231., 232., 233., 234., 235., 236., 237., 238., 239., 240., 241.,
       242., 243., 244., 245., 246., 247., 248., 249., 250., 251., 252.,
       253., 254., 255.], dtype=float32),),
    expected_outputs=(array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100., 101., 102., 103., 104., 105., 106., 107., 108., 109., 110.,
       111., 112., 113., 114., 115., 116., 117., 118., 119., 120., 121.,
       122., 123., 124., 125., 126., 127., 128., 129., 130., 131., 132.,
       133., 134., 135., 136., 137., 138., 139., 140., 141., 142., 143.,
       144., 145., 146., 147., 148., 149., 150., 151., 152., 153., 154.,
       155., 156., 157., 158., 159., 160., 161., 162., 163., 164., 165.,
       166., 167., 168., 169., 170., 171., 172., 173., 174., 175., 176.,
       177., 178., 179., 180., 181., 182., 183., 184., 185., 186., 187.,
       188., 189., 190., 191., 192., 193., 194., 195., 196., 197., 198.,
       199., 200., 201., 202., 203., 204., 205., 206., 207., 208., 209.,
       210., 211., 212., 213., 214., 215., 216., 217., 218., 219., 220.,
       221., 222., 223., 224., 225., 226., 227., 228., 229., 230., 231.,
       232., 233., 234., 235., 236., 237., 238., 239., 240., 241., 242.,
       243., 244., 245., 246., 247., 248., 249., 250., 251., 252., 253.,
       254., 255., 256.], dtype=float32),),
    mlir_module_text=r"""
#loc1 = loc("args[0]")
module @jit_wrapped attributes {jax.uses_shape_polymorphism = false, mhlo.num_partitions = 1 : i32, mhlo.num_replicas = 1 : i32} {
  func.func public @main(%arg0: tensor<256xf32> loc("args[0]")) -> (tensor<256xf32> {jax.result_info = "result"}) {
    %0 = stablehlo.custom_call @mosaic_gpu(%arg0) {api_version = 2 : i32, backend_config = "\A9C\FB\81\9A1\C2?\0E\F4\E1\E4\E77\03\B6\97\E5G(]WR\98\EB{\BA\8A\84\01\12'#loc = loc(\22third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py\22:83:4)\0A#loc1 = loc(\22-\22:94:40)\0A#loc2 = loc(\22-\22:94:47)\0A#loc3 = loc(\22-\22:94:54)\0A#loc4 = loc(\22-\22:94:116)\0A#loc5 = loc(\22-\22:94:123)\0A#loc6 = loc(\22-\22:94:130)\0A#loc7 = loc(\22-\22:94:65)\0A#loc8 = loc(\22-\22:94:78)\0A#loc9 = loc(\22-\22:94:91)\0A#loc10 = loc(\22-\22:94:141)\0A#loc11 = loc(\22-\22:94:157)\0A#loc12 = loc(\22-\22:94:174)\0A#loc17 = loc(\22jit(wrapped)/jit(main)/pallas_call\22(#loc))\0A\22builtin.module\22() <{sym_name = \22add_one\22}> ({\0A  \22stable_mosaic_gpu.func.func\22() ({\0A  }) {function_type = (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> (), sym_name = \22mosaic_gpu_init_tma_desc\22, sym_visibility = \22private\22} : () -> () loc(#loc17)\0A  \22stable_mosaic_gpu.llvm.mlir.global\22() ({\0A  }) {addr_space = 4 : i32, global_type = !llvm.array<0 x i8>, linkage = #llvm.linkage<external>, sym_name = \22global_scratch\22, unnamed_addr = 0 : i64, visibility_ = 0 : i64} : () -> () loc(#loc17)\0A  \22stable_mosaic_gpu.func.func\22() ({\0A  ^bb0(%arg0: !llvm.ptr loc(\22jit(wrapped)/jit(main)/pallas_call\22(#loc)), %arg1: !llvm.ptr loc(\22jit(wrapped)/jit(main)/pallas_call\22(#loc))):\0A    %0 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%arg0) : (!llvm.ptr) -> !gpu.async.token loc(#loc17)\0A    %1 = \22stable_mosaic_gpu.llvm.getelementptr\22(%arg1) {elem_type = !llvm.ptr, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %2 = \22stable_mosaic_gpu.llvm.load\22(%1) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %3 = \22stable_mosaic_gpu.llvm.mlir.undef\22() : () -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %4 = \22stable_mosaic_gpu.llvm.insertvalue\22(%3, %2) {position = array<i64: 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %5 = \22stable_mosaic_gpu.llvm.insertvalue\22(%4, %2) {position = array<i64: 1>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %6 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 0 : i64} : () -> i64 loc(#loc17)\0A    %7 = \22stable_mosaic_gpu.llvm.insertvalue\22(%5, %6) {position = array<i64: 2>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %8 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 256 : i64} : () -> i64 loc(#loc17)\0A    %9 = \22stable_mosaic_gpu.llvm.insertvalue\22(%7, %8) {position = array<i64: 3, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %10 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %11 = \22stable_mosaic_gpu.llvm.insertvalue\22(%9, %10) {position = array<i64: 4, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %12 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%11) : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>) -> memref<256xf32> loc(#loc17)\0A    %13 = \22stable_mosaic_gpu.llvm.getelementptr\22(%arg1) {elem_type = !llvm.ptr, rawConstantIndices = array<i32: 1>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %14 = \22stable_mosaic_gpu.llvm.load\22(%13) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %15 = \22stable_mosaic_gpu.llvm.mlir.undef\22() : () -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %16 = \22stable_mosaic_gpu.llvm.insertvalue\22(%15, %14) {position = array<i64: 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %17 = \22stable_mosaic_gpu.llvm.insertvalue\22(%16, %14) {position = array<i64: 1>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %18 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 0 : i64} : () -> i64 loc(#loc17)\0A    %19 = \22stable_mosaic_gpu.llvm.insertvalue\22(%17, %18) {position = array<i64: 2>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %20 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 256 : i64} : () -> i64 loc(#loc17)\0A    %21 = \22stable_mosaic_gpu.llvm.insertvalue\22(%19, %20) {position = array<i64: 3, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %22 = \22stable_mosaic_gpu.llvm.mlir.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %23 = \22stable_mosaic_gpu.llvm.insertvalue\22(%21, %22) {position = array<i64: 4, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A    %24 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%23) : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>) -> memref<256xf32> loc(#loc17)\0A    %25 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %26 = \22stable_mosaic_gpu.llvm.alloca\22(%25) {alignment = 64 : i64, elem_type = !llvm.array<256 x i8>} : (i64) -> !llvm.ptr loc(#loc17)\0A    %27 = \22stable_mosaic_gpu.llvm.getelementptr\22(%26) {elem_type = i8, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %28:4 = \22stable_mosaic_gpu.memref.extract_strided_metadata\22(%12) : (memref<256xf32>) -> (memref<f32>, index, index, index) loc(#loc17)\0A    %29 = \22stable_mosaic_gpu.memref.extract_aligned_pointer_as_index\22(%12) : (memref<256xf32>) -> index loc(#loc17)\0A    %30 = \22stable_mosaic_gpu.arith.index_cast\22(%29) : (index) -> i64 loc(#loc17)\0A    %31 = \22stable_mosaic_gpu.llvm.inttoptr\22(%30) : (i64) -> !llvm.ptr loc(#loc17)\0A    %32 = \22stable_mosaic_gpu.arith.index_cast\22(%28#1) : (index) -> i64 loc(#loc17)\0A    %33 = \22stable_mosaic_gpu.llvm.getelementptr\22(%31, %32) {elem_type = f32, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr, i64) -> !llvm.ptr loc(#loc17)\0A    %34 = \22stable_mosaic_gpu.arith.constant\22() {value = 6 : i64} : () -> i64 loc(#loc17)\0A    %35 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %36 = \22stable_mosaic_gpu.arith.index_cast\22(%28#2) : (index) -> i64 loc(#loc17)\0A    %37 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %38 = \22stable_mosaic_gpu.llvm.alloca\22(%37) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %39 = \22stable_mosaic_gpu.llvm.getelementptr\22(%38) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%36, %39) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    %40 = \22stable_mosaic_gpu.arith.index_cast\22(%28#3) : (index) -> i64 loc(#loc17)\0A    %41 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %42 = \22stable_mosaic_gpu.llvm.alloca\22(%41) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %43 = \22stable_mosaic_gpu.llvm.getelementptr\22(%42) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%40, %43) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    %44 = \22stable_mosaic_gpu.arith.constant\22() {value = 16 : i64} : () -> i64 loc(#loc17)\0A    %45 = \22stable_mosaic_gpu.arith.constant\22() {value = 256 : i64} : () -> i64 loc(#loc17)\0A    %46 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %47 = \22stable_mosaic_gpu.llvm.alloca\22(%46) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %48 = \22stable_mosaic_gpu.llvm.getelementptr\22(%47) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%45, %48) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    \22stable_mosaic_gpu.func.call\22(%27, %33, %34, %35, %38, %42, %44, %47) {callee = @mosaic_gpu_init_tma_desc} : (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () loc(#loc17)\0A    %49 = \22stable_mosaic_gpu.llvm.getelementptr\22(%26) {elem_type = i8, rawConstantIndices = array<i32: 128>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    %50:4 = \22stable_mosaic_gpu.memref.extract_strided_metadata\22(%24) : (memref<256xf32>) -> (memref<f32>, index, index, index) loc(#loc17)\0A    %51 = \22stable_mosaic_gpu.memref.extract_aligned_pointer_as_index\22(%24) : (memref<256xf32>) -> index loc(#loc17)\0A    %52 = \22stable_mosaic_gpu.arith.index_cast\22(%51) : (index) -> i64 loc(#loc17)\0A    %53 = \22stable_mosaic_gpu.llvm.inttoptr\22(%52) : (i64) -> !llvm.ptr loc(#loc17)\0A    %54 = \22stable_mosaic_gpu.arith.index_cast\22(%50#1) : (index) -> i64 loc(#loc17)\0A    %55 = \22stable_mosaic_gpu.llvm.getelementptr\22(%53, %54) {elem_type = f32, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr, i64) -> !llvm.ptr loc(#loc17)\0A    %56 = \22stable_mosaic_gpu.arith.constant\22() {value = 6 : i64} : () -> i64 loc(#loc17)\0A    %57 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %58 = \22stable_mosaic_gpu.arith.index_cast\22(%50#2) : (index) -> i64 loc(#loc17)\0A    %59 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %60 = \22stable_mosaic_gpu.llvm.alloca\22(%59) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %61 = \22stable_mosaic_gpu.llvm.getelementptr\22(%60) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%58, %61) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    %62 = \22stable_mosaic_gpu.arith.index_cast\22(%50#3) : (index) -> i64 loc(#loc17)\0A    %63 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %64 = \22stable_mosaic_gpu.llvm.alloca\22(%63) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %65 = \22stable_mosaic_gpu.llvm.getelementptr\22(%64) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%62, %65) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    %66 = \22stable_mosaic_gpu.arith.constant\22() {value = 16 : i64} : () -> i64 loc(#loc17)\0A    %67 = \22stable_mosaic_gpu.arith.constant\22() {value = 256 : i64} : () -> i64 loc(#loc17)\0A    %68 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A    %69 = \22stable_mosaic_gpu.llvm.alloca\22(%68) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\0A    %70 = \22stable_mosaic_gpu.llvm.getelementptr\22(%69) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\0A    \22stable_mosaic_gpu.llvm.store\22(%67, %70) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\0A    \22stable_mosaic_gpu.func.call\22(%49, %55, %56, %57, %60, %64, %66, %69) {callee = @mosaic_gpu_init_tma_desc} : (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () loc(#loc17)\0A    %71 = \22stable_mosaic_gpu.llvm.load\22(%26) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.array<256 x i8> loc(#loc17)\0A    %72 = \22stable_mosaic_gpu.arith.constant\22() {value = 2 : index} : () -> index loc(#loc17)\0A    %73 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : index} : () -> index loc(#loc17)\0A    %74 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : index} : () -> index loc(#loc17)\0A    %75 = \22stable_mosaic_gpu.arith.constant\22() {value = 128 : index} : () -> index loc(#loc17)\0A    %76 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : index} : () -> index loc(#loc17)\0A    %77 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : index} : () -> index loc(#loc17)\0A    %78 = \22stable_mosaic_gpu.arith.constant\22() {value = 2056 : i32} : () -> i32 loc(#loc17)\0A    %79 = \22stable_mosaic_gpu.gpu.launch\22(%0, %72, %73, %74, %75, %76, %77, %78) ({\0A    ^bb0(%arg2: index loc(\22-\22:94:40), %arg3: index loc(\22-\22:94:47), %arg4: index loc(\22-\22:94:54), %arg5: index loc(\22-\22:94:116), %arg6: index loc(\22-\22:94:123), %arg7: index loc(\22-\22:94:130), %arg8: index loc(\22-\22:94:65), %arg9: index loc(\22-\22:94:78), %arg10: index loc(\22-\22:94:91), %arg11: index loc(\22-\22:94:141), %arg12: index loc(\22-\22:94:157), %arg13: index loc(\22-\22:94:174)):\0A      %80 = \22stable_mosaic_gpu.gpu.dynamic_shared_memory\22() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc17)\0A      %81 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%71) : (!llvm.array<256 x i8>) -> !llvm.ptr loc(#loc17)\0A      %82 = \22stable_mosaic_gpu.llvm.getelementptr\22(%81) {elem_type = i8, rawConstantIndices = array<i32: 128>} : (!llvm.ptr) -> !llvm.ptr loc(#loc18)\0A      %83 = \22stable_mosaic_gpu.llvm.getelementptr\22(%81) {elem_type = i8, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc19)\0A      %84 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc17)\0A      %85 = \22stable_mosaic_gpu.memref.view\22(%80, %84) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<2048xi8, #gpu.address_space<workgroup>> loc(#loc17)\0A      %86 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%80) : (memref<?xi8, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\0A      %87 = \22stable_mosaic_gpu.llvm.extractvalue\22(%86) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc17)\0A      %88 = \22stable_mosaic_gpu.llvm.extractvalue\22(%86) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc17)\0A      %89 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i64} : () -> i64 loc(#loc17)\0A      %90 = \22stable_mosaic_gpu.llvm.mul\22(%88, %89) : (i64, i64) -> i64 loc(#loc17)\0A      %91 = \22stable_mosaic_gpu.llvm.ptrtoint\22(%87) : (!llvm.ptr<3>) -> i64 loc(#loc17)\0A      %92 = \22stable_mosaic_gpu.llvm.add\22(%91, %90) : (i64, i64) -> i64 loc(#loc17)\0A      %93 = \22stable_mosaic_gpu.llvm.inttoptr\22(%92) : (i64) -> !llvm.ptr<3> loc(#loc17)\0A      %94 = \22stable_mosaic_gpu.llvm.getelementptr\22(%93) {elem_type = i8, rawConstantIndices = array<i32: 2048>} : (!llvm.ptr<3>) -> !llvm.ptr<3> loc(#loc17)\0A      %95 = \22stable_mosaic_gpu.memref.alloca\22() {operandSegmentSizes = array<i32: 0, 0>} : () -> memref<i32> loc(#loc17)\0A      %96 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      \22stable_mosaic_gpu.memref.store\22(%96, %95) : (i32, memref<i32>) -> () loc(#loc17)\0A      %97 = \22stable_mosaic_gpu.nvvm.elect.sync\22() : () -> i1 loc(#loc17)\0A      %98 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\0A      %99 = \22stable_mosaic_gpu.arith.index_cast\22(%98) : (index) -> i32 loc(#loc17)\0A      %100 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\0A      %101 = \22stable_mosaic_gpu.arith.index_cast\22(%100) : (index) -> i32 loc(#loc17)\0A      %102 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\0A      %103 = \22stable_mosaic_gpu.arith.index_cast\22(%102) : (index) -> i32 loc(#loc17)\0A      %104 = \22stable_mosaic_gpu.arith.muli\22(%103, %101) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %105 = \22stable_mosaic_gpu.arith.addi\22(%99, %104) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %106 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\0A      %107 = \22stable_mosaic_gpu.arith.index_cast\22(%106) : (index) -> i32 loc(#loc17)\0A      %108 = \22stable_mosaic_gpu.arith.muli\22(%101, %107) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %109 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\0A      %110 = \22stable_mosaic_gpu.arith.index_cast\22(%109) : (index) -> i32 loc(#loc17)\0A      %111 = \22stable_mosaic_gpu.arith.muli\22(%110, %108) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %112 = \22stable_mosaic_gpu.arith.addi\22(%105, %111) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %113 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\0A      %114 = \22stable_mosaic_gpu.arith.index_cast\22(%113) : (index) -> i32 loc(#loc17)\0A      %115 = \22stable_mosaic_gpu.arith.muli\22(%108, %114) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %116 = \22stable_mosaic_gpu.arith.constant\22() {value = 5 : i32} : () -> i32 loc(#loc17)\0A      %117 = \22stable_mosaic_gpu.arith.shrui\22(%112, %116) : (i32, i32) -> i32 loc(#loc17)\0A      %118 = \22stable_mosaic_gpu.arith.constant\22() {value = -1 : i32} : () -> i32 loc(#loc17)\0A      %119 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      %120 = \22stable_mosaic_gpu.arith.constant\22() {value = 31 : i32} : () -> i32 loc(#loc17)\0A      %121 = \22stable_mosaic_gpu.nvvm.shfl.sync\22(%118, %117, %119, %120) {kind = #nvvm<shfl_kind idx>} : (i32, i32, i32, i32) -> i32 loc(#loc17)\0A      %122 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      %123 = \22stable_mosaic_gpu.arith.cmpi\22(%121, %122) {predicate = 0 : i64} : (i32, i32) -> i1 loc(#loc17)\0A      %124 = \22stable_mosaic_gpu.arith.andi\22(%123, %97) : (i1, i1) -> i1 loc(#loc17)\0A      \22stable_mosaic_gpu.scf.if\22(%124) ({\0A        %332 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr<3>) -> !llvm.ptr<3> loc(#loc17)\0A        %333 = \22stable_mosaic_gpu.arith.constant\22() {value = 128 : i32} : () -> i32 loc(#loc17)\0A        \22stable_mosaic_gpu.nvvm.mbarrier.init.shared\22(%332, %333) : (!llvm.ptr<3>, i32) -> () loc(#loc17)\0A        \22stable_mosaic_gpu.scf.yield\22() : () -> () loc(#loc13)\0A      }, {\0A      }) : (i1) -> () loc(#loc17)\0A      %125 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      \22stable_mosaic_gpu.nvvm.fence.mbarrier.init\22() : () -> () loc(#loc17)\0A      \22stable_mosaic_gpu.gpu.barrier\22() : () -> () loc(#loc17)\0A      %126 = \22stable_mosaic_gpu.nvvm.elect.sync\22() : () -> i1 loc(#loc17)\0A      %127 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\0A      %128 = \22stable_mosaic_gpu.arith.index_cast\22(%127) : (index) -> i32 loc(#loc17)\0A      %129 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\0A      %130 = \22stable_mosaic_gpu.arith.index_cast\22(%129) : (index) -> i32 loc(#loc17)\0A      %131 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\0A      %132 = \22stable_mosaic_gpu.arith.index_cast\22(%131) : (index) -> i32 loc(#loc17)\0A      %133 = \22stable_mosaic_gpu.arith.muli\22(%132, %130) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %134 = \22stable_mosaic_gpu.arith.addi\22(%128, %133) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %135 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\0A      %136 = \22stable_mosaic_gpu.arith.index_cast\22(%135) : (index) -> i32 loc(#loc17)\0A      %137 = \22stable_mosaic_gpu.arith.muli\22(%130, %136) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %138 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\0A      %139 = \22stable_mosaic_gpu.arith.index_cast\22(%138) : (index) -> i32 loc(#loc17)\0A      %140 = \22stable_mosaic_gpu.arith.muli\22(%139, %137) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %141 = \22stable_mosaic_gpu.arith.addi\22(%134, %140) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %142 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\0A      %143 = \22stable_mosaic_gpu.arith.index_cast\22(%142) : (index) -> i32 loc(#loc17)\0A      %144 = \22stable_mosaic_gpu.arith.muli\22(%137, %143) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\0A      %145 = \22stable_mosaic_gpu.arith.constant\22() {value = 5 : i32} : () -> i32 loc(#loc17)\0A      %146 = \22stable_mosaic_gpu.arith.shrui\22(%141, %145) : (i32, i32) -> i32 loc(#loc17)\0A      %147 = \22stable_mosaic_gpu.arith.constant\22() {value = -1 : i32} : () -> i32 loc(#loc17)\0A      %148 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      %149 = \22stable_mosaic_gpu.arith.constant\22() {value = 31 : i32} : () -> i32 loc(#loc17)\0A      %150 = \22stable_mosaic_gpu.nvvm.shfl.sync\22(%147, %146, %148, %149) {kind = #nvvm<shfl_kind idx>} : (i32, i32, i32, i32) -> i32 loc(#loc17)\0A      %151 = \22stable_mosaic_gpu.arith.constant\22() {value = 4 : i32} : () -> i32 loc(#loc17)\0A      %152 = \22stable_mosaic_gpu.arith.remui\22(%150, %151) : (i32, i32) -> i32 loc(#loc17)\0A      %153 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc17)\0A      %154 = \22stable_mosaic_gpu.arith.cmpi\22(%152, %153) {predicate = 0 : i64} : (i32, i32) -> i1 loc(#loc17)\0A      %155 = \22stable_mosaic_gpu.arith.andi\22(%154, %126) : (i1, i1) -> i1 loc(#loc17)\0A      %156 = \22stable_mosaic_gpu.nvvm.elect.sync\22() : () -> i1 loc(#loc17)\0A      %157 = \22stable_mosaic_gpu.gpu.block_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\0A      %158 = \22stable_mosaic_gpu.arith.index_cast\22(%157) : (index) -> i32 loc(#loc17)\0A      %159 = \22stable_mosaic_gpu.gpu.dynamic_shared_memory\22() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc20)\0A      %160 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc20)\0A      %161 = \22stable_mosaic_gpu.memref.view\22(%159, %160) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<1x256xf32, #gpu.address_space<workgroup>> loc(#loc20)\0A      %162 = \22stable_mosaic_gpu.gpu.dynamic_shared_memory\22() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc20)\0A      %163 = \22stable_mosaic_gpu.arith.constant\22() {value = 1024 : index} : () -> index loc(#loc20)\0A      %164 = \22stable_mosaic_gpu.memref.view\22(%162, %163) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<1x256xf32, #gpu.address_space<workgroup>> loc(#loc20)\0A      %165 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc19)\0A      %166 = \22stable_mosaic_gpu.memref.subview\22(%161, %165) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc19)\0A      %167 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc19)\0A      %168 = \22stable_mosaic_gpu.arith.index_castui\22(%167) : (index) -> i32 loc(#loc19)\0A      %169 = \22stable_mosaic_gpu.arith.addi\22(%125, %168) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc19)\0A      %170 = \22stable_mosaic_gpu.arith.constant\22() {value = 8 : i32} : () -> i32 loc(#loc19)\0A      %171 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94, %169) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\0A      \22stable_mosaic_gpu.nvvm.mbarrier.arrive.expect_tx.shared\22(%171, %170) : (!llvm.ptr<3>, i32) -> () loc(#loc19)\0A      %172 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc19)\0A      %173 = \22stable_mosaic_gpu.arith.index_cast\22(%172) : (index) -> i32 loc(#loc19)\0A      %174 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%166) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc19)\0A      %175 = \22stable_mosaic_gpu.llvm.extractvalue\22(%174) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc19)\0A      %176 = \22stable_mosaic_gpu.llvm.extractvalue\22(%174) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc19)\0A      %177 = \22stable_mosaic_gpu.arith.constant\22() {value = 4 : i64} : () -> i64 loc(#loc19)\0A      %178 = \22stable_mosaic_gpu.llvm.mul\22(%176, %177) : (i64, i64) -> i64 loc(#loc19)\0A      %179 = \22stable_mosaic_gpu.llvm.ptrtoint\22(%175) : (!llvm.ptr<3>) -> i64 loc(#loc19)\0A      %180 = \22stable_mosaic_gpu.llvm.add\22(%179, %178) : (i64, i64) -> i64 loc(#loc19)\0A      %181 = \22stable_mosaic_gpu.llvm.inttoptr\22(%180) : (i64) -> !llvm.ptr<3> loc(#loc19)\0A      %182 = \22stable_mosaic_gpu.arith.constant\22() {value = 1024 : i32} : () -> i32 loc(#loc19)\0A      %183 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94, %169) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\0A      \22stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.shared.cluster.global\22(%181, %83, %173, %183, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 1>} : (!llvm.ptr<3>, !llvm.ptr, i32, !llvm.ptr<3>, i1) -> () loc(#loc19)\0A      %184 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc21)\0A      %185 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc21)\0A      %186 = \22stable_mosaic_gpu.arith.addi\22(%185, %184) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc21)\0A      %187 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc22)\0A      %188 = \22stable_mosaic_gpu.arith.remsi\22(%186, %187) : (i32, i32) -> i32 loc(#loc22)\0A      %189 = \22stable_mosaic_gpu.arith.index_cast\22(%188) : (i32) -> index loc(#loc23)\0A      %190 = \22stable_mosaic_gpu.arith.index_castui\22(%189) : (index) -> i32 loc(#loc23)\0A      %191 = \22stable_mosaic_gpu.arith.addi\22(%125, %190) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc23)\0A      %192 = \22stable_mosaic_gpu.memref.load\22(%95) : (memref<i32>) -> i32 loc(#loc23)\0A      %193 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc23)\0A      %194 = \22stable_mosaic_gpu.arith.shli\22(%193, %191) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc23)\0A      %195 = \22stable_mosaic_gpu.arith.andi\22(%192, %194) : (i32, i32) -> i32 loc(#loc23)\0A      %196 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc23)\0A      %197 = \22stable_mosaic_gpu.arith.cmpi\22(%195, %196) {predicate = 1 : i64} : (i32, i32) -> i1 loc(#loc23)\0A      %198 = \22stable_mosaic_gpu.arith.xori\22(%192, %194) : (i32, i32) -> i32 loc(#loc23)\0A      \22stable_mosaic_gpu.memref.store\22(%198, %95) : (i32, memref<i32>) -> () loc(#loc23)\0A      %199 = \22stable_mosaic_gpu.arith.constant\22() {value = 10000000 : i32} : () -> i32 loc(#loc23)\0A      %200 = \22stable_mosaic_gpu.arith.extui\22(%197) : (i1) -> i32 loc(#loc23)\0A      %201 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94, %191) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc23)\0A      \22stable_mosaic_gpu.nvvm.mbarrier.try_wait.parity.shared\22(%201, %200, %199) : (!llvm.ptr<3>, i32, i32) -> () loc(#loc23)\0A      %202 = \22stable_mosaic_gpu.arith.index_cast\22(%188) : (i32) -> index loc(#loc24)\0A      %203 = \22stable_mosaic_gpu.memref.subview\22(%161, %202) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc24)\0A      %204 = \22stable_mosaic_gpu.arith.index_cast\22(%188) : (i32) -> index loc(#loc24)\0A      %205 = \22stable_mosaic_gpu.memref.subview\22(%164, %204) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc24)\0A      %206 = \22stable_mosaic_gpu.gpu.block_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc24)\0A      %207 = \22stable_mosaic_gpu.arith.index_cast\22(%206) : (index) -> i32 loc(#loc24)\0A      %208 = \22stable_mosaic_gpu.memref.subview\22(%203) {operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 256>, static_strides = array<i64: 1>} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc25)\0A      %209 = \22stable_mosaic_gpu.memref.collapse_shape\22(%208) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc25)\0A      %210 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc25)\0A      %211 = \22stable_mosaic_gpu.arith.constant\22() {value = 128 : index} : () -> index loc(#loc25)\0A      %212 = \22stable_mosaic_gpu.arith.remui\22(%210, %211) : (index, index) -> index loc(#loc25)\0A      %213 = \22stable_mosaic_gpu.arith.constant\22() {value = 2 : index} : () -> index loc(#loc25)\0A      %214 = \22stable_mosaic_gpu.arith.muli\22(%212, %213) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc25)\0A      %215 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc25)\0A      %216 = \22stable_mosaic_gpu.arith.addi\22(%214, %215) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc25)\0A      %217 = \22stable_mosaic_gpu.vector.load\22(%209, %216) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> vector<2xf32> loc(#loc25)\0A      %218 = \22stable_mosaic_gpu.arith.constant\22() {value = 1.000000e+00 : f32} : () -> f32 loc(#loc26)\0A      %219 = \22stable_mosaic_gpu.vector.splat\22(%218) : (f32) -> vector<2xf32> loc(#loc26)\0A      %220 = \22stable_mosaic_gpu.arith.addf\22(%217, %219) {fastmath = #arith.fastmath<contract>} : (vector<2xf32>, vector<2xf32>) -> vector<2xf32> loc(#loc26)\0A      %221 = \22stable_mosaic_gpu.memref.subview\22(%205) {operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 256>, static_strides = array<i64: 1>} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\0A      %222 = \22stable_mosaic_gpu.memref.collapse_shape\22(%221) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\0A      %223 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc27)\0A      %224 = \22stable_mosaic_gpu.arith.constant\22() {value = 128 : index} : () -> index loc(#loc27)\0A      %225 = \22stable_mosaic_gpu.arith.remui\22(%223, %224) : (index, index) -> index loc(#loc27)\0A      %226 = \22stable_mosaic_gpu.arith.constant\22() {value = 2 : index} : () -> index loc(#loc27)\0A      %227 = \22stable_mosaic_gpu.arith.muli\22(%225, %226) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\0A      %228 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc27)\0A      %229 = \22stable_mosaic_gpu.arith.addi\22(%227, %228) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\0A      %230 = \22stable_mosaic_gpu.vector.load\22(%222, %229) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> vector<2xf32> loc(#loc27)\0A      %231 = \22stable_mosaic_gpu.memref.collapse_shape\22(%221) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\0A      %232 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc27)\0A      %233 = \22stable_mosaic_gpu.arith.constant\22() {value = 128 : index} : () -> index loc(#loc27)\0A      %234 = \22stable_mosaic_gpu.arith.remui\22(%232, %233) : (index, index) -> index loc(#loc27)\0A      %235 = \22stable_mosaic_gpu.arith.constant\22() {value = 2 : index} : () -> index loc(#loc27)\0A      %236 = \22stable_mosaic_gpu.arith.muli\22(%234, %235) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\0A      %237 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc27)\0A      %238 = \22stable_mosaic_gpu.arith.addi\22(%236, %237) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\0A      \22stable_mosaic_gpu.vector.store\22(%220, %231, %238) : (vector<2xf32>, memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> () loc(#loc27)\0A      \22stable_mosaic_gpu.nvvm.cp.async.bulk.commit.group\22() : () -> () loc(#loc28)\0A      %239 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc29)\0A      %240 = \22stable_mosaic_gpu.arith.addi\22(%186, %239) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc29)\0A      %241 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc22)\0A      %242 = \22stable_mosaic_gpu.arith.remsi\22(%240, %241) : (i32, i32) -> i32 loc(#loc22)\0A      %243 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc30)\0A      %244 = \22stable_mosaic_gpu.arith.cmpi\22(%186, %243) {predicate = 9 : i64} : (i32, i32) -> i1 loc(#loc30)\0A      %245 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc31)\0A      %246 = \22stable_mosaic_gpu.arith.cmpi\22(%240, %245) {predicate = 6 : i64} : (i32, i32) -> i1 loc(#loc31)\0A      %247 = \22stable_mosaic_gpu.arith.andi\22(%244, %246) : (i1, i1) -> i1 loc(#loc32)\0A      %248 = \22stable_mosaic_gpu.arith.extui\22(%247) : (i1) -> i32 loc(#loc33)\0A      %249 = \22stable_mosaic_gpu.arith.index_cast\22(%248) : (i32) -> index loc(#loc34)\0A      \22stable_mosaic_gpu.scf.index_switch\22(%249) ({\0A        %313 = \22stable_mosaic_gpu.arith.index_cast\22(%242) : (i32) -> index loc(#loc19)\0A        %314 = \22stable_mosaic_gpu.memref.subview\22(%161, %313) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc19)\0A        %315 = \22stable_mosaic_gpu.arith.index_cast\22(%242) : (i32) -> index loc(#loc19)\0A        %316 = \22stable_mosaic_gpu.arith.index_castui\22(%315) : (index) -> i32 loc(#loc19)\0A        %317 = \22stable_mosaic_gpu.arith.addi\22(%125, %316) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc19)\0A        %318 = \22stable_mosaic_gpu.arith.constant\22() {value = 8 : i32} : () -> i32 loc(#loc19)\0A        %319 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94, %317) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\0A        \22stable_mosaic_gpu.nvvm.mbarrier.arrive.expect_tx.shared\22(%319, %318) : (!llvm.ptr<3>, i32) -> () loc(#loc19)\0A        %320 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc19)\0A        %321 = \22stable_mosaic_gpu.arith.index_cast\22(%320) : (index) -> i32 loc(#loc19)\0A        %322 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%314) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc19)\0A        %323 = \22stable_mosaic_gpu.llvm.extractvalue\22(%322) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc19)\0A        %324 = \22stable_mosaic_gpu.llvm.extractvalue\22(%322) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc19)\0A        %325 = \22stable_mosaic_gpu.arith.constant\22() {value = 4 : i64} : () -> i64 loc(#loc19)\0A        %326 = \22stable_mosaic_gpu.llvm.mul\22(%324, %325) : (i64, i64) -> i64 loc(#loc19)\0A        %327 = \22stable_mosaic_gpu.llvm.ptrtoint\22(%323) : (!llvm.ptr<3>) -> i64 loc(#loc19)\0A        %328 = \22stable_mosaic_gpu.llvm.add\22(%327, %326) : (i64, i64) -> i64 loc(#loc19)\0A        %329 = \22stable_mosaic_gpu.llvm.inttoptr\22(%328) : (i64) -> !llvm.ptr<3> loc(#loc19)\0A        %330 = \22stable_mosaic_gpu.arith.constant\22() {value = 1024 : i32} : () -> i32 loc(#loc19)\0A        %331 = \22stable_mosaic_gpu.llvm.getelementptr\22(%94, %317) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\0A        \22stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.shared.cluster.global\22(%329, %83, %321, %331, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 1>} : (!llvm.ptr<3>, !llvm.ptr, i32, !llvm.ptr<3>, i1) -> () loc(#loc19)\0A        \22stable_mosaic_gpu.scf.yield\22() : () -> () loc(#loc16)\0A      }, {\0A        \22stable_mosaic_gpu.scf.yield\22() : () -> () loc(#loc34)\0A      }) {cases = array<i64: 0>} : (index) -> () loc(#loc34)\0A      %250 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc21)\0A      %251 = \22stable_mosaic_gpu.arith.addi\22(%184, %250) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc21)\0A      \22stable_mosaic_gpu.nvvm.fence.proxy\22() {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} : () -> () loc(#loc35)\0A      %252 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc35)\0A      %253 = \22stable_mosaic_gpu.arith.index_cast\22(%252) : (index) -> i32 loc(#loc35)\0A      %254 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim x>} : () -> index loc(#loc35)\0A      %255 = \22stable_mosaic_gpu.arith.index_cast\22(%254) : (index) -> i32 loc(#loc35)\0A      %256 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim y>} : () -> index loc(#loc35)\0A      %257 = \22stable_mosaic_gpu.arith.index_cast\22(%256) : (index) -> i32 loc(#loc35)\0A      %258 = \22stable_mosaic_gpu.arith.muli\22(%257, %255) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %259 = \22stable_mosaic_gpu.arith.addi\22(%253, %258) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %260 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim y>} : () -> index loc(#loc35)\0A      %261 = \22stable_mosaic_gpu.arith.index_cast\22(%260) : (index) -> i32 loc(#loc35)\0A      %262 = \22stable_mosaic_gpu.arith.muli\22(%255, %261) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %263 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim z>} : () -> index loc(#loc35)\0A      %264 = \22stable_mosaic_gpu.arith.index_cast\22(%263) : (index) -> i32 loc(#loc35)\0A      %265 = \22stable_mosaic_gpu.arith.muli\22(%264, %262) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %266 = \22stable_mosaic_gpu.arith.addi\22(%259, %265) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %267 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim z>} : () -> index loc(#loc35)\0A      %268 = \22stable_mosaic_gpu.arith.index_cast\22(%267) : (index) -> i32 loc(#loc35)\0A      %269 = \22stable_mosaic_gpu.arith.muli\22(%262, %268) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %270 = \22stable_mosaic_gpu.arith.constant\22() {value = 7 : i32} : () -> i32 loc(#loc35)\0A      %271 = \22stable_mosaic_gpu.arith.shrui\22(%266, %270) : (i32, i32) -> i32 loc(#loc35)\0A      %272 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc35)\0A      %273 = \22stable_mosaic_gpu.arith.addi\22(%271, %272) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\0A      %274 = \22stable_mosaic_gpu.llvm.inline_asm\22(%273) {asm_string = \22bar.sync $0, 128;\22, constraints = \22r\22, has_side_effects} : (i32) -> !llvm.void loc(#loc35)\0A      %275 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : i32} : () -> i32 loc(#loc22)\0A      %276 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc22)\0A      %277 = \22stable_mosaic_gpu.arith.remsi\22(%275, %276) : (i32, i32) -> i32 loc(#loc22)\0A      %278 = \22stable_mosaic_gpu.arith.index_cast\22(%277) : (i32) -> index loc(#loc18)\0A      %279 = \22stable_mosaic_gpu.memref.subview\22(%164, %278) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc18)\0A      %280 = \22stable_mosaic_gpu.arith.constant\22() {value = 0 : index} : () -> index loc(#loc18)\0A      %281 = \22stable_mosaic_gpu.arith.index_cast\22(%280) : (index) -> i32 loc(#loc18)\0A      %282 = \22stable_mosaic_gpu.builtin.unrealized_conversion_cast\22(%279) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc18)\0A      %283 = \22stable_mosaic_gpu.llvm.extractvalue\22(%282) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc18)\0A      %284 = \22stable_mosaic_gpu.llvm.extractvalue\22(%282) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc18)\0A      %285 = \22stable_mosaic_gpu.arith.constant\22() {value = 4 : i64} : () -> i64 loc(#loc18)\0A      %286 = \22stable_mosaic_gpu.llvm.mul\22(%284, %285) : (i64, i64) -> i64 loc(#loc18)\0A      %287 = \22stable_mosaic_gpu.llvm.ptrtoint\22(%283) : (!llvm.ptr<3>) -> i64 loc(#loc18)\0A      %288 = \22stable_mosaic_gpu.llvm.add\22(%287, %286) : (i64, i64) -> i64 loc(#loc18)\0A      %289 = \22stable_mosaic_gpu.llvm.inttoptr\22(%288) : (i64) -> !llvm.ptr<3> loc(#loc18)\0A      \22stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.global.shared.cta\22(%82, %289, %281, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1>} : (!llvm.ptr, !llvm.ptr<3>, i32, i1) -> () loc(#loc18)\0A      \22stable_mosaic_gpu.nvvm.cp.async.bulk.commit.group\22() : () -> () loc(#loc28)\0A      \22stable_mosaic_gpu.nvvm.cp.async.bulk.wait_group\22() {group = 0 : i32} : () -> () loc(#loc36)\0A      %290 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim x>} : () -> index loc(#loc36)\0A      %291 = \22stable_mosaic_gpu.arith.index_cast\22(%290) : (index) -> i32 loc(#loc36)\0A      %292 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim x>} : () -> index loc(#loc36)\0A      %293 = \22stable_mosaic_gpu.arith.index_cast\22(%292) : (index) -> i32 loc(#loc36)\0A      %294 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim y>} : () -> index loc(#loc36)\0A      %295 = \22stable_mosaic_gpu.arith.index_cast\22(%294) : (index) -> i32 loc(#loc36)\0A      %296 = \22stable_mosaic_gpu.arith.muli\22(%295, %293) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %297 = \22stable_mosaic_gpu.arith.addi\22(%291, %296) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %298 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim y>} : () -> index loc(#loc36)\0A      %299 = \22stable_mosaic_gpu.arith.index_cast\22(%298) : (index) -> i32 loc(#loc36)\0A      %300 = \22stable_mosaic_gpu.arith.muli\22(%293, %299) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %301 = \22stable_mosaic_gpu.gpu.thread_id\22() {dimension = #gpu<dim z>} : () -> index loc(#loc36)\0A      %302 = \22stable_mosaic_gpu.arith.index_cast\22(%301) : (index) -> i32 loc(#loc36)\0A      %303 = \22stable_mosaic_gpu.arith.muli\22(%302, %300) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %304 = \22stable_mosaic_gpu.arith.addi\22(%297, %303) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %305 = \22stable_mosaic_gpu.gpu.block_dim\22() {dimension = #gpu<dim z>} : () -> index loc(#loc36)\0A      %306 = \22stable_mosaic_gpu.arith.index_cast\22(%305) : (index) -> i32 loc(#loc36)\0A      %307 = \22stable_mosaic_gpu.arith.muli\22(%300, %306) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %308 = \22stable_mosaic_gpu.arith.constant\22() {value = 7 : i32} : () -> i32 loc(#loc36)\0A      %309 = \22stable_mosaic_gpu.arith.shrui\22(%304, %308) : (i32, i32) -> i32 loc(#loc36)\0A      %310 = \22stable_mosaic_gpu.arith.constant\22() {value = 1 : i32} : () -> i32 loc(#loc36)\0A      %311 = \22stable_mosaic_gpu.arith.addi\22(%309, %310) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\0A      %312 = \22stable_mosaic_gpu.llvm.inline_asm\22(%311) {asm_string = \22bar.sync $0, 128;\22, constraints = \22r\22, has_side_effects} : (i32) -> !llvm.void loc(#loc36)\0A      \22stable_mosaic_gpu.gpu.terminator\22() : () -> () loc(#loc17)\0A    }) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1>, workgroup_attributions = 0 : i64} : (!gpu.async.token, index, index, index, index, index, index, i32) -> !gpu.async.token loc(#loc17)\0A    \22stable_mosaic_gpu.func.return\22() : () -> () loc(#loc17)\0A  }) {function_type = (!llvm.ptr, !llvm.ptr) -> (), llvm.emit_c_interface, sym_name = \22mosaic_gpu_body\22} : () -> () loc(#loc17)\0A}) {stable_mosaic_gpu.version = 1 : i64} : () -> () loc(#loc17)\0A#loc13 = loc(\22-\22:141:7)\0A#loc14 = loc(\22third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py\22:78:19)\0A#loc15 = loc(\22third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py\22:78:6)\0A#loc16 = loc(\22-\22:279:7)\0A#loc18 = loc(\22/copy_smem_to_gmem\22(#loc))\0A#loc19 = loc(\22/copy_gmem_to_smem\22(#loc))\0A#loc20 = loc(\22/run_scoped\22(#loc))\0A#loc21 = loc(\22/scan\22(#loc))\0A#loc22 = loc(\22/rem\22(#loc))\0A#loc23 = loc(\22/barrier_wait\22(#loc))\0A#loc24 = loc(\22/jaxpr_call\22(#loc))\0A#loc25 = loc(\22/get\22(#loc14))\0A#loc26 = loc(\22/add\22(#loc14))\0A#loc27 = loc(\22/swap\22(#loc15))\0A#loc28 = loc(\22/commit_group\22(#loc))\0A#loc29 = loc(\22/add\22(#loc))\0A#loc30 = loc(\22/ge\22(#loc))\0A#loc31 = loc(\22/lt\22(#loc))\0A#loc32 = loc(\22/and\22(#loc))\0A#loc33 = loc(\22/convert_element_type\22(#loc))\0A#loc34 = loc(\22/cond\22(#loc))\0A#loc35 = loc(\22/commit_smem\22(#loc))\0A#loc36 = loc(\22/wait_smem_to_gmem\22(#loc))\0A", operand_layouts = [dense<0> : tensor<1xindex>], result_layouts = [dense<0> : tensor<1xindex>]} : (tensor<256xf32>) -> tensor<256xf32> loc(#loc3)
    return %0 : tensor<256xf32> loc(#loc)
  } loc(#loc)
} loc(#loc)
#loc = loc(unknown)
#loc2 = loc("third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py":83:4)
#loc3 = loc("jit(wrapped)/jit(main)/pallas_call"(#loc2))
""",
    mlir_module_serialized=b'ML\xefR\rStableHLO_v1.9.7\x00\x01\x19\x05\x01\x05\t\x01\x03\x0b\x03\x07\x0f\x13\x17\x03_=\x0f\x01\x1d\x07\x0f#\x0b\x0f\x0b\x0b\x0b\x0f\x0b\x0f\x0b\x13\x0b\x03!\x0b\x0f\x0f\x0b\x0b\x0f\x13\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b/\x01\x05\x0b\x0f\x03\x0b\x17\x17\x07\x13\x07\x02\xd5\x1f\x11\x03\x05\x03\x07\x07\t\x0b\x03\r\x03\x05\r\x11\x01\x00\x05\x0f\x05\x11\x05\x13\x1d\x13\x01\x05\x15\x1d\x17\x19\x05\x17\x17\x1b\xa7\t\x05\x19\x03\x01\x03\x03;\x03\x03#\r\x01#\x07\x03\x03)\r\x03+-\x1d\x1b\x1d\x1d\x1d\x1f\x1d!\x0b\x05\x1d#\x1d%\x05\x01\x1f\x0b\x11\x00\x00\x00\x00\x00\x00\x00\x00\x01\t\x01\x02\x02)\x03\x02\x08\t\x11\x03\x05\x03\x05\t)\x03\x05\r\x13\x04O\x05\x01Q\x01\x05\x01\x07\x04=\x03\x01\x05\x03P\x01\x03\x07\x04)\x03\x05\x0b\x03\x0b\x11\x00\x05F\x15\x05\x03\x05\x03\x01\x07\x04\x01\x03\x03\x06\x03\x01\x05\x01\x00D\xae\x05\'\x17\xa4\xa4\x05\x0f\x0b\x0f!\x85G\x11\x19%)9\x15\x1f\x11\x0f\x0b\x11builtin\x00vhlo\x00module\x00func_v1\x00custom_call_v1\x00return_v1\x00jax.uses_shape_polymorphism\x00mhlo.num_partitions\x00mhlo.num_replicas\x00jit_wrapped\x00args[0]\x00jit(wrapped)/jit(main)/pallas_call\x00third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py\x00jax.result_info\x00result\x00main\x00public\x00\xa9C\xfb\x81\x9a1\xc2?\x0e\xf4\xe1\xe4\xe77\x03\xb6\x97\xe5G(]WR\x98\xeb{\xba\x8a\x84\x01\x12\'#loc = loc("third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py":83:4)\n#loc1 = loc("-":94:40)\n#loc2 = loc("-":94:47)\n#loc3 = loc("-":94:54)\n#loc4 = loc("-":94:116)\n#loc5 = loc("-":94:123)\n#loc6 = loc("-":94:130)\n#loc7 = loc("-":94:65)\n#loc8 = loc("-":94:78)\n#loc9 = loc("-":94:91)\n#loc10 = loc("-":94:141)\n#loc11 = loc("-":94:157)\n#loc12 = loc("-":94:174)\n#loc17 = loc("jit(wrapped)/jit(main)/pallas_call"(#loc))\n"builtin.module"() <{sym_name = "add_one"}> ({\n  "stable_mosaic_gpu.func.func"() ({\n  }) {function_type = (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> (), sym_name = "mosaic_gpu_init_tma_desc", sym_visibility = "private"} : () -> () loc(#loc17)\n  "stable_mosaic_gpu.llvm.mlir.global"() ({\n  }) {addr_space = 4 : i32, global_type = !llvm.array<0 x i8>, linkage = #llvm.linkage<external>, sym_name = "global_scratch", unnamed_addr = 0 : i64, visibility_ = 0 : i64} : () -> () loc(#loc17)\n  "stable_mosaic_gpu.func.func"() ({\n  ^bb0(%arg0: !llvm.ptr loc("jit(wrapped)/jit(main)/pallas_call"(#loc)), %arg1: !llvm.ptr loc("jit(wrapped)/jit(main)/pallas_call"(#loc))):\n    %0 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%arg0) : (!llvm.ptr) -> !gpu.async.token loc(#loc17)\n    %1 = "stable_mosaic_gpu.llvm.getelementptr"(%arg1) {elem_type = !llvm.ptr, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %2 = "stable_mosaic_gpu.llvm.load"(%1) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %3 = "stable_mosaic_gpu.llvm.mlir.undef"() : () -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %4 = "stable_mosaic_gpu.llvm.insertvalue"(%3, %2) {position = array<i64: 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %5 = "stable_mosaic_gpu.llvm.insertvalue"(%4, %2) {position = array<i64: 1>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %6 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 0 : i64} : () -> i64 loc(#loc17)\n    %7 = "stable_mosaic_gpu.llvm.insertvalue"(%5, %6) {position = array<i64: 2>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %8 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 256 : i64} : () -> i64 loc(#loc17)\n    %9 = "stable_mosaic_gpu.llvm.insertvalue"(%7, %8) {position = array<i64: 3, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %10 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %11 = "stable_mosaic_gpu.llvm.insertvalue"(%9, %10) {position = array<i64: 4, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %12 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%11) : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>) -> memref<256xf32> loc(#loc17)\n    %13 = "stable_mosaic_gpu.llvm.getelementptr"(%arg1) {elem_type = !llvm.ptr, rawConstantIndices = array<i32: 1>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %14 = "stable_mosaic_gpu.llvm.load"(%13) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %15 = "stable_mosaic_gpu.llvm.mlir.undef"() : () -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %16 = "stable_mosaic_gpu.llvm.insertvalue"(%15, %14) {position = array<i64: 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %17 = "stable_mosaic_gpu.llvm.insertvalue"(%16, %14) {position = array<i64: 1>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %18 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 0 : i64} : () -> i64 loc(#loc17)\n    %19 = "stable_mosaic_gpu.llvm.insertvalue"(%17, %18) {position = array<i64: 2>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %20 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 256 : i64} : () -> i64 loc(#loc17)\n    %21 = "stable_mosaic_gpu.llvm.insertvalue"(%19, %20) {position = array<i64: 3, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %22 = "stable_mosaic_gpu.llvm.mlir.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %23 = "stable_mosaic_gpu.llvm.insertvalue"(%21, %22) {position = array<i64: 4, 0>} : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n    %24 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%23) : (!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>) -> memref<256xf32> loc(#loc17)\n    %25 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %26 = "stable_mosaic_gpu.llvm.alloca"(%25) {alignment = 64 : i64, elem_type = !llvm.array<256 x i8>} : (i64) -> !llvm.ptr loc(#loc17)\n    %27 = "stable_mosaic_gpu.llvm.getelementptr"(%26) {elem_type = i8, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %28:4 = "stable_mosaic_gpu.memref.extract_strided_metadata"(%12) : (memref<256xf32>) -> (memref<f32>, index, index, index) loc(#loc17)\n    %29 = "stable_mosaic_gpu.memref.extract_aligned_pointer_as_index"(%12) : (memref<256xf32>) -> index loc(#loc17)\n    %30 = "stable_mosaic_gpu.arith.index_cast"(%29) : (index) -> i64 loc(#loc17)\n    %31 = "stable_mosaic_gpu.llvm.inttoptr"(%30) : (i64) -> !llvm.ptr loc(#loc17)\n    %32 = "stable_mosaic_gpu.arith.index_cast"(%28#1) : (index) -> i64 loc(#loc17)\n    %33 = "stable_mosaic_gpu.llvm.getelementptr"(%31, %32) {elem_type = f32, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr, i64) -> !llvm.ptr loc(#loc17)\n    %34 = "stable_mosaic_gpu.arith.constant"() {value = 6 : i64} : () -> i64 loc(#loc17)\n    %35 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %36 = "stable_mosaic_gpu.arith.index_cast"(%28#2) : (index) -> i64 loc(#loc17)\n    %37 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %38 = "stable_mosaic_gpu.llvm.alloca"(%37) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %39 = "stable_mosaic_gpu.llvm.getelementptr"(%38) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%36, %39) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    %40 = "stable_mosaic_gpu.arith.index_cast"(%28#3) : (index) -> i64 loc(#loc17)\n    %41 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %42 = "stable_mosaic_gpu.llvm.alloca"(%41) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %43 = "stable_mosaic_gpu.llvm.getelementptr"(%42) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%40, %43) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    %44 = "stable_mosaic_gpu.arith.constant"() {value = 16 : i64} : () -> i64 loc(#loc17)\n    %45 = "stable_mosaic_gpu.arith.constant"() {value = 256 : i64} : () -> i64 loc(#loc17)\n    %46 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %47 = "stable_mosaic_gpu.llvm.alloca"(%46) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %48 = "stable_mosaic_gpu.llvm.getelementptr"(%47) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%45, %48) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    "stable_mosaic_gpu.func.call"(%27, %33, %34, %35, %38, %42, %44, %47) {callee = @mosaic_gpu_init_tma_desc} : (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () loc(#loc17)\n    %49 = "stable_mosaic_gpu.llvm.getelementptr"(%26) {elem_type = i8, rawConstantIndices = array<i32: 128>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    %50:4 = "stable_mosaic_gpu.memref.extract_strided_metadata"(%24) : (memref<256xf32>) -> (memref<f32>, index, index, index) loc(#loc17)\n    %51 = "stable_mosaic_gpu.memref.extract_aligned_pointer_as_index"(%24) : (memref<256xf32>) -> index loc(#loc17)\n    %52 = "stable_mosaic_gpu.arith.index_cast"(%51) : (index) -> i64 loc(#loc17)\n    %53 = "stable_mosaic_gpu.llvm.inttoptr"(%52) : (i64) -> !llvm.ptr loc(#loc17)\n    %54 = "stable_mosaic_gpu.arith.index_cast"(%50#1) : (index) -> i64 loc(#loc17)\n    %55 = "stable_mosaic_gpu.llvm.getelementptr"(%53, %54) {elem_type = f32, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr, i64) -> !llvm.ptr loc(#loc17)\n    %56 = "stable_mosaic_gpu.arith.constant"() {value = 6 : i64} : () -> i64 loc(#loc17)\n    %57 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %58 = "stable_mosaic_gpu.arith.index_cast"(%50#2) : (index) -> i64 loc(#loc17)\n    %59 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %60 = "stable_mosaic_gpu.llvm.alloca"(%59) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %61 = "stable_mosaic_gpu.llvm.getelementptr"(%60) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%58, %61) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    %62 = "stable_mosaic_gpu.arith.index_cast"(%50#3) : (index) -> i64 loc(#loc17)\n    %63 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %64 = "stable_mosaic_gpu.llvm.alloca"(%63) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %65 = "stable_mosaic_gpu.llvm.getelementptr"(%64) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%62, %65) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    %66 = "stable_mosaic_gpu.arith.constant"() {value = 16 : i64} : () -> i64 loc(#loc17)\n    %67 = "stable_mosaic_gpu.arith.constant"() {value = 256 : i64} : () -> i64 loc(#loc17)\n    %68 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n    %69 = "stable_mosaic_gpu.llvm.alloca"(%68) {elem_type = i64} : (i64) -> !llvm.ptr loc(#loc17)\n    %70 = "stable_mosaic_gpu.llvm.getelementptr"(%69) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc17)\n    "stable_mosaic_gpu.llvm.store"(%67, %70) {ordering = 0 : i64} : (i64, !llvm.ptr) -> () loc(#loc17)\n    "stable_mosaic_gpu.func.call"(%49, %55, %56, %57, %60, %64, %66, %69) {callee = @mosaic_gpu_init_tma_desc} : (!llvm.ptr, !llvm.ptr, i64, i64, !llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> () loc(#loc17)\n    %71 = "stable_mosaic_gpu.llvm.load"(%26) {ordering = 0 : i64} : (!llvm.ptr) -> !llvm.array<256 x i8> loc(#loc17)\n    %72 = "stable_mosaic_gpu.arith.constant"() {value = 2 : index} : () -> index loc(#loc17)\n    %73 = "stable_mosaic_gpu.arith.constant"() {value = 1 : index} : () -> index loc(#loc17)\n    %74 = "stable_mosaic_gpu.arith.constant"() {value = 1 : index} : () -> index loc(#loc17)\n    %75 = "stable_mosaic_gpu.arith.constant"() {value = 128 : index} : () -> index loc(#loc17)\n    %76 = "stable_mosaic_gpu.arith.constant"() {value = 1 : index} : () -> index loc(#loc17)\n    %77 = "stable_mosaic_gpu.arith.constant"() {value = 1 : index} : () -> index loc(#loc17)\n    %78 = "stable_mosaic_gpu.arith.constant"() {value = 2056 : i32} : () -> i32 loc(#loc17)\n    %79 = "stable_mosaic_gpu.gpu.launch"(%0, %72, %73, %74, %75, %76, %77, %78) ({\n    ^bb0(%arg2: index loc("-":94:40), %arg3: index loc("-":94:47), %arg4: index loc("-":94:54), %arg5: index loc("-":94:116), %arg6: index loc("-":94:123), %arg7: index loc("-":94:130), %arg8: index loc("-":94:65), %arg9: index loc("-":94:78), %arg10: index loc("-":94:91), %arg11: index loc("-":94:141), %arg12: index loc("-":94:157), %arg13: index loc("-":94:174)):\n      %80 = "stable_mosaic_gpu.gpu.dynamic_shared_memory"() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc17)\n      %81 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%71) : (!llvm.array<256 x i8>) -> !llvm.ptr loc(#loc17)\n      %82 = "stable_mosaic_gpu.llvm.getelementptr"(%81) {elem_type = i8, rawConstantIndices = array<i32: 128>} : (!llvm.ptr) -> !llvm.ptr loc(#loc18)\n      %83 = "stable_mosaic_gpu.llvm.getelementptr"(%81) {elem_type = i8, rawConstantIndices = array<i32: 0>} : (!llvm.ptr) -> !llvm.ptr loc(#loc19)\n      %84 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc17)\n      %85 = "stable_mosaic_gpu.memref.view"(%80, %84) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<2048xi8, #gpu.address_space<workgroup>> loc(#loc17)\n      %86 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%80) : (memref<?xi8, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc17)\n      %87 = "stable_mosaic_gpu.llvm.extractvalue"(%86) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc17)\n      %88 = "stable_mosaic_gpu.llvm.extractvalue"(%86) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc17)\n      %89 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i64} : () -> i64 loc(#loc17)\n      %90 = "stable_mosaic_gpu.llvm.mul"(%88, %89) : (i64, i64) -> i64 loc(#loc17)\n      %91 = "stable_mosaic_gpu.llvm.ptrtoint"(%87) : (!llvm.ptr<3>) -> i64 loc(#loc17)\n      %92 = "stable_mosaic_gpu.llvm.add"(%91, %90) : (i64, i64) -> i64 loc(#loc17)\n      %93 = "stable_mosaic_gpu.llvm.inttoptr"(%92) : (i64) -> !llvm.ptr<3> loc(#loc17)\n      %94 = "stable_mosaic_gpu.llvm.getelementptr"(%93) {elem_type = i8, rawConstantIndices = array<i32: 2048>} : (!llvm.ptr<3>) -> !llvm.ptr<3> loc(#loc17)\n      %95 = "stable_mosaic_gpu.memref.alloca"() {operandSegmentSizes = array<i32: 0, 0>} : () -> memref<i32> loc(#loc17)\n      %96 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      "stable_mosaic_gpu.memref.store"(%96, %95) : (i32, memref<i32>) -> () loc(#loc17)\n      %97 = "stable_mosaic_gpu.nvvm.elect.sync"() : () -> i1 loc(#loc17)\n      %98 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\n      %99 = "stable_mosaic_gpu.arith.index_cast"(%98) : (index) -> i32 loc(#loc17)\n      %100 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\n      %101 = "stable_mosaic_gpu.arith.index_cast"(%100) : (index) -> i32 loc(#loc17)\n      %102 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\n      %103 = "stable_mosaic_gpu.arith.index_cast"(%102) : (index) -> i32 loc(#loc17)\n      %104 = "stable_mosaic_gpu.arith.muli"(%103, %101) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %105 = "stable_mosaic_gpu.arith.addi"(%99, %104) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %106 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\n      %107 = "stable_mosaic_gpu.arith.index_cast"(%106) : (index) -> i32 loc(#loc17)\n      %108 = "stable_mosaic_gpu.arith.muli"(%101, %107) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %109 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\n      %110 = "stable_mosaic_gpu.arith.index_cast"(%109) : (index) -> i32 loc(#loc17)\n      %111 = "stable_mosaic_gpu.arith.muli"(%110, %108) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %112 = "stable_mosaic_gpu.arith.addi"(%105, %111) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %113 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\n      %114 = "stable_mosaic_gpu.arith.index_cast"(%113) : (index) -> i32 loc(#loc17)\n      %115 = "stable_mosaic_gpu.arith.muli"(%108, %114) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %116 = "stable_mosaic_gpu.arith.constant"() {value = 5 : i32} : () -> i32 loc(#loc17)\n      %117 = "stable_mosaic_gpu.arith.shrui"(%112, %116) : (i32, i32) -> i32 loc(#loc17)\n      %118 = "stable_mosaic_gpu.arith.constant"() {value = -1 : i32} : () -> i32 loc(#loc17)\n      %119 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      %120 = "stable_mosaic_gpu.arith.constant"() {value = 31 : i32} : () -> i32 loc(#loc17)\n      %121 = "stable_mosaic_gpu.nvvm.shfl.sync"(%118, %117, %119, %120) {kind = #nvvm<shfl_kind idx>} : (i32, i32, i32, i32) -> i32 loc(#loc17)\n      %122 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      %123 = "stable_mosaic_gpu.arith.cmpi"(%121, %122) {predicate = 0 : i64} : (i32, i32) -> i1 loc(#loc17)\n      %124 = "stable_mosaic_gpu.arith.andi"(%123, %97) : (i1, i1) -> i1 loc(#loc17)\n      "stable_mosaic_gpu.scf.if"(%124) ({\n        %332 = "stable_mosaic_gpu.llvm.getelementptr"(%94) {elem_type = i64, rawConstantIndices = array<i32: 0>} : (!llvm.ptr<3>) -> !llvm.ptr<3> loc(#loc17)\n        %333 = "stable_mosaic_gpu.arith.constant"() {value = 128 : i32} : () -> i32 loc(#loc17)\n        "stable_mosaic_gpu.nvvm.mbarrier.init.shared"(%332, %333) : (!llvm.ptr<3>, i32) -> () loc(#loc17)\n        "stable_mosaic_gpu.scf.yield"() : () -> () loc(#loc13)\n      }, {\n      }) : (i1) -> () loc(#loc17)\n      %125 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      "stable_mosaic_gpu.nvvm.fence.mbarrier.init"() : () -> () loc(#loc17)\n      "stable_mosaic_gpu.gpu.barrier"() : () -> () loc(#loc17)\n      %126 = "stable_mosaic_gpu.nvvm.elect.sync"() : () -> i1 loc(#loc17)\n      %127 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\n      %128 = "stable_mosaic_gpu.arith.index_cast"(%127) : (index) -> i32 loc(#loc17)\n      %129 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\n      %130 = "stable_mosaic_gpu.arith.index_cast"(%129) : (index) -> i32 loc(#loc17)\n      %131 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\n      %132 = "stable_mosaic_gpu.arith.index_cast"(%131) : (index) -> i32 loc(#loc17)\n      %133 = "stable_mosaic_gpu.arith.muli"(%132, %130) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %134 = "stable_mosaic_gpu.arith.addi"(%128, %133) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %135 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim y>} : () -> index loc(#loc17)\n      %136 = "stable_mosaic_gpu.arith.index_cast"(%135) : (index) -> i32 loc(#loc17)\n      %137 = "stable_mosaic_gpu.arith.muli"(%130, %136) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %138 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\n      %139 = "stable_mosaic_gpu.arith.index_cast"(%138) : (index) -> i32 loc(#loc17)\n      %140 = "stable_mosaic_gpu.arith.muli"(%139, %137) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %141 = "stable_mosaic_gpu.arith.addi"(%134, %140) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %142 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim z>} : () -> index loc(#loc17)\n      %143 = "stable_mosaic_gpu.arith.index_cast"(%142) : (index) -> i32 loc(#loc17)\n      %144 = "stable_mosaic_gpu.arith.muli"(%137, %143) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc17)\n      %145 = "stable_mosaic_gpu.arith.constant"() {value = 5 : i32} : () -> i32 loc(#loc17)\n      %146 = "stable_mosaic_gpu.arith.shrui"(%141, %145) : (i32, i32) -> i32 loc(#loc17)\n      %147 = "stable_mosaic_gpu.arith.constant"() {value = -1 : i32} : () -> i32 loc(#loc17)\n      %148 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      %149 = "stable_mosaic_gpu.arith.constant"() {value = 31 : i32} : () -> i32 loc(#loc17)\n      %150 = "stable_mosaic_gpu.nvvm.shfl.sync"(%147, %146, %148, %149) {kind = #nvvm<shfl_kind idx>} : (i32, i32, i32, i32) -> i32 loc(#loc17)\n      %151 = "stable_mosaic_gpu.arith.constant"() {value = 4 : i32} : () -> i32 loc(#loc17)\n      %152 = "stable_mosaic_gpu.arith.remui"(%150, %151) : (i32, i32) -> i32 loc(#loc17)\n      %153 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc17)\n      %154 = "stable_mosaic_gpu.arith.cmpi"(%152, %153) {predicate = 0 : i64} : (i32, i32) -> i1 loc(#loc17)\n      %155 = "stable_mosaic_gpu.arith.andi"(%154, %126) : (i1, i1) -> i1 loc(#loc17)\n      %156 = "stable_mosaic_gpu.nvvm.elect.sync"() : () -> i1 loc(#loc17)\n      %157 = "stable_mosaic_gpu.gpu.block_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc17)\n      %158 = "stable_mosaic_gpu.arith.index_cast"(%157) : (index) -> i32 loc(#loc17)\n      %159 = "stable_mosaic_gpu.gpu.dynamic_shared_memory"() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc20)\n      %160 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc20)\n      %161 = "stable_mosaic_gpu.memref.view"(%159, %160) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<1x256xf32, #gpu.address_space<workgroup>> loc(#loc20)\n      %162 = "stable_mosaic_gpu.gpu.dynamic_shared_memory"() : () -> memref<?xi8, #gpu.address_space<workgroup>> loc(#loc20)\n      %163 = "stable_mosaic_gpu.arith.constant"() {value = 1024 : index} : () -> index loc(#loc20)\n      %164 = "stable_mosaic_gpu.memref.view"(%162, %163) : (memref<?xi8, #gpu.address_space<workgroup>>, index) -> memref<1x256xf32, #gpu.address_space<workgroup>> loc(#loc20)\n      %165 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc19)\n      %166 = "stable_mosaic_gpu.memref.subview"(%161, %165) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc19)\n      %167 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc19)\n      %168 = "stable_mosaic_gpu.arith.index_castui"(%167) : (index) -> i32 loc(#loc19)\n      %169 = "stable_mosaic_gpu.arith.addi"(%125, %168) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc19)\n      %170 = "stable_mosaic_gpu.arith.constant"() {value = 8 : i32} : () -> i32 loc(#loc19)\n      %171 = "stable_mosaic_gpu.llvm.getelementptr"(%94, %169) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\n      "stable_mosaic_gpu.nvvm.mbarrier.arrive.expect_tx.shared"(%171, %170) : (!llvm.ptr<3>, i32) -> () loc(#loc19)\n      %172 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc19)\n      %173 = "stable_mosaic_gpu.arith.index_cast"(%172) : (index) -> i32 loc(#loc19)\n      %174 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%166) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc19)\n      %175 = "stable_mosaic_gpu.llvm.extractvalue"(%174) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc19)\n      %176 = "stable_mosaic_gpu.llvm.extractvalue"(%174) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc19)\n      %177 = "stable_mosaic_gpu.arith.constant"() {value = 4 : i64} : () -> i64 loc(#loc19)\n      %178 = "stable_mosaic_gpu.llvm.mul"(%176, %177) : (i64, i64) -> i64 loc(#loc19)\n      %179 = "stable_mosaic_gpu.llvm.ptrtoint"(%175) : (!llvm.ptr<3>) -> i64 loc(#loc19)\n      %180 = "stable_mosaic_gpu.llvm.add"(%179, %178) : (i64, i64) -> i64 loc(#loc19)\n      %181 = "stable_mosaic_gpu.llvm.inttoptr"(%180) : (i64) -> !llvm.ptr<3> loc(#loc19)\n      %182 = "stable_mosaic_gpu.arith.constant"() {value = 1024 : i32} : () -> i32 loc(#loc19)\n      %183 = "stable_mosaic_gpu.llvm.getelementptr"(%94, %169) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\n      "stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.shared.cluster.global"(%181, %83, %173, %183, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 1>} : (!llvm.ptr<3>, !llvm.ptr, i32, !llvm.ptr<3>, i1) -> () loc(#loc19)\n      %184 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc21)\n      %185 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc21)\n      %186 = "stable_mosaic_gpu.arith.addi"(%185, %184) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc21)\n      %187 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc22)\n      %188 = "stable_mosaic_gpu.arith.remsi"(%186, %187) : (i32, i32) -> i32 loc(#loc22)\n      %189 = "stable_mosaic_gpu.arith.index_cast"(%188) : (i32) -> index loc(#loc23)\n      %190 = "stable_mosaic_gpu.arith.index_castui"(%189) : (index) -> i32 loc(#loc23)\n      %191 = "stable_mosaic_gpu.arith.addi"(%125, %190) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc23)\n      %192 = "stable_mosaic_gpu.memref.load"(%95) : (memref<i32>) -> i32 loc(#loc23)\n      %193 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc23)\n      %194 = "stable_mosaic_gpu.arith.shli"(%193, %191) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc23)\n      %195 = "stable_mosaic_gpu.arith.andi"(%192, %194) : (i32, i32) -> i32 loc(#loc23)\n      %196 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc23)\n      %197 = "stable_mosaic_gpu.arith.cmpi"(%195, %196) {predicate = 1 : i64} : (i32, i32) -> i1 loc(#loc23)\n      %198 = "stable_mosaic_gpu.arith.xori"(%192, %194) : (i32, i32) -> i32 loc(#loc23)\n      "stable_mosaic_gpu.memref.store"(%198, %95) : (i32, memref<i32>) -> () loc(#loc23)\n      %199 = "stable_mosaic_gpu.arith.constant"() {value = 10000000 : i32} : () -> i32 loc(#loc23)\n      %200 = "stable_mosaic_gpu.arith.extui"(%197) : (i1) -> i32 loc(#loc23)\n      %201 = "stable_mosaic_gpu.llvm.getelementptr"(%94, %191) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc23)\n      "stable_mosaic_gpu.nvvm.mbarrier.try_wait.parity.shared"(%201, %200, %199) : (!llvm.ptr<3>, i32, i32) -> () loc(#loc23)\n      %202 = "stable_mosaic_gpu.arith.index_cast"(%188) : (i32) -> index loc(#loc24)\n      %203 = "stable_mosaic_gpu.memref.subview"(%161, %202) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc24)\n      %204 = "stable_mosaic_gpu.arith.index_cast"(%188) : (i32) -> index loc(#loc24)\n      %205 = "stable_mosaic_gpu.memref.subview"(%164, %204) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc24)\n      %206 = "stable_mosaic_gpu.gpu.block_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc24)\n      %207 = "stable_mosaic_gpu.arith.index_cast"(%206) : (index) -> i32 loc(#loc24)\n      %208 = "stable_mosaic_gpu.memref.subview"(%203) {operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 256>, static_strides = array<i64: 1>} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc25)\n      %209 = "stable_mosaic_gpu.memref.collapse_shape"(%208) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc25)\n      %210 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc25)\n      %211 = "stable_mosaic_gpu.arith.constant"() {value = 128 : index} : () -> index loc(#loc25)\n      %212 = "stable_mosaic_gpu.arith.remui"(%210, %211) : (index, index) -> index loc(#loc25)\n      %213 = "stable_mosaic_gpu.arith.constant"() {value = 2 : index} : () -> index loc(#loc25)\n      %214 = "stable_mosaic_gpu.arith.muli"(%212, %213) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc25)\n      %215 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc25)\n      %216 = "stable_mosaic_gpu.arith.addi"(%214, %215) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc25)\n      %217 = "stable_mosaic_gpu.vector.load"(%209, %216) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> vector<2xf32> loc(#loc25)\n      %218 = "stable_mosaic_gpu.arith.constant"() {value = 1.000000e+00 : f32} : () -> f32 loc(#loc26)\n      %219 = "stable_mosaic_gpu.vector.splat"(%218) : (f32) -> vector<2xf32> loc(#loc26)\n      %220 = "stable_mosaic_gpu.arith.addf"(%217, %219) {fastmath = #arith.fastmath<contract>} : (vector<2xf32>, vector<2xf32>) -> vector<2xf32> loc(#loc26)\n      %221 = "stable_mosaic_gpu.memref.subview"(%205) {operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0>, static_sizes = array<i64: 256>, static_strides = array<i64: 1>} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\n      %222 = "stable_mosaic_gpu.memref.collapse_shape"(%221) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\n      %223 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc27)\n      %224 = "stable_mosaic_gpu.arith.constant"() {value = 128 : index} : () -> index loc(#loc27)\n      %225 = "stable_mosaic_gpu.arith.remui"(%223, %224) : (index, index) -> index loc(#loc27)\n      %226 = "stable_mosaic_gpu.arith.constant"() {value = 2 : index} : () -> index loc(#loc27)\n      %227 = "stable_mosaic_gpu.arith.muli"(%225, %226) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\n      %228 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc27)\n      %229 = "stable_mosaic_gpu.arith.addi"(%227, %228) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\n      %230 = "stable_mosaic_gpu.vector.load"(%222, %229) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> vector<2xf32> loc(#loc27)\n      %231 = "stable_mosaic_gpu.memref.collapse_shape"(%221) {reassociation = [[0]]} : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc27)\n      %232 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc27)\n      %233 = "stable_mosaic_gpu.arith.constant"() {value = 128 : index} : () -> index loc(#loc27)\n      %234 = "stable_mosaic_gpu.arith.remui"(%232, %233) : (index, index) -> index loc(#loc27)\n      %235 = "stable_mosaic_gpu.arith.constant"() {value = 2 : index} : () -> index loc(#loc27)\n      %236 = "stable_mosaic_gpu.arith.muli"(%234, %235) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\n      %237 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc27)\n      %238 = "stable_mosaic_gpu.arith.addi"(%236, %237) {overflowFlags = #arith.overflow<none>} : (index, index) -> index loc(#loc27)\n      "stable_mosaic_gpu.vector.store"(%220, %231, %238) : (vector<2xf32>, memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>, index) -> () loc(#loc27)\n      "stable_mosaic_gpu.nvvm.cp.async.bulk.commit.group"() : () -> () loc(#loc28)\n      %239 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc29)\n      %240 = "stable_mosaic_gpu.arith.addi"(%186, %239) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc29)\n      %241 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc22)\n      %242 = "stable_mosaic_gpu.arith.remsi"(%240, %241) : (i32, i32) -> i32 loc(#loc22)\n      %243 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc30)\n      %244 = "stable_mosaic_gpu.arith.cmpi"(%186, %243) {predicate = 9 : i64} : (i32, i32) -> i1 loc(#loc30)\n      %245 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc31)\n      %246 = "stable_mosaic_gpu.arith.cmpi"(%240, %245) {predicate = 6 : i64} : (i32, i32) -> i1 loc(#loc31)\n      %247 = "stable_mosaic_gpu.arith.andi"(%244, %246) : (i1, i1) -> i1 loc(#loc32)\n      %248 = "stable_mosaic_gpu.arith.extui"(%247) : (i1) -> i32 loc(#loc33)\n      %249 = "stable_mosaic_gpu.arith.index_cast"(%248) : (i32) -> index loc(#loc34)\n      "stable_mosaic_gpu.scf.index_switch"(%249) ({\n        %313 = "stable_mosaic_gpu.arith.index_cast"(%242) : (i32) -> index loc(#loc19)\n        %314 = "stable_mosaic_gpu.memref.subview"(%161, %313) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc19)\n        %315 = "stable_mosaic_gpu.arith.index_cast"(%242) : (i32) -> index loc(#loc19)\n        %316 = "stable_mosaic_gpu.arith.index_castui"(%315) : (index) -> i32 loc(#loc19)\n        %317 = "stable_mosaic_gpu.arith.addi"(%125, %316) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc19)\n        %318 = "stable_mosaic_gpu.arith.constant"() {value = 8 : i32} : () -> i32 loc(#loc19)\n        %319 = "stable_mosaic_gpu.llvm.getelementptr"(%94, %317) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\n        "stable_mosaic_gpu.nvvm.mbarrier.arrive.expect_tx.shared"(%319, %318) : (!llvm.ptr<3>, i32) -> () loc(#loc19)\n        %320 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc19)\n        %321 = "stable_mosaic_gpu.arith.index_cast"(%320) : (index) -> i32 loc(#loc19)\n        %322 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%314) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc19)\n        %323 = "stable_mosaic_gpu.llvm.extractvalue"(%322) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc19)\n        %324 = "stable_mosaic_gpu.llvm.extractvalue"(%322) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc19)\n        %325 = "stable_mosaic_gpu.arith.constant"() {value = 4 : i64} : () -> i64 loc(#loc19)\n        %326 = "stable_mosaic_gpu.llvm.mul"(%324, %325) : (i64, i64) -> i64 loc(#loc19)\n        %327 = "stable_mosaic_gpu.llvm.ptrtoint"(%323) : (!llvm.ptr<3>) -> i64 loc(#loc19)\n        %328 = "stable_mosaic_gpu.llvm.add"(%327, %326) : (i64, i64) -> i64 loc(#loc19)\n        %329 = "stable_mosaic_gpu.llvm.inttoptr"(%328) : (i64) -> !llvm.ptr<3> loc(#loc19)\n        %330 = "stable_mosaic_gpu.arith.constant"() {value = 1024 : i32} : () -> i32 loc(#loc19)\n        %331 = "stable_mosaic_gpu.llvm.getelementptr"(%94, %317) {elem_type = i64, rawConstantIndices = array<i32: -2147483648>} : (!llvm.ptr<3>, i32) -> !llvm.ptr<3> loc(#loc19)\n        "stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.shared.cluster.global"(%329, %83, %321, %331, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 0, 0, 0, 1>} : (!llvm.ptr<3>, !llvm.ptr, i32, !llvm.ptr<3>, i1) -> () loc(#loc19)\n        "stable_mosaic_gpu.scf.yield"() : () -> () loc(#loc16)\n      }, {\n        "stable_mosaic_gpu.scf.yield"() : () -> () loc(#loc34)\n      }) {cases = array<i64: 0>} : (index) -> () loc(#loc34)\n      %250 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc21)\n      %251 = "stable_mosaic_gpu.arith.addi"(%184, %250) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc21)\n      "stable_mosaic_gpu.nvvm.fence.proxy"() {kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cta>} : () -> () loc(#loc35)\n      %252 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc35)\n      %253 = "stable_mosaic_gpu.arith.index_cast"(%252) : (index) -> i32 loc(#loc35)\n      %254 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim x>} : () -> index loc(#loc35)\n      %255 = "stable_mosaic_gpu.arith.index_cast"(%254) : (index) -> i32 loc(#loc35)\n      %256 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index loc(#loc35)\n      %257 = "stable_mosaic_gpu.arith.index_cast"(%256) : (index) -> i32 loc(#loc35)\n      %258 = "stable_mosaic_gpu.arith.muli"(%257, %255) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %259 = "stable_mosaic_gpu.arith.addi"(%253, %258) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %260 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim y>} : () -> index loc(#loc35)\n      %261 = "stable_mosaic_gpu.arith.index_cast"(%260) : (index) -> i32 loc(#loc35)\n      %262 = "stable_mosaic_gpu.arith.muli"(%255, %261) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %263 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index loc(#loc35)\n      %264 = "stable_mosaic_gpu.arith.index_cast"(%263) : (index) -> i32 loc(#loc35)\n      %265 = "stable_mosaic_gpu.arith.muli"(%264, %262) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %266 = "stable_mosaic_gpu.arith.addi"(%259, %265) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %267 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim z>} : () -> index loc(#loc35)\n      %268 = "stable_mosaic_gpu.arith.index_cast"(%267) : (index) -> i32 loc(#loc35)\n      %269 = "stable_mosaic_gpu.arith.muli"(%262, %268) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %270 = "stable_mosaic_gpu.arith.constant"() {value = 7 : i32} : () -> i32 loc(#loc35)\n      %271 = "stable_mosaic_gpu.arith.shrui"(%266, %270) : (i32, i32) -> i32 loc(#loc35)\n      %272 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc35)\n      %273 = "stable_mosaic_gpu.arith.addi"(%271, %272) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc35)\n      %274 = "stable_mosaic_gpu.llvm.inline_asm"(%273) {asm_string = "bar.sync $0, 128;", constraints = "r", has_side_effects} : (i32) -> !llvm.void loc(#loc35)\n      %275 = "stable_mosaic_gpu.arith.constant"() {value = 0 : i32} : () -> i32 loc(#loc22)\n      %276 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc22)\n      %277 = "stable_mosaic_gpu.arith.remsi"(%275, %276) : (i32, i32) -> i32 loc(#loc22)\n      %278 = "stable_mosaic_gpu.arith.index_cast"(%277) : (i32) -> index loc(#loc18)\n      %279 = "stable_mosaic_gpu.memref.subview"(%164, %278) {operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 1, 256>, static_strides = array<i64: 1, 1>} : (memref<1x256xf32, #gpu.address_space<workgroup>>, index) -> memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>> loc(#loc18)\n      %280 = "stable_mosaic_gpu.arith.constant"() {value = 0 : index} : () -> index loc(#loc18)\n      %281 = "stable_mosaic_gpu.arith.index_cast"(%280) : (index) -> i32 loc(#loc18)\n      %282 = "stable_mosaic_gpu.builtin.unrealized_conversion_cast"(%279) : (memref<256xf32, strided<[1], offset: ?>, #gpu.address_space<workgroup>>) -> !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> loc(#loc18)\n      %283 = "stable_mosaic_gpu.llvm.extractvalue"(%282) {position = array<i64: 1>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<3> loc(#loc18)\n      %284 = "stable_mosaic_gpu.llvm.extractvalue"(%282) {position = array<i64: 2>} : (!llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>) -> i64 loc(#loc18)\n      %285 = "stable_mosaic_gpu.arith.constant"() {value = 4 : i64} : () -> i64 loc(#loc18)\n      %286 = "stable_mosaic_gpu.llvm.mul"(%284, %285) : (i64, i64) -> i64 loc(#loc18)\n      %287 = "stable_mosaic_gpu.llvm.ptrtoint"(%283) : (!llvm.ptr<3>) -> i64 loc(#loc18)\n      %288 = "stable_mosaic_gpu.llvm.add"(%287, %286) : (i64, i64) -> i64 loc(#loc18)\n      %289 = "stable_mosaic_gpu.llvm.inttoptr"(%288) : (i64) -> !llvm.ptr<3> loc(#loc18)\n      "stable_mosaic_gpu.nvvm.cp.async.bulk.tensor.global.shared.cta"(%82, %289, %281, %155) {operandSegmentSizes = array<i32: 1, 1, 1, 1>} : (!llvm.ptr, !llvm.ptr<3>, i32, i1) -> () loc(#loc18)\n      "stable_mosaic_gpu.nvvm.cp.async.bulk.commit.group"() : () -> () loc(#loc28)\n      "stable_mosaic_gpu.nvvm.cp.async.bulk.wait_group"() {group = 0 : i32} : () -> () loc(#loc36)\n      %290 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim x>} : () -> index loc(#loc36)\n      %291 = "stable_mosaic_gpu.arith.index_cast"(%290) : (index) -> i32 loc(#loc36)\n      %292 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim x>} : () -> index loc(#loc36)\n      %293 = "stable_mosaic_gpu.arith.index_cast"(%292) : (index) -> i32 loc(#loc36)\n      %294 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim y>} : () -> index loc(#loc36)\n      %295 = "stable_mosaic_gpu.arith.index_cast"(%294) : (index) -> i32 loc(#loc36)\n      %296 = "stable_mosaic_gpu.arith.muli"(%295, %293) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %297 = "stable_mosaic_gpu.arith.addi"(%291, %296) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %298 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim y>} : () -> index loc(#loc36)\n      %299 = "stable_mosaic_gpu.arith.index_cast"(%298) : (index) -> i32 loc(#loc36)\n      %300 = "stable_mosaic_gpu.arith.muli"(%293, %299) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %301 = "stable_mosaic_gpu.gpu.thread_id"() {dimension = #gpu<dim z>} : () -> index loc(#loc36)\n      %302 = "stable_mosaic_gpu.arith.index_cast"(%301) : (index) -> i32 loc(#loc36)\n      %303 = "stable_mosaic_gpu.arith.muli"(%302, %300) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %304 = "stable_mosaic_gpu.arith.addi"(%297, %303) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %305 = "stable_mosaic_gpu.gpu.block_dim"() {dimension = #gpu<dim z>} : () -> index loc(#loc36)\n      %306 = "stable_mosaic_gpu.arith.index_cast"(%305) : (index) -> i32 loc(#loc36)\n      %307 = "stable_mosaic_gpu.arith.muli"(%300, %306) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %308 = "stable_mosaic_gpu.arith.constant"() {value = 7 : i32} : () -> i32 loc(#loc36)\n      %309 = "stable_mosaic_gpu.arith.shrui"(%304, %308) : (i32, i32) -> i32 loc(#loc36)\n      %310 = "stable_mosaic_gpu.arith.constant"() {value = 1 : i32} : () -> i32 loc(#loc36)\n      %311 = "stable_mosaic_gpu.arith.addi"(%309, %310) {overflowFlags = #arith.overflow<none>} : (i32, i32) -> i32 loc(#loc36)\n      %312 = "stable_mosaic_gpu.llvm.inline_asm"(%311) {asm_string = "bar.sync $0, 128;", constraints = "r", has_side_effects} : (i32) -> !llvm.void loc(#loc36)\n      "stable_mosaic_gpu.gpu.terminator"() : () -> () loc(#loc17)\n    }) {operandSegmentSizes = array<i32: 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1>, workgroup_attributions = 0 : i64} : (!gpu.async.token, index, index, index, index, index, index, i32) -> !gpu.async.token loc(#loc17)\n    "stable_mosaic_gpu.func.return"() : () -> () loc(#loc17)\n  }) {function_type = (!llvm.ptr, !llvm.ptr) -> (), llvm.emit_c_interface, sym_name = "mosaic_gpu_body"} : () -> () loc(#loc17)\n}) {stable_mosaic_gpu.version = 1 : i64} : () -> () loc(#loc17)\n#loc13 = loc("-":141:7)\n#loc14 = loc("third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py":78:19)\n#loc15 = loc("third_party/py/jax/tests/pallas/export_back_compat_pallas_test.py":78:6)\n#loc16 = loc("-":279:7)\n#loc18 = loc("/copy_smem_to_gmem"(#loc))\n#loc19 = loc("/copy_gmem_to_smem"(#loc))\n#loc20 = loc("/run_scoped"(#loc))\n#loc21 = loc("/scan"(#loc))\n#loc22 = loc("/rem"(#loc))\n#loc23 = loc("/barrier_wait"(#loc))\n#loc24 = loc("/jaxpr_call"(#loc))\n#loc25 = loc("/get"(#loc14))\n#loc26 = loc("/add"(#loc14))\n#loc27 = loc("/swap"(#loc15))\n#loc28 = loc("/commit_group"(#loc))\n#loc29 = loc("/add"(#loc))\n#loc30 = loc("/ge"(#loc))\n#loc31 = loc("/lt"(#loc))\n#loc32 = loc("/and"(#loc))\n#loc33 = loc("/convert_element_type"(#loc))\n#loc34 = loc("/cond"(#loc))\n#loc35 = loc("/commit_smem"(#loc))\n#loc36 = loc("/wait_smem_to_gmem"(#loc))\n\x00mosaic_gpu\x00\x08\'\x07\x05\x1f\x01\x0b!%\'/1\x11357\x1d9\x1f\x1d\x1f',
    xla_call_module_version=9,
    nr_devices=1,
)  # End paste
